Changed: x264.spec

Changed: x264-snapshot-20080126-2245.tar.bz2/common/amd64/deblock-a.asm
@@ -318,7 +318,8 @@
lea r11, [r10+r10*2]
lea rax, [rdi-4]
lea r9, [rdi-4+r11]
- %define pix_tmp rsp-104 ; 16x6 for the buffer + 8 for x264_deblock_v_luma_sse2's return address
+ sub rsp, 0x68
+ %define pix_tmp rsp
; transpose 6x16 -> tmp space
TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
@@ -352,6 +353,7 @@
movq mm3, [pix_tmp+0x40]
TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
+ add rsp, 0x68
ret

Changed: x264-snapshot-20080126-2245.tar.bz2/common/amd64/pixel-a.asm
@@ -805,9 +805,10 @@
; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_16x16_mmxext
-%define sums rsp-32 ; +24
-%define top_1d rsp-64 ; +32
-%define left_1d rsp-96 ; +32
+ sub rsp, 96
+%define sums rsp+64 ; size 24
+%define top_1d rsp+32 ; size 32
+%define left_1d rsp ; size 32
mov qword [sums+0], 0
mov qword [sums+8], 0
@@ -913,15 +914,17 @@
movd [parm3q+8], mm2 ; i16x16_dc satd
movd [parm3q+4], mm1 ; i16x16_h satd
movd [parm3q+0], mm0 ; i16x16_v satd
+ add rsp, 96
ret
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_8x8c_mmxext
-%define sums rsp-32 ; +24
-%define top_1d rsp-48 ; +16
-%define left_1d rsp-64 ; +16
+ sub rsp, 64
+%define sums rsp+32 ; size 24
+%define top_1d rsp+16 ; size 16
+%define left_1d rsp ; size 16
mov qword [sums+0], 0
mov qword [sums+8], 0
@@ -1041,13 +1044,87 @@
movd [parm3q+0], mm0 ; i8x8c_dc satd
movd [parm3q+4], mm1 ; i8x8c_h satd
movd [parm3q+8], mm2 ; i8x8c_v satd
+ add rsp, 64
ret
+; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
+; {
+; int nmv=0, i, j;
+; *(uint32_t*)(masks+width) = 0;
+; for( i=0; i<width; i+=8 )
+; {
+; uint64_t mask = *(uint64_t*)(masks+i);
+; if( !mask ) continue;
+; for( j=0; j<8; j++ )
+; if( mask & (255<<j*8) )
+; mvs[nmv++] = i+j;
+; }
+; return nmv;
+; }
+cglobal x264_pixel_ads_mvs
+ ; mvs = parm5q
+ ; masks = rsp
+ ; width = r10
+ mov dword [rsp+r10], 0
+ xor eax, eax
+ xor esi, esi
+.loopi:
+ mov rdi, [rsp+rsi]
+ test rdi, rdi
+ jz .nexti
+ xor ecx, ecx
+%macro TEST 1
+ mov [parm5q+rax*2], si
+ test edi, 0xff<<(%1*8)
+ setne cl
+ add eax, ecx
+ inc esi
+%endmacro
+ TEST 0
+ TEST 1
+ TEST 2
+ TEST 3
+ shr rdi, 32
+ TEST 0
+ TEST 1
+ TEST 2
+ TEST 3
+ cmp esi, r10d
+ jl .loopi
+ leave
+ ret
+.nexti:
+ add esi, 8
+ cmp esi, r10d
+ jl .loopi
+ leave
+ ret
+
+%macro ADS_START 0
+ push rbp
+ mov rbp, rsp
+ sub rsp, parm6q
+ sub rsp, 4
+ and rsp, ~15
+ mov rax, rsp
+ mov r10d, parm6d
+ shl parm3q, 1
+%endmacro
+
+%macro ADS_END 1
+ add parm2q, 8*%1
+ add parm4q, 8*%1
+ add rax, 4*%1
+ sub parm6d, 4*%1
+ jg .loop
+ jmp x264_pixel_ads_mvs
+%endmacro
+
;-----------------------------------------------------------------------------
-; void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
-; uint16_t *res, int width )
+; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
+; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ads4_mmxext
movq mm6, [parm1q]
@@ -1056,7 +1133,7 @@
pshufw mm6, mm6, 0xAA
pshufw mm5, mm4, 0
pshufw mm4, mm4, 0xAA
- shl parm3q, 1
+ ADS_START
.loop:
movq mm0, [parm2q]
movq mm1, [parm2q+16]
@@ -1073,19 +1150,19 @@
MMX_ABS mm3, mm1
paddw mm0, mm2
paddw mm0, mm3
- movq [parm4q], mm0
- add parm2q, 8
- add parm4q, 8
- sub parm5d, 4
- jg .loop
- nop
- ret
+ pshufw mm1, [rbp+16], 0
+ paddusw mm0, [parm4q]
+ psubusw mm1, mm0
+ packsswb mm1, mm1
+ movd [rax], mm1
+ ADS_END 1
cglobal x264_pixel_ads2_mmxext
movq mm6, [parm1q]
+ pshufw mm5, parm7q, 0
pshufw mm7, mm6, 0
pshufw mm6, mm6, 0xAA
- shl parm3q, 1
+ ADS_START
.loop:
movq mm0, [parm2q]
movq mm1, [parm2q+parm3q]
@@ -1094,16 +1171,17 @@
MMX_ABS mm0, mm2
MMX_ABS mm1, mm3
paddw mm0, mm1
- movq [parm4q], mm0
- add parm2q, 8
- add parm4q, 8
- sub parm5d, 4
- jg .loop
- nop
- ret
+ paddusw mm0, [parm4q]
+ movq mm4, mm5
+ psubusw mm4, mm0
+ packsswb mm4, mm4
+ movd [rax], mm4
+ ADS_END 1
cglobal x264_pixel_ads1_mmxext
pshufw mm7, [parm1q], 0
+ pshufw mm6, parm7q, 0
+ ADS_START
.loop:
movq mm0, [parm2q]
movq mm1, [parm2q+8]
@@ -1111,11 +1189,113 @@
psubw mm1, mm7
MMX_ABS mm0, mm2
MMX_ABS mm1, mm3
- movq [parm4q], mm0
- movq [parm4q+8], mm1
- add parm2q, 16
- add parm4q, 16
- sub parm5d, 8
- jg .loop
- nop
- ret
+ paddusw mm0, [parm4q]
+ paddusw mm1, [parm4q+8]
+ movq mm4, mm6
+ movq mm5, mm6
+ psubusw mm4, mm0
+ psubusw mm5, mm1
+ packsswb mm4, mm5
+ movq [rax], mm4
+ ADS_END 2
+
+%macro ADS_SSE2 1
+cglobal x264_pixel_ads4_%1
+ movdqa xmm4, [parm1q]
+ pshuflw xmm8, parm7q, 0
+ pshuflw xmm7, xmm4, 0
+ pshuflw xmm6, xmm4, 0xAA
+ pshufhw xmm5, xmm4, 0
+ pshufhw xmm4, xmm4, 0xAA
+ punpcklqdq xmm8, xmm8
+ punpcklqdq xmm7, xmm7
+ punpcklqdq xmm6, xmm6
+ punpckhqdq xmm5, xmm5
+ punpckhqdq xmm4, xmm4
+ ADS_START
+ movdqu xmm10, [parm2q]
+ movdqu xmm11, [parm2q+parm3q]
+.loop:
+ movdqa xmm0, xmm10
+ movdqu xmm1, [parm2q+16]
+ movdqa xmm10, xmm1
+ psubw xmm0, xmm7
+ psubw xmm1, xmm6
+ MMX_ABS xmm0, xmm2
+ MMX_ABS xmm1, xmm3
+ movdqa xmm2, xmm11
+ movdqu xmm3, [parm2q+parm3q+16]
+ movdqa xmm11, xmm3
+ psubw xmm2, xmm5
+ psubw xmm3, xmm4
+ paddw xmm0, xmm1
+ movdqu xmm9, [parm4q]
+ MMX_ABS xmm2, xmm1
+ MMX_ABS xmm3, xmm1
+ paddw xmm0, xmm2
+ paddw xmm0, xmm3
+ paddusw xmm0, xmm9
+ movdqa xmm1, xmm8
+ psubusw xmm1, xmm0
+ packsswb xmm1, xmm1
+ movq [rax], xmm1
+ ADS_END 2
+
+cglobal x264_pixel_ads2_%1
+ movq xmm6, [parm1q]
+ pshuflw xmm8, parm7q, 0
+ pshuflw xmm7, xmm6, 0
+ pshuflw xmm6, xmm6, 0xAA
+ punpcklqdq xmm8, xmm8
+ punpcklqdq xmm7, xmm7
+ punpcklqdq xmm6, xmm6
+ ADS_START
+.loop:
+ movdqu xmm0, [parm2q]
+ movdqu xmm1, [parm2q+parm3q]
+ psubw xmm0, xmm7
+ psubw xmm1, xmm6
+ movdqu xmm9, [parm4q]
+ MMX_ABS xmm0, xmm2
+ MMX_ABS xmm1, xmm3
+ paddw xmm0, xmm1
+ paddusw xmm0, xmm9
+ movdqa xmm4, xmm8
+ psubusw xmm4, xmm0
+ packsswb xmm4, xmm4
+ movq [rax], xmm4
+ ADS_END 2
+
+cglobal x264_pixel_ads1_%1
+ pshuflw xmm7, [parm1q], 0
+ pshuflw xmm8, parm7q, 0
+ punpcklqdq xmm7, xmm7
+ punpcklqdq xmm8, xmm8
+ ADS_START
+.loop:
+ movdqu xmm0, [parm2q]
+ movdqu xmm1, [parm2q+16]
+ psubw xmm0, xmm7
+ psubw xmm1, xmm7
+ movdqu xmm9, [parm4q]
+ movdqu xmm10, [parm4q+16]
+ MMX_ABS xmm0, xmm2
+ MMX_ABS xmm1, xmm3
+ paddusw xmm0, xmm9
+ paddusw xmm1, xmm10
+ movdqa xmm4, xmm8
+ movdqa xmm5, xmm8
+ psubusw xmm4, xmm0
+ psubusw xmm5, xmm1
+ packsswb xmm4, xmm5
+ movdqa [rax], xmm4
+ ADS_END 4
+%endmacro
+
+ADS_SSE2 sse2
+%ifdef HAVE_SSE3
+%macro MMX_ABS 2
+ pabsw %1, %1
+%endmacro
+ADS_SSE2 ssse3
+%endif

Changed: x264-snapshot-20080126-2245.tar.bz2/common/amd64/pixel-sse2.asm
@@ -1000,9 +1000,9 @@
pshufd xmm6, xmm4, 0xB1
packssdw xmm1, xmm2
paddd xmm3, xmm5
- pmaddwd xmm1, xmm8
- paddd xmm4, xmm6
pshufd xmm1, xmm1, 0xD8
+ paddd xmm4, xmm6
+ pmaddwd xmm1, xmm8
movdqa xmm5, xmm3
punpckldq xmm3, xmm4
punpckhdq xmm5, xmm4

Changed: x264-snapshot-20080126-2245.tar.bz2/common/cpu.c
@@ -28,7 +28,7 @@
#ifdef SYS_BEOS
#include <kernel/OS.h>
#endif
-#ifdef SYS_MACOSX
+#if defined(SYS_MACOSX) || defined(SYS_FREEBSD)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
@@ -219,7 +219,7 @@
#if !defined(HAVE_PTHREAD)
return 1;
-#elif defined(WIN32)
+#elif defined(_WIN32)
return pthread_num_processors_np();
#elif defined(SYS_LINUX)
@@ -237,7 +237,7 @@
get_system_info( &info );
return info.cpu_count;
-#elif defined(SYS_MACOSX)
+#elif defined(SYS_MACOSX) || defined(SYS_FREEBSD)
int numberOfCPUs;
size_t length = sizeof( numberOfCPUs );
if( sysctlbyname("hw.ncpu", &numberOfCPUs, &length, NULL, 0) )

Changed: x264-snapshot-20080126-2245.tar.bz2/common/i386/pixel-a.asm
@@ -1579,24 +1579,91 @@
+; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
+cglobal x264_pixel_ads_mvs
+ mov ebx, [ebp+24] ; mvs
+ mov ecx, esp ; masks
+ mov edi, [ebp+28] ; width
+ mov dword [ecx+edi], 0
+ push esi
+ push ebp
+ xor eax, eax
+ xor esi, esi
+.loopi:
+ mov ebp, [ecx+esi]
+ mov edx, [ecx+esi+4]
+ or edx, ebp
+ jz .nexti
+ xor edx, edx
+%macro TEST 1
+ mov [ebx+eax*2], si
+ test ebp, 0xff<<(%1*8)
+ setne dl
+ add eax, edx
+ inc esi
+%endmacro
+ TEST 0
+ TEST 1
+ TEST 2
+ TEST 3
+ mov ebp, [ecx+esi]
+ TEST 0
+ TEST 1
+ TEST 2
+ TEST 3
+ cmp esi, edi
+ jl .loopi
+ jmp .end
+.nexti:
+ add esi, 8
+ cmp esi, edi
+ jl .loopi
+.end:
+ pop ebp
+ pop esi
+ mov edi, [ebp-8]
+ mov ebx, [ebp-4]
+ leave
+ ret
+
+%macro ADS_START 0
+ push ebp
+ mov ebp, esp
+ push ebx
+ push edi
+ mov eax, [ebp+12] ; sums
+ mov ebx, [ebp+16] ; delta
+ mov ecx, [ebp+20] ; cost_mvx
+ mov edx, [ebp+28] ; width
+ sub esp, edx
+ sub esp, 4
+ and esp, ~15
+ mov edi, esp
+ shl ebx, 1
+%endmacro
+
+%macro ADS_END 1
+ add eax, 8*%1
+ add ecx, 8*%1
+ add edi, 4*%1
+ sub edx, 4*%1
+ jg .loop
+ jmp x264_pixel_ads_mvs
+%endmacro
+
;-----------------------------------------------------------------------------
-; void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
-; uint16_t *res, int width )
+; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
+; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ads4_mmxext
- push ebx
- mov eax, [esp+8]
+ mov eax, [esp+4]
movq mm6, [eax]
movq mm4, [eax+8]
pshufw mm7, mm6, 0
pshufw mm6, mm6, 0xAA
pshufw mm5, mm4, 0
pshufw mm4, mm4, 0xAA
- mov eax, [esp+12]
- mov ebx, [esp+16]
- mov ecx, [esp+20]
- mov edx, [esp+24]
- shl ebx, 1
+ ADS_START
.loop:
movq mm0, [eax]
movq mm1, [eax+16]
@@ -1613,25 +1680,20 @@
MMX_ABS mm3, mm1
paddw mm0, mm2
paddw mm0, mm3
- movq [ecx], mm0
- add eax, 8
- add ecx, 8
- sub edx, 4
- jg .loop
- pop ebx
- ret
+ pshufw mm1, [ebp+32], 0
+ paddusw mm0, [ecx]
+ psubusw mm1, mm0
+ packsswb mm1, mm1
+ movd [edi], mm1
+ ADS_END 1
cglobal x264_pixel_ads2_mmxext
- push ebx
- mov eax, [esp+8]
+ mov eax, [esp+4]
movq mm6, [eax]
+ pshufw mm5, [esp+28], 0
pshufw mm7, mm6, 0
pshufw mm6, mm6, 0xAA
- mov eax, [esp+12]
- mov ebx, [esp+16]
- mov ecx, [esp+20]
- mov edx, [esp+24]
- shl ebx, 1
+ ADS_START
.loop:
movq mm0, [eax]
movq mm1, [eax+ebx]
@@ -1640,20 +1702,18 @@
MMX_ABS mm0, mm2
MMX_ABS mm1, mm3
paddw mm0, mm1
- movq [ecx], mm0
- add eax, 8
- add ecx, 8
- sub edx, 4
- jg .loop
- pop ebx
- ret
+ paddusw mm0, [ecx]
+ movq mm4, mm5
+ psubusw mm4, mm0
+ packsswb mm4, mm4
+ movd [edi], mm4
+ ADS_END 1
cglobal x264_pixel_ads1_mmxext
mov eax, [esp+4]
pshufw mm7, [eax], 0
- mov eax, [esp+8]
- mov ecx, [esp+16]
- mov edx, [esp+20]
+ pshufw mm6, [esp+28], 0
+ ADS_START
.loop:
movq mm0, [eax]
movq mm1, [eax+8]
@@ -1661,11 +1721,115 @@
psubw mm1, mm7
MMX_ABS mm0, mm2
MMX_ABS mm1, mm3
- movq [ecx], mm0
- movq [ecx+8], mm1
- add eax, 16
- add ecx, 16
- sub edx, 8
- jg .loop
- nop
- ret
+ paddusw mm0, [ecx]
+ paddusw mm1, [ecx+8]
+ movq mm4, mm6
+ movq mm5, mm6
+ psubusw mm4, mm0
+ psubusw mm5, mm1
+ packsswb mm4, mm5
+ movq [edi], mm4
+ ADS_END 2
+
+%macro ADS_SSE2 1
+cglobal x264_pixel_ads4_%1
+ mov eax, [esp+4] ; enc_dc
+ movdqa xmm4, [eax]
+ pshuflw xmm7, xmm4, 0
+ pshuflw xmm6, xmm4, 0xAA
+ pshufhw xmm5, xmm4, 0
+ pshufhw xmm4, xmm4, 0xAA
+ punpcklqdq xmm7, xmm7
+ punpcklqdq xmm6, xmm6
+ punpckhqdq xmm5, xmm5
+ punpckhqdq xmm4, xmm4
+ ADS_START
+.loop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax+16]
+ psubw xmm0, xmm7
+ psubw xmm1, xmm6
+ MMX_ABS xmm0, xmm2
+ MMX_ABS xmm1, xmm3
+ movdqu xmm2, [eax+ebx]
+ movdqu xmm3, [eax+ebx+16]
+ psubw xmm2, xmm5
+ psubw xmm3, xmm4
+ paddw xmm0, xmm1
+ MMX_ABS xmm2, xmm1
+ MMX_ABS xmm3, xmm1
+ paddw xmm0, xmm2
+ paddw xmm0, xmm3
+ movd xmm1, [ebp+32] ; thresh
+ movdqu xmm2, [ecx]
+ pshuflw xmm1, xmm1, 0
+ punpcklqdq xmm1, xmm1
+ paddusw xmm0, xmm2
+ psubusw xmm1, xmm0
+ packsswb xmm1, xmm1
+ movq [edi], xmm1
+ ADS_END 2
+
+cglobal x264_pixel_ads2_%1
+ mov eax, [esp+4] ; enc_dc
+ movq xmm6, [eax]
+ movd xmm5, [esp+28] ; thresh
+ pshuflw xmm7, xmm6, 0
+ pshuflw xmm6, xmm6, 0xAA
+ pshuflw xmm5, xmm5, 0
+ punpcklqdq xmm7, xmm7
+ punpcklqdq xmm6, xmm6
+ punpcklqdq xmm5, xmm5
+ ADS_START
+.loop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax+ebx]
+ psubw xmm0, xmm7
+ psubw xmm1, xmm6
+ movdqu xmm4, [ecx]
+ MMX_ABS xmm0, xmm2
+ MMX_ABS xmm1, xmm3
+ paddw xmm0, xmm1
+ paddusw xmm0, xmm4
+ movdqa xmm1, xmm5
+ psubusw xmm1, xmm0
+ packsswb xmm1, xmm1
+ movq [edi], xmm1
+ ADS_END 2
+
+cglobal x264_pixel_ads1_%1
+ mov eax, [esp+4] ; enc_dc
+ movd xmm7, [eax]
+ movd xmm6, [esp+28] ; thresh
+ pshuflw xmm7, xmm7, 0
+ pshuflw xmm6, xmm6, 0
+ punpcklqdq xmm7, xmm7
+ punpcklqdq xmm6, xmm6
+ ADS_START
+.loop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax+16]
+ psubw xmm0, xmm7
+ psubw xmm1, xmm7
+ movdqu xmm2, [ecx]
+ movdqu xmm3, [ecx+16]
+ MMX_ABS xmm0, xmm4
+ MMX_ABS xmm1, xmm5
+ paddusw xmm0, xmm2
+ paddusw xmm1, xmm3
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm6
+ psubusw xmm4, xmm0
+ psubusw xmm5, xmm1
+ packsswb xmm4, xmm5
+ movdqa [edi], xmm4
+ ADS_END 4
+%endmacro
+
+ADS_SSE2 sse2
+%ifdef HAVE_SSE3
+%macro MMX_ABS 2
+ pabsw %1, %1
+%endmacro
+ADS_SSE2 ssse3
+%endif

Changed: x264-snapshot-20080126-2245.tar.bz2/common/i386/pixel-sse2.asm
@@ -973,9 +973,9 @@
pshufd xmm6, xmm4, 0xB1
packssdw xmm1, xmm2
paddd xmm3, xmm5
- pmaddwd xmm1, xmm7
- paddd xmm4, xmm6
pshufd xmm1, xmm1, 0xD8
+ paddd xmm4, xmm6
+ pmaddwd xmm1, xmm7
movdqa xmm5, xmm3
punpckldq xmm3, xmm4
punpckhdq xmm5, xmm4

Changed: x264-snapshot-20080126-2245.tar.bz2/common/i386/pixel.h
@@ -81,11 +81,18 @@
const uint8_t *pix2, int stride2, int sums[2][4] );
float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
-void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
- uint16_t *res, int width );
-void x264_pixel_ads2_mmxext( int enc_dc[2], uint16_t *sums, int delta,
- uint16_t *res, int width );
-void x264_pixel_ads1_mmxext( int enc_dc[1], uint16_t *sums, int delta,
- uint16_t *res, int width );
+#define DECL_ADS( size, suffix ) \
+int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
+ uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );
+DECL_ADS( 4, mmxext )
+DECL_ADS( 2, mmxext )
+DECL_ADS( 1, mmxext )
+DECL_ADS( 4, sse2 )
+DECL_ADS( 2, sse2 )
+DECL_ADS( 1, sse2 )
+DECL_ADS( 4, ssse3 )
+DECL_ADS( 2, ssse3 )
+DECL_ADS( 1, ssse3 )
+#undef DECL_ADS
#endif

Changed: x264-snapshot-20080126-2245.tar.bz2/common/mc.c
@@ -430,7 +430,7 @@
uint8_t *ref = frame->plane[0] + y * stride - PADH;
uint16_t *line = frame->integral + (y+1) * stride - PADH + 1;
uint16_t v = line[0] = 0;
- for( x = 0; x < stride-1; x++ )
+ for( x = 1; x < stride-1; x++ )
line[x] = v += ref[x] + line[x-stride] - line[x-stride-1];
line -= 8*stride;
if( y >= 9-PADV )

Changed: x264-snapshot-20080126-2245.tar.bz2/common/osdep.h
@@ -50,7 +50,7 @@
#if defined(_MSC_VER) || defined(SYS_SunOS) || defined(SYS_MACOSX)
#define sqrtf sqrt
#endif
-#ifdef __WIN32__
+#ifdef _WIN32
#define rename(src,dst) (unlink(dst), rename(src,dst)) // POSIX says that rename() removes the destination, but win32 doesn't.
#ifndef strtok_r
#define strtok_r(str,delim,save) strtok(str,delim)

Changed: x264-snapshot-20080126-2245.tar.bz2/common/pixel.c
@@ -408,32 +408,50 @@
/****************************************************************************
* successive elimination
****************************************************************************/
-static void pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
- uint16_t *res, int width )
+static int x264_pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
+ uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
- int i;
+ int nmv=0, i;
for( i=0; i<width; i++, sums++ )
- res[i] = abs( enc_dc[0] - sums[0] )
- + abs( enc_dc[1] - sums[8] )
- + abs( enc_dc[2] - sums[delta] )
- + abs( enc_dc[3] - sums[delta+8] );
+ {
+ int ads = abs( enc_dc[0] - sums[0] )
+ + abs( enc_dc[1] - sums[8] )
+ + abs( enc_dc[2] - sums[delta] )
+ + abs( enc_dc[3] - sums[delta+8] )
+ + cost_mvx[i];
+ if( ads < thresh )
+ mvs[nmv++] = i;
+ }
+ return nmv;
}
-static void pixel_ads2( int enc_dc[2], uint16_t *sums, int delta,
- uint16_t *res, int width )
+static int x264_pixel_ads2( int enc_dc[2], uint16_t *sums, int delta,
+ uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
- int i;
+ int nmv=0, i;
for( i=0; i<width; i++, sums++ )
- res[i] = abs( enc_dc[0] - sums[0] )
- + abs( enc_dc[1] - sums[delta] );
+ {
+ int ads = abs( enc_dc[0] - sums[0] )
+ + abs( enc_dc[1] - sums[delta] )
+ + cost_mvx[i];
+ if( ads < thresh )
+ mvs[nmv++] = i;
+ }
+ return nmv;
}
-static void pixel_ads1( int enc_dc[1], uint16_t *sums, int delta,
- uint16_t *res, int width )
+static int x264_pixel_ads1( int enc_dc[1], uint16_t *sums, int delta,
+ uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
- int i;
+ int nmv=0, i;
for( i=0; i<width; i++, sums++ )
- res[i] = abs( enc_dc[0] - sums[0] );
+ {
+ int ads = abs( enc_dc[0] - sums[0] )
+ + cost_mvx[i];
+ if( ads < thresh )
+ mvs[nmv++] = i;
+ }
+ return nmv;
}
@@ -459,20 +477,22 @@
pixf->name[PIXEL_4x8] = x264_pixel_##name##_4x8##cpu;\
pixf->name[PIXEL_4x4] = x264_pixel_##name##_4x4##cpu;
+#define INIT_ADS( cpu ) \
+ pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\
+ pixf->ads[PIXEL_16x8] = x264_pixel_ads2##cpu;\
+ pixf->ads[PIXEL_8x8] = x264_pixel_ads1##cpu;
+
INIT7( sad, );
INIT7( sad_x3, );
INIT7( sad_x4, );
INIT7( ssd, );
INIT7( satd, );
INIT4( sa8d, );
+ INIT_ADS( );
pixf->ssim_4x4x2_core = ssim_4x4x2_core;
pixf->ssim_end4 = ssim_end4;
- pixf->ads[PIXEL_16x16] = pixel_ads4;
- pixf->ads[PIXEL_16x8] = pixel_ads2;
- pixf->ads[PIXEL_8x8] = pixel_ads1;
-
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMX )
{
@@ -485,10 +505,7 @@
INIT7( sad_x3, _mmxext );
INIT7( sad_x4, _mmxext );
INIT7( satd, _mmxext );
-
- pixf->ads[PIXEL_16x16] = x264_pixel_ads4_mmxext;
- pixf->ads[PIXEL_16x8 ] = x264_pixel_ads2_mmxext;
- pixf->ads[PIXEL_8x8 ] = x264_pixel_ads1_mmxext;
+ INIT_ADS( _mmxext );
#ifdef ARCH_X86
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
@@ -535,6 +552,7 @@
INIT2( sad_x3, _sse2 );
INIT2( sad_x4, _sse2 );
INIT5( satd, _sse2 );
+ INIT_ADS( _sse2 );
#ifdef ARCH_X86
if( cpu&X264_CPU_CACHELINE_SPLIT )
@@ -570,6 +588,7 @@
if( cpu&X264_CPU_SSSE3 )
{
INIT5( satd, _ssse3 );
+ INIT_ADS( _ssse3 );
#ifdef ARCH_X86_64
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;

Changed: x264-snapshot-20080126-2245.tar.bz2/common/pixel.h
@@ -80,9 +80,9 @@
x264_pixel_cmp_x4_t sad_x4[7];
/* abs-diff-sum for successive elimination.
- * may round width up to a multiple of 8. */
- void (*ads[7])( int enc_dc[4], uint16_t *sums, int delta,
- uint16_t *res, int width );
+ * may round width up to a multiple of 16. */
+ int (*ads[7])( int enc_dc[4], uint16_t *sums, int delta,
+ uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );
/* calculate satd of V, H, and DC modes.
* may be NULL, in which case just use pred+satd instead. */

Changed: x264-snapshot-20080126-2245.tar.bz2/encoder/analyse.c
@@ -167,16 +167,18 @@
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
+uint16_t *x264_cost_mv_fpel[52][4];
+
/* initialize an array of lambda*nbits for all possible mvs */
static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
{
static int16_t *p_cost_mv[52];
+ int i, j;
if( !p_cost_mv[a->i_qp] )
{
/* could be faster, but isn't called many times */
/* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
- int i;
p_cost_mv[a->i_qp] = x264_malloc( (4*4*2048 + 1) * sizeof(int16_t) );
p_cost_mv[a->i_qp] += 2*4*2048;
for( i = 0; i <= 2*4*2048; i++ )
@@ -185,8 +187,19 @@
p_cost_mv[a->i_qp][i] = a->i_lambda * bs_size_se( i );
}
}
-
a->p_cost_mv = p_cost_mv[a->i_qp];
+
+ /* FIXME is this useful for all me methods? */
+ if( h->param.analyse.i_me_method == X264_ME_ESA && !x264_cost_mv_fpel[a->i_qp][0] )
+ {
+ for( j=0; j<4; j++ )
+ {
+ x264_cost_mv_fpel[a->i_qp][j] = x264_malloc( (4*2048 + 1) * sizeof(int16_t) );
+ x264_cost_mv_fpel[a->i_qp][j] += 2*2048;
+ for( i = -2*2048; i < 2*2048; i++ )
+ x264_cost_mv_fpel[a->i_qp][j][i] = p_cost_mv[a->i_qp][i*4+j];
+ }
+ }
}
static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )

Changed: x264-snapshot-20080126-2245.tar.bz2/encoder/encoder.c
@@ -885,7 +885,7 @@
h->i_ref1 = X264_MIN( h->i_ref1, h->frames.i_max_ref1 );
h->i_ref0 = X264_MIN( h->i_ref0, h->frames.i_max_ref0 );
h->i_ref0 = X264_MIN( h->i_ref0, h->param.i_frame_reference ); // if reconfig() has lowered the limit
- h->i_ref0 = X264_MIN( h->i_ref0, 16 - h->i_ref1 );
+ assert( h->i_ref0 + h->i_ref1 <= 16 );
h->mb.pic.i_fref[0] = h->i_ref0;
h->mb.pic.i_fref[1] = h->i_ref1;
}

Changed: x264-snapshot-20080126-2245.tar.bz2/encoder/me.c
@@ -101,22 +101,19 @@
COPY3_IF_LT( bcost, costs[3], bmx, omx+(m3x), bmy, omy+(m3y) );\
}
-#define COST_MV_X4_ABS( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\
+#define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\
{\
- h->pixf.sad_x4[i_pixel]( m->p_fenc[0],\
+ h->pixf.sad_x3[i_pixel]( m->p_fenc[0],\
p_fref + (m0x) + (m0y)*m->i_stride[0],\
p_fref + (m1x) + (m1y)*m->i_stride[0],\
p_fref + (m2x) + (m2y)*m->i_stride[0],\
- p_fref + (m3x) + (m3y)*m->i_stride[0],\
m->i_stride[0], costs );\
- costs[0] += p_cost_mvx[m0x<<2]; /* no cost_mvy */\
- costs[1] += p_cost_mvx[m1x<<2];\
- costs[2] += p_cost_mvx[m2x<<2];\
- costs[3] += p_cost_mvx[m3x<<2];\
+ costs[0] += p_cost_mvx[(m0x)<<2]; /* no cost_mvy */\
+ costs[1] += p_cost_mvx[(m1x)<<2];\
+ costs[2] += p_cost_mvx[(m2x)<<2];\
COPY3_IF_LT( bcost, costs[0], bmx, m0x, bmy, m0y );\
COPY3_IF_LT( bcost, costs[1], bmx, m1x, bmy, m1y );\
COPY3_IF_LT( bcost, costs[2], bmx, m2x, bmy, m2y );\
- COPY3_IF_LT( bcost, costs[3], bmx, m3x, bmy, m3y );\
}
/* 1 */
@@ -454,13 +451,16 @@
case X264_ME_ESA:
{
- const int min_x = X264_MAX( bmx - i_me_range, mv_x_min);
- const int min_y = X264_MAX( bmy - i_me_range, mv_y_min);
- const int max_x = X264_MIN( bmx + i_me_range, mv_x_max);
- const int max_y = X264_MIN( bmy + i_me_range, mv_y_max);
- int mx, my;
+ const int min_x = X264_MAX( bmx - i_me_range, mv_x_min );
+ const int min_y = X264_MAX( bmy - i_me_range, mv_y_min );
+ const int max_x = X264_MIN( bmx + i_me_range, mv_x_max );
+ const int max_y = X264_MIN( bmy + i_me_range, mv_y_max );
+ /* SEA is fastest in multiples of 4 */
+ const int width = (max_x - min_x + 3) & ~3;
+ int my;
#if 0
/* plain old exhaustive search */
+ int mx;
for( my = min_y; my <= max_y; my++ )
for( mx = min_x; mx <= max_x; mx++ )
COST_MV( mx, my );
@@ -470,10 +470,13 @@
const int stride = m->i_stride[0];
static uint8_t zero[16*16] = {0,};
uint16_t *sums_base = m->integral;
- int enc_dc[4];
+ DECLARE_ALIGNED( int, enc_dc[4], 16 );
int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
int delta = x264_pixel_size[sad_size].w;
- uint16_t *ads = x264_malloc((max_x-min_x+8) * sizeof(uint16_t));
+ int16_t xs_buf[64];
+ int16_t *xs = width<=64 ? xs_buf : x264_malloc( (width+15)*sizeof(int16_t) );
+ int xn;
+ uint16_t *cost_fpel_mvx = x264_cost_mv_fpel[h->mb.i_qp][-m->mvp[0]&3] + (-m->mvp[0]>>2);
h->pixf.sad_x4[sad_size]( zero, m->p_fenc[0], m->p_fenc[0]+delta,
m->p_fenc[0]+delta*FENC_STRIDE, m->p_fenc[0]+delta+delta*FENC_STRIDE,
@@ -487,29 +490,18 @@
for( my = min_y; my <= max_y; my++ )
{
- int mvs[3], i_mvs=0;
bcost -= p_cost_mvy[my<<2];
- h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
- ads, max_x-min_x+1 );
- for( mx = min_x; mx <= max_x; mx++ )
- {
- if( ads[mx-min_x] < bcost - p_cost_mvx[mx<<2] )
- {
- if( i_mvs == 3 )
- {
- COST_MV_X4_ABS( mvs[0],my, mvs[1],my, mvs[2],my, mx,my );
- i_mvs = 0;
- }
- else
- mvs[i_mvs++] = mx;
- }
- }
+ xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
+ cost_fpel_mvx+min_x, xs, width, bcost );
+ for( i=0; i<xn-2; i+=3 )
+ COST_MV_X3_ABS( min_x+xs[i],my, min_x+xs[i+1],my, min_x+xs[i+2],my );
bcost += p_cost_mvy[my<<2];
- for( i=0; i<i_mvs; i++ )
- COST_MV( mvs[i], my );
+ for( ; i<xn; i++ )
+ COST_MV( min_x+xs[i], my );
}
- x264_free(ads);
+ if( xs != xs_buf )
+ x264_free( xs );
#endif
}
break;

Changed: x264-snapshot-20080126-2245.tar.bz2/encoder/me.h
@@ -56,6 +56,8 @@
int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight );
int x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel );
+extern uint16_t *x264_cost_mv_fpel[52][4];
+
#define COPY1_IF_LT(x,y)\
if((y)<(x))\
(x)=(y);

Changed: x264-snapshot-20080126-2245.tar.bz2/encoder/set.c
@@ -196,7 +196,7 @@
/* extra slot with pyramid so that we don't have to override the
* order of forgetting old pictures */
sps->vui.i_max_dec_frame_buffering =
- sps->i_num_ref_frames = X264_MIN(16, param->i_frame_reference + sps->vui.i_num_reorder_frames + param->b_bframe_pyramid);
+ sps->i_num_ref_frames = X264_MIN(16, X264_MAX(param->i_frame_reference, 1 + sps->vui.i_num_reorder_frames));
sps->vui.b_bitstream_restriction = 1;
if( sps->vui.b_bitstream_restriction )

Changed: x264-snapshot-20080126-2245.tar.bz2/tools/checkasm.c
@@ -36,6 +36,7 @@
x264_predict_t predict_4x4[9+3];
x264_predict8x8_t predict_8x8[9+3];
DECLARE_ALIGNED( uint8_t, edge[33], 8 );
+ uint16_t cost_mv[32];
int ret = 0, ok, used_asm;
int i, j;
@@ -155,20 +156,24 @@
}
ok = 1; used_asm = 0;
- for( i=0; i<4; i++ )
- if( pixel_asm.ads[i] != pixel_ref.ads[i] )
+ for( i=0; i<32; i++ )
+ cost_mv[i] = i*10;
+ for( i=0; i<100; i++ )
+ if( pixel_asm.ads[i&3] != pixel_ref.ads[i&3] )
{
- uint16_t res_a[32], res_c[32];
- uint16_t sums[72];
- int dc[4];
+ DECLARE_ALIGNED( uint16_t, sums[72], 16 );
+ DECLARE_ALIGNED( int, dc[4], 16 );
+ int16_t mvs_a[32], mvs_c[32];
+ int mvn_a, mvn_c;
+ int thresh = rand() & 0x3fff;
for( j=0; j<72; j++ )
sums[j] = rand() & 0x3fff;
for( j=0; j<4; j++ )
dc[j] = rand() & 0x3fff;
used_asm = 1;
- pixel_c.ads[i]( dc, sums, 32, res_c, 32 );
- pixel_asm.ads[i]( dc, sums, 32, res_a, 32 );
- if( memcmp(res_a, res_c, sizeof(res_c)) )
+ mvn_c = pixel_c.ads[i&3]( dc, sums, 32, cost_mv, mvs_c, 32, thresh );
+ mvn_a = pixel_asm.ads[i&3]( dc, sums, 32, cost_mv, mvs_a, 32, thresh );
+ if( mvn_c != mvn_a || memcmp( mvs_c, mvs_a, mvn_c*sizeof(*mvs_c) ) )
ok = 0;
}
report( "esa ads:" );

Deleted: x264-snapshot-20080814-2245.tar.bz2/.git/objects/pack/pack-86b450d425caafda828e437e0bbcfb6dd9c53021.idx

Deleted: x264-snapshot-20080814-2245.tar.bz2/.git/objects/pack/pack-86b450d425caafda828e437e0bbcfb6dd9c53021.pack

Deleted: x264-snapshot-20080814-2245.tar.bz2/encoder/eval.c
@@ -1,253 +0,0 @@
-/*****************************************************************************
- * simple arithmetic expression evaluator
- *****************************************************************************
- * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *****************************************************************************/
-
-/**
- * @file eval.c
- * simple arithmetic expression evaluator.
- *
- * see http://joe.hotchkiss.com/programming/eval/eval.html
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-
-#ifndef NAN
- #define NAN 0
-#endif
-
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-
-#define STACK_SIZE 100
-
-typedef struct Parser{
- double stack[STACK_SIZE];
- int stack_index;
- char *s;
- double *const_value;
- const char **const_name; // NULL terminated
- double (**func1)(void *, double a); // NULL terminated
- const char **func1_name; // NULL terminated
- double (**func2)(void *, double a, double b); // NULL terminated
- char **func2_name; // NULL terminated
- void *opaque;
-} Parser;
-
-static void evalExpression(Parser *p);
-
-static void push(Parser *p, double d){
- if(p->stack_index+1>= STACK_SIZE){
- fprintf(stderr, "stack overflow in the parser\n");
- return;
- }
- p->stack[ p->stack_index++ ]= d;
-//printf("push %f\n", d); fflush(stdout);
-}
-
-static double pop(Parser *p){
- if(p->stack_index<=0){
- fprintf(stderr, "stack underflow in the parser\n");
- return NAN;
- }
-//printf("pop\n"); fflush(stdout);
- return p->stack[ --p->stack_index ];
-}
-
-static int strmatch(const char *s, const char *prefix){
- int i;
- for(i=0; prefix[i]; i++){
- if(prefix[i] != s[i]) return 0;
- }
- return 1;
-}
-
-static void evalPrimary(Parser *p){
- double d, d2=NAN;
- char *next= p->s;
- int i;
-
- /* number */
- d= strtod(p->s, &next);
- if(next != p->s){
- push(p, d);
- p->s= next;
- return;
- }
-
- /* named constants */
- for(i=0; p->const_name[i]; i++){
- if(strmatch(p->s, p->const_name[i])){
- push(p, p->const_value[i]);
- p->s+= strlen(p->const_name[i]);
- return;
- }
- }
-
- p->s= strchr(p->s, '(');
- if(p->s==NULL){
- fprintf(stderr, "Parser: missing ( in \"%s\"\n", next);
- return;
- }
- p->s++; // "("
- evalExpression(p);
- d= pop(p);
- if(p->s[0]== ','){
- p->s++; // ","
- evalExpression(p);
- d2= pop(p);
- }
- if(p->s[0] != ')'){
- fprintf(stderr, "Parser: missing ) in \"%s\"\n", next);
- return;
- }
- p->s++; // ")"
-
- if( strmatch(next, "sinh" ) ) d= sinh(d);
- else if( strmatch(next, "cosh" ) ) d= cosh(d);
- else if( strmatch(next, "tanh" ) ) d= tanh(d);
- else if( strmatch(next, "sin" ) ) d= sin(d);
- else if( strmatch(next, "cos" ) ) d= cos(d);
- else if( strmatch(next, "tan" ) ) d= tan(d);
- else if( strmatch(next, "exp" ) ) d= exp(d);
- else if( strmatch(next, "log" ) ) d= log(d);
- else if( strmatch(next, "squish") ) d= 1/(1+exp(4*d));
- else if( strmatch(next, "gauss" ) ) d= exp(-d*d/2)/sqrt(2*M_PI);
- else if( strmatch(next, "abs" ) ) d= fabs(d);
- else if( strmatch(next, "max" ) ) d= d > d2 ? d : d2;
- else if( strmatch(next, "min" ) ) d= d < d2 ? d : d2;
- else if( strmatch(next, "gt" ) ) d= d > d2 ? 1.0 : 0.0;
- else if( strmatch(next, "gte" ) ) d= d >= d2 ? 1.0 : 0.0;
- else if( strmatch(next, "lt" ) ) d= d > d2 ? 0.0 : 1.0;
- else if( strmatch(next, "lte" ) ) d= d >= d2 ? 0.0 : 1.0;
- else if( strmatch(next, "eq" ) ) d= d == d2 ? 1.0 : 0.0;
-// else if( strmatch(next, "l1" ) ) d= 1 + d2*(d - 1);
-// else if( strmatch(next, "sq01" ) ) d= (d >= 0.0 && d <=1.0) ? 1.0 : 0.0;
- else{
- int error=1;
- for(i=0; p->func1_name && p->func1_name[i]; i++){
- if(strmatch(next, p->func1_name[i])){
- d= p->func1[i](p->opaque, d);
- error=0;
- break;
- }
- }
-
- for(i=0; p->func2_name && p->func2_name[i]; i++){
- if(strmatch(next, p->func2_name[i])){
- d= p->func2[i](p->opaque, d, d2);
- error=0;
- break;
- }
- }
-
- if(error){
- fprintf(stderr, "Parser: unknown function in \"%s\"\n", next);
- return;
- }
- }
-
- push(p, d);
-}
-
-static void evalPow(Parser *p){
- int neg= 0;
- if(p->s[0]=='+') p->s++;
-
- if(p->s[0]=='-'){
- neg= 1;
- p->s++;
- }
-
- if(p->s[0]=='('){
- p->s++;;
- evalExpression(p);
-
- if(p->s[0]!=')')
- fprintf(stderr, "Parser: missing )\n");
- p->s++;
- }else{
- evalPrimary(p);
- }
-
- if(neg) push(p, -pop(p));
-}
-
-static void evalFactor(Parser *p){
- evalPow(p);
- while(p->s[0]=='^'){
- double d;
-
- p->s++;
- evalPow(p);
- d= pop(p);
- push(p, pow(pop(p), d));
- }
-}
-
-static void evalTerm(Parser *p){
- evalFactor(p);
- while(p->s[0]=='*' || p->s[0]=='/'){
- int inv= p->s[0]=='/';
- double d;
-
- p->s++;
- evalFactor(p);
- d= pop(p);
- if(inv) d= 1.0/d;
- push(p, d * pop(p));
- }
-}
-
-static void evalExpression(Parser *p){
- evalTerm(p);
- while(p->s[0]=='+' || p->s[0]=='-'){
- int sign= p->s[0]=='-';
- double d;
-
- p->s++;
- evalTerm(p);
- d= pop(p);
- if(sign) d= -d;
- push(p, d + pop(p));
- }
-}
-
-double x264_eval(char *s, double *const_value, const char **const_name,
- double (**func1)(void *, double), const char **func1_name,
- double (**func2)(void *, double, double), char **func2_name,
- void *opaque){
- Parser p;
-
- p.stack_index=0;
- p.s= s;
- p.const_value= const_value;
- p.const_name = const_name;
- p.func1 = func1;
- p.func1_name = func1_name;
- p.func2 = func2;
- p.func2_name = func2_name;
- p.opaque = opaque;
-
- evalExpression(&p);
- return pop(&p);
-}

Changed: x264-snapshot-20080829-2245.tar.bz2

Changed: x264-snapshot-20081001-2245.tar.bz2/.git/index

Added: x264-snapshot-20081001-2245.tar.bz2/.git/objects/pack/pack-cbee77041ce0953f87213f3295f7bddd63f94b6d.idx

Added: x264-snapshot-20081001-2245.tar.bz2/.git/objects/pack/pack-cbee77041ce0953f87213f3295f7bddd63f94b6d.pack

Changed: x264-snapshot-20081001-2245.tar.bz2/.git/refs/heads/master
@@ -1 +1 @@
-fd1de69b8054ef718b52f5ae1520267a5e5402e8
+2324c7074585b8b3f56e49ae41df9cbca06f6185

Changed: x264-snapshot-20081001-2245.tar.bz2/.git/refs/heads/origin
@@ -1 +1 @@
-fd1de69b8054ef718b52f5ae1520267a5e5402e8
+2324c7074585b8b3f56e49ae41df9cbca06f6185

Changed: x264-snapshot-20081001-2245.tar.bz2/AUTHORS
@@ -15,9 +15,6 @@
D: Motion estimation (subpel and mixed refs)
D: B-RDO
-N: Andrew Dunstan
-D: win64 asm port
-
N: bobololo
D: Avisynth input
D: MP4 muxing
@@ -26,9 +23,6 @@
E: sennindemokrit AT gmx DOT net
D: x86 asm
-N: Placeholder
-D: Altivec optimizations
-
N: Eric Petit
E: eric.petit AT lapsus DOT org
C: titer
@@ -36,9 +30,6 @@
D: BeOS and MacOS X ports.
S: France
-N: Francesco Corriga
-D: VfW
-
N: Gabriel Bouvigne
E: gabriel.bouvigne AT joost DOT com
D: 2pass VBV
@@ -54,12 +45,6 @@
D: various speed optimizations, bugfixes
S: USA
-N: Justin Clay
-E: justin_clay AT hotmail DOT com
-C: wheatgerm
-D: Inital work on VfW
-S: Nova Scotia, Canada
-
N: Laurent Aimar
E: fenrir AT via.ecp DOT fr
C: fenrir
@@ -96,7 +81,6 @@
C: chenm001
D: Win32/VC 6.0 port
D: gcc asm to nasm conversion
-D: VfW
S: China
N: Phil Jensen
@@ -107,10 +91,6 @@
E: radoslaw AT syskin DOT cjb DOT net
D: Cached motion compensation
-N: Riccardo Stievano
-E: walkunafraid AT tin DOT it
-D: VfW
-
N: Tuukka Toivonen
E: tuukkat AT ee DOT oulu DOT fi
D: Visualization

Changed: x264-snapshot-20081001-2245.tar.bz2/Makefile
@@ -10,7 +10,7 @@
common/quant.c common/vlc.c \
encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
encoder/set.c encoder/macroblock.c encoder/cabac.c \
- encoder/cavlc.c encoder/encoder.c encoder/eval.c
+ encoder/cavlc.c encoder/encoder.c
SRCCLI = x264.c matroska.c muxers.c
@@ -161,7 +161,7 @@
ifeq ($(SYS),MINGW)
$(if $(SONAME), install -m 755 $(SONAME) $(DESTDIR)$(bindir))
else
- $(if $(SONAME), ln -sf $(SONAME) $(DESTDIR)$(libdir)/libx264.so)
+ $(if $(SONAME), ln -sf $(SONAME) $(DESTDIR)$(libdir)/libx264.$(SOSUFFIX))
$(if $(SONAME), install -m 755 $(SONAME) $(DESTDIR)$(libdir))
endif
$(if $(IMPLIBNAME), install -m 644 $(IMPLIBNAME) $(DESTDIR)$(libdir))
@@ -172,7 +172,7 @@
uninstall:
rm -f $(DESTDIR)$(includedir)/x264.h $(DESTDIR)$(libdir)/libx264.a
rm -f $(DESTDIR)$(bindir)/x264 $(DESTDIR)$(libdir)/pkgconfig/x264.pc
- $(if $(SONAME), rm -f $(DESTDIR)$(libdir)/$(SONAME) $(DESTDIR)$(libdir)/libx264.so)
+ $(if $(SONAME), rm -f $(DESTDIR)$(libdir)/$(SONAME) $(DESTDIR)$(libdir)/libx264.$(SOSUFFIX))
$(MAKE) -C gtk uninstall
etags: TAGS

Changed: x264-snapshot-20081001-2245.tar.bz2/common/bs.h
@@ -76,7 +76,11 @@
s->i_left -= i_count;
if( s->i_left <= 32 )
{
+#ifdef WORDS_BIGENDIAN
+ *(uint32_t*)s->p = s->cur_bits >> (32 - s->i_left);
+#else
*(uint32_t*)s->p = endian_fix( s->cur_bits << s->i_left );
+#endif
s->i_left += 32;
s->p += 4;
}

Changed: x264-snapshot-20081001-2245.tar.bz2/common/cabac.h
@@ -40,7 +40,7 @@
/* aligned for memcpy_aligned starting here */
DECLARE_ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
-
+
/* context */
uint8_t state[460];
} x264_cabac_t;

Changed: x264-snapshot-20081001-2245.tar.bz2/common/common.c
@@ -69,7 +69,7 @@
param->i_keyint_min = 25;
param->i_bframe = 0;
param->i_scenecut_threshold = 40;
- param->b_bframe_adaptive = 1;
+ param->i_bframe_adaptive = X264_B_ADAPT_FAST;
param->i_bframe_bias = 0;
param->b_bframe_pyramid = 0;
@@ -93,14 +93,13 @@
param->rc.i_qp_step = 4;
param->rc.f_ip_factor = 1.4;
param->rc.f_pb_factor = 1.3;
- param->rc.i_aq_mode = X264_AQ_GLOBAL;
+ param->rc.i_aq_mode = X264_AQ_VARIANCE;
param->rc.f_aq_strength = 1.0;
param->rc.b_stat_write = 0;
param->rc.psz_stat_out = "x264_2pass.log";
param->rc.b_stat_read = 0;
param->rc.psz_stat_in = "x264_2pass.log";
- param->rc.psz_rc_eq = "blurCplx^(1-qComp)";
param->rc.f_qcompress = 0.6;
param->rc.f_qblur = 0.5;
param->rc.f_complexity_blur = 20;
@@ -117,8 +116,10 @@
| X264_ANALYSE_PSUB16x16 | X264_ANALYSE_BSUB16x16;
param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
param->analyse.i_me_method = X264_ME_HEX;
+ param->analyse.f_psy_rd = 1.0;
+ param->analyse.f_psy_trellis = 0;
param->analyse.i_me_range = 16;
- param->analyse.i_subpel_refine = 5;
+ param->analyse.i_subpel_refine = 6;
param->analyse.b_chroma_me = 1;
param->analyse.i_mv_range_thread = -1;
param->analyse.i_mv_range = -1; // set from level_idc
@@ -169,12 +170,12 @@
static int x264_atobool( const char *str, int *b_error )
{
- if( !strcmp(str, "1") ||
- !strcmp(str, "true") ||
+ if( !strcmp(str, "1") ||
+ !strcmp(str, "true") ||
!strcmp(str, "yes") )
return 1;
- if( !strcmp(str, "0") ||
- !strcmp(str, "false") ||
+ if( !strcmp(str, "0") ||
+ !strcmp(str, "false") ||
!strcmp(str, "no") )
return 0;
*b_error = 1;
@@ -329,7 +330,14 @@
OPT("bframes")
p->i_bframe = atoi(value);
OPT("b-adapt")
- p->b_bframe_adaptive = atobool(value);
+ {
+ p->i_bframe_adaptive = atobool(value);
+ if( b_error )
+ {
+ b_error = 0;
+ p->i_bframe_adaptive = atoi(value);
+ }
+ }
OPT("b-bias")
p->i_bframe_bias = atoi(value);
OPT("b-pyramid")
@@ -464,6 +472,21 @@
p->analyse.i_mv_range_thread = atoi(value);
OPT2("subme", "subq")
p->analyse.i_subpel_refine = atoi(value);
+ OPT("psy-rd")
+ {
+ if( 2 == sscanf( value, "%f:%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) ||
+ 2 == sscanf( value, "%f,%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) )
+ { }
+ else if( sscanf( value, "%f", &p->analyse.f_psy_rd ) )
+ {
+ p->analyse.f_psy_trellis = 0;
+ }
+ else
+ {
+ p->analyse.f_psy_rd = 0;
+ p->analyse.f_psy_trellis = 0;
+ }
+ }
OPT("bime")
p->analyse.b_bidir_me = atobool(value);
OPT("chroma-me")
@@ -532,8 +555,6 @@
p->rc.psz_stat_in = strdup(value);
p->rc.psz_stat_out = strdup(value);
}
- OPT("rceq")
- p->rc.psz_rc_eq = strdup(value);
OPT("qcomp")
p->rc.f_qcompress = atof(value);
OPT("qblur")
@@ -644,7 +665,6 @@
uint8_t *dst = p_data;
uint8_t *src = nal->p_payload;
uint8_t *end = &nal->p_payload[nal->i_payload];
-
int i_count = 0;
/* FIXME this code doesn't check overflow */
@@ -669,13 +689,9 @@
i_count = 0;
}
if( *src == 0 )
- {
i_count++;
- }
else
- {
i_count = 0;
- }
*dst++ = *src++;
}
*pi_data = dst - (uint8_t*)p_data;
@@ -683,37 +699,6 @@
return *pi_data;
}
-/****************************************************************************
- * x264_nal_decode:
- ****************************************************************************/
-int x264_nal_decode( x264_nal_t *nal, void *p_data, int i_data )
-{
- uint8_t *src = p_data;
- uint8_t *end = &src[i_data];
- uint8_t *dst = nal->p_payload;
-
- nal->i_type = src[0]&0x1f;
- nal->i_ref_idc = (src[0] >> 5)&0x03;
-
- src++;
-
- while( src < end )
- {
- if( src < end - 3 && src[0] == 0x00 && src[1] == 0x00 && src[2] == 0x03 )
- {
- *dst++ = 0x00;
- *dst++ = 0x00;
-
- src += 3;
- continue;
- }
- *dst++ = *src++;
- }
-
- nal->i_payload = dst - (uint8_t*)p_data;
- return 0;
-}
-
/****************************************************************************
@@ -856,6 +841,7 @@
s += sprintf( s, " analyse=%#x:%#x", p->analyse.intra, p->analyse.inter );
s += sprintf( s, " me=%s", x264_motion_est_names[ p->analyse.i_me_method ] );
s += sprintf( s, " subme=%d", p->analyse.i_subpel_refine );
+ s += sprintf( s, " psy_rd=%.1f:%.1f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis );
s += sprintf( s, " brdo=%d", p->analyse.b_bframe_rdo );
s += sprintf( s, " mixed_ref=%d", p->analyse.b_mixed_references );
s += sprintf( s, " me_range=%d", p->analyse.i_me_range );
@@ -874,7 +860,7 @@
if( p->i_bframe )
{
s += sprintf( s, " b_pyramid=%d b_adapt=%d b_bias=%d direct=%d wpredb=%d bime=%d",
- p->b_bframe_pyramid, p->b_bframe_adaptive, p->i_bframe_bias,
+ p->b_bframe_pyramid, p->i_bframe_adaptive, p->i_bframe_bias,
p->analyse.i_direct_mv_pred, p->analyse.b_weighted_bipred,
p->analyse.b_bidir_me );
}
@@ -893,9 +879,8 @@
else
s += sprintf( s, " bitrate=%d ratetol=%.1f",
p->rc.i_bitrate, p->rc.f_rate_tolerance );
- s += sprintf( s, " rceq='%s' qcomp=%.2f qpmin=%d qpmax=%d qpstep=%d",
- p->rc.psz_rc_eq, p->rc.f_qcompress,
- p->rc.i_qp_min, p->rc.i_qp_max, p->rc.i_qp_step );
+ s += sprintf( s, " qcomp=%.2f qpmin=%d qpmax=%d qpstep=%d",
+ p->rc.f_qcompress, p->rc.i_qp_min, p->rc.i_qp_max, p->rc.i_qp_step );
if( p->rc.b_stat_read )
s += sprintf( s, " cplxblur=%.1f qblur=%.1f",
p->rc.f_complexity_blur, p->rc.f_qblur );

Changed: x264-snapshot-20081001-2245.tar.bz2/common/common.h
@@ -262,6 +262,8 @@
int i_frame_size;
} out;
+ /**** thread synchronization starts here ****/
+
/* frame number/poc */
int i_frame;
@@ -294,6 +296,8 @@
uint16_t (*quant4_bias[4])[16]; /* [4][52][16] */
uint16_t (*quant8_bias[2])[64]; /* [2][52][64] */
+ const uint8_t *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */
+
DECLARE_ALIGNED_16( uint32_t nr_residual_sum[2][64] );
DECLARE_ALIGNED_16( uint16_t nr_offset[2][64] );
uint32_t nr_count[2];
@@ -307,11 +311,11 @@
struct
{
/* Frames to be encoded (whose types have been decided) */
- x264_frame_t *current[X264_BFRAME_MAX+3];
+ x264_frame_t *current[X264_BFRAME_MAX*4+3];
/* Temporary buffer (frames types not yet decided) */
- x264_frame_t *next[X264_BFRAME_MAX+3];
+ x264_frame_t *next[X264_BFRAME_MAX*4+3];
/* Unused frames */
- x264_frame_t *unused[X264_BFRAME_MAX + X264_THREAD_MAX*2 + 16+4];
+ x264_frame_t *unused[X264_BFRAME_MAX*4 + X264_THREAD_MAX*2 + 16+4];
/* For adaptive B decision */
x264_frame_t *last_nonb;
@@ -370,13 +374,15 @@
int i_mb_xy;
int i_b8_xy;
int i_b4_xy;
-
+
/* Search parameters */
int i_me_method;
int i_subpel_refine;
int b_chroma_me;
int b_trellis;
int b_noise_reduction;
+ int i_psy_rd; /* Psy RD strength--fixed point value*/
+ int i_psy_trellis; /* Psy trellis strength--fixed point value*/
int b_interlaced;
@@ -395,13 +401,17 @@
unsigned int i_neighbour;
unsigned int i_neighbour8[4]; /* neighbours of each 8x8 or 4x4 block that are available */
unsigned int i_neighbour4[16]; /* at the time the block is coded */
- int i_mb_type_top;
- int i_mb_type_left;
- int i_mb_type_topleft;
- int i_mb_type_topright;
+ int i_mb_type_top;
+ int i_mb_type_left;
+ int i_mb_type_topleft;
+ int i_mb_type_topright;
int i_mb_prev_xy;
int i_mb_top_xy;
+ /**** thread synchronization ends here ****/
+ /* subsequent variables are either thread-local or constant,
+ * and won't be copied from one thread to another */
+
/* mb table */
int8_t *type; /* mb type */
int8_t *qp; /* mb qp */
@@ -448,14 +458,26 @@
DECLARE_ALIGNED_16( uint8_t fenc_buf[24*FENC_STRIDE] );
DECLARE_ALIGNED_16( uint8_t fdec_buf[27*FDEC_STRIDE] );
- /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
+ /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
DECLARE_ALIGNED_16( uint8_t i4x4_fdec_buf[16*16] );
DECLARE_ALIGNED_16( uint8_t i8x8_fdec_buf[16*16] );
DECLARE_ALIGNED_16( int16_t i8x8_dct_buf[3][64] );
DECLARE_ALIGNED_16( int16_t i4x4_dct_buf[15][16] );
+ /* Psy trellis DCT data */
+ DECLARE_ALIGNED_16( int16_t fenc_dct8[4][64] );
+ DECLARE_ALIGNED_16( int16_t fenc_dct4[16][16] );
+
+ /* Psy RD SATD scores */
+ int fenc_satd[4][4];
+ int fenc_satd_sum;
+ int fenc_sa8d[2][2];
+ int fenc_sa8d_sum;
+
/* pointer over mb of the frame to be compressed */
uint8_t *p_fenc[3];
+ /* pointer to the actual source frame, not a block copy */
+ uint8_t *p_fenc_plane[3];
/* pointer over mb of the frame to be reconstructed */
uint8_t *p_fdec[3];
@@ -524,11 +546,10 @@
/* Current frame stats */
struct
{
- /* Headers bits (MV+Ref+MB Block Type */
- int i_hdr_bits;
- /* Texture bits (Intra/Predicted) */
- int i_itex_bits;
- int i_ptex_bits;
+ /* MV bits (MV+Ref+Block Type) */
+ int i_mv_bits;
+ /* Texture bits (DCT coefs) */
+ int i_tex_bits;
/* ? */
int i_misc_bits;
/* MB type counts */
@@ -559,7 +580,7 @@
double f_slice_qp[5];
int i_consecutive_bframes[X264_BFRAME_MAX+1];
/* */
- int64_t i_sqe_global[5];
+ int64_t i_ssd_global[5];
double f_psnr_average[5];
double f_psnr_mean_y[5];
double f_psnr_mean_u[5];

Changed: x264-snapshot-20081001-2245.tar.bz2/common/dct.c
@@ -460,45 +460,62 @@
// gcc pessimizes multi-dimensional arrays here, even with constant indices
#define ZIG(i,y,x) level[i] = dct[0][x*8+y];
+#define ZIGZAG8_FRAME\
+ ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
+ ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
+ ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
+ ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
+ ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
+ ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
+ ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
+ ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
+ ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
+ ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
+ ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
+ ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
+ ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
+ ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
+ ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
+ ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)\
+
+#define ZIGZAG8_FIELD\
+ ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
+ ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
+ ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
+ ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
+ ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
+ ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
+ ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
+ ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
+ ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
+ ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
+ ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
+ ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
+ ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
+ ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
+ ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
+ ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
+
+#define ZIGZAG4_FRAME\
+ ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
+ ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
+ ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
+ ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
+
+#define ZIGZAG4_FIELD\
+ ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
+ ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
+ ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
+ ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
{
- ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
- ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
- ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)
- ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)
- ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)
- ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)
- ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)
- ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)
- ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)
- ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)
- ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)
- ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)
- ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)
- ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)
- ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)
- ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)
+ ZIGZAG8_FRAME
}
static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
{
- ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)
- ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)
- ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)
- ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)
- ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)
- ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)
- ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)
- ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)
- ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)
- ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)
- ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)
- ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)
- ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)
- ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)
- ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)
- ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
+ ZIGZAG8_FIELD
}
#undef ZIG
@@ -506,10 +523,7 @@
static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
{
- ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
- ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
- ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)
- ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
+ ZIGZAG4_FRAME
}
static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
@@ -531,26 +545,40 @@
*(uint32_t*)(p_dst+0*FDEC_STRIDE) = *(uint32_t*)(p_src+0*FENC_STRIDE);\
*(uint32_t*)(p_dst+1*FDEC_STRIDE) = *(uint32_t*)(p_src+1*FENC_STRIDE);\
*(uint32_t*)(p_dst+2*FDEC_STRIDE) = *(uint32_t*)(p_src+2*FENC_STRIDE);\
- *(uint32_t*)(p_dst+3*FDEC_STRIDE) = *(uint32_t*)(p_src+3*FENC_STRIDE);\
+ *(uint32_t*)(p_dst+3*FDEC_STRIDE) = *(uint32_t*)(p_src+3*FENC_STRIDE);
+#define COPY8x8\
+ *(uint64_t*)(p_dst+0*FDEC_STRIDE) = *(uint64_t*)(p_src+0*FENC_STRIDE);\
+ *(uint64_t*)(p_dst+1*FDEC_STRIDE) = *(uint64_t*)(p_src+1*FENC_STRIDE);\
+ *(uint64_t*)(p_dst+2*FDEC_STRIDE) = *(uint64_t*)(p_src+2*FENC_STRIDE);\
+ *(uint64_t*)(p_dst+3*FDEC_STRIDE) = *(uint64_t*)(p_src+3*FENC_STRIDE);\
+ *(uint64_t*)(p_dst+4*FDEC_STRIDE) = *(uint64_t*)(p_src+4*FENC_STRIDE);\
+ *(uint64_t*)(p_dst+5*FDEC_STRIDE) = *(uint64_t*)(p_src+5*FENC_STRIDE);\
+ *(uint64_t*)(p_dst+6*FDEC_STRIDE) = *(uint64_t*)(p_src+6*FENC_STRIDE);\
+ *(uint64_t*)(p_dst+7*FDEC_STRIDE) = *(uint64_t*)(p_src+7*FENC_STRIDE);
static void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
{
- ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
- ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
- ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)
- ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
+ ZIGZAG4_FRAME
COPY4x4
}
static void zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
{
- ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)
- ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)
- ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)
- ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
+ ZIGZAG4_FIELD
COPY4x4
}
+static void zigzag_sub_8x8_frame( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
+{
+ ZIGZAG8_FRAME
+ COPY8x8
+}
+static void zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
+{
+ ZIGZAG8_FIELD
+ COPY8x8
+}
+
#undef ZIG
#undef COPY4x4
@@ -560,6 +588,7 @@
{
pf->scan_8x8 = zigzag_scan_8x8_field;
pf->scan_4x4 = zigzag_scan_4x4_field;
+ pf->sub_8x8 = zigzag_sub_8x8_field;
pf->sub_4x4 = zigzag_sub_4x4_field;
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMXEXT )
@@ -575,10 +604,22 @@
{
pf->scan_8x8 = zigzag_scan_8x8_frame;
pf->scan_4x4 = zigzag_scan_4x4_frame;
+ pf->sub_8x8 = zigzag_sub_8x8_frame;
pf->sub_4x4 = zigzag_sub_4x4_frame;
#ifdef HAVE_MMX
+ if( cpu&X264_CPU_MMX )
+ pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
+ if( cpu&X264_CPU_MMXEXT )
+ pf->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
+ if( cpu&X264_CPU_SSE2_IS_FAST )
+ pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
if( cpu&X264_CPU_SSSE3 )
- pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
+ {
+ pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
+ pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
+ }
+ if( cpu&X264_CPU_PHADD_IS_FAST )
+ pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
#endif
#ifdef ARCH_PPC

Changed: x264-snapshot-20081001-2245.tar.bz2/common/dct.h
@@ -41,6 +41,17 @@
};
#undef W
+#define W(i) (i==0 ? FIX8(1.76777) :\
+ i==1 ? FIX8(1.11803) :\
+ i==2 ? FIX8(0.70711) :0)
+static const uint16_t x264_dct4_weight_tab[16] = {
+ W(0), W(1), W(0), W(1),
+ W(1), W(2), W(1), W(2),
+ W(0), W(1), W(0), W(1),
+ W(1), W(2), W(1), W(2)
+};
+#undef W
+
/* inverse squared */
#define W(i) (i==0 ? FIX8(3.125) :\
i==1 ? FIX8(1.25) :\
@@ -107,6 +118,7 @@
{
void (*scan_8x8)( int16_t level[64], int16_t dct[8][8] );
void (*scan_4x4)( int16_t level[16], int16_t dct[4][4] );
+ void (*sub_8x8)( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst );
void (*sub_4x4)( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst );
} x264_zigzag_function_t;

Changed: x264-snapshot-20081001-2245.tar.bz2/common/frame.c
@@ -77,6 +77,14 @@
CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
for( i = 0; i < 4; i++ )
frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size;
+
+ for( j = 0; j <= !!h->param.i_bframe; j++ )
+ for( i = 0; i <= h->param.i_bframe; i++ )
+ {
+ CHECKED_MALLOC( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
+ memset( frame->lowres_mvs[j][i], 0, 2*h->mb.i_mb_count*sizeof(int16_t) );
+ CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
+ }
}
if( h->param.analyse.i_me_method >= X264_ME_ESA )
@@ -97,6 +105,7 @@
CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
+ CHECKED_MALLOC( frame->i_intra_cost, i_mb_count * sizeof(uint16_t) );
if( h->param.i_bframe )
{
CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
@@ -114,6 +123,9 @@
for( j = 0; j < h->param.i_bframe + 2; j++ )
CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
+ if( h->param.rc.i_aq_mode )
+ CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
+
x264_pthread_mutex_init( &frame->mutex, NULL );
x264_pthread_cond_init( &frame->cv, NULL );
@@ -134,6 +146,13 @@
for( i = 0; i < X264_BFRAME_MAX+2; i++ )
for( j = 0; j < X264_BFRAME_MAX+2; j++ )
x264_free( frame->i_row_satds[i][j] );
+ for( j = 0; j < 2; j++ )
+ for( i = 0; i <= X264_BFRAME_MAX; i++ )
+ {
+ x264_free( frame->lowres_mvs[j][i] );
+ x264_free( frame->lowres_mv_costs[j][i] );
+ }
+ x264_free( frame->f_qp_offset );
x264_free( frame->i_row_bits );
x264_free( frame->i_row_qp );
x264_free( frame->mb_type );
@@ -233,7 +252,7 @@
void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
{
/* during filtering, 8 extra pixels were filtered on each edge,
- * but up to 3 of the horizontal ones may be wrong.
+ * but up to 3 of the horizontal ones may be wrong.
we want to expand border from the last filtered pixel */
int b_start = !mb_y;
int stride = frame->i_stride[0];
@@ -297,7 +316,7 @@
/* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
* entropy coding, but per 64 coeffs for the purpose of deblocking */
-void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
+static void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
{
uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
@@ -338,82 +357,86 @@
/* Deblocking filter */
-
-static const int i_alpha_table[52] =
+static const uint8_t i_alpha_table[52+12*2] =
{
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 4, 4, 5, 6,
7, 8, 9, 10, 12, 13, 15, 17, 20, 22,
25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
80, 90,101,113,127,144,162,182,203,226,
- 255, 255
+ 255,255,
+ 255,255,255,255,255,255,255,255,255,255,255,255,
};
-static const int i_beta_table[52] =
+static const uint8_t i_beta_table[52+12*2] =
{
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 2, 2, 2, 3,
3, 3, 3, 4, 4, 4, 6, 6, 7, 7,
8, 8, 9, 9, 10, 10, 11, 11, 12, 12,
13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
- 18, 18
+ 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
};
-static const int i_tc0_table[52][3] =
+static const int8_t i_tc0_table[52+12*2][4] =
{
- { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
- { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
- { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 1 },
- { 0, 0, 1 }, { 0, 0, 1 }, { 0, 0, 1 }, { 0, 1, 1 }, { 0, 1, 1 }, { 1, 1, 1 },
- { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 2 }, { 1, 1, 2 }, { 1, 1, 2 },
- { 1, 1, 2 }, { 1, 2, 3 }, { 1, 2, 3 }, { 2, 2, 3 }, { 2, 2, 4 }, { 2, 3, 4 },
- { 2, 3, 4 }, { 3, 3, 5 }, { 3, 4, 6 }, { 3, 4, 6 }, { 4, 5, 7 }, { 4, 5, 8 },
- { 4, 6, 9 }, { 5, 7,10 }, { 6, 8,11 }, { 6, 8,13 }, { 7,10,14 }, { 8,11,16 },
- { 9,12,18 }, {10,13,20 }, {11,15,23 }, {13,17,25 }
+ {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+ {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+ {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+ {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+ {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
+ {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
+ {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
+ {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
+ {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
+ {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
+ {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
+ {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+ {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
};
+#define alpha_table(x) i_alpha_table[(x)+12]
+#define beta_table(x) i_beta_table[(x)+12]
+#define tc0_table(x) i_tc0_table[(x)+12]
/* From ffmpeg */
-static inline int clip_uint8( int a )
-{
- if (a&(~255))
- return (-a)>>31;
- else
- return a;
-}
-
static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
int i, d;
- for( i = 0; i < 4; i++ ) {
- if( tc0[i] < 0 ) {
+ for( i = 0; i < 4; i++ )
+ {
+ if( tc0[i] < 0 )
+ {
pix += 4*ystride;
continue;
}
- for( d = 0; d < 4; d++ ) {
+ for( d = 0; d < 4; d++ )
+ {
const int p2 = pix[-3*xstride];
const int p1 = pix[-2*xstride];
const int p0 = pix[-1*xstride];
const int q0 = pix[ 0*xstride];
const int q1 = pix[ 1*xstride];
const int q2 = pix[ 2*xstride];
-
- if( abs( p0 - q0 ) < alpha &&
- abs( p1 - p0 ) < beta &&
- abs( q1 - q0 ) < beta ) {
-
+
+ if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+ {
int tc = tc0[i];
int delta;
-
- if( abs( p2 - p0 ) < beta ) {
+ if( abs( p2 - p0 ) < beta )
+ {
pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
- tc++;
+ tc++;
}
- if( abs( q2 - q0 ) < beta ) {
+ if( abs( q2 - q0 ) < beta )
+ {
pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
tc++;
}
-
+
delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
- pix[-1*xstride] = clip_uint8( p0 + delta ); /* p0' */
- pix[ 0*xstride] = clip_uint8( q0 - delta ); /* q0' */
+ pix[-1*xstride] = x264_clip_uint8( p0 + delta ); /* p0' */
+ pix[ 0*xstride] = x264_clip_uint8( q0 - delta ); /* q0' */
}
pix += ystride;
}
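An illustrative aside on the padded alpha/beta/tc0 tables above (not part of the patch): H.264 confines the slice alpha_c0/beta offsets to -12..12, so replicating the first and last table entries twelve times on each side makes the old x264_clip3() of the QP index unnecessary. A minimal standalone check of that equivalence, with placeholder table contents and a local clip3 helper:

#include <assert.h>
#include <string.h>

/* placeholder contents; only the 12-entry edge replication matters here */
static unsigned char base[52];
static unsigned char padded[52 + 12*2];
#define padded_lookup(x) padded[(x) + 12]

static int clip3( int v, int lo, int hi )
{
    return v < lo ? lo : v > hi ? hi : v;
}

int main( void )
{
    int i, qp, off;
    for( i = 0; i < 52; i++ )
        base[i] = (unsigned char)i;
    memset( padded, base[0], 12 );            /* replicate the first entry */
    memcpy( padded + 12, base, 52 );
    memset( padded + 12 + 52, base[51], 12 ); /* replicate the last entry */
    /* the slice offsets are confined to [-12,12], so no clamp is needed */
    for( qp = 0; qp <= 51; qp++ )
        for( off = -12; off <= 12; off++ )
            assert( padded_lookup( qp + off ) == base[clip3( qp + off, 0, 51 )] );
    return 0;
}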
@@ -421,7 +444,7 @@
}
static void deblock_v_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
- deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
+ deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
}
static void deblock_h_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
@@ -431,43 +454,45 @@
static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
int i, d;
- for( i = 0; i < 4; i++ ) {
+ for( i = 0; i < 4; i++ )
+ {
const int tc = tc0[i];
- if( tc <= 0 ) {
+ if( tc <= 0 )
+ {
pix += 2*ystride;
continue;
}
- for( d = 0; d < 2; d++ ) {
+ for( d = 0; d < 2; d++ )
+ {
const int p1 = pix[-2*xstride];
const int p0 = pix[-1*xstride];
const int q0 = pix[ 0*xstride];
const int q1 = pix[ 1*xstride];
- if( abs( p0 - q0 ) < alpha &&
- abs( p1 - p0 ) < beta &&
- abs( q1 - q0 ) < beta ) {
-
+ if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+ {
int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
- pix[-1*xstride] = clip_uint8( p0 + delta ); /* p0' */
- pix[ 0*xstride] = clip_uint8( q0 - delta ); /* q0' */
+ pix[-1*xstride] = x264_clip_uint8( p0 + delta ); /* p0' */
+ pix[ 0*xstride] = x264_clip_uint8( q0 - delta ); /* q0' */
}
pix += ystride;
}
}
}
static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
-{
+{
deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 );
}
static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
-{
+{
deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 );
}
static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
{
int d;
- for( d = 0; d < 16; d++ ) {
+ for( d = 0; d < 16; d++ )
+ {
const int p2 = pix[-3*xstride];
const int p1 = pix[-2*xstride];
const int p0 = pix[-1*xstride];
@@ -475,35 +500,31 @@
const int q1 = pix[ 1*xstride];
const int q2 = pix[ 2*xstride];
- if( abs( p0 - q0 ) < alpha &&
- abs( p1 - p0 ) < beta &&
- abs( q1 - q0 ) < beta ) {
-
- if(abs( p0 - q0 ) < ((alpha >> 2) + 2) ){
- if( abs( p2 - p0 ) < beta)
+ if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+ {
+ if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )
+ {
+ if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
{
const int p3 = pix[-4*xstride];
- /* p0', p1', p2' */
pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
- } else {
- /* p0' */
- pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
}
- if( abs( q2 - q0 ) < beta)
+ else /* p0' */
+ pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+ if( abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
{
const int q3 = pix[3*xstride];
- /* q0', q1', q2' */
pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
- } else {
- /* q0' */
- pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
}
- }else{
- /* p0', q0' */
+ else /* q0' */
+ pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+ }
+ else /* p0', q0' */
+ {
pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
}
@@ -512,59 +533,72 @@
}
}
static void deblock_v_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
-{
+{
deblock_luma_intra_c( pix, stride, 1, alpha, beta );
}
static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
-{
+{
deblock_luma_intra_c( pix, 1, stride, alpha, beta );
}
static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
-{
- int d;
- for( d = 0; d < 8; d++ ) {
+{
+ int d;
+ for( d = 0; d < 8; d++ )
+ {
const int p1 = pix[-2*xstride];
const int p0 = pix[-1*xstride];
const int q0 = pix[ 0*xstride];
const int q1 = pix[ 1*xstride];
- if( abs( p0 - q0 ) < alpha &&
- abs( p1 - p0 ) < beta &&
- abs( q1 - q0 ) < beta ) {
-
+ if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+ {
pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2; /* p0' */
pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */
}
-
pix += ystride;
}
}
static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
-{
+{
deblock_chroma_intra_c( pix, stride, 1, alpha, beta );
}
static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
-{
+{
deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
}
-static inline void deblock_edge( x264_t *h, uint8_t *pix, int i_stride, int bS[4], int i_qp, int b_chroma,
- x264_deblock_inter_t pf_inter, x264_deblock_intra_t pf_intra )
+static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
{
- int i;
- const int index_a = x264_clip3( i_qp + h->sh.i_alpha_c0_offset, 0, 51 );
- const int alpha = i_alpha_table[index_a];
- const int beta = i_beta_table[x264_clip3( i_qp + h->sh.i_beta_offset, 0, 51 )];
-
- if( bS[0] < 4 ) {
- int8_t tc[4];
- for(i=0; i<4; i++)
- tc[i] = (bS[i] ? i_tc0_table[index_a][bS[i] - 1] : -1) + b_chroma;
- pf_inter( pix, i_stride, alpha, beta, tc );
- } else {
- pf_intra( pix, i_stride, alpha, beta );
- }
+ const int index_a = i_qp + h->sh.i_alpha_c0_offset;
+ const int alpha = alpha_table(index_a);
+ const int beta = beta_table(i_qp + h->sh.i_beta_offset);
+ int8_t tc[4];
+
+ if( !alpha || !beta )
+ return;
+
+ tc[0] = tc0_table(index_a)[bS[0]] + b_chroma;
+ tc[1] = tc0_table(index_a)[bS[1]] + b_chroma;
+ tc[2] = tc0_table(index_a)[bS[2]] + b_chroma;
+ tc[3] = tc0_table(index_a)[bS[3]] + b_chroma;
+
+ pf_inter( pix1, i_stride, alpha, beta, tc );
+ if( b_chroma )
+ pf_inter( pix2, i_stride, alpha, beta, tc );
+}
+
+static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
+{
+ const int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset);
+ const int beta = beta_table(i_qp + h->sh.i_beta_offset);
+
+ if( !alpha || !beta )
+ return;
+
+ pf_intra( pix1, i_stride, alpha, beta );
+ if( b_chroma )
+ pf_intra( pix2, i_stride, alpha, beta );
}
void x264_frame_deblock_row( x264_t *h, int mb_y )
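On the extra leading column of the tc0 table: the old code branched on bS[i] and substituted -1 when the strength was zero; storing -1 in column 0 lets the new deblock_edge() index the table with bS[i] directly. A small standalone check of the equivalence, using made-up values for one QP row (not part of the patch):

#include <assert.h>
#include <stdint.h>

static const int8_t old_row[3] = { 1, 2, 3 };     /* indexed by bS-1, arbitrary values */
static const int8_t new_row[4] = { -1, 1, 2, 3 }; /* same row with -1 prepended, indexed by bS */

int main( void )
{
    int bS, b_chroma;
    for( b_chroma = 0; b_chroma <= 1; b_chroma++ )
        for( bS = 0; bS < 4; bS++ )
            assert( (bS ? old_row[bS-1] : -1) + b_chroma == new_row[bS] + b_chroma );
    return 0;
}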
@@ -573,152 +607,159 @@
const int s4x4 = 4 * h->mb.i_mb_stride;
const int b_interlaced = h->sh.b_mbaff;
const int mvy_limit = 4 >> b_interlaced;
+ const int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
int mb_x;
-
- int i_stride2[3] = { h->fdec->i_stride[0] << b_interlaced,
- h->fdec->i_stride[1] << b_interlaced,
- h->fdec->i_stride[2] << b_interlaced };
+ int stridey = h->fdec->i_stride[0];
+ int stride2y = stridey << b_interlaced;
+ int strideuv = h->fdec->i_stride[1];
+ int stride2uv = strideuv << b_interlaced;
if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, munge_cavlc_nnz_row );
- for( mb_x = 0; mb_x < h->sps->i_mb_width; )
+ for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
{
const int mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
const int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
- const int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
- int i_edge;
-
- int i_pix_y[3] = { 16*mb_y*h->fdec->i_stride[0] + 16*mb_x,
- 8*mb_y*h->fdec->i_stride[1] + 8*mb_x,
- 8*mb_y*h->fdec->i_stride[2] + 8*mb_x };
+ const int i_qp = h->mb.qp[mb_xy];
+ int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
+ uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
+ uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
+ uint8_t *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
if( b_interlaced && (mb_y&1) )
{
- i_pix_y[0] -= 15*h->fdec->i_stride[0];
- i_pix_y[1] -= 7*h->fdec->i_stride[1];
- i_pix_y[2] -= 7*h->fdec->i_stride[2];
+ pixy -= 15*stridey;
+ pixu -= 7*strideuv;
+ pixv -= 7*strideuv;
}
x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
- /* i_dir == 0 -> vertical edge
- * i_dir == 1 -> horizontal edge */
+ if( i_qp <= qp_thresh )
+ i_edge_end = 1;
- #define deblock_dir(i_dir)\
+ #define FILTER_DIR(intra, i_dir)\
{\
- int i_start = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
- int i_qp, i_qpn;\
- for( i_edge = i_start; i_edge < i_edge_end; i_edge++ )\
+ /* Y plane */\
+ i_qpn= h->mb.qp[mbn_xy];\
+ if( i_dir == 0 )\
{\
- int mbn_xy, mbn_8x8, mbn_4x4;\
- int bS[4]; /* filtering strength */\
- if( b_8x8_transform && (i_edge&1) )\
- continue;\
- mbn_xy = i_edge > 0 ? mb_xy : ( i_dir == 0 ? mb_xy - 1 : mb_xy - h->mb.i_mb_stride );\
- mbn_8x8 = i_edge > 0 ? mb_8x8 : ( i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8 );\
- mbn_4x4 = i_edge > 0 ? mb_4x4 : ( i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4 );\
- if( b_interlaced && i_edge == 0 && i_dir == 1 )\
+ /* vertical edge */\
+ deblock_edge##intra( h, pixy + 4*i_edge, NULL,\
+ stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
+ h->loopf.deblock_h_luma##intra );\
+ if( !(i_edge & 1) )\
{\
- mbn_xy -= h->mb.i_mb_stride;\
- mbn_8x8 -= 2 * s8x8;\
- mbn_4x4 -= 4 * s4x4;\
+ /* U/V planes */\
+ int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
+ deblock_edge##intra( h, pixu + 2*i_edge, pixv + 2*i_edge,\
+ stride2uv, bS, i_qpc, 1,\
+ h->loopf.deblock_h_chroma##intra );\
}\
- /* *** Get bS for each 4px for the current edge *** */\
- if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy] ) )\
- bS[0] = bS[1] = bS[2] = bS[3] = ( i_edge == 0 && !(b_interlaced && i_dir) ? 4 : 3 );\
- else\
+ }\
+ else\
+ {\
+ /* horizontal edge */\
+ deblock_edge##intra( h, pixy + 4*i_edge*stride2y, NULL,\
+ stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
+ h->loopf.deblock_v_luma##intra );\
+ /* U/V planes */\
+ if( !(i_edge & 1) )\
{\
- int i;\
- for( i = 0; i < 4; i++ )\
+ int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
+ deblock_edge##intra( h, pixu + 2*i_edge*stride2uv, pixv + 2*i_edge*stride2uv,\
+ stride2uv, bS, i_qpc, 1,\
+ h->loopf.deblock_v_chroma##intra );\
+ }\
+ }\
+ }
+
+ #define DEBLOCK_STRENGTH(i_dir)\
+ {\
+ /* *** Get bS for each 4px for the current edge *** */\
+ if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
+ *(uint32_t*)bS = 0x03030303;\
+ else\
+ {\
+ *(uint32_t*)bS = 0x00000000;\
+ for( i = 0; i < 4; i++ )\
+ {\
+ int x = i_dir == 0 ? i_edge : i;\
+ int y = i_dir == 0 ? i : i_edge;\
+ int xn = i_dir == 0 ? (x - 1)&0x03 : x;\
+ int yn = i_dir == 0 ? y : (y - 1)&0x03;\
+ if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
+ h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
+ bS[i] = 2;\
+ else\
{\
- int x = i_dir == 0 ? i_edge : i;\
- int y = i_dir == 0 ? i : i_edge;\
- int xn = (x - (i_dir == 0 ? 1 : 0 ))&0x03;\
- int yn = (y - (i_dir == 0 ? 0 : 1 ))&0x03;\
- if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
- h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
- {\
- bS[i] = 2;\
- }\
- else\
- {\
- /* FIXME: A given frame may occupy more than one position in\
- * the reference list. So we should compare the frame numbers,\
- * not the indices in the ref list.\
- * No harm yet, as we don't generate that case.*/\
- int i8p= mb_8x8+(x/2)+(y/2)*s8x8;\
- int i8q= mbn_8x8+(xn/2)+(yn/2)*s8x8;\
- int i4p= mb_4x4+x+y*s4x4;\
- int i4q= mbn_4x4+xn+yn*s4x4;\
- int l;\
- bS[i] = 0;\
- for( l = 0; l < 1 + (h->sh.i_type == SLICE_TYPE_B); l++ )\
+ /* FIXME: A given frame may occupy more than one position in\
+ * the reference list. So we should compare the frame numbers,\
+ * not the indices in the ref list.\
+ * No harm yet, as we don't generate that case.*/\
+ int i8p= mb_8x8+(x>>1)+(y>>1)*s8x8;\
+ int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
+ int i4p= mb_4x4+x+y*s4x4;\
+ int i4q= mbn_4x4+xn+yn*s4x4;\
+ for( l = 0; l < 1 + (h->sh.i_type == SLICE_TYPE_B); l++ )\
+ if( h->mb.ref[l][i8p] != h->mb.ref[l][i8q] ||\
+ abs( h->mb.mv[l][i4p][0] - h->mb.mv[l][i4q][0] ) >= 4 ||\
+ abs( h->mb.mv[l][i4p][1] - h->mb.mv[l][i4q][1] ) >= mvy_limit )\
{\
- if( h->mb.ref[l][i8p] != h->mb.ref[l][i8q] ||\
- abs( h->mb.mv[l][i4p][0] - h->mb.mv[l][i4q][0] ) >= 4 ||\
- abs( h->mb.mv[l][i4p][1] - h->mb.mv[l][i4q][1] ) >= mvy_limit )\
- {\
- bS[i] = 1;\
- break;\
- }\
+ bS[i] = 1;\
+ break;\
}\
- }\
}\
}\
- /* *** filter *** */\
- /* Y plane */\
- i_qp = h->mb.qp[mb_xy];\
- i_qpn= h->mb.qp[mbn_xy];\
- if( i_dir == 0 )\
+ }\
+ }
+
+ /* i_dir == 0 -> vertical edge
+ * i_dir == 1 -> horizontal edge */
+ #define DEBLOCK_DIR(i_dir)\
+ {\
+ int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
+ int i_qpn, i, l, mbn_xy, mbn_8x8, mbn_4x4;\
+ DECLARE_ALIGNED_4( uint8_t bS[4] ); /* filtering strength */\
+ if( i_edge )\
+ i_edge+= b_8x8_transform;\
+ else\
+ {\
+ mbn_xy = i_dir == 0 ? mb_xy - 1 : mb_xy - h->mb.i_mb_stride;\
+ mbn_8x8 = i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8;\
+ mbn_4x4 = i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4;\
+ if( b_interlaced && i_dir == 1 )\
{\
- /* vertical edge */\
- deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge],\
- i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0,\
- h->loopf.deblock_h_luma, h->loopf.deblock_h_luma_intra );\
- if( !(i_edge & 1) )\
- {\
- /* U/V planes */\
- int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +\
- i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;\
- deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge],\
- i_stride2[1], bS, i_qpc, 1,\
- h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra );\
- deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge],\
- i_stride2[2], bS, i_qpc, 1,\
- h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra );\
- }\
+ mbn_xy -= h->mb.i_mb_stride;\
+ mbn_8x8 -= 2 * s8x8;\
+ mbn_4x4 -= 4 * s4x4;\
}\
- else\
+ else if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
{\
- /* horizontal edge */\
- deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge*i_stride2[0]],\
- i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0,\
- h->loopf.deblock_v_luma, h->loopf.deblock_v_luma_intra );\
- /* U/V planes */\
- if( !(i_edge & 1) )\
- {\
- int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +\
- i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;\
- deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge*i_stride2[1]],\
- i_stride2[1], bS, i_qpc, 1,\
- h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra );\
- deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge*i_stride2[2]],\
- i_stride2[2], bS, i_qpc, 1,\
- h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra );\
- }\
+ FILTER_DIR( _intra, i_dir );\
+ goto end##i_dir;\
}\
+ DEBLOCK_STRENGTH(i_dir);\
+ if( *(uint32_t*)bS )\
+ FILTER_DIR( , i_dir);\
+ end##i_dir:\
+ i_edge += b_8x8_transform+1;\
+ }\
+ mbn_xy = mb_xy;\
+ mbn_8x8 = mb_8x8;\
+ mbn_4x4 = mb_4x4;\
+ for( ; i_edge < i_edge_end; i_edge+=b_8x8_transform+1 )\
+ {\
+ DEBLOCK_STRENGTH(i_dir);\
+ if( *(uint32_t*)bS )\
+ FILTER_DIR( , i_dir);\
}\
}
- deblock_dir(0);
- deblock_dir(1);
-
- /* next mb */
- if( !b_interlaced || (mb_y&1) )
- mb_x++;
- mb_y ^= b_interlaced;
+ DEBLOCK_DIR(0);
+ DEBLOCK_DIR(1);
}
if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
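The rewritten for() header above folds the old trailing "next mb" logic into the loop increment: without MBAFF, mb_x advances every iteration; with MBAFF, mb_x advances only on odd mb_y while mb_y toggles each pass. A standalone check that both traversals visit the same (mb_x, mb_y) sequence, starting both at mb_y = 0 for simplicity (illustrative only; the width is arbitrary):

#include <assert.h>

int main( void )
{
    int width = 4, b_interlaced;
    for( b_interlaced = 0; b_interlaced <= 1; b_interlaced++ )
    {
        int old_seq[16][2], new_seq[16][2];
        int n_old = 0, n_new = 0, mb_x, mb_y, i;
        /* old loop: increment at the bottom of the body */
        for( mb_x = 0, mb_y = 0; mb_x < width; )
        {
            old_seq[n_old][0] = mb_x;
            old_seq[n_old][1] = mb_y;
            n_old++;
            if( !b_interlaced || (mb_y&1) )
                mb_x++;
            mb_y ^= b_interlaced;
        }
        /* new loop: same logic folded into the for() header */
        for( mb_x = 0, mb_y = 0; mb_x < width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
        {
            new_seq[n_new][0] = mb_x;
            new_seq[n_new][1] = mb_y;
            n_new++;
        }
        assert( n_old == n_new );
        for( i = 0; i < n_old; i++ )
            assert( old_seq[i][0] == new_seq[i][0] && old_seq[i][1] == new_seq[i][1] );
    }
    return 0;
}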
@@ -748,12 +789,12 @@
void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
-void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
x264_deblock_v8_luma_mmxext( pix, stride, alpha, beta, tc0 );
x264_deblock_v8_luma_mmxext( pix+8, stride, alpha, beta, tc0+2 );
}
-void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
+static void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
{
x264_deblock_v8_luma_intra_mmxext( pix, stride, alpha, beta );
x264_deblock_v8_luma_intra_mmxext( pix+8, stride, alpha, beta );
@@ -884,6 +925,7 @@
frame = x264_frame_new( h );
assert( frame->i_reference_count == 0 );
frame->i_reference_count = 1;
+ frame->b_intra_calculated = 0;
return frame;
}
Changed | x264-snapshot-20081001-2245.tar.bz2/common/frame.h
@@ -62,6 +62,8 @@
/* motion data */
int8_t *mb_type;
int16_t (*mv[2])[2];
+ int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2];
+ int *lowres_mv_costs[2][X264_BFRAME_MAX+1];
int8_t *ref[2];
int i_ref[2];
int ref_poc[2][16];
@@ -71,17 +73,21 @@
* contains the SATD cost of the lowres frame encoded in various modes
* FIXME: how big an array do we need? */
int i_cost_est[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
+ int i_cost_est_aq[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
int i_satd; // the i_cost_est of the selected frametype
int i_intra_mbs[X264_BFRAME_MAX+2];
int *i_row_satds[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
int *i_row_satd;
int *i_row_bits;
int *i_row_qp;
+ float *f_qp_offset;
+ int b_intra_calculated;
+ uint16_t *i_intra_cost;
/* threading */
int i_lines_completed; /* in pixels */
int i_reference_count; /* number of threads using this frame (not necessarily the number of pointers) */
- x264_pthread_mutex_t mutex;
+ x264_pthread_mutex_t mutex;
x264_pthread_cond_t cv;
} x264_frame_t;
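For orientation, an illustrative sketch (not part of the patch) of how the new lowres fields are laid out: one array per prediction list and reference distance, with one entry per macroblock, matching the lowres_mvs[list][frame gap - 1] indexing used in macroblock.c. X264_BFRAME_MAX is assumed here to be the usual compile-time bound.

#include <stdint.h>

#define X264_BFRAME_MAX 16   /* assumption, for illustration only */

/* shape of the new lookahead MV storage */
typedef struct
{
    int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2]; /* [list][distance-1] -> per-MB {x,y} */
    int     *lowres_mv_costs[2][X264_BFRAME_MAX+1]; /* [list][distance-1] -> per-MB cost  */
} lowres_mv_storage_sketch;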
Changed | x264-snapshot-20081001-2245.tar.bz2/common/macroblock.c
@@ -24,71 +24,6 @@
#include "common.h"
-int x264_mb_predict_intra4x4_mode( x264_t *h, int idx )
-{
- const int ma = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 1];
- const int mb = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 8];
- const int m = X264_MIN( x264_mb_pred_mode4x4_fix(ma),
- x264_mb_pred_mode4x4_fix(mb) );
-
- if( m < 0 )
- return I_PRED_4x4_DC;
-
- return m;
-}
-
-int x264_mb_predict_non_zero_code( x264_t *h, int idx )
-{
- const int za = h->mb.cache.non_zero_count[x264_scan8[idx] - 1];
- const int zb = h->mb.cache.non_zero_count[x264_scan8[idx] - 8];
-
- int i_ret = za + zb;
-
- if( i_ret < 0x80 )
- {
- i_ret = ( i_ret + 1 ) >> 1;
- }
- return i_ret & 0x7f;
-}
-
-int x264_mb_transform_8x8_allowed( x264_t *h )
-{
- // intra and skip are disallowed
- // large partitions are allowed
- // direct and 8x8 are conditional
- static const uint8_t partition_tab[X264_MBTYPE_MAX] = {
- 0,0,0,0,1,2,0,2,1,1,1,1,1,1,1,1,1,2,0,
- };
- int p, i;
-
- if( !h->pps->b_transform_8x8_mode )
- return 0;
- p = partition_tab[h->mb.i_type];
- if( p < 2 )
- return p;
- else if( h->mb.i_type == B_DIRECT )
- return h->sps->b_direct8x8_inference;
- else if( h->mb.i_type == P_8x8 )
- {
- if( !(h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
- return 1;
- for( i=0; i<4; i++ )
- if( h->mb.i_sub_partition[i] != D_L0_8x8 )
- return 0;
- return 1;
- }
- else // B_8x8
- {
- // x264 currently doesn't use sub-8x8 B partitions, so don't check for them
- if( h->sps->b_direct8x8_inference )
- return 1;
- for( i=0; i<4; i++ )
- if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
- return 0;
- return 1;
- }
-}
-
void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] )
{
const int i8 = x264_scan8[idx];
@@ -223,9 +158,9 @@
int i8, i4;
int b8x8;
const int type_col = h->fref1[0]->mb_type[ h->mb.i_mb_xy ];
-
+
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 );
-
+
if( IS_INTRA( type_col ) )
{
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
@@ -338,7 +273,7 @@
if( ref[0] < 0 && ref[1] < 0 )
{
- ref[0] =
+ ref[0] =
ref[1] = 0;
*(uint64_t*)mv[0] = 0;
}
@@ -481,7 +416,7 @@
}
/* This just improves encoder performance, it's not part of the spec */
-void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[8][2], int *i_mvc )
+void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[9][2], int *i_mvc )
{
int16_t (*mvr)[2] = h->mb.mvr[i_list][i_ref];
int i = 0;
@@ -498,6 +433,13 @@
SET_MVP( h->mb.cache.mv[i_list][x264_scan8[12]] );
}
+ if( i_ref == 0 && h->frames.b_have_lowres )
+ {
+ int16_t (*lowres_mv)[2] = i_list ? h->fenc->lowres_mvs[1][h->fref1[0]->i_frame-h->fenc->i_frame-1]
+ : h->fenc->lowres_mvs[0][h->fenc->i_frame-h->fref0[0]->i_frame-1];
+ if( lowres_mv[0][0] != 0x7fff ) *(uint32_t*)mvc[i++] = (*(uint32_t*)lowres_mv[h->mb.i_mb_xy]*2)&0xfffeffff;
+ }
+
/* spatial predictors */
if( h->mb.i_neighbour & MB_LEFT )
{
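The lowres MV candidate added above scales a packed {x,y} pair of int16 components from half to full resolution with a single 32-bit multiply: doubling the word lets bit 15 of the x component spill into bit 16, so that bit is masked off (2*y is even, so its low bit carried no information anyway). A standalone check of the trick (illustrative, not part of the patch):

#include <assert.h>
#include <stdint.h>

static uint32_t pack( int16_t x, int16_t y )
{
    return (uint16_t)x | ((uint32_t)(uint16_t)y << 16);
}

int main( void )
{
    int x, y;
    for( x = -512; x <= 512; x++ )
        for( y = -512; y <= 512; y++ )
            assert( ((pack( x, y ) * 2) & 0xfffeffff) == pack( x*2, y*2 ) );
    return 0;
}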
@@ -612,48 +554,41 @@
static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
{
const int i8 = x264_scan8[0]+x+8*y;
-
+ const int i_ref0 = h->mb.cache.ref[0][i8];
const int i_ref1 = h->mb.cache.ref[1][i8];
+ const int weight = h->mb.bipred_weight[i_ref0][i_ref1];
+ const int mvx0 = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
const int mvx1 = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
+ int mvy0 = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
- DECLARE_ALIGNED_16( uint8_t tmp[16*16] );
- int i_mode = x264_size2pixel[height][width];
-
- x264_mb_mc_0xywh( h, x, y, width, height );
-
- h->mc.mc_luma( tmp, 16, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
- mvx1 + 4*4*x, mvy1 + 4*4*y, 4*width, 4*height );
+ int i_mode = x264_size2pixel[height][width];
+ int i_stride0 = 16, i_stride1 = 16;
+ DECLARE_ALIGNED_16( uint8_t tmp0[16*16] );
+ DECLARE_ALIGNED_16( uint8_t tmp1[16*16] );
+ uint8_t *src0, *src1;
+
+ src0 = h->mc.get_ref( tmp0, &i_stride0, h->mb.pic.p_fref[0][i_ref0], h->mb.pic.i_stride[0],
+ mvx0 + 4*4*x, mvy0 + 4*4*y, 4*width, 4*height );
+ src1 = h->mc.get_ref( tmp1, &i_stride1, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
+ mvx1 + 4*4*x, mvy1 + 4*4*y, 4*width, 4*height );
+ h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
+ src0, i_stride0, src1, i_stride1, weight );
+ if( h->mb.b_interlaced & i_ref0 )
+ mvy0 += (h->mb.i_mb_y & 1)*4 - 2;
if( h->mb.b_interlaced & i_ref1 )
mvy1 += (h->mb.i_mb_y & 1)*4 - 2;
- if( h->param.analyse.b_weighted_bipred )
- {
- const int i_ref0 = h->mb.cache.ref[0][i8];
- const int weight = h->mb.bipred_weight[i_ref0][i_ref1];
-
- h->mc.avg_weight[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, tmp, 16, weight );
-
- h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
- mvx1, mvy1, 2*width, 2*height );
- h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16, weight );
-
- h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
- mvx1, mvy1, 2*width, 2*height );
- h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16, weight );
- }
- else
- {
- h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, tmp, 16 );
-
- h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
- mvx1, mvy1, 2*width, 2*height );
- h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16 );
-
- h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
- mvx1, mvy1, 2*width, 2*height );
- h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16 );
- }
+ h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[0][i_ref0][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+ mvx0, mvy0, 2*width, 2*height );
+ h->mc.mc_chroma( tmp1, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+ mvx1, mvy1, 2*width, 2*height );
+ h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
+ h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[0][i_ref0][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+ mvx0, mvy0, 2*width, 2*height );
+ h->mc.mc_chroma( tmp1, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+ mvx1, mvy1, 2*width, 2*height );
+ h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
}
static void x264_mb_mc_direct8x8( x264_t *h, int x, int y )
@@ -885,6 +820,34 @@
memset( h->mb.cache.ref[0], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
memset( h->mb.cache.ref[1], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
+ /* fdec: fenc:
+ * yyyyyyy
+ * yYYYY YYYY
+ * yYYYY YYYY
+ * yYYYY YYYY
+ * yYYYY YYYY
+ * uuu vvv UUVV
+ * uUU vVV UUVV
+ * uUU vVV
+ */
+ h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
+ h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
+ h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
+ h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
+ h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
+ h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
+
+ h->mb.i_neighbour4[6] =
+ h->mb.i_neighbour4[9] =
+ h->mb.i_neighbour4[12] =
+ h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT;
+ h->mb.i_neighbour4[3] =
+ h->mb.i_neighbour4[7] =
+ h->mb.i_neighbour4[11] =
+ h->mb.i_neighbour4[13] =
+ h->mb.i_neighbour4[15] =
+ h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
+
return 0;
fail: return -1;
}
@@ -982,8 +945,9 @@
if( h->mb.b_interlaced )
ref_pix_offset[1] += (1-2*(i_mb_y&1)) * i_stride;
h->mb.pic.i_stride[i] = i_stride2;
+ h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
- &h->fenc->plane[i][i_pix_offset], i_stride2, w );
+ h->mb.pic.p_fenc_plane[i], i_stride2, w );
memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
if( h->mb.b_interlaced )
{
@@ -1150,23 +1114,6 @@
+ !!(h->mb.i_neighbour & MB_TOP);
}
- /* fdec: fenc:
- * yyyyyyy
- * yYYYY YYYY
- * yYYYY YYYY
- * yYYYY YYYY
- * yYYYY YYYY
- * uuu vvv UUVV
- * uUU vVV UUVV
- * uUU vVV
- */
- h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
- h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
- h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
- h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
- h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
- h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
-
if( !h->mb.b_interlaced )
{
copy_column8( h->mb.pic.p_fdec[0]-1, h->mb.pic.p_fdec[0]+15 );
@@ -1267,8 +1214,10 @@
h->mb.cache.ref[i_list][i8+2*8] =
h->mb.cache.ref[i_list][i8+3*8] = h->mb.ref[i_list][ir + 1*s8x8];
- for( i = 0; i < 4; i++ )
- *(uint32_t*)h->mb.cache.mv[i_list][i8+i*8] = *(uint32_t*)h->mb.mv[i_list][iv + i*s4x4];
+ *(uint32_t*)h->mb.cache.mv[i_list][i8+0*8] = *(uint32_t*)h->mb.mv[i_list][iv + 0*s4x4];
+ *(uint32_t*)h->mb.cache.mv[i_list][i8+1*8] = *(uint32_t*)h->mb.mv[i_list][iv + 1*s4x4];
+ *(uint32_t*)h->mb.cache.mv[i_list][i8+2*8] = *(uint32_t*)h->mb.mv[i_list][iv + 2*s4x4];
+ *(uint32_t*)h->mb.cache.mv[i_list][i8+3*8] = *(uint32_t*)h->mb.mv[i_list][iv + 3*s4x4];
}
else
{
@@ -1300,8 +1249,10 @@
{
const int i8 = x264_scan8[0] - 1;
const int iv = i_mb_4x4 - 1;
- for( i = 0; i < 4; i++ )
- *(uint32_t*)h->mb.cache.mvd[i_list][i8+i*8] = *(uint32_t*)h->mb.mvd[i_list][iv + i*s4x4];
+ *(uint32_t*)h->mb.cache.mvd[i_list][i8+0*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 0*s4x4];
+ *(uint32_t*)h->mb.cache.mvd[i_list][i8+1*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 1*s4x4];
+ *(uint32_t*)h->mb.cache.mvd[i_list][i8+2*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 2*s4x4];
+ *(uint32_t*)h->mb.cache.mvd[i_list][i8+3*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 3*s4x4];
}
else
{
@@ -1343,19 +1294,9 @@
h->mb.i_neighbour4[8] =
h->mb.i_neighbour4[10] =
h->mb.i_neighbour8[2] = MB_TOP|MB_TOPRIGHT | ((h->mb.i_neighbour & MB_LEFT) ? (MB_LEFT|MB_TOPLEFT) : 0);
- h->mb.i_neighbour4[3] =
- h->mb.i_neighbour4[7] =
- h->mb.i_neighbour4[11] =
- h->mb.i_neighbour4[13] =
- h->mb.i_neighbour4[15] =
- h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
h->mb.i_neighbour4[5] =
h->mb.i_neighbour8[1] = MB_LEFT | (h->mb.i_neighbour & MB_TOPRIGHT)
| ((h->mb.i_neighbour & MB_TOP) ? MB_TOP|MB_TOPLEFT : 0);
- h->mb.i_neighbour4[6] =
- h->mb.i_neighbour4[9] =
- h->mb.i_neighbour4[12] =
- h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT;
}
static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i)
@@ -1394,14 +1335,7 @@
x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y );
h->mb.type[i_mb_xy] = i_mb_type;
-
- if( h->mb.i_type == I_PCM || (h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0) )
- h->mb.i_qp = h->mb.i_last_qp;
- h->mb.qp[i_mb_xy] = i_mb_type != I_PCM ? h->mb.i_qp : 0;
-
- h->mb.i_last_dqp = h->mb.i_qp - h->mb.i_last_qp;
- h->mb.i_last_qp = h->mb.i_qp;
- h->mb.i_mb_prev_xy = h->mb.i_mb_xy;
+ h->mb.i_mb_prev_xy = i_mb_xy;
/* save intra4x4 */
if( i_mb_type == I_4x4 )
@@ -1416,6 +1350,8 @@
if( i_mb_type == I_PCM )
{
+ h->mb.qp[i_mb_xy] = 0;
+ h->mb.i_last_dqp = 0;
h->mb.i_cbp_chroma = 2;
h->mb.i_cbp_luma = 0xf;
h->mb.cbp[i_mb_xy] = 0x72f; /* all set */
@@ -1426,59 +1362,71 @@
else
{
/* save non zero count */
- for( y = 0; y < 4; y++ )
- *(uint32_t*)&non_zero_count[y*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+y*8];
- for( y = 0; y < 4; y++ )
- *(uint16_t*)&non_zero_count[16+y*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+y*2]-1] >> 8;
-
+ *(uint32_t*)&non_zero_count[0*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+0*8];
+ *(uint32_t*)&non_zero_count[1*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+1*8];
+ *(uint32_t*)&non_zero_count[2*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+2*8];
+ *(uint32_t*)&non_zero_count[3*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+3*8];
+ *(uint16_t*)&non_zero_count[16+0*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] >> 8;
+ *(uint16_t*)&non_zero_count[16+1*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] >> 8;
+ *(uint16_t*)&non_zero_count[16+2*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] >> 8;
+ *(uint16_t*)&non_zero_count[16+3*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] >> 8;
+
+ if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
+ h->mb.i_qp = h->mb.i_last_qp;
+ h->mb.qp[i_mb_xy] = h->mb.i_qp;
+ h->mb.i_last_dqp = h->mb.i_qp - h->mb.i_last_qp;
+ h->mb.i_last_qp = h->mb.i_qp;
}
if( h->mb.i_cbp_luma == 0 && h->mb.i_type != I_8x8 )
h->mb.b_transform_8x8 = 0;
h->mb.mb_transform_size[i_mb_xy] = h->mb.b_transform_8x8;
- if( !IS_INTRA( i_mb_type ) )
+ if( h->sh.i_type != SLICE_TYPE_I )
{
- h->mb.ref[0][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[0][x264_scan8[0]];
- h->mb.ref[0][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]];
- h->mb.ref[0][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]];
- h->mb.ref[0][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
- for( y = 0; y < 4; y++ )
- {
- *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+0];
- *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+2];
- }
- if(h->sh.i_type == SLICE_TYPE_B)
- {
- h->mb.ref[1][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]];
- h->mb.ref[1][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]];
- h->mb.ref[1][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]];
- h->mb.ref[1][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
+ if( !IS_INTRA( i_mb_type ) )
+ {
+ h->mb.ref[0][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[0][x264_scan8[0]];
+ h->mb.ref[0][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]];
+ h->mb.ref[0][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]];
+ h->mb.ref[0][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
for( y = 0; y < 4; y++ )
{
- *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+0];
- *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+2];
+ *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+0];
+ *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+2];
+ }
+ if( h->sh.i_type == SLICE_TYPE_B )
+ {
+ h->mb.ref[1][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]];
+ h->mb.ref[1][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]];
+ h->mb.ref[1][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]];
+ h->mb.ref[1][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
+ for( y = 0; y < 4; y++ )
+ {
+ *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+0];
+ *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+2];
+ }
}
}
- }
- else
- {
- int i_list;
- for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2 : 1 ); i_list++ )
+ else
{
- *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+0*s8x8] = (uint8_t)(-1) * 0x0101;
- *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+1*s8x8] = (uint8_t)(-1) * 0x0101;
- for( y = 0; y < 4; y++ )
+ int i_list;
+ for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2 : 1 ); i_list++ )
{
- *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] = 0;
- *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] = 0;
+ *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+0*s8x8] = (uint8_t)(-1) * 0x0101;
+ *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+1*s8x8] = (uint8_t)(-1) * 0x0101;
+ for( y = 0; y < 4; y++ )
+ {
+ *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] = 0;
+ *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] = 0;
+ }
}
}
}
if( h->param.b_cabac )
{
- if( i_mb_type == I_4x4 || i_mb_type == I_16x16 )
+ if( IS_INTRA(i_mb_type) && i_mb_type != I_PCM )
h->mb.chroma_pred_mode[i_mb_xy] = x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ];
else
h->mb.chroma_pred_mode[i_mb_xy] = I_PRED_CHROMA_DC;
@@ -1554,7 +1502,12 @@
if( h->param.analyse.b_weighted_bipred
&& dist_scale_factor >= -64
&& dist_scale_factor <= 128 )
+ {
h->mb.bipred_weight[i_ref0][i_ref1] = 64 - dist_scale_factor;
+ // ssse3 implementation of biweight doesn't support the extrema.
+ // if we ever generate them, we'll have to drop that optimization.
+ assert( dist_scale_factor >= -63 && dist_scale_factor <= 127 );
+ }
else
h->mb.bipred_weight[i_ref0][i_ref1] = 32;
}
Changed | x264-snapshot-20081001-2245.tar.bz2/common/macroblock.h
@@ -251,14 +251,16 @@
2*4 + 3*4*FDEC_STRIDE, 3*4 + 3*4*FDEC_STRIDE
};
-static const uint8_t i_chroma_qp_table[52] =
+static const uint8_t i_chroma_qp_table[52+12*2] =
{
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
29, 30, 31, 32, 32, 33, 34, 34, 35, 35,
36, 36, 37, 37, 37, 38, 38, 38, 39, 39,
- 39, 39
+ 39, 39,
+ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
};
enum cabac_ctx_block_cat_e
@@ -312,16 +314,6 @@
* h->mb. need only valid values from other blocks */
void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[8][2], int *i_mvc );
-
-int x264_mb_predict_intra4x4_mode( x264_t *h, int idx );
-int x264_mb_predict_non_zero_code( x264_t *h, int idx );
-
-/* x264_mb_transform_8x8_allowed:
- * check whether any partition is smaller than 8x8 (or at least
- * might be, according to just partition type.)
- * doesn't check for cbp */
-int x264_mb_transform_8x8_allowed( x264_t *h );
-
void x264_mb_mc( x264_t *h );
void x264_mb_mc_8x8( x264_t *h, int i8 );
@@ -444,6 +436,72 @@
return i_nz;
}
+static inline int x264_mb_predict_intra4x4_mode( x264_t *h, int idx )
+{
+ const int ma = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 1];
+ const int mb = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 8];
+ const int m = X264_MIN( x264_mb_pred_mode4x4_fix(ma),
+ x264_mb_pred_mode4x4_fix(mb) );
+
+ if( m < 0 )
+ return I_PRED_4x4_DC;
+
+ return m;
+}
+static inline int x264_mb_predict_non_zero_code( x264_t *h, int idx )
+{
+ const int za = h->mb.cache.non_zero_count[x264_scan8[idx] - 1];
+ const int zb = h->mb.cache.non_zero_count[x264_scan8[idx] - 8];
+
+ int i_ret = za + zb;
+
+ if( i_ret < 0x80 )
+ {
+ i_ret = ( i_ret + 1 ) >> 1;
+ }
+ return i_ret & 0x7f;
+}
+/* x264_mb_transform_8x8_allowed:
+ * check whether any partition is smaller than 8x8 (or at least
+ * might be, according to just partition type.)
+ * doesn't check for cbp */
+static inline int x264_mb_transform_8x8_allowed( x264_t *h )
+{
+ // intra and skip are disallowed
+ // large partitions are allowed
+ // direct and 8x8 are conditional
+ static const uint8_t partition_tab[X264_MBTYPE_MAX] = {
+ 0,0,0,0,1,2,0,2,1,1,1,1,1,1,1,1,1,2,0,
+ };
+ int p, i;
+
+ if( !h->pps->b_transform_8x8_mode )
+ return 0;
+ p = partition_tab[h->mb.i_type];
+ if( p < 2 )
+ return p;
+ else if( h->mb.i_type == B_DIRECT )
+ return h->sps->b_direct8x8_inference;
+ else if( h->mb.i_type == P_8x8 )
+ {
+ if( !(h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
+ return 1;
+ for( i=0; i<4; i++ )
+ if( h->mb.i_sub_partition[i] != D_L0_8x8 )
+ return 0;
+ return 1;
+ }
+ else // B_8x8
+ {
+ // x264 currently doesn't use sub-8x8 B partitions, so don't check for them
+ if( h->sps->b_direct8x8_inference )
+ return 1;
+ for( i=0; i<4; i++ )
+ if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
+ return 0;
+ return 1;
+ }
+}
#endif
Changed | x264-snapshot-20081001-2245.tar.bz2/common/mc.c
@@ -49,45 +49,30 @@
}
}
-static inline void pixel_avg_wxh( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int width, int height )
+static inline void pixel_avg_wxh( uint8_t *dst, int i_dst, uint8_t *src1, int i_src1, uint8_t *src2, int i_src2, int width, int height )
{
int x, y;
for( y = 0; y < height; y++ )
{
for( x = 0; x < width; x++ )
{
- dst[x] = ( dst[x] + src[x] + 1 ) >> 1;
+ dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
}
+ src1 += i_src1;
+ src2 += i_src2;
dst += i_dst;
- src += i_src;
}
}
-#define PIXEL_AVG_C( name, width, height ) \
-static void name( uint8_t *pix1, int i_stride_pix1, \
- uint8_t *pix2, int i_stride_pix2 ) \
-{ \
- pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height ); \
-}
-PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
-PIXEL_AVG_C( pixel_avg_16x8, 16, 8 )
-PIXEL_AVG_C( pixel_avg_8x16, 8, 16 )
-PIXEL_AVG_C( pixel_avg_8x8, 8, 8 )
-PIXEL_AVG_C( pixel_avg_8x4, 8, 4 )
-PIXEL_AVG_C( pixel_avg_4x8, 4, 8 )
-PIXEL_AVG_C( pixel_avg_4x4, 4, 4 )
-PIXEL_AVG_C( pixel_avg_4x2, 4, 2 )
-PIXEL_AVG_C( pixel_avg_2x4, 2, 4 )
-PIXEL_AVG_C( pixel_avg_2x2, 2, 2 )
-
-
/* Implicit weighted bipred only:
* assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 */
-#define op_scale2(x) dst[x] = x264_clip_uint8( (dst[x]*i_weight1 + src[x]*i_weight2 + (1<<5)) >> 6 )
-static inline void pixel_avg_weight_wxh( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int width, int height, int i_weight1 ){
+#define op_scale2(x) dst[x] = x264_clip_uint8( (src1[x]*i_weight1 + src2[x]*i_weight2 + (1<<5)) >> 6 )
+static inline void pixel_avg_weight_wxh( uint8_t *dst, int i_dst, uint8_t *src1, int i_src1, uint8_t *src2, int i_src2, int width, int height, int i_weight1 )
+{
int y;
const int i_weight2 = 64 - i_weight1;
- for(y=0; y<height; y++, dst += i_dst, src += i_src){
+ for( y = 0; y<height; y++, dst += i_dst, src1 += i_src1, src2 += i_src2 )
+ {
op_scale2(0);
op_scale2(1);
if(width==2) continue;
@@ -109,27 +94,28 @@
op_scale2(15);
}
}
+#undef op_scale2
-#define PIXEL_AVG_WEIGHT_C( width, height ) \
-static void pixel_avg_weight_##width##x##height( \
- uint8_t *pix1, int i_stride_pix1, \
- uint8_t *pix2, int i_stride_pix2, int i_weight1 ) \
+#define PIXEL_AVG_C( name, width, height ) \
+static void name( uint8_t *pix1, int i_stride_pix1, \
+ uint8_t *pix2, int i_stride_pix2, \
+ uint8_t *pix3, int i_stride_pix3, int weight ) \
{ \
- pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height, i_weight1 ); \
+ if( weight == 32 )\
+ pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
+ else\
+ pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height, weight ); \
}
-
-PIXEL_AVG_WEIGHT_C(16,16)
-PIXEL_AVG_WEIGHT_C(16,8)
-PIXEL_AVG_WEIGHT_C(8,16)
-PIXEL_AVG_WEIGHT_C(8,8)
-PIXEL_AVG_WEIGHT_C(8,4)
-PIXEL_AVG_WEIGHT_C(4,8)
-PIXEL_AVG_WEIGHT_C(4,4)
-PIXEL_AVG_WEIGHT_C(4,2)
-PIXEL_AVG_WEIGHT_C(2,4)
-PIXEL_AVG_WEIGHT_C(2,2)
-#undef op_scale2
-#undef PIXEL_AVG_WEIGHT_C
+PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
+PIXEL_AVG_C( pixel_avg_16x8, 16, 8 )
+PIXEL_AVG_C( pixel_avg_8x16, 8, 16 )
+PIXEL_AVG_C( pixel_avg_8x8, 8, 8 )
+PIXEL_AVG_C( pixel_avg_8x4, 8, 4 )
+PIXEL_AVG_C( pixel_avg_4x8, 4, 8 )
+PIXEL_AVG_C( pixel_avg_4x4, 4, 4 )
+PIXEL_AVG_C( pixel_avg_4x2, 4, 2 )
+PIXEL_AVG_C( pixel_avg_2x4, 2, 4 )
+PIXEL_AVG_C( pixel_avg_2x2, 2, 2 )
static void mc_copy( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
{
@@ -299,9 +285,15 @@
i_stride, frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres );
x264_frame_expand_border_lowres( frame );
- for( y=0; y<16; y++ )
- for( x=0; x<16; x++ )
- frame->i_cost_est[y][x] = -1;
+ memset( frame->i_cost_est, -1, sizeof(frame->i_cost_est) );
+
+ for( x = 0; x < h->param.i_bframe + 2; x++ )
+ for( y = 0; y < h->param.i_bframe + 2; y++ )
+ frame->i_row_satds[y][x][0] = -1;
+
+ for( y = 0; y <= !!h->param.i_bframe; y++ )
+ for( x = 0; x <= h->param.i_bframe; x++ )
+ frame->lowres_mvs[y][x][0][0] = 0x7FFF;
}
static void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
@@ -346,18 +338,8 @@
pf->avg[PIXEL_4x2] = pixel_avg_4x2;
pf->avg[PIXEL_2x4] = pixel_avg_2x4;
pf->avg[PIXEL_2x2] = pixel_avg_2x2;
-
- pf->avg_weight[PIXEL_16x16]= pixel_avg_weight_16x16;
- pf->avg_weight[PIXEL_16x8] = pixel_avg_weight_16x8;
- pf->avg_weight[PIXEL_8x16] = pixel_avg_weight_8x16;
- pf->avg_weight[PIXEL_8x8] = pixel_avg_weight_8x8;
- pf->avg_weight[PIXEL_8x4] = pixel_avg_weight_8x4;
- pf->avg_weight[PIXEL_4x8] = pixel_avg_weight_4x8;
- pf->avg_weight[PIXEL_4x4] = pixel_avg_weight_4x4;
- pf->avg_weight[PIXEL_4x2] = pixel_avg_weight_4x2;
- pf->avg_weight[PIXEL_2x4] = pixel_avg_weight_2x4;
- pf->avg_weight[PIXEL_2x2] = pixel_avg_weight_2x2;
+ pf->copy_16x16_unaligned = mc_copy_w16;
pf->copy[PIXEL_16x16] = mc_copy_w16;
pf->copy[PIXEL_8x8] = mc_copy_w8;
pf->copy[PIXEL_4x4] = mc_copy_w4;
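With the merged avg[] interface above, every blend takes two sources plus a weight, and the C dispatcher falls back to the plain rounded average when the weight is 32 because the implicit-bipred formula reduces to it exactly. A standalone check (illustrative, not part of the patch):

#include <assert.h>

int main( void )
{
    int a, b;
    for( a = 0; a < 256; a++ )
        for( b = 0; b < 256; b++ )
            /* op_scale2 with i_weight1 = i_weight2 = 32 */
            assert( ((a*32 + b*32 + (1<<5)) >> 6) == ((a + b + 1) >> 1) );
    return 0;
}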
Changed | x264-snapshot-20081001-2245.tar.bz2/common/mc.h
@@ -45,11 +45,11 @@
int mvx, int mvy,
int i_width, int i_height );
- void (*avg[10])( uint8_t *dst, int, uint8_t *src, int );
- void (*avg_weight[10])( uint8_t *dst, int, uint8_t *src, int, int i_weight );
+ void (*avg[10])( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight );
/* only 16x16, 8x8, and 4x4 defined */
void (*copy[7])( uint8_t *dst, int, uint8_t *src, int, int i_height );
+ void (*copy_16x16_unaligned)( uint8_t *dst, int, uint8_t *src, int, int i_height );
void (*plane_copy)( uint8_t *dst, int i_dst,
uint8_t *src, int i_src, int w, int h);
@@ -62,7 +62,7 @@
uint8_t *pix_uv, int stride_uv, int mb_x );
/* prefetch the next few macroblocks of a hpel reference frame */
void (*prefetch_ref)( uint8_t *pix, int stride, int parity );
-
+
void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
void (*memzero_aligned)( void *dst, int n );
Changed | x264-snapshot-20081001-2245.tar.bz2/common/mdate.c
@@ -26,6 +26,7 @@
#endif
#include <time.h>
+#include "common.h"
#include "osdep.h"
int64_t x264_mdate( void )
Changed | x264-snapshot-20081001-2245.tar.bz2/common/pixel.c
@@ -136,29 +136,49 @@
}
-static inline void pixel_sub_wxh( int16_t *diff, int i_size,
- uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
-{
- int y, x;
- for( y = 0; y < i_size; y++ )
- {
- for( x = 0; x < i_size; x++ )
- {
- diff[x + y*i_size] = pix1[x] - pix2[x];
- }
- pix1 += i_pix1;
- pix2 += i_pix2;
- }
+/****************************************************************************
+ * pixel_var_wxh
+ ****************************************************************************/
+#define PIXEL_VAR_C( name, w, shift ) \
+static int name( uint8_t *pix, int i_stride, uint32_t *sad ) \
+{ \
+ uint32_t var = 0, sum = 0, sqr = 0; \
+ int x, y; \
+ for( y = 0; y < w; y++ ) \
+ { \
+ for( x = 0; x < w; x++ ) \
+ { \
+ sum += pix[x]; \
+ sqr += pix[x] * pix[x]; \
+ } \
+ pix += i_stride; \
+ } \
+ var = sqr - (sum * sum >> shift); \
+ *sad = sum; \
+ return var; \
+}
+
+PIXEL_VAR_C( x264_pixel_var_16x16, 16, 8 )
+PIXEL_VAR_C( x264_pixel_var_8x8, 8, 6 )
+
+
+#define HADAMARD4(d0,d1,d2,d3,s0,s1,s2,s3) {\
+ int t0 = s0 + s1;\
+ int t1 = s0 - s1;\
+ int t2 = s2 + s3;\
+ int t3 = s2 - s3;\
+ d0 = t0 + t2;\
+ d2 = t0 - t2;\
+ d1 = t1 + t3;\
+ d3 = t1 - t3;\
}
-
/****************************************************************************
* pixel_satd_WxH: sum of 4x4 Hadamard transformed differences
****************************************************************************/
static int pixel_satd_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height )
{
int16_t tmp[4][4];
- int16_t diff[4][4];
int x, y;
int i_satd = 0;
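An aside on the new PIXEL_VAR_C above (not part of the patch): with N = 1<<shift pixels it returns sqr - (sum*sum >> shift), i.e. sum(x^2) - floor((sum x)^2 / N), which is N times the block variance up to the flooring, and the DC sum is handed back through *sad. The exact identity behind it, N*sum(x^2) - (sum x)^2 == sum_{i,j} (x_i - x_j)^2 / 2, checked on a pseudo-random 8x8 block:

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

int main( void )
{
    enum { N = 64 };             /* an 8x8 block, shift = 6 */
    int x[N], i, j;
    int64_t sum = 0, sqr = 0, pairs = 0;
    srand( 1 );
    for( i = 0; i < N; i++ )
    {
        x[i] = rand() & 0xff;
        sum += x[i];
        sqr += x[i] * x[i];
    }
    for( i = 0; i < N; i++ )
        for( j = 0; j < N; j++ )
            pairs += (int64_t)(x[i] - x[j]) * (x[i] - x[j]);
    assert( 2 * (N*sqr - sum*sum) == pairs );
    return 0;
}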
@@ -166,32 +186,22 @@
{
for( x = 0; x < i_width; x += 4 )
{
- int d;
-
- pixel_sub_wxh( (int16_t*)diff, 4, &pix1[x], i_pix1, &pix2[x], i_pix2 );
+ int i;
+ uint8_t *p1 = pix1+x, *p2 = pix2+x;
- for( d = 0; d < 4; d++ )
+ for( i=0; i<4; i++, p1+=i_pix1, p2+=i_pix2 )
{
- int s01, s23;
- int d01, d23;
-
- s01 = diff[d][0] + diff[d][1]; s23 = diff[d][2] + diff[d][3];
- d01 = diff[d][0] - diff[d][1]; d23 = diff[d][2] - diff[d][3];
-
- tmp[d][0] = s01 + s23;
- tmp[d][1] = s01 - s23;
- tmp[d][2] = d01 - d23;
- tmp[d][3] = d01 + d23;
+ int a0 = p1[0] - p2[0];
+ int a1 = p1[1] - p2[1];
+ int a2 = p1[2] - p2[2];
+ int a3 = p1[3] - p2[3];
+ HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0,a1,a2,a3 );
}
- for( d = 0; d < 4; d++ )
+ for( i=0; i<4; i++ )
{
- int s01, s23;
- int d01, d23;
-
- s01 = tmp[0][d] + tmp[1][d]; s23 = tmp[2][d] + tmp[3][d];
- d01 = tmp[0][d] - tmp[1][d]; d23 = tmp[2][d] - tmp[3][d];
-
- i_satd += abs( s01 + s23 ) + abs( s01 - s23 ) + abs( d01 - d23 ) + abs( d01 + d23 );
+ int a0,a1,a2,a3;
+ HADAMARD4( a0,a1,a2,a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
+ i_satd += abs(a0) + abs(a1) + abs(a2) + abs(a3);
}
}
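A quick sanity note on the HADAMARD4 butterfly that the rewritten satd loop above relies on (illustrative, not part of the patch): the 4-point transform it implements is symmetric and self-inverse up to a factor of 4, so applying it twice must return four times the input. A standalone check:

#include <assert.h>
#include <stdlib.h>

#define HADAMARD4(d0,d1,d2,d3,s0,s1,s2,s3) {\
    int t0 = s0 + s1;\
    int t1 = s0 - s1;\
    int t2 = s2 + s3;\
    int t3 = s2 - s3;\
    d0 = t0 + t2;\
    d2 = t0 - t2;\
    d1 = t1 + t3;\
    d3 = t1 - t3;\
}

int main( void )
{
    int n;
    srand( 2 );
    for( n = 0; n < 1000; n++ )
    {
        int s0 = rand()%511 - 255, s1 = rand()%511 - 255;
        int s2 = rand()%511 - 255, s3 = rand()%511 - 255;
        int d0, d1, d2, d3, e0, e1, e2, e3;
        HADAMARD4( d0, d1, d2, d3, s0, s1, s2, s3 );
        HADAMARD4( e0, e1, e2, e3, d0, d1, d2, d3 );
        assert( e0 == 4*s0 && e1 == 4*s1 && e2 == 4*s2 && e3 == 4*s3 );
    }
    return 0;
}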
@@ -220,30 +230,17 @@
* pixel_sa8d_WxH: sum of 8x8 Hadamard transformed differences
****************************************************************************/
#define SA8D_1D {\
- const int a0 = SRC(0) + SRC(4);\
- const int a4 = SRC(0) - SRC(4);\
- const int a1 = SRC(1) + SRC(5);\
- const int a5 = SRC(1) - SRC(5);\
- const int a2 = SRC(2) + SRC(6);\
- const int a6 = SRC(2) - SRC(6);\
- const int a3 = SRC(3) + SRC(7);\
- const int a7 = SRC(3) - SRC(7);\
- const int b0 = a0 + a2;\
- const int b2 = a0 - a2;\
- const int b1 = a1 + a3;\
- const int b3 = a1 - a3;\
- const int b4 = a4 + a6;\
- const int b6 = a4 - a6;\
- const int b5 = a5 + a7;\
- const int b7 = a5 - a7;\
- DST(0, b0 + b1);\
- DST(1, b0 - b1);\
- DST(2, b2 + b3);\
- DST(3, b2 - b3);\
- DST(4, b4 + b5);\
- DST(5, b4 - b5);\
- DST(6, b6 + b7);\
- DST(7, b6 - b7);\
+ int b0,b1,b2,b3,b4,b5,b6,b7;\
+ HADAMARD4( b0,b1,b2,b3, SRC(0), SRC(1), SRC(2), SRC(3) );\
+ HADAMARD4( b4,b5,b6,b7, SRC(4), SRC(5), SRC(6), SRC(7) );\
+ DST(0, b0 + b4);\
+ DST(4, b0 - b4);\
+ DST(1, b1 + b5);\
+ DST(5, b1 - b5);\
+ DST(2, b2 + b6);\
+ DST(6, b2 - b6);\
+ DST(3, b3 + b7);\
+ DST(7, b3 - b7);\
}
static inline int pixel_sa8d_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2,
@@ -258,18 +255,28 @@
for( x = 0; x < i_width; x += 8 )
{
int i;
- pixel_sub_wxh( (int16_t*)diff, 8, pix1+x, i_pix1, pix2+x, i_pix2 );
+ uint8_t *p1 = pix1+x, *p2 = pix2+x;
-#define SRC(x) diff[i][x]
+#define SRC(x) a##x
#define DST(x,rhs) diff[i][x] = (rhs)
- for( i = 0; i < 8; i++ )
+ for( i=0; i<8; i++, p1+=i_pix1, p2+=i_pix2 )
+ {
+ int a0 = p1[0] - p2[0];
+ int a1 = p1[1] - p2[1];
+ int a2 = p1[2] - p2[2];
+ int a3 = p1[3] - p2[3];
+ int a4 = p1[4] - p2[4];
+ int a5 = p1[5] - p2[5];
+ int a6 = p1[6] - p2[6];
+ int a7 = p1[7] - p2[7];
SA8D_1D
+ }
#undef SRC
#undef DST
#define SRC(x) diff[x][i]
#define DST(x,rhs) i_satd += abs(rhs)
- for( i = 0; i < 8; i++ )
+ for( i=0; i<8; i++ )
SA8D_1D
#undef SRC
#undef DST
@@ -292,6 +299,69 @@
PIXEL_SA8D_C( 8, 16 )
PIXEL_SA8D_C( 8, 8 )
+
+static uint64_t pixel_hadamard_ac( uint8_t *pix, int stride )
+{
+ int16_t tmp[8][8];
+ int sum4=0, sum8=0;
+ int i;
+ for( i=0; i<8; i++, pix+=stride )
+ {
+ HADAMARD4( tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i],
+ pix[0], pix[1], pix[2], pix[3] );
+ HADAMARD4( tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i],
+ pix[4], pix[5], pix[6], pix[7] );
+ }
+ for( i=0; i<8; i++ )
+ {
+ int a0,a1,a2,a3,a4,a5,a6,a7;
+ HADAMARD4( a0,a1,a2,a3, tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3] );
+ sum4 += abs(a0) + abs(a1) + abs(a2) + abs(a3);
+ HADAMARD4( a4,a5,a6,a7, tmp[i][4], tmp[i][5], tmp[i][6], tmp[i][7] );
+ sum4 += abs(a4) + abs(a5) + abs(a6) + abs(a7);
+ tmp[i][0] = a0 + a4;
+ tmp[i][4] = a0 - a4;
+ tmp[i][1] = a1 + a5;
+ tmp[i][5] = a1 - a5;
+ tmp[i][2] = a2 + a6;
+ tmp[i][6] = a2 - a6;
+ tmp[i][3] = a3 + a7;
+ tmp[i][7] = a3 - a7;
+ }
+ for( i=0; i<8; i++ )
+ {
+ sum8 += abs( tmp[0][i] + tmp[4][i] )
+ + abs( tmp[0][i] - tmp[4][i] )
+ + abs( tmp[1][i] + tmp[5][i] )
+ + abs( tmp[1][i] - tmp[5][i] )
+ + abs( tmp[2][i] + tmp[6][i] )
+ + abs( tmp[2][i] - tmp[6][i] )
+ + abs( tmp[3][i] + tmp[7][i] )
+ + abs( tmp[3][i] - tmp[7][i] );
+ }
+ sum4 -= tmp[0][0]+tmp[4][0];
+ sum8 -= tmp[0][0]+tmp[4][0];
+ return ((uint64_t)sum8<<32) + sum4;
+}
+
+#define HADAMARD_AC(w,h) \
+static uint64_t x264_pixel_hadamard_ac_##w##x##h( uint8_t *pix, int stride )\
+{\
+ uint64_t sum = pixel_hadamard_ac( pix, stride );\
+ if( w==16 )\
+ sum += pixel_hadamard_ac( pix+8, stride );\
+ if( h==16 )\
+ sum += pixel_hadamard_ac( pix+8*stride, stride );\
+ if( w==16 && h==16 )\
+ sum += pixel_hadamard_ac( pix+8*stride+8, stride );\
+ return ((sum>>34)<<32) + ((uint32_t)sum>>1);\
+}
+HADAMARD_AC( 16, 16 )
+HADAMARD_AC( 16, 8 )
+HADAMARD_AC( 8, 16 )
+HADAMARD_AC( 8, 8 )
+
+
/****************************************************************************
* pixel_sad_x4
****************************************************************************/
@@ -502,20 +572,24 @@
{
memset( pixf, 0, sizeof(*pixf) );
-#define INIT2( name, cpu ) \
- pixf->name[PIXEL_16x16] = x264_pixel_##name##_16x16##cpu;\
- pixf->name[PIXEL_16x8] = x264_pixel_##name##_16x8##cpu;
-#define INIT4( name, cpu ) \
- INIT2( name, cpu ) \
- pixf->name[PIXEL_8x16] = x264_pixel_##name##_8x16##cpu;\
- pixf->name[PIXEL_8x8] = x264_pixel_##name##_8x8##cpu;
-#define INIT5( name, cpu ) \
- INIT4( name, cpu ) \
- pixf->name[PIXEL_8x4] = x264_pixel_##name##_8x4##cpu;
-#define INIT7( name, cpu ) \
- INIT5( name, cpu ) \
- pixf->name[PIXEL_4x8] = x264_pixel_##name##_4x8##cpu;\
- pixf->name[PIXEL_4x4] = x264_pixel_##name##_4x4##cpu;
+#define INIT2_NAME( name1, name2, cpu ) \
+ pixf->name1[PIXEL_16x16] = x264_pixel_##name2##_16x16##cpu;\
+ pixf->name1[PIXEL_16x8] = x264_pixel_##name2##_16x8##cpu;
+#define INIT4_NAME( name1, name2, cpu ) \
+ INIT2_NAME( name1, name2, cpu ) \
+ pixf->name1[PIXEL_8x16] = x264_pixel_##name2##_8x16##cpu;\
+ pixf->name1[PIXEL_8x8] = x264_pixel_##name2##_8x8##cpu;
+#define INIT5_NAME( name1, name2, cpu ) \
+ INIT4_NAME( name1, name2, cpu ) \
+ pixf->name1[PIXEL_8x4] = x264_pixel_##name2##_8x4##cpu;
+#define INIT7_NAME( name1, name2, cpu ) \
+ INIT5_NAME( name1, name2, cpu ) \
+ pixf->name1[PIXEL_4x8] = x264_pixel_##name2##_4x8##cpu;\
+ pixf->name1[PIXEL_4x4] = x264_pixel_##name2##_4x4##cpu;
+#define INIT2( name, cpu ) INIT2_NAME( name, name, cpu )
+#define INIT4( name, cpu ) INIT4_NAME( name, name, cpu )
+#define INIT5( name, cpu ) INIT5_NAME( name, name, cpu )
+#define INIT7( name, cpu ) INIT7_NAME( name, name, cpu )
#define INIT_ADS( cpu ) \
pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\
@@ -523,6 +597,7 @@
pixf->ads[PIXEL_8x8] = x264_pixel_ads1##cpu;
INIT7( sad, );
+ INIT7_NAME( sad_aligned, sad, );
INIT7( sad_x3, );
INIT7( sad_x4, );
INIT7( ssd, );
@@ -530,8 +605,12 @@
INIT7( satd_x3, );
INIT7( satd_x4, );
INIT4( sa8d, );
+ INIT4( hadamard_ac, );
INIT_ADS( );
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16;
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8;
+
pixf->ssim_4x4x2_core = ssim_4x4x2_core;
pixf->ssim_end4 = ssim_end4;
@@ -544,13 +623,16 @@
if( cpu&X264_CPU_MMXEXT )
{
INIT7( sad, _mmxext );
+ INIT7_NAME( sad_aligned, sad, _mmxext );
INIT7( sad_x3, _mmxext );
INIT7( sad_x4, _mmxext );
INIT7( satd, _mmxext );
INIT7( satd_x3, _mmxext );
INIT7( satd_x4, _mmxext );
+ INIT4( hadamard_ac, _mmxext );
INIT_ADS( _mmxext );
-
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmxext;
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmxext;
#ifdef ARCH_X86
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext;
@@ -582,6 +664,7 @@
}
#endif
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext;
+ pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmxext;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext;
}
@@ -591,8 +674,10 @@
INIT2( sad, _sse2 );
INIT2( sad_x3, _sse2 );
INIT2( sad_x4, _sse2 );
+ INIT4( hadamard_ac, _sse2 );
INIT_ADS( _sse2 );
-
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
+ pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
#ifdef ARCH_X86
if( cpu&X264_CPU_CACHELINE_64 )
{
@@ -608,6 +693,8 @@
INIT5( satd, _sse2 );
INIT5( satd_x3, _sse2 );
INIT5( satd_x4, _sse2 );
+ INIT2_NAME( sad_aligned, sad, _sse2_aligned );
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2;
pixf->ssim_end4 = x264_pixel_ssim_end4_sse2;
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
@@ -629,10 +716,12 @@
INIT7( satd, _ssse3 );
INIT7( satd_x3, _ssse3 );
INIT7( satd_x4, _ssse3 );
+ INIT4( hadamard_ac, _ssse3 );
INIT_ADS( _ssse3 );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
+ pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_ssse3;
#ifdef ARCH_X86_64
|
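Note on the hadamard_ac addition above: pixel_hadamard_ac packs two AC sums into one uint64_t (high 32 bits = 8x8-transform sum, low 32 bits = 4x4-transform sum), and the HADAMARD_AC wrapper rescales them (>>2 and >>1) before repacking. A minimal sketch of how a caller could unpack the packed return value; the helper name is illustrative, not from the patch:

    #include <stdint.h>

    /* Unpack the value returned by the hadamard_ac functions: the high 32 bits
     * carry the 8x8 Hadamard AC sum, the low 32 bits the 4x4 Hadamard AC sum,
     * already scaled by the shifts in the HADAMARD_AC macro. */
    static inline void unpack_hadamard_ac( uint64_t packed, uint32_t *ac8, uint32_t *ac4 )
    {
        *ac8 = (uint32_t)(packed >> 32);
        *ac4 = (uint32_t)packed;
    }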
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/pixel.h
^
|
@@ -68,9 +68,14 @@
x264_pixel_cmp_t ssim[7];
x264_pixel_cmp_t sa8d[4];
x264_pixel_cmp_t mbcmp[7]; /* either satd or sad for subpel refine and mode decision */
+ x264_pixel_cmp_t mbcmp_unaligned[7]; /* unaligned mbcmp for subpel */
x264_pixel_cmp_t fpelcmp[7]; /* either satd or sad for fullpel motion search */
x264_pixel_cmp_x3_t fpelcmp_x3[7];
x264_pixel_cmp_x4_t fpelcmp_x4[7];
+ x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
+
+ int (*var[4])( uint8_t *pix, int stride, uint32_t *sad );
+ uint64_t (*hadamard_ac[4])( uint8_t *pix, int stride );
void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1,
const uint8_t *pix2, int stride2, int sums[2][4] );
@@ -87,12 +92,14 @@
int (*ads[7])( int enc_dc[4], uint16_t *sums, int delta,
uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );
- /* calculate satd of V, H, and DC modes.
+ /* calculate satd or sad of V, H, and DC modes.
* may be NULL, in which case just use pred+satd instead. */
- void (*intra_satd_x3_16x16)( uint8_t *fenc, uint8_t *fdec, int res[3] );
- void (*intra_satd_x3_8x8c)( uint8_t *fenc, uint8_t *fdec, int res[3] );
- void (*intra_satd_x3_4x4)( uint8_t *fenc, uint8_t *fdec, int res[3] );
- void (*intra_sa8d_x3_8x8)( uint8_t *fenc, uint8_t edge[33], int res[3] );
+ void (*intra_mbcmp_x3_16x16)( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_satd_x3_16x16) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_sad_x3_16x16) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_satd_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_satd_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_sa8d_x3_8x8) ( uint8_t *fenc, uint8_t edge[33], int res[3] );
} x264_pixel_function_t;
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/predict.c
^
|
@@ -27,9 +27,6 @@
#include "common.h"
-#ifdef _MSC_VER
-#undef HAVE_MMX /* not finished now */
-#endif
#ifdef HAVE_MMX
# include "x86/predict.h"
#endif
@@ -646,7 +643,7 @@
SRC(5,0)=SRC(6,1)=SRC(7,2)= F2(t3,t4,t5);
SRC(6,0)=SRC(7,1)= F2(t4,t5,t6);
SRC(7,0)= F2(t5,t6,t7);
-
+
}
static void predict_8x8_vr( uint8_t *src, uint8_t edge[33] )
{
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/quant.c
^
|
@@ -194,7 +194,7 @@
}
}
-void x264_denoise_dct_core( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
+static void x264_denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
{
int i;
for( i=1; i<size; i++ )
@@ -218,7 +218,7 @@
pf->dequant_4x4 = dequant_4x4;
pf->dequant_8x8 = dequant_8x8;
- pf->denoise_dct_core = x264_denoise_dct_core;
+ pf->denoise_dct = x264_denoise_dct;
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMX )
@@ -233,7 +233,7 @@
pf->dequant_4x4 = x264_dequant_4x4_flat16_mmx;
pf->dequant_8x8 = x264_dequant_8x8_flat16_mmx;
}
- pf->denoise_dct_core = x264_denoise_dct_core_mmx;
+ pf->denoise_dct = x264_denoise_dct_mmx;
#endif
}
@@ -257,7 +257,7 @@
pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2;
pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
}
- pf->denoise_dct_core = x264_denoise_dct_core_sse2;
+ pf->denoise_dct = x264_denoise_dct_sse2;
}
if( cpu&X264_CPU_SSSE3 )
@@ -266,7 +266,7 @@
pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
pf->quant_4x4 = x264_quant_4x4_ssse3;
pf->quant_8x8 = x264_quant_8x8_ssse3;
- pf->denoise_dct_core = x264_denoise_dct_core_ssse3;
+ pf->denoise_dct = x264_denoise_dct_ssse3;
}
#endif // HAVE_MMX
|
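The quant.c hunk above only shows the rename (denoise_dct_core becomes a static denoise_dct) and the start of its loop. For orientation, a rough sketch of what a dead-zone DCT denoiser with this signature does; the body below is an illustration, not copied from the patch:

    #include <stdint.h>
    #include <stdlib.h>

    /* Illustrative sketch: accumulate each AC coefficient's magnitude into sum[]
     * and shrink it toward zero by offset[].  DC (i=0) is skipped, matching the
     * i=1 loop start visible in the hunk. */
    static void denoise_dct_sketch( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
    {
        int i;
        for( i = 1; i < size; i++ )
        {
            int level = abs( dct[i] );
            sum[i] += level;
            level -= offset[i];
            if( level < 0 )
                level = 0;
            dct[i] = dct[i] < 0 ? -level : level;
        }
    }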
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/quant.h
^
|
@@ -33,7 +33,7 @@
void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
- void (*denoise_dct_core)( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
+ void (*denoise_dct)( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
} x264_quant_function_t;
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/set.c
^
|
@@ -75,7 +75,7 @@
int quant8_mf[2][6][8][8];
int q, i, j, i_list;
int deadzone[4] = { 32 - h->param.analyse.i_luma_deadzone[1],
- 32 - h->param.analyse.i_luma_deadzone[0],
+ 32 - h->param.analyse.i_luma_deadzone[0],
32 - 11, 32 - 21 };
int max_qp_err = -1;
@@ -195,7 +195,7 @@
}
}
-int x264_cqm_parse_jmlist( x264_t *h, const char *buf, const char *name,
+static int x264_cqm_parse_jmlist( x264_t *h, const char *buf, const char *name,
uint8_t *cqm, const uint8_t *jvt, int length )
{
char *p;
@@ -247,7 +247,7 @@
int b_error = 0;
h->param.i_cqm_preset = X264_CQM_CUSTOM;
-
+
buf = x264_slurp_file( filename );
if( !buf )
{
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/set.h
^
|
@@ -28,11 +28,12 @@
{
PROFILE_BASELINE = 66,
PROFILE_MAIN = 77,
- PROFILE_EXTENTED = 88,
+ PROFILE_EXTENDED = 88,
PROFILE_HIGH = 100,
PROFILE_HIGH10 = 110,
PROFILE_HIGH422 = 122,
- PROFILE_HIGH444 = 144
+ PROFILE_HIGH444 = 144,
+ PROFILE_HIGH444_PREDICTIVE = 244,
};
enum cqm4_e
@@ -94,7 +95,7 @@
int b_aspect_ratio_info_present;
int i_sar_width;
int i_sar_height;
-
+
int b_overscan_info_present;
int b_overscan_info;
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/cabac-a.asm
^
|
@@ -63,20 +63,13 @@
endstruc
%macro LOAD_GLOBAL 4
-%ifdef PIC64
+%ifdef PIC
; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
lea r11, [%2 GLOBAL]
%ifnidn %3, 0
add r11, %3
%endif
movzx %1, byte [r11+%4]
-%elifdef PIC32
- %ifnidn %3, 0
- lea %1, [%3+%4]
- movzx %1, byte [%2+%1 GLOBAL]
- %else
- movzx %1, byte [%2+%3+%4 GLOBAL]
- %endif
%else
movzx %1, byte [%2+%3+%4]
%endif
@@ -85,7 +78,6 @@
cglobal x264_cabac_encode_decision_asm, 0,7
movifnidn t0d, r0m
movifnidn t1d, r1m
- picgetgot t2
mov t5d, [r0+cb.range]
movzx t3d, byte [r0+cb.state+t1]
mov t4d, t5d
@@ -95,22 +87,13 @@
sub t4d, t5d
mov t6d, t3d
shr t6d, 6
-%ifdef PIC32
- cmp t6d, r2m
-%else
movifnidn t2d, r2m
cmp t6d, t2d
-%endif
mov t6d, [r0+cb.low]
lea t7, [t6+t4]
cmovne t4d, t5d
cmovne t6d, t7d
-%ifdef PIC32
- mov t1, r2m
- LOAD_GLOBAL t3d, x264_cabac_transition, t1, t3*2
-%else
LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2
-%endif
movifnidn t1d, r1m
mov [r0+cb.state+t1], t3b
.renorm:
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/dct-32.asm
^
|
@@ -32,21 +32,6 @@
SECTION .text
-%macro SBUTTERFLY 4
- mova m%4, m%2
- punpckl%1 m%2, m%3
- punpckh%1 m%4, m%3
- SWAP %3, %4
-%endmacro
-
-%macro TRANSPOSE4x4W 5
- SBUTTERFLY wd, %1, %2, %5
- SBUTTERFLY wd, %3, %4, %5
- SBUTTERFLY dq, %1, %3, %5
- SBUTTERFLY dq, %2, %4, %5
- SWAP %2, %3
-%endmacro
-
; in: m0..m7
; out: 0,4,6 in mem, rest in regs
%macro DCT8_1D 9
@@ -359,31 +344,6 @@
INIT_XMM
-; in: m0..m7, except m6 which is in [%9+0x60]
-; out: m0..m7, except m4 which is in [%9+0x40]
-%macro TRANSPOSE8x8W 9
- SBUTTERFLY wd, %1, %2, %7
- movdqa [%9+16], m%2
- movdqa m%7, [%9+0x60]
- SBUTTERFLY wd, %3, %4, %2
- SBUTTERFLY wd, %5, %6, %2
- SBUTTERFLY wd, %7, %8, %2
- SBUTTERFLY dq, %1, %3, %2
- movdqa [%9], m%3
- movdqa m%2, [%9+16]
- SBUTTERFLY dq, %2, %4, %3
- SBUTTERFLY dq, %5, %7, %3
- SBUTTERFLY dq, %6, %8, %3
- SBUTTERFLY qdq, %1, %5, %3
- SBUTTERFLY qdq, %2, %6, %3
- movdqa [%9+0x40], m%2
- movdqa m%3, [%9]
- SBUTTERFLY qdq, %3, %7, %2
- SBUTTERFLY qdq, %4, %8, %2
- SWAP %2, %5
- SWAP %4, %7
-%endmacro
-
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
@@ -402,7 +362,7 @@
UNSPILL r0, 0
DCT8_1D 0,1,2,3,4,5,6,7,r0
UNSPILL r0, 0,4
- TRANSPOSE8x8W 0,1,2,3,4,5,6,7,r0
+ TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r0+0x60],[r0+0x40],1
UNSPILL r0, 4
DCT8_1D 0,1,2,3,4,5,6,7,r0
SPILL r0, 1,2,3,5,7
@@ -417,8 +377,7 @@
UNSPILL r1, 1,2,3,5,6,7
IDCT8_1D 0,1,2,3,4,5,6,7,r1
SPILL r1, 6
- TRANSPOSE8x8W 0,1,2,3,4,5,6,7,r1
- picgetgot edx
+ TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r1+0x60],[r1+0x40],1
paddw m0, [pw_32 GLOBAL]
SPILL r1, 0
IDCT8_1D 0,1,2,3,4,5,6,7,r1
|
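The local SBUTTERFLY/TRANSPOSE macros are dropped from dct-32.asm here, presumably consolidated into the shared x86util.asm include. As a reminder of what one SBUTTERFLY step does, a rough C model for the 4-word (MMX) case; the type and function names are illustrative only:

    #include <stdint.h>

    typedef struct { uint16_t w[4]; } mmreg;   /* one 64-bit MMX register as 4 words */

    /* Model of SBUTTERFLY wd: punpcklwd/punpckhwd interleave two registers; after
     * the SWAP, 'a' holds the interleaved low words and 'b' the interleaved high
     * words.  Two wd passes followed by two dq passes build TRANSPOSE4x4W. */
    static void sbutterfly_wd( mmreg *a, mmreg *b )
    {
        mmreg lo = {{ a->w[0], b->w[0], a->w[1], b->w[1] }};
        mmreg hi = {{ a->w[2], b->w[2], a->w[3], b->w[3] }};
        *a = lo;
        *b = hi;
    }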
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/dct-64.asm
^
|
@@ -29,35 +29,8 @@
pw_32: times 8 dw 32
SECTION .text
-
INIT_XMM
-%macro SBUTTERFLY 4
- mova m%4, m%2
- punpckl%1 m%2, m%3
- punpckh%1 m%4, m%3
- SWAP %3, %4
-%endmacro
-
-%macro TRANSPOSE8x8W 9
- SBUTTERFLY wd, %1, %2, %9
- SBUTTERFLY wd, %3, %4, %9
- SBUTTERFLY wd, %5, %6, %9
- SBUTTERFLY wd, %7, %8, %9
- SBUTTERFLY dq, %1, %3, %9
- SBUTTERFLY dq, %2, %4, %9
- SBUTTERFLY dq, %5, %7, %9
- SBUTTERFLY dq, %6, %8, %9
- SBUTTERFLY qdq, %1, %5, %9
- SBUTTERFLY qdq, %2, %6, %9
- SBUTTERFLY qdq, %3, %7, %9
- SBUTTERFLY qdq, %4, %8, %9
- SWAP %2, %5
- SWAP %4, %7
-%endmacro
-
-SECTION .text
-
%macro DCT8_1D 10
SUMSUB_BA m%8, m%1 ; %8=s07, %1=d07
SUMSUB_BA m%7, m%2 ; %7=s16, %2=d16
@@ -151,7 +124,7 @@
paddw m%9, m%2
paddw m%9, m%4
paddw m%9, m%6 ; %9=a7
-
+
movdqa m%10, m%6
psraw m%10, 1
paddw m%10, m%6
@@ -208,7 +181,7 @@
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
paddw m0, [pw_32 GLOBAL] ; rounding for the >>6 at the end
IDCT8_1D 0,1,2,3,4,5,6,7,8,9
-
+
pxor m9, m9
STORE_DIFF m0, m8, m9, [r0+0*FDEC_STRIDE]
STORE_DIFF m1, m8, m9, [r0+1*FDEC_STRIDE]
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/dct-a.asm
^
|
@@ -28,34 +28,12 @@
SECTION_RODATA
pw_1: times 8 dw 1
pw_32: times 8 dw 32
-pb_zigzag4: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
+pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
+pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
+pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
SECTION .text
-%macro SBUTTERFLY 4
- mova m%4, m%2
- punpckl%1 m%2, m%3
- punpckh%1 m%4, m%3
- SWAP %3, %4
-%endmacro
-
-%macro TRANSPOSE4x4W 5
- SBUTTERFLY wd, %1, %2, %5
- SBUTTERFLY wd, %3, %4, %5
- SBUTTERFLY dq, %1, %3, %5
- SBUTTERFLY dq, %2, %4, %5
- SWAP %2, %3
-%endmacro
-
-%macro TRANSPOSE2x4x4W 5
- SBUTTERFLY wd, %1, %2, %5
- SBUTTERFLY wd, %3, %4, %5
- SBUTTERFLY dq, %1, %3, %5
- SBUTTERFLY dq, %2, %4, %5
- SBUTTERFLY qdq, %1, %2, %5
- SBUTTERFLY qdq, %3, %4, %5
-%endmacro
-
%macro HADAMARD4_1D 4
SUMSUB_BADC m%2, m%1, m%4, m%3
SUMSUB_BADC m%4, m%2, m%3, m%1
@@ -65,7 +43,7 @@
;-----------------------------------------------------------------------------
; void x264_dct4x4dc_mmx( int16_t d[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_dct4x4dc_mmx, 1,1,1
+cglobal x264_dct4x4dc_mmx, 1,1
movq m0, [r0+ 0]
movq m1, [r0+ 8]
movq m2, [r0+16]
@@ -143,7 +121,7 @@
;-----------------------------------------------------------------------------
; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_add4x4_idct_mmx, 2,2,1
+cglobal x264_add4x4_idct_mmx, 2,2
.skip_prologue:
movq m0, [r1+ 0]
movq m1, [r1+ 8]
@@ -179,7 +157,7 @@
movhps [r0+56], m3
ret
-cglobal x264_add8x8_idct_sse2, 2,2,1
+cglobal x264_add8x8_idct_sse2, 2,2
.skip_prologue:
call .8x4
add r1, 64
@@ -221,7 +199,7 @@
; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6
-cglobal %1, 2,2,1
+cglobal %1, 2,2
.skip_prologue:
call %2
add r0, %4-%5-%6*FDEC_STRIDE
@@ -257,7 +235,264 @@
SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 0
+;-----------------------------------------------------------------------------
+; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
+;-----------------------------------------------------------------------------
+%macro SCAN_8x8 1
+cglobal x264_zigzag_scan_8x8_frame_%1, 2,2
+ movdqa xmm0, [r1]
+ movdqa xmm1, [r1+16]
+ movdq2q mm0, xmm0
+ PALIGNR xmm1, xmm1, 14, xmm2
+ movdq2q mm1, xmm1
+
+ movdqa xmm2, [r1+32]
+ movdqa xmm3, [r1+48]
+ PALIGNR xmm2, xmm2, 12, xmm4
+ movdq2q mm2, xmm2
+ PALIGNR xmm3, xmm3, 10, xmm4
+ movdq2q mm3, xmm3
+
+ punpckhwd xmm0, xmm1
+ punpckhwd xmm2, xmm3
+
+ movq mm4, mm1
+ movq mm5, mm1
+ movq mm6, mm2
+ movq mm7, mm3
+ punpckhwd mm1, mm0
+ psllq mm0, 16
+ psrlq mm3, 16
+ punpckhdq mm1, mm1
+ punpckhdq mm2, mm0
+ punpcklwd mm0, mm4
+ punpckhwd mm4, mm3
+ punpcklwd mm4, mm2
+ punpckhdq mm0, mm2
+ punpcklwd mm6, mm3
+ punpcklwd mm5, mm7
+ punpcklwd mm5, mm6
+
+ movdqa xmm4, [r1+64]
+ movdqa xmm5, [r1+80]
+ movdqa xmm6, [r1+96]
+ movdqa xmm7, [r1+112]
+
+ movq [r0+2*00], mm0
+ movq [r0+2*04], mm4
+ movd [r0+2*08], mm1
+ movq [r0+2*36], mm5
+ movq [r0+2*46], mm6
+
+ PALIGNR xmm4, xmm4, 14, xmm3
+ movdq2q mm4, xmm4
+ PALIGNR xmm5, xmm5, 12, xmm3
+ movdq2q mm5, xmm5
+ PALIGNR xmm6, xmm6, 10, xmm3
+ movdq2q mm6, xmm6
+%ifidn %1, ssse3
+ PALIGNR xmm7, xmm7, 8, xmm3
+ movdq2q mm7, xmm7
+%else
+ movhlps xmm3, xmm7
+ movlhps xmm7, xmm7
+ movdq2q mm7, xmm3
+%endif
+
+ punpckhwd xmm4, xmm5
+ punpckhwd xmm6, xmm7
+ movq mm0, mm4
+ movq mm1, mm5
+ movq mm3, mm7
+ punpcklwd mm7, mm6
+ psrlq mm6, 16
+ punpcklwd mm4, mm6
+ punpcklwd mm5, mm4
+ punpckhdq mm4, mm3
+ punpcklwd mm3, mm6
+ punpckhwd mm3, mm4
+ punpckhwd mm0, mm1
+ punpckldq mm4, mm0
+ punpckhdq mm0, mm6
+ pshufw mm4, mm4, 0x6c
+
+ movq [r0+2*14], mm4
+ movq [r0+2*25], mm0
+ movd [r0+2*54], mm7
+ movq [r0+2*56], mm5
+ movq [r0+2*60], mm3
+
+ movdqa xmm3, xmm0
+ movdqa xmm7, xmm4
+ punpckldq xmm0, xmm2
+ punpckldq xmm4, xmm6
+ punpckhdq xmm3, xmm2
+ punpckhdq xmm7, xmm6
+ pshufhw xmm0, xmm0, 0x1b
+ pshuflw xmm4, xmm4, 0x1b
+ pshufhw xmm3, xmm3, 0x1b
+ pshuflw xmm7, xmm7, 0x1b
+
+ movlps [r0+2*10], xmm0
+ movhps [r0+2*17], xmm0
+ movlps [r0+2*21], xmm3
+ movlps [r0+2*28], xmm4
+ movhps [r0+2*32], xmm3
+ movhps [r0+2*39], xmm4
+ movlps [r0+2*43], xmm7
+ movhps [r0+2*50], xmm7
+
+ RET
+%endmacro
+
+INIT_XMM
+%define PALIGNR PALIGNR_MMX
+SCAN_8x8 sse2
+%define PALIGNR PALIGNR_SSSE3
+SCAN_8x8 ssse3
+
+;-----------------------------------------------------------------------------
+; void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[8][8] )
+;-----------------------------------------------------------------------------
+cglobal x264_zigzag_scan_8x8_frame_mmxext, 2,2
+ movq mm0, [r1]
+ movq mm1, [r1+2*8]
+ movq mm2, [r1+2*14]
+ movq mm3, [r1+2*21]
+ movq mm4, [r1+2*28]
+ movq mm5, mm0
+ movq mm6, mm1
+ psrlq mm0, 16
+ punpckldq mm1, mm1
+ punpcklwd mm5, mm6
+ punpckhwd mm1, mm3
+ punpckhwd mm6, mm0
+ punpckldq mm5, mm0
+ movq mm7, [r1+2*52]
+ movq mm0, [r1+2*60]
+ punpckhwd mm1, mm2
+ punpcklwd mm2, mm4
+ punpckhwd mm4, mm3
+ punpckldq mm3, mm3
+ punpckhwd mm3, mm2
+ movq [r0], mm5
+ movq [r0+2*4], mm1
+ movq [r0+2*8], mm6
+ punpcklwd mm6, mm0
+ punpcklwd mm6, mm7
+ movq mm1, [r1+2*32]
+ movq mm5, [r1+2*39]
+ movq mm2, [r1+2*46]
+ movq [r0+2*35], mm3
+ movq [r0+2*47], mm4
+ punpckhwd mm7, mm0
+ psllq mm0, 16
+ movq mm3, mm5
+ punpcklwd mm5, mm1
+ punpckhwd mm1, mm2
+ punpckhdq mm3, mm3
+ movq [r0+2*52], mm6
+ movq [r0+2*13], mm5
+ movq mm4, [r1+2*11]
+ movq mm6, [r1+2*25]
+ punpcklwd mm5, mm7
+ punpcklwd mm1, mm3
+ punpckhdq mm0, mm7
+ movq mm3, [r1+2*4]
+ movq mm7, [r1+2*18]
+ punpcklwd mm2, mm5
+ movq [r0+2*25], mm1
+ movq mm1, mm4
+ movq mm5, mm6
+ punpcklwd mm4, mm3
+ punpcklwd mm6, mm7
+ punpckhwd mm1, mm3
+ punpckhwd mm5, mm7
+ movq mm3, mm6
+ movq mm7, mm5
+ punpckldq mm6, mm4
+ punpckldq mm5, mm1
+ punpckhdq mm3, mm4
+ punpckhdq mm7, mm1
+ movq mm4, [r1+2*35]
+ movq mm1, [r1+2*49]
+ pshufw mm6, mm6, 0x1b
+ pshufw mm5, mm5, 0x1b
+ movq [r0+2*60], mm0
+ movq [r0+2*56], mm2
+ movq mm0, [r1+2*42]
+ movq mm2, [r1+2*56]
+ movq [r0+2*17], mm3
+ movq [r0+2*32], mm7
+ movq [r0+2*10], mm6
+ movq [r0+2*21], mm5
+ movq mm3, mm0
+ movq mm7, mm2
+ punpcklwd mm0, mm4
+ punpcklwd mm2, mm1
+ punpckhwd mm3, mm4
+ punpckhwd mm7, mm1
+ movq mm4, mm2
+ movq mm1, mm7
+ punpckhdq mm2, mm0
+ punpckhdq mm7, mm3
+ punpckldq mm4, mm0
+ punpckldq mm1, mm3
+ pshufw mm2, mm2, 0x1b
+ pshufw mm7, mm7, 0x1b
+ movq [r0+2*28], mm4
+ movq [r0+2*43], mm1
+ movq [r0+2*39], mm2
+ movq [r0+2*50], mm7
+ RET
+
+;-----------------------------------------------------------------------------
+; void x264_zigzag_scan_4x4_frame_mmx( int16_t level[16], int16_t dct[4][4] )
+;-----------------------------------------------------------------------------
+cglobal x264_zigzag_scan_4x4_frame_mmx, 2,2
+ movq mm0, [r1]
+ movq mm1, [r1+8]
+ movq mm2, [r1+16]
+ movq mm3, [r1+24]
+ movq mm4, mm0
+ movq mm5, mm1
+ movq mm6, mm2
+ movq mm7, mm3
+ psllq mm3, 16
+ psrlq mm0, 16
+ punpckldq mm2, mm2
+ punpckhdq mm1, mm1
+ punpcklwd mm4, mm5
+ punpcklwd mm5, mm3
+ punpckldq mm4, mm0
+ punpckhwd mm5, mm2
+ punpckhwd mm0, mm6
+ punpckhwd mm6, mm7
+ punpcklwd mm1, mm0
+ punpckhdq mm3, mm6
+ movq [r0], mm4
+ movq [r0+8], mm5
+ movq [r0+16], mm1
+ movq [r0+24], mm3
+ RET
+
+;-----------------------------------------------------------------------------
+; void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[4][4] )
+;-----------------------------------------------------------------------------
+cglobal x264_zigzag_scan_4x4_frame_ssse3, 2,2
+ movdqa xmm1, [r1+16]
+ movdqa xmm0, [r1]
+ pshufb xmm1, [pb_scan4frameb GLOBAL]
+ pshufb xmm0, [pb_scan4framea GLOBAL]
+ movdqa xmm2, xmm1
+ psrldq xmm1, 6
+ palignr xmm2, xmm0, 6
+ pslldq xmm0, 10
+ palignr xmm1, xmm0, 10
+ movdqa [r0], xmm2
+ movdqa [r0+16], xmm1
+ RET
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] )
@@ -292,14 +527,13 @@
movd [r2+1*FDEC_STRIDE], xmm1
movd [r2+2*FDEC_STRIDE], xmm2
movd [r2+3*FDEC_STRIDE], xmm3
- picgetgot r1
punpckldq xmm0, xmm1
punpckldq xmm2, xmm3
punpckldq xmm4, xmm5
punpckldq xmm6, xmm7
movlhps xmm0, xmm2
movlhps xmm4, xmm6
- movdqa xmm7, [pb_zigzag4 GLOBAL]
+ movdqa xmm7, [pb_sub4frame GLOBAL]
pshufb xmm0, xmm7
pshufb xmm4, xmm7
pxor xmm6, xmm6
|
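For reference when reading the new x264_zigzag_scan_4x4_frame_{mmx,ssse3} routines above: the permutation they implement is the standard 4x4 frame zigzag, the same table that appears as pb_sub4frame. A scalar sketch of the scan (flat-array indexing used for clarity; x264's own C reference is written with a ZIG() macro):

    #include <stdint.h>

    static const uint8_t zigzag4x4_frame[16] =
        { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };

    /* level[i] receives the coefficient at raster position zigzag4x4_frame[i]. */
    static void zigzag_scan_4x4_frame_ref( int16_t level[16], const int16_t dct[16] )
    {
        int i;
        for( i = 0; i < 16; i++ )
            level[i] = dct[ zigzag4x4_frame[i] ];
    }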
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/dct.h
^
|
@@ -24,32 +24,37 @@
#ifndef X264_I386_DCT_H
#define X264_I386_DCT_H
-void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_mmx( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_sse2( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_sse2( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
-
-void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] );
-void x264_add8x8_idct_mmx( uint8_t *p_dst, int16_t dct[4][4][4] );
-void x264_add16x16_idct_mmx( uint8_t *p_dst, int16_t dct[16][4][4] );
-void x264_add8x8_idct_sse2( uint8_t *p_dst, int16_t dct[4][4][4] );
-void x264_add16x16_idct_sse2( uint8_t *p_dst, int16_t dct[16][4][4] );
-
-void x264_dct4x4dc_mmx( int16_t d[4][4] );
-void x264_idct4x4dc_mmx( int16_t d[4][4] );
-
-void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_mmx( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_sse2( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
-
-void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] );
-void x264_add16x16_idct8_mmx( uint8_t *dst, int16_t dct[4][8][8] );
-void x264_add8x8_idct8_sse2( uint8_t *dst, int16_t dct[8][8] );
+void x264_sub4x4_dct_mmx ( int16_t dct[ 4][4] , uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_mmx ( int16_t dct[ 4][4][4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_mmx ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][4][4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_sse2 ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
+
+void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4] );
+void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
+void x264_add16x16_idct_mmx ( uint8_t *p_dst, int16_t dct[16][4][4] );
+void x264_add8x8_idct_sse2 ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
+void x264_add16x16_idct_sse2 ( uint8_t *p_dst, int16_t dct[16][4][4] );
+
+void x264_dct4x4dc_mmx ( int16_t d[4][4] );
+void x264_idct4x4dc_mmx ( int16_t d[4][4] );
+
+void x264_sub8x8_dct8_mmx ( int16_t dct[8][8] , uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_mmx ( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct8_sse2 ( int16_t dct[8][8] , uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_sse2 ( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
+
+void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct[8][8] );
+void x264_add16x16_idct8_mmx ( uint8_t *dst, int16_t dct[4][8][8] );
+void x264_add8x8_idct8_sse2 ( uint8_t *dst, int16_t dct[8][8] );
void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][8][8] );
+void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[8][8] );
+void x264_zigzag_scan_8x8_frame_sse2 ( int16_t level[64], int16_t dct[8][8] );
+void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[8][8] );
+void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[4][4] );
+void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] );
-void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst );
+void x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
#endif
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/deblock-a.asm
^
|
@@ -373,7 +373,7 @@
;-----------------------------------------------------------------------------
; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_%1, 5,5,1
+cglobal x264_deblock_%2_luma_%1, 5,5
lea r4, [r1*3]
dec r2 ; alpha-1
neg r4
@@ -609,7 +609,7 @@
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_intra_%1, 4,6,1
+cglobal x264_deblock_%2_luma_intra_%1, 4,6
%ifndef ARCH_X86_64
sub esp, 0x60
%endif
@@ -813,7 +813,6 @@
movd m6, [r4] ; tc0
punpcklbw m6, m6
pand m7, m6
- picgetgot r4
DEBLOCK_P0_Q0
ret
@@ -862,7 +861,6 @@
LOAD_MASK r2d, r3d
movq m5, m1
movq m6, m2
- picgetgot r2
CHROMA_INTRA_P0 m1, m0, m3
CHROMA_INTRA_P0 m2, m3, m0
psubb m1, m5
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/mc-a.asm
^
|
@@ -36,33 +36,21 @@
SECTION .text
;=============================================================================
-; pixel avg
+; weighted prediction
;=============================================================================
-
-;-----------------------------------------------------------------------------
-; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride,
-; uint8_t *src, int src_stride );
-;-----------------------------------------------------------------------------
-%macro AVGH 3
-%assign function_align 8 ; the whole function fits in 8 bytes, so a larger align just wastes space
-cglobal x264_pixel_avg_%1x%2_%3
- mov eax, %2
- jmp x264_pixel_avg_w%1_%3
-%assign function_align 16
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride,
-; uint8_t *src, int src_stride,
-; int height );
-;-----------------------------------------------------------------------------
+; implicit bipred only:
+; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
%ifdef ARCH_X86_64
%define t0 r0
%define t1 r1
%define t2 r2
%define t3 r3
- %macro AVG_START 1
- cglobal %1, 4,5
+ %define t4 r4
+ %define t5 r5
+ %define t6d r10d
+ %define t7d r11d
+ %macro AVG_START 0
+ PROLOGUE 6,7
.height_loop:
%endmacro
%else
@@ -70,79 +58,228 @@
%define t1 r2
%define t2 r3
%define t3 r4
- %macro AVG_START 1
- cglobal %1, 0,5
+ %define t4 r5
+ %define t5 r6
+ %define t6d r1d
+ %define t7d r2d
+ %macro AVG_START 0
+ PROLOGUE 0,7
mov t0, r0m
mov t1, r1m
mov t2, r2m
mov t3, r3m
+ mov t4, r4m
+ mov t5, r5m
.height_loop:
%endmacro
%endif
+%macro SPLATW 2
+%if mmsize==16
+ pshuflw %1, %2, 0
+ movlhps %1, %1
+%else
+ pshufw %1, %2, 0
+%endif
+%endmacro
+
+%macro BIWEIGHT_MMX 2
+ movh m0, %1
+ movh m1, %2
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m1
+ paddw m0, m6
+ psraw m0, 6
+%endmacro
+
+%macro BIWEIGHT_START_MMX 0
+ movd m4, r6m
+ SPLATW m4, m4 ; weight_dst
+ mova m5, [pw_64 GLOBAL]
+ psubw m5, m4 ; weight_src
+ mova m6, [pw_32 GLOBAL] ; rounding
+ pxor m7, m7
+%endmacro
+
+%macro BIWEIGHT_SSSE3 2
+ movh m0, %1
+ movh m1, %2
+ punpcklbw m0, m1
+ pmaddubsw m0, m5
+ paddw m0, m6
+ psraw m0, 6
+%endmacro
+
+%macro BIWEIGHT_START_SSSE3 0
+ movzx t6d, byte r6m ; FIXME x86_64
+ mov t7d, 64
+ sub t7d, t6d
+ shl t7d, 8
+ add t6d, t7d
+ movd m5, t6d
+ mova m6, [pw_32 GLOBAL]
+ SPLATW m5, m5 ; weight_dst,src
+%endmacro
+
+%macro BIWEIGHT_ROW 4
+ BIWEIGHT [%2], [%3]
+%if %4==mmsize/2
+ packuswb m0, m0
+ movh [%1], m0
+%else
+ SWAP 0, 2
+ BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
+ packuswb m2, m0
+ mova [%1], m2
+%endif
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight )
+;-----------------------------------------------------------------------------
+%macro AVG_WEIGHT 2
+cglobal x264_pixel_avg_weight_w%2_%1, 0,0
+ BIWEIGHT_START
+ AVG_START
+%if %2==8 && mmsize==16
+ BIWEIGHT [t2], [t4]
+ SWAP 0, 2
+ BIWEIGHT [t2+t3], [t4+t5]
+ packuswb m2, m0
+ movlps [t0], m2
+ movhps [t0+t1], m2
+%else
+%assign x 0
+%rep 1+%2/(mmsize*2)
+ BIWEIGHT_ROW t0+x, t2+x, t4+x, %2
+ BIWEIGHT_ROW t0+x+t1, t2+x+t3, t4+x+t5, %2
+%assign x x+mmsize
+%endrep
+%endif
+ lea t0, [t0+t1*2]
+ lea t2, [t2+t3*2]
+ lea t4, [t4+t5*2]
+ sub eax, 2
+ jg .height_loop
+ REP_RET
+%endmacro
+
+%define BIWEIGHT BIWEIGHT_MMX
+%define BIWEIGHT_START BIWEIGHT_START_MMX
+INIT_MMX
+AVG_WEIGHT mmxext, 4
+AVG_WEIGHT mmxext, 8
+AVG_WEIGHT mmxext, 16
+INIT_XMM
+%define x264_pixel_avg_weight_w4_sse2 x264_pixel_avg_weight_w4_mmxext
+AVG_WEIGHT sse2, 8
+AVG_WEIGHT sse2, 16
+%define BIWEIGHT BIWEIGHT_SSSE3
+%define BIWEIGHT_START BIWEIGHT_START_SSSE3
+INIT_MMX
+AVG_WEIGHT ssse3, 4
+INIT_XMM
+AVG_WEIGHT ssse3, 8
+AVG_WEIGHT ssse3, 16
+
+
+
+;=============================================================================
+; pixel avg
+;=============================================================================
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride,
+; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, int weight );
+;-----------------------------------------------------------------------------
+%macro AVGH 3
+cglobal x264_pixel_avg_%1x%2_%3,0,0
+ mov eax, %2
+ cmp dword r6m, 32
+ jne x264_pixel_avg_weight_w%1_%3
+%if mmsize == 16 && %1 == 16
+ test dword r4m, 15
+ jz x264_pixel_avg_w%1_sse2
+%endif
+ jmp x264_pixel_avg_w%1_mmxext
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride,
+; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride,
+; int height, int weight );
+;-----------------------------------------------------------------------------
+
%macro AVG_END 0
sub eax, 2
+ lea t4, [t4+t5*2]
lea t2, [t2+t3*2]
lea t0, [t0+t1*2]
jg .height_loop
REP_RET
%endmacro
-AVG_START x264_pixel_avg_w4_mmxext
- movd mm0, [t2]
- movd mm1, [t2+t3]
- pavgb mm0, [t0]
- pavgb mm1, [t0+t1]
- movd [t0], mm0
- movd [t0+t1], mm1
-AVG_END
+%macro AVG_FUNC 3
+cglobal %1
+ AVG_START
+ %2 m0, [t2]
+ %2 m1, [t2+t3]
+ pavgb m0, [t4]
+ pavgb m1, [t4+t5]
+ %3 [t0], m0
+ %3 [t0+t1], m1
+ AVG_END
+%endmacro
+INIT_MMX
+AVG_FUNC x264_pixel_avg_w4_mmxext, movd, movd
AVGH 4, 8, mmxext
AVGH 4, 4, mmxext
AVGH 4, 2, mmxext
-AVG_START x264_pixel_avg_w8_mmxext
- movq mm0, [t2]
- movq mm1, [t2+t3]
- pavgb mm0, [t0]
- pavgb mm1, [t0+t1]
- movq [t0], mm0
- movq [t0+t1], mm1
-AVG_END
-
+AVG_FUNC x264_pixel_avg_w8_mmxext, movq, movq
AVGH 8, 16, mmxext
AVGH 8, 8, mmxext
AVGH 8, 4, mmxext
-AVG_START x264_pixel_avg_w16_mmxext
+cglobal x264_pixel_avg_w16_mmxext
+ AVG_START
movq mm0, [t2 ]
movq mm1, [t2+8]
movq mm2, [t2+t3 ]
movq mm3, [t2+t3+8]
- pavgb mm0, [t0 ]
- pavgb mm1, [t0+8]
- pavgb mm2, [t0+t1 ]
- pavgb mm3, [t0+t1+8]
+ pavgb mm0, [t4 ]
+ pavgb mm1, [t4+8]
+ pavgb mm2, [t4+t5 ]
+ pavgb mm3, [t4+t5+8]
movq [t0 ], mm0
movq [t0+8], mm1
movq [t0+t1 ], mm2
movq [t0+t1+8], mm3
-AVG_END
+ AVG_END
AVGH 16, 16, mmxext
AVGH 16, 8, mmxext
-AVG_START x264_pixel_avg_w16_sse2
- movdqu xmm0, [t2]
- movdqu xmm1, [t2+t3]
- pavgb xmm0, [t0]
- pavgb xmm1, [t0+t1]
- movdqa [t0], xmm0
- movdqa [t0+t1], xmm1
-AVG_END
-
+INIT_XMM
+AVG_FUNC x264_pixel_avg_w16_sse2, movdqu, movdqa
AVGH 16, 16, sse2
-AVGH 16, 8, sse2
+AVGH 16, 8, sse2
+AVGH 8, 16, sse2
+AVGH 8, 8, sse2
+AVGH 8, 4, sse2
+AVGH 16, 16, ssse3
+AVGH 16, 8, ssse3
+AVGH 8, 16, ssse3
+AVGH 8, 8, ssse3
+AVGH 8, 4, ssse3
+INIT_MMX
+AVGH 4, 8, ssse3
+AVGH 4, 4, ssse3
+AVGH 4, 2, ssse3
@@ -284,17 +421,9 @@
%macro INIT_SHIFT 2
and eax, 7
shl eax, 3
-%ifdef PIC32
- ; both versions work, but picgetgot is slower than gpr->mmx is slower than mem->mmx
- mov r2, 64
- sub r2, eax
- movd %2, eax
- movd %1, r2
-%else
movd %1, [sw_64 GLOBAL]
movd %2, eax
psubw %1, %2
-%endif
%endmacro
%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
@@ -316,7 +445,7 @@
INIT_SHIFT mm6, mm7
mov eax, r4m
INIT_SHIFT mm4, mm5
- PROLOGUE 6,6,0
+ PROLOGUE 6,6
and r2, ~7
and r4, ~7
sub r4, r2
@@ -474,102 +603,12 @@
;=============================================================================
-; weighted prediction
-;=============================================================================
-; implicit bipred only:
-; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
-
-%macro SPLATW 2
-%if mmsize==16
- pshuflw %1, %2, 0
- movlhps %1, %1
-%else
- pshufw %1, %2, 0
-%endif
-%endmacro
-
-%macro BIWEIGHT 2
- movh m0, %1
- movh m1, %2
- punpcklbw m0, m7
- punpcklbw m1, m7
- pmullw m0, m4
- pmullw m1, m5
- paddw m0, m1
- paddw m0, m6
- psraw m0, 6
- pmaxsw m0, m7
- packuswb m0, m0
- movh %1, m0
-%endmacro
-
-%macro BIWEIGHT_START 1
-%ifidn r4m, r4d
- movd m4, r4m
- SPLATW m4, m4 ; weight_dst
-%else
- SPLATW m4, r4m
-%endif
- picgetgot r4
- mova m5, [pw_64 GLOBAL]
- psubw m5, m4 ; weight_src
- mova m6, [pw_32 GLOBAL] ; rounding
- pxor m7, m7
-%if %1
-%ifidn r5m, r5d
- %define t0 r5d
-%else
- %define t0 r4d
- mov r4d, r5m
-%endif
-%endif
-.height_loop:
-%endmacro
-
-INIT_MMX
-;-----------------------------------------------------------------------------
-; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src, int, int i_weight, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_avg_weight_4x4_mmxext, 4,4,1
- BIWEIGHT_START 0
- BIWEIGHT [r0 ], [r2 ]
- BIWEIGHT [r0+r1 ], [r2+r3 ]
- BIWEIGHT [r0+r1*2], [r2+r3*2]
- add r0, r1
- add r2, r3
- BIWEIGHT [r0+r1*2], [r2+r3*2]
- RET
-
-%macro AVG_WEIGHT 2
-cglobal x264_pixel_avg_weight_w%2_%1, 4,5
- BIWEIGHT_START 1
-%assign x 0
-%rep %2*2/mmsize
- BIWEIGHT [r0+x], [r2+x]
-%assign x x+mmsize/2
-%endrep
- add r0, r1
- add r2, r3
- dec t0
- jg .height_loop
- REP_RET
-%endmacro
-
-AVG_WEIGHT mmxext, 8
-AVG_WEIGHT mmxext, 16
-INIT_XMM
-AVG_WEIGHT sse2, 8
-AVG_WEIGHT sse2, 16
-
-
-
-;=============================================================================
; prefetch
;=============================================================================
; FIXME assumes 64 byte cachelines
;-----------------------------------------------------------------------------
-; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
+; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
; uint8_t *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
%ifdef ARCH_X86_64
@@ -671,7 +710,7 @@
; int width, int height )
;-----------------------------------------------------------------------------
%macro MC_CHROMA 1
-cglobal x264_mc_chroma_%1, 0,6,1
+cglobal x264_mc_chroma_%1, 0,6
%if mmsize == 16
cmp dword r6m, 4
jle x264_mc_chroma_mmxext %+ .skip_prologue
@@ -833,7 +872,7 @@
MC_CHROMA sse2
INIT_MMX
-cglobal x264_mc_chroma_ssse3, 0,6,1
+cglobal x264_mc_chroma_ssse3, 0,6
MC_CHROMA_START
and r4d, 7
and r5d, 7
|
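A scalar picture of what the new BIWEIGHT path computes may help when reading the macros above. Per the comment in the asm this is the implicit-bipred special case (log2_denom = 5, offset = 0, the two weights sum to 64); the helper below is only a sketch, its name is not from the patch:

    #include <stdint.h>

    /* One output pixel of the weighted average: pw_32 supplies the +32 rounding
     * term, psraw 6 is the >>6, and packuswb provides the final clamp to [0,255]. */
    static inline uint8_t avg_weight_pixel( uint8_t p1, uint8_t p2, int w1 )
    {
        int w2 = 64 - w1;                       /* weights always sum to 64 */
        int v  = ( p1 * w1 + p2 * w2 + 32 ) >> 6;
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }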
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/mc-a2.asm
^
|
@@ -32,12 +32,25 @@
SECTION .text
-%macro LOAD_ADD 3
+%macro LOAD_ADD 4
+ movh %4, %3
movh %1, %2
- movh m7, %3
+ punpcklbw %4, m0
punpcklbw %1, m0
- punpcklbw m7, m0
- paddw %1, m7
+ paddw %1, %4
+%endmacro
+
+%macro LOAD_ADD_2 6
+ mova %5, %3
+ mova %1, %4
+ mova %6, %5
+ mova %2, %1
+ punpcklbw %5, m0
+ punpcklbw %1, m0
+ punpckhbw %6, m0
+ punpckhbw %2, m0
+ paddw %1, %5
+ paddw %2, %6
%endmacro
%macro FILT_V2 0
@@ -64,27 +77,27 @@
paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
%endmacro
-%macro FILT_H2 0
- psubw m1, m2
- psubw m4, m5
- psraw m1, 2
- psraw m4, 2
- psubw m1, m2
- psubw m4, m5
- paddw m1, m3
- paddw m4, m6
- psraw m1, 2
- psraw m4, 2
- paddw m1, m3
- paddw m4, m6
+%macro FILT_H2 6
+ psubw %1, %2
+ psubw %4, %5
+ psraw %1, 2
+ psraw %4, 2
+ psubw %1, %2
+ psubw %4, %5
+ paddw %1, %3
+ paddw %4, %6
+ psraw %1, 2
+ psraw %4, 2
+ paddw %1, %3
+ paddw %4, %6
%endmacro
-%macro FILT_PACK 1
- paddw m1, m7
- paddw m4, m7
- psraw m1, %1
- psraw m4, %1
- packuswb m1, m4
+%macro FILT_PACK 3
+ paddw %1, m7
+ paddw %2, m7
+ psraw %1, %3
+ psraw %2, %3
+ packuswb %1, %2
%endmacro
%macro PALIGNR_MMX 4
@@ -111,7 +124,7 @@
;-----------------------------------------------------------------------------
; void x264_hpel_filter_v_mmxext( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_v_%1, 5,6,1
+cglobal x264_hpel_filter_v_%1, 5,6
lea r5, [r1+r3]
sub r1, r3
sub r1, r3
@@ -120,13 +133,10 @@
neg r4
pxor m0, m0
.loop:
- prefetcht0 [r5+r3*2+64]
- LOAD_ADD m1, [r1 ], [r5+r3*2] ; a0
- LOAD_ADD m2, [r1+r3 ], [r5+r3 ] ; b0
- LOAD_ADD m3, [r1+r3*2], [r5 ] ; c0
- LOAD_ADD m4, [r1 +mmsize/2], [r5+r3*2+mmsize/2] ; a1
- LOAD_ADD m5, [r1+r3 +mmsize/2], [r5+r3 +mmsize/2] ; b1
- LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5 +mmsize/2] ; c1
+ LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1
+ LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1
+ LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0
+ LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
FILT_V2
mova m7, [pw_16 GLOBAL]
mova [r2+r4*2], m1
@@ -136,7 +146,7 @@
psraw m1, 5
psraw m4, 5
packuswb m1, m4
- movnt [r0+r4], m1
+ mova [r0+r4], m1
add r1, mmsize
add r5, mmsize
add r4, mmsize
@@ -148,7 +158,7 @@
;-----------------------------------------------------------------------------
; void x264_hpel_filter_c_mmxext( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_c_mmxext, 3,3,1
+cglobal x264_hpel_filter_c_mmxext, 3,3
add r0, r2
lea r1, [r1+r2*2]
neg r2
@@ -167,8 +177,8 @@
paddw m4, [src+14] ; a1
paddw m5, [src+12] ; b1
paddw m6, [src+10] ; c1
- FILT_H2
- FILT_PACK 6
+ FILT_H2 m1, m2, m3, m4, m5, m6
+ FILT_PACK m1, m4, 6
movntq [r0+r2], m1
add r2, 8
jl .loop
@@ -177,7 +187,7 @@
;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_mmxext( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_h_mmxext, 3,3,1
+cglobal x264_hpel_filter_h_mmxext, 3,3
add r0, r2
add r1, r2
neg r2
@@ -211,8 +221,8 @@
punpcklbw m6, m0
paddw m6, m7 ; a1
movq m7, [pw_1 GLOBAL]
- FILT_H2
- FILT_PACK 1
+ FILT_H2 m1, m2, m3, m4, m5, m6
+ FILT_PACK m1, m4, 1
movntq [r0+r2], m1
add r2, 8
jl .loop
@@ -224,7 +234,7 @@
;-----------------------------------------------------------------------------
; void x264_hpel_filter_c_sse2( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_c_%1, 3,3,1
+cglobal x264_hpel_filter_c_%1, 3,3
add r0, r2
lea r1, [r1+r2*2]
neg r2
@@ -267,7 +277,7 @@
;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_sse2( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_h_sse2, 3,3,1
+cglobal x264_hpel_filter_h_sse2, 3,3
add r0, r2
add r1, r2
neg r2
@@ -305,24 +315,217 @@
punpcklbw m7, m0
paddw m6, m7 ; c1
mova m7, [pw_1 GLOBAL] ; FIXME xmm8
- FILT_H2
- FILT_PACK 1
+ FILT_H2 m1, m2, m3, m4, m5, m6
+ FILT_PACK m1, m4, 1
movntdq [r0+r2], m1
add r2, 16
jl .loop
REP_RET
+;-----------------------------------------------------------------------------
+; void x264_hpel_filter_h_ssse3( uint8_t *dst, uint8_t *src, int width );
+;-----------------------------------------------------------------------------
+cglobal x264_hpel_filter_h_ssse3, 3,3
+ add r0, r2
+ add r1, r2
+ neg r2
+ %define src r1+r2
+ pxor m0, m0
+ movh m1, [src-8]
+ punpcklbw m1, m0 ; 00 -1 00 -2 00 -3 00 -4 00 -5 00 -6 00 -7 00 -8
+ movh m2, [src]
+ punpcklbw m2, m0
+ mova m7, [pw_1 GLOBAL]
+.loop:
+ movh m3, [src+8]
+ punpcklbw m3, m0
+
+ mova m4, m2
+ palignr m2, m1, 14
+ mova m5, m3
+ palignr m3, m4, 4
+ paddw m3, m2
+
+ mova m2, m4
+ palignr m4, m1, 12
+ mova m1, m5
+ palignr m5, m2, 6
+ paddw m5, m4
+
+ mova m4, m1
+ palignr m1, m2, 2
+ paddw m1, m2
+
+ FILT_H m5, m3, m1
+
+ movh m1, [src+16]
+ punpcklbw m1, m0
+
+ mova m3, m4
+ palignr m4, m2, 14
+ mova m6, m1
+ palignr m1, m3, 4
+ paddw m1, m4
+
+ mova m4, m3
+ palignr m3, m2, 12
+ mova m2, m6
+ palignr m6, m4, 6
+ paddw m6, m3
+
+ mova m3, m2
+ palignr m2, m4, 2
+ paddw m2, m4
+
+ FILT_H m6, m1, m2
+ FILT_PACK m5, m6, 1
+ movdqa [r0+r2], m5
+
+ add r2, 16
+ mova m2, m3
+ mova m1, m4
+
+ jl .loop
+ REP_RET
+
+
%define PALIGNR PALIGNR_MMX
HPEL_V sse2
HPEL_C sse2
%define PALIGNR PALIGNR_SSSE3
HPEL_C ssse3
-cglobal x264_sfence
+%ifdef ARCH_X86_64
+
+%macro DO_FILT_V 5
+ LOAD_ADD_2 m1, m4, [r3 ], [r1+r2*2], m2, m5 ; a0 / a1
+ LOAD_ADD_2 m2, m5, [r3+r2 ], [r1+r2 ], m3, m6 ; b0 / b1
+ LOAD_ADD_2 m3, m6, [r3+r2*2], [r1 ], %3, %4 ; c0 / c1
+ FILT_V2
+ mova %1, m1
+ mova %2, m4
+ paddw m1, m15
+ paddw m4, m15
+ add r3, 16
+ add r1, 16
+ psraw m1, 5
+ psraw m4, 5
+ packuswb m1, m4
+ movntps [r11+r4+%5], m1
+%endmacro
+
+%macro DO_FILT_H 4
+ mova m1, %2
+ PALIGNR m1, %1, 12, m4
+ mova m2, %2
+ PALIGNR m2, %1, 14, m4
+ mova %1, %3
+ PALIGNR %3, %2, 6, m4
+ mova m3, %1
+ PALIGNR m3, %2, 4, m4
+ mova m4, %1
+ paddw %3, m1
+ PALIGNR m4, %2, 2, m1
+ paddw m3, m2
+ paddw m4, %2
+ FILT_H %3, m3, m4
+ paddw %3, m15
+ psraw %3, %4
+%endmacro
+
+%macro DO_FILT_CC 4
+ DO_FILT_H %1, %2, %3, 6
+ DO_FILT_H %2, %1, %4, 6
+ packuswb %3, %4
+ movntps [r5+r4], %3
+%endmacro
+
+%macro DO_FILT_HH 4
+ DO_FILT_H %1, %2, %3, 1
+ DO_FILT_H %2, %1, %4, 1
+ packuswb %3, %4
+ movntps [r0+r4], %3
+%endmacro
+
+%macro DO_FILT_H2 6
+ DO_FILT_H %1, %2, %3, 6
+ psrlw m15, 5
+ DO_FILT_H %4, %5, %6, 1
+ packuswb %6, %3
+%endmacro
+
+%macro HPEL 1
+;-----------------------------------------------------------------------------
+; void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
+; uint8_t *src, int stride, int width, int height)
+;-----------------------------------------------------------------------------
+cglobal x264_hpel_filter_%1, 7,7
+ mov r10, r3
+ sub r5, 16
+ mov r11, r1
+ and r10, 15
+ sub r3, r10
+ add r0, r5
+ add r11, r5
+ add r10, r5
+ add r5, r2
+ mov r2, r4
+ neg r10
+ lea r1, [r3+r2]
+ sub r3, r2
+ sub r3, r2
+ mov r4, r10
+ pxor m0, m0
+ pcmpeqw m15, m15
+ psrlw m15, 15 ; pw_1
+ psllw m15, 4
+;ALIGN 16
+.loopy:
+; first filter_v
+; prefetching does not help here! lots of variants tested, all slower
+ DO_FILT_V m8, m7, m13, m12, 0
+;ALIGN 16
+.loopx:
+ DO_FILT_V m6, m5, m11, m10, 16
+.lastx:
+ paddw m15, m15
+ DO_FILT_CC m9, m8, m7, m6
+ movdqa m7, m12 ; not really necessary, but seems free and
+ movdqa m6, m11 ; gives far shorter code
+ psrlw m15, 5
+ DO_FILT_HH m14, m13, m7, m6
+ psllw m15, 4 ; pw_16
+ movdqa m7, m5
+ movdqa m12, m10
+ add r4, 16
+ jl .loopx
+ cmp r4, 16
+ jl .lastx
+; setup regs for next y
+ sub r4, r10
+ sub r4, r2
+ sub r1, r4
+ sub r3, r4
+ add r0, r2
+ add r11, r2
+ add r5, r2
+ mov r4, r10
+ sub r6d, 1
+ jg .loopy
sfence
- ret
+ RET
+%endmacro
+%define PALIGNR PALIGNR_MMX
+HPEL sse2
+%define PALIGNR PALIGNR_SSSE3
+HPEL ssse3
+
+%endif
+cglobal x264_sfence
+ sfence
+ ret
;-----------------------------------------------------------------------------
; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
|
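The half-pel filter rework above (FILT_V2, FILT_H, FILT_PACK and the new unified x264_hpel_filter_{sse2,ssse3}) all implement the usual 6-tap (1,-5,20,20,-5,1) interpolation; the asm factors it as ((a-b)/4-b+c)/4+c per its own comment and splits the rounding/shift between stages. A scalar sketch of one horizontal pass, with names that are illustrative only:

    #include <stdint.h>

    /* 6-tap luma half-pel interpolation for one output sample; the final clamp
     * corresponds to the packuswb saturation in the asm. */
    static inline uint8_t hpel_tap6( const uint8_t *src )
    {
        int v = src[-2] - 5*src[-1] + 20*src[0] + 20*src[1] - 5*src[2] + src[3];
        v = ( v + 16 ) >> 5;
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }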
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/mc-c.c
^
|
@@ -27,29 +27,27 @@
#include <string.h>
#include "common/common.h"
+#include "mc.h"
-/* NASM functions */
-extern void x264_pixel_avg_16x16_sse2( uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_16x8_sse2( uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_16x16_mmxext( uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_16x8_mmxext( uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_8x16_mmxext( uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_8x8_mmxext( uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_8x4_mmxext( uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_4x8_mmxext( uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_4x4_mmxext( uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_4x2_mmxext( uint8_t *, int, uint8_t *, int );
+#define DECL_SUF( func, args )\
+ void func##_mmxext args;\
+ void func##_sse2 args;\
+ void func##_ssse3 args;
+
+DECL_SUF( x264_pixel_avg_16x16, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
+DECL_SUF( x264_pixel_avg_16x8, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
+DECL_SUF( x264_pixel_avg_8x16, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
+DECL_SUF( x264_pixel_avg_8x8, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
+DECL_SUF( x264_pixel_avg_8x4, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
+DECL_SUF( x264_pixel_avg_4x8, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
+DECL_SUF( x264_pixel_avg_4x4, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
+DECL_SUF( x264_pixel_avg_4x2, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int );
-extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );
-extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );
-extern void x264_pixel_avg_weight_w8_sse2( uint8_t *, int, uint8_t *, int, int, int );
-extern void x264_pixel_avg_weight_w16_sse2( uint8_t *, int, uint8_t *, int, int, int );
-extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int );
extern void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_prefetch_ref_mmxext( uint8_t *, int, int );
extern void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
@@ -86,23 +84,6 @@
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
-#define AVG_WEIGHT(W,H,name) \
-void x264_pixel_avg_weight_ ## W ## x ## H ## _##name( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
-{ \
- x264_pixel_avg_weight_w ## W ## _##name( dst, i_dst, src, i_src, i_weight_dst, H ); \
-}
-
-AVG_WEIGHT(16,16,mmxext)
-AVG_WEIGHT(16,8,mmxext)
-AVG_WEIGHT(8,16,mmxext)
-AVG_WEIGHT(8,8,mmxext)
-AVG_WEIGHT(8,4,mmxext)
-AVG_WEIGHT(16,16,sse2)
-AVG_WEIGHT(16,8,sse2)
-AVG_WEIGHT(8,16,sse2)
-AVG_WEIGHT(8,8,sse2)
-AVG_WEIGHT(8,4,sse2)
-
#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\
{\
@@ -143,7 +124,7 @@
static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
#define MC_LUMA(name,instr1,instr2)\
-void mc_luma_##name( uint8_t *dst, int i_dst_stride,\
+static void mc_luma_##name( uint8_t *dst, int i_dst_stride,\
uint8_t *src[4], int i_src_stride,\
int mvx, int mvy,\
int i_width, int i_height )\
@@ -174,7 +155,7 @@
MC_LUMA(cache64_sse2,cache64_sse2,sse2)
#define GET_REF(name)\
-uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
+static uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
uint8_t *src[4], int i_src_stride,\
int mvx, int mvy,\
int i_width, int i_height )\
@@ -210,7 +191,7 @@
void x264_hpel_filter_c_##cpuc( uint8_t *dst, int16_t *buf, int width );\
void x264_hpel_filter_h_##cpuh( uint8_t *dst, uint8_t *src, int width );\
void x264_sfence( void );\
-void x264_hpel_filter_##cpu( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,\
+static void x264_hpel_filter_##cpu( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,\
int stride, int width, int height )\
{\
int16_t *buf;\
@@ -237,14 +218,20 @@
HPEL(8, mmxext, mmxext, mmxext, mmxext)
HPEL(16, sse2_amd, mmxext, mmxext, sse2)
+#ifdef ARCH_X86_64
+void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height );
+void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height );
+#else
HPEL(16, sse2, sse2, sse2, sse2)
-HPEL(16, ssse3, sse2, ssse3, sse2)
+HPEL(16, ssse3, sse2, ssse3, ssse3)
+#endif
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
if( !(cpu&X264_CPU_MMX) )
return;
+ pf->copy_16x16_unaligned = x264_mc_copy_w16_mmx;
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
pf->copy[PIXEL_8x8] = x264_mc_copy_w8_mmx;
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx;
@@ -267,14 +254,6 @@
pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_mmxext;
pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_mmxext;
- pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_mmxext;
- pf->avg_weight[PIXEL_16x8] = x264_pixel_avg_weight_16x8_mmxext;
- pf->avg_weight[PIXEL_8x16] = x264_pixel_avg_weight_8x16_mmxext;
- pf->avg_weight[PIXEL_8x8] = x264_pixel_avg_weight_8x8_mmxext;
- pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_mmxext;
- pf->avg_weight[PIXEL_4x4] = x264_pixel_avg_weight_4x4_mmxext;
- // avg_weight_4x8 is rare and 4x2 is not used
-
pf->plane_copy = x264_plane_copy_mmxext;
pf->hpel_filter = x264_hpel_filter_mmxext;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmxext;
@@ -310,14 +289,9 @@
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
- if( !(cpu&X264_CPU_STACK_MOD4) )
- {
- pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_sse2;
- pf->avg_weight[PIXEL_16x8] = x264_pixel_avg_weight_16x8_sse2;
- pf->avg_weight[PIXEL_8x16] = x264_pixel_avg_weight_8x16_sse2;
- pf->avg_weight[PIXEL_8x8] = x264_pixel_avg_weight_8x8_sse2;
- pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_sse2;
- }
+ pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2;
+ pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2;
+ pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2;
pf->hpel_filter = x264_hpel_filter_sse2;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
pf->mc_chroma = x264_mc_chroma_sse2;
@@ -336,6 +310,15 @@
if( !(cpu&X264_CPU_SSSE3) )
return;
+ pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_ssse3;
+ pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_ssse3;
+ pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_ssse3;
+ pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_ssse3;
+ pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_ssse3;
+ pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_ssse3;
+ pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3;
+ pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3;
+
pf->hpel_filter = x264_hpel_filter_ssse3;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
pf->mc_chroma = x264_mc_chroma_ssse3;
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/mc.h
^
|
@@ -26,7 +26,4 @@
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf );
-void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
- uint8_t *dst, int i_dst_stride,
- int dx, int dy, int i_width, int i_height );
#endif
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/pixel-32.asm
^
|
@@ -25,21 +25,9 @@
%include "x86util.asm"
SECTION .text
+INIT_MMX
-%macro SBUTTERFLY 5
- mov%1 %5, %3
- punpckl%2 %3, %4
- punpckh%2 %5, %4
-%endmacro
-
-%macro TRANSPOSE4x4W 5 ; abcd-t -> adtc
- SBUTTERFLY q, wd, %1, %2, %5
- SBUTTERFLY q, wd, %3, %4, %2
- SBUTTERFLY q, dq, %1, %3, %4
- SBUTTERFLY q, dq, %5, %2, %3
-%endmacro
-
-%macro LOAD_DIFF_4P 4 ; mmp, mmt, dx, dy
+%macro LOAD_DIFF_4P 4 ; mp, mt, dx, dy
movd %1, [eax+ebx*%4+%3]
movd %2, [ecx+edx*%4+%3]
punpcklbw %1, %2
@@ -48,40 +36,40 @@
%endmacro
%macro LOAD_DIFF_4x8P 1 ; dx
- LOAD_DIFF_4P mm0, mm7, %1, 0
- LOAD_DIFF_4P mm1, mm7, %1, 1
+ LOAD_DIFF_4P m0, m7, %1, 0
+ LOAD_DIFF_4P m1, m7, %1, 1
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
- LOAD_DIFF_4P mm2, mm7, %1, 0
- LOAD_DIFF_4P mm3, mm7, %1, 1
+ LOAD_DIFF_4P m2, m7, %1, 0
+ LOAD_DIFF_4P m3, m7, %1, 1
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
- LOAD_DIFF_4P mm4, mm7, %1, 0
- LOAD_DIFF_4P mm5, mm7, %1, 1
+ LOAD_DIFF_4P m4, m7, %1, 0
+ LOAD_DIFF_4P m5, m7, %1, 1
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
- LOAD_DIFF_4P mm6, mm7, %1, 0
- movq [spill], mm6
- LOAD_DIFF_4P mm7, mm6, %1, 1
- movq mm6, [spill]
+ LOAD_DIFF_4P m6, m7, %1, 0
+ movq [spill], m6
+ LOAD_DIFF_4P m7, m6, %1, 1
+ movq m6, [spill]
%endmacro
%macro SUM4x8_MM 0
- movq [spill], mm6
- movq [spill+8], mm7
- ABS2 mm0, mm1, mm6, mm7
- ABS2 mm2, mm3, mm6, mm7
- paddw mm0, mm2
- paddw mm1, mm3
- movq mm6, [spill]
- movq mm7, [spill+8]
- ABS2 mm4, mm5, mm2, mm3
- ABS2 mm6, mm7, mm2, mm3
- paddw mm4, mm6
- paddw mm5, mm7
- paddw mm0, mm4
- paddw mm1, mm5
- paddw mm0, mm1
+ movq [spill], m6
+ movq [spill+8], m7
+ ABS2 m0, m1, m6, m7
+ ABS2 m2, m3, m6, m7
+ paddw m0, m2
+ paddw m1, m3
+ movq m6, [spill]
+ movq m7, [spill+8]
+ ABS2 m4, m5, m2, m3
+ ABS2 m6, m7, m2, m3
+ paddw m4, m6
+ paddw m5, m7
+ paddw m0, m4
+ paddw m1, m5
+ paddw m0, m1
%endmacro
;-----------------------------------------------------------------------------
@@ -98,67 +86,67 @@
%define spill esp+0x60 ; +16
%define trans esp+0 ; +96
LOAD_DIFF_4x8P 0
- HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
- movq [spill], mm0
- TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
- movq [trans+0x00], mm4
- movq [trans+0x08], mm7
- movq [trans+0x10], mm0
- movq [trans+0x18], mm6
- movq mm0, [spill]
- TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
- movq [trans+0x20], mm0
- movq [trans+0x28], mm3
- movq [trans+0x30], mm4
- movq [trans+0x38], mm2
+ movq [spill], m0
+ TRANSPOSE4x4W 4, 5, 6, 7, 0
+ movq [trans+0x00], m4
+ movq [trans+0x08], m5
+ movq [trans+0x10], m6
+ movq [trans+0x18], m7
+ movq m0, [spill]
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ movq [trans+0x20], m0
+ movq [trans+0x28], m1
+ movq [trans+0x30], m2
+ movq [trans+0x38], m3
mov eax, [args+4]
mov ecx, [args+12]
LOAD_DIFF_4x8P 4
- HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
- movq [spill], mm7
- TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm7
- movq [trans+0x40], mm0
- movq [trans+0x48], mm3
- movq [trans+0x50], mm7
- movq [trans+0x58], mm2
- movq mm7, [spill]
- TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
- movq mm5, [trans+0x00]
- movq mm1, [trans+0x08]
- movq mm2, [trans+0x10]
- movq mm3, [trans+0x18]
+ movq [spill], m7
+ TRANSPOSE4x4W 0, 1, 2, 3, 7
+ movq [trans+0x40], m0
+ movq [trans+0x48], m1
+ movq [trans+0x50], m2
+ movq [trans+0x58], m3
+ movq m7, [spill]
+ TRANSPOSE4x4W 4, 5, 6, 7, 0
+ movq m0, [trans+0x00]
+ movq m1, [trans+0x08]
+ movq m2, [trans+0x10]
+ movq m3, [trans+0x18]
- HADAMARD8_1D mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
SUM4x8_MM
- movq [trans], mm0
+ movq [trans], m0
+
+ movq m0, [trans+0x20]
+ movq m1, [trans+0x28]
+ movq m2, [trans+0x30]
+ movq m3, [trans+0x38]
+ movq m4, [trans+0x40]
+ movq m5, [trans+0x48]
+ movq m6, [trans+0x50]
+ movq m7, [trans+0x58]
- movq mm0, [trans+0x20]
- movq mm1, [trans+0x28]
- movq mm2, [trans+0x30]
- movq mm3, [trans+0x38]
- movq mm4, [trans+0x40]
- movq mm5, [trans+0x48]
- movq mm6, [trans+0x50]
- movq mm7, [trans+0x58]
-
- HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
SUM4x8_MM
- pavgw mm0, [esp]
- pshufw mm1, mm0, 01001110b
- paddw mm0, mm1
- pshufw mm1, mm0, 10110001b
- paddw mm0, mm1
- movd eax, mm0
- and eax, 0xffff
- mov ecx, eax ; preserve rounding for 16x16
- add eax, 1
- shr eax, 1
- add esp, 0x70
- pop ebx
+ pavgw m0, [trans]
+ pshufw m1, m0, 01001110b
+ paddw m0, m1
+ pshufw m1, m0, 10110001b
+ paddw m0, m1
+ movd eax, m0
+ and eax, 0xffff
+ mov ecx, eax ; preserve rounding for 16x16
+ add eax, 1
+ shr eax, 1
+ add esp, 0x70
+ pop ebx
ret
%undef args
%undef spill
@@ -184,25 +172,25 @@
%endmacro
%macro LOAD_4x8P 1 ; dx
- pxor mm7, mm7
- movd mm6, [eax+%1+7*FENC_STRIDE]
- movd mm0, [eax+%1+0*FENC_STRIDE]
- movd mm1, [eax+%1+1*FENC_STRIDE]
- movd mm2, [eax+%1+2*FENC_STRIDE]
- movd mm3, [eax+%1+3*FENC_STRIDE]
- movd mm4, [eax+%1+4*FENC_STRIDE]
- movd mm5, [eax+%1+5*FENC_STRIDE]
- punpcklbw mm6, mm7
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- movq [spill], mm6
- punpcklbw mm2, mm7
- punpcklbw mm3, mm7
- movd mm6, [eax+%1+6*FENC_STRIDE]
- punpcklbw mm4, mm7
- punpcklbw mm5, mm7
- punpcklbw mm6, mm7
- movq mm7, [spill]
+ pxor m7, m7
+ movd m6, [eax+%1+7*FENC_STRIDE]
+ movd m0, [eax+%1+0*FENC_STRIDE]
+ movd m1, [eax+%1+1*FENC_STRIDE]
+ movd m2, [eax+%1+2*FENC_STRIDE]
+ movd m3, [eax+%1+3*FENC_STRIDE]
+ movd m4, [eax+%1+4*FENC_STRIDE]
+ movd m5, [eax+%1+5*FENC_STRIDE]
+ punpcklbw m6, m7
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ movq [spill], m6
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ movd m6, [eax+%1+6*FENC_STRIDE]
+ punpcklbw m4, m7
+ punpcklbw m5, m7
+ punpcklbw m6, m7
+ movq m7, [spill]
%endmacro
;-----------------------------------------------------------------------------
@@ -217,146 +205,146 @@
%define trans esp+0 ; +96
%define sum esp+0 ; +32
LOAD_4x8P 0
- HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
- movq [spill], mm0
- TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
- movq [trans+0x00], mm4
- movq [trans+0x08], mm7
- movq [trans+0x10], mm0
- movq [trans+0x18], mm6
- movq mm0, [spill]
- TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
- movq [trans+0x20], mm0
- movq [trans+0x28], mm3
- movq [trans+0x30], mm4
- movq [trans+0x38], mm2
+ movq [spill], m0
+ TRANSPOSE4x4W 4, 5, 6, 7, 0
+ movq [trans+0x00], m4
+ movq [trans+0x08], m5
+ movq [trans+0x10], m6
+ movq [trans+0x18], m7
+ movq m0, [spill]
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ movq [trans+0x20], m0
+ movq [trans+0x28], m1
+ movq [trans+0x30], m2
+ movq [trans+0x38], m3
LOAD_4x8P 4
- HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
- movq [spill], mm7
- TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm7
- movq [trans+0x40], mm0
- movq [trans+0x48], mm3
- movq [trans+0x50], mm7
- movq [trans+0x58], mm2
- movq mm7, [spill]
- TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
- movq mm5, [trans+0x00]
- movq mm1, [trans+0x08]
- movq mm2, [trans+0x10]
- movq mm3, [trans+0x18]
-
- HADAMARD8_1D mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6
-
- movq [spill+0], mm5
- movq [spill+8], mm7
- ABS2 mm0, mm1, mm5, mm7
- ABS2 mm2, mm3, mm5, mm7
- paddw mm0, mm2
- paddw mm1, mm3
- paddw mm0, mm1
- ABS2 mm4, mm6, mm2, mm3
- movq mm5, [spill+0]
- movq mm7, [spill+8]
- paddw mm0, mm4
- paddw mm0, mm6
- ABS1 mm7, mm1
- paddw mm0, mm7 ; 7x4 sum
- movq mm6, mm5
- movq mm7, [ecx+8] ; left bottom
- psllw mm7, 3
- psubw mm6, mm7
- ABS2 mm5, mm6, mm2, mm3
- paddw mm5, mm0
- paddw mm6, mm0
- movq [sum+0], mm5 ; dc
- movq [sum+8], mm6 ; left
-
- movq mm0, [trans+0x20]
- movq mm1, [trans+0x28]
- movq mm2, [trans+0x30]
- movq mm3, [trans+0x38]
- movq mm4, [trans+0x40]
- movq mm5, [trans+0x48]
- movq mm6, [trans+0x50]
- movq mm7, [trans+0x58]
-
- HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
-
- movd [sum+0x10], mm0
- movd [sum+0x12], mm1
- movd [sum+0x14], mm2
- movd [sum+0x16], mm3
- movd [sum+0x18], mm4
- movd [sum+0x1a], mm5
- movd [sum+0x1c], mm6
- movd [sum+0x1e], mm7
-
- movq [spill], mm0
- movq [spill+8], mm1
- ABS2 mm2, mm3, mm0, mm1
- ABS2 mm4, mm5, mm0, mm1
- paddw mm2, mm3
- paddw mm4, mm5
- paddw mm2, mm4
- movq mm0, [spill]
- movq mm1, [spill+8]
- ABS2 mm6, mm7, mm4, mm5
- ABS1 mm1, mm4
- paddw mm2, mm7
- paddw mm1, mm6
- paddw mm2, mm1 ; 7x4 sum
- movq mm1, mm0
+ movq [spill], m7
+ TRANSPOSE4x4W 0, 1, 2, 3, 7
+ movq [trans+0x40], m0
+ movq [trans+0x48], m1
+ movq [trans+0x50], m2
+ movq [trans+0x58], m3
+ movq m7, [spill]
+ TRANSPOSE4x4W 4, 5, 6, 7, 0
+ movq m0, [trans+0x00]
+ movq m1, [trans+0x08]
+ movq m2, [trans+0x10]
+ movq m3, [trans+0x18]
+
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+
+ movq [spill+0], m0
+ movq [spill+8], m1
+ ABS2 m2, m3, m0, m1
+ ABS2 m4, m5, m0, m1
+ paddw m2, m4
+ paddw m3, m5
+ ABS2 m6, m7, m4, m5
+ movq m0, [spill+0]
+ movq m1, [spill+8]
+ paddw m2, m6
+ paddw m3, m7
+ paddw m2, m3
+ ABS1 m1, m4
+ paddw m2, m1 ; 7x4 sum
+ movq m7, m0
+ movq m1, [ecx+8] ; left bottom
+ psllw m1, 3
+ psubw m7, m1
+ ABS2 m0, m7, m5, m3
+ paddw m0, m2
+ paddw m7, m2
+ movq [sum+0], m0 ; dc
+ movq [sum+8], m7 ; left
+
+ movq m0, [trans+0x20]
+ movq m1, [trans+0x28]
+ movq m2, [trans+0x30]
+ movq m3, [trans+0x38]
+ movq m4, [trans+0x40]
+ movq m5, [trans+0x48]
+ movq m6, [trans+0x50]
+ movq m7, [trans+0x58]
+
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+
+ movd [sum+0x10], m0
+ movd [sum+0x12], m1
+ movd [sum+0x14], m2
+ movd [sum+0x16], m3
+ movd [sum+0x18], m4
+ movd [sum+0x1a], m5
+ movd [sum+0x1c], m6
+ movd [sum+0x1e], m7
+
+ movq [spill], m0
+ movq [spill+8], m1
+ ABS2 m2, m3, m0, m1
+ ABS2 m4, m5, m0, m1
+ paddw m2, m4
+ paddw m3, m5
+ paddw m2, m3
+ movq m0, [spill]
+ movq m1, [spill+8]
+ ABS2 m6, m7, m4, m5
+ ABS1 m1, m3
+ paddw m2, m7
+ paddw m1, m6
+ paddw m2, m1 ; 7x4 sum
+ movq m1, m0
- movq mm7, [ecx+0]
- psllw mm7, 3 ; left top
+ movq m7, [ecx+0]
+ psllw m7, 3 ; left top
- movzx edx, word [ecx+0]
+ movzx edx, word [ecx+0]
add dx, [ecx+16]
- lea edx, [4*edx+32]
- and edx, -64
- movd mm6, edx ; dc
-
- psubw mm1, mm7
- psubw mm0, mm6
- ABS2 mm0, mm1, mm5, mm6
- movq mm3, [sum+0] ; dc
- paddw mm0, mm2
- paddw mm1, mm2
- movq mm2, mm0
- paddw mm0, mm3
- paddw mm1, [sum+8] ; h
- psrlq mm2, 16
- paddw mm2, mm3
-
- movq mm3, [ecx+16] ; top left
- movq mm4, [ecx+24] ; top right
- psllw mm3, 3
- psllw mm4, 3
- psubw mm3, [sum+16]
- psubw mm4, [sum+24]
- ABS2 mm3, mm4, mm5, mm6
- paddw mm2, mm3
- paddw mm2, mm4 ; v
-
- SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
- mov eax, [args+8]
- movd ecx, mm2
- movd edx, mm1
- add ecx, 2
- add edx, 2
- shr ecx, 2
- shr edx, 2
- mov [eax+0], ecx ; i8x8_v satd
- mov [eax+4], edx ; i8x8_h satd
- movd ecx, mm0
- add ecx, 2
- shr ecx, 2
- mov [eax+8], ecx ; i8x8_dc satd
+ lea edx, [4*edx+32]
+ and edx, -64
+ movd m6, edx ; dc
+
+ psubw m1, m7
+ psubw m0, m6
+ ABS2 m0, m1, m5, m6
+ movq m3, [sum+0] ; dc
+ paddw m0, m2
+ paddw m1, m2
+ movq m2, m0
+ paddw m0, m3
+ paddw m1, [sum+8] ; h
+ psrlq m2, 16
+ paddw m2, m3
+
+ movq m3, [ecx+16] ; top left
+ movq m4, [ecx+24] ; top right
+ psllw m3, 3
+ psllw m4, 3
+ psubw m3, [sum+16]
+ psubw m4, [sum+24]
+ ABS2 m3, m4, m5, m6
+ paddw m2, m3
+ paddw m2, m4 ; v
+
+ SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
+ mov eax, [args+8]
+ movd ecx, m2
+ movd edx, m1
+ add ecx, 2
+ add edx, 2
+ shr ecx, 2
+ shr edx, 2
+ mov [eax+0], ecx ; i8x8_v satd
+ mov [eax+4], edx ; i8x8_h satd
+ movd ecx, m0
+ add ecx, 2
+ shr ecx, 2
+ mov [eax+8], ecx ; i8x8_dc satd
- add esp, 0x70
+ add esp, 0x70
ret
%undef args
%undef spill
@@ -370,57 +358,57 @@
; const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssim_4x4x2_core_mmxext
- push ebx
- push edi
- mov ebx, [esp+16]
- mov edx, [esp+24]
- mov edi, 4
- pxor mm0, mm0
+ push ebx
+ push edi
+ mov ebx, [esp+16]
+ mov edx, [esp+24]
+ mov edi, 4
+ pxor m0, m0
.loop:
- mov eax, [esp+12]
- mov ecx, [esp+20]
- add eax, edi
- add ecx, edi
- pxor mm1, mm1
- pxor mm2, mm2
- pxor mm3, mm3
- pxor mm4, mm4
+ mov eax, [esp+12]
+ mov ecx, [esp+20]
+ add eax, edi
+ add ecx, edi
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+ pxor m4, m4
%rep 4
- movd mm5, [eax]
- movd mm6, [ecx]
- punpcklbw mm5, mm0
- punpcklbw mm6, mm0
- paddw mm1, mm5
- paddw mm2, mm6
- movq mm7, mm5
- pmaddwd mm5, mm5
- pmaddwd mm7, mm6
- pmaddwd mm6, mm6
- paddd mm3, mm5
- paddd mm4, mm7
- paddd mm3, mm6
- add eax, ebx
- add ecx, edx
+ movd m5, [eax]
+ movd m6, [ecx]
+ punpcklbw m5, m0
+ punpcklbw m6, m0
+ paddw m1, m5
+ paddw m2, m6
+ movq m7, m5
+ pmaddwd m5, m5
+ pmaddwd m7, m6
+ pmaddwd m6, m6
+ paddd m3, m5
+ paddd m4, m7
+ paddd m3, m6
+ add eax, ebx
+ add ecx, edx
%endrep
- mov eax, [esp+28]
- lea eax, [eax+edi*4]
- pshufw mm5, mm1, 0xE
- pshufw mm6, mm2, 0xE
- paddusw mm1, mm5
- paddusw mm2, mm6
- punpcklwd mm1, mm2
- pshufw mm2, mm1, 0xE
- pshufw mm5, mm3, 0xE
- pshufw mm6, mm4, 0xE
- paddusw mm1, mm2
- paddd mm3, mm5
- paddd mm4, mm6
- punpcklwd mm1, mm0
- punpckldq mm3, mm4
- movq [eax+0], mm1
- movq [eax+8], mm3
- sub edi, 4
- jge .loop
+ mov eax, [esp+28]
+ lea eax, [eax+edi*4]
+ pshufw m5, m1, 0xE
+ pshufw m6, m2, 0xE
+ paddusw m1, m5
+ paddusw m2, m6
+ punpcklwd m1, m2
+ pshufw m2, m1, 0xE
+ pshufw m5, m3, 0xE
+ pshufw m6, m4, 0xE
+ paddusw m1, m2
+ paddd m3, m5
+ paddd m4, m6
+ punpcklwd m1, m0
+ punpckldq m3, m4
+ movq [eax+0], m1
+ movq [eax+8], m3
+ sub edi, 4
+ jge .loop
pop edi
pop ebx
emms
Changed: x264-snapshot-20081001-2245.tar.bz2/common/x86/pixel-a.asm
@@ -31,6 +31,8 @@
ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
mask_ff: times 16 db 0xff
times 16 db 0
+mask_ac4: dw 0,-1,-1,-1, 0,-1,-1,-1
+mask_ac8: dw 0,-1,-1,-1,-1,-1,-1,-1
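+; (a note on the two masks above: they are used by the hadamard_ac routines below
+;  to zero the DC coefficient(s) so that only AC energy is summed)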
SECTION .text
@@ -162,6 +164,112 @@
SSD 8, 4, sse2
+;=============================================================================
+; variance
+;=============================================================================
+
+%macro VAR_START 0
+ pxor m5, m5 ; sum
+ pxor m6, m6 ; sum squared
+ pxor m7, m7 ; zero
+%ifdef ARCH_X86_64
+ %define t3d r3d
+%else
+ %define t3d r2d
+%endif
+%endmacro
+
+%macro VAR_END 1
+%if mmsize == 16
+ movhlps m0, m5
+ paddw m5, m0
+%endif
+ movifnidn r2d, r2m
+ movd r1d, m5
+ movd [r2], m5 ; return sum
+ imul r1d, r1d
+ HADDD m6, m1
+ shr r1d, %1
+ movd eax, m6
+ sub eax, r1d ; sqr - (sum * sum >> shift)
+ RET
+%endmacro
+
+%macro VAR_2ROW 2
+ mov t3d, %2
+.loop:
+ mova m0, [r0]
+ mova m1, m0
+ mova m3, [r0+%1]
+ mova m2, m0
+ punpcklbw m0, m7
+ mova m4, m3
+ punpckhbw m1, m7
+%ifidn %1, r1
+ lea r0, [r0+%1*2]
+%else
+ add r0, r1
+%endif
+ punpckhbw m4, m7
+ psadbw m2, m7
+ paddw m5, m2
+ mova m2, m3
+ punpcklbw m3, m7
+ dec t3d
+ psadbw m2, m7
+ pmaddwd m0, m0
+ paddw m5, m2
+ pmaddwd m1, m1
+ paddd m6, m0
+ pmaddwd m3, m3
+ paddd m6, m1
+ pmaddwd m4, m4
+ paddd m6, m3
+ paddd m6, m4
+ jg .loop
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int x264_pixel_var_wxh_mmxext( uint8_t *, int, int * )
+;-----------------------------------------------------------------------------
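+; rough C equivalent of the variance routines below (a sketch; parameter and
+; loop-bound names are illustrative, shift matches the VAR_END argument):
+; int x264_pixel_var_wxh( uint8_t *pix, int i_stride, uint32_t *sum_out )
+; {
+;     uint32_t sum = 0, sqr = 0;
+;     int x, y;
+;     for( y = 0; y < h; y++, pix += i_stride )
+;         for( x = 0; x < w; x++ )
+;         {
+;             sum += pix[x];
+;             sqr += pix[x] * pix[x];
+;         }
+;     *sum_out = sum;
+;     return sqr - (sum * sum >> shift); /* shift = 8 for 16x16, 6 for 8x8 */
+; }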
+INIT_MMX
+cglobal x264_pixel_var_16x16_mmxext, 2,3
+ VAR_START
+ VAR_2ROW 8, 16
+ VAR_END 8
+
+cglobal x264_pixel_var_8x8_mmxext, 2,3
+ VAR_START
+ VAR_2ROW r1, 4
+ VAR_END 6
+
+INIT_XMM
+cglobal x264_pixel_var_16x16_sse2, 2,3
+ VAR_START
+ VAR_2ROW r1, 8
+ VAR_END 8
+
+cglobal x264_pixel_var_8x8_sse2, 2,3
+ VAR_START
+ mov t3d, 4
+.loop:
+ movh m0, [r0]
+ movhps m0, [r0+r1]
+ lea r0, [r0+r1*2]
+ mova m1, m0
+ punpcklbw m0, m7
+ mova m2, m1
+ punpckhbw m1, m7
+ dec t3d
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ psadbw m2, m7
+ paddw m5, m2
+ paddd m6, m0
+ paddd m6, m1
+ jnz .loop
+ VAR_END 6
+
;=============================================================================
; SATD
@@ -173,16 +281,18 @@
; whereas phaddw-based transform doesn't care what order the coefs end up in.
%macro PHSUMSUB 3
- movdqa %3, %1
- phaddw %1, %2
- phsubw %3, %2
+ movdqa m%3, m%1
+ phaddw m%1, m%2
+ phsubw m%3, m%2
+ SWAP %2, %3
%endmacro
-%macro HADAMARD4_ROW_PHADD 5 ; abcd-t -> adtc
- PHSUMSUB %1, %2, %5
- PHSUMSUB %3, %4, %2
- PHSUMSUB %1, %3, %4
- PHSUMSUB %5, %2, %3
+%macro HADAMARD4_ROW_PHADD 5
+ PHSUMSUB %1, %2, %5
+ PHSUMSUB %3, %4, %5
+ PHSUMSUB %1, %3, %5
+ PHSUMSUB %2, %4, %5
+ SWAP %3, %4
%endmacro
%macro HADAMARD4_1D 4
@@ -190,102 +300,29 @@
SUMSUB_BADC %1, %3, %2, %4
%endmacro
-%macro SBUTTERFLY 5
- mov%1 %5, %3
- punpckl%2 %3, %4
- punpckh%2 %5, %4
-%endmacro
-
-%macro SBUTTERFLY2 5 ; not really needed, but allows transpose4x4x2 to not shuffle registers
- mov%1 %5, %3
- punpckh%2 %3, %4
- punpckl%2 %5, %4
-%endmacro
-
-%macro TRANSPOSE4x4W 5 ; abcd-t -> adtc
- SBUTTERFLY q, wd, %1, %2, %5
- SBUTTERFLY q, wd, %3, %4, %2
- SBUTTERFLY q, dq, %1, %3, %4
- SBUTTERFLY q, dq, %5, %2, %3
-%endmacro
-
-%macro TRANSPOSE4x4D 5 ; abcd-t -> adtc
- SBUTTERFLY dqa, dq, %1, %2, %5
- SBUTTERFLY dqa, dq, %3, %4, %2
- SBUTTERFLY dqa, qdq, %1, %3, %4
- SBUTTERFLY dqa, qdq, %5, %2, %3
-%endmacro
-
-%macro TRANSPOSE2x4x4W 5 ; abcd-t -> abcd
- SBUTTERFLY dqa, wd, %1, %2, %5
- SBUTTERFLY dqa, wd, %3, %4, %2
- SBUTTERFLY dqa, dq, %1, %3, %4
- SBUTTERFLY2 dqa, dq, %5, %2, %3
- SBUTTERFLY dqa, qdq, %1, %3, %2
- SBUTTERFLY2 dqa, qdq, %4, %5, %3
-%endmacro
-
-%ifdef ARCH_X86_64
-%macro TRANSPOSE8x8W 9 ; abcdefgh-t -> afhdtecb
- SBUTTERFLY dqa, wd, %1, %2, %9
- SBUTTERFLY dqa, wd, %3, %4, %2
- SBUTTERFLY dqa, wd, %5, %6, %4
- SBUTTERFLY dqa, wd, %7, %8, %6
- SBUTTERFLY dqa, dq, %1, %3, %8
- SBUTTERFLY dqa, dq, %9, %2, %3
- SBUTTERFLY dqa, dq, %5, %7, %2
- SBUTTERFLY dqa, dq, %4, %6, %7
- SBUTTERFLY dqa, qdq, %1, %5, %6
- SBUTTERFLY dqa, qdq, %9, %4, %5
- SBUTTERFLY dqa, qdq, %8, %2, %4
- SBUTTERFLY dqa, qdq, %3, %7, %2
-%endmacro
-%else
-%macro TRANSPOSE8x8W 9 ; abcdefgh -> afhdgecb
- movdqa [%9], %8
- SBUTTERFLY dqa, wd, %1, %2, %8
- movdqa [%9+16], %8
- movdqa %8, [%9]
- SBUTTERFLY dqa, wd, %3, %4, %2
- SBUTTERFLY dqa, wd, %5, %6, %4
- SBUTTERFLY dqa, wd, %7, %8, %6
- SBUTTERFLY dqa, dq, %1, %3, %8
- movdqa [%9], %8
- movdqa %8, [16+%9]
- SBUTTERFLY dqa, dq, %8, %2, %3
- SBUTTERFLY dqa, dq, %5, %7, %2
- SBUTTERFLY dqa, dq, %4, %6, %7
- SBUTTERFLY dqa, qdq, %1, %5, %6
- SBUTTERFLY dqa, qdq, %8, %4, %5
- movdqa [%9+16], %8
- movdqa %8, [%9]
- SBUTTERFLY dqa, qdq, %8, %2, %4
- SBUTTERFLY dqa, qdq, %3, %7, %2
- movdqa %7, [%9+16]
-%endmacro
-%endif
-
%macro HADAMARD4x4_SUM 1 ; %1 = dest (row sum of one block)
- HADAMARD4_1D mm4, mm5, mm6, mm7
- TRANSPOSE4x4W mm4, mm5, mm6, mm7, %1
- HADAMARD4_1D mm4, mm7, %1, mm6
- ABS2 mm4, mm7, mm3, mm5
- ABS2 %1, mm6, mm3, mm5
- paddw %1, mm4
- paddw mm6, mm7
- pavgw %1, mm6
+ %xdefine %%n n%1
+ HADAMARD4_1D m4, m5, m6, m7
+ TRANSPOSE4x4W 4, 5, 6, 7, %%n
+ HADAMARD4_1D m4, m5, m6, m7
+ ABS2 m4, m5, m3, m %+ %%n
+ ABS2 m6, m7, m3, m %+ %%n
+ paddw m6, m4
+ paddw m7, m5
+ pavgw m6, m7
+ SWAP %%n, 6
%endmacro
; in: r4=3*stride1, r5=3*stride2
; in: %2 = horizontal offset
; in: %3 = whether we need to increment pix1 and pix2
-; clobber: mm3..mm7
+; clobber: m3..m7
; out: %1 = satd
%macro SATD_4x4_MMX 3
- LOAD_DIFF mm4, mm3, none, [r0+%2], [r2+%2]
- LOAD_DIFF mm5, mm3, none, [r0+r1+%2], [r2+r3+%2]
- LOAD_DIFF mm6, mm3, none, [r0+2*r1+%2], [r2+2*r3+%2]
- LOAD_DIFF mm7, mm3, none, [r0+r4+%2], [r2+r5+%2]
+ LOAD_DIFF m4, m3, none, [r0+%2], [r2+%2]
+ LOAD_DIFF m5, m3, none, [r0+r1+%2], [r2+r3+%2]
+ LOAD_DIFF m6, m3, none, [r0+2*r1+%2], [r2+2*r3+%2]
+ LOAD_DIFF m7, m3, none, [r0+r4+%2], [r2+r5+%2]
%if %3
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
@@ -294,66 +331,66 @@
%endmacro
%macro SATD_8x4_START 1
- SATD_4x4_MMX mm0, 0, 0
- SATD_4x4_MMX mm1, 4, %1
+ SATD_4x4_MMX m0, 0, 0
+ SATD_4x4_MMX m1, 4, %1
%endmacro
%macro SATD_8x4_INC 1
- SATD_4x4_MMX mm2, 0, 0
- paddw mm0, mm1
- SATD_4x4_MMX mm1, 4, %1
- paddw mm0, mm2
+ SATD_4x4_MMX m2, 0, 0
+ paddw m0, m1
+ SATD_4x4_MMX m1, 4, %1
+ paddw m0, m2
%endmacro
%macro SATD_16x4_START 1
- SATD_4x4_MMX mm0, 0, 0
- SATD_4x4_MMX mm1, 4, 0
- SATD_4x4_MMX mm2, 8, 0
- paddw mm0, mm1
- SATD_4x4_MMX mm1, 12, %1
- paddw mm0, mm2
+ SATD_4x4_MMX m0, 0, 0
+ SATD_4x4_MMX m1, 4, 0
+ SATD_4x4_MMX m2, 8, 0
+ paddw m0, m1
+ SATD_4x4_MMX m1, 12, %1
+ paddw m0, m2
%endmacro
%macro SATD_16x4_INC 1
- SATD_4x4_MMX mm2, 0, 0
- paddw mm0, mm1
- SATD_4x4_MMX mm1, 4, 0
- paddw mm0, mm2
- SATD_4x4_MMX mm2, 8, 0
- paddw mm0, mm1
- SATD_4x4_MMX mm1, 12, %1
- paddw mm0, mm2
+ SATD_4x4_MMX m2, 0, 0
+ paddw m0, m1
+ SATD_4x4_MMX m1, 4, 0
+ paddw m0, m2
+ SATD_4x4_MMX m2, 8, 0
+ paddw m0, m1
+ SATD_4x4_MMX m1, 12, %1
+ paddw m0, m2
%endmacro
%macro SATD_8x4_SSE2 1
- LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5
%if %1
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%endif
- HADAMARD4_1D xmm0, xmm1, xmm2, xmm3
- TRANSPOSE2x4x4W xmm0, xmm1, xmm2, xmm3, xmm4
- HADAMARD4_1D xmm0, xmm1, xmm2, xmm3
- ABS4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- paddusw xmm0, xmm1
- paddusw xmm2, xmm3
- paddusw xmm6, xmm0
- paddusw xmm6, xmm2
+ HADAMARD4_1D m0, m1, m2, m3
+ TRANSPOSE2x4x4W 0, 1, 2, 3, 4
+ HADAMARD4_1D m0, m1, m2, m3
+ ABS4 m0, m1, m2, m3, m4, m5
+ paddusw m0, m1
+ paddusw m2, m3
+ paddusw m6, m0
+ paddusw m6, m2
%endmacro
%macro SATD_8x4_PHADD 1
- LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5
%if %1
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%endif
- HADAMARD4_1D xmm0, xmm1, xmm2, xmm3
- HADAMARD4_ROW_PHADD xmm0, xmm1, xmm2, xmm3, xmm4
- ABS4 xmm0, xmm3, xmm4, xmm2, xmm1, xmm5
- paddusw xmm0, xmm3
- paddusw xmm2, xmm4
- paddusw xmm6, xmm0
- paddusw xmm6, xmm2
+ HADAMARD4_1D m0, m1, m2, m3
+ HADAMARD4_ROW_PHADD 0, 1, 2, 3, 4
+ ABS4 m0, m1, m2, m3, m4, m5
+ paddusw m0, m1
+ paddusw m2, m3
+ paddusw m6, m0
+ paddusw m6, m2
%endmacro
%macro SATD_START_MMX 0
@@ -362,12 +399,12 @@
%endmacro
%macro SATD_END_MMX 0
- pshufw mm1, mm0, 01001110b
- paddw mm0, mm1
- pshufw mm1, mm0, 10110001b
- paddw mm0, mm1
- movd eax, mm0
- and eax, 0xffff
+ pshufw m1, m0, 01001110b
+ paddw m0, m1
+ pshufw m1, m0, 10110001b
+ paddw m0, m1
+ movd eax, m0
+ and eax, 0xffff
RET
%endmacro
@@ -377,27 +414,28 @@
;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
+INIT_MMX
cglobal x264_pixel_satd_16x16_mmxext, 4,6
SATD_START_MMX
SATD_16x4_START 1
SATD_16x4_INC 1
SATD_16x4_INC 1
SATD_16x4_INC 0
- paddw mm0, mm1
- pxor mm3, mm3
- pshufw mm1, mm0, 01001110b
- paddw mm0, mm1
- punpcklwd mm0, mm3
- pshufw mm1, mm0, 01001110b
- paddd mm0, mm1
- movd eax, mm0
+ paddw m0, m1
+ pxor m3, m3
+ pshufw m1, m0, 01001110b
+ paddw m0, m1
+ punpcklwd m0, m3
+ pshufw m1, m0, 01001110b
+ paddd m0, m1
+ movd eax, m0
RET
cglobal x264_pixel_satd_16x8_mmxext, 4,6
SATD_START_MMX
SATD_16x4_START 1
SATD_16x4_INC 0
- paddw mm0, mm1
+ paddw m0, m1
SATD_END_MMX
cglobal x264_pixel_satd_8x16_mmxext, 4,6
@@ -406,56 +444,56 @@
SATD_8x4_INC 1
SATD_8x4_INC 1
SATD_8x4_INC 0
- paddw mm0, mm1
+ paddw m0, m1
SATD_END_MMX
cglobal x264_pixel_satd_8x8_mmxext, 4,6
SATD_START_MMX
SATD_8x4_START 1
SATD_8x4_INC 0
- paddw mm0, mm1
+ paddw m0, m1
SATD_END_MMX
cglobal x264_pixel_satd_8x4_mmxext, 4,6
SATD_START_MMX
SATD_8x4_START 0
- paddw mm0, mm1
+ paddw m0, m1
SATD_END_MMX
%macro SATD_W4 1
+INIT_MMX
cglobal x264_pixel_satd_4x8_%1, 4,6
SATD_START_MMX
- SATD_4x4_MMX mm0, 0, 1
- SATD_4x4_MMX mm1, 0, 0
- paddw mm0, mm1
+ SATD_4x4_MMX m0, 0, 1
+ SATD_4x4_MMX m1, 0, 0
+ paddw m0, m1
SATD_END_MMX
cglobal x264_pixel_satd_4x4_%1, 4,6
SATD_START_MMX
- SATD_4x4_MMX mm0, 0, 0
+ SATD_4x4_MMX m0, 0, 0
SATD_END_MMX
%endmacro
SATD_W4 mmxext
%macro SATD_START_SSE2 0
- pxor xmm6, xmm6
- lea r4, [3*r1]
- lea r5, [3*r3]
+ pxor m6, m6
+ lea r4, [3*r1]
+ lea r5, [3*r3]
%endmacro
%macro SATD_END_SSE2 0
- picgetgot ebx
- psrlw xmm6, 1
- HADDW xmm6, xmm7
- movd eax, xmm6
+ psrlw m6, 1
+ HADDW m6, m7
+ movd eax, m6
RET
%endmacro
%macro BACKUP_POINTERS 0
%ifdef ARCH_X86_64
- mov r10, r0
- mov r11, r2
+ mov r10, r0
+ mov r11, r2
%endif
%endmacro
@@ -475,6 +513,7 @@
; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SATDS_SSE2 1
+INIT_XMM
cglobal x264_pixel_satd_16x16_%1, 4,6
SATD_START_SSE2
BACKUP_POINTERS
@@ -526,26 +565,26 @@
lea r4, [3*r1]
lea r5, [3*r3]
.skip_lea:
- LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm8, xmm9
+ LOAD_DIFF_8x4P m0, m1, m2, m3, m8, m9
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
- LOAD_DIFF_8x4P xmm4, xmm5, xmm6, xmm7, xmm8, xmm9
+ LOAD_DIFF_8x4P m4, m5, m6, m7, m8, m9
- HADAMARD8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
- TRANSPOSE8x8W xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
- HADAMARD8_1D xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1
-
- ABS4 xmm0, xmm1, xmm2, xmm3, xmm6, xmm9
- ABS4 xmm4, xmm5, xmm7, xmm8, xmm6, xmm9
- paddusw xmm0, xmm1
- paddusw xmm2, xmm3
- paddusw xmm4, xmm5
- paddusw xmm7, xmm8
- paddusw xmm0, xmm2
- paddusw xmm4, xmm7
- pavgw xmm0, xmm4
- HADDW xmm0, xmm1
- movd eax, xmm0
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+
+ ABS4 m0, m1, m2, m3, m8, m9
+ ABS4 m4, m5, m6, m7, m8, m9
+ paddusw m0, m1
+ paddusw m2, m3
+ paddusw m4, m5
+ paddusw m6, m7
+ paddusw m0, m2
+ paddusw m4, m6
+ pavgw m0, m4
+ HADDW m0, m1
+ movd eax, m0
add r10d, eax ; preserve rounding for 16x16
add eax, 1
shr eax, 1
@@ -576,39 +615,38 @@
sub esp, 32
lea r4, [3*r1]
lea r5, [3*r3]
- LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm6, xmm7
- movdqa [esp], xmm2
+ LOAD_DIFF_8x4P m0, m1, m2, m3, m6, m7
+ movdqa [esp], m2
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
- LOAD_DIFF_8x4P xmm4, xmm5, xmm6, xmm7, xmm2, xmm2
- movdqa xmm2, [esp]
+ LOAD_DIFF_8x4P m4, m5, m6, m7, m2, m2
+ movdqa m2, [esp]
- HADAMARD8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
- TRANSPOSE8x8W xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, esp
- HADAMARD8_1D xmm0, xmm5, xmm7, xmm3, xmm6, xmm4, xmm2, xmm1
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [esp], [esp+16]
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
%ifidn %1, sse2
- movdqa [esp], xmm6
- movdqa [esp+16], xmm7
+ movdqa [esp], m4
+ movdqa [esp+16], m2
%endif
- ABS2 xmm2, xmm3, xmm6, xmm7
- ABS2 xmm0, xmm1, xmm6, xmm7
- paddusw xmm0, xmm2
- paddusw xmm1, xmm3
+ ABS2 m6, m3, m4, m2
+ ABS2 m0, m7, m4, m2
+ paddusw m0, m6
+ paddusw m7, m3
%ifidn %1, sse2
- movdqa xmm6, [esp]
- movdqa xmm7, [esp+16]
+ movdqa m4, [esp]
+ movdqa m2, [esp+16]
%endif
- ABS2 xmm4, xmm5, xmm2, xmm3
- ABS2 xmm6, xmm7, xmm2, xmm3
- paddusw xmm4, xmm5
- paddusw xmm6, xmm7
- paddusw xmm0, xmm1
- paddusw xmm4, xmm6
- pavgw xmm0, xmm4
- picgetgot ebx
- HADDW xmm0, xmm1
- movd eax, xmm0
+ ABS2 m5, m1, m6, m3
+ ABS2 m4, m2, m6, m3
+ paddusw m5, m1
+ paddusw m4, m2
+ paddusw m0, m7
+ paddusw m5, m4
+ pavgw m0, m5
+ HADDW m0, m7
+ movd eax, m0
mov ecx, eax ; preserve rounding for 16x16
add eax, 1
shr eax, 1
@@ -658,31 +696,32 @@
%macro INTRA_SA8D_SSE2 1
%ifdef ARCH_X86_64
+INIT_XMM
;-----------------------------------------------------------------------------
; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_sa8d_x3_8x8_core_%1
; 8x8 hadamard
- pxor xmm4, xmm4
- movq xmm0, [r0+0*FENC_STRIDE]
- movq xmm7, [r0+1*FENC_STRIDE]
- movq xmm6, [r0+2*FENC_STRIDE]
- movq xmm3, [r0+3*FENC_STRIDE]
- movq xmm5, [r0+4*FENC_STRIDE]
- movq xmm1, [r0+5*FENC_STRIDE]
- movq xmm8, [r0+6*FENC_STRIDE]
- movq xmm2, [r0+7*FENC_STRIDE]
- punpcklbw xmm0, xmm4
- punpcklbw xmm7, xmm4
- punpcklbw xmm6, xmm4
- punpcklbw xmm3, xmm4
- punpcklbw xmm5, xmm4
- punpcklbw xmm1, xmm4
- punpcklbw xmm8, xmm4
- punpcklbw xmm2, xmm4
- HADAMARD8_1D xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2
- TRANSPOSE8x8W xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2, xmm4
- HADAMARD8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+ pxor m8, m8
+ movq m0, [r0+0*FENC_STRIDE]
+ movq m1, [r0+1*FENC_STRIDE]
+ movq m2, [r0+2*FENC_STRIDE]
+ movq m3, [r0+3*FENC_STRIDE]
+ movq m4, [r0+4*FENC_STRIDE]
+ movq m5, [r0+5*FENC_STRIDE]
+ movq m6, [r0+6*FENC_STRIDE]
+ movq m7, [r0+7*FENC_STRIDE]
+ punpcklbw m0, m8
+ punpcklbw m1, m8
+ punpcklbw m2, m8
+ punpcklbw m3, m8
+ punpcklbw m4, m8
+ punpcklbw m5, m8
+ punpcklbw m6, m8
+ punpcklbw m7, m8
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
; dc
movzx edi, word [r1+0]
@@ -691,95 +730,97 @@
and edi, -16
shl edi, 2
- pxor xmm15, xmm15
- movdqa xmm8, xmm2
- movdqa xmm9, xmm3
- movdqa xmm10, xmm4
- movdqa xmm11, xmm5
- ABS4 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13
- paddusw xmm8, xmm10
- paddusw xmm9, xmm11
+ pxor m15, m15
+ movdqa m8, m2
+ movdqa m9, m3
+ movdqa m10, m4
+ movdqa m11, m5
+ ABS4 m8, m9, m10, m11, m12, m13
+ paddusw m8, m10
+ paddusw m9, m11
%ifidn %1, ssse3
- pabsw xmm10, xmm6
- pabsw xmm11, xmm7
- pabsw xmm15, xmm1
+ pabsw m10, m6
+ pabsw m11, m7
+ pabsw m15, m1
%else
- movdqa xmm10, xmm6
- movdqa xmm11, xmm7
- movdqa xmm15, xmm1
- ABS2 xmm10, xmm11, xmm13, xmm14
- ABS1 xmm15, xmm13
-%endif
- paddusw xmm10, xmm11
- paddusw xmm8, xmm9
- paddusw xmm15, xmm10
- paddusw xmm15, xmm8
- movdqa xmm14, xmm15 ; 7x8 sum
-
- movdqa xmm8, [r1+0] ; left edge
- movd xmm9, edi
- psllw xmm8, 3
- psubw xmm8, xmm0
- psubw xmm9, xmm0
- ABS1 xmm8, xmm10
- ABS1 xmm9, xmm11 ; 1x8 sum
- paddusw xmm14, xmm8
- paddusw xmm15, xmm9
- punpcklwd xmm0, xmm1
- punpcklwd xmm2, xmm3
- punpcklwd xmm4, xmm5
- punpcklwd xmm6, xmm7
- punpckldq xmm0, xmm2
- punpckldq xmm4, xmm6
- punpcklqdq xmm0, xmm4 ; transpose
- movdqa xmm1, [r1+16] ; top edge
- movdqa xmm2, xmm15
- psllw xmm1, 3
- psrldq xmm2, 2 ; 8x7 sum
- psubw xmm0, xmm1 ; 8x1 sum
- ABS1 xmm0, xmm1
- paddusw xmm2, xmm0
+ movdqa m10, m6
+ movdqa m11, m7
+ movdqa m15, m1
+ ABS2 m10, m11, m13, m14
+ ABS1 m15, m13
+%endif
+ paddusw m10, m11
+ paddusw m8, m9
+ paddusw m15, m10
+ paddusw m15, m8
+ movdqa m14, m15 ; 7x8 sum
+
+ movdqa m8, [r1+0] ; left edge
+ movd m9, edi
+ psllw m8, 3
+ psubw m8, m0
+ psubw m9, m0
+ ABS1 m8, m10
+ ABS1 m9, m11 ; 1x8 sum
+ paddusw m14, m8
+ paddusw m15, m9
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ punpcklwd m4, m5
+ punpcklwd m6, m7
+ punpckldq m0, m2
+ punpckldq m4, m6
+ punpcklqdq m0, m4 ; transpose
+ movdqa m1, [r1+16] ; top edge
+ movdqa m2, m15
+ psllw m1, 3
+ psrldq m2, 2 ; 8x7 sum
+ psubw m0, m1 ; 8x1 sum
+ ABS1 m0, m1
+ paddusw m2, m0
; 3x HADDW
- movdqa xmm7, [pw_1 GLOBAL]
- pmaddwd xmm2, xmm7
- pmaddwd xmm14, xmm7
- pmaddwd xmm15, xmm7
- movdqa xmm3, xmm2
- punpckldq xmm2, xmm14
- punpckhdq xmm3, xmm14
- pshufd xmm5, xmm15, 0xf5
- paddd xmm2, xmm3
- paddd xmm5, xmm15
- movdqa xmm3, xmm2
- punpcklqdq xmm2, xmm5
- punpckhqdq xmm3, xmm5
- pavgw xmm3, xmm2
- pxor xmm0, xmm0
- pavgw xmm3, xmm0
- movq [r2], xmm3 ; i8x8_v, i8x8_h
- psrldq xmm3, 8
- movd [r2+8], xmm3 ; i8x8_dc
+ movdqa m7, [pw_1 GLOBAL]
+ pmaddwd m2, m7
+ pmaddwd m14, m7
+ pmaddwd m15, m7
+ movdqa m3, m2
+ punpckldq m2, m14
+ punpckhdq m3, m14
+ pshufd m5, m15, 0xf5
+ paddd m2, m3
+ paddd m5, m15
+ movdqa m3, m2
+ punpcklqdq m2, m5
+ punpckhqdq m3, m5
+ pavgw m3, m2
+ pxor m0, m0
+ pavgw m3, m0
+ movq [r2], m3 ; i8x8_v, i8x8_h
+ psrldq m3, 8
+ movd [r2+8], m3 ; i8x8_dc
ret
%endif ; ARCH_X86_64
-%endmacro ; INTRA_SATDS
+%endmacro ; INTRA_SA8D_SSE2
; in: r0 = fenc
-; out: mm0..mm3 = hadamard coefs
+; out: m0..m3 = hadamard coefs
+INIT_MMX
ALIGN 16
load_hadamard:
- pxor mm7, mm7
- movd mm0, [r0+0*FENC_STRIDE]
- movd mm4, [r0+1*FENC_STRIDE]
- movd mm3, [r0+2*FENC_STRIDE]
- movd mm1, [r0+3*FENC_STRIDE]
- punpcklbw mm0, mm7
- punpcklbw mm4, mm7
- punpcklbw mm3, mm7
- punpcklbw mm1, mm7
- HADAMARD4_1D mm0, mm4, mm3, mm1
- TRANSPOSE4x4W mm0, mm4, mm3, mm1, mm2
- HADAMARD4_1D mm0, mm1, mm2, mm3
+ pxor m7, m7
+ movd m0, [r0+0*FENC_STRIDE]
+ movd m1, [r0+1*FENC_STRIDE]
+ movd m2, [r0+2*FENC_STRIDE]
+ movd m3, [r0+3*FENC_STRIDE]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ HADAMARD4_1D m0, m1, m2, m3
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ HADAMARD4_1D m0, m1, m2, m3
+ SAVE_MM_PERMUTATION load_hadamard
ret
%macro SCALAR_SUMSUB 4
@@ -848,53 +889,54 @@
mov qword [sums+8], 0
mov qword [sums+16], 0
%else
- pxor mm7, mm7
- movq [sums+0], mm7
- movq [sums+8], mm7
- movq [sums+16], mm7
+ pxor m7, m7
+ movq [sums+0], m7
+ movq [sums+8], m7
+ movq [sums+16], m7
%endif
%endmacro
-; in: mm1..mm3
-; out: mm7
-; clobber: mm4..mm6
+; in: m1..m3
+; out: m7
+; clobber: m4..m6
%macro SUM3x4 1
%ifidn %1, ssse3
- pabsw mm4, mm1
- pabsw mm5, mm2
- pabsw mm7, mm3
- paddw mm4, mm5
+ pabsw m4, m1
+ pabsw m5, m2
+ pabsw m7, m3
+ paddw m4, m5
%else
- movq mm4, mm1
- movq mm5, mm2
- ABS2 mm4, mm5, mm6, mm7
- movq mm7, mm3
- paddw mm4, mm5
- ABS1 mm7, mm6
-%endif
- paddw mm7, mm4
+ movq m4, m1
+ movq m5, m2
+ ABS2 m4, m5, m6, m7
+ movq m7, m3
+ paddw m4, m5
+ ABS1 m7, m6
+%endif
+ paddw m7, m4
%endmacro
-; in: mm0..mm3 (4x4), mm7 (3x4)
-; out: mm0 v, mm4 h, mm5 dc
-; clobber: mm6
+; in: m0..m3 (4x4), m7 (3x4)
+; out: m0 v, m4 h, m5 dc
+; clobber: m6
%macro SUM4x3 3 ; dc, left, top
- movq mm4, %2
- movd mm5, %1
- psllw mm4, 2
- psubw mm4, mm0
- psubw mm5, mm0
- punpcklwd mm0, mm1
- punpcklwd mm2, mm3
- punpckldq mm0, mm2 ; transpose
- movq mm1, %3
- psllw mm1, 2
- psubw mm0, mm1
- ABS2 mm4, mm5, mm2, mm3 ; 1x4 sum
- ABS1 mm0, mm1 ; 4x1 sum
+ movq m4, %2
+ movd m5, %1
+ psllw m4, 2
+ psubw m4, m0
+ psubw m5, m0
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ punpckldq m0, m2 ; transpose
+ movq m1, %3
+ psllw m1, 2
+ psubw m0, m1
+ ABS2 m4, m5, m2, m3 ; 1x4 sum
+ ABS1 m0, m1 ; 4x1 sum
%endmacro
%macro INTRA_SATDS_MMX 1
+INIT_MMX
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
@@ -924,19 +966,19 @@
SUM3x4 %1
SUM4x3 t0d, [left_1d], [top_1d]
- paddw mm4, mm7
- paddw mm5, mm7
- movq mm1, mm5
- psrlq mm1, 16 ; 4x3 sum
- paddw mm0, mm1
+ paddw m4, m7
+ paddw m5, m7
+ movq m1, m5
+ psrlq m1, 16 ; 4x3 sum
+ paddw m0, m1
- SUM_MM_X3 mm0, mm4, mm5, mm1, mm2, mm3, mm6, pavgw
+ SUM_MM_X3 m0, m4, m5, m1, m2, m3, m6, pavgw
%ifndef ARCH_X86_64
mov r2, r2m
%endif
- movd [r2+0], mm0 ; i4x4_v satd
- movd [r2+4], mm4 ; i4x4_h satd
- movd [r2+8], mm5 ; i4x4_dc satd
+ movd [r2+0], m0 ; i4x4_v satd
+ movd [r2+4], m4 ; i4x4_h satd
+ movd [r2+8], m5 ; i4x4_dc satd
%ifndef ARCH_X86_64
ADD esp, 16
%endif
@@ -966,10 +1008,10 @@
%assign stack_pad 88 + ((stack_offset+88+4)&15)
%endif
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
- SUB rsp, stack_pad
-%define sums rsp+64 ; size 24
-%define top_1d rsp+32 ; size 32
-%define left_1d rsp ; size 32
+ SUB rsp, stack_pad
+%define sums rsp+64 ; size 24
+%define top_1d rsp+32 ; size 32
+%define left_1d rsp ; size 32
movifnidn r1d, r1m
CLEAR_SUMS
@@ -997,14 +1039,14 @@
SUM3x4 %1
SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4]
- pavgw mm4, mm7
- pavgw mm5, mm7
- paddw mm0, [sums+0] ; i16x16_v satd
- paddw mm4, [sums+8] ; i16x16_h satd
- paddw mm5, [sums+16] ; i16x16_dc satd
- movq [sums+0], mm0
- movq [sums+8], mm4
- movq [sums+16], mm5
+ pavgw m4, m7
+ pavgw m5, m7
+ paddw m0, [sums+0] ; i16x16_v satd
+ paddw m4, [sums+8] ; i16x16_h satd
+ paddw m5, [sums+16] ; i16x16_dc satd
+ movq [sums+0], m0
+ movq [sums+8], m4
+ movq [sums+16], m5
add r0, 4
inc r4d
@@ -1017,19 +1059,19 @@
; horizontal sum
movifnidn r2d, r2m
- movq mm2, [sums+16]
- movq mm1, [sums+8]
- movq mm0, [sums+0]
- movq mm7, mm2
- SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
- psrld mm0, 1
- pslld mm7, 16
- psrld mm7, 16
- paddd mm0, mm2
- psubd mm0, mm7
- movd [r2+8], mm2 ; i16x16_dc satd
- movd [r2+4], mm1 ; i16x16_h satd
- movd [r2+0], mm0 ; i16x16_v satd
+ movq m2, [sums+16]
+ movq m1, [sums+8]
+ movq m0, [sums+0]
+ movq m7, m2
+ SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
+ psrld m0, 1
+ pslld m7, 16
+ psrld m7, 16
+ paddd m0, m2
+ psubd m0, m7
+ movd [r2+8], m2 ; i16x16_dc satd
+ movd [r2+4], m1 ; i16x16_h satd
+ movd [r2+0], m0 ; i16x16_v satd
ADD rsp, stack_pad
RET
@@ -1086,14 +1128,14 @@
SUM3x4 %1
SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4]
- pavgw mm4, mm7
- pavgw mm5, mm7
- paddw mm0, [sums+16] ; i4x4_v satd
- paddw mm4, [sums+8] ; i4x4_h satd
- paddw mm5, [sums+0] ; i4x4_dc satd
- movq [sums+16], mm0
- movq [sums+8], mm4
- movq [sums+0], mm5
+ pavgw m4, m7
+ pavgw m5, m7
+ paddw m0, [sums+16] ; i4x4_v satd
+ paddw m4, [sums+8] ; i4x4_h satd
+ paddw m5, [sums+0] ; i4x4_dc satd
+ movq [sums+16], m0
+ movq [sums+8], m4
+ movq [sums+0], m5
add r0, 4
inc r4d
@@ -1106,21 +1148,336 @@
jl .loop_y
; horizontal sum
- movq mm0, [sums+0]
- movq mm1, [sums+8]
- movq mm2, [sums+16]
- movq mm7, mm0
- psrlq mm7, 15
- paddw mm2, mm7
- SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
- psrld mm2, 1
- movd [r2+0], mm0 ; i8x8c_dc satd
- movd [r2+4], mm1 ; i8x8c_h satd
- movd [r2+8], mm2 ; i8x8c_v satd
+ movq m0, [sums+0]
+ movq m1, [sums+8]
+ movq m2, [sums+16]
+ movq m7, m0
+ psrlq m7, 15
+ paddw m2, m7
+ SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
+ psrld m2, 1
+ movd [r2+0], m0 ; i8x8c_dc satd
+ movd [r2+4], m1 ; i8x8c_h satd
+ movd [r2+8], m2 ; i8x8c_v satd
ADD rsp, 72
RET
+%endmacro ; INTRA_SATDS_MMX
+
+
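+; ABS_MOV %1, %2: writes |%2| (packed signed words) into %1, leaving %2 intact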
+%macro ABS_MOV_SSSE3 2
+ pabsw %1, %2
+%endmacro
+
+%macro ABS_MOV_MMX 2
+ pxor %1, %1
+ psubw %1, %2
+ pmaxsw %1, %2
%endmacro
+%define ABS_MOV ABS_MOV_MMX
+
+; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
+; out: [tmp]=hadamard4, m0=satd
+cglobal x264_hadamard_ac_4x4_mmxext
+ movh m0, [r0]
+ movh m1, [r0+r1]
+ movh m2, [r0+r1*2]
+ movh m3, [r0+r2]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ HADAMARD4_1D m0, m1, m2, m3
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ HADAMARD4_1D m0, m1, m2, m3
+ mova [r3], m0
+ mova [r3+8], m1
+ mova [r3+16], m2
+ mova [r3+24], m3
+ ABS1 m0, m4
+ ABS1 m1, m4
+ pand m0, m6
+ ABS1 m2, m4
+ ABS1 m3, m4
+ paddw m0, m1
+ paddw m2, m3
+ paddw m0, m2
+ SAVE_MM_PERMUTATION x264_hadamard_ac_4x4_mmxext
+ ret
+
+cglobal x264_hadamard_ac_2x2_mmxext
+ mova m0, [r3+0x00]
+ mova m1, [r3+0x20]
+ mova m2, [r3+0x40]
+ mova m3, [r3+0x60]
+ HADAMARD4_1D m0, m1, m2, m3
+ ABS2 m0, m1, m4, m5
+ ABS2 m2, m3, m4, m5
+ SAVE_MM_PERMUTATION x264_hadamard_ac_2x2_mmxext
+ ret
+
+cglobal x264_hadamard_ac_8x8_mmxext
+ mova m6, [mask_ac4 GLOBAL]
+ pxor m7, m7
+ call x264_hadamard_ac_4x4_mmxext
+ add r0, 4
+ add r3, 32
+ mova m5, m0
+ call x264_hadamard_ac_4x4_mmxext
+ lea r0, [r0+4*r1]
+ add r3, 64
+ paddw m5, m0
+ call x264_hadamard_ac_4x4_mmxext
+ sub r0, 4
+ sub r3, 32
+ paddw m5, m0
+ call x264_hadamard_ac_4x4_mmxext
+ paddw m5, m0
+ sub r3, 64
+ mova [rsp+gprsize+8], m5 ; save satd
+ call x264_hadamard_ac_2x2_mmxext
+ add r3, 8
+ pand m6, m0
+ mova m7, m1
+ paddw m6, m2
+ paddw m7, m3
+%rep 2
+ call x264_hadamard_ac_2x2_mmxext
+ add r3, 8
+ paddw m6, m0
+ paddw m7, m1
+ paddw m6, m2
+ paddw m7, m3
+%endrep
+ call x264_hadamard_ac_2x2_mmxext
+ sub r3, 24
+ paddw m6, m0
+ paddw m7, m1
+ paddw m6, m2
+ paddw m7, m3
+ paddw m6, m7
+ mova [rsp+gprsize], m6 ; save sa8d
+ SWAP m0, m6
+ SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_mmxext
+ ret
+
+%macro HADAMARD_AC_WXH_MMX 2
+cglobal x264_pixel_hadamard_ac_%1x%2_mmxext, 2,4
+ %assign pad 16-gprsize-(stack_offset&15)
+ %define ysub r1
+ sub rsp, 16+128+pad
+ lea r2, [r1*3]
+ lea r3, [rsp+16]
+ call x264_hadamard_ac_8x8_mmxext
+%if %2==16
+ %define ysub r2
+ lea r0, [r0+r1*4]
+ sub rsp, 16
+ call x264_hadamard_ac_8x8_mmxext
+%endif
+%if %1==16
+ neg ysub
+ sub rsp, 16
+ lea r0, [r0+ysub*4+8]
+ neg ysub
+ call x264_hadamard_ac_8x8_mmxext
+%if %2==16
+ lea r0, [r0+r1*4]
+ sub rsp, 16
+ call x264_hadamard_ac_8x8_mmxext
+%endif
+%endif
+ mova m1, [rsp+0x08]
+%if %1*%2 >= 128
+ paddusw m0, [rsp+0x10]
+ paddusw m1, [rsp+0x18]
+%endif
+%if %1*%2 == 256
+ paddusw m0, [rsp+0x20]
+ paddusw m1, [rsp+0x28]
+ paddusw m0, [rsp+0x30]
+ paddusw m1, [rsp+0x38]
+%endif
+ psrlw m0, 1
+ psrlw m1, 1
+ HADDW m0, m2
+ HADDW m1, m3
+ movd edx, m0
+ movd eax, m1
+ shr edx, 1
+%ifdef ARCH_X86_64
+ shl rdx, 32
+ add rax, rdx
+%endif
+ add rsp, 128+%1*%2/4+pad
+ RET
+%endmacro ; HADAMARD_AC_WXH_MMX
+
+HADAMARD_AC_WXH_MMX 16, 16
+HADAMARD_AC_WXH_MMX 8, 16
+HADAMARD_AC_WXH_MMX 16, 8
+HADAMARD_AC_WXH_MMX 8, 8
+
+%macro HADAMARD_AC_SSE2 1
+INIT_XMM
+; in: r0=pix, r1=stride, r2=stride*3
+; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
+cglobal x264_hadamard_ac_8x8_%1
+%ifdef ARCH_X86_64
+ %define spill0 m8
+ %define spill1 m9
+ %define spill2 m10
+%else
+ %define spill0 [rsp+gprsize]
+ %define spill1 [rsp+gprsize+16]
+ %define spill2 [rsp+gprsize+32]
+%endif
+ pxor m7, m7
+ movh m0, [r0]
+ movh m1, [r0+r1]
+ movh m2, [r0+r1*2]
+ movh m3, [r0+r2]
+ lea r0, [r0+r1*4]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ HADAMARD4_1D m0, m1, m2, m3
+ mova spill0, m3
+ SWAP m3, m7
+ movh m4, [r0]
+ movh m5, [r0+r1]
+ movh m6, [r0+r1*2]
+ movh m7, [r0+r2]
+ punpcklbw m4, m3
+ punpcklbw m5, m3
+ punpcklbw m6, m3
+ punpcklbw m7, m3
+ HADAMARD4_1D m4, m5, m6, m7
+ mova m3, spill0
+%ifdef ARCH_X86_64
+ TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
+%else
+ TRANSPOSE8x8W 0,1,2,3,4,5,6,7,spill0,spill1
+%endif
+ HADAMARD4_1D m0, m1, m2, m3
+ HADAMARD4_1D m4, m5, m6, m7
+ mova spill0, m1
+ mova spill1, m2
+ mova spill2, m3
+ ABS_MOV m1, m0
+ ABS_MOV m2, m4
+ ABS_MOV m3, m5
+ paddw m1, m2
+ SUMSUB_BA m0, m4
+ pand m1, [mask_ac4 GLOBAL]
+ ABS_MOV m2, spill0
+ paddw m1, m3
+ ABS_MOV m3, spill1
+ paddw m1, m2
+ ABS_MOV m2, spill2
+ paddw m1, m3
+ ABS_MOV m3, m6
+ paddw m1, m2
+ ABS_MOV m2, m7
+ paddw m1, m3
+ mova m3, m7
+ paddw m1, m2
+ mova m2, m6
+ psubw m7, spill2
+ paddw m3, spill2
+ mova [rsp+gprsize+32], m1 ; save satd
+ mova m1, m5
+ psubw m6, spill1
+ paddw m2, spill1
+ psubw m5, spill0
+ paddw m1, spill0
+ mova spill1, m7
+ SBUTTERFLY qdq, 0, 4, 7
+ SBUTTERFLY qdq, 1, 5, 7
+ SBUTTERFLY qdq, 2, 6, 7
+ SUMSUB_BADC m0, m4, m1, m5
+ SUMSUB_BA m2, m6
+ ABS1 m0, m7
+ ABS1 m1, m7
+ pand m0, [mask_ac8 GLOBAL]
+ ABS1 m2, m7
+ ABS1 m4, m7
+ ABS1 m5, m7
+ ABS1 m6, m7
+ mova m7, spill1
+ paddw m0, m4
+ SBUTTERFLY qdq, 3, 7, 4
+ SUMSUB_BA m3, m7
+ paddw m1, m5
+ ABS1 m3, m4
+ ABS1 m7, m4
+ paddw m2, m6
+ paddw m3, m7
+ paddw m0, m1
+ paddw m2, m3
+ paddw m0, m2
+ mova [rsp+gprsize+16], m0 ; save sa8d
+ SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_%1
+ ret
+
+HADAMARD_AC_WXH_SSE2 16, 16, %1
+HADAMARD_AC_WXH_SSE2 8, 16, %1
+HADAMARD_AC_WXH_SSE2 16, 8, %1
+HADAMARD_AC_WXH_SSE2 8, 8, %1
+%endmacro ; HADAMARD_AC_SSE2
+
+; struct { int satd, int sa8d; } x264_pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
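+; (the pair is packed into the uint64_t return value declared in pixel.h; a caller
+;  sketch, assuming the low 32 bits hold the first member:
+;      uint64_t v = x264_pixel_hadamard_ac_16x16_sse2( pix, stride );
+;      int satd = (uint32_t)v, sa8d = (int)(v >> 32); )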
+%macro HADAMARD_AC_WXH_SSE2 3
+cglobal x264_pixel_hadamard_ac_%1x%2_%3, 2,3
+ %assign pad 16-gprsize-(stack_offset&15)
+ %define ysub r1
+ sub rsp, 48+pad
+ lea r2, [r1*3]
+ call x264_hadamard_ac_8x8_%3
+%if %2==16
+ %define ysub r2
+ lea r0, [r0+r1*4]
+ sub rsp, 32
+ call x264_hadamard_ac_8x8_%3
+%endif
+%if %1==16
+ neg ysub
+ sub rsp, 32
+ lea r0, [r0+ysub*4+8]
+ neg ysub
+ call x264_hadamard_ac_8x8_%3
+%if %2==16
+ lea r0, [r0+r1*4]
+ sub rsp, 32
+ call x264_hadamard_ac_8x8_%3
+%endif
+%endif
+ mova m1, [rsp+0x20]
+%if %1*%2 >= 128
+ paddusw m0, [rsp+0x30]
+ paddusw m1, [rsp+0x40]
+%endif
+%if %1*%2 == 256
+ paddusw m0, [rsp+0x50]
+ paddusw m1, [rsp+0x60]
+ paddusw m0, [rsp+0x70]
+ paddusw m1, [rsp+0x80]
+%endif
+ HADDW m0, m2
+ HADDW m1, m3
+ movd edx, m0
+ movd eax, m1
+ shr edx, 2
+ shr eax, 1
+%ifdef ARCH_X86_64
+ shl rdx, 32
+ add rax, rdx
+%endif
+ add rsp, 16+%1*%2/2+pad
+ RET
+%endmacro ; HADAMARD_AC_WXH_SSE2
+
; instantiate satds
%ifndef ARCH_X86_64
@@ -1134,13 +1491,16 @@
SA8D_16x16_32 sse2
INTRA_SA8D_SSE2 sse2
INTRA_SATDS_MMX mmxext
+HADAMARD_AC_SSE2 sse2
%define ABS1 ABS1_SSSE3
%define ABS2 ABS2_SSSE3
+%define ABS_MOV ABS_MOV_SSSE3
+SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3.
SATDS_SSE2 ssse3
SA8D_16x16_32 ssse3
INTRA_SA8D_SSE2 ssse3
INTRA_SATDS_MMX ssse3
-SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3.
+HADAMARD_AC_SSE2 ssse3
%define SATD_8x4_SSE2 SATD_8x4_PHADD
SATDS_SSE2 ssse3_phadd
@@ -1155,44 +1515,43 @@
; const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
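; a rough C sketch of what the core accumulates for two horizontally adjacent
; 4x4 blocks (names illustrative):
;     for( z = 0; z < 2; z++, pix1 += 4, pix2 += 4 )
;     {
;         uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0;
;         for( y = 0; y < 4; y++ )
;             for( x = 0; x < 4; x++ )
;             {
;                 int a = pix1[x + y*stride1], b = pix2[x + y*stride2];
;                 s1 += a; s2 += b; ss += a*a + b*b; s12 += a*b;
;             }
;         sums[z][0] = s1; sums[z][1] = s2; sums[z][2] = ss; sums[z][3] = s12;
;     }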
cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4
- pxor xmm0, xmm0
- pxor xmm1, xmm1
- pxor xmm2, xmm2
- pxor xmm3, xmm3
- pxor xmm4, xmm4
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+ pxor m4, m4
%rep 4
- movq xmm5, [r0]
- movq xmm6, [r2]
- punpcklbw xmm5, xmm0
- punpcklbw xmm6, xmm0
- paddw xmm1, xmm5
- paddw xmm2, xmm6
- movdqa xmm7, xmm5
- pmaddwd xmm5, xmm5
- pmaddwd xmm7, xmm6
- pmaddwd xmm6, xmm6
- paddd xmm3, xmm5
- paddd xmm4, xmm7
- paddd xmm3, xmm6
+ movq m5, [r0]
+ movq m6, [r2]
+ punpcklbw m5, m0
+ punpcklbw m6, m0
+ paddw m1, m5
+ paddw m2, m6
+ movdqa m7, m5
+ pmaddwd m5, m5
+ pmaddwd m7, m6
+ pmaddwd m6, m6
+ paddd m3, m5
+ paddd m4, m7
+ paddd m3, m6
add r0, r1
add r2, r3
%endrep
- ; PHADDW xmm1, xmm2
- ; PHADDD xmm3, xmm4
- picgetgot eax
- movdqa xmm7, [pw_1 GLOBAL]
- pshufd xmm5, xmm3, 0xb1
- pmaddwd xmm1, xmm7
- pmaddwd xmm2, xmm7
- pshufd xmm6, xmm4, 0xb1
- packssdw xmm1, xmm2
- paddd xmm3, xmm5
- pshufd xmm1, xmm1, 0xd8
- paddd xmm4, xmm6
- pmaddwd xmm1, xmm7
- movdqa xmm5, xmm3
- punpckldq xmm3, xmm4
- punpckhdq xmm5, xmm4
+ ; PHADDW m1, m2
+ ; PHADDD m3, m4
+ movdqa m7, [pw_1 GLOBAL]
+ pshufd m5, m3, 0xb1
+ pmaddwd m1, m7
+ pmaddwd m2, m7
+ pshufd m6, m4, 0xb1
+ packssdw m1, m2
+ paddd m3, m5
+ pshufd m1, m1, 0xd8
+ paddd m4, m6
+ pmaddwd m1, m7
+ movdqa m5, m3
+ punpckldq m3, m4
+ punpckhdq m5, m4
%ifdef ARCH_X86_64
%define t0 r4
@@ -1201,77 +1560,76 @@
mov t0, r4m
%endif
- movq [t0+ 0], xmm1
- movq [t0+ 8], xmm3
- psrldq xmm1, 8
- movq [t0+16], xmm1
- movq [t0+24], xmm5
+ movq [t0+ 0], m1
+ movq [t0+ 8], m3
+ psrldq m1, 8
+ movq [t0+16], m1
+ movq [t0+24], m5
RET
;-----------------------------------------------------------------------------
; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
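; roughly, per 4x4 window (see the cvtdq2ps comments below), this evaluates
;     ssim = ((2*s1*s2 + ssim_c1) * (2*covar + ssim_c2))
;          / ((s1*s1 + s2*s2 + ssim_c1) * (vars + ssim_c2))
; with vars = 64*ss - (s1*s1 + s2*s2) and covar = 64*s12 - s1*s2; the four lanes
; are then masked to 'width' entries, summed, and returned as a float.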
cglobal x264_pixel_ssim_end4_sse2, 3,3
- movdqa xmm0, [r0+ 0]
- movdqa xmm1, [r0+16]
- movdqa xmm2, [r0+32]
- movdqa xmm3, [r0+48]
- movdqa xmm4, [r0+64]
- paddd xmm0, [r1+ 0]
- paddd xmm1, [r1+16]
- paddd xmm2, [r1+32]
- paddd xmm3, [r1+48]
- paddd xmm4, [r1+64]
- paddd xmm0, xmm1
- paddd xmm1, xmm2
- paddd xmm2, xmm3
- paddd xmm3, xmm4
- picgetgot r1
- movdqa xmm5, [ssim_c1 GLOBAL]
- movdqa xmm6, [ssim_c2 GLOBAL]
- TRANSPOSE4x4D xmm0, xmm1, xmm2, xmm3, xmm4
-
-; s1=mm0, s2=mm3, ss=mm4, s12=mm2
- movdqa xmm1, xmm3
- pslld xmm3, 16
- pmaddwd xmm1, xmm0 ; s1*s2
- por xmm0, xmm3
- pmaddwd xmm0, xmm0 ; s1*s1 + s2*s2
- pslld xmm1, 1
- pslld xmm2, 7
- pslld xmm4, 6
- psubd xmm2, xmm1 ; covar*2
- psubd xmm4, xmm0 ; vars
- paddd xmm0, xmm5
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm4, xmm6
- cvtdq2ps xmm0, xmm0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
- cvtdq2ps xmm1, xmm1 ; (float)(s1*s2*2 + ssim_c1)
- cvtdq2ps xmm2, xmm2 ; (float)(covar*2 + ssim_c2)
- cvtdq2ps xmm4, xmm4 ; (float)(vars + ssim_c2)
- mulps xmm1, xmm2
- mulps xmm0, xmm4
- divps xmm1, xmm0 ; ssim
+ movdqa m0, [r0+ 0]
+ movdqa m1, [r0+16]
+ movdqa m2, [r0+32]
+ movdqa m3, [r0+48]
+ movdqa m4, [r0+64]
+ paddd m0, [r1+ 0]
+ paddd m1, [r1+16]
+ paddd m2, [r1+32]
+ paddd m3, [r1+48]
+ paddd m4, [r1+64]
+ paddd m0, m1
+ paddd m1, m2
+ paddd m2, m3
+ paddd m3, m4
+ movdqa m5, [ssim_c1 GLOBAL]
+ movdqa m6, [ssim_c2 GLOBAL]
+ TRANSPOSE4x4D 0, 1, 2, 3, 4
+
+; s1=m0, s2=m1, ss=m2, s12=m3
+ movdqa m4, m1
+ pslld m1, 16
+ pmaddwd m4, m0 ; s1*s2
+ por m0, m1
+ pmaddwd m0, m0 ; s1*s1 + s2*s2
+ pslld m4, 1
+ pslld m3, 7
+ pslld m2, 6
+ psubd m3, m4 ; covar*2
+ psubd m2, m0 ; vars
+ paddd m0, m5
+ paddd m4, m5
+ paddd m3, m6
+ paddd m2, m6
+ cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
+ cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
+ cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
+ cvtdq2ps m2, m2 ; (float)(vars + ssim_c2)
+ mulps m4, m3
+ mulps m0, m2
+ divps m4, m0 ; ssim
- cmp r2d, 4
+ cmp r2d, 4
je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
- neg r2
-%ifdef PIC64
- lea r3, [mask_ff + 16 GLOBAL]
- movdqu xmm3, [r3 + r2*4]
+ neg r2
+%ifdef PIC
+ lea r3, [mask_ff + 16 GLOBAL]
+ movdqu m1, [r3 + r2*4]
%else
- movdqu xmm3, [mask_ff + r2*4 + 16 GLOBAL]
+ movdqu m1, [mask_ff + r2*4 + 16 GLOBAL]
%endif
- pand xmm1, xmm3
+ pand m4, m1
.skip:
- movhlps xmm0, xmm1
- addps xmm0, xmm1
- pshuflw xmm1, xmm0, 0xE
- addss xmm0, xmm1
+ movhlps m0, m4
+ addps m0, m4
+ pshuflw m4, m0, 0xE
+ addss m0, m4
%ifndef ARCH_X86_64
- movd r0m, xmm0
- fld dword r0m
+ movd r0m, m0
+ fld dword r0m
%endif
RET
@@ -1281,7 +1639,7 @@
; Successive Elimination ADS
;=============================================================================
-%macro ADS_START 1 ; unroll_size
+%macro ADS_START 1 ; unroll_size
%ifdef ARCH_X86_64
%define t0 r6
mov r10, rsp
@@ -1295,7 +1653,7 @@
and rsp, ~15
mov t0, rsp
shl r2d, 1
-%endmacro
+%endmacro
%macro ADS_END 1
add r1, 8*%1
Changed: x264-snapshot-20081001-2245.tar.bz2/common/x86/pixel.h
@@ -43,6 +43,7 @@
DECL_X1( sad, mmxext )
DECL_X1( sad, sse2 )
DECL_X1( sad, sse3 )
+DECL_X1( sad, sse2_aligned )
DECL_X4( sad, mmxext )
DECL_X4( sad, sse2 )
DECL_X4( sad, sse3 )
@@ -64,22 +65,27 @@
DECL_X4( sad, cache64_sse2 );
DECL_X4( sad, cache64_ssse3 );
-#undef DECL_PIXELS
-#undef DECL_X1
-#undef DECL_X4
+DECL_PIXELS( int, var, mmxext, ( uint8_t *pix, int i_stride, uint32_t *sad ))
+DECL_PIXELS( int, var, sse2, ( uint8_t *pix, int i_stride, uint32_t *sad ))
+DECL_PIXELS( uint64_t, hadamard_ac, mmxext, ( uint8_t *pix, int i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( uint8_t *pix, int i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( uint8_t *pix, int i_stride ))
-void x264_intra_satd_x3_4x4_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_satd_x3_4x4_ssse3( uint8_t *, uint8_t *, int * );
-void x264_intra_satd_x3_8x8c_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_satd_x3_8x8c_ssse3( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_16x16_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_satd_x3_16x16_ssse3( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_sse2( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_ssse3( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_16x16_ssse3 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_16x16_mmxext ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_16x16_sse2 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_16x16_ssse3 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_mmxext ( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_sse2 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *, int16_t [2][8], int * );
-void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *, int16_t [2][8], int * );
-void x264_intra_sa8d_x3_8x8_core_ssse3( uint8_t *, int16_t [2][8], int * );
+void x264_intra_sa8d_x3_8x8_core_sse2 ( uint8_t *, int16_t [2][8], int * );
+void x264_intra_sa8d_x3_8x8_core_ssse3 ( uint8_t *, int16_t [2][8], int * );
void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
const uint8_t *pix2, int stride2, int sums[2][4] );
@@ -99,6 +105,10 @@
DECL_ADS( 4, ssse3 )
DECL_ADS( 2, ssse3 )
DECL_ADS( 1, ssse3 )
+
+#undef DECL_PIXELS
+#undef DECL_X1
+#undef DECL_X4
#undef DECL_ADS
#endif
Changed: x264-snapshot-20081001-2245.tar.bz2/common/x86/predict-a.asm
@@ -22,6 +22,7 @@
;*****************************************************************************
%include "x86inc.asm"
+%include "x86util.asm"
%macro STORE8x8 2
movq [r0 + 0*FDEC_STRIDE], %1
@@ -66,13 +67,14 @@
ALIGN 16
pb_1: times 16 db 1
+pb_3: times 16 db 3
pw_2: times 4 dw 2
pw_4: times 4 dw 4
pw_8: times 8 dw 8
pw_76543210:
pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7
pb_00s_ff: times 8 db 0
-pb_0s_ff: times 7 db 0
+pb_0s_ff: times 7 db 0
db 0xff
SECTION .text
@@ -99,7 +101,7 @@
;-----------------------------------------------------------------------------
; void predict_4x4_ddl_mmxext( uint8_t *src )
;-----------------------------------------------------------------------------
-cglobal predict_4x4_ddl_mmxext, 1,1,1
+cglobal predict_4x4_ddl_mmxext, 1,1
sub r0, FDEC_STRIDE
movq mm3, [r0]
movq mm1, [r0-1]
@@ -123,7 +125,7 @@
;-----------------------------------------------------------------------------
; void predict_4x4_vl_mmxext( uint8_t *src )
;-----------------------------------------------------------------------------
-cglobal predict_4x4_vl_mmxext, 1,1,1
+cglobal predict_4x4_vl_mmxext, 1,1
movq mm1, [r0-FDEC_STRIDE]
movq mm3, mm1
movq mm2, mm1
@@ -144,6 +146,31 @@
RET
;-----------------------------------------------------------------------------
+; void predict_4x4_dc( uint8_t *src )
+;-----------------------------------------------------------------------------
+
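+; rough C equivalent (a sketch):
+; {
+;     int dc = 4, i;
+;     for( i = 0; i < 4; i++ )
+;         dc += src[i - FDEC_STRIDE] + src[i*FDEC_STRIDE - 1];
+;     dc = (dc >> 3) * 0x01010101;
+;     for( i = 0; i < 4; i++ )
+;         *(uint32_t*)&src[i*FDEC_STRIDE] = dc;
+; }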
+cglobal predict_4x4_dc_mmxext, 1,4
+ pxor mm7, mm7
+ movd mm0, [r0-FDEC_STRIDE]
+ psadbw mm0, mm7
+ movd r3d, mm0
+ movzx r1d, byte [r0-1]
+%assign n 1
+%rep 3
+ movzx r2d, byte [r0+FDEC_STRIDE*n-1]
+ add r1d, r2d
+%assign n n+1
+%endrep
+ lea r1d, [r1+r3+4]
+ shr r1d, 3
+ imul r1d, 0x01010101
+ mov [r0+FDEC_STRIDE*0], r1d
+ mov [r0+FDEC_STRIDE*1], r1d
+ mov [r0+FDEC_STRIDE*2], r1d
+ mov [r0+FDEC_STRIDE*3], r1d
+ RET
+
+;-----------------------------------------------------------------------------
; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_v_mmxext, 2,2
@@ -152,9 +179,34 @@
RET
;-----------------------------------------------------------------------------
+; void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] )
+;-----------------------------------------------------------------------------
+
+INIT_MMX
+cglobal predict_8x8_h_mmxext, 2,2
+ movu m3, [r1+7]
+ mova m7, m3
+ punpckhbw m3, m3
+ punpcklbw m7, m7
+ pshufw m0, m3, 0xff
+ pshufw m1, m3, 0xaa
+ pshufw m2, m3, 0x55
+ pshufw m3, m3, 0x00
+ pshufw m4, m7, 0xff
+ pshufw m5, m7, 0xaa
+ pshufw m6, m7, 0x55
+ pshufw m7, m7, 0x00
+%assign n 0
+%rep 8
+ mova [r0+n*FDEC_STRIDE], m %+ n
+%assign n n+1
+%endrep
+ RET
+
+;-----------------------------------------------------------------------------
; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
-cglobal predict_8x8_dc_mmxext, 2,2,1
+cglobal predict_8x8_dc_mmxext, 2,2
pxor mm0, mm0
pxor mm1, mm1
psadbw mm0, [r1+7]
@@ -171,7 +223,7 @@
; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
%macro PRED8x8_DC 2
-cglobal %1, 2,2,1
+cglobal %1, 2,2
pxor mm0, mm0
psadbw mm0, [r1+%2]
paddw mm0, [pw_4 GLOBAL]
@@ -192,7 +244,7 @@
;-----------------------------------------------------------------------------
; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddl_mmxext, 2,2,1
+cglobal predict_8x8_ddl_mmxext, 2,2
movq mm5, [r1+16]
movq mm2, [r1+17]
movq mm3, [r1+23]
@@ -223,7 +275,7 @@
;-----------------------------------------------------------------------------
; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddr_mmxext, 2,2,1
+cglobal predict_8x8_ddr_mmxext, 2,2
movq mm1, [r1+7]
movq mm2, [r1+9]
movq mm3, [r1+15]
@@ -254,7 +306,7 @@
;-----------------------------------------------------------------------------
; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddl_sse2, 2,2,1
+cglobal predict_8x8_ddl_sse2, 2,2
movdqa xmm3, [r1+16]
movdqu xmm2, [r1+17]
movdqa xmm1, xmm3
@@ -272,7 +324,7 @@
;-----------------------------------------------------------------------------
; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddr_sse2, 2,2,1
+cglobal predict_8x8_ddr_sse2, 2,2
movdqu xmm3, [r1+8]
movdqu xmm1, [r1+7]
movdqa xmm2, xmm3
@@ -297,7 +349,7 @@
;-----------------------------------------------------------------------------
; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_vl_sse2, 2,2,1
+cglobal predict_8x8_vl_sse2, 2,2
movdqa xmm4, [r1+16]
movdqa xmm2, xmm4
movdqa xmm1, xmm4
@@ -338,7 +390,7 @@
; 6 .....
; 7 ,,,,,
-cglobal predict_8x8_vr_core_mmxext, 2,2,1
+cglobal predict_8x8_vr_core_mmxext, 2,2
movq mm2, [r1+16]
movq mm3, [r1+15]
movq mm1, [r1+14]
@@ -368,9 +420,33 @@
RET
;-----------------------------------------------------------------------------
+; void predict_8x8c_h_mmxext( uint8_t *src )
+;-----------------------------------------------------------------------------
+
+%macro PRED_8x8C_H 1
+cglobal predict_8x8c_h_%1, 1,1
+%ifidn %1, ssse3
+ mova m1, [pb_3 GLOBAL]
+%endif
+%assign n 0
+%rep 8
+ SPLATB m0, r0+FDEC_STRIDE*n-1, m1
+ mova [r0+FDEC_STRIDE*n], m0
+%assign n n+1
+%endrep
+ REP_RET
+%endmacro
+
+INIT_MMX
+%define SPLATB SPLATB_MMX
+PRED_8x8C_H mmxext
+%define SPLATB SPLATB_SSSE3
+PRED_8x8C_H ssse3
+
+;-----------------------------------------------------------------------------
; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
;-----------------------------------------------------------------------------
-cglobal predict_8x8c_dc_core_mmxext, 1,1,1
+cglobal predict_8x8c_dc_core_mmxext, 1,1
movq mm0, [r0 - FDEC_STRIDE]
pxor mm1, mm1
pxor mm2, mm2
@@ -422,7 +498,7 @@
;-----------------------------------------------------------------------------
; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
-cglobal predict_8x8c_p_core_mmxext, 1,2,1
+cglobal predict_8x8c_p_core_mmxext, 1,2
LOAD_PLANE_ARGS
movq mm1, mm2
pmullw mm2, [pw_3210 GLOBAL]
@@ -450,7 +526,7 @@
;-----------------------------------------------------------------------------
; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
-cglobal predict_16x16_p_core_mmxext, 1,2,1
+cglobal predict_16x16_p_core_mmxext, 1,2
LOAD_PLANE_ARGS
movq mm5, mm2
movq mm1, mm2
@@ -492,7 +568,7 @@
;-----------------------------------------------------------------------------
; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
-cglobal predict_16x16_p_core_sse2, 1,2,1
+cglobal predict_16x16_p_core_sse2, 1,2
movd xmm0, r1m
movd xmm1, r2m
movd xmm2, r3m
@@ -543,6 +619,39 @@
REP_RET
;-----------------------------------------------------------------------------
+; void predict_16x16_h_mmxext( uint8_t *src )
+;-----------------------------------------------------------------------------
+
+%macro PRED_16x16_H 1
+cglobal predict_16x16_h_%1, 1,2
+ mov r1, FDEC_STRIDE*12
+%ifidn %1, ssse3
+ mova m1, [pb_3 GLOBAL]
+%endif
+.vloop:
+%assign n 0
+%rep 4
+ SPLATB m0, r0+r1+FDEC_STRIDE*n-1, m1
+ mova [r0+r1+FDEC_STRIDE*n], m0
+%if mmsize==8
+ mova [r0+r1+FDEC_STRIDE*n+8], m0
+%endif
+%assign n n+1
+%endrep
+ add r1, -FDEC_STRIDE*4
+ jge .vloop
+ REP_RET
+%endmacro
+
+; no SSE2, it's slower than MMX on all systems that don't support SSSE3
+INIT_MMX
+%define SPLATB SPLATB_MMX
+PRED_16x16_H mmxext
+INIT_XMM
+%define SPLATB SPLATB_SSSE3
+PRED_16x16_H ssse3
+
+;-----------------------------------------------------------------------------
; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
;-----------------------------------------------------------------------------
@@ -568,7 +677,7 @@
%endif
REP_RET
-cglobal predict_16x16_dc_top_mmxext, 1,2,1
+cglobal predict_16x16_dc_top_mmxext, 1,2
PRED16x16_DC [pw_8 GLOBAL], 4
REP_RET
@@ -594,7 +703,7 @@
PRED16x16_DC_SSE2 xmm2, 5
REP_RET
-cglobal predict_16x16_dc_top_sse2, 1,2,1
+cglobal predict_16x16_dc_top_sse2, 1,2
PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4
REP_RET
Changed: x264-snapshot-20081001-2245.tar.bz2/common/x86/predict-c.c
@@ -26,13 +26,18 @@
#include "pixel.h"
extern void predict_16x16_v_mmx( uint8_t *src );
+extern void predict_16x16_h_mmxext( uint8_t *src );
+extern void predict_16x16_h_ssse3( uint8_t *src );
extern void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left );
extern void predict_16x16_dc_top_mmxext( uint8_t *src );
extern void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );
extern void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
extern void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 );
extern void predict_8x8c_v_mmx( uint8_t *src );
+extern void predict_8x8c_h_mmxext( uint8_t *src );
+extern void predict_8x8c_h_ssse3( uint8_t *src );
extern void predict_8x8_v_mmxext( uint8_t *src, uint8_t edge[33] );
+extern void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_dc_mmxext( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t edge[33] );
@@ -44,6 +49,7 @@
extern void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t edge[33] );
extern void predict_4x4_ddl_mmxext( uint8_t *src );
extern void predict_4x4_vl_mmxext( uint8_t *src );
+extern void predict_4x4_dc_mmxext( uint8_t *src );
extern void predict_16x16_dc_top_sse2( uint8_t *src );
extern void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left );
extern void predict_16x16_v_sse2( uint8_t *src );
@@ -126,40 +132,18 @@
}
#ifdef ARCH_X86_64
-static void predict_16x16_h( uint8_t *src )
-{
- int y;
- for( y = 0; y < 16; y++ )
- {
- const uint64_t v = 0x0101010101010101ULL * src[-1];
- uint64_t *p = (uint64_t*)src;
- p[0] = p[1] = v;
- src += FDEC_STRIDE;
- }
-}
-
-static void predict_8x8c_h( uint8_t *src )
-{
- int y;
- for( y = 0; y < 8; y++ )
- {
- *(uint64_t*)src = 0x0101010101010101ULL * src[-1];
- src += FDEC_STRIDE;
- }
-}
-
static void predict_16x16_dc_left( uint8_t *src )
{
uint32_t s = 0;
- uint64_t dc;
+ uint64_t dc;
int y;
-
+
for( y = 0; y < 16; y++ )
{
s += src[-1 + y * FDEC_STRIDE];
- }
+ }
dc = (( s + 8 ) >> 4) * 0x0101010101010101ULL;
-
+
for( y = 0; y < 16; y++ )
{
uint64_t *p = (uint64_t*)src;
@@ -496,7 +480,6 @@
if( !(cpu&X264_CPU_MMX) )
return;
#ifdef ARCH_X86_64
- pf[I_PRED_16x16_H] = predict_16x16_h;
pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left;
#endif
pf[I_PRED_16x16_V] = predict_16x16_v_mmx;
@@ -505,6 +488,7 @@
pf[I_PRED_16x16_DC] = predict_16x16_dc_mmxext;
pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_mmxext;
pf[I_PRED_16x16_P] = predict_16x16_p_mmxext;
+ pf[I_PRED_16x16_H] = predict_16x16_h_mmxext;
if( !(cpu&X264_CPU_SSE2) )
return;
pf[I_PRED_16x16_DC] = predict_16x16_dc_sse2;
@@ -513,6 +497,9 @@
return;
pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_sse2;
pf[I_PRED_16x16_P] = predict_16x16_p_sse2;
+ if( !(cpu&X264_CPU_SSSE3) )
+ return;
+ pf[I_PRED_16x16_H] = predict_16x16_h_ssse3;
}
void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
@@ -520,15 +507,18 @@
if( !(cpu&X264_CPU_MMX) )
return;
#ifdef ARCH_X86_64
- pf[I_PRED_CHROMA_H] = predict_8x8c_h;
pf[I_PRED_CHROMA_DC_LEFT] = predict_8x8c_dc_left;
pf[I_PRED_CHROMA_DC_TOP] = predict_8x8c_dc_top;
#endif
pf[I_PRED_CHROMA_V] = predict_8x8c_v_mmx;
if( !(cpu&X264_CPU_MMXEXT) )
return;
+ pf[I_PRED_CHROMA_H] = predict_8x8c_h_mmxext;
pf[I_PRED_CHROMA_P] = predict_8x8c_p_mmxext;
pf[I_PRED_CHROMA_DC] = predict_8x8c_dc_mmxext;
+ if( !(cpu&X264_CPU_SSSE3) )
+ return;
+ pf[I_PRED_CHROMA_H] = predict_8x8c_h_ssse3;
}
void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12] )
@@ -536,6 +526,7 @@
if( !(cpu&X264_CPU_MMXEXT) )
return;
pf[I_PRED_8x8_V] = predict_8x8_v_mmxext;
+ pf[I_PRED_8x8_H] = predict_8x8_h_mmxext;
pf[I_PRED_8x8_DC] = predict_8x8_dc_mmxext;
pf[I_PRED_8x8_DC_TOP] = predict_8x8_dc_top_mmxext;
pf[I_PRED_8x8_DC_LEFT]= predict_8x8_dc_left_mmxext;
@@ -565,4 +556,5 @@
return;
pf[I_PRED_4x4_DDL] = predict_4x4_ddl_mmxext;
pf[I_PRED_4x4_VL] = predict_4x4_vl_mmxext;
+ pf[I_PRED_4x4_DC] = predict_4x4_dc_mmxext;
}
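The predict-c.c hunk above also shows how x264 chooses between the new assembly predictors: each *_init_mmx routine fills the function-pointer table from slowest to fastest implementation and returns early as soon as a required CPU flag is missing, so the SSSE3 entries only replace the MMXEXT ones on CPUs that have both. A minimal sketch of that dispatch pattern in C (the flag names and predictor functions here are illustrative, not the x264 API):

#include <stdint.h>

typedef void (*predict_fn)( uint8_t *src );

enum { CPU_MMX = 1, CPU_MMXEXT = 2, CPU_SSE2 = 4, CPU_SSSE3 = 8 };

static void predict_h_c( uint8_t *src )      { (void)src; /* portable fallback */ }
static void predict_h_mmxext( uint8_t *src ) { (void)src; /* faster version */ }
static void predict_h_ssse3( uint8_t *src )  { (void)src; /* fastest version */ }

/* Fill the table from slowest to fastest: later assignments override earlier
 * ones, and a missing capability bit ends the upgrade chain early. */
static void predict_init( unsigned cpu, predict_fn pf[1] )
{
    pf[0] = predict_h_c;
    if( !(cpu & CPU_MMXEXT) )
        return;
    pf[0] = predict_h_mmxext;
    if( !(cpu & CPU_SSSE3) )
        return;
    pf[0] = predict_h_ssse3;
}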
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/quant-a.asm
@@ -270,7 +270,6 @@
.rshift32:
neg t0d
movd m5, t0d
- picgetgot t0d
mova m6, [pd_1 GLOBAL]
pxor m7, m7
pslld m6, m5
@@ -290,12 +289,11 @@
sub t2d, t1d
sub t2d, t1d ; i_mf = i_qp % 6
shl t2d, %3
-%ifdef PIC64
+%ifdef PIC
lea r1, [dequant%2_scale GLOBAL]
add r1, t2
%else
- picgetgot r0
- lea r1, [t2 + dequant%2_scale GLOBAL]
+ lea r1, [dequant%2_scale + t2 GLOBAL]
%endif
movifnidn r0d, r0m
movd m7, t0d
@@ -331,10 +329,10 @@
;-----------------------------------------------------------------------------
-; void x264_denoise_dct_core_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
+; void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
;-----------------------------------------------------------------------------
%macro DENOISE_DCT 1
-cglobal x264_denoise_dct_core_%1, 4,5
+cglobal x264_denoise_dct_%1, 4,5
movzx r4d, word [r0] ; backup DC coefficient
pxor m7, m7
.loop:
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/quant.h
@@ -43,8 +43,8 @@
void x264_dequant_8x8_flat16_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
void x264_dequant_4x4_flat16_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void x264_dequant_8x8_flat16_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
-void x264_denoise_dct_core_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
-void x264_denoise_dct_core_sse2( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
-void x264_denoise_dct_core_ssse3( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
+void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
+void x264_denoise_dct_sse2( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
+void x264_denoise_dct_ssse3( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
#endif
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/sad-a.asm
@@ -23,8 +23,10 @@
;*****************************************************************************
%include "x86inc.asm"
+%include "x86util.asm"
SECTION_RODATA
+pb_3: times 16 db 3
sw_64: dd 64
SECTION .text
@@ -80,7 +82,7 @@
pxor mm0, mm0
%rep %2/2
SAD_INC_2x%1P
-%endrep
+%endrep
movd eax, mm0
RET
%endmacro
@@ -215,7 +217,99 @@
SAD_W16 sse2
%define movdqu lddqu
SAD_W16 sse3
-%undef movdqu
+%define movdqu movdqa
+SAD_W16 sse2_aligned
+%define movdqu movups
+
+
+
+;-----------------------------------------------------------------------------
+; void intra_sad_x3_16x16 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
+;-----------------------------------------------------------------------------
+
+;xmm7: DC prediction xmm6: H prediction xmm5: V prediction
+;xmm4: DC pred score xmm3: H pred score xmm2: V pred score
+%macro INTRA_SAD16 1
+cglobal x264_intra_sad_x3_16x16_%1,3,5
+ pxor mm0, mm0
+ pxor mm1, mm1
+ psadbw mm0, [r1-FDEC_STRIDE+0]
+ psadbw mm1, [r1-FDEC_STRIDE+8]
+ paddw mm0, mm1
+ movd r3d, mm0
+%ifidn %1, ssse3
+ mova m1, [pb_3 GLOBAL]
+%endif
+%assign n 0
+%rep 16
+ movzx r4d, byte [r1-1+FDEC_STRIDE*n]
+ add r3d, r4d
+%assign n n+1
+%endrep
+ add r3d, 16
+ shr r3d, 5
+ imul r3d, 0x01010101
+ movd m7, r3d
+ mova m5, [r1-FDEC_STRIDE]
+%if mmsize==16
+ pshufd m7, m7, 0
+%else
+ mova m1, [r1-FDEC_STRIDE+8]
+ punpckldq m7, m7
+%endif
+ pxor m4, m4
+ pxor m3, m3
+ pxor m2, m2
+ mov r3d, 15*FENC_STRIDE
+.vloop:
+ SPLATB m6, r1+r3*2-1, m1
+ mova m0, [r0+r3]
+ psadbw m0, m7
+ paddw m4, m0
+ mova m0, [r0+r3]
+ psadbw m0, m5
+ paddw m2, m0
+%if mmsize==8
+ mova m0, [r0+r3]
+ psadbw m0, m6
+ paddw m3, m0
+ mova m0, [r0+r3+8]
+ psadbw m0, m7
+ paddw m4, m0
+ mova m0, [r0+r3+8]
+ psadbw m0, m1
+ paddw m2, m0
+ psadbw m6, [r0+r3+8]
+ paddw m3, m6
+%else
+ psadbw m6, [r0+r3]
+ paddw m3, m6
+%endif
+ add r3d, -FENC_STRIDE
+ jge .vloop
+%if mmsize==16
+ pslldq m3, 4
+ por m3, m2
+ movhlps m1, m3
+ paddw m3, m1
+ movq [r2+0], m3
+ movhlps m1, m4
+ paddw m4, m1
+%else
+ movd [r2+0], m2
+ movd [r2+4], m3
+%endif
+ movd [r2+8], m4
+ RET
+%endmacro
+
+INIT_MMX
+%define SPLATB SPLATB_MMX
+INTRA_SAD16 mmxext
+INIT_XMM
+INTRA_SAD16 sse2
+%define SPLATB SPLATB_SSSE3
+INTRA_SAD16 ssse3
@@ -694,7 +788,7 @@
and eax, 0x37
cmp eax, 0x30
jle x264_pixel_sad_16x%2_sse2
- PROLOGUE 4,6,0
+ PROLOGUE 4,6
mov r4d, r2d
and r4d, 15
%ifidn %1, ssse3
@@ -704,11 +798,10 @@
shl r4d, 4 ; code size = 80
%endif
%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
-%ifdef PIC64
+%ifdef PIC
lea r5, [sad_w16_addr GLOBAL]
add r5, r4
%else
- picgetgot r5
lea r5, [sad_w16_addr + r4 GLOBAL]
%endif
and r2, ~15
@@ -728,18 +821,10 @@
jle x264_pixel_sad_%1x%2_mmxext
and eax, 7
shl eax, 3
-%ifdef PIC32
- ; both versions work, but picgetgot is slower than gpr->mmx is slower than mem->mmx
- mov r2, 64
- sub r2, eax
- movd mm7, eax
- movd mm6, r2
-%else
movd mm6, [sw_64 GLOBAL]
movd mm7, eax
psubw mm6, mm7
-%endif
- PROLOGUE 4,5,0
+ PROLOGUE 4,5
and r2, ~7
mov r4d, %3
pxor mm0, mm0
@@ -825,11 +910,11 @@
call x264_pixel_sad_%1x%2_cache%3_%5
mov [r11], eax
pop r2
- mov r0, r10
+ mov r0, r10
call x264_pixel_sad_%1x%2_cache%3_%5
mov [r11+4], eax
pop r2
- mov r0, r10
+ mov r0, r10
call x264_pixel_sad_%1x%2_cache%3_%5
mov [r11+8], eax
%else
@@ -875,15 +960,15 @@
call x264_pixel_sad_%1x%2_cache%3_%5
mov [r11], eax
pop r2
- mov r0, r10
+ mov r0, r10
call x264_pixel_sad_%1x%2_cache%3_%5
mov [r11+4], eax
pop r2
- mov r0, r10
+ mov r0, r10
call x264_pixel_sad_%1x%2_cache%3_%5
mov [r11+8], eax
pop r2
- mov r0, r10
+ mov r0, r10
call x264_pixel_sad_%1x%2_cache%3_%5
mov [r11+12], eax
%else
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/util.h
@@ -69,7 +69,7 @@
"jg 1b \n"
"movq %%mm4, %0 \n"
:"=m"(output), "+r"(i_mvc)
- :"r"(mvc)
+ :"r"(mvc), "m"(*(struct {int16_t x[4];} *)mvc)
);
sum += output[0] + output[1] + output[2] + output[3];
return sum;
@@ -90,7 +90,7 @@
"psadbw %%mm7, %%mm1 \n"
"movd %%mm1, %0 \n"
:"=r"(count)
- :"r"(v)
+ :"r"(v), "m"(*(struct {int16_t x[16];} *)v)
);
return (count+0x10)&0xff;
}
@@ -100,7 +100,7 @@
{
if(i_count == 128)
{
- int nonzero;
+ int nonzero = 0;
asm(
"movq (%1), %%mm0 \n"
"por 8(%1), %%mm0 \n"
@@ -121,7 +121,7 @@
"packsswb %%mm0, %%mm0 \n"
"movd %%mm0, %0 \n"
:"=r"(nonzero)
- :"r"(v)
+ :"r"(v), "m"(*(struct {int16_t x[64];} *)v)
);
return !!nonzero;
}
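The util.h hunk above adds dummy "m" inputs of the form *(struct {int16_t x[N];} *)ptr to each asm statement. Without such an operand (or a blanket "memory" clobber) GCC only sees the pointer value being used, so it may reorder or even discard the stores that filled the array before the asm runs. A minimal sketch of the idiom, assuming GCC/Clang extended asm on x86; the function itself is made up for illustration:

#include <stdint.h>

/* Returns v[0] + v[1].  The anonymous-struct cast passes "the 4 bytes behind
 * the pointer" as a memory input, so the compiler knows the asm reads them. */
static inline int sum2_asm( const int16_t *v )
{
    int out;
    asm( "movswl (%1), %0     \n"
         "movswl 2(%1), %%eax \n"
         "addl   %%eax, %0    \n"
         : "=&r"(out)
         : "r"(v), "m"(*(const struct { int16_t x[2]; } *)v)
         : "eax" );
    return out;
}

int main( void )
{
    int16_t v[2] = { 3, 4 };
    return sum2_asm( v ) == 7 ? 0 : 1;
}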
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/x86inc.asm
@@ -37,57 +37,26 @@
%endif
%endmacro
-; PIC support macros. All these macros are totally harmless when PIC is
-; not defined but can ruin everything if misused in PIC mode. On x86_32, shared
-; objects cannot directly access global variables by address, they need to
-; go through the GOT (global offset table). Most OSes do not care about it
-; and let you load non-shared .so objects (Linux, Win32...). However, OS X
-; requires PIC code in its .dylib objects.
-;
-; - GLOBAL should be used as a suffix for global addressing, eg.
-; picgetgot ebx
+; PIC support macros.
+; x86_64 can't fit 64bit address literals in most instruction types,
+; so shared objects (under the assumption that they might be anywhere
+; in memory) must use an address mode that does fit.
+; So all accesses to global variables must use this macro, e.g.
; mov eax, [foo GLOBAL]
-; instead of
+; instead of
; mov eax, [foo]
;
-; - picgetgot computes the GOT address into the given register in PIC
-; mode, otherwise does nothing. You need to do this before using GLOBAL.
-; Before in both execution order and compiled code order (so GLOBAL knows
-; which register the GOT is in).
+; x86_32 doesn't require PIC.
+; Some distros prefer shared objects to be PIC, but nothing breaks if
+; the code contains a few textrels, so we'll skip that complexity.
-%ifndef PIC
- %define GLOBAL
- %macro picgetgot 1
- %endmacro
-%elifdef ARCH_X86_64
- %define PIC64
+%ifndef ARCH_X86_64
+ %undef PIC
+%endif
+%ifdef PIC
%define GLOBAL wrt rip
- %macro picgetgot 1
- %endmacro
%else
- %define PIC32
- %ifidn __OUTPUT_FORMAT__,macho
- ; There is no real global offset table on OS X, but we still
- ; need to reference our variables by offset.
- %macro picgetgot 1
- call %%getgot
- %%getgot:
- pop %1
- add %1, $$ - %%getgot
- %undef GLOBAL
- %define GLOBAL + %1 - fakegot
- %endmacro
- %else ; elf
- extern _GLOBAL_OFFSET_TABLE_
- %macro picgetgot 1
- call %%getgot
- %%getgot:
- pop %1
- add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc
- %undef GLOBAL
- %define GLOBAL + %1 wrt ..gotoff
- %endmacro
- %endif
+ %define GLOBAL
%endif
; Macros to eliminate most code duplication between x86_32 and x86_64:
@@ -97,14 +66,13 @@
; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
-; %2 = number of registers used, not including PIC. pushes callee-saved regs if needed.
-; %3 = whether global constants are used in this function. inits x86_32 PIC if needed.
-; %4 = list of names to define to registers
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal
; e.g.
-; cglobal foo, 2,3,0, dst, src, tmp
-; declares a function (foo), taking two args (dst and src), one local variable (tmp), and not using globals
+; cglobal foo, 2,3, dst, src, tmp
+; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
@@ -240,12 +208,12 @@
%endif
%endmacro
-%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
+%macro PROLOGUE 2-3+ ; #args, #regs, arg_names...
ASSERT %2 >= %1
ASSERT %2 <= 7
%assign stack_offset 0
LOAD_IF_USED 6, %1
- DEFINE_ARGS %4
+ DEFINE_ARGS %3
%endmacro
%macro RET 0
@@ -288,15 +256,10 @@
%endif
%endmacro
-%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
+%macro PROLOGUE 2-3+ ; #args, #regs, arg_names...
ASSERT %2 >= %1
%assign stack_offset 0
%assign regs_used %2
- %ifdef PIC
- %if %3
- %assign regs_used regs_used+1
- %endif
- %endif
ASSERT regs_used <= 7
PUSH_IF_USED 3
PUSH_IF_USED 4
@@ -309,10 +272,7 @@
LOAD_IF_USED 4, %1
LOAD_IF_USED 5, %1
LOAD_IF_USED 6, %1
- %if %3
- picgetgot r%2
- %endif
- DEFINE_ARGS %4
+ DEFINE_ARGS %3
%endmacro
%macro RET 0
@@ -502,6 +462,7 @@
%assign %%i 0
%rep num_mmregs
CAT_XDEFINE m, %%i, %1_m %+ %%i
+ CAT_XDEFINE n, m %+ %%i, %%i
%assign %%i %%i+1
%endrep
%endmacro
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/x86util.asm
@@ -18,6 +18,87 @@
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
+%macro SBUTTERFLY 4
+ mova m%4, m%2
+ punpckl%1 m%2, m%3
+ punpckh%1 m%4, m%3
+ SWAP %3, %4
+%endmacro
+
+%macro TRANSPOSE4x4W 5
+ SBUTTERFLY wd, %1, %2, %5
+ SBUTTERFLY wd, %3, %4, %5
+ SBUTTERFLY dq, %1, %3, %5
+ SBUTTERFLY dq, %2, %4, %5
+ SWAP %2, %3
+%endmacro
+
+%macro TRANSPOSE2x4x4W 5
+ SBUTTERFLY wd, %1, %2, %5
+ SBUTTERFLY wd, %3, %4, %5
+ SBUTTERFLY dq, %1, %3, %5
+ SBUTTERFLY dq, %2, %4, %5
+ SBUTTERFLY qdq, %1, %2, %5
+ SBUTTERFLY qdq, %3, %4, %5
+%endmacro
+
+%macro TRANSPOSE4x4D 5
+ SBUTTERFLY dq, %1, %2, %5
+ SBUTTERFLY dq, %3, %4, %5
+ SBUTTERFLY qdq, %1, %3, %5
+ SBUTTERFLY qdq, %2, %4, %5
+ SWAP %2, %3
+%endmacro
+
+%macro TRANSPOSE8x8W 9-11
+%ifdef ARCH_X86_64
+ SBUTTERFLY wd, %1, %2, %9
+ SBUTTERFLY wd, %3, %4, %9
+ SBUTTERFLY wd, %5, %6, %9
+ SBUTTERFLY wd, %7, %8, %9
+ SBUTTERFLY dq, %1, %3, %9
+ SBUTTERFLY dq, %2, %4, %9
+ SBUTTERFLY dq, %5, %7, %9
+ SBUTTERFLY dq, %6, %8, %9
+ SBUTTERFLY qdq, %1, %5, %9
+ SBUTTERFLY qdq, %2, %6, %9
+ SBUTTERFLY qdq, %3, %7, %9
+ SBUTTERFLY qdq, %4, %8, %9
+ SWAP %2, %5
+ SWAP %4, %7
+%else
+; in: m0..m7, unless %11 in which case m6 is in %9
+; out: m0..m7, unless %11 in which case m4 is in %10
+; spills into %9 and %10
+%if %0<11
+ movdqa %9, m%7
+%endif
+ SBUTTERFLY wd, %1, %2, %7
+ movdqa %10, m%2
+ movdqa m%7, %9
+ SBUTTERFLY wd, %3, %4, %2
+ SBUTTERFLY wd, %5, %6, %2
+ SBUTTERFLY wd, %7, %8, %2
+ SBUTTERFLY dq, %1, %3, %2
+ movdqa %9, m%3
+ movdqa m%2, %10
+ SBUTTERFLY dq, %2, %4, %3
+ SBUTTERFLY dq, %5, %7, %3
+ SBUTTERFLY dq, %6, %8, %3
+ SBUTTERFLY qdq, %1, %5, %3
+ SBUTTERFLY qdq, %2, %6, %3
+ movdqa %10, m%2
+ movdqa m%3, %9
+ SBUTTERFLY qdq, %3, %7, %2
+ SBUTTERFLY qdq, %4, %8, %2
+ SWAP %2, %5
+ SWAP %4, %7
+%if %0<11
+ movdqa m%5, %10
+%endif
+%endif
+%endmacro
+
%macro ABS1_MMX 2 ; a, tmp
pxor %2, %2
psubw %2, %1
@@ -50,6 +131,40 @@
ABS2 %3, %4, %5, %6
%endmacro
+%macro SPLATB_MMX 3
+ movd %1, [%2-3] ;to avoid crossing a cacheline
+ punpcklbw %1, %1
+%if mmsize==16
+ pshuflw %1, %1, 0xff
+ movlhps %1, %1
+%else
+ pshufw %1, %1, 0xff
+%endif
+%endmacro
+
+%macro SPLATB_SSSE3 3
+ movd %1, [%2-3]
+ pshufb %1, %3
+%endmacro
+
+%macro PALIGNR_MMX 4
+ %ifnidn %4, %2
+ mova %4, %2
+ %endif
+ %if mmsize == 8
+ psllq %1, (8-%3)*8
+ psrlq %4, %3*8
+ %else
+ pslldq %1, 16-%3
+ psrldq %4, %3
+ %endif
+ por %1, %4
+%endmacro
+
+%macro PALIGNR_SSSE3 4
+ palignr %1, %2, %3
+%endmacro
+
%macro SUMSUB_BA 2
paddw %1, %2
paddw %2, %2
@@ -122,3 +237,4 @@
packuswb %1, %1
movh %4, %1
%endmacro
+
Changed |
x264-snapshot-20081001-2245.tar.bz2/configure
@@ -243,7 +243,6 @@
case $host_cpu in
i*86)
ARCH="X86"
- AS="yasm"
ASFLAGS="-O2"
if [ "$SYS" = MACOSX ]; then
ASFLAGS="$ASFLAGS -f macho -DPREFIX"
@@ -256,7 +255,6 @@
;;
x86_64)
ARCH="X86_64"
- AS="yasm"
if [ "$SYS" = MACOSX ];then
ASFLAGS="-f macho64 -m amd64 -DPIC -DPREFIX"
CFLAGS="$CFLAGS -arch x86_64"
@@ -309,15 +307,12 @@
fi
if [ $asm = yes -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
- if [ $ARCH = X86 -a $pic = yes -a x$AS = xyasm -a\
- "`yasm --version 2>$DEVNULL | head -n 1`" "<" "yasm 0.6.2" ] ; then
- echo "yasm prior to 0.6.2 miscompiles PIC. trying nasm instead..."
- AS=nasm
- fi
if as_check "pabsw xmm0, xmm0" ; then
CFLAGS="$CFLAGS -DHAVE_MMX"
else
- echo "No suitable assembler found. Install 'yasm' to get MMX/SSE optimized code."
+ VER=`([ $AS == nasm ] && nasm -v || $AS --version || echo no assembler) 2>$DEVNULL | head -n 1`
+ echo "Found $VER"
+ echo "Minimum version is yasm-0.6.1 or nasm-2.0"
echo "If you really want to compile without asm, configure with --disable-asm."
exit 1
fi
@@ -455,15 +450,25 @@
echo 'IMPLIBNAME=libx264.dll.a' >> config.mak
echo 'SOFLAGS=-Wl,--out-implib,$(IMPLIBNAME) -Wl,--enable-auto-image-base' >> config.mak
elif [ "$SYS" = "MACOSX" ]; then
+ echo "SOSUFFIX=dylib" >> config.mak
echo "SONAME=libx264.$API.dylib" >> config.mak
- echo 'SOFLAGS=-dynamiclib -Wl,-single_module -Wl,-read_only_relocs,suppress' >> config.mak
+ echo 'SOFLAGS=-dynamiclib -Wl,-single_module -Wl,-read_only_relocs,suppress -install_name $(DESTDIR)$(libdir)/$(SONAME)' >> config.mak
else
+ echo "SOSUFFIX=so" >> config.mak
echo "SONAME=libx264.so.$API" >> config.mak
echo 'SOFLAGS=-Wl,-soname,$(SONAME)' >> config.mak
fi
echo 'default: $(SONAME)' >> config.mak
if [ "$gtk" = "yes" ]; then
- echo "SONAMEGTK=libx264gtk.so.$API" >> gtk/config.mak
+ if [ "$SYS" = "MACOSX" ]; then
+ echo "SOSUFFIX=dylib" >> gtk/config.mak
+ echo "SONAMEGTK=libx264gtk.$API.dylib" >> gtk/config.mak
+ echo 'SOFLAGS=-dynamiclib -Wl,-single_module -Wl,-read_only_relocs,suppress -install_name $(DESTDIR)$(libdir)/$(SONAMEGTK)' >> gtk/config.mak
+ else
+ echo "SOSUFFIX=so" >> gtk/config.mak
+ echo "SONAMEGTK=libx264gtk.so.$API" >> gtk/config.mak
+ echo 'SOFLAGS=-Wl,-soname,$(SONAMEGTK)' >> gtk/config.mak
+ fi
fi
fi
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/analyse.c
@@ -205,7 +205,7 @@
{
/* conduct the analysis using this lambda and QP */
a->i_qp = h->mb.i_qp = i_qp;
- h->mb.i_chroma_qp = i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
+ h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
a->i_lambda = x264_lambda_tab[i_qp];
a->i_lambda2 = x264_lambda2_tab[i_qp];
a->b_mbrd = h->param.analyse.i_subpel_refine >= 6 &&
@@ -225,8 +225,8 @@
a->i_satd_i4x4 =
a->i_satd_i8x8chroma = COST_MAX;
- /* non-RD PCM decision is inaccurate, so don't do it */
- a->i_satd_pcm = a->b_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
+ /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
+ a->i_satd_pcm = !h->mb.i_psy_rd && a->b_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
a->b_fast_intra = 0;
h->mb.i_skip_intra =
@@ -467,6 +467,58 @@
}
}
+/* For trellis=2, we need to do this for both sizes of DCT; for trellis=1 we only need to use it on the chosen mode. */
+static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
+{
+ DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] );
+ DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
+ DECLARE_ALIGNED_16( uint8_t zero[16*FDEC_STRIDE] ) = {0};
+ int i;
+
+ if( do_both_dct || h->mb.b_transform_8x8 )
+ {
+ h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], zero );
+ for( i = 0; i < 4; i++ )
+ h->zigzagf.scan_8x8( h->mb.pic.fenc_dct8[i], dct8x8[i] );
+ }
+ if( do_both_dct || !h->mb.b_transform_8x8 )
+ {
+ h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], zero );
+ for( i = 0; i < 16; i++ )
+ h->zigzagf.scan_4x4( h->mb.pic.fenc_dct4[i], dct4x4[i] );
+ }
+}
+
+/* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
+static inline void x264_mb_cache_fenc_satd( x264_t *h )
+{
+ DECLARE_ALIGNED_16(uint8_t zero[16]) = {0};
+ uint8_t *fenc;
+ int x, y, satd_sum = 0, sa8d_sum = 0;
+ if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
+ x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
+ if( !h->mb.i_psy_rd )
+ return;
+ for( y = 0; y < 4; y++ )
+ for( x = 0; x < 4; x++ )
+ {
+ fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
+ h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
+ - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
+ satd_sum += h->mb.pic.fenc_satd[y][x];
+ }
+ for( y = 0; y < 2; y++ )
+ for( x = 0; x < 2; x++ )
+ {
+ fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
+ h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
+ - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
+ sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
+ }
+ h->mb.pic.fenc_satd_sum = satd_sum;
+ h->mb.pic.fenc_sa8d_sum = sa8d_sum;
+}
+
static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
{
int i;
@@ -498,7 +550,7 @@
h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE, p_srcc[0], FENC_STRIDE );
satdv[I_PRED_CHROMA_P] =
h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE, p_srcc[1], FENC_STRIDE );
-
+
for( i=0; i<i_max; i++ )
{
int i_mode = predict_mode[i];
@@ -517,8 +569,13 @@
int i_mode = predict_mode[i];
/* we do the prediction */
- h->predict_8x8c[i_mode]( p_dstc[0] );
- h->predict_8x8c[i_mode]( p_dstc[1] );
+ if( h->mb.b_lossless )
+ x264_predict_lossless_8x8_chroma( h, i_mode );
+ else
+ {
+ h->predict_8x8c[i_mode]( p_dstc[0] );
+ h->predict_8x8c[i_mode]( p_dstc[1] );
+ }
/* we calculate the cost */
i_satd = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE,
@@ -544,7 +601,7 @@
int i, idx;
int i_max;
int predict_mode[9];
- int b_merged_satd = h->pixf.intra_satd_x3_16x16 && h->pixf.mbcmp[0] == h->pixf.satd[0];
+ int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
/*---------------- Try all mode and calculate their score ---------------*/
@@ -553,7 +610,7 @@
if( b_merged_satd && i_max == 4 )
{
- h->pixf.intra_satd_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
+ h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
h->predict_16x16[I_PRED_16x16_P]( p_dst );
a->i_satd_i16x16_dir[I_PRED_16x16_P] =
h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
@@ -569,7 +626,11 @@
{
int i_satd;
int i_mode = predict_mode[i];
- h->predict_16x16[i_mode]( p_dst );
+
+ if( h->mb.b_lossless )
+ x264_predict_lossless_16x16( h, i_mode );
+ else
+ h->predict_16x16[i_mode]( p_dst );
i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
@@ -588,7 +649,7 @@
if( flags & X264_ANALYSE_I8x8 )
{
DECLARE_ALIGNED_16( uint8_t edge[33] );
- x264_pixel_cmp_t sa8d = (*h->pixf.mbcmp == *h->pixf.sad) ? h->pixf.sad[PIXEL_8x8] : h->pixf.sa8d[PIXEL_8x8];
+ x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
int i_satd_thresh = a->b_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
int i_cost = 0;
b_merged_satd = h->pixf.intra_sa8d_x3_8x8 && h->pixf.mbcmp[0] == h->pixf.satd[0];
@@ -629,7 +690,10 @@
int i_satd;
int i_mode = predict_mode[i];
- h->predict_8x8[i_mode]( p_dst_by, edge );
+ if( h->mb.b_lossless )
+ x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
+ else
+ h->predict_8x8[i_mode]( p_dst_by, edge );
i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE )
+ a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
@@ -711,8 +775,10 @@
{
int i_satd;
int i_mode = predict_mode[i];
-
- h->predict_4x4[i_mode]( p_dst_by );
+ if( h->mb.b_lossless )
+ x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
+ else
+ h->predict_4x4[i_mode]( p_dst_by );
i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE,
p_src_by, FENC_STRIDE )
@@ -824,7 +890,10 @@
for( i = 0; i < i_max; i++ )
{
i_mode = predict_mode[i];
- h->predict_4x4[i_mode]( p_dst_by );
+ if( h->mb.b_lossless )
+ x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
+ else
+ h->predict_4x4[i_mode]( p_dst_by );
i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
if( i_best > i_satd )
@@ -876,7 +945,10 @@
i_mode = predict_mode[i];
if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
continue;
- h->predict_8x8[i_mode]( p_dst_by, edge );
+ if( h->mb.b_lossless )
+ x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
+ else
+ h->predict_8x8[i_mode]( p_dst_by, edge );
i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
if( i_best > i_satd )
@@ -928,8 +1000,13 @@
for( i = 0; i < i_max; i++ )
{
i_mode = predict_mode[i];
- h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
- h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
+ if( h->mb.b_lossless )
+ x264_predict_lossless_8x8_chroma( h, i_mode );
+ else
+ {
+ h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
+ h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
+ }
/* if we've already found a mode that needs no residual, then
* probably any mode with a residual will be worse.
* so avoid dct on the remaining modes to improve speed. */
@@ -964,7 +1041,7 @@
{
x264_me_t m;
int i_ref, i_mvc;
- DECLARE_ALIGNED_4( int16_t mvc[7][2] );
+ DECLARE_ALIGNED_4( int16_t mvc[8][2] );
int i_halfpel_thresh = INT_MAX;
int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
@@ -1009,7 +1086,7 @@
h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
/* save mv for predicting neighbors */
- *(uint32_t*)a->l0.mvc[i_ref][0] =
+ *(uint32_t*)a->l0.mvc[i_ref][0] =
*(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
}
@@ -1017,12 +1094,15 @@
assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
h->mb.i_type = P_L0;
- if( a->b_mbrd && a->l0.me16x16.i_ref == 0
- && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
+ if( a->b_mbrd )
{
- h->mb.i_partition = D_16x16;
- x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
- a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
+ x264_mb_cache_fenc_satd( h );
+ if( a->l0.me16x16.i_ref == 0 && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
+ {
+ h->mb.i_partition = D_16x16;
+ x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
+ a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
+ }
}
}
@@ -1419,26 +1499,21 @@
}
}
-#define WEIGHTED_AVG( size, pix1, stride1, src2, stride2 ) \
- { \
- if( h->param.analyse.b_weighted_bipred ) \
- h->mc.avg_weight[size]( pix1, stride1, src2, stride2, \
- h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
- else \
- h->mc.avg[size]( pix1, stride1, src2, stride2 ); \
- }
+#define WEIGHTED_AVG( size, pix, stride, src1, stride1, src2, stride2 ) \
+{ \
+ h->mc.avg[size]( pix, stride, src1, stride1, src2, stride2, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
+}
static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{
+ DECLARE_ALIGNED_16( uint8_t pix0[16*16] );
DECLARE_ALIGNED_16( uint8_t pix1[16*16] );
- DECLARE_ALIGNED_16( uint8_t pix2[16*16] );
- uint8_t *src2;
- int stride2 = 16;
- int weight;
+ uint8_t *src0, *src1;
+ int stride0 = 16, stride1 = 16;
x264_me_t m;
int i_ref, i_mvc;
- DECLARE_ALIGNED_4( int16_t mvc[8][2] );
+ DECLARE_ALIGNED_4( int16_t mvc[9][2] );
int i_halfpel_thresh = INT_MAX;
int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
@@ -1504,41 +1579,16 @@
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
/* get cost of BI mode */
- weight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
- if ( (*(uint32_t*)a->l0.me16x16.mv & 0x10001) == 0 )
- {
- /* l0 reference is halfpel, so get_ref on it will make it faster */
- src2 =
- h->mc.get_ref( pix2, &stride2,
- h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
- a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
- 16, 16 );
- h->mc.mc_luma( pix1, 16,
- h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
- a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
- 16, 16 );
- weight = 64 - weight;
- }
- else
- {
- /* if l0 was qpel, we'll use get_ref on l1 instead */
- h->mc.mc_luma( pix1, 16,
- h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
- a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
- 16, 16 );
- src2 =
- h->mc.get_ref( pix2, &stride2,
- h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
- a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
- 16, 16 );
- }
+ src0 = h->mc.get_ref( pix0, &stride0,
+ h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
+ a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16 );
+ src1 = h->mc.get_ref( pix1, &stride1,
+ h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
+ a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16 );
- if( h->param.analyse.b_weighted_bipred )
- h->mc.avg_weight[PIXEL_16x16]( pix1, 16, src2, stride2, weight );
- else
- h->mc.avg[PIXEL_16x16]( pix1, 16, src2, stride2 );
+ h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
- a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix1, 16 )
+ a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
+ REF_COST( 0, a->l0.i_ref )
+ REF_COST( 1, a->l1.i_ref )
+ a->l0.me16x16.cost_mv
@@ -1654,6 +1704,8 @@
const int y8 = i/2;
int i_part_cost;
int i_part_cost_bi = 0;
+ int stride[2] = {8,8};
+ uint8_t *src[2];
for( l = 0; l < 2; l++ )
{
@@ -1672,13 +1724,12 @@
x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
/* BI mode */
- h->mc.mc_luma( pix[l], 8, m->p_fref, m->i_stride[0],
- m->mv[0], m->mv[1], 8, 8 );
+ src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
+ m->mv[0], m->mv[1], 8, 8 );
i_part_cost_bi += m->cost_mv;
/* FIXME: ref cost */
}
-
- WEIGHTED_AVG( PIXEL_8x8, pix[0], 8, pix[1], 8 );
+ h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
+ a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
@@ -1704,7 +1755,7 @@
uint8_t **p_fref[2] =
{ h->mb.pic.p_fref[0][a->l0.i_ref],
h->mb.pic.p_fref[1][a->l1.i_ref] };
- DECLARE_ALIGNED_16( uint8_t pix[2][16*8] );
+ DECLARE_ALIGNED_16( uint8_t pix[2][16*8] );
DECLARE_ALIGNED_4( int16_t mvc[2][2] );
int i, l;
@@ -1715,6 +1766,8 @@
{
int i_part_cost;
int i_part_cost_bi = 0;
+ int stride[2] = {16,16};
+ uint8_t *src[2];
/* TODO: check only the list(s) that were used in b8x8? */
for( l = 0; l < 2; l++ )
@@ -1735,13 +1788,12 @@
x264_me_search( h, m, mvc, 2 );
/* BI mode */
- h->mc.mc_luma( pix[l], 16, m->p_fref, m->i_stride[0],
- m->mv[0], m->mv[1], 16, 8 );
+ src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
+ m->mv[0], m->mv[1], 16, 8 );
/* FIXME: ref cost */
i_part_cost_bi += m->cost_mv;
}
-
- WEIGHTED_AVG( PIXEL_16x8, pix[0], 16, pix[1], 16 );
+ h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
i_part_cost = a->l0.me16x8[i].cost;
@@ -1784,6 +1836,8 @@
{
int i_part_cost;
int i_part_cost_bi = 0;
+ int stride[2] = {8,8};
+ uint8_t *src[2];
for( l = 0; l < 2; l++ )
{
@@ -1803,13 +1857,13 @@
x264_me_search( h, m, mvc, 2 );
/* BI mode */
- h->mc.mc_luma( pix[l], 8, m->p_fref, m->i_stride[0],
- m->mv[0], m->mv[1], 8, 16 );
+ src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
+ m->mv[0], m->mv[1], 8, 16 );
/* FIXME: ref cost */
i_part_cost_bi += m->cost_mv;
}
- WEIGHTED_AVG( PIXEL_8x16, pix[0], 8, pix[1], 8 );
+ h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
i_part_cost = a->l0.me8x16[i].cost;
@@ -1907,7 +1961,7 @@
static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
{
- int thresh = i_satd_inter * 17/16;
+ int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
{
@@ -2006,7 +2060,7 @@
static inline void x264_mb_analyse_transform( x264_t *h )
{
- if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
+ if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
{
int i_cost4, i_cost8;
/* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
@@ -2066,6 +2120,8 @@
/*--------------------------- Do the analysis ---------------------------*/
if( h->sh.i_type == SLICE_TYPE_I )
{
+ if( analysis.b_mbrd )
+ x264_mb_cache_fenc_satd( h );
x264_mb_analyse_intra( h, &analysis, COST_MAX );
if( analysis.b_mbrd )
x264_intra_rd( h, &analysis, COST_MAX );
@@ -2344,6 +2400,9 @@
int i_bskip_cost = COST_MAX;
int b_skip = 0;
+ if( analysis.b_mbrd )
+ x264_mb_cache_fenc_satd( h );
+
h->mb.i_type = B_SKIP;
if( h->mb.b_direct_auto_write )
{
@@ -2558,7 +2617,7 @@
h->mb.i_type = i_type;
h->mb.i_partition = i_partition;
}
-
+
x264_mb_analyse_intra( h, &analysis, i_satd_inter );
if( analysis.b_mbrd )
@@ -2589,6 +2648,8 @@
h->mb.b_trellis = h->param.analyse.i_trellis;
h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
+ if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
+ x264_psy_trellis_init( h, 0 );
if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
h->mb.i_skip_intra = 0;
}
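The largest functional change in the analyse.c hunks is the bipred path: instead of mc_luma into a buffer plus a separate avg/avg_weight pair, both references now go through get_ref and a single h->mc.avg call that takes the bipred weight directly. The per-pixel operation behind that call is the H.264 implicit weighted average with 6-bit weights summing to 64; a scalar sketch of it, with strides and the clip helper written out here purely for illustration:

#include <stdint.h>

static inline uint8_t clip_uint8( int x )
{
    return x < 0 ? 0 : x > 255 ? 255 : x;
}

/* Weighted bipred average, weight1 = 64 - weight2, rounded as in H.264:
 * (a*w1 + b*w2 + 32) >> 6.  The unweighted case is just weight2 == 32. */
static void pixel_avg_weight( uint8_t *dst, int i_dst,
                              const uint8_t *src1, int i_src1,
                              const uint8_t *src2, int i_src2,
                              int width, int height, int i_weight2 )
{
    int i_weight1 = 64 - i_weight2;
    int x, y;
    for( y = 0; y < height; y++ )
    {
        for( x = 0; x < width; x++ )
            dst[x] = clip_uint8( ( src1[x]*i_weight1 + src2[x]*i_weight2 + 32 ) >> 6 );
        dst  += i_dst;
        src1 += i_src1;
        src2 += i_src2;
    }
}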
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/cabac.c
@@ -790,7 +790,7 @@
if( i_mb_type == I_PCM )
{
i_mb_pos_tex = x264_cabac_pos( cb );
- h->stat.frame.i_hdr_bits += i_mb_pos_tex - i_mb_pos_start;
+ h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
memcpy( cb->p, h->mb.pic.p_fenc[0], 256 );
cb->p += 256;
@@ -811,7 +811,7 @@
h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 );
h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, 8 );
- h->stat.frame.i_itex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex;
+ h->stat.frame.i_tex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex;
return;
}
#endif
@@ -963,7 +963,7 @@
#ifndef RDO_SKIP_BS
i_mb_pos_tex = x264_cabac_pos( cb );
- h->stat.frame.i_hdr_bits += i_mb_pos_tex - i_mb_pos_start;
+ h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
#endif
if( i_mb_type != I_16x16 )
@@ -1018,10 +1018,7 @@
}
#ifndef RDO_SKIP_BS
- if( IS_INTRA( i_mb_type ) )
- h->stat.frame.i_itex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex;
- else
- h->stat.frame.i_ptex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex;
+ h->stat.frame.i_tex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex;
#endif
}
@@ -1032,7 +1029,7 @@
* works on all partition sizes except 16x16
* for sub8x8, call once per 8x8 block
*****************************************************************************/
-void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int i_pixel )
+static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int i_pixel )
{
const int i_mb_type = h->mb.i_type;
int j;
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/cavlc.c
@@ -116,9 +116,7 @@
/* total/trailing */
if( i_idx == BLOCK_INDEX_CHROMA_DC )
- {
bs_write_vlc( s, x264_coeff_token[4][i_total*4+i_trailing] );
- }
else
{
/* x264_mb_predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */
@@ -132,9 +130,7 @@
i_suffix_length = i_total > 10 && i_trailing < 3 ? 1 : 0;
if( i_trailing > 0 )
- {
bs_write( s, i_trailing, i_sign );
- }
for( i = i_trailing; i < i_total; i++ )
{
int mask = level[i] >> 15;
@@ -145,19 +141,13 @@
i_level_code -= 2; /* as level[i] can't be 1 for the first one if i_trailing < 3 */
if( ( i_level_code >> i_suffix_length ) < 14 )
- {
bs_write( s, (i_level_code >> i_suffix_length) + 1 + i_suffix_length,
(1<<i_suffix_length) + (i_level_code & ((1<<i_suffix_length)-1)) );
- }
else if( i_suffix_length == 0 && i_level_code < 30 )
- {
bs_write( s, 19, (1<<4) + (i_level_code - 14) );
- }
else if( i_suffix_length > 0 && ( i_level_code >> i_suffix_length ) == 14 )
- {
bs_write( s, 15 + i_suffix_length,
(1<<i_suffix_length) + (i_level_code & ((1<<i_suffix_length)-1)) );
- }
else
{
int i_level_prefix = 15;
@@ -344,7 +334,7 @@
{
bs_write_ue( s, i_mb_i_offset + 25 );
i_mb_pos_tex = bs_pos( s );
- h->stat.frame.i_hdr_bits += i_mb_pos_tex - i_mb_pos_start;
+ h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
bs_align_0( s );
@@ -362,7 +352,7 @@
h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 );
h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, 8 );
- h->stat.frame.i_itex_bits += bs_pos(s) - i_mb_pos_tex;
+ h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex;
return;
}
#endif
@@ -384,16 +374,10 @@
int i_pred = x264_mb_predict_intra4x4_mode( h, i );
int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] );
- if( i_pred == i_mode)
- {
+ if( i_pred == i_mode )
bs_write1( s, 1 ); /* b_prev_intra4x4_pred_mode */
- }
else
- {
- if( i_mode >= i_pred )
- i_mode--;
- bs_write( s, 4, i_mode );
- }
+ bs_write( s, 4, i_mode - (i_mode > i_pred) );
}
bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
}
@@ -412,9 +396,7 @@
bs_write_ue( s, 0 );
if( h->mb.pic.i_fref[0] > 1 )
- {
bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
- }
x264_mb_predict_mv( h, 0, 0, 4, mvp );
bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][0] - mvp[0] );
bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][1] - mvp[1] );
@@ -457,9 +439,8 @@
else if( i_mb_type == P_8x8 )
{
int b_sub_ref0;
-
- if( h->mb.cache.ref[0][x264_scan8[0]] == 0 && h->mb.cache.ref[0][x264_scan8[4]] == 0 &&
- h->mb.cache.ref[0][x264_scan8[8]] == 0 && h->mb.cache.ref[0][x264_scan8[12]] == 0 )
+ if( (h->mb.cache.ref[0][x264_scan8[0]] | h->mb.cache.ref[0][x264_scan8[ 4]] |
+ h->mb.cache.ref[0][x264_scan8[8]] | h->mb.cache.ref[0][x264_scan8[12]]) == 0 )
{
bs_write_ue( s, 4 );
b_sub_ref0 = 0;
@@ -469,11 +450,14 @@
bs_write_ue( s, 3 );
b_sub_ref0 = 1;
}
+
/* sub mb type */
- for( i = 0; i < 4; i++ )
- {
- bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i] ] );
- }
+ if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
+ for( i = 0; i < 4; i++ )
+ bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i] ] );
+ else
+ bs_write( s, 4, 0xf );
+
/* ref0 */
if( h->mb.pic.i_fref[0] > 1 && b_sub_ref0 )
{
@@ -492,24 +476,16 @@
/* sub mb type */
for( i = 0; i < 4; i++ )
- {
bs_write_ue( s, sub_mb_type_b_to_golomb[ h->mb.i_sub_partition[i] ] );
- }
+
/* ref */
for( i = 0; i < 4; i++ )
- {
if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
- {
bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[i*4]] );
- }
- }
for( i = 0; i < 4; i++ )
- {
if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
- {
bs_write_te( s, h->mb.pic.i_fref[1] - 1, h->mb.cache.ref[1][x264_scan8[i*4]] );
- }
- }
+
/* mvd */
for( i = 0; i < 4; i++ )
cavlc_mb8x8_mvd( h, s, 0, i );
@@ -532,30 +508,27 @@
b_list[1][i] = x264_mb_type_list1_table[i_mb_type][i];
}
-
bs_write_ue( s, mb_type_b_to_golomb[ h->mb.i_partition - D_16x8 ][ i_mb_type - B_L0_L0 ] );
for( i_list = 0; i_list < 2; i_list++ )
{
- const int i_ref_max = i_list == 0 ? h->mb.pic.i_fref[0] : h->mb.pic.i_fref[1];
+ const int i_ref_max = (i_list == 0 ? h->mb.pic.i_fref[0] : h->mb.pic.i_fref[1]) - 1;
- if( i_ref_max > 1 )
- {
+ if( i_ref_max )
switch( h->mb.i_partition )
{
case D_16x16:
- if( b_list[i_list][0] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[0]] );
+ if( b_list[i_list][0] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[0]] );
break;
case D_16x8:
- if( b_list[i_list][0] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[0]] );
- if( b_list[i_list][1] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[8]] );
+ if( b_list[i_list][0] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[0]] );
+ if( b_list[i_list][1] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[8]] );
break;
case D_8x16:
- if( b_list[i_list][0] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[0]] );
- if( b_list[i_list][1] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[4]] );
+ if( b_list[i_list][0] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[0]] );
+ if( b_list[i_list][1] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[4]] );
break;
}
- }
}
for( i_list = 0; i_list < 2; i_list++ )
{
@@ -601,9 +574,7 @@
}
}
else if( i_mb_type == B_DIRECT )
- {
bs_write_ue( s, 0 );
- }
else
{
x264_log(h, X264_LOG_ERROR, "invalid/unhandled mb_type\n" );
@@ -612,24 +583,18 @@
#ifndef RDO_SKIP_BS
i_mb_pos_tex = bs_pos( s );
- h->stat.frame.i_hdr_bits += i_mb_pos_tex - i_mb_pos_start;
+ h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
#endif
/* Coded block pattern */
if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
- {
bs_write_ue( s, intra4x4_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
- }
else if( i_mb_type != I_16x16 )
- {
bs_write_ue( s, inter_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
- }
/* transform size 8x8 flag */
if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma )
- {
bs_write1( s, h->mb.b_transform_8x8 );
- }
/* write residual */
if( i_mb_type == I_16x16 )
@@ -640,19 +605,19 @@
block_residual_write_cavlc( h, s, BLOCK_INDEX_LUMA_DC , h->dct.luma16x16_dc, 16 );
/* AC Luma */
- if( h->mb.i_cbp_luma != 0 )
+ if( h->mb.i_cbp_luma )
for( i = 0; i < 16; i++ )
{
h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
block_residual_write_cavlc( h, s, i, h->dct.luma4x4[i]+1, 15 );
}
}
- else if( h->mb.i_cbp_luma != 0 || h->mb.i_cbp_chroma != 0 )
+ else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )
{
cavlc_qp_delta( h, s );
x264_macroblock_luma_write_cavlc( h, s, 0, 3 );
}
- if( h->mb.i_cbp_chroma != 0 )
+ if( h->mb.i_cbp_chroma )
{
/* Chroma DC residual present */
block_residual_write_cavlc( h, s, BLOCK_INDEX_CHROMA_DC, h->dct.chroma_dc[0], 4 );
@@ -666,10 +631,7 @@
}
#ifndef RDO_SKIP_BS
- if( IS_INTRA( i_mb_type ) )
- h->stat.frame.i_itex_bits += bs_pos(s) - i_mb_pos_tex;
- else
- h->stat.frame.i_ptex_bits += bs_pos(s) - i_mb_pos_tex;
+ h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex;
#endif
}
@@ -680,7 +642,7 @@
* works on all partition sizes except 16x16
* for sub8x8, call once per 8x8 block
*****************************************************************************/
-int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
+static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
{
bs_t s;
const int i_mb_type = h->mb.i_type;
@@ -770,7 +732,7 @@
static int x264_i8x8_chroma_size_cavlc( x264_t *h )
{
h->out.bs.i_bits_encoded = bs_size_ue( x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
- if( h->mb.i_cbp_chroma != 0 )
+ if( h->mb.i_cbp_chroma )
{
block_residual_write_cavlc( h, &h->out.bs, BLOCK_INDEX_CHROMA_DC, h->dct.chroma_dc[0], 4 );
block_residual_write_cavlc( h, &h->out.bs, BLOCK_INDEX_CHROMA_DC, h->dct.chroma_dc[1], 4 );
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/encoder.c
@@ -148,7 +148,7 @@
/* If effective qp <= 15, deblocking would have no effect anyway */
if( param->b_deblocking_filter
&& ( h->mb.b_variable_qp
- || 15 < i_qp + 2 * X264_MAX(param->i_deblocking_filter_alphac0, param->i_deblocking_filter_beta) ) )
+ || 15 < i_qp + 2 * X264_MIN(param->i_deblocking_filter_alphac0, param->i_deblocking_filter_beta) ) )
{
sh->i_disable_deblocking_filter_idc = 0;
}
@@ -237,7 +237,7 @@
{
bs_write_ue( s, sh->ref_pic_list_order[0][i].idc );
bs_write_ue( s, sh->ref_pic_list_order[0][i].arg );
-
+
}
bs_write_ue( s, 3 );
}
@@ -403,13 +403,16 @@
h->param.rc.i_rc_method = X264_RC_CQP;
h->param.rc.f_ip_factor = 1;
h->param.rc.f_pb_factor = 1;
- h->param.analyse.b_transform_8x8 = 0;
h->param.analyse.b_psnr = 0;
h->param.analyse.b_ssim = 0;
h->param.analyse.i_chroma_qp_offset = 0;
h->param.analyse.i_trellis = 0;
h->param.analyse.b_fast_pskip = 0;
h->param.analyse.i_noise_reduction = 0;
+ h->param.analyse.f_psy_rd = 0;
+ /* 8x8dct is not useful at all in CAVLC lossless */
+ if( !h->param.b_cabac )
+ h->param.analyse.b_transform_8x8 = 0;
}
if( h->param.rc.i_rc_method == X264_RC_CQP )
{
@@ -429,7 +432,7 @@
// There's nothing special about 1080 in that the warning still applies to it,
// but chances are the user can't help it if his content is already 1080p,
// so there's no point in warning in that case.
- x264_log( h, X264_LOG_WARNING,
+ x264_log( h, X264_LOG_WARNING,
"width or height not divisible by 16 (%dx%d), compression will suffer.\n",
h->param.i_width, h->param.i_height );
}
@@ -442,7 +445,8 @@
h->param.i_bframe = x264_clip3( h->param.i_bframe, 0, X264_BFRAME_MAX );
h->param.i_bframe_bias = x264_clip3( h->param.i_bframe_bias, -90, 100 );
h->param.b_bframe_pyramid = h->param.b_bframe_pyramid && h->param.i_bframe > 1;
- h->param.b_bframe_adaptive = h->param.b_bframe_adaptive && h->param.i_bframe > 0;
+ if( !h->param.i_bframe )
+ h->param.i_bframe_adaptive = X264_B_ADAPT_NONE;
h->param.analyse.b_weighted_bipred = h->param.analyse.b_weighted_bipred && h->param.i_bframe > 0;
h->mb.b_direct_auto_write = h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
&& h->param.i_bframe
@@ -487,12 +491,29 @@
if( !h->param.b_cabac )
h->param.analyse.i_trellis = 0;
h->param.analyse.i_trellis = x264_clip3( h->param.analyse.i_trellis, 0, 2 );
- h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 2 );
+ if( !h->param.analyse.i_trellis )
+ h->param.analyse.f_psy_trellis = 0;
+ h->param.analyse.f_psy_rd = x264_clip3f( h->param.analyse.f_psy_rd, 0, 10 );
+ h->param.analyse.f_psy_trellis = x264_clip3f( h->param.analyse.f_psy_trellis, 0, 10 );
+ if( h->param.analyse.i_subpel_refine < 6 )
+ h->param.analyse.f_psy_rd = 0;
+ h->mb.i_psy_rd = FIX8( h->param.analyse.f_psy_rd );
+ /* Psy RDO increases overall quantizers to improve the quality of luma--this indirectly hurts chroma quality */
+ /* so we lower the chroma QP offset to compensate */
+ /* This can be triggered repeatedly on multiple calls to parameter_validate, but since encoding
+ * uses the pps chroma qp offset not the param chroma qp offset, this is not a problem. */
+ if( h->mb.i_psy_rd )
+ h->param.analyse.i_chroma_qp_offset -= h->param.analyse.f_psy_rd < 0.25 ? 1 : 2;
+ h->mb.i_psy_trellis = FIX8( h->param.analyse.f_psy_trellis / 4 );
+ /* Psy trellis has a similar effect. */
+ if( h->mb.i_psy_trellis )
+ h->param.analyse.i_chroma_qp_offset -= h->param.analyse.f_psy_trellis < 0.25 ? 1 : 2;
+ else
+ h->mb.i_psy_trellis = 0;
+ h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -12, 12);
+ h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 1 );
if( h->param.rc.f_aq_strength <= 0 )
h->param.rc.i_aq_mode = 0;
- /* VAQ effectively replaces qcomp, so qcomp is raised towards 1 to compensate. */
- if( h->param.rc.i_aq_mode == X264_AQ_GLOBAL )
- h->param.rc.f_qcompress = x264_clip3f(h->param.rc.f_qcompress + h->param.rc.f_aq_strength / 0.7, 0, 1);
h->param.analyse.i_noise_reduction = x264_clip3( h->param.analyse.i_noise_reduction, 0, 1<<16 );
{
@@ -583,7 +604,9 @@
static void mbcmp_init( x264_t *h )
{
int satd = !h->mb.b_lossless && h->param.analyse.i_subpel_refine > 1;
- memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp) );
+ memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad_aligned, sizeof(h->pixf.mbcmp) );
+ memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
+ h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
satd &= h->param.analyse.i_me_method == X264_ME_TESA;
memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) );
@@ -621,8 +644,6 @@
h->param.rc.psz_stat_out = strdup( h->param.rc.psz_stat_out );
if( h->param.rc.psz_stat_in )
h->param.rc.psz_stat_in = strdup( h->param.rc.psz_stat_in );
- if( h->param.rc.psz_rc_eq )
- h->param.rc.psz_rc_eq = strdup( h->param.rc.psz_rc_eq );
/* VUI */
if( h->param.vui.i_sar_width > 0 && h->param.vui.i_sar_height > 0 )
@@ -672,18 +693,21 @@
x264_free( h );
return NULL;
}
-
+
h->mb.i_mb_count = h->sps->i_mb_width * h->sps->i_mb_height;
/* Init frames. */
- h->frames.i_delay = h->param.i_bframe + h->param.i_threads - 1;
+ if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
+ h->frames.i_delay = X264_MAX(h->param.i_bframe,3)*4 + h->param.i_threads - 1;
+ else
+ h->frames.i_delay = h->param.i_bframe + h->param.i_threads - 1;
h->frames.i_max_ref0 = h->param.i_frame_reference;
h->frames.i_max_ref1 = h->sps->vui.i_num_reorder_frames;
h->frames.i_max_dpb = h->sps->vui.i_max_dec_frame_buffering;
h->frames.b_have_lowres = !h->param.rc.b_stat_read
&& ( h->param.rc.i_rc_method == X264_RC_ABR
|| h->param.rc.i_rc_method == X264_RC_CRF
- || h->param.b_bframe_adaptive
+ || h->param.i_bframe_adaptive
|| h->param.b_pre_scenecut );
h->frames.b_have_lowres |= (h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0);
@@ -694,6 +718,8 @@
h->i_ref0 = 0;
h->i_ref1 = 0;
+ h->chroma_qp_table = i_chroma_qp_table + 12 + h->pps->i_chroma_qp_index_offset;
+
x264_rdo_init( );
/* init CPU functions */
@@ -1235,9 +1261,8 @@
/* Compute misc bits */
h->stat.frame.i_misc_bits = bs_pos( &h->out.bs )
+ NALU_OVERHEAD * 8
- - h->stat.frame.i_itex_bits
- - h->stat.frame.i_ptex_bits
- - h->stat.frame.i_hdr_bits;
+ - h->stat.frame.i_tex_bits
+ - h->stat.frame.i_mv_bits;
}
static void x264_thread_sync_context( x264_t *dst, x264_t *src )
@@ -1256,7 +1281,6 @@
// copy everything except the per-thread pointers and the constants.
memcpy( &dst->i_frame, &src->i_frame, offsetof(x264_t, mb.type) - offsetof(x264_t, i_frame) );
- memcpy( &dst->mb.i_type, &src->mb.i_type, offsetof(x264_t, rc) - offsetof(x264_t, mb.i_type) );
dst->stat = src->stat;
}
@@ -1362,6 +1386,9 @@
if( h->frames.b_have_lowres )
x264_frame_init_lowres( h, fenc );
+ if( h->param.rc.i_aq_mode )
+ x264_adaptive_quant_frame( h, fenc );
+
if( h->frames.i_input <= h->frames.i_delay + 1 - h->param.i_threads )
{
/* Nothing yet to encode */
@@ -1542,7 +1569,7 @@
/* restore CPU state (before using float again) */
x264_emms();
- if( h->sh.i_type == SLICE_TYPE_P && !h->param.rc.b_stat_read
+ if( h->sh.i_type == SLICE_TYPE_P && !h->param.rc.b_stat_read
&& h->param.i_scenecut_threshold >= 0
&& !h->param.b_pre_scenecut )
{
@@ -1603,12 +1630,12 @@
/* If using B-frames, force GOP to be closed.
* Even if this frame is going to be I and not IDR, forcing a
* P-frame before the scenecut will probably help compression.
- *
+ *
* We don't yet know exactly which frame is the scene cut, so
* we can't assign an I-frame. Instead, change the previous
* B-frame to P, and rearrange coding order. */
- if( h->param.b_bframe_adaptive || b > 1 )
+ if( h->param.i_bframe_adaptive || b > 1 )
h->fenc->i_type = X264_TYPE_AUTO;
x264_frame_sort_pts( h->frames.current );
x264_frame_unshift( h->frames.next, h->fenc );
@@ -1734,22 +1761,22 @@
psz_message[0] = '\0';
if( h->param.analyse.b_psnr )
{
- int64_t sqe[3] = {
+ int64_t ssd[3] = {
h->stat.frame.i_ssd[0],
h->stat.frame.i_ssd[1],
h->stat.frame.i_ssd[2],
};
- h->stat.i_sqe_global[h->sh.i_type] += sqe[0] + sqe[1] + sqe[2];
- h->stat.f_psnr_average[h->sh.i_type] += x264_psnr( sqe[0] + sqe[1] + sqe[2], 3 * h->param.i_width * h->param.i_height / 2 );
- h->stat.f_psnr_mean_y[h->sh.i_type] += x264_psnr( sqe[0], h->param.i_width * h->param.i_height );
- h->stat.f_psnr_mean_u[h->sh.i_type] += x264_psnr( sqe[1], h->param.i_width * h->param.i_height / 4 );
- h->stat.f_psnr_mean_v[h->sh.i_type] += x264_psnr( sqe[2], h->param.i_width * h->param.i_height / 4 );
+ h->stat.i_ssd_global[h->sh.i_type] += ssd[0] + ssd[1] + ssd[2];
+ h->stat.f_psnr_average[h->sh.i_type] += x264_psnr( ssd[0] + ssd[1] + ssd[2], 3 * h->param.i_width * h->param.i_height / 2 );
+ h->stat.f_psnr_mean_y[h->sh.i_type] += x264_psnr( ssd[0], h->param.i_width * h->param.i_height );
+ h->stat.f_psnr_mean_u[h->sh.i_type] += x264_psnr( ssd[1], h->param.i_width * h->param.i_height / 4 );
+ h->stat.f_psnr_mean_v[h->sh.i_type] += x264_psnr( ssd[2], h->param.i_width * h->param.i_height / 4 );
snprintf( psz_message, 80, " PSNR Y:%5.2f U:%5.2f V:%5.2f",
- x264_psnr( sqe[0], h->param.i_width * h->param.i_height ),
- x264_psnr( sqe[1], h->param.i_width * h->param.i_height / 4),
- x264_psnr( sqe[2], h->param.i_width * h->param.i_height / 4) );
+ x264_psnr( ssd[0], h->param.i_width * h->param.i_height ),
+ x264_psnr( ssd[1], h->param.i_width * h->param.i_height / 4),
+ x264_psnr( ssd[2], h->param.i_width * h->param.i_height / 4) );
}
if( h->param.analyse.b_ssim )
@@ -1761,7 +1788,7 @@
" SSIM Y:%.5f", ssim_y );
}
psz_message[79] = '\0';
-
+
x264_log( h, X264_LOG_DEBUG,
"frame=%4d QP=%.2f NAL=%d Slice:%c Poc:%-3d I:%-4d P:%-4d SKIP:%-4d size=%d bytes%s\n",
h->i_frame,
@@ -1857,7 +1884,7 @@
(double)h->stat.i_slice_size[i_slice] / i_count,
h->stat.f_psnr_mean_y[i_slice] / i_count, h->stat.f_psnr_mean_u[i_slice] / i_count, h->stat.f_psnr_mean_v[i_slice] / i_count,
h->stat.f_psnr_average[i_slice] / i_count,
- x264_psnr( h->stat.i_sqe_global[i_slice], i_count * i_yuv_size ) );
+ x264_psnr( h->stat.i_ssd_global[i_slice], i_count * i_yuv_size ) );
}
else
{
@@ -2013,7 +2040,7 @@
SUM3( h->stat.f_psnr_mean_u ) / i_count,
SUM3( h->stat.f_psnr_mean_v ) / i_count,
SUM3( h->stat.f_psnr_average ) / i_count,
- x264_psnr( SUM3( h->stat.i_sqe_global ), i_count * i_yuv_size ),
+ x264_psnr( SUM3( h->stat.i_ssd_global ), i_count * i_yuv_size ),
f_bitrate );
}
else
@@ -2028,8 +2055,6 @@
free( h->param.rc.psz_stat_out );
if( h->param.rc.psz_stat_in )
free( h->param.rc.psz_stat_in );
- if( h->param.rc.psz_rc_eq )
- free( h->param.rc.psz_rc_eq );
x264_cqm_delete( h );
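encoder.c renames the squared-error accumulators from sqe to ssd, but the derived statistic is unchanged: frame and global PSNR come from the summed squared error over the number of 8-bit samples involved, PSNR = 10*log10(255^2 * N / SSD). A small sketch of that calculation; the cap for a zero SSD follows the usual convention rather than any specific x264 constant:

#include <math.h>
#include <stdint.h>

/* PSNR in dB from a sum of squared errors over `size` 8-bit samples. */
static double psnr_from_ssd( int64_t ssd, int64_t size )
{
    double mse;
    if( ssd <= 0 )
        return 100.0;                                 /* identical content: report a cap */
    mse = (double)ssd / ( 65025.0 * (double)size );   /* 65025 = 255*255 */
    return -10.0 * log10( mse );
}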
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/macroblock.c
@@ -79,7 +79,25 @@
return i_score;
}
-void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
+static ALWAYS_INLINE void x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx )
+{
+ int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY;
+ if( h->mb.b_trellis )
+ x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, i_ctxBlockCat, b_intra, idx );
+ else
+ h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
+}
+
+static ALWAYS_INLINE void x264_quant_8x8( x264_t *h, int16_t dct[8][8], int i_qp, int b_intra, int idx )
+{
+ int i_quant_cat = b_intra ? CQM_8IY : CQM_8PY;
+ if( h->mb.b_trellis )
+ x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, b_intra, idx );
+ else
+ h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] );
+}
+
+void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
{
uint8_t *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]];
uint8_t *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]];
@@ -93,15 +111,12 @@
h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );
- if( h->mb.b_trellis )
- x264_quant_4x4_trellis( h, dct4x4, CQM_4IY, i_qscale, DCT_LUMA_4x4, 1 );
- else
- h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );
+ x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 1, idx );
if( array_non_zero( dct4x4 ) )
{
h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4 );
- h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
+ h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qp );
/* output samples to fdec */
h->dctf.add4x4_idct( p_dst, dct4x4 );
@@ -110,7 +125,7 @@
memset( h->dct.luma4x4[idx], 0, sizeof(h->dct.luma4x4[idx]));
}
-void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale )
+void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
{
int x = 8 * (idx&1);
int y = 8 * (idx>>1);
@@ -118,19 +133,22 @@
uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );
+ if( h->mb.b_lossless )
+ {
+ h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst );
+ return;
+ }
+
h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );
- if( h->mb.b_trellis )
- x264_quant_8x8_trellis( h, dct8x8, CQM_8IY, i_qscale, 1 );
- else
- h->quantf.quant_8x8( dct8x8, h->quant8_mf[CQM_8IY][i_qscale], h->quant8_bias[CQM_8IY][i_qscale] );
+ x264_quant_8x8( h, dct8x8, i_qp, 1, idx );
h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
- h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qscale );
+ h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp );
h->dctf.add8x8_idct8( p_dst, dct8x8 );
}
-static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
+static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
{
uint8_t *p_src = h->mb.pic.p_fenc[0];
uint8_t *p_dst = h->mb.pic.p_fdec[0];
@@ -162,22 +180,19 @@
dct4x4[i][0][0] = 0;
/* quant/scan/dequant */
- if( h->mb.b_trellis )
- x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IY, i_qscale, DCT_LUMA_AC, 1 );
- else
- h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );
+ x264_quant_4x4( h, dct4x4[i], i_qp, DCT_LUMA_AC, 1, i );
h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[i] );
- h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qscale );
+ h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qp );
}
h->dctf.dct4x4dc( dct_dc4x4 );
- h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qscale][0]>>1, h->quant4_bias[CQM_4IY][i_qscale][0]<<1 );
+ h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qp][0]>>1, h->quant4_bias[CQM_4IY][i_qp][0]<<1 );
h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
/* output samples to fdec */
h->dctf.idct4x4dc( dct_dc4x4 );
- x264_mb_dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qscale ); /* XXX not inversed */
+ x264_mb_dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp ); /* XXX not inversed */
/* calculate dct coeffs */
for( i = 0; i < 16; i++ )
@@ -189,7 +204,7 @@
h->dctf.add16x16_idct( p_dst, dct4x4 );
}
-void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
+void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
{
int i, ch;
int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
@@ -215,7 +230,7 @@
}
continue;
}
-
+
h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
/* calculate dct coeffs */
for( i = 0; i < 4; i++ )
@@ -225,22 +240,20 @@
dct4x4[i][0][0] = 0;
/* no trellis; it doesn't seem to help chroma noticeably */
- h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qscale], h->quant4_bias[CQM_4IC+b_inter][i_qscale] );
+ h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] );
h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*4], dct4x4[i] );
if( b_decimate )
- {
i_decimate_score += x264_mb_decimate_score( h->dct.luma4x4[16+i+ch*4]+1, 15 );
- }
}
h->dctf.dct2x2dc( dct2x2 );
- h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qscale][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qscale][0]<<1 );
+ h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );
zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
/* output samples to fdec */
h->dctf.idct2x2dc( dct2x2 );
- x264_mb_dequant_2x2_dc( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qscale ); /* XXX not inversed */
+ x264_mb_dequant_2x2_dc( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ); /* XXX not inversed */
if( b_decimate && i_decimate_score < 7 )
{
@@ -253,7 +266,7 @@
else
{
for( i = 0; i < 4; i++ )
- h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qscale );
+ h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp );
}
dct4x4[0][0][0] = dct2x2[0][0];
dct4x4[1][0][0] = dct2x2[0][1];
@@ -289,7 +302,7 @@
* x264_macroblock_encode_pskip:
* Encode an already marked skip block
*****************************************************************************/
-void x264_macroblock_encode_pskip( x264_t *h )
+static void x264_macroblock_encode_pskip( x264_t *h )
{
const int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
h->mb.mv_min[0], h->mb.mv_max[0] );
@@ -316,6 +329,74 @@
}
/*****************************************************************************
+ * Intra prediction for predictive lossless mode.
+ *****************************************************************************/
+
+/* Note that these functions take a shortcut (mc.copy instead of actual pixel prediction) which assumes
+ * that the edge pixels of the reconstructed frame are the same as those of the source frame. This means
+ * they will only work correctly if the neighboring blocks are losslessly coded. In practice, this means
+ * lossless mode cannot be mixed with lossy mode within a frame. */
+/* This can be resolved by explicitly copying the edge pixels after doing the mc.copy, but this doesn't
+ * need to be done unless we decide to allow mixing lossless and lossy compression. */
+
+void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode )
+{
+ int stride = h->fenc->i_stride[1] << h->mb.b_interlaced;
+ if( i_mode == I_PRED_CHROMA_V )
+ {
+ h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc_plane[1]-stride, stride, 8 );
+ h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc_plane[2]-stride, stride, 8 );
+ }
+ else if( i_mode == I_PRED_CHROMA_H )
+ {
+ h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc_plane[1]-1, stride, 8 );
+ h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc_plane[2]-1, stride, 8 );
+ }
+ else
+ {
+ h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
+ h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
+ }
+}
+
+void x264_predict_lossless_4x4( x264_t *h, uint8_t *p_dst, int idx, int i_mode )
+{
+ int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
+ uint8_t *p_src = h->mb.pic.p_fenc_plane[0] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride;
+
+ if( i_mode == I_PRED_4x4_V )
+ h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-stride, stride, 4 );
+ else if( i_mode == I_PRED_4x4_H )
+ h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-1, stride, 4 );
+ else
+ h->predict_4x4[i_mode]( p_dst );
+}
+
+void x264_predict_lossless_8x8( x264_t *h, uint8_t *p_dst, int idx, int i_mode, uint8_t edge[33] )
+{
+ int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
+ uint8_t *p_src = h->mb.pic.p_fenc_plane[0] + (idx&1)*8 + (idx>>1)*8*stride;
+
+ if( i_mode == I_PRED_8x8_V )
+ h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-stride, stride, 8 );
+ else if( i_mode == I_PRED_8x8_H )
+ h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-1, stride, 8 );
+ else
+ h->predict_8x8[i_mode]( p_dst, edge );
+}
+
+void x264_predict_lossless_16x16( x264_t *h, int i_mode )
+{
+ int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
+ if( i_mode == I_PRED_16x16_V )
+ h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc_plane[0]-stride, stride, 16 );
+ else if( i_mode == I_PRED_16x16_H )
+ h->mc.copy_16x16_unaligned( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc_plane[0]-1, stride, 16 );
+ else
+ h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0] );
+}
+
+/*****************************************************************************
* x264_macroblock_encode:
*****************************************************************************/
void x264_macroblock_encode( x264_t *h )
@@ -363,8 +444,11 @@
{
const int i_mode = h->mb.i_intra16x16_pred_mode;
h->mb.b_transform_8x8 = 0;
- /* do the right prediction */
- h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0] );
+
+ if( h->mb.b_lossless )
+ x264_predict_lossless_16x16( h, i_mode );
+ else
+ h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0] );
/* encode the 16x16 macroblock */
x264_mb_encode_i16x16( h, i_qp );
@@ -385,9 +469,13 @@
{
uint8_t *p_dst = &h->mb.pic.p_fdec[0][8 * (i&1) + 8 * (i>>1) * FDEC_STRIDE];
int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
-
x264_predict_8x8_filter( p_dst, edge, h->mb.i_neighbour8[i], x264_pred_i4x4_neighbors[i_mode] );
- h->predict_8x8[i_mode]( p_dst, edge );
+
+ if( h->mb.b_lossless )
+ x264_predict_lossless_8x8( h, p_dst, i, i_mode, edge );
+ else
+ h->predict_8x8[i_mode]( p_dst, edge );
+
x264_mb_encode_i8x8( h, i, i_qp );
}
for( i = 0; i < 4; i++ )
@@ -413,7 +501,10 @@
/* emulate missing topright samples */
*(uint32_t*) &p_dst[4-FDEC_STRIDE] = p_dst[3-FDEC_STRIDE] * 0x01010101U;
- h->predict_4x4[i_mode]( p_dst );
+ if( h->mb.b_lossless )
+ x264_predict_lossless_4x4( h, p_dst, i, i_mode );
+ else
+ h->predict_4x4[i_mode]( p_dst );
x264_mb_encode_i4x4( h, i, i_qp );
}
}
@@ -428,12 +519,23 @@
if( h->mb.b_lossless )
{
- for( i4x4 = 0; i4x4 < 16; i4x4++ )
- {
- h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4],
- h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4x4],
- h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4x4] );
- }
+ if( h->mb.b_transform_8x8 )
+ for( i8x8 = 0; i8x8 < 4; i8x8++ )
+ {
+ int x = 8*(i8x8&1);
+ int y = 8*(i8x8>>1);
+ h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8],
+ h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
+ h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
+ nnz8x8[i8x8] = array_non_zero( h->dct.luma8x8[i8x8] );
+ }
+ else
+ for( i4x4 = 0; i4x4 < 16; i4x4++ )
+ {
+ h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4],
+ h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4x4],
+ h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4x4] );
+ }
}
else if( h->mb.b_transform_8x8 )
{
@@ -445,11 +547,8 @@
for( idx = 0; idx < 4; idx++ )
{
if( h->mb.b_noise_reduction )
- h->quantf.denoise_dct_core( *dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 );
- if( h->mb.b_trellis )
- x264_quant_8x8_trellis( h, dct8x8[idx], CQM_8PY, i_qp, 0 );
- else
- h->quantf.quant_8x8( dct8x8[idx], h->quant8_mf[CQM_8PY][i_qp], h->quant8_bias[CQM_8PY][i_qp] );
+ h->quantf.denoise_dct( *dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 );
+ x264_quant_8x8( h, dct8x8[idx], i_qp, 0, idx );
h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8[idx] );
@@ -493,11 +592,8 @@
idx = i8x8 * 4 + i4x4;
if( h->mb.b_noise_reduction )
- h->quantf.denoise_dct_core( *dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
- if( h->mb.b_trellis )
- x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, DCT_LUMA_4x4, 0 );
- else
- h->quantf.quant_4x4( dct4x4[idx], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
+ h->quantf.denoise_dct( *dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
+ x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0, idx );
h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] );
@@ -530,8 +626,13 @@
if( IS_INTRA( h->mb.i_type ) )
{
const int i_mode = h->mb.i_chroma_pred_mode;
- h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
- h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
+ if( h->mb.b_lossless )
+ x264_predict_lossless_8x8_chroma( h, i_mode );
+ else
+ {
+ h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
+ h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
+ }
}
/* encode the 8x8 blocks */
@@ -594,7 +695,7 @@
if( !b_force_no_skip )
{
if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
- !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
+ !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
*(uint32_t*)h->mb.cache.mv[0][x264_scan8[0]] == *(uint32_t*)h->mb.cache.pskip_mv
&& h->mb.cache.ref[0][x264_scan8[0]] == 0 )
{
@@ -755,12 +856,20 @@
if( h->mb.b_lossless )
{
int i4;
- for( i4 = i8*4; i4 < i8*4+4; i4++ )
+ if( h->mb.b_transform_8x8 )
+ {
+ h->zigzagf.sub_4x4( h->dct.luma4x4[i8], p_fenc, p_fdec );
+ nnz8x8 = array_non_zero( h->dct.luma8x8[i8] );
+ }
+ else
{
- h->zigzagf.sub_4x4( h->dct.luma4x4[i4],
- h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4],
- h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4] );
- nnz8x8 |= array_non_zero( h->dct.luma4x4[i4] );
+ for( i4 = i8*4; i4 < i8*4+4; i4++ )
+ {
+ h->zigzagf.sub_4x4( h->dct.luma4x4[i4],
+ h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4],
+ h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4] );
+ nnz8x8 |= array_non_zero( h->dct.luma4x4[i4] );
+ }
}
for( ch = 0; ch < 2; ch++ )
{
@@ -776,10 +885,10 @@
{
DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );
h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
- h->quantf.quant_8x8( dct8x8, h->quant8_mf[CQM_8PY][i_qp], h->quant8_bias[CQM_8PY][i_qp] );
+ x264_quant_8x8( h, dct8x8, i_qp, 0, i8 );
h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 );
- if( b_decimate )
+ if( b_decimate && !h->mb.b_trellis )
nnz8x8 = 4 <= x264_mb_decimate_score( h->dct.luma8x8[i8], 64 );
else
nnz8x8 = array_non_zero( dct8x8 );
@@ -796,7 +905,8 @@
DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
for( i4 = 0; i4 < 4; i4++ )
- h->quantf.quant_4x4( dct4x4[i4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
+ x264_quant_4x4( h, dct4x4[i4], i_qp, DCT_LUMA_4x4, 0, i8*4+i4 );
+
for( i4 = 0; i4 < 4; i4++ )
h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] );
|
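A note on the lossless-prediction shortcut added to encoder/macroblock.c above: because lossless coding makes the reconstructed neighbours bit-exact copies of the source, the per-sample vertical/horizontal prediction used in lossless mode can be produced by copying the source block shifted up (or left) by one pixel, which is what the mc.copy( ..., p_src - stride, ... ) calls do. Below is a minimal sketch of the vertical case; the helper name and parameters are hypothetical and not part of the patch.

#include <stdint.h>

/* dst: prediction/reconstruction block (dst_stride bytes per row)
 * src: top-left of the co-located source block; the row above it
 *      (src - src_stride) must be valid, i.e. losslessly reconstructed. */
void lossless_vertical_pred( uint8_t *dst, int dst_stride,
                             const uint8_t *src, int src_stride,
                             int width, int height )
{
    int x, y;
    for( y = 0; y < height; y++ )
        for( x = 0; x < width; x++ )
            /* Each row is predicted from the row directly above it, and in
             * lossless mode that row equals the source, so the whole
             * prediction is just a block copy starting one row up. */
            dst[y*dst_stride + x] = src[(y-1)*src_stride + x];
}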
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/macroblock.h
|
@@ -29,7 +29,7 @@
extern const int x264_lambda2_tab[52];
extern const int x264_lambda_tab[52];
-void x264_rdo_init( );
+void x264_rdo_init( void );
int x264_macroblock_probe_skip( x264_t *h, int b_bidir );
@@ -38,21 +38,26 @@
static inline int x264_macroblock_probe_bskip( x264_t *h )
{ return x264_macroblock_probe_skip( h, 1 ); }
+void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode );
+void x264_predict_lossless_4x4( x264_t *h, uint8_t *p_dst, int idx, int i_mode );
+void x264_predict_lossless_8x8( x264_t *h, uint8_t *p_dst, int idx, int i_mode, uint8_t edge[33] );
+void x264_predict_lossless_16x16( x264_t *h, int i_mode );
+
void x264_macroblock_encode ( x264_t *h );
void x264_macroblock_write_cabac ( x264_t *h, x264_cabac_t *cb );
void x264_macroblock_write_cavlc ( x264_t *h, bs_t *s );
void x264_macroblock_encode_p8x8( x264_t *h, int i8 );
-void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale );
-void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale );
-void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale );
+void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp );
+void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp );
+void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp );
void x264_cabac_mb_skip( x264_t *h, int b_skip );
void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
- int i_qp, int i_ctxBlockCat, int b_intra );
+ int i_qp, int i_ctxBlockCat, int b_intra, int idx );
void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
- int i_qp, int b_intra );
+ int i_qp, int b_intra, int idx );
void x264_noise_reduction_update( x264_t *h );
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/me.c
|
@@ -31,7 +31,7 @@
* and refine_* are run only on the winner.
* the subme=7 values are much higher because any amount of satd search makes
* up its time by reducing the number of rd iterations. */
-static const int subpel_iterations[][4] =
+static const int subpel_iterations[][4] =
{{1,0,0,0},
{1,1,0,0},
{0,1,1,0},
@@ -162,7 +162,7 @@
int omx, omy, pmx, pmy;
uint8_t *p_fref = m->p_fref[0];
DECLARE_ALIGNED_16( uint8_t pix[16*16] );
-
+
int i = 0, j;
int dir;
int costs[6];
@@ -663,7 +663,7 @@
{ \
int stride = 16; \
uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
- int cost = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ int cost = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
if( b_chroma_me && cost < bcost ) \
{ \
@@ -787,8 +787,10 @@
#define BIME_CACHE( dx, dy ) \
{ \
int i = 4 + 3*dx + dy; \
- h->mc.mc_luma( pix0[i], bw, m0->p_fref, m0->i_stride[0], om0x+dx, om0y+dy, bw, bh ); \
- h->mc.mc_luma( pix1[i], bw, m1->p_fref, m1->i_stride[0], om1x+dx, om1y+dy, bw, bh ); \
+ stride0[i] = bw;\
+ stride1[i] = bw;\
+ src0[i] = h->mc.get_ref( pix0[i], &stride0[i], m0->p_fref, m0->i_stride[0], om0x+dx, om0y+dy, bw, bh ); \
+ src1[i] = h->mc.get_ref( pix1[i], &stride1[i], m1->p_fref, m1->i_stride[0], om1x+dx, om1y+dy, bw, bh ); \
}
#define BIME_CACHE2(a,b) \
@@ -802,11 +804,7 @@
int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y); \
int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y); \
visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));\
- h->mc.memcpy_aligned( pix, pix0[i0], bs ); \
- if( i_weight == 32 ) \
- h->mc.avg[i_pixel]( pix, bw, pix1[i1], bw ); \
- else \
- h->mc.avg_weight[i_pixel]( pix, bw, pix1[i1], bw, i_weight ); \
+ h->mc.avg[i_pixel]( pix, bw, src0[i0], stride0[i0], src1[i1], stride1[i1], i_weight ); \
cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, bw ) \
+ p_cost_m0x[ m0x ] + p_cost_m0y[ m0y ] \
+ p_cost_m1x[ m1x ] + p_cost_m1y[ m1y ]; \
@@ -838,7 +836,6 @@
const int i_pixel = m0->i_pixel;
const int bw = x264_pixel_size[i_pixel].w;
const int bh = x264_pixel_size[i_pixel].h;
- const int bs = bw*bh;
const int16_t *p_cost_m0x = m0->p_cost_mv - x264_clip3( m0->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
const int16_t *p_cost_m0y = m0->p_cost_mv - x264_clip3( m0->mvp[1], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
const int16_t *p_cost_m1x = m1->p_cost_mv - x264_clip3( m1->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
@@ -846,6 +843,10 @@
DECLARE_ALIGNED_16( uint8_t pix0[9][16*16] );
DECLARE_ALIGNED_16( uint8_t pix1[9][16*16] );
DECLARE_ALIGNED_16( uint8_t pix[16*16] );
+ uint8_t *src0[9];
+ uint8_t *src1[9];
+ int stride0[9];
+ int stride1[9];
int bm0x = m0->mv[0], om0x = bm0x;
int bm0y = m0->mv[1], om0y = bm0y;
int bm1x = m1->mv[0], om1x = bm1x;
@@ -853,7 +854,7 @@
int bcost = COST_MAX;
int pass = 0;
/* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
- uint8_t visited[8][8][8];
+ DECLARE_ALIGNED_16( uint8_t visited[8][8][8] );
h->mc.memzero_aligned( visited, sizeof(visited) );
BIME_CACHE( 0, 0 );
@@ -904,7 +905,7 @@
{ \
int stride = 16; \
uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw*4, bh*4 ); \
- dst = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ dst = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[mx] + p_cost_mvy[my]; \
COPY1_IF_LT( bsatd, dst ); \
}
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/ratecontrol.c
|
@@ -40,8 +40,7 @@
int kept_as_ref;
float qscale;
int mv_bits;
- int i_tex_bits;
- int p_tex_bits;
+ int tex_bits;
int misc_bits;
uint64_t expected_bits;
double expected_vbv;
@@ -118,10 +117,6 @@
double lmin[5]; /* min qscale by frame type */
double lmax[5];
double lstep; /* max change (multiply) in qscale per frame */
- double i_cplx_sum[5]; /* estimated total texture bits in intra MBs at qscale=1 */
- double p_cplx_sum[5];
- double mv_bits_sum[5];
- int frame_count[5]; /* number of frames of each type */
/* MBRC stuff */
double frame_size_estimated;
@@ -132,10 +127,6 @@
int bframes; /* # consecutive B-frames before this P-frame */
int bframe_bits; /* total cost of those frames */
- /* AQ stuff */
- float aq_threshold;
- int *ac_energy;
-
int i_zones;
x264_zone_t *zones;
x264_zone_t *prev_zone;
@@ -149,7 +140,6 @@
static void update_vbv_plan( x264_t *h );
static double predict_size( predictor_t *p, double q, double var );
static void update_predictor( predictor_t *p, double q, double var, double bits );
-int x264_rc_analyse_slice( x264_t *h );
/* Terminology:
* qp = h.264's quantizer
@@ -172,72 +162,46 @@
{
if(qscale<0.1)
qscale = 0.1;
- return (rce->i_tex_bits + rce->p_tex_bits + .1) * pow( rce->qscale / qscale, 1.1 )
+ return (rce->tex_bits + .1) * pow( rce->qscale / qscale, 1.1 )
+ rce->mv_bits * pow( X264_MAX(rce->qscale, 1) / X264_MAX(qscale, 1), 0.5 )
+ rce->misc_bits;
}
// Find the total AC energy of the block in all planes.
-static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, int *satd )
+static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
{
/* This function contains annoying hacks because GCC has a habit of reordering emms
* and putting it after floating point ops. As a result, we put the emms at the end of the
* function and make sure that its always called before the float math. Noinline makes
* sure no reordering goes on. */
- /* FIXME: This array is larger than necessary because a bug in GCC causes an all-zero
- * array to be placed in .bss despite .bss not being correctly aligned on some platforms (win32?) */
- DECLARE_ALIGNED_16( static uint8_t zero[17] ) = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1};
- unsigned int var=0, sad, ssd, i;
- if( satd || h->param.rc.i_aq_mode == X264_AQ_GLOBAL )
- {
- for( i=0; i<3; i++ )
- {
- int w = i ? 8 : 16;
- int stride = h->fenc->i_stride[i];
- int offset = h->mb.b_interlaced
- ? w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride
- : w * (mb_x + mb_y * stride);
- int pix = i ? PIXEL_8x8 : PIXEL_16x16;
- stride <<= h->mb.b_interlaced;
- sad = h->pixf.sad[pix]( zero, 0, h->fenc->plane[i]+offset, stride );
- ssd = h->pixf.ssd[pix]( zero, 0, h->fenc->plane[i]+offset, stride );
- var += ssd - (sad * sad >> (i?6:8));
- // SATD to represent the block's overall complexity (bit cost) for intra encoding.
- // exclude the DC coef, because nothing short of an actual intra prediction will estimate DC cost.
- if( var && satd )
- *satd += h->pixf.satd[pix]( zero, 0, h->fenc->plane[i]+offset, stride ) - sad/2;
- }
- var = X264_MAX(var,1);
+ unsigned int var=0, sad, i;
+ for( i=0; i<3; i++ )
+ {
+ int w = i ? 8 : 16;
+ int stride = frame->i_stride[i];
+ int offset = h->mb.b_interlaced
+ ? w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride
+ : w * (mb_x + mb_y * stride);
+ int pix = i ? PIXEL_8x8 : PIXEL_16x16;
+ stride <<= h->mb.b_interlaced;
+ var += h->pixf.var[pix]( frame->plane[i]+offset, stride, &sad );
}
- else var = h->rc->ac_energy[h->mb.i_mb_xy];
+ var = X264_MAX(var,1);
x264_emms();
return var;
}
-void x264_autosense_aq( x264_t *h )
+void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
{
- double total = 0;
- double n = 0;
int mb_x, mb_y;
- // FIXME: Some of the SATDs might be already calculated elsewhere (ratecontrol?). Can we reuse them?
- // FIXME: Is chroma SATD necessary?
for( mb_y=0; mb_y<h->sps->i_mb_height; mb_y++ )
for( mb_x=0; mb_x<h->sps->i_mb_width; mb_x++ )
{
- int satd=0;
- int energy = ac_energy_mb( h, mb_x, mb_y, &satd );
- h->rc->ac_energy[mb_x + mb_y * h->sps->i_mb_width] = energy;
- /* Weight the energy value by the SATD value of the MB.
- * This represents the fact that the more complex blocks in a frame should
- * be weighted more when calculating the optimal threshold. This also helps
- * diminish the negative effect of large numbers of simple blocks in a frame,
- * such as in the case of a letterboxed film. */
- total += logf(energy) * satd;
- n += satd;
+ int energy = ac_energy_mb( h, mb_x, mb_y, frame );
+ /* 10 constant chosen to result in approximately the same overall bitrate as without AQ. */
+ float qp_adj = h->param.rc.f_aq_strength * 1.5 * (logf(energy) - 10.0);
+ frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
}
- x264_emms();
- /* Calculate and store the threshold. */
- h->rc->aq_threshold = n ? total/n : 15;
}
/*****************************************************************************
@@ -249,18 +213,16 @@
*****************************************************************************/
void x264_adaptive_quant( x264_t *h )
{
- int energy = ac_energy_mb( h, h->mb.i_mb_x, h->mb.i_mb_y, NULL );
- /* Adjust the QP based on the AC energy of the macroblock. */
- float qp = h->rc->f_qpm;
- float qp_adj = 1.5 * (logf(energy) - h->rc->aq_threshold);
- if( h->param.rc.i_aq_mode == X264_AQ_LOCAL )
- qp_adj = x264_clip3f( qp_adj, -5, 5 );
- h->mb.i_qp = x264_clip3( qp + qp_adj * h->param.rc.f_aq_strength + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
+ float qp, qp_adj;
+ x264_emms();
+ qp = h->rc->f_qpm;
+ qp_adj = h->fenc->f_qp_offset[h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride];
+ h->mb.i_qp = x264_clip3( qp + qp_adj + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
/* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
* to lower the bit cost of the qp_delta. */
if( abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
h->mb.i_qp = h->mb.i_last_qp;
- h->mb.i_chroma_qp = i_chroma_qp_table[x264_clip3( h->mb.i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
+ h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
}
int x264_ratecontrol_new( x264_t *h )
@@ -275,7 +237,7 @@
rc->b_abr = h->param.rc.i_rc_method != X264_RC_CQP && !h->param.rc.b_stat_read;
rc->b_2pass = h->param.rc.i_rc_method == X264_RC_ABR && h->param.rc.b_stat_read;
-
+
/* FIXME: use integers */
if(h->param.i_fps_num > 0 && h->param.i_fps_den > 0)
rc->fps = (float) h->param.i_fps_num / h->param.i_fps_den;
@@ -435,6 +397,25 @@
if( strstr( opts, "qp=0" ) && h->param.rc.i_rc_method == X264_RC_ABR )
x264_log( h, X264_LOG_WARNING, "1st pass was lossless, bitrate prediction will be inaccurate\n" );
+
+ if( ( p = strstr( opts, "b_adapt=" ) ) && sscanf( p, "b_adapt=%d", &i ) && i >= X264_B_ADAPT_NONE && i <= X264_B_ADAPT_TRELLIS )
+ h->param.i_bframe_adaptive = i;
+ else if( h->param.i_bframe )
+ {
+ x264_log( h, X264_LOG_ERROR, "b_adapt method specified in stats file not valid\n" );
+ return -1;
+ }
+
+ if( ( p = strstr( opts, "scenecut=" ) ) && sscanf( p, "scenecut=%d", &i ) && i >= -1 && i <= 100 )
+ {
+ h->param.i_scenecut_threshold = i;
+ h->param.b_pre_scenecut = !!strstr( p, "(pre)" );
+ }
+ else
+ {
+ x264_log( h, X264_LOG_ERROR, "scenecut method specified in stats file not valid\n" );
+ return -1;
+ }
}
/* find number of pics */
@@ -503,8 +484,8 @@
rce = &rc->entry[frame_number];
rce->direct_mode = 0;
- e += sscanf(p, " in:%*d out:%*d type:%c q:%f itex:%d ptex:%d mv:%d misc:%d imb:%d pmb:%d smb:%d d:%c",
- &pict_type, &qp, &rce->i_tex_bits, &rce->p_tex_bits,
+ e += sscanf(p, " in:%*d out:%*d type:%c q:%f tex:%d mv:%d misc:%d imb:%d pmb:%d smb:%d d:%c",
+ &pict_type, &qp, &rce->tex_bits,
&rce->mv_bits, &rce->misc_bits, &rce->i_count, &rce->p_count,
&rce->s_count, &rce->direct_mode);
@@ -561,9 +542,11 @@
{
h->thread[i]->rc = rc+i;
if( i )
+ {
rc[i] = rc[0];
- if( h->param.rc.i_aq_mode == X264_AQ_LOCAL )
- rc[i].ac_energy = x264_malloc( h->mb.i_mb_count * sizeof(int) );
+ memcpy( &h->thread[i]->param, &h->param, sizeof( x264_param_t ) );
+ h->thread[i]->mb.b_variable_qp = h->mb.b_variable_qp;
+ }
}
return 0;
@@ -673,7 +656,7 @@
return 0;
}
-x264_zone_t *get_zone( x264_t *h, int frame_num )
+static x264_zone_t *get_zone( x264_t *h, int frame_num )
{
int i;
for( i = h->rc->i_zones-1; i >= 0; i-- )
@@ -691,7 +674,7 @@
if( rc->b_abr && h->param.rc.i_rc_method == X264_RC_ABR && rc->cbr_decay > .9999 )
{
double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
- x264_log( h, X264_LOG_INFO, "final ratefactor: %.2f\n",
+ x264_log( h, X264_LOG_INFO, "final ratefactor: %.2f\n",
qscale2qp( pow( base_cplx, 1 - h->param.rc.f_qcompress )
* rc->cplxr_sum / rc->wanted_bits_window ) );
}
@@ -725,8 +708,6 @@
x264_free( rc->zones[i].param );
x264_free( rc->zones );
}
- for( i=0; i<h->param.i_threads; i++ )
- x264_free( rc[i].ac_energy );
x264_free( rc );
}
@@ -850,26 +831,19 @@
if( h->sh.i_type != SLICE_TYPE_B )
rc->last_non_b_pict_type = h->sh.i_type;
-
- /* Adaptive AQ thresholding algorithm. */
- if( h->param.rc.i_aq_mode == X264_AQ_GLOBAL )
- /* Arbitrary value for "center" of the AQ curve.
- * Chosen so that any given value of CRF has on average similar bitrate with and without AQ. */
- h->rc->aq_threshold = logf(5000);
- else if( h->param.rc.i_aq_mode == X264_AQ_LOCAL )
- x264_autosense_aq(h);
}
-double predict_row_size( x264_t *h, int y, int qp )
+static double predict_row_size( x264_t *h, int y, int qp )
{
/* average between two predictors:
* absolute SATD, and scaled bit cost of the colocated row in the previous frame */
x264_ratecontrol_t *rc = h->rc;
double pred_s = predict_size( rc->row_pred, qp2qscale(qp), h->fdec->i_row_satd[y] );
double pred_t = 0;
- if( h->sh.i_type != SLICE_TYPE_I
+ if( h->sh.i_type != SLICE_TYPE_I
&& h->fref0[0]->i_type == h->fdec->i_type
- && h->fref0[0]->i_row_satd[y] > 0 )
+ && h->fref0[0]->i_row_satd[y] > 0
+ && (abs(h->fref0[0]->i_row_satd[y] - h->fdec->i_row_satd[y]) < h->fdec->i_row_satd[y]/2))
{
pred_t = h->fref0[0]->i_row_bits[y] * h->fdec->i_row_satd[y] / h->fref0[0]->i_row_satd[y]
* qp2qscale(h->fref0[0]->i_row_qp[y]) / qp2qscale(qp);
@@ -880,7 +854,7 @@
return (pred_s + pred_t) / 2;
}
-double row_bits_so_far( x264_t *h, int y )
+static double row_bits_so_far( x264_t *h, int y )
{
int i;
double bits = 0;
@@ -889,7 +863,7 @@
return bits;
}
-double predict_row_size_sum( x264_t *h, int y, int qp )
+static double predict_row_size_sum( x264_t *h, int y, int qp )
{
int i;
double bits = row_bits_so_far(h, y);
@@ -1016,14 +990,14 @@
x264_log(h, X264_LOG_ERROR, "2nd pass has more frames than 1st pass (%d)\n", rc->num_entries);
x264_log(h, X264_LOG_ERROR, "continuing anyway, at constant QP=%d\n", h->param.rc.i_qp_constant);
- if( h->param.b_bframe_adaptive )
+ if( h->param.i_bframe_adaptive )
x264_log(h, X264_LOG_ERROR, "disabling adaptive B-frames\n");
rc->b_abr = 0;
rc->b_2pass = 0;
h->param.rc.i_rc_method = X264_RC_CQP;
h->param.rc.b_stat_read = 0;
- h->param.b_bframe_adaptive = 0;
+ h->param.i_bframe_adaptive = 0;
if( h->param.i_bframe > 1 )
h->param.i_bframe = 1;
return X264_TYPE_P;
@@ -1073,15 +1047,16 @@
int dir_frame = h->stat.frame.i_direct_score[1] - h->stat.frame.i_direct_score[0];
int dir_avg = h->stat.i_direct_score[1] - h->stat.i_direct_score[0];
char c_direct = h->mb.b_direct_auto_write ?
- ( dir_frame>0 ? 's' : dir_frame<0 ? 't' :
+ ( dir_frame>0 ? 's' : dir_frame<0 ? 't' :
dir_avg>0 ? 's' : dir_avg<0 ? 't' : '-' )
: '-';
fprintf( rc->p_stat_file_out,
- "in:%d out:%d type:%c q:%.2f itex:%d ptex:%d mv:%d misc:%d imb:%d pmb:%d smb:%d d:%c;\n",
+ "in:%d out:%d type:%c q:%.2f tex:%d mv:%d misc:%d imb:%d pmb:%d smb:%d d:%c;\n",
h->fenc->i_frame, h->i_frame,
c_type, rc->qpa_rc,
- h->stat.frame.i_itex_bits, h->stat.frame.i_ptex_bits,
- h->stat.frame.i_hdr_bits, h->stat.frame.i_misc_bits,
+ h->stat.frame.i_tex_bits,
+ h->stat.frame.i_mv_bits,
+ h->stat.frame.i_misc_bits,
h->stat.frame.i_mb_count_i,
h->stat.frame.i_mb_count_p,
h->stat.frame.i_mb_count_skip,
@@ -1132,76 +1107,19 @@
* 2 pass functions
***************************************************************************/
-double x264_eval( char *s, double *const_value, const char **const_name,
- double (**func1)(void *, double), const char **func1_name,
- double (**func2)(void *, double, double), char **func2_name,
- void *opaque );
-
/**
* modify the bitrate curve from pass1 for one frame
*/
static double get_qscale(x264_t *h, ratecontrol_entry_t *rce, double rate_factor, int frame_num)
{
x264_ratecontrol_t *rcc= h->rc;
- const int pict_type = rce->pict_type;
double q;
x264_zone_t *zone = get_zone( h, frame_num );
- double const_values[]={
- rce->i_tex_bits * rce->qscale,
- rce->p_tex_bits * rce->qscale,
- (rce->i_tex_bits + rce->p_tex_bits) * rce->qscale,
- rce->mv_bits * rce->qscale,
- (double)rce->i_count / rcc->nmb,
- (double)rce->p_count / rcc->nmb,
- (double)rce->s_count / rcc->nmb,
- rce->pict_type == SLICE_TYPE_I,
- rce->pict_type == SLICE_TYPE_P,
- rce->pict_type == SLICE_TYPE_B,
- h->param.rc.f_qcompress,
- rcc->i_cplx_sum[SLICE_TYPE_I] / rcc->frame_count[SLICE_TYPE_I],
- rcc->i_cplx_sum[SLICE_TYPE_P] / rcc->frame_count[SLICE_TYPE_P],
- rcc->p_cplx_sum[SLICE_TYPE_P] / rcc->frame_count[SLICE_TYPE_P],
- rcc->p_cplx_sum[SLICE_TYPE_B] / rcc->frame_count[SLICE_TYPE_B],
- (rcc->i_cplx_sum[pict_type] + rcc->p_cplx_sum[pict_type]) / rcc->frame_count[pict_type],
- rce->blurred_complexity,
- 0
- };
- static const char *const_names[]={
- "iTex",
- "pTex",
- "tex",
- "mv",
- "iCount",
- "pCount",
- "sCount",
- "isI",
- "isP",
- "isB",
- "qComp",
- "avgIITex",
- "avgPITex",
- "avgPPTex",
- "avgBPTex",
- "avgTex",
- "blurCplx",
- NULL
- };
- static double (*func1[])(void *, double)={
-// (void *)bits2qscale,
- (void *)qscale2bits,
- NULL
- };
- static const char *func1_names[]={
-// "bits2qp",
- "qp2bits",
- NULL
- };
-
- q = x264_eval((char*)h->param.rc.psz_rc_eq, const_values, const_names, func1, func1_names, NULL, NULL, rce);
+ q = pow( rce->blurred_complexity, 1 - h->param.rc.f_qcompress );
// avoid NaN's in the rc_eq
- if(!isfinite(q) || rce->i_tex_bits + rce->p_tex_bits + rce->mv_bits == 0)
+ if(!isfinite(q) || rce->tex_bits + rce->mv_bits == 0)
q = rcc->last_qscale;
else
{
@@ -1253,7 +1171,7 @@
}
else if( pict_type == SLICE_TYPE_P
&& rcc->last_non_b_pict_type == SLICE_TYPE_P
- && rce->i_tex_bits + rce->p_tex_bits == 0 )
+ && rce->tex_bits == 0 )
{
q = last_p_q;
}
@@ -1431,6 +1349,21 @@
+ h->stat.i_slice_size[SLICE_TYPE_P]
+ h->stat.i_slice_size[SLICE_TYPE_B]);
+ if( h->param.i_threads > 1 )
+ {
+ int j = h->rc - h->thread[0]->rc;
+ int i;
+ for( i=1; i<h->param.i_threads; i++ )
+ {
+ x264_t *t = h->thread[ (j+i)%h->param.i_threads ];
+ double bits = t->rc->frame_size_planned;
+ if( !t->b_thread_active )
+ continue;
+ bits = X264_MAX(bits, x264_ratecontrol_get_estimated_size(t));
+ total_bits += (int64_t)bits;
+ }
+ }
+
if( rcc->b_2pass )
{
rce = *rcc->rce;
@@ -1501,10 +1434,12 @@
double expected_fullness = rce.expected_vbv / rcc->buffer_size;
double qmax = q*(2 - expected_fullness);
double size_constraint = 1 + expected_fullness;
+ qmax = X264_MAX(qmax, rce.new_qscale);
if (expected_fullness < .05)
qmax = lmax;
qmax = X264_MIN(qmax, lmax);
- while( (expected_vbv < rce.expected_vbv/size_constraint) && (q < qmax) )
+ while( ((expected_vbv < rce.expected_vbv/size_constraint) && (q < qmax)) ||
+ ((expected_vbv < 0) && (q < lmax)))
{
q *= 1.05;
expected_size = qscale2bits(&rce, q);
@@ -1534,9 +1469,8 @@
rcc->short_term_cplxsum += rcc->last_satd;
rcc->short_term_cplxcount ++;
- rce.p_tex_bits = rcc->last_satd;
+ rce.tex_bits = rcc->last_satd;
rce.blurred_complexity = rcc->short_term_cplxsum / rcc->short_term_cplxcount;
- rce.i_tex_bits = 0;
rce.mv_bits = 0;
rce.p_count = rcc->nmb;
rce.i_count = 0;
@@ -1789,10 +1723,6 @@
{
ratecontrol_entry_t *rce = &rcc->entry[i];
all_const_bits += rce->misc_bits;
- rcc->i_cplx_sum[rce->pict_type] += rce->i_tex_bits * rce->qscale;
- rcc->p_cplx_sum[rce->pict_type] += rce->p_tex_bits * rce->qscale;
- rcc->mv_bits_sum[rce->pict_type] += rce->mv_bits * rce->qscale;
- rcc->frame_count[rce->pict_type] ++;
}
if( all_available_bits < all_const_bits)
|
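The ratecontrol.c hunks above replace the old "autosense" AQ threshold with a direct mapping from each macroblock's AC energy (the summed per-plane variance) to a QP offset: qp_adj = f_aq_strength * 1.5 * (ln(energy) - 10.0), where the constant 10 is the anchor the comment says was chosen to keep overall bitrate roughly unchanged. The offset is later clipped into [i_qp_min, i_qp_max] when applied in x264_adaptive_quant(). A small self-contained sketch of that mapping follows; the default strength of 1.0 is an assumption here.

#include <math.h>
#include <stdio.h>

int main( void )
{
    float aq_strength = 1.0f;  /* assumed default for h->param.rc.f_aq_strength */
    unsigned energies[] = { 1000, 22026, 500000 };  /* arbitrary sample AC energies */
    int i;
    for( i = 0; i < 3; i++ )
    {
        /* Same formula as x264_adaptive_quant_frame() above: blocks with
         * energy below ~e^10 (~22026) get a negative offset, i.e. a lower QP. */
        float qp_adj = aq_strength * 1.5f * (logf( energies[i] ) - 10.0f);
        printf( "energy %7u -> qp offset %+6.2f\n", energies[i], qp_adj );
    }
    return 0;
}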
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/ratecontrol.h
|
@@ -27,6 +27,8 @@
int x264_ratecontrol_new ( x264_t * );
void x264_ratecontrol_delete( x264_t * );
+void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame );
+void x264_adaptive_quant( x264_t * );
void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next );
void x264_ratecontrol_start( x264_t *, int i_force_qp );
int x264_ratecontrol_slice_type( x264_t *, int i_frame );
@@ -34,9 +36,9 @@
int x264_ratecontrol_qp( x264_t * );
void x264_ratecontrol_end( x264_t *, int bits );
void x264_ratecontrol_summary( x264_t * );
-void x264_adaptive_quant( x264_t * );
void x264_ratecontrol_set_estimated_size( x264_t *, int bits );
int x264_ratecontrol_get_estimated_size( x264_t const *);
+int x264_rc_analyse_slice( x264_t *h );
#endif
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/rdo.c
|
@@ -34,7 +34,7 @@
#define bs_write_ue(s,v) ((s)->i_bits_encoded += bs_size_ue(v))
#define bs_write_se(s,v) ((s)->i_bits_encoded += bs_size_se(v))
#define bs_write_te(s,v,l) ((s)->i_bits_encoded += bs_size_te(v,l))
-#define x264_macroblock_write_cavlc x264_macroblock_size_cavlc
+#define x264_macroblock_write_cavlc static x264_macroblock_size_cavlc
#include "cavlc.c"
/* CABAC: not exactly the same. x264_cabac_size_decision() keeps track of
@@ -45,26 +45,84 @@
#define x264_cabac_encode_bypass(c,v) ((c)->f8_bits_encoded += 256)
#define x264_cabac_encode_ue_bypass(c,e,v) ((c)->f8_bits_encoded += (bs_size_ue_big(v+(1<<e)-1)-e)<<8)
#define x264_cabac_encode_flush(h,c)
-#define x264_macroblock_write_cabac x264_macroblock_size_cabac
+#define x264_macroblock_write_cabac static x264_macroblock_size_cabac
#include "cabac.c"
#define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) )
-
-static int ssd_mb( x264_t *h )
+
+
+/* Sum the cached SATDs to avoid repeating them. */
+static inline int sum_satd( x264_t *h, int pixel, int x, int y )
{
- return h->pixf.ssd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
- h->mb.pic.p_fdec[0], FDEC_STRIDE )
- + h->pixf.ssd[PIXEL_8x8]( h->mb.pic.p_fenc[1], FENC_STRIDE,
- h->mb.pic.p_fdec[1], FDEC_STRIDE )
- + h->pixf.ssd[PIXEL_8x8]( h->mb.pic.p_fenc[2], FENC_STRIDE,
- h->mb.pic.p_fdec[2], FDEC_STRIDE );
+ int satd = 0;
+ int min_x = x>>2;
+ int min_y = y>>2;
+ int max_x = (x>>2) + (x264_pixel_size[pixel].w>>2);
+ int max_y = (y>>2) + (x264_pixel_size[pixel].h>>2);
+ if( pixel == PIXEL_16x16 )
+ return h->mb.pic.fenc_satd_sum;
+ for( y = min_y; y < max_y; y++ )
+ for( x = min_x; x < max_x; x++ )
+ satd += h->mb.pic.fenc_satd[y][x];
+ return satd;
+}
+
+static inline int sum_sa8d( x264_t *h, int pixel, int x, int y )
+{
+ int sa8d = 0;
+ int min_x = x>>3;
+ int min_y = y>>3;
+ int max_x = (x>>3) + (x264_pixel_size[pixel].w>>3);
+ int max_y = (y>>3) + (x264_pixel_size[pixel].h>>3);
+ if( pixel == PIXEL_16x16 )
+ return h->mb.pic.fenc_sa8d_sum;
+ for( y = min_y; y < max_y; y++ )
+ for( x = min_x; x < max_x; x++ )
+ sa8d += h->mb.pic.fenc_sa8d[y][x];
+ return sa8d;
+}
+
+/* Psy RD distortion metric: SSD plus "Absolute Difference of Complexities" */
+/* SATD and SA8D are used to measure block complexity. */
+/* The difference between SATD and SA8D scores are both used to avoid bias from the DCT size. Using SATD */
+/* only, for example, results in overusage of 8x8dct, while the opposite occurs when using SA8D. */
+
+/* FIXME: Is there a better metric than averaged SATD/SA8D difference for complexity difference? */
+/* Hadamard transform is recursive, so a SATD+SA8D can be done faster by taking advantage of this fact. */
+/* This optimization can also be used in non-RD transform decision. */
+
+static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
+{
+ DECLARE_ALIGNED_16(static uint8_t zero[16]);
+ int satd = 0;
+ uint8_t *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE;
+ uint8_t *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE;
+ if( p == 0 && h->mb.i_psy_rd )
+ {
+ /* If the plane is smaller than 8x8, we can't do an SA8D; this probably isn't a big problem. */
+ if( size <= PIXEL_8x8 )
+ {
+ uint64_t acs = h->pixf.hadamard_ac[size]( fdec, FDEC_STRIDE );
+ satd = abs((int32_t)acs - sum_satd( h, size, x, y ))
+ + abs((int32_t)(acs>>32) - sum_sa8d( h, size, x, y ));
+ satd >>= 1;
+ }
+ else
+ {
+ int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, zero, 0 ) >> 1;
+ satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - sum_satd( h, size, x, y ));
+ }
+ satd = (satd * h->mb.i_psy_rd * x264_lambda_tab[h->mb.i_qp] + 128) >> 8;
+ }
+ return h->pixf.ssd[size](fenc, FENC_STRIDE, fdec, FDEC_STRIDE) + satd;
}
-static int ssd_plane( x264_t *h, int size, int p, int x, int y )
+static inline int ssd_mb( x264_t *h )
{
- return h->pixf.ssd[size]( h->mb.pic.p_fenc[p] + x+y*FENC_STRIDE, FENC_STRIDE,
- h->mb.pic.p_fdec[p] + x+y*FDEC_STRIDE, FDEC_STRIDE );
+ return ssd_plane(h, PIXEL_16x16, 0, 0, 0)
+ + ssd_plane(h, PIXEL_8x8, 1, 0, 0)
+ + ssd_plane(h, PIXEL_8x8, 2, 0, 0);
}
static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
@@ -140,7 +198,7 @@
return (i_ssd<<8) + i_bits;
}
-uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
+static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
{
uint64_t i_ssd, i_bits;
@@ -162,7 +220,7 @@
return (i_ssd<<8) + i_bits;
}
-uint64_t x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
+static uint64_t x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
{
uint64_t i_ssd, i_bits;
@@ -184,7 +242,7 @@
return (i_ssd<<8) + i_bits;
}
-uint64_t x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
+static uint64_t x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
{
uint64_t i_ssd, i_bits;
@@ -219,7 +277,7 @@
#define LAMBDA_BITS 4
/* precalculate the cost of coding abs_level_m1 */
-void x264_rdo_init( )
+void x264_rdo_init( void )
{
int i_prefix;
int i_ctx;
@@ -247,29 +305,29 @@
// I'm just matching the behaviour of deadzone quant.
static const int lambda2_tab[2][52] = {
// inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
- { 46, 58, 73, 92, 117, 147,
- 185, 233, 294, 370, 466, 587,
- 740, 932, 1174, 1480, 1864, 2349,
- 2959, 3728, 4697, 5918, 7457, 9395,
- 11837, 14914, 18790, 23674, 29828, 37581,
- 47349, 59656, 75163, 94699, 119313, 150326,
- 189399, 238627, 300652, 378798, 477255, 601304,
- 757596, 954511, 1202608, 1515192, 1909022, 2405217,
+ { 46, 58, 73, 92, 117, 147,
+ 185, 233, 294, 370, 466, 587,
+ 740, 932, 1174, 1480, 1864, 2349,
+ 2959, 3728, 4697, 5918, 7457, 9395,
+ 11837, 14914, 18790, 23674, 29828, 37581,
+ 47349, 59656, 75163, 94699, 119313, 150326,
+ 189399, 238627, 300652, 378798, 477255, 601304,
+ 757596, 954511, 1202608, 1515192, 1909022, 2405217,
3030384, 3818045, 4810435, 6060769 },
// intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
- { 27, 34, 43, 54, 68, 86,
- 108, 136, 172, 216, 273, 343,
- 433, 545, 687, 865, 1090, 1374,
- 1731, 2180, 2747, 3461, 4361, 5494,
- 6922, 8721, 10988, 13844, 17442, 21976,
- 27688, 34885, 43953, 55377, 69771, 87906,
- 110755, 139543, 175813, 221511, 279087, 351627,
- 443023, 558174, 703255, 886046, 1116348, 1406511,
+ { 27, 34, 43, 54, 68, 86,
+ 108, 136, 172, 216, 273, 343,
+ 433, 545, 687, 865, 1090, 1374,
+ 1731, 2180, 2747, 3461, 4361, 5494,
+ 6922, 8721, 10988, 13844, 17442, 21976,
+ 27688, 34885, 43953, 55377, 69771, 87906,
+ 110755, 139543, 175813, 221511, 279087, 351627,
+ 443023, 558174, 703255, 886046, 1116348, 1406511,
1772093, 2232697, 2813022, 3544186 }
};
typedef struct {
- uint64_t score;
+ int64_t score;
int level_idx; // index into level_tree[]
uint8_t cabac_state[10]; //just the contexts relevant to coding abs_level_m1
} trellis_node_t;
@@ -298,7 +356,7 @@
static inline void quant_trellis_cabac( x264_t *h, int16_t *dct,
const uint16_t *quant_mf, const int *unquant_mf,
const int *coef_weight, const uint8_t *zigzag,
- int i_ctxBlockCat, int i_lambda2, int b_ac, int i_coefs )
+ int i_ctxBlockCat, int i_lambda2, int b_ac, int i_coefs, int idx )
{
int abs_coefs[64], signs[64];
trellis_node_t nodes[2][8];
@@ -430,8 +488,20 @@
// that are better left coded, especially at QP > 40.
for( abs_level = q; abs_level >= q-1; abs_level-- )
{
- int d = i_coef - ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8);
- uint64_t ssd = (int64_t)d*d * coef_weight[i];
+ int unquant_abs_level = ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8);
+ int d = i_coef - unquant_abs_level;
+ int64_t ssd;
+ /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */
+ if( h->mb.i_psy_trellis && i )
+ {
+ int orig_coef = (i_coefs == 64) ? h->mb.pic.fenc_dct8[idx][i] : h->mb.pic.fenc_dct4[idx][i];
+ int predicted_coef = orig_coef - i_coef * signs[i];
+ int psy_value = h->mb.i_psy_trellis * abs(predicted_coef + unquant_abs_level * signs[i]);
+ int psy_weight = (i_coefs == 64) ? x264_dct8_weight_tab[zigzag[i]] : x264_dct4_weight_tab[zigzag[i]];
+ ssd = (int64_t)d*d * coef_weight[i] - psy_weight * psy_value;
+ }
+ else
+ ssd = (int64_t)d*d * coef_weight[i];
for( j = 0; j < 8; j++ )
{
@@ -495,24 +565,24 @@
void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
- int i_qp, int i_ctxBlockCat, int b_intra )
+ int i_qp, int i_ctxBlockCat, int b_intra, int idx )
{
int b_ac = (i_ctxBlockCat == DCT_LUMA_AC);
quant_trellis_cabac( h, (int16_t*)dct,
h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
x264_dct4_weight2_zigzag[h->mb.b_interlaced],
x264_zigzag_scan4[h->mb.b_interlaced],
- i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 16 );
+ i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 16, idx );
}
void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
- int i_qp, int b_intra )
+ int i_qp, int b_intra, int idx )
{
quant_trellis_cabac( h, (int16_t*)dct,
h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
x264_dct8_weight2_zigzag[h->mb.b_interlaced],
x264_zigzag_scan8[h->mb.b_interlaced],
- DCT_LUMA_8x8, lambda2_tab[b_intra][i_qp], 0, 64 );
+ DCT_LUMA_8x8, lambda2_tab[b_intra][i_qp], 0, 64, idx );
}
|
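The rdo.c hunks above add the psy-RD distortion described in the comments: plain SSD plus a penalty proportional to how far the reconstructed luma block's AC complexity drifts from the cached complexity of the source block (for blocks of 8x8 and larger both the SATD and SA8D differences are used; smaller blocks fall back to SATD minus DC), scaled by i_psy_rd and the QP lambda with the rounding (x * psy_rd * lambda + 128) >> 8. The trellis hunk applies a similar per-coefficient bias. Below is a stripped-down sketch of the metric's shape, with hypothetical names rather than the actual x264 code path; in the patch the penalty is applied only to the luma plane and only when psy-RD is enabled.

/* ssd:               plain sum of squared differences for the block
 * recon_complexity:  SATD/SA8D-style AC complexity of the reconstruction
 * source_complexity: the same measure cached for the source block
 * psy_strength:      corresponds to h->mb.i_psy_rd in the patch */
int psy_rd_distortion( int ssd, int recon_complexity, int source_complexity,
                       int psy_strength, int lambda )
{
    int psy = recon_complexity - source_complexity;
    if( psy < 0 )
        psy = -psy;                                   /* absolute difference of complexities */
    psy = ( psy * psy_strength * lambda + 128 ) >> 8; /* same scaling/rounding as the diff */
    return ssd + psy;
}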
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/set.c
|
@@ -27,6 +27,7 @@
#ifndef _MSC_VER
#include "config.h"
#endif
+#include "set.h"
#define bs_write_ue bs_write_ue_big
@@ -79,7 +80,7 @@
sps->b_qpprime_y_zero_transform_bypass = param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant == 0;
if( sps->b_qpprime_y_zero_transform_bypass )
- sps->i_profile_idc = PROFILE_HIGH444;
+ sps->i_profile_idc = PROFILE_HIGH444_PREDICTIVE;
else if( param->analyse.b_transform_8x8 || param->i_cqm_preset != X264_CQM_FLAT )
sps->i_profile_idc = PROFILE_HIGH;
else if( param->b_cabac || param->i_bframe > 0 )
@@ -150,11 +151,11 @@
sps->vui.i_sar_width = param->vui.i_sar_width;
sps->vui.i_sar_height= param->vui.i_sar_height;
}
-
+
sps->vui.b_overscan_info_present = ( param->vui.i_overscan ? 1 : 0 );
if( sps->vui.b_overscan_info_present )
sps->vui.b_overscan_info = ( param->vui.i_overscan == 2 ? 1 : 0 );
-
+
sps->vui.b_signal_type_present = 0;
sps->vui.i_vidformat = ( param->vui.i_vidformat <= 5 ? param->vui.i_vidformat : 5 );
sps->vui.b_fullrange = ( param->vui.b_fullrange ? 1 : 0 );
@@ -176,7 +177,7 @@
{
sps->vui.b_signal_type_present = 1;
}
-
+
/* FIXME: not sufficient for interlaced video */
sps->vui.b_chroma_loc_info_present = ( param->vui.i_chroma_loc ? 1 : 0 );
if( sps->vui.b_chroma_loc_info_present )
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/slicetype.c
|
@@ -37,9 +37,9 @@
h->mb.b_chroma_me = 0;
}
-int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
+static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
x264_frame_t **frames, int p0, int p1, int b,
- int dist_scale_factor )
+ int dist_scale_factor, int do_search[2] )
{
x264_frame_t *fref0 = frames[p0];
x264_frame_t *fref1 = frames[p1];
@@ -51,6 +51,9 @@
const int i_mb_xy = i_mb_x + i_mb_y * i_mb_stride;
const int i_stride = fenc->i_stride_lowres;
const int i_pel_offset = 8 * ( i_mb_x + i_mb_y * i_stride );
+ const int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32;
+ int16_t (*fenc_mvs[2])[2] = { &frames[b]->lowres_mvs[0][b-p0-1][i_mb_xy], &frames[b]->lowres_mvs[1][p1-b-1][i_mb_xy] };
+ int (*fenc_costs[2]) = { &frames[b]->lowres_mv_costs[0][b-p0-1][i_mb_xy], &frames[b]->lowres_mv_costs[1][p1-b-1][i_mb_xy] };
DECLARE_ALIGNED_8( uint8_t pix1[9*FDEC_STRIDE] );
uint8_t *pix2 = pix1+8;
@@ -70,7 +73,7 @@
h->mb.mv_max_fpel[0] = 8*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 4;
h->mb.mv_min_spel[0] = 4*( h->mb.mv_min_fpel[0] - 8 );
h->mb.mv_max_spel[0] = 4*( h->mb.mv_max_fpel[0] + 8 );
- if( h->mb.i_mb_x <= 1 )
+ if( h->mb.i_mb_x >= h->sps->i_mb_width - 2 )
{
h->mb.mv_min_fpel[1] = -8*h->mb.i_mb_y - 4;
h->mb.mv_max_fpel[1] = 8*( h->sps->i_mb_height - h->mb.i_mb_y - 1 ) + 4;
@@ -85,12 +88,6 @@
(dst)[2] = &(src)[2][i_pel_offset]; \
(dst)[3] = &(src)[3][i_pel_offset]; \
}
-#define SAVE_MVS( mv0, mv1 ) \
- { \
- *(uint32_t*)fenc->mv[0][i_mb_xy] = *(uint32_t*)mv0; \
- if( b_bidir ) \
- *(uint32_t*)fenc->mv[1][i_mb_xy] = *(uint32_t*)mv1; \
- }
#define CLIP_MV( mv ) \
{ \
mv[0] = x264_clip3( mv[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] ); \
@@ -98,21 +95,18 @@
}
#define TRY_BIDIR( mv0, mv1, penalty ) \
{ \
- int stride2 = 16; \
- uint8_t *src2; \
+ int stride1 = 16, stride2 = 16; \
+ uint8_t *src1, *src2; \
int i_cost; \
- h->mc.mc_luma( pix1, 16, m[0].p_fref, m[0].i_stride[0], \
- (mv0)[0], (mv0)[1], 8, 8 ); \
+ src1 = h->mc.get_ref( pix1, &stride1, m[0].p_fref, m[0].i_stride[0], \
+ (mv0)[0], (mv0)[1], 8, 8 ); \
src2 = h->mc.get_ref( pix2, &stride2, m[1].p_fref, m[1].i_stride[0], \
- (mv1)[0], (mv1)[1], 8, 8 ); \
- h->mc.avg[PIXEL_8x8]( pix1, 16, src2, stride2 ); \
+ (mv1)[0], (mv1)[1], 8, 8 ); \
+ h->mc.avg[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2, i_bipred_weight ); \
i_cost = penalty + h->pixf.mbcmp[PIXEL_8x8]( \
m[0].p_fenc[0], FENC_STRIDE, pix1, 16 ); \
if( i_bcost > i_cost ) \
- { \
i_bcost = i_cost; \
- SAVE_MVS( mv0, mv1 ); \
- } \
}
m[0].i_pixel = PIXEL_8x8;
@@ -123,7 +117,7 @@
if( b_bidir )
{
- int16_t *mvr = fref1->mv[0][i_mb_xy];
+ int16_t *mvr = fref1->lowres_mvs[0][p1-p0-1][i_mb_xy];
int dmv[2][2];
int mv0[2] = {0,0};
@@ -149,68 +143,79 @@
{
DECLARE_ALIGNED_4(int16_t mvc[4][2]) = {{0}};
int i_mvc = 0;
- int16_t (*fenc_mv)[2] = &fenc->mv[l][i_mb_xy];
-#define MVC(mv) { *(uint32_t*)mvc[i_mvc] = *(uint32_t*)mv; i_mvc++; }
- if( i_mb_x > 0 )
- MVC(fenc_mv[-1]);
- if( i_mb_y > 0 )
+ int16_t (*fenc_mv)[2] = fenc_mvs[l];
+
+ if( do_search[l] )
{
- MVC(fenc_mv[-i_mb_stride]);
+ /* Reverse-order MV prediction. */
+#define MVC(mv) { *(uint32_t*)mvc[i_mvc] = *(uint32_t*)mv; i_mvc++; }
if( i_mb_x < h->sps->i_mb_width - 1 )
- MVC(fenc_mv[-i_mb_stride+1]);
- if( i_mb_x > 0 )
- MVC(fenc_mv[-i_mb_stride-1]);
- }
+ MVC(fenc_mv[1]);
+ if( i_mb_y < h->sps->i_mb_height - 1 )
+ {
+ MVC(fenc_mv[i_mb_stride]);
+ if( i_mb_x > 0 )
+ MVC(fenc_mv[i_mb_stride-1]);
+ if( i_mb_x < h->sps->i_mb_width - 1 )
+ MVC(fenc_mv[i_mb_stride+1]);
+ }
#undef MVC
- x264_median_mv( m[l].mvp, mvc[0], mvc[1], mvc[2] );
- x264_me_search( h, &m[l], mvc, i_mvc );
+ x264_median_mv( m[l].mvp, mvc[0], mvc[1], mvc[2] );
+ x264_me_search( h, &m[l], mvc, i_mvc );
- m[l].cost -= 2; // remove mvcost from skip mbs
- if( *(uint32_t*)m[l].mv )
- m[l].cost += 5;
+ m[l].cost -= 2; // remove mvcost from skip mbs
+ if( *(uint32_t*)m[l].mv )
+ m[l].cost += 5;
+ *(uint32_t*)fenc_mvs[l] = *(uint32_t*)m[l].mv;
+ *fenc_costs[l] = m[l].cost;
+ }
+ else
+ {
+ *(uint32_t*)m[l].mv = *(uint32_t*)fenc_mvs[l];
+ m[l].cost = *fenc_costs[l];
+ }
i_bcost = X264_MIN( i_bcost, m[l].cost );
}
if( b_bidir && ( *(uint32_t*)m[0].mv || *(uint32_t*)m[1].mv ) )
TRY_BIDIR( m[0].mv, m[1].mv, 5 );
- if( i_bcost < i_cost_bak )
- SAVE_MVS( m[0].mv, m[1].mv );
-
- //FIXME intra part could be shared across multiple encodings of the frame
lowres_intra_mb:
- if( !b_bidir ) // forbid intra-mbs in B-frames, because it's rare and not worth checking
+ /* forbid intra-mbs in B-frames, because it's rare and not worth checking */
+ /* FIXME: Should we still forbid them now that we cache intra scores? */
+ if( !b_bidir )
{
- uint8_t *pix = &pix1[8+FDEC_STRIDE - 1];
- uint8_t *src = &fenc->lowres[0][i_pel_offset - 1];
- const int intra_penalty = 5;
- int satds[4], i_icost, b_intra;
-
- memcpy( pix-FDEC_STRIDE, src-i_stride, 17 );
- for( i=0; i<8; i++ )
- pix[i*FDEC_STRIDE] = src[i*i_stride];
- pix++;
-
- if( h->pixf.intra_satd_x3_8x8c && h->pixf.mbcmp[0] == h->pixf.satd[0] )
- {
- h->pixf.intra_satd_x3_8x8c( h->mb.pic.p_fenc[0], pix, satds );
- h->predict_8x8c[I_PRED_CHROMA_P]( pix );
- satds[I_PRED_CHROMA_P] =
- h->pixf.satd[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
- }
- else
+ int i_icost, b_intra;
+ if( !fenc->b_intra_calculated )
{
- for( i=0; i<4; i++ )
+ DECLARE_ALIGNED_16( uint8_t edge[33] );
+ uint8_t *pix = &pix1[8+FDEC_STRIDE - 1];
+ uint8_t *src = &fenc->lowres[0][i_pel_offset - 1];
+ const int intra_penalty = 5;
+ int satds[4];
+
+ memcpy( pix-FDEC_STRIDE, src-i_stride, 17 );
+ for( i=0; i<8; i++ )
+ pix[i*FDEC_STRIDE] = src[i*i_stride];
+ pix++;
+
+ if( h->pixf.intra_satd_x3_8x8c && h->pixf.mbcmp[0] == h->pixf.satd[0] )
{
- h->predict_8x8c[i]( pix );
- satds[i] = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
+ h->pixf.intra_satd_x3_8x8c( h->mb.pic.p_fenc[0], pix, satds );
+ h->predict_8x8c[I_PRED_CHROMA_P]( pix );
+ satds[I_PRED_CHROMA_P] =
+ h->pixf.satd[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
}
- }
- i_icost = X264_MIN4( satds[0], satds[1], satds[2], satds[3] );
+ else
+ {
+ for( i=0; i<4; i++ )
+ {
+ h->predict_8x8c[i]( pix );
+ satds[i] = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
+ }
+ }
+ i_icost = X264_MIN4( satds[0], satds[1], satds[2], satds[3] );
- if( i_icost < i_bcost * 2 )
- {
- DECLARE_ALIGNED_16( uint8_t edge[33] );
x264_predict_8x8_filter( pix, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
for( i=3; i<9; i++ )
{
@@ -219,9 +224,12 @@
satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
i_icost = X264_MIN( i_icost, satd );
}
- }
- i_icost += intra_penalty;
+ i_icost += intra_penalty;
+ fenc->i_intra_cost[i_mb_xy] = i_icost;
+ }
+ else
+ i_icost = fenc->i_intra_cost[i_mb_xy];
b_intra = i_icost < i_bcost;
if( b_intra )
i_bcost = i_icost;
@@ -236,18 +244,26 @@
return i_bcost;
}
#undef TRY_BIDIR
-#undef SAVE_MVS
-int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
+#define NUM_MBS\
+ (h->sps->i_mb_width > 2 && h->sps->i_mb_height > 2 ?\
+ (h->sps->i_mb_width - 2) * (h->sps->i_mb_height - 2) :\
+ h->sps->i_mb_width * h->sps->i_mb_height)
+
+static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
x264_frame_t **frames, int p0, int p1, int b,
int b_intra_penalty )
{
int i_score = 0;
+ /* Don't use the AQ'd scores for slicetype decision. */
+ int i_score_aq = 0;
+ int do_search[2];
/* Check whether we already evaluated this frame
* If we have tried this frame as P, then we have also tried
* the preceding frames as B. (is this still true?) */
- if( frames[b]->i_cost_est[b-p0][p1-b] >= 0 )
+ /* Also check that we already calculated the row SATDs for the current frame. */
+ if( frames[b]->i_cost_est[b-p0][p1-b] >= 0 && (!h->param.rc.i_vbv_buffer_size || frames[b]->i_row_satds[b-p0][p1-b][0] != -1) )
{
i_score = frames[b]->i_cost_est[b-p0][p1-b];
}
@@ -256,11 +272,11 @@
int dist_scale_factor = 128;
int *row_satd = frames[b]->i_row_satds[b-p0][p1-b];
- /* Init MVs so that we don't have to check edge conditions when loading predictors. */
- /* FIXME: not needed every time */
- memset( frames[b]->mv[0], 0, h->sps->i_mb_height * h->sps->i_mb_width * 2*sizeof(int16_t) );
- if( b != p1 )
- memset( frames[b]->mv[1], 0, h->sps->i_mb_height * h->sps->i_mb_width * 2*sizeof(int16_t) );
+ /* For each list, check to see whether we have lowres motion-searched this reference frame before. */
+ do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
+ do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
+ if( do_search[0] ) frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
+ if( do_search[1] ) frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
if( b == p1 )
{
@@ -270,50 +286,157 @@
if( p1 != p0 )
dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
+ /* Lowres lookahead goes backwards because the MVs are used as predictors in the main encode. */
+ /* This considerably improves MV prediction overall. */
+ if( h->sps->i_mb_width <= 2 || h->sps->i_mb_height <= 2 )
+ {
+ for( h->mb.i_mb_y = h->sps->i_mb_height - 1; h->mb.i_mb_y >= 0 ; h->mb.i_mb_y-- )
+ for( h->mb.i_mb_x = h->sps->i_mb_width - 1; h->mb.i_mb_x >= 0 ; h->mb.i_mb_x-- )
+ i_score += x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search );
+ }
/* the edge mbs seem to reduce the predictive quality of the
* whole frame's score, but are needed for a spatial distribution. */
- if( h->param.rc.i_vbv_buffer_size )
+ else if( h->param.rc.i_vbv_buffer_size )
{
- for( h->mb.i_mb_y = 0; h->mb.i_mb_y < h->sps->i_mb_height; h->mb.i_mb_y++ )
+ for( h->mb.i_mb_y = h->sps->i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- )
{
row_satd[ h->mb.i_mb_y ] = 0;
- for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->sps->i_mb_width; h->mb.i_mb_x++ )
+ for( h->mb.i_mb_x = h->sps->i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
{
- int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor );
- row_satd[ h->mb.i_mb_y ] += i_mb_cost;
+ int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search );
+ int i_mb_cost_aq = i_mb_cost;
+ if( h->param.rc.i_aq_mode )
+ {
+ x264_emms();
+ i_mb_cost_aq *= pow(2.0,-(frames[b]->f_qp_offset[h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride])/6.0);
+ }
+ row_satd[ h->mb.i_mb_y ] += i_mb_cost_aq;
if( h->mb.i_mb_y > 0 && h->mb.i_mb_y < h->sps->i_mb_height - 1 &&
h->mb.i_mb_x > 0 && h->mb.i_mb_x < h->sps->i_mb_width - 1 )
{
+ /* Don't use AQ-weighted costs for slicetype decision, only for ratecontrol. */
i_score += i_mb_cost;
+ i_score_aq += i_mb_cost_aq;
}
}
}
}
else
{
- for( h->mb.i_mb_y = 1; h->mb.i_mb_y < h->sps->i_mb_height - 1; h->mb.i_mb_y++ )
- for( h->mb.i_mb_x = 1; h->mb.i_mb_x < h->sps->i_mb_width - 1; h->mb.i_mb_x++ )
- i_score += x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor );
+ for( h->mb.i_mb_y = h->sps->i_mb_height - 2; h->mb.i_mb_y > 0; h->mb.i_mb_y-- )
+ for( h->mb.i_mb_x = h->sps->i_mb_width - 2; h->mb.i_mb_x > 0; h->mb.i_mb_x-- )
+ {
+ int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search );
+ int i_mb_cost_aq = i_mb_cost;
+ if( h->param.rc.i_aq_mode )
+ {
+ x264_emms();
+ i_mb_cost_aq *= pow(2.0,-(frames[b]->f_qp_offset[h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride])/6.0);
+ }
+ i_score += i_mb_cost;
+ i_score_aq += i_mb_cost_aq;
+ }
}
if( b != p1 )
i_score = i_score * 100 / (120 + h->param.i_bframe_bias);
+ else
+ frames[b]->b_intra_calculated = 1;
frames[b]->i_cost_est[b-p0][p1-b] = i_score;
-// fprintf( stderr, "frm %d %c(%d,%d): %6d %6d imb:%d \n", frames[b]->i_frame,
-// (p1==0?'I':b<p1?'B':'P'), b-p0, p1-b, i_score, frames[b]->i_cost_est[0][0], frames[b]->i_intra_mbs[b-p0] );
+ frames[b]->i_cost_est_aq[b-p0][p1-b] = i_score_aq;
x264_emms();
}
if( b_intra_penalty )
{
// arbitrary penalty for I-blocks after B-frames
- int nmb = (h->sps->i_mb_width - 2) * (h->sps->i_mb_height - 2);
+ int nmb = NUM_MBS;
i_score += i_score * frames[b]->i_intra_mbs[b-p0] / (nmb * 8);
}
return i_score;
}
+#define MAX_LENGTH (X264_BFRAME_MAX*4)
+
+static int x264_slicetype_path_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, char *path, int threshold )
+{
+ int loc = 1;
+ int cost = 0;
+ int cur_p = 0;
+ path--; /* Since the 1st path element is really the second frame */
+ while( path[loc] )
+ {
+ int next_p = loc;
+ int next_b;
+ /* Find the location of the next P-frame. */
+ while( path[next_p] && path[next_p] != 'P' )
+ next_p++;
+ /* Return if the path doesn't end on a P-frame. */
+ if( path[next_p] != 'P' )
+ return cost;
+
+ /* Add the cost of the P-frame found above */
+ cost += x264_slicetype_frame_cost( h, a, frames, cur_p, next_p, next_p, 0 );
+ /* Early terminate if the cost we have found is larger than the best path cost so far */
+ if( cost > threshold )
+ break;
+
+ for( next_b = loc; next_b < next_p && cost < threshold; next_b++ )
+ cost += x264_slicetype_frame_cost( h, a, frames, cur_p, next_p, next_b, 0 );
+
+ loc = next_p + 1;
+ cur_p = next_p;
+ }
+ return cost;
+}
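
x264_slicetype_path_cost above walks a candidate path string such as "BBP": each 'P' closes a mini-GOP, the P-frame is costed first against the previous P/I, then every 'B' in between is costed against the same reference pair, with early termination once the running total exceeds the best cost found so far. A rough sketch of the traversal with a stub in place of x264_slicetype_frame_cost (the stub and the printed triples are illustrative only):

    /* Sketch: which (p0, p1, b) triples a path string expands to. */
    #include <stdio.h>

    static int stub_frame_cost( int p0, int p1, int b )
    {
        printf( "frame_cost( p0=%d, p1=%d, b=%d )\n", p0, p1, b );
        return 100; /* arbitrary placeholder cost */
    }

    static int path_cost( const char *path )
    {
        /* The real code does "path--" so that path[loc] names frame loc; here
         * path[loc-1] is used instead to stay strictly inside the string. */
        int loc = 1, cost = 0, cur_p = 0;
        while( path[loc-1] )
        {
            int next_p = loc, next_b;
            while( path[next_p-1] && path[next_p-1] != 'P' )
                next_p++;
            if( path[next_p-1] != 'P' )
                return cost;                            /* path must end on a P */
            cost += stub_frame_cost( cur_p, next_p, next_p );
            for( next_b = loc; next_b < next_p; next_b++ )
                cost += stub_frame_cost( cur_p, next_p, next_b );
            loc = next_p + 1;
            cur_p = next_p;
        }
        return cost;
    }

    int main(void)
    {
        /* "BBP" -> P(0,3,3), then B(0,3,1) and B(0,3,2). */
        path_cost( "BBP" );
        return 0;
    }
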
+
+/* Viterbi/trellis slicetype decision algorithm. */
+/* Uses strings because the speed of the control functions is negligible
+   compared to the cost of running slicetype_frame_cost, and because it
+   makes debugging easier. */
+static void x264_slicetype_path( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, int max_bframes, int buffer_size, char (*best_paths)[MAX_LENGTH] )
+{
+ char paths[X264_BFRAME_MAX+2][MAX_LENGTH] = {{0}};
+ int num_paths = X264_MIN(max_bframes+1, length);
+ int suffix_size, loc, path;
+ int best_cost = COST_MAX;
+ int best_path_index = 0;
+ length = X264_MIN(length,MAX_LENGTH);
+
+ /* Iterate over all currently possible paths and add suffixes to each one */
+ for( suffix_size = 0; suffix_size < num_paths; suffix_size++ )
+ {
+ memcpy( paths[suffix_size], best_paths[length - (suffix_size + 1)], length - (suffix_size + 1) );
+ for( loc = 0; loc < suffix_size; loc++ )
+ strcat( paths[suffix_size], "B" );
+ strcat( paths[suffix_size], "P" );
+ }
+
+ /* Calculate the actual cost of each of the current paths */
+ for( path = 0; path < num_paths; path++ )
+ {
+ int cost = x264_slicetype_path_cost( h, a, frames, paths[path], best_cost );
+ if( cost < best_cost )
+ {
+ best_cost = cost;
+ best_path_index = path;
+ }
+ }
+
+ /* Store the best path. */
+ memcpy( best_paths[length], paths[best_path_index], length );
+}
+
+static int x264_slicetype_path_search( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, int bframes, int buffer )
+{
+ char best_paths[MAX_LENGTH][MAX_LENGTH] = {"","P"};
+ int n;
+ for( n = 2; n < length-1; n++ )
+ x264_slicetype_path( h, a, frames, n, bframes, buffer, best_paths );
+ return strspn( best_paths[length-2], "B" );
+}
+
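
x264_slicetype_path above grows the trellis one frame at a time: every candidate for length n is the best already-known path that is k+1 frames shorter, extended with k B-frames and a terminating P (k = 0..max_bframes), and the cheapest candidate becomes best_paths[n]. A small sketch of just the string construction, with the cost comparison faked (buffer sizes and the final pick are placeholders, not the encoder's):

    /* Sketch of the suffix construction used by the trellis above. */
    #include <stdio.h>
    #include <string.h>

    #define MAX_LEN 16

    static void build_candidates( char best_paths[][MAX_LEN], int length, int max_bframes )
    {
        char paths[MAX_LEN][MAX_LEN] = {{0}};
        int num_paths = max_bframes + 1 < length ? max_bframes + 1 : length;
        int suffix_size, loc;

        for( suffix_size = 0; suffix_size < num_paths; suffix_size++ )
        {
            /* best path that is (suffix_size+1) frames shorter ... */
            memcpy( paths[suffix_size], best_paths[length - (suffix_size + 1)],
                    length - (suffix_size + 1) );
            /* ... plus suffix_size B-frames and one closing P-frame */
            for( loc = 0; loc < suffix_size; loc++ )
                strcat( paths[suffix_size], "B" );
            strcat( paths[suffix_size], "P" );
            printf( "length %d, candidate %d: %s\n", length, suffix_size, paths[suffix_size] );
        }
        /* The real code costs each candidate with x264_slicetype_path_cost and
         * keeps the cheapest; here the last candidate is kept just to continue. */
        memcpy( best_paths[length], paths[num_paths-1], length );
    }

    int main(void)
    {
        char best_paths[MAX_LEN][MAX_LEN] = { "", "P" };
        int n;
        for( n = 2; n < 6; n++ )
            build_candidates( best_paths, n, 3 );
        return 0;
    }
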
static int scenecut( x264_t *h, x264_frame_t *frame, int pdist )
{
int icost = frame->i_cost_est[0][0];
@@ -336,15 +459,15 @@
{
f_bias = f_thresh_min
+ ( f_thresh_max - f_thresh_min )
- * ( i_gop_size - h->param.i_keyint_min )
- / ( h->param.i_keyint_max - h->param.i_keyint_min );
+ * ( i_gop_size - h->param.i_keyint_min )
+ / ( h->param.i_keyint_max - h->param.i_keyint_min ) ;
}
res = pcost >= (1.0 - f_bias) * icost;
if( res )
{
int imb = frame->i_intra_mbs[pdist];
- int pmb = (h->sps->i_mb_width - 2) * (h->sps->i_mb_height - 2) - imb;
+ int pmb = NUM_MBS - imb;
x264_log( h, X264_LOG_DEBUG, "scene cut at %d Icost:%d Pcost:%d ratio:%.4f bias:%.4f gop:%d (imb:%d pmb:%d)\n",
frame->i_frame,
icost, pcost, 1. - (double)pcost / icost,
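
The reindented block above interpolates the scenecut bias linearly between f_thresh_min and f_thresh_max according to how far the current GOP has progressed between keyint_min and keyint_max; a frame is then flagged as a scene cut when its P cost is no longer sufficiently below its I cost. A worked numeric sketch of that decision (all thresholds and costs here are made up):

    /* Worked example of the scenecut test: pcost >= (1 - bias) * icost. */
    #include <stdio.h>

    int main(void)
    {
        double f_thresh_min = 0.10, f_thresh_max = 0.40;   /* hypothetical */
        int    keyint_min = 25, keyint_max = 250;
        int    i_gop_size = 100;                           /* frames since the last IDR */
        int    icost = 40000, pcost = 31000;               /* lowres I/P frame costs */

        double f_bias = f_thresh_min
                      + ( f_thresh_max - f_thresh_min )
                      * ( i_gop_size - keyint_min )
                      / ( keyint_max - keyint_min );       /* 0.10 + 0.30*75/225 = 0.20 */

        int b_scenecut = pcost >= (1.0 - f_bias) * icost;  /* 31000 >= 32000 -> no cut */
        printf( "bias=%.2f -> %s\n", f_bias, b_scenecut ? "scene cut" : "no cut" );
        return 0;
    }
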
@@ -353,14 +476,14 @@
return res;
}
-void x264_slicetype_analyse( x264_t *h )
+static void x264_slicetype_analyse( x264_t *h )
{
x264_mb_analysis_t a;
- x264_frame_t *frames[X264_BFRAME_MAX+3] = { NULL, };
+ x264_frame_t *frames[X264_BFRAME_MAX*4+3] = { NULL, };
int num_frames;
int keyint_limit;
int j;
- int i_mb_count = (h->sps->i_mb_width - 2) * (h->sps->i_mb_height - 2);
+ int i_mb_count = NUM_MBS;
int cost1p0, cost2p0, cost1b1, cost2p1;
int idr_frame_type;
@@ -392,37 +515,65 @@
return;
}
- cost2p1 = x264_slicetype_frame_cost( h, &a, frames, 0, 2, 2, 1 );
- if( frames[2]->i_intra_mbs[2] > i_mb_count / 2 )
- goto no_b_frames;
-
- cost1b1 = x264_slicetype_frame_cost( h, &a, frames, 0, 2, 1, 0 );
- cost1p0 = x264_slicetype_frame_cost( h, &a, frames, 0, 1, 1, 0 );
- cost2p0 = x264_slicetype_frame_cost( h, &a, frames, 1, 2, 2, 0 );
-// fprintf( stderr, "PP: %d + %d <=> BP: %d + %d \n",
-// cost1p0, cost2p0, cost1b1, cost2p1 );
- if( cost1p0 + cost2p0 < cost1b1 + cost2p1 )
- goto no_b_frames;
-
-// arbitrary and untuned
-#define INTER_THRESH 300
-#define P_SENS_BIAS (50 - h->param.i_bframe_bias)
- frames[1]->i_type = X264_TYPE_B;
-
- for( j = 2; j <= X264_MIN( h->param.i_bframe, num_frames-1 ); j++ )
- {
- int pthresh = X264_MAX(INTER_THRESH - P_SENS_BIAS * (j-1), INTER_THRESH/10);
- int pcost = x264_slicetype_frame_cost( h, &a, frames, 0, j+1, j+1, 1 );
-// fprintf( stderr, "frm%d+%d: %d <=> %d, I:%d/%d \n",
-// frames[0]->i_frame, j-1, pthresh, pcost/i_mb_count,
-// frames[j+1]->i_intra_mbs[j+1], i_mb_count );
- if( pcost > pthresh*i_mb_count || frames[j+1]->i_intra_mbs[j+1] > i_mb_count/3 )
+ if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
+ {
+ int num_bframes;
+ int max_bframes = X264_MIN(num_frames-1, h->param.i_bframe);
+ if( h->param.b_pre_scenecut )
{
- frames[j]->i_type = X264_TYPE_P;
- break;
+ x264_slicetype_frame_cost( h, &a, frames, 0, 1, 1, 0 );
+ if( scenecut( h, frames[1], 1 ) )
+ {
+ frames[1]->i_type = idr_frame_type;
+ return;
+ }
}
- else
+ num_bframes = x264_slicetype_path_search( h, &a, frames, num_frames, max_bframes, num_frames-max_bframes );
+ assert(num_bframes < num_frames);
+
+ for( j = 1; j < num_bframes+1; j++ )
+ {
+ if( h->param.b_pre_scenecut && scenecut( h, frames[j+1], j+1 ) )
+ {
+ frames[j]->i_type = X264_TYPE_P;
+ frames[j+1]->i_type = idr_frame_type;
+ return;
+ }
frames[j]->i_type = X264_TYPE_B;
+ }
+ frames[num_bframes+1]->i_type = X264_TYPE_P;
+ }
+ else
+ {
+ cost2p1 = x264_slicetype_frame_cost( h, &a, frames, 0, 2, 2, 1 );
+ if( frames[2]->i_intra_mbs[2] > i_mb_count / 2 )
+ goto no_b_frames;
+
+ cost1b1 = x264_slicetype_frame_cost( h, &a, frames, 0, 2, 1, 0 );
+ cost1p0 = x264_slicetype_frame_cost( h, &a, frames, 0, 1, 1, 0 );
+ cost2p0 = x264_slicetype_frame_cost( h, &a, frames, 1, 2, 2, 0 );
+
+ if( cost1p0 + cost2p0 < cost1b1 + cost2p1 )
+ goto no_b_frames;
+
+ // arbitrary and untuned
+ #define INTER_THRESH 300
+ #define P_SENS_BIAS (50 - h->param.i_bframe_bias)
+ frames[1]->i_type = X264_TYPE_B;
+
+ for( j = 2; j <= X264_MIN( h->param.i_bframe, num_frames-1 ); j++ )
+ {
+ int pthresh = X264_MAX(INTER_THRESH - P_SENS_BIAS * (j-1), INTER_THRESH/10);
+ int pcost = x264_slicetype_frame_cost( h, &a, frames, 0, j+1, j+1, 1 );
+
+ if( pcost > pthresh*i_mb_count || frames[j+1]->i_intra_mbs[j+1] > i_mb_count/3 )
+ {
+ frames[j]->i_type = X264_TYPE_P;
+ break;
+ }
+ else
+ frames[j]->i_type = X264_TYPE_B;
+ }
}
}
@@ -442,7 +593,7 @@
h->frames.next[i]->i_type =
x264_ratecontrol_slice_type( h, h->frames.next[i]->i_frame );
}
- else if( (h->param.i_bframe && h->param.b_bframe_adaptive)
+ else if( (h->param.i_bframe && h->param.i_bframe_adaptive)
|| h->param.b_pre_scenecut )
x264_slicetype_analyse( h );
@@ -492,7 +643,7 @@
int x264_rc_analyse_slice( x264_t *h )
{
x264_mb_analysis_t a;
- x264_frame_t *frames[X264_BFRAME_MAX+2] = { NULL, };
+ x264_frame_t *frames[X264_BFRAME_MAX*4+2] = { NULL, };
int p0=0, p1, b;
int cost;
@@ -520,6 +671,11 @@
frames[b] = h->fenc;
cost = x264_slicetype_frame_cost( h, &a, frames, p0, p1, b, 0 );
+
+ /* In AQ, use the weighted score instead. */
+ if( h->param.rc.i_aq_mode )
+ cost = frames[b]->i_cost_est[b-p0][p1-b];
+
h->fenc->i_row_satd = h->fenc->i_row_satds[b-p0][p1-b];
h->fdec->i_row_satd = h->fdec->i_row_satds[b-p0][p1-b];
h->fdec->i_satd = cost;
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/gtk/Makefile
^
|
@@ -93,7 +93,7 @@
$(SONAMEGTK): $(OBJECTS_LIB)
@echo " L: $(@F)"
- @$(CC) -shared -o $@ $(OBJECTS_LIB) -Wl,-soname,$(SONAMEGTK) $(LDFLAGS)
+ @$(CC) -shared -o $@ $(OBJECTS_LIB) $(SOFLAGS) $(LDFLAGS)
# Program : test
$(TEST_BIN): $(OBJECTS_LIB) $(OBJECTS_TEST)
@@ -142,8 +142,8 @@
@install -d "$(DESTDIR)$(libdir)"
@echo " I: $(DESTDIR)$(libdir)/libx264gtk.a"
@install -m 644 libx264gtk.a "$(DESTDIR)$(libdir)"
- @echo " I: $(DESTDIR)$(libdir)/libx264gtk.so"
- @$(if $(SONAMEGTK), ln -sf $(SONAMEGTK) $(DESTDIR)$(libdir)/libx264gtk.so)
+ @echo " I: $(DESTDIR)$(libdir)/libx264gtk.$(SOSUFFIX)"
+ @$(if $(SONAMEGTK), ln -sf $(SONAMEGTK) $(DESTDIR)$(libdir)/libx264gtk.$(SOSUFFIX))
@$(if $(SONAMEGTK), install -m 755 $(SONAMEGTK) $(DESTDIR)$(libdir))
@echo " D: $(DESTDIR)$(bindir)"
@install -d "$(DESTDIR)$(bindir)"
@@ -172,9 +172,9 @@
@rm -f "$(DESTDIR)$(includedir)/x264_gtk_enum.h"
@echo " U: $(DESTDIR)$(libdir)/libx264gtk.a"
@rm -f "$(DESTDIR)$(libdir)/libx264gtk.a"
- @echo " U: $(DESTDIR)$(libdir)/libx264gtk.so"
+ @echo " U: $(DESTDIR)$(libdir)/$(SONAMEGTK)"
@$(if $(SONAMEGTK), rm -f "$(DESTDIR)$(libdir)/$(SONAMEGTK)")
- @rm -f "$(DESTDIR)$(libdir)/libx264gtk.so"
+ @rm -f "$(DESTDIR)$(libdir)/libx264gtk.$(SOSUFFIX)"
@echo " U: $(DESTDIR)$(bindir)/$(ENCODE_BIN)"
@rm -f "$(DESTDIR)$(bindir)/$(ENCODE_BIN)"
@echo " U: $(DESTDIR)${datadir}/x264"
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/gtk/x264_gtk.c
^
|
@@ -115,7 +115,7 @@
param->b_bframe_pyramid = x264_gtk->bframe_pyramid && x264_gtk->bframe;
param->analyse.b_bidir_me = x264_gtk->bidir_me;
- param->b_bframe_adaptive = x264_gtk->bframe_adaptive;
+ param->i_bframe_adaptive = x264_gtk->bframe_adaptive;
param->analyse.b_weighted_bipred = x264_gtk->weighted_bipred;
param->i_bframe = x264_gtk->bframe;
param->i_bframe_bias = x264_gtk->bframe_bias;
@@ -470,7 +470,7 @@
gtk_toggle_button_set_active (GTK_TOGGLE_BUTTON (config->mb.bframes.bframe_pyramid), param.b_bframe_pyramid);
gtk_toggle_button_set_active (GTK_TOGGLE_BUTTON (config->mb.bframes.bidir_me), param.analyse.b_bidir_me);
- gtk_toggle_button_set_active (GTK_TOGGLE_BUTTON (config->mb.bframes.bframe_adaptive), param.b_bframe_adaptive);
+ gtk_toggle_button_set_active (GTK_TOGGLE_BUTTON (config->mb.bframes.bframe_adaptive), param.i_bframe_adaptive);
gtk_toggle_button_set_active (GTK_TOGGLE_BUTTON (config->mb.bframes.weighted_bipred), param.analyse.b_weighted_bipred);
g_snprintf (buf, 64, "%d", param.i_bframe);
gtk_entry_set_text (GTK_ENTRY (config->mb.bframes.bframe), buf);
@@ -602,7 +602,7 @@
x264_gtk->bframe_pyramid = param.b_bframe_pyramid;
x264_gtk->bidir_me = param.analyse.b_bidir_me;
- x264_gtk->bframe_adaptive = param.b_bframe_adaptive;
+ x264_gtk->bframe_adaptive = param.i_bframe_adaptive;
x264_gtk->weighted_bipred = param.analyse.b_weighted_bipred;
x264_gtk->bframe = param.i_bframe;
x264_gtk->bframe_bias = param.i_bframe_bias;
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/gtk/x264_gtk.h
^
|
@@ -76,7 +76,7 @@
gint threads;
guint trellis;
gint noise_reduction;
-
+
gint strength;
gint threshold;
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/gtk/x264_gtk_encode_main_window.c
^
|
@@ -753,7 +753,7 @@
gtk_entry_set_text (GTK_ENTRY (thread_data->video_rendering_rate),
str);
- snprintf (str, 128, "%lld:%02lld:%02lld",
+ snprintf (str, 128, "%" PRId64 ":%02" PRId64 ":%02" PRId64,
(pipe_data.elapsed / 1000000) / 3600,
((pipe_data.elapsed / 1000000) / 60) % 60,
(pipe_data.elapsed / 1000000) % 60);
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/gtk/x264_gtk_encode_private.h
^
|
@@ -54,7 +54,7 @@
GIOChannel *io_write; /* use it with write */
};
-struct X264_Pipe_Data_
+struct X264_Pipe_Data_
{
int frame;
int frame_total;
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/matroska.c
^
|
@@ -407,7 +407,7 @@
return 0;
}
-int mk_flushFrame(mk_Writer *w) {
+static int mk_flushFrame(mk_Writer *w) {
int64_t delta, ref = 0;
unsigned fsize, bgsize;
unsigned char c_delta_flags[3];
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/muxers.c
^
|
@@ -216,7 +216,8 @@
tokstart = strchr(tokstart, 0x20);
break;
case 'A': /* Pixel aspect - 0:0 if unknown */
- if( sscanf(tokstart, "%d:%d", &n, &d) == 2 && n && d )
+ /* Don't override the aspect ratio if sar has been explicitly set on the commandline. */
+ if( sscanf(tokstart, "%d:%d", &n, &d) == 2 && n && d && !p_param->vui.i_sar_width && !p_param->vui.i_sar_height )
{
x264_reduce_fraction( &n, &d );
p_param->vui.i_sar_width = n;
@@ -285,7 +286,7 @@
/* Read frame header - without terminating '\n' */
if (fread(header, 1, slen, h->fh) != slen)
return -1;
-
+
header[slen] = 0;
if (strncmp(header, Y4M_FRAME_MAGIC, slen))
{
@@ -293,7 +294,7 @@
*((uint32_t*)header), header);
return -1;
}
-
+
/* Skip most of it */
while (i<MAX_FRAME_HEADER && fgetc(h->fh) != '\n')
i++;
@@ -426,6 +427,7 @@
x264_pthread_t tid;
int next_frame;
int frame_total;
+ int in_progress;
struct thread_input_arg_t *next_args;
} thread_input_t;
@@ -443,6 +445,7 @@
h->p_read_frame = p_read_frame;
h->p_close_infile = p_close_infile;
h->p_handle = *p_handle;
+ h->in_progress = 0;
h->next_frame = -1;
h->next_args = malloc(sizeof(thread_input_arg_t));
h->next_args->h = h;
@@ -459,7 +462,7 @@
return h->frame_total;
}
-void read_frame_thread_int( thread_input_arg_t *i )
+static void read_frame_thread_int( thread_input_arg_t *i )
{
i->status = i->h->p_read_frame( i->pic, i->h->p_handle, i->i_frame );
}
@@ -474,6 +477,7 @@
{
x264_pthread_join( h->tid, &stuff );
ret |= h->next_args->status;
+ h->in_progress = 0;
}
if( h->next_frame == i_frame )
@@ -491,6 +495,7 @@
h->next_args->i_frame = i_frame+1;
h->next_args->pic = &h->pic;
x264_pthread_create( &h->tid, NULL, (void*)read_frame_thread_int, h->next_args );
+ h->in_progress = 1;
}
else
h->next_frame = -1;
@@ -503,7 +508,8 @@
thread_input_t *h = handle;
h->p_close_infile( h->p_handle );
x264_picture_clean( &h->pic );
- x264_pthread_join( h->tid, NULL );
+ if( h->in_progress )
+ x264_pthread_join( h->tid, NULL );
free( h->next_args );
free( h );
return 0;
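
The muxers.c hunks above add an in_progress flag to the threaded input wrapper: it is set only when a read-ahead thread has actually been created, cleared after every join, and checked before the final join when the input is closed, so pthread_join is never called on a thread id that was never initialized. A minimal sketch of the same pattern, independent of the x264 wrappers (all names below are illustrative):

    /* Sketch: only join a read-ahead thread if one was actually started. */
    #include <pthread.h>

    typedef struct
    {
        pthread_t tid;
        int       in_progress;   /* 1 while a prefetch thread is outstanding */
    } reader_t;

    static void *read_ahead( void *arg )
    {
        (void)arg;               /* pretend to prefetch the next frame here */
        return NULL;
    }

    static void maybe_prefetch( reader_t *r )
    {
        if( !pthread_create( &r->tid, NULL, read_ahead, NULL ) )
            r->in_progress = 1;
    }

    static void close_reader( reader_t *r )
    {
        if( r->in_progress )     /* without the flag this could join a garbage tid */
            pthread_join( r->tid, NULL );
        r->in_progress = 0;
    }

    int main(void)
    {
        reader_t r = { .in_progress = 0 };
        close_reader( &r );      /* safe: no thread was ever started */
        maybe_prefetch( &r );
        close_reader( &r );
        return 0;
    }
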
@@ -563,7 +569,7 @@
} mp4_t;
-void recompute_bitrate_mp4(GF_ISOFile *p_file, int i_track)
+static void recompute_bitrate_mp4(GF_ISOFile *p_file, int i_track)
{
u32 i, count, di, timescale, time_wnd, rate;
u64 offset;
@@ -806,7 +812,7 @@
char b_writing_frame;
} mkv_t;
-int write_header_mkv( mkv_t *p_mkv )
+static int write_header_mkv( mkv_t *p_mkv )
{
int ret;
uint8_t *avcC;
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/tools/avc2avi.c
^
|
@@ -499,7 +499,7 @@
/* skip i_offset_for_top_to_bottom_field */
bs_read_se( &s );
/* read i_num_ref_frames_in_poc_cycle */
- i_cycle = bs_read_ue( &s );
+ i_cycle = bs_read_ue( &s );
if( i_cycle > 256 ) i_cycle = 256;
while( i_cycle > 0 )
{
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/tools/checkasm-a.asm
^
|
@@ -61,7 +61,6 @@
or r3, r5
jz .ok
mov r3, eax
- picgetgot r1
lea r1, [error_message GLOBAL]
push r1
xor eax, eax
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/tools/checkasm.c
^
|
@@ -100,12 +100,12 @@
return &benchs[i].vers[j];
}
-int cmp_nop( const void *a, const void *b )
+static int cmp_nop( const void *a, const void *b )
{
return *(uint16_t*)a - *(uint16_t*)b;
}
-int cmp_bench( const void *a, const void *b )
+static int cmp_bench( const void *a, const void *b )
{
// asciibetical sort except preserving numbers
const char *sa = ((bench_func_t*)a)->name;
@@ -258,6 +258,7 @@
report( "pixel " #name " :" );
TEST_PIXEL( sad, 0 );
+ TEST_PIXEL( sad_aligned, 1 );
TEST_PIXEL( ssd, 1 );
TEST_PIXEL( satd, 0 );
TEST_PIXEL( sa8d, 0 );
@@ -302,7 +303,45 @@
TEST_PIXEL_X(3);
TEST_PIXEL_X(4);
-#define TEST_INTRA_SATD( name, pred, satd, i8x8, ... ) \
+#define TEST_PIXEL_VAR( i ) \
+ if( pixel_asm.var[i] != pixel_ref.var[i] ) \
+ { \
+ uint32_t res_c, res_asm; \
+ uint32_t sad_c, sad_asm; \
+ set_func_name( "%s_%s", "var", pixel_names[i] ); \
+ used_asm = 1; \
+ res_c = call_c( pixel_c.var[i], buf1, 16, &sad_c ); \
+ res_asm = call_a( pixel_asm.var[i], buf1, 16, &sad_asm ); \
+ if( (res_c != res_asm) || (sad_c != sad_asm) ) \
+ { \
+ ok = 0; \
+ fprintf( stderr, "var[%d]: %d,%d != %d,%d [FAILED]\n", i, res_c, sad_c, res_asm, sad_asm ); \
+ } \
+ }
+
+ ok = 1; used_asm = 0;
+ TEST_PIXEL_VAR( PIXEL_16x16 );
+ TEST_PIXEL_VAR( PIXEL_8x8 );
+ report( "pixel var :" );
+
+ for( i=0, ok=1, used_asm=0; i<4; i++ )
+ if( pixel_asm.hadamard_ac[i] != pixel_ref.hadamard_ac[i] )
+ {
+ set_func_name( "hadamard_ac_%s", pixel_names[i] );
+ used_asm = 1;
+ uint64_t rc = pixel_c.hadamard_ac[i]( buf1, 16 );
+ uint64_t ra = pixel_asm.hadamard_ac[i]( buf1, 16 );
+ if( rc != ra )
+ {
+ ok = 0;
+ fprintf( stderr, "hadamard_ac[%d]: %d,%d != %d,%d\n", i, (int)rc, (int)(rc>>32), (int)ra, (int)(ra>>32) );
+ }
+ call_c2( pixel_c.hadamard_ac[i], buf1, 16 );
+ call_a2( pixel_asm.hadamard_ac[i], buf1, 16 );
+ }
+ report( "pixel hadamard_ac :" );
+
+#define TEST_INTRA_MBCMP( name, pred, satd, i8x8, ... ) \
if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
{ \
int res_c[3], res_asm[3]; \
@@ -311,10 +350,10 @@
memcpy( buf3, buf2, 1024 ); \
for( i=0; i<3; i++ ) \
{ \
- pred[i]( buf3+40, ##__VA_ARGS__ ); \
- res_c[i] = pixel_c.satd( buf1+40, 16, buf3+40, 32 ); \
+ pred[i]( buf3+48, ##__VA_ARGS__ ); \
+ res_c[i] = pixel_c.satd( buf1+48, 16, buf3+48, 32 ); \
} \
- call_a( pixel_asm.name, buf1+40, i8x8 ? edge : buf3+40, res_asm ); \
+ call_a( pixel_asm.name, buf1+48, i8x8 ? edge : buf3+48, res_asm ); \
if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
{ \
ok = 0; \
@@ -325,11 +364,13 @@
}
ok = 1; used_asm = 0;
- TEST_INTRA_SATD( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16], 0 );
- TEST_INTRA_SATD( intra_satd_x3_8x8c, predict_8x8c, satd[PIXEL_8x8], 0 );
- TEST_INTRA_SATD( intra_satd_x3_4x4, predict_4x4, satd[PIXEL_4x4], 0 );
- TEST_INTRA_SATD( intra_sa8d_x3_8x8, predict_8x8, sa8d[PIXEL_8x8], 1, edge );
+ TEST_INTRA_MBCMP( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16], 0 );
+ TEST_INTRA_MBCMP( intra_satd_x3_8x8c , predict_8x8c , satd[PIXEL_8x8] , 0 );
+ TEST_INTRA_MBCMP( intra_satd_x3_4x4 , predict_4x4 , satd[PIXEL_4x4] , 0 );
+ TEST_INTRA_MBCMP( intra_sa8d_x3_8x8 , predict_8x8 , sa8d[PIXEL_8x8] , 1, edge );
report( "intra satd_x3 :" );
+ TEST_INTRA_MBCMP( intra_sad_x3_16x16 , predict_16x16, sad [PIXEL_16x16], 0 );
+ report( "intra sad_x3 :" );
if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core ||
pixel_asm.ssim_end4 != pixel_ref.ssim_end4 )
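
Among the checkasm additions above, the hadamard_ac test compares the C and assembly versions by value: each call returns two 32-bit results packed into a single uint64_t, which is why a mismatch is reported as separate low and high halves. A tiny sketch of that packing convention with a dummy function and hypothetical values (the meaning of the two halves is an assumption here):

    /* Sketch of the packed 64-bit result convention checked above. */
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t dummy_hadamard_ac( uint32_t sum_lo, uint32_t sum_hi )
    {
        /* hypothetical packing: low 32 bits and high 32 bits carry two sums */
        return ( (uint64_t)sum_hi << 32 ) | sum_lo;
    }

    int main(void)
    {
        uint64_t rc = dummy_hadamard_ac( 12345, 678 );
        /* mirrors the failure message: low half first, then the high half */
        printf( "low=%d high=%d\n", (int)rc, (int)(rc >> 32) );
        return 0;
    }
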
@@ -568,6 +609,7 @@
{ \
set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
used_asm = 1; \
+ memcpy(dct, buf1, size*sizeof(int16_t));\
call_c( zigzag_c.name, t1, dct ); \
call_a( zigzag_asm.name, t2, dct ); \
if( memcmp( t1, t2, size*sizeof(int16_t) ) ) \
@@ -729,31 +771,29 @@
#undef MC_TEST_LUMA
#undef MC_TEST_CHROMA
-#define MC_TEST_AVG( name, ... ) \
+#define MC_TEST_AVG( name, weight ) \
for( i = 0, ok = 1, used_asm = 0; i < 10; i++ ) \
{ \
- memcpy( buf3, buf1, 1024 ); \
- memcpy( buf4, buf1, 1024 ); \
+ memcpy( buf3, buf1+320, 320 ); \
+ memcpy( buf4, buf1+320, 320 ); \
if( mc_a.name[i] != mc_ref.name[i] ) \
{ \
set_func_name( "%s_%s", #name, pixel_names[i] );\
used_asm = 1; \
- call_c1( mc_c.name[i], buf3, 32, buf2, 16, ##__VA_ARGS__ ); \
- call_a1( mc_a.name[i], buf4, 32, buf2, 16, ##__VA_ARGS__ ); \
- if( memcmp( buf3, buf4, 1024 ) ) \
+ call_c1( mc_c.name[i], buf3, 16, buf2+1, 16, buf1+18, 16, weight ); \
+ call_a1( mc_a.name[i], buf4, 16, buf2+1, 16, buf1+18, 16, weight ); \
+ if( memcmp( buf3, buf4, 320 ) ) \
{ \
ok = 0; \
fprintf( stderr, #name "[%d]: [FAILED]\n", i ); \
} \
- call_c2( mc_c.name[i], buf3, 32, buf2, 16, ##__VA_ARGS__ ); \
- call_a2( mc_a.name[i], buf4, 32, buf2, 16, ##__VA_ARGS__ ); \
+ call_c2( mc_c.name[i], buf3, 16, buf2+1, 16, buf1+18, 16, weight ); \
+ call_a2( mc_a.name[i], buf4, 16, buf2+1, 16, buf1+18, 16, weight ); \
} \
}
- MC_TEST_AVG( avg );
- report( "mc avg :" );
ok = 1; used_asm = 0;
- for( w = -64; w <= 128 && ok; w++ )
- MC_TEST_AVG( avg_weight, w );
+ for( w = -63; w <= 127 && ok; w++ )
+ MC_TEST_AVG( avg, w );
report( "mc wpredb :" );
if( mc_a.hpel_filter != mc_ref.hpel_filter )
@@ -1048,7 +1088,7 @@
report( "dequant :" );
- if( qf_a.denoise_dct_core != qf_ref.denoise_dct_core )
+ if( qf_a.denoise_dct != qf_ref.denoise_dct )
{
int size;
for( size = 16; size <= 64; size += 48 )
@@ -1058,12 +1098,12 @@
memcpy(dct1, buf1, size*2);
memcpy(dct2, buf1, size*2);
memcpy(buf3+256, buf3, 256);
- call_c1( qf_c.denoise_dct_core, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size );
- call_a1( qf_a.denoise_dct_core, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size );
+ call_c1( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size );
+ call_a1( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size );
if( memcmp( dct1, dct2, size*2 ) || memcmp( buf3+4, buf3+256+4, (size-1)*sizeof(uint32_t) ) )
ok = 0;
- call_c2( qf_c.denoise_dct_core, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size );
- call_a2( qf_a.denoise_dct_core, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size );
+ call_c2( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size );
+ call_a2( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size );
}
}
report( "denoise dct :" );
@@ -1178,7 +1218,7 @@
return ret;
}
-int check_all_funcs( int cpu_ref, int cpu_new )
+static int check_all_funcs( int cpu_ref, int cpu_new )
{
return check_pixel( cpu_ref, cpu_new )
+ check_dct( cpu_ref, cpu_new )
@@ -1189,7 +1229,7 @@
+ check_cabac( cpu_ref, cpu_new );
}
-int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
+static int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
{
*cpu_ref = *cpu_new;
*cpu_new |= flags;
@@ -1200,7 +1240,7 @@
return check_all_funcs( *cpu_ref, *cpu_new );
}
-int check_all_flags( void )
+static int check_all_flags( void )
{
int ret = 0;
int cpu0 = 0, cpu1 = 0;
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/x264.c
^
|
@@ -37,6 +37,12 @@
#include "config.h"
#endif
+#ifdef _WIN32
+#include <windows.h>
+#else
+#define SetConsoleTitle(t)
+#endif
+
uint8_t *mux_buffer = NULL;
int mux_buffer_size = 0;
@@ -163,7 +169,11 @@
H1( " --pre-scenecut Faster, less precise scenecut detection.\n"
" Required and implied by multi-threading.\n" );
H0( " -b, --bframes <integer> Number of B-frames between I and P [%d]\n", defaults->i_bframe );
- H1( " --no-b-adapt Disable adaptive B-frame decision\n" );
+ H1( " --b-adapt Adaptive B-frame decision method [%d]\n"
+ " Higher values may lower threading efficiency.\n"
+ " - 0: Disabled\n"
+ " - 1: Fast\n"
+ " - 2: Optimal (slow with high --bframes)\n", defaults->i_bframe_adaptive );
H1( " --b-bias <integer> Influences how often B-frames are used [%d]\n", defaults->i_bframe_bias );
H0( " --b-pyramid Keep some B-frames as references\n" );
H0( " --no-cabac Disable CABAC\n" );
@@ -188,10 +198,9 @@
H0( " --ipratio <float> QP factor between I and P [%.2f]\n", defaults->rc.f_ip_factor );
H0( " --pbratio <float> QP factor between P and B [%.2f]\n", defaults->rc.f_pb_factor );
H1( " --chroma-qp-offset <integer> QP difference between chroma and luma [%d]\n", defaults->analyse.i_chroma_qp_offset );
- H0( " --aq-mode <integer> How AQ distributes bits [%d]\n"
+ H1( " --aq-mode <integer> AQ method [%d]\n"
" - 0: Disabled\n"
- " - 1: Avoid moving bits between frames\n"
- " - 2: Move bits between frames\n", defaults->rc.i_aq_mode );
+ " - 1: Variance AQ (complexity mask)\n", defaults->rc.i_aq_mode );
H0( " --aq-strength <float> Reduces blocking and blurring in flat and\n"
" textured areas. [%.1f]\n"
" - 0.5: weak AQ\n"
@@ -202,7 +211,6 @@
" - 2: Last pass, does not overwrite stats file\n"
" - 3: Nth pass, overwrites stats file\n" );
H0( " --stats <string> Filename for 2 pass stats [\"%s\"]\n", defaults->rc.psz_stat_out );
- H1( " --rceq <string> Ratecontrol equation [\"%s\"]\n", defaults->rc.psz_rc_eq );
H0( " --qcomp <float> QP curve compression: 0.0 => CBR, 1.0 => CQP [%.2f]\n", defaults->rc.f_qcompress );
H1( " --cplxblur <float> Reduce fluctuations in QP (before curve compression) [%.1f]\n", defaults->rc.f_complexity_blur );
H1( " --qblur <float> Reduce fluctuations in QP (after curve compression) [%.1f]\n", defaults->rc.f_qblur );
@@ -243,6 +251,10 @@
H0( " -m, --subme <integer> Subpixel motion estimation and partition\n"
" decision quality: 1=fast, 7=best. [%d]\n", defaults->analyse.i_subpel_refine );
H0( " --b-rdo RD based mode decision for B-frames. Requires subme 6.\n" );
+ H0( " --psy-rd Strength of psychovisual optimization [\"%.1f:%.1f\"]\n"
+ " #1: RDO (requires subme>=6)\n"
+ " #2: Trellis (requires trellis, experimental)\n",
+ defaults->analyse.f_psy_rd, defaults->analyse.f_psy_trellis );
H0( " --mixed-refs Decide references on a per partition basis\n" );
H1( " --no-chroma-me Ignore chroma in motion estimation\n" );
H1( " --bime Jointly optimize both MVs in B-frames\n" );
@@ -375,6 +387,7 @@
{ "version", no_argument, NULL, 'V' },
{ "bitrate", required_argument, NULL, 'B' },
{ "bframes", required_argument, NULL, 'b' },
+ { "b-adapt", required_argument, NULL, 0 },
{ "no-b-adapt", no_argument, NULL, 0 },
{ "b-bias", required_argument, NULL, 0 },
{ "b-pyramid", no_argument, NULL, 0 },
@@ -411,6 +424,7 @@
{ "mvrange", required_argument, NULL, 0 },
{ "mvrange-thread", required_argument, NULL, 0 },
{ "subme", required_argument, NULL, 'm' },
+ { "psy-rd", required_argument, NULL, 0 },
{ "b-rdo", no_argument, NULL, 0 },
{ "mixed-refs", no_argument, NULL, 0 },
{ "no-chroma-me", no_argument, NULL, 0 },
@@ -433,7 +447,6 @@
{ "chroma-qp-offset", required_argument, NULL, 0 },
{ "pass", required_argument, NULL, 'p' },
{ "stats", required_argument, NULL, 0 },
- { "rceq", required_argument, NULL, 0 },
{ "qcomp", required_argument, NULL, 0 },
{ "qblur", required_argument, NULL, 0 },
{ "cplxblur",required_argument, NULL, 0 },
@@ -541,7 +554,7 @@
return -1;
}
param->i_scenecut_threshold = -1;
- param->b_bframe_adaptive = 0;
+ param->i_bframe_adaptive = X264_B_ADAPT_NONE;
break;
case OPT_THREAD_INPUT:
b_thread_input = 1;
@@ -632,7 +645,7 @@
sscanf( argv[optind++], "%ux%u", &param->i_width, &param->i_height );
}
}
-
+
if( !(b_avis || b_y4m) && ( !param->i_width || !param->i_height ) )
{
fprintf( stderr, "x264 [error]: Rawyuv input requires a resolution.\n" );
@@ -772,14 +785,17 @@
int64_t i_start, i_end;
int64_t i_file;
int i_frame_size;
- int i_progress;
+ int i_update_interval;
+ char buf[200];
+ opt->b_progress &= param->i_log_level < X264_LOG_DEBUG;
i_frame_total = p_get_frame_total( opt->hin );
i_frame_total -= opt->i_seek;
if( ( i_frame_total == 0 || param->i_frame_total < i_frame_total )
&& param->i_frame_total > 0 )
i_frame_total = param->i_frame_total;
param->i_frame_total = i_frame_total;
+ i_update_interval = i_frame_total ? x264_clip3( i_frame_total / 1000, 1, 10 ) : 10;
if( ( h = x264_encoder_open( param ) ) == NULL )
{
@@ -802,8 +818,7 @@
i_start = x264_mdate();
/* Encode frames */
- for( i_frame = 0, i_file = 0, i_progress = 0;
- b_ctrl_c == 0 && (i_frame < i_frame_total || i_frame_total == 0); )
+ for( i_frame = 0, i_file = 0; b_ctrl_c == 0 && (i_frame < i_frame_total || i_frame_total == 0); )
{
if( p_read_frame( &pic, opt->hin, i_frame + opt->i_seek ) )
break;
@@ -824,22 +839,24 @@
i_frame++;
/* update status line (up to 1000 times per input file) */
- if( opt->b_progress && param->i_log_level < X264_LOG_DEBUG &&
- ( i_frame_total ? i_frame * 1000 / i_frame_total > i_progress
- : i_frame % 10 == 0 ) )
+ if( opt->b_progress && i_frame % i_update_interval == 0 )
{
int64_t i_elapsed = x264_mdate() - i_start;
double fps = i_elapsed > 0 ? i_frame * 1000000. / i_elapsed : 0;
+ double bitrate = (double) i_file * 8 * param->i_fps_num / ( (double) param->i_fps_den * i_frame * 1000 );
if( i_frame_total )
{
int eta = i_elapsed * (i_frame_total - i_frame) / ((int64_t)i_frame * 1000000);
- i_progress = i_frame * 1000 / i_frame_total;
- fprintf( stderr, "encoded frames: %d/%d (%.1f%%), %.2f fps, eta %d:%02d:%02d \r",
- i_frame, i_frame_total, (float)i_progress / 10, fps,
+ sprintf( buf, "x264 [%.1f%%] %d/%d frames, %.2f fps, %.2f kb/s, eta %d:%02d:%02d",
+ 100. * i_frame / i_frame_total, i_frame, i_frame_total, fps, bitrate,
eta/3600, (eta/60)%60, eta%60 );
}
else
- fprintf( stderr, "encoded frames: %d, %.2f fps \r", i_frame, fps );
+ {
+ sprintf( buf, "x264 %d frames: %.2f fps, %.2f kb/s", i_frame, fps, bitrate );
+ }
+ fprintf( stderr, "%s \r", buf+5 );
+ SetConsoleTitle( buf );
fflush( stderr ); // needed in windows
}
}
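
The x264.c hunk above replaces the per-mille progress counter with a fixed refresh interval, clamped between 1 and 10 frames depending on the clip length, and adds an average-bitrate figure derived from the bytes muxed so far and the stream's frame rate. A hedged sketch of the two computations with made-up numbers:

    /* Sketch of the progress interval and bitrate math used above. */
    #include <stdint.h>
    #include <stdio.h>

    static int clip3( int v, int lo, int hi )
    {
        return v < lo ? lo : v > hi ? hi : v;
    }

    int main(void)
    {
        int     i_frame_total = 500;        /* frames in the input, if known */
        int64_t i_file        = 2500000;    /* bytes written so far */
        int     i_frame       = 200;        /* frames encoded so far */
        int     fps_num = 25, fps_den = 1;

        /* refresh the status line every 1..10 frames */
        int i_update_interval = i_frame_total ? clip3( i_frame_total / 1000, 1, 10 ) : 10;

        /* average bitrate in kbit/s: bytes -> bits, scaled by the frame rate */
        double bitrate = (double)i_file * 8 * fps_num / ( (double)fps_den * i_frame * 1000 );

        printf( "interval=%d frames, %.2f kb/s\n", i_update_interval, bitrate );
        return 0;
    }
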
@@ -851,6 +868,9 @@
i_end = x264_mdate();
x264_picture_clean( &pic );
+ /* Erase progress indicator before printing encoding stats. */
+ if( opt->b_progress )
+ fprintf( stderr, " \r" );
x264_encoder_close( h );
x264_free( mux_buffer );
fprintf( stderr, "\n" );
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/x264.h
^
|
@@ -35,7 +35,7 @@
#include <stdarg.h>
-#define X264_BUILD 60
+#define X264_BUILD 64
/* x264_t:
* opaque handler for encoder */
@@ -85,8 +85,10 @@
#define X264_RC_CRF 1
#define X264_RC_ABR 2
#define X264_AQ_NONE 0
-#define X264_AQ_LOCAL 1
-#define X264_AQ_GLOBAL 2
+#define X264_AQ_VARIANCE 1
+#define X264_B_ADAPT_NONE 0
+#define X264_B_ADAPT_FAST 1
+#define X264_B_ADAPT_TRELLIS 2
static const char * const x264_direct_pred_names[] = { "none", "spatial", "temporal", "auto", 0 };
static const char * const x264_motion_est_names[] = { "dia", "hex", "umh", "esa", "tesa", 0 };
@@ -155,7 +157,7 @@
int i_width;
int i_height;
int i_csp; /* CSP of encoded bitstream, only i420 supported */
- int i_level_idc;
+ int i_level_idc;
int i_frame_total; /* number of frames to encode if known, else 0 */
struct
@@ -165,7 +167,7 @@
int i_sar_width;
int i_overscan; /* 0=undef, 1=no overscan, 2=overscan */
-
+
/* see h264 annex E for the values of the following */
int i_vidformat;
int b_fullrange;
@@ -185,7 +187,7 @@
int i_scenecut_threshold; /* how aggressively to insert extra I frames */
int b_pre_scenecut; /* compute scenecut on lowres frames */
int i_bframe; /* how many b-frame between 2 references pictures */
- int b_bframe_adaptive;
+ int i_bframe_adaptive;
int i_bframe_bias;
int b_bframe_pyramid; /* Keep some B-frames as references */
@@ -239,6 +241,8 @@
int b_fast_pskip; /* early SKIP detection on P-frames */
int b_dct_decimate; /* transform coefficient thresholding on P-frames */
int i_noise_reduction; /* adaptive pseudo-deadzone */
+ float f_psy_rd; /* Psy RD strength */
+ float f_psy_trellis; /* Psy trellis strength */
/* the deadzone size that will be used in luma quantization */
int i_luma_deadzone[2]; /* {inter, intra} */
@@ -276,7 +280,6 @@
char *psz_stat_in;
/* 2pass params (same as ffmpeg ones) */
- char *psz_rc_eq; /* 2 pass rate control equation */
float f_qcompress; /* 0.0 => cbr, 1.0 => constant qp */
float f_qblur; /* temporally blur quants */
float f_complexity_blur; /* temporally blur complexity */
|
|
Changed |
x264-snapshot-20081218-2245.tar.bz2
^
|
[-]
[+]
|
Deleted |
x264.spec.old
^
|
@@ -1,132 +0,0 @@
-# norootforbuild
-
-%define binname x264
-%define realname libx264
-%define soname 59
-%define svn 20080607
-
-Name: %{binname}
-Summary: A free h264/avc encoder - encoder binary.
-Version: 0.0svn%{svn}
-Release: 1
-License: GPL
-Group: Productivity/Multimedia/Video
-Url: http://developers.videolan.org/x264.html
-
-Source: ftp://ftp.videolan.org/pub/videolan/x264/snapshots/x264-snapshot-%{svn}-2245.tar.bz2
-
-BuildRoot: %{_tmppath}/build-root-%{name}
-
-Requires: %{realname}-%{soname}
-
-%ifarch x86_64
-BuildRequires: nasm yasm
-%else
-BuildRequires: nasm
-%endif
-
-%description
-x264 is a free library for encoding next-generation H264/AVC video
-streams. The code is written from scratch by Laurent Aimar, Loren
-Merritt, Eric Petit (OS X), Min Chen (vfw/asm), Justin Clay (vfw), Mans
-Rullgard, Radek Czyz, Christian Heine (asm), Alex Izvorski (asm), and
-Alex Wright. It is released under the terms of the GPL license. This
-package contains a shared library and a commandline tool for encoding
-H264 streams. This library is needed for mplayer/mencoder for H264
-encoding support.
-
-Encoder features:
-- CAVLC/CABAC
-- Multi-references
-- Intra: all macroblock types (16x16, 8x8, and 4x4 with all predictions)
-- Inter P: all partitions (from 16x16 down to 4x4)
-- Inter B: partitions from 16x16 down to 8x8 (including skip/direct)
-- Ratecontrol: constant quantizer, single or multipass ABR, optional VBV
-- Scene cut detection
-- Adaptive B-frame placement
-- B-frames as references / arbitrary frame order
-- 8x8 and 4x4 adaptive spatial transform
-- Lossless mode
-- Custom quantization matrices
-- Parallel encoding of multiple slices (currently disabled)
-
-Be aware that the x264 library is still in early development stage. The
-command line tool x264 can handle only raw YUV 4:2:0 streams at the
-moment so please use mencoder or another tool that supports x264 library
-for all other file types.
-
-%package -n %{realname}-%{soname}
-Summary: A free h264/avc encoder - encoder binary
-Group: Productivity/Multimedia/Video
-
-%description -n %{realname}-%{soname}
-x264 is a free library for encoding next-generation H264/AVC video
-streams. The code is written from scratch by Laurent Aimar, Loren
-Merritt, Eric Petit (OS X), Min Chen (vfw/asm), Justin Clay (vfw), Mans
-Rullgard, Radek Czyz, Christian Heine (asm), Alex Izvorski (asm), and
-Alex Wright. It is released under the terms of the GPL license. This
-package contains a static library and a header needed for the
-development with libx264. This library is needed to build
-mplayer/mencoder with H264 encoding support.
-
-
-%package -n %{realname}-devel
-Summary: Libraries and include file for the x264 encoder.
-Group: Development/Libraries/C and C++
-Requires: %{realname}-%{soname} = %{version}-%{release}
-Requires: %{buildrequires}
-
-%description -n %{realname}-devel
-x264 is a free library for encoding next-generation H264/AVC video
-streams. The code is written from scratch by Laurent Aimar, Loren
-Merritt, Eric Petit (OS X), Min Chen (vfw/asm), Justin Clay (vfw), Mans
-Rullgard, Radek Czyz, Christian Heine (asm), Alex Izvorski (asm), and
-Alex Wright. It is released under the terms of the GPL license. This
-package contains a static library and a header needed for the
-development with libx264. This library is needed to build
-mplayer/mencoder with H264 encoding support.
-
-%prep
-%setup -q -n x264-snapshot-%{svn}-2245
-
-
-%build
-%{configure} --enable-shared --enable-pic
-#TODO: to compile with --enable-mp4-output gpac is needed, this should be added in the future...
-%{__make} %{?jobs:-j%{jobs}}
-
-
-%install
-%makeinstall
-
-rm $RPM_BUILD_ROOT/%{_libdir}/libx264.so
-cd $RPM_BUILD_ROOT/%{_libdir}
-ln -s libx264.so.%{soname} libx264.so
-
-
-%clean
-%__rm -rf %{buildroot}
-
-
-%files
-%defattr(755,root,root)
-%doc doc/*.txt
-%{_bindir}/x264
-
-%files -n %{realname}-%{soname}
-%defattr(755,root,root)
-%{_libdir}/libx264.so.%{soname}
-
-
-%files -n %{realname}-devel
-%defattr(755,root,root)
-%{_libdir}/pkgconfig/x264.pc
-%{_includedir}/x264.h
-%{_libdir}/libx264.so
-%{_libdir}/libx264.a
-
-
-%changelog
-* Sun Sep 30 2007 Carsten Schoene <cs@linux-administrator.com>
-- import for SLE_10 build
-
|