Changed: x264.spec

Changed: x264-snapshot-20080126-2245.tar.bz2/common/amd64/deblock-a.asm
@@ -318,7 +318,8 @@
lea r11, [r10+r10*2]
lea rax, [rdi-4]
lea r9, [rdi-4+r11]
- %define pix_tmp rsp-104 ; 16x6 for the buffer + 8 for x264_deblock_v_luma_sse2's return address
+ sub rsp, 0x68
+ %define pix_tmp rsp
; transpose 6x16 -> tmp space
TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
@@ -352,6 +353,7 @@
movq mm3, [pix_tmp+0x40]
TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
+ add rsp, 0x68
ret

Changed: x264-snapshot-20080126-2245.tar.bz2/common/amd64/pixel-a.asm
@@ -805,9 +805,10 @@
; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_16x16_mmxext
-%define sums rsp-32 ; +24
-%define top_1d rsp-64 ; +32
-%define left_1d rsp-96 ; +32
+ sub rsp, 96
+%define sums rsp+64 ; size 24
+%define top_1d rsp+32 ; size 32
+%define left_1d rsp ; size 32
mov qword [sums+0], 0
mov qword [sums+8], 0
@@ -913,15 +914,17 @@
movd [parm3q+8], mm2 ; i16x16_dc satd
movd [parm3q+4], mm1 ; i16x16_h satd
movd [parm3q+0], mm0 ; i16x16_v satd
+ add rsp, 96
ret
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_8x8c_mmxext
-%define sums rsp-32 ; +24
-%define top_1d rsp-48 ; +16
-%define left_1d rsp-64 ; +16
+ sub rsp, 64
+%define sums rsp+32 ; size 24
+%define top_1d rsp+16 ; size 16
+%define left_1d rsp ; size 16
mov qword [sums+0], 0
mov qword [sums+8], 0
@@ -1041,13 +1044,87 @@
movd [parm3q+0], mm0 ; i8x8c_dc satd
movd [parm3q+4], mm1 ; i8x8c_h satd
movd [parm3q+8], mm2 ; i8x8c_v satd
+ add rsp, 64
ret
+; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
+; {
+; int nmv=0, i, j;
+; *(uint32_t*)(masks+width) = 0;
+; for( i=0; i<width; i+=8 )
+; {
+; uint64_t mask = *(uint64_t*)(masks+i);
+; if( !mask ) continue;
+; for( j=0; j<8; j++ )
+; if( mask & (255<<j*8) )
+; mvs[nmv++] = i+j;
+; }
+; return nmv;
+; }
+cglobal x264_pixel_ads_mvs
+ ; mvs = parm5q
+ ; masks = rsp
+ ; width = r10
+ mov dword [rsp+r10], 0
+ xor eax, eax
+ xor esi, esi
+.loopi:
+ mov rdi, [rsp+rsi]
+ test rdi, rdi
+ jz .nexti
+ xor ecx, ecx
+%macro TEST 1
+ mov [parm5q+rax*2], si
+ test edi, 0xff<<(%1*8)
+ setne cl
+ add eax, ecx
+ inc esi
+%endmacro
+ TEST 0
+ TEST 1
+ TEST 2
+ TEST 3
+ shr rdi, 32
+ TEST 0
+ TEST 1
+ TEST 2
+ TEST 3
+ cmp esi, r10d
+ jl .loopi
+ leave
+ ret
+.nexti:
+ add esi, 8
+ cmp esi, r10d
+ jl .loopi
+ leave
+ ret
+
+%macro ADS_START 0
+ push rbp
+ mov rbp, rsp
+ sub rsp, parm6q
+ sub rsp, 4
+ and rsp, ~15
+ mov rax, rsp
+ mov r10d, parm6d
+ shl parm3q, 1
+%endmacro
+
+%macro ADS_END 1
+ add parm2q, 8*%1
+ add parm4q, 8*%1
+ add rax, 4*%1
+ sub parm6d, 4*%1
+ jg .loop
+ jmp x264_pixel_ads_mvs
+%endmacro
+
;-----------------------------------------------------------------------------
-; void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
-; uint16_t *res, int width )
+; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
+; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ads4_mmxext
movq mm6, [parm1q]
@@ -1056,7 +1133,7 @@
pshufw mm6, mm6, 0xAA
pshufw mm5, mm4, 0
pshufw mm4, mm4, 0xAA
- shl parm3q, 1
+ ADS_START
.loop:
movq mm0, [parm2q]
movq mm1, [parm2q+16]
@@ -1073,19 +1150,19 @@
MMX_ABS mm3, mm1
paddw mm0, mm2
paddw mm0, mm3
- movq [parm4q], mm0
- add parm2q, 8
- add parm4q, 8
- sub parm5d, 4
- jg .loop
- nop
- ret
+ pshufw mm1, [rbp+16], 0
+ paddusw mm0, [parm4q]
+ psubusw mm1, mm0
+ packsswb mm1, mm1
+ movd [rax], mm1
+ ADS_END 1
cglobal x264_pixel_ads2_mmxext
movq mm6, [parm1q]
+ pshufw mm5, parm7q, 0
pshufw mm7, mm6, 0
pshufw mm6, mm6, 0xAA
- shl parm3q, 1
+ ADS_START
.loop:
movq mm0, [parm2q]
movq mm1, [parm2q+parm3q]
@@ -1094,16 +1171,17 @@
MMX_ABS mm0, mm2
MMX_ABS mm1, mm3
paddw mm0, mm1
- movq [parm4q], mm0
- add parm2q, 8
- add parm4q, 8
- sub parm5d, 4
- jg .loop
- nop
- ret
+ paddusw mm0, [parm4q]
+ movq mm4, mm5
+ psubusw mm4, mm0
+ packsswb mm4, mm4
+ movd [rax], mm4
+ ADS_END 1
cglobal x264_pixel_ads1_mmxext
pshufw mm7, [parm1q], 0
+ pshufw mm6, parm7q, 0
+ ADS_START
.loop:
movq mm0, [parm2q]
movq mm1, [parm2q+8]
@@ -1111,11 +1189,113 @@
psubw mm1, mm7
MMX_ABS mm0, mm2
MMX_ABS mm1, mm3
- movq [parm4q], mm0
- movq [parm4q+8], mm1
- add parm2q, 16
- add parm4q, 16
- sub parm5d, 8
- jg .loop
- nop
- ret
+ paddusw mm0, [parm4q]
+ paddusw mm1, [parm4q+8]
+ movq mm4, mm6
+ movq mm5, mm6
+ psubusw mm4, mm0
+ psubusw mm5, mm1
+ packsswb mm4, mm5
+ movq [rax], mm4
+ ADS_END 2
+
+%macro ADS_SSE2 1
+cglobal x264_pixel_ads4_%1
+ movdqa xmm4, [parm1q]
+ pshuflw xmm8, parm7q, 0
+ pshuflw xmm7, xmm4, 0
+ pshuflw xmm6, xmm4, 0xAA
+ pshufhw xmm5, xmm4, 0
+ pshufhw xmm4, xmm4, 0xAA
+ punpcklqdq xmm8, xmm8
+ punpcklqdq xmm7, xmm7
+ punpcklqdq xmm6, xmm6
+ punpckhqdq xmm5, xmm5
+ punpckhqdq xmm4, xmm4
+ ADS_START
+ movdqu xmm10, [parm2q]
+ movdqu xmm11, [parm2q+parm3q]
+.loop:
+ movdqa xmm0, xmm10
+ movdqu xmm1, [parm2q+16]
+ movdqa xmm10, xmm1
+ psubw xmm0, xmm7
+ psubw xmm1, xmm6
+ MMX_ABS xmm0, xmm2
+ MMX_ABS xmm1, xmm3
+ movdqa xmm2, xmm11
+ movdqu xmm3, [parm2q+parm3q+16]
+ movdqa xmm11, xmm3
+ psubw xmm2, xmm5
+ psubw xmm3, xmm4
+ paddw xmm0, xmm1
+ movdqu xmm9, [parm4q]
+ MMX_ABS xmm2, xmm1
+ MMX_ABS xmm3, xmm1
+ paddw xmm0, xmm2
+ paddw xmm0, xmm3
+ paddusw xmm0, xmm9
+ movdqa xmm1, xmm8
+ psubusw xmm1, xmm0
+ packsswb xmm1, xmm1
+ movq [rax], xmm1
+ ADS_END 2
+
+cglobal x264_pixel_ads2_%1
+ movq xmm6, [parm1q]
+ pshuflw xmm8, parm7q, 0
+ pshuflw xmm7, xmm6, 0
+ pshuflw xmm6, xmm6, 0xAA
+ punpcklqdq xmm8, xmm8
+ punpcklqdq xmm7, xmm7
+ punpcklqdq xmm6, xmm6
+ ADS_START
+.loop:
+ movdqu xmm0, [parm2q]
+ movdqu xmm1, [parm2q+parm3q]
+ psubw xmm0, xmm7
+ psubw xmm1, xmm6
+ movdqu xmm9, [parm4q]
+ MMX_ABS xmm0, xmm2
+ MMX_ABS xmm1, xmm3
+ paddw xmm0, xmm1
+ paddusw xmm0, xmm9
+ movdqa xmm4, xmm8
+ psubusw xmm4, xmm0
+ packsswb xmm4, xmm4
+ movq [rax], xmm4
+ ADS_END 2
+
+cglobal x264_pixel_ads1_%1
+ pshuflw xmm7, [parm1q], 0
+ pshuflw xmm8, parm7q, 0
+ punpcklqdq xmm7, xmm7
+ punpcklqdq xmm8, xmm8
+ ADS_START
+.loop:
+ movdqu xmm0, [parm2q]
+ movdqu xmm1, [parm2q+16]
+ psubw xmm0, xmm7
+ psubw xmm1, xmm7
+ movdqu xmm9, [parm4q]
+ movdqu xmm10, [parm4q+16]
+ MMX_ABS xmm0, xmm2
+ MMX_ABS xmm1, xmm3
+ paddusw xmm0, xmm9
+ paddusw xmm1, xmm10
+ movdqa xmm4, xmm8
+ movdqa xmm5, xmm8
+ psubusw xmm4, xmm0
+ psubusw xmm5, xmm1
+ packsswb xmm4, xmm5
+ movdqa [rax], xmm4
+ ADS_END 4
+%endmacro
+
+ADS_SSE2 sse2
+%ifdef HAVE_SSE3
+%macro MMX_ABS 2
+ pabsw %1, %1
+%endmacro
+ADS_SSE2 ssse3
+%endif

Changed: x264-snapshot-20080126-2245.tar.bz2/common/amd64/pixel-sse2.asm
@@ -1000,9 +1000,9 @@
pshufd xmm6, xmm4, 0xB1
packssdw xmm1, xmm2
paddd xmm3, xmm5
- pmaddwd xmm1, xmm8
- paddd xmm4, xmm6
pshufd xmm1, xmm1, 0xD8
+ paddd xmm4, xmm6
+ pmaddwd xmm1, xmm8
movdqa xmm5, xmm3
punpckldq xmm3, xmm4
punpckhdq xmm5, xmm4

Changed: x264-snapshot-20080126-2245.tar.bz2/common/cpu.c
@@ -28,7 +28,7 @@
#ifdef SYS_BEOS
#include <kernel/OS.h>
#endif
-#ifdef SYS_MACOSX
+#if defined(SYS_MACOSX) || defined(SYS_FREEBSD)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
@@ -219,7 +219,7 @@
#if !defined(HAVE_PTHREAD)
return 1;
-#elif defined(WIN32)
+#elif defined(_WIN32)
return pthread_num_processors_np();
#elif defined(SYS_LINUX)
@@ -237,7 +237,7 @@
get_system_info( &info );
return info.cpu_count;
-#elif defined(SYS_MACOSX)
+#elif defined(SYS_MACOSX) || defined(SYS_FREEBSD)
int numberOfCPUs;
size_t length = sizeof( numberOfCPUs );
if( sysctlbyname("hw.ncpu", &numberOfCPUs, &length, NULL, 0) )

Changed: x264-snapshot-20080126-2245.tar.bz2/common/i386/pixel-a.asm
@@ -1579,24 +1579,91 @@
+; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
+cglobal x264_pixel_ads_mvs
+ mov ebx, [ebp+24] ; mvs
+ mov ecx, esp ; masks
+ mov edi, [ebp+28] ; width
+ mov dword [ecx+edi], 0
+ push esi
+ push ebp
+ xor eax, eax
+ xor esi, esi
+.loopi:
+ mov ebp, [ecx+esi]
+ mov edx, [ecx+esi+4]
+ or edx, ebp
+ jz .nexti
+ xor edx, edx
+%macro TEST 1
+ mov [ebx+eax*2], si
+ test ebp, 0xff<<(%1*8)
+ setne dl
+ add eax, edx
+ inc esi
+%endmacro
+ TEST 0
+ TEST 1
+ TEST 2
+ TEST 3
+ mov ebp, [ecx+esi]
+ TEST 0
+ TEST 1
+ TEST 2
+ TEST 3
+ cmp esi, edi
+ jl .loopi
+ jmp .end
+.nexti:
+ add esi, 8
+ cmp esi, edi
+ jl .loopi
+.end:
+ pop ebp
+ pop esi
+ mov edi, [ebp-8]
+ mov ebx, [ebp-4]
+ leave
+ ret
+
+%macro ADS_START 0
+ push ebp
+ mov ebp, esp
+ push ebx
+ push edi
+ mov eax, [ebp+12] ; sums
+ mov ebx, [ebp+16] ; delta
+ mov ecx, [ebp+20] ; cost_mvx
+ mov edx, [ebp+28] ; width
+ sub esp, edx
+ sub esp, 4
+ and esp, ~15
+ mov edi, esp
+ shl ebx, 1
+%endmacro
+
+%macro ADS_END 1
+ add eax, 8*%1
+ add ecx, 8*%1
+ add edi, 4*%1
+ sub edx, 4*%1
+ jg .loop
+ jmp x264_pixel_ads_mvs
+%endmacro
+
;-----------------------------------------------------------------------------
-; void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
-; uint16_t *res, int width )
+; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
+; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ads4_mmxext
- push ebx
- mov eax, [esp+8]
+ mov eax, [esp+4]
movq mm6, [eax]
movq mm4, [eax+8]
pshufw mm7, mm6, 0
pshufw mm6, mm6, 0xAA
pshufw mm5, mm4, 0
pshufw mm4, mm4, 0xAA
- mov eax, [esp+12]
- mov ebx, [esp+16]
- mov ecx, [esp+20]
- mov edx, [esp+24]
- shl ebx, 1
+ ADS_START
.loop:
movq mm0, [eax]
movq mm1, [eax+16]
@@ -1613,25 +1680,20 @@
MMX_ABS mm3, mm1
paddw mm0, mm2
paddw mm0, mm3
- movq [ecx], mm0
- add eax, 8
- add ecx, 8
- sub edx, 4
- jg .loop
- pop ebx
- ret
+ pshufw mm1, [ebp+32], 0
+ paddusw mm0, [ecx]
+ psubusw mm1, mm0
+ packsswb mm1, mm1
+ movd [edi], mm1
+ ADS_END 1
cglobal x264_pixel_ads2_mmxext
- push ebx
- mov eax, [esp+8]
+ mov eax, [esp+4]
movq mm6, [eax]
+ pshufw mm5, [esp+28], 0
pshufw mm7, mm6, 0
pshufw mm6, mm6, 0xAA
- mov eax, [esp+12]
- mov ebx, [esp+16]
- mov ecx, [esp+20]
- mov edx, [esp+24]
- shl ebx, 1
+ ADS_START
.loop:
movq mm0, [eax]
movq mm1, [eax+ebx]
@@ -1640,20 +1702,18 @@
MMX_ABS mm0, mm2
MMX_ABS mm1, mm3
paddw mm0, mm1
- movq [ecx], mm0
- add eax, 8
- add ecx, 8
- sub edx, 4
- jg .loop
- pop ebx
- ret
+ paddusw mm0, [ecx]
+ movq mm4, mm5
+ psubusw mm4, mm0
+ packsswb mm4, mm4
+ movd [edi], mm4
+ ADS_END 1
cglobal x264_pixel_ads1_mmxext
mov eax, [esp+4]
pshufw mm7, [eax], 0
- mov eax, [esp+8]
- mov ecx, [esp+16]
- mov edx, [esp+20]
+ pshufw mm6, [esp+28], 0
+ ADS_START
.loop:
movq mm0, [eax]
movq mm1, [eax+8]
@@ -1661,11 +1721,115 @@
psubw mm1, mm7
MMX_ABS mm0, mm2
MMX_ABS mm1, mm3
- movq [ecx], mm0
- movq [ecx+8], mm1
- add eax, 16
- add ecx, 16
- sub edx, 8
- jg .loop
- nop
- ret
+ paddusw mm0, [ecx]
+ paddusw mm1, [ecx+8]
+ movq mm4, mm6
+ movq mm5, mm6
+ psubusw mm4, mm0
+ psubusw mm5, mm1
+ packsswb mm4, mm5
+ movq [edi], mm4
+ ADS_END 2
+
+%macro ADS_SSE2 1
+cglobal x264_pixel_ads4_%1
+ mov eax, [esp+4] ; enc_dc
+ movdqa xmm4, [eax]
+ pshuflw xmm7, xmm4, 0
+ pshuflw xmm6, xmm4, 0xAA
+ pshufhw xmm5, xmm4, 0
+ pshufhw xmm4, xmm4, 0xAA
+ punpcklqdq xmm7, xmm7
+ punpcklqdq xmm6, xmm6
+ punpckhqdq xmm5, xmm5
+ punpckhqdq xmm4, xmm4
+ ADS_START
+.loop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax+16]
+ psubw xmm0, xmm7
+ psubw xmm1, xmm6
+ MMX_ABS xmm0, xmm2
+ MMX_ABS xmm1, xmm3
+ movdqu xmm2, [eax+ebx]
+ movdqu xmm3, [eax+ebx+16]
+ psubw xmm2, xmm5
+ psubw xmm3, xmm4
+ paddw xmm0, xmm1
+ MMX_ABS xmm2, xmm1
+ MMX_ABS xmm3, xmm1
+ paddw xmm0, xmm2
+ paddw xmm0, xmm3
+ movd xmm1, [ebp+32] ; thresh
+ movdqu xmm2, [ecx]
+ pshuflw xmm1, xmm1, 0
+ punpcklqdq xmm1, xmm1
+ paddusw xmm0, xmm2
+ psubusw xmm1, xmm0
+ packsswb xmm1, xmm1
+ movq [edi], xmm1
+ ADS_END 2
+
+cglobal x264_pixel_ads2_%1
+ mov eax, [esp+4] ; enc_dc
+ movq xmm6, [eax]
+ movd xmm5, [esp+28] ; thresh
+ pshuflw xmm7, xmm6, 0
+ pshuflw xmm6, xmm6, 0xAA
+ pshuflw xmm5, xmm5, 0
+ punpcklqdq xmm7, xmm7
+ punpcklqdq xmm6, xmm6
+ punpcklqdq xmm5, xmm5
+ ADS_START
+.loop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax+ebx]
+ psubw xmm0, xmm7
+ psubw xmm1, xmm6
+ movdqu xmm4, [ecx]
+ MMX_ABS xmm0, xmm2
+ MMX_ABS xmm1, xmm3
+ paddw xmm0, xmm1
+ paddusw xmm0, xmm4
+ movdqa xmm1, xmm5
+ psubusw xmm1, xmm0
+ packsswb xmm1, xmm1
+ movq [edi], xmm1
+ ADS_END 2
+
+cglobal x264_pixel_ads1_%1
+ mov eax, [esp+4] ; enc_dc
+ movd xmm7, [eax]
+ movd xmm6, [esp+28] ; thresh
+ pshuflw xmm7, xmm7, 0
+ pshuflw xmm6, xmm6, 0
+ punpcklqdq xmm7, xmm7
+ punpcklqdq xmm6, xmm6
+ ADS_START
+.loop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax+16]
+ psubw xmm0, xmm7
+ psubw xmm1, xmm7
+ movdqu xmm2, [ecx]
+ movdqu xmm3, [ecx+16]
+ MMX_ABS xmm0, xmm4
+ MMX_ABS xmm1, xmm5
+ paddusw xmm0, xmm2
+ paddusw xmm1, xmm3
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm6
+ psubusw xmm4, xmm0
+ psubusw xmm5, xmm1
+ packsswb xmm4, xmm5
+ movdqa [edi], xmm4
+ ADS_END 4
+%endmacro
+
+ADS_SSE2 sse2
+%ifdef HAVE_SSE3
+%macro MMX_ABS 2
+ pabsw %1, %1
+%endmacro
+ADS_SSE2 ssse3
+%endif

Changed: x264-snapshot-20080126-2245.tar.bz2/common/i386/pixel-sse2.asm
@@ -973,9 +973,9 @@
pshufd xmm6, xmm4, 0xB1
packssdw xmm1, xmm2
paddd xmm3, xmm5
- pmaddwd xmm1, xmm7
- paddd xmm4, xmm6
pshufd xmm1, xmm1, 0xD8
+ paddd xmm4, xmm6
+ pmaddwd xmm1, xmm7
movdqa xmm5, xmm3
punpckldq xmm3, xmm4
punpckhdq xmm5, xmm4

Changed: x264-snapshot-20080126-2245.tar.bz2/common/i386/pixel.h
@@ -81,11 +81,18 @@
const uint8_t *pix2, int stride2, int sums[2][4] );
float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
-void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
- uint16_t *res, int width );
-void x264_pixel_ads2_mmxext( int enc_dc[2], uint16_t *sums, int delta,
- uint16_t *res, int width );
-void x264_pixel_ads1_mmxext( int enc_dc[1], uint16_t *sums, int delta,
- uint16_t *res, int width );
+#define DECL_ADS( size, suffix ) \
+int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
+ uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );
+DECL_ADS( 4, mmxext )
+DECL_ADS( 2, mmxext )
+DECL_ADS( 1, mmxext )
+DECL_ADS( 4, sse2 )
+DECL_ADS( 2, sse2 )
+DECL_ADS( 1, sse2 )
+DECL_ADS( 4, ssse3 )
+DECL_ADS( 2, ssse3 )
+DECL_ADS( 1, ssse3 )
+#undef DECL_ADS
#endif

Changed: x264-snapshot-20080126-2245.tar.bz2/common/mc.c
@@ -430,7 +430,7 @@
uint8_t *ref = frame->plane[0] + y * stride - PADH;
uint16_t *line = frame->integral + (y+1) * stride - PADH + 1;
uint16_t v = line[0] = 0;
- for( x = 0; x < stride-1; x++ )
+ for( x = 1; x < stride-1; x++ )
line[x] = v += ref[x] + line[x-stride] - line[x-stride-1];
line -= 8*stride;
if( y >= 9-PADV )

Changed: x264-snapshot-20080126-2245.tar.bz2/common/osdep.h
@@ -50,7 +50,7 @@
#if defined(_MSC_VER) || defined(SYS_SunOS) || defined(SYS_MACOSX)
#define sqrtf sqrt
#endif
-#ifdef __WIN32__
+#ifdef _WIN32
#define rename(src,dst) (unlink(dst), rename(src,dst)) // POSIX says that rename() removes the destination, but win32 doesn't.
#ifndef strtok_r
#define strtok_r(str,delim,save) strtok(str,delim)

Changed: x264-snapshot-20080126-2245.tar.bz2/common/pixel.c
@@ -408,32 +408,50 @@
/****************************************************************************
* successive elimination
****************************************************************************/
-static void pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
- uint16_t *res, int width )
+static int x264_pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
+ uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
- int i;
+ int nmv=0, i;
for( i=0; i<width; i++, sums++ )
- res[i] = abs( enc_dc[0] - sums[0] )
- + abs( enc_dc[1] - sums[8] )
- + abs( enc_dc[2] - sums[delta] )
- + abs( enc_dc[3] - sums[delta+8] );
+ {
+ int ads = abs( enc_dc[0] - sums[0] )
+ + abs( enc_dc[1] - sums[8] )
+ + abs( enc_dc[2] - sums[delta] )
+ + abs( enc_dc[3] - sums[delta+8] )
+ + cost_mvx[i];
+ if( ads < thresh )
+ mvs[nmv++] = i;
+ }
+ return nmv;
}
-static void pixel_ads2( int enc_dc[2], uint16_t *sums, int delta,
- uint16_t *res, int width )
+static int x264_pixel_ads2( int enc_dc[2], uint16_t *sums, int delta,
+ uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
- int i;
+ int nmv=0, i;
for( i=0; i<width; i++, sums++ )
- res[i] = abs( enc_dc[0] - sums[0] )
- + abs( enc_dc[1] - sums[delta] );
+ {
+ int ads = abs( enc_dc[0] - sums[0] )
+ + abs( enc_dc[1] - sums[delta] )
+ + cost_mvx[i];
+ if( ads < thresh )
+ mvs[nmv++] = i;
+ }
+ return nmv;
}
-static void pixel_ads1( int enc_dc[1], uint16_t *sums, int delta,
- uint16_t *res, int width )
+static int x264_pixel_ads1( int enc_dc[1], uint16_t *sums, int delta,
+ uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
- int i;
+ int nmv=0, i;
for( i=0; i<width; i++, sums++ )
- res[i] = abs( enc_dc[0] - sums[0] );
+ {
+ int ads = abs( enc_dc[0] - sums[0] )
+ + cost_mvx[i];
+ if( ads < thresh )
+ mvs[nmv++] = i;
+ }
+ return nmv;
}
@@ -459,20 +477,22 @@
pixf->name[PIXEL_4x8] = x264_pixel_##name##_4x8##cpu;\
pixf->name[PIXEL_4x4] = x264_pixel_##name##_4x4##cpu;
+#define INIT_ADS( cpu ) \
+ pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\
+ pixf->ads[PIXEL_16x8] = x264_pixel_ads2##cpu;\
+ pixf->ads[PIXEL_8x8] = x264_pixel_ads1##cpu;
+
INIT7( sad, );
INIT7( sad_x3, );
INIT7( sad_x4, );
INIT7( ssd, );
INIT7( satd, );
INIT4( sa8d, );
+ INIT_ADS( );
pixf->ssim_4x4x2_core = ssim_4x4x2_core;
pixf->ssim_end4 = ssim_end4;
- pixf->ads[PIXEL_16x16] = pixel_ads4;
- pixf->ads[PIXEL_16x8] = pixel_ads2;
- pixf->ads[PIXEL_8x8] = pixel_ads1;
-
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMX )
{
@@ -485,10 +505,7 @@
INIT7( sad_x3, _mmxext );
INIT7( sad_x4, _mmxext );
INIT7( satd, _mmxext );
-
- pixf->ads[PIXEL_16x16] = x264_pixel_ads4_mmxext;
- pixf->ads[PIXEL_16x8 ] = x264_pixel_ads2_mmxext;
- pixf->ads[PIXEL_8x8 ] = x264_pixel_ads1_mmxext;
+ INIT_ADS( _mmxext );
#ifdef ARCH_X86
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
@@ -535,6 +552,7 @@
INIT2( sad_x3, _sse2 );
INIT2( sad_x4, _sse2 );
INIT5( satd, _sse2 );
+ INIT_ADS( _sse2 );
#ifdef ARCH_X86
if( cpu&X264_CPU_CACHELINE_SPLIT )
@@ -570,6 +588,7 @@
if( cpu&X264_CPU_SSSE3 )
{
INIT5( satd, _ssse3 );
+ INIT_ADS( _ssse3 );
#ifdef ARCH_X86_64
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;

Changed: x264-snapshot-20080126-2245.tar.bz2/common/pixel.h
@@ -80,9 +80,9 @@
x264_pixel_cmp_x4_t sad_x4[7];
/* abs-diff-sum for successive elimination.
- * may round width up to a multiple of 8. */
- void (*ads[7])( int enc_dc[4], uint16_t *sums, int delta,
- uint16_t *res, int width );
+ * may round width up to a multiple of 16. */
+ int (*ads[7])( int enc_dc[4], uint16_t *sums, int delta,
+ uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );
/* calculate satd of V, H, and DC modes.
* may be NULL, in which case just use pred+satd instead. */

Changed: x264-snapshot-20080126-2245.tar.bz2/encoder/analyse.c
@@ -167,16 +167,18 @@
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
+uint16_t *x264_cost_mv_fpel[52][4];
+
/* initialize an array of lambda*nbits for all possible mvs */
static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
{
static int16_t *p_cost_mv[52];
+ int i, j;
if( !p_cost_mv[a->i_qp] )
{
/* could be faster, but isn't called many times */
/* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
- int i;
p_cost_mv[a->i_qp] = x264_malloc( (4*4*2048 + 1) * sizeof(int16_t) );
p_cost_mv[a->i_qp] += 2*4*2048;
for( i = 0; i <= 2*4*2048; i++ )
@@ -185,8 +187,19 @@
p_cost_mv[a->i_qp][i] = a->i_lambda * bs_size_se( i );
}
}
-
a->p_cost_mv = p_cost_mv[a->i_qp];
+
+ /* FIXME is this useful for all me methods? */
+ if( h->param.analyse.i_me_method == X264_ME_ESA && !x264_cost_mv_fpel[a->i_qp][0] )
+ {
+ for( j=0; j<4; j++ )
+ {
+ x264_cost_mv_fpel[a->i_qp][j] = x264_malloc( (4*2048 + 1) * sizeof(int16_t) );
+ x264_cost_mv_fpel[a->i_qp][j] += 2*2048;
+ for( i = -2*2048; i < 2*2048; i++ )
+ x264_cost_mv_fpel[a->i_qp][j][i] = p_cost_mv[a->i_qp][i*4+j];
+ }
+ }
}
static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )

Changed: x264-snapshot-20080126-2245.tar.bz2/encoder/encoder.c
@@ -885,7 +885,7 @@
h->i_ref1 = X264_MIN( h->i_ref1, h->frames.i_max_ref1 );
h->i_ref0 = X264_MIN( h->i_ref0, h->frames.i_max_ref0 );
h->i_ref0 = X264_MIN( h->i_ref0, h->param.i_frame_reference ); // if reconfig() has lowered the limit
- h->i_ref0 = X264_MIN( h->i_ref0, 16 - h->i_ref1 );
+ assert( h->i_ref0 + h->i_ref1 <= 16 );
h->mb.pic.i_fref[0] = h->i_ref0;
h->mb.pic.i_fref[1] = h->i_ref1;
}

Changed: x264-snapshot-20080126-2245.tar.bz2/encoder/me.c
@@ -101,22 +101,19 @@
COPY3_IF_LT( bcost, costs[3], bmx, omx+(m3x), bmy, omy+(m3y) );\
}
-#define COST_MV_X4_ABS( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\
+#define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\
{\
- h->pixf.sad_x4[i_pixel]( m->p_fenc[0],\
+ h->pixf.sad_x3[i_pixel]( m->p_fenc[0],\
p_fref + (m0x) + (m0y)*m->i_stride[0],\
p_fref + (m1x) + (m1y)*m->i_stride[0],\
p_fref + (m2x) + (m2y)*m->i_stride[0],\
- p_fref + (m3x) + (m3y)*m->i_stride[0],\
m->i_stride[0], costs );\
- costs[0] += p_cost_mvx[m0x<<2]; /* no cost_mvy */\
- costs[1] += p_cost_mvx[m1x<<2];\
- costs[2] += p_cost_mvx[m2x<<2];\
- costs[3] += p_cost_mvx[m3x<<2];\
+ costs[0] += p_cost_mvx[(m0x)<<2]; /* no cost_mvy */\
+ costs[1] += p_cost_mvx[(m1x)<<2];\
+ costs[2] += p_cost_mvx[(m2x)<<2];\
COPY3_IF_LT( bcost, costs[0], bmx, m0x, bmy, m0y );\
COPY3_IF_LT( bcost, costs[1], bmx, m1x, bmy, m1y );\
COPY3_IF_LT( bcost, costs[2], bmx, m2x, bmy, m2y );\
- COPY3_IF_LT( bcost, costs[3], bmx, m3x, bmy, m3y );\
}
/* 1 */
@@ -454,13 +451,16 @@
case X264_ME_ESA:
{
- const int min_x = X264_MAX( bmx - i_me_range, mv_x_min);
- const int min_y = X264_MAX( bmy - i_me_range, mv_y_min);
- const int max_x = X264_MIN( bmx + i_me_range, mv_x_max);
- const int max_y = X264_MIN( bmy + i_me_range, mv_y_max);
- int mx, my;
+ const int min_x = X264_MAX( bmx - i_me_range, mv_x_min );
+ const int min_y = X264_MAX( bmy - i_me_range, mv_y_min );
+ const int max_x = X264_MIN( bmx + i_me_range, mv_x_max );
+ const int max_y = X264_MIN( bmy + i_me_range, mv_y_max );
+ /* SEA is fastest in multiples of 4 */
+ const int width = (max_x - min_x + 3) & ~3;
+ int my;
#if 0
/* plain old exhaustive search */
+ int mx;
for( my = min_y; my <= max_y; my++ )
for( mx = min_x; mx <= max_x; mx++ )
COST_MV( mx, my );
@@ -470,10 +470,13 @@
const int stride = m->i_stride[0];
static uint8_t zero[16*16] = {0,};
uint16_t *sums_base = m->integral;
- int enc_dc[4];
+ DECLARE_ALIGNED( int, enc_dc[4], 16 );
int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
int delta = x264_pixel_size[sad_size].w;
- uint16_t *ads = x264_malloc((max_x-min_x+8) * sizeof(uint16_t));
+ int16_t xs_buf[64];
+ int16_t *xs = width<=64 ? xs_buf : x264_malloc( (width+15)*sizeof(int16_t) );
+ int xn;
+ uint16_t *cost_fpel_mvx = x264_cost_mv_fpel[h->mb.i_qp][-m->mvp[0]&3] + (-m->mvp[0]>>2);
h->pixf.sad_x4[sad_size]( zero, m->p_fenc[0], m->p_fenc[0]+delta,
m->p_fenc[0]+delta*FENC_STRIDE, m->p_fenc[0]+delta+delta*FENC_STRIDE,
@@ -487,29 +490,18 @@
for( my = min_y; my <= max_y; my++ )
{
- int mvs[3], i_mvs=0;
bcost -= p_cost_mvy[my<<2];
- h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
- ads, max_x-min_x+1 );
- for( mx = min_x; mx <= max_x; mx++ )
- {
- if( ads[mx-min_x] < bcost - p_cost_mvx[mx<<2] )
- {
- if( i_mvs == 3 )
- {
- COST_MV_X4_ABS( mvs[0],my, mvs[1],my, mvs[2],my, mx,my );
- i_mvs = 0;
- }
- else
- mvs[i_mvs++] = mx;
- }
- }
+ xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
+ cost_fpel_mvx+min_x, xs, width, bcost );
+ for( i=0; i<xn-2; i+=3 )
+ COST_MV_X3_ABS( min_x+xs[i],my, min_x+xs[i+1],my, min_x+xs[i+2],my );
bcost += p_cost_mvy[my<<2];
- for( i=0; i<i_mvs; i++ )
- COST_MV( mvs[i], my );
+ for( ; i<xn; i++ )
+ COST_MV( min_x+xs[i], my );
}
- x264_free(ads);
+ if( xs != xs_buf )
+ x264_free( xs );
#endif
}
break;

Changed: x264-snapshot-20080126-2245.tar.bz2/encoder/me.h
@@ -56,6 +56,8 @@
int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight );
int x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel );
+extern uint16_t *x264_cost_mv_fpel[52][4];
+
#define COPY1_IF_LT(x,y)\
if((y)<(x))\
(x)=(y);

Changed: x264-snapshot-20080126-2245.tar.bz2/encoder/set.c
@@ -196,7 +196,7 @@
/* extra slot with pyramid so that we don't have to override the
* order of forgetting old pictures */
sps->vui.i_max_dec_frame_buffering =
- sps->i_num_ref_frames = X264_MIN(16, param->i_frame_reference + sps->vui.i_num_reorder_frames + param->b_bframe_pyramid);
+ sps->i_num_ref_frames = X264_MIN(16, X264_MAX(param->i_frame_reference, 1 + sps->vui.i_num_reorder_frames));
sps->vui.b_bitstream_restriction = 1;
if( sps->vui.b_bitstream_restriction )

Changed: x264-snapshot-20080126-2245.tar.bz2/tools/checkasm.c
@@ -36,6 +36,7 @@
x264_predict_t predict_4x4[9+3];
x264_predict8x8_t predict_8x8[9+3];
DECLARE_ALIGNED( uint8_t, edge[33], 8 );
+ uint16_t cost_mv[32];
int ret = 0, ok, used_asm;
int i, j;
@@ -155,20 +156,24 @@
}
ok = 1; used_asm = 0;
- for( i=0; i<4; i++ )
- if( pixel_asm.ads[i] != pixel_ref.ads[i] )
+ for( i=0; i<32; i++ )
+ cost_mv[i] = i*10;
+ for( i=0; i<100; i++ )
+ if( pixel_asm.ads[i&3] != pixel_ref.ads[i&3] )
{
- uint16_t res_a[32], res_c[32];
- uint16_t sums[72];
- int dc[4];
+ DECLARE_ALIGNED( uint16_t, sums[72], 16 );
+ DECLARE_ALIGNED( int, dc[4], 16 );
+ int16_t mvs_a[32], mvs_c[32];
+ int mvn_a, mvn_c;
+ int thresh = rand() & 0x3fff;
for( j=0; j<72; j++ )
sums[j] = rand() & 0x3fff;
for( j=0; j<4; j++ )
dc[j] = rand() & 0x3fff;
used_asm = 1;
- pixel_c.ads[i]( dc, sums, 32, res_c, 32 );
- pixel_asm.ads[i]( dc, sums, 32, res_a, 32 );
- if( memcmp(res_a, res_c, sizeof(res_c)) )
+ mvn_c = pixel_c.ads[i&3]( dc, sums, 32, cost_mv, mvs_c, 32, thresh );
+ mvn_a = pixel_asm.ads[i&3]( dc, sums, 32, cost_mv, mvs_a, 32, thresh );
+ if( mvn_c != mvn_a || memcmp( mvs_c, mvs_a, mvn_c*sizeof(*mvs_c) ) )
ok = 0;
}
report( "esa ads:" );

Deleted: x264-snapshot-20080814-2245.tar.bz2/.git/objects/pack/pack-86b450d425caafda828e437e0bbcfb6dd9c53021.idx

Deleted: x264-snapshot-20080814-2245.tar.bz2/.git/objects/pack/pack-86b450d425caafda828e437e0bbcfb6dd9c53021.pack

Deleted: x264-snapshot-20080814-2245.tar.bz2/encoder/eval.c
@@ -1,253 +0,0 @@
-/*****************************************************************************
- * simple arithmetic expression evaluator
- *****************************************************************************
- * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *****************************************************************************/
-
-/**
- * @file eval.c
- * simple arithmetic expression evaluator.
- *
- * see http://joe.hotchkiss.com/programming/eval/eval.html
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-
-#ifndef NAN
- #define NAN 0
-#endif
-
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-
-#define STACK_SIZE 100
-
-typedef struct Parser{
- double stack[STACK_SIZE];
- int stack_index;
- char *s;
- double *const_value;
- const char **const_name; // NULL terminated
- double (**func1)(void *, double a); // NULL terminated
- const char **func1_name; // NULL terminated
- double (**func2)(void *, double a, double b); // NULL terminated
- char **func2_name; // NULL terminated
- void *opaque;
-} Parser;
-
-static void evalExpression(Parser *p);
-
-static void push(Parser *p, double d){
- if(p->stack_index+1>= STACK_SIZE){
- fprintf(stderr, "stack overflow in the parser\n");
- return;
- }
- p->stack[ p->stack_index++ ]= d;
-//printf("push %f\n", d); fflush(stdout);
-}
-
-static double pop(Parser *p){
- if(p->stack_index<=0){
- fprintf(stderr, "stack underflow in the parser\n");
- return NAN;
- }
-//printf("pop\n"); fflush(stdout);
- return p->stack[ --p->stack_index ];
-}
-
-static int strmatch(const char *s, const char *prefix){
- int i;
- for(i=0; prefix[i]; i++){
- if(prefix[i] != s[i]) return 0;
- }
- return 1;
-}
-
-static void evalPrimary(Parser *p){
- double d, d2=NAN;
- char *next= p->s;
- int i;
-
- /* number */
- d= strtod(p->s, &next);
- if(next != p->s){
- push(p, d);
- p->s= next;
- return;
- }
-
- /* named constants */
- for(i=0; p->const_name[i]; i++){
- if(strmatch(p->s, p->const_name[i])){
- push(p, p->const_value[i]);
- p->s+= strlen(p->const_name[i]);
- return;
- }
- }
-
- p->s= strchr(p->s, '(');
- if(p->s==NULL){
- fprintf(stderr, "Parser: missing ( in \"%s\"\n", next);
- return;
- }
- p->s++; // "("
- evalExpression(p);
- d= pop(p);
- if(p->s[0]== ','){
- p->s++; // ","
- evalExpression(p);
- d2= pop(p);
- }
- if(p->s[0] != ')'){
- fprintf(stderr, "Parser: missing ) in \"%s\"\n", next);
- return;
- }
- p->s++; // ")"
-
- if( strmatch(next, "sinh" ) ) d= sinh(d);
- else if( strmatch(next, "cosh" ) ) d= cosh(d);
- else if( strmatch(next, "tanh" ) ) d= tanh(d);
- else if( strmatch(next, "sin" ) ) d= sin(d);
- else if( strmatch(next, "cos" ) ) d= cos(d);
- else if( strmatch(next, "tan" ) ) d= tan(d);
- else if( strmatch(next, "exp" ) ) d= exp(d);
- else if( strmatch(next, "log" ) ) d= log(d);
- else if( strmatch(next, "squish") ) d= 1/(1+exp(4*d));
- else if( strmatch(next, "gauss" ) ) d= exp(-d*d/2)/sqrt(2*M_PI);
- else if( strmatch(next, "abs" ) ) d= fabs(d);
- else if( strmatch(next, "max" ) ) d= d > d2 ? d : d2;
- else if( strmatch(next, "min" ) ) d= d < d2 ? d : d2;
- else if( strmatch(next, "gt" ) ) d= d > d2 ? 1.0 : 0.0;
- else if( strmatch(next, "gte" ) ) d= d >= d2 ? 1.0 : 0.0;
- else if( strmatch(next, "lt" ) ) d= d > d2 ? 0.0 : 1.0;
- else if( strmatch(next, "lte" ) ) d= d >= d2 ? 0.0 : 1.0;
- else if( strmatch(next, "eq" ) ) d= d == d2 ? 1.0 : 0.0;
-// else if( strmatch(next, "l1" ) ) d= 1 + d2*(d - 1);
-// else if( strmatch(next, "sq01" ) ) d= (d >= 0.0 && d <=1.0) ? 1.0 : 0.0;
- else{
- int error=1;
- for(i=0; p->func1_name && p->func1_name[i]; i++){
- if(strmatch(next, p->func1_name[i])){
- d= p->func1[i](p->opaque, d);
- error=0;
- break;
- }
- }
-
- for(i=0; p->func2_name && p->func2_name[i]; i++){
- if(strmatch(next, p->func2_name[i])){
- d= p->func2[i](p->opaque, d, d2);
- error=0;
- break;
- }
- }
-
- if(error){
- fprintf(stderr, "Parser: unknown function in \"%s\"\n", next);
- return;
- }
- }
-
- push(p, d);
-}
-
-static void evalPow(Parser *p){
- int neg= 0;
- if(p->s[0]=='+') p->s++;
-
- if(p->s[0]=='-'){
- neg= 1;
- p->s++;
- }
-
- if(p->s[0]=='('){
- p->s++;;
- evalExpression(p);
-
- if(p->s[0]!=')')
- fprintf(stderr, "Parser: missing )\n");
- p->s++;
- }else{
- evalPrimary(p);
- }
-
- if(neg) push(p, -pop(p));
-}
-
-static void evalFactor(Parser *p){
- evalPow(p);
- while(p->s[0]=='^'){
- double d;
-
- p->s++;
- evalPow(p);
- d= pop(p);
- push(p, pow(pop(p), d));
- }
-}
-
-static void evalTerm(Parser *p){
- evalFactor(p);
- while(p->s[0]=='*' || p->s[0]=='/'){
- int inv= p->s[0]=='/';
- double d;
-
- p->s++;
- evalFactor(p);
- d= pop(p);
- if(inv) d= 1.0/d;
- push(p, d * pop(p));
- }
-}
-
-static void evalExpression(Parser *p){
- evalTerm(p);
- while(p->s[0]=='+' || p->s[0]=='-'){
- int sign= p->s[0]=='-';
- double d;
-
- p->s++;
- evalTerm(p);
- d= pop(p);
- if(sign) d= -d;
- push(p, d + pop(p));
- }
-}
-
-double x264_eval(char *s, double *const_value, const char **const_name,
- double (**func1)(void *, double), const char **func1_name,
- double (**func2)(void *, double, double), char **func2_name,
- void *opaque){
- Parser p;
-
- p.stack_index=0;
- p.s= s;
- p.const_value= const_value;
- p.const_name = const_name;
- p.func1 = func1;
- p.func1_name = func1_name;
- p.func2 = func2;
- p.func2_name = func2_name;
- p.opaque = opaque;
-
- evalExpression(&p);
- return pop(&p);
-}

Changed: x264-snapshot-20080829-2245.tar.bz2

Changed: x264-snapshot-20081001-2245.tar.bz2/.git/index

Added: x264-snapshot-20081001-2245.tar.bz2/.git/objects/pack/pack-cbee77041ce0953f87213f3295f7bddd63f94b6d.idx

Added: x264-snapshot-20081001-2245.tar.bz2/.git/objects/pack/pack-cbee77041ce0953f87213f3295f7bddd63f94b6d.pack

Changed: x264-snapshot-20081001-2245.tar.bz2/.git/refs/heads/master
@@ -1 +1 @@
-fd1de69b8054ef718b52f5ae1520267a5e5402e8
+2324c7074585b8b3f56e49ae41df9cbca06f6185

Changed: x264-snapshot-20081001-2245.tar.bz2/.git/refs/heads/origin
@@ -1 +1 @@
-fd1de69b8054ef718b52f5ae1520267a5e5402e8
+2324c7074585b8b3f56e49ae41df9cbca06f6185

Changed: x264-snapshot-20081001-2245.tar.bz2/AUTHORS
@@ -15,9 +15,6 @@
D: Motion estimation (subpel and mixed refs)
D: B-RDO
-N: Andrew Dunstan
-D: win64 asm port
-
N: bobololo
D: Avisynth input
D: MP4 muxing
@@ -26,9 +23,6 @@
E: sennindemokrit AT gmx DOT net
D: x86 asm
-N: Placeholder
-D: Altivec optimizations
-
N: Eric Petit
E: eric.petit AT lapsus DOT org
C: titer
@@ -36,9 +30,6 @@
D: BeOS and MacOS X ports.
S: France
-N: Francesco Corriga
-D: VfW
-
N: Gabriel Bouvigne
E: gabriel.bouvigne AT joost DOT com
D: 2pass VBV
@@ -54,12 +45,6 @@
D: various speed optimizations, bugfixes
S: USA
-N: Justin Clay
-E: justin_clay AT hotmail DOT com
-C: wheatgerm
-D: Inital work on VfW
-S: Nova Scotia, Canada
-
N: Laurent Aimar
E: fenrir AT via.ecp DOT fr
C: fenrir
@@ -96,7 +81,6 @@
C: chenm001
D: Win32/VC 6.0 port
D: gcc asm to nasm conversion
-D: VfW
S: China
N: Phil Jensen
@@ -107,10 +91,6 @@
E: radoslaw AT syskin DOT cjb DOT net
D: Cached motion compensation
-N: Riccardo Stievano
-E: walkunafraid AT tin DOT it
-D: VfW
-
N: Tuukka Toivonen
E: tuukkat AT ee DOT oulu DOT fi
D: Visualization

Changed: x264-snapshot-20081001-2245.tar.bz2/Makefile
@@ -10,7 +10,7 @@
common/quant.c common/vlc.c \
encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
encoder/set.c encoder/macroblock.c encoder/cabac.c \
- encoder/cavlc.c encoder/encoder.c encoder/eval.c
+ encoder/cavlc.c encoder/encoder.c
SRCCLI = x264.c matroska.c muxers.c
@@ -161,7 +161,7 @@
ifeq ($(SYS),MINGW)
$(if $(SONAME), install -m 755 $(SONAME) $(DESTDIR)$(bindir))
else
- $(if $(SONAME), ln -sf $(SONAME) $(DESTDIR)$(libdir)/libx264.so)
+ $(if $(SONAME), ln -sf $(SONAME) $(DESTDIR)$(libdir)/libx264.$(SOSUFFIX))
$(if $(SONAME), install -m 755 $(SONAME) $(DESTDIR)$(libdir))
endif
$(if $(IMPLIBNAME), install -m 644 $(IMPLIBNAME) $(DESTDIR)$(libdir))
@@ -172,7 +172,7 @@
uninstall:
rm -f $(DESTDIR)$(includedir)/x264.h $(DESTDIR)$(libdir)/libx264.a
rm -f $(DESTDIR)$(bindir)/x264 $(DESTDIR)$(libdir)/pkgconfig/x264.pc
- $(if $(SONAME), rm -f $(DESTDIR)$(libdir)/$(SONAME) $(DESTDIR)$(libdir)/libx264.so)
+ $(if $(SONAME), rm -f $(DESTDIR)$(libdir)/$(SONAME) $(DESTDIR)$(libdir)/libx264.$(SOSUFFIX))
$(MAKE) -C gtk uninstall
etags: TAGS

Changed: x264-snapshot-20081001-2245.tar.bz2/common/bs.h
@@ -76,7 +76,11 @@
s->i_left -= i_count;
if( s->i_left <= 32 )
{
+#ifdef WORDS_BIGENDIAN
+ *(uint32_t*)s->p = s->cur_bits >> (32 - s->i_left);
+#else
*(uint32_t*)s->p = endian_fix( s->cur_bits << s->i_left );
+#endif
s->i_left += 32;
s->p += 4;
}

Changed: x264-snapshot-20081001-2245.tar.bz2/common/cabac.h
@@ -40,7 +40,7 @@
/* aligned for memcpy_aligned starting here */
DECLARE_ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
-
+
/* context */
uint8_t state[460];
} x264_cabac_t;

Changed: x264-snapshot-20081001-2245.tar.bz2/common/common.c
@@ -69,7 +69,7 @@
param->i_keyint_min = 25;
param->i_bframe = 0;
param->i_scenecut_threshold = 40;
- param->b_bframe_adaptive = 1;
+ param->i_bframe_adaptive = X264_B_ADAPT_FAST;
param->i_bframe_bias = 0;
param->b_bframe_pyramid = 0;
@@ -93,14 +93,13 @@
param->rc.i_qp_step = 4;
param->rc.f_ip_factor = 1.4;
param->rc.f_pb_factor = 1.3;
- param->rc.i_aq_mode = X264_AQ_GLOBAL;
+ param->rc.i_aq_mode = X264_AQ_VARIANCE;
param->rc.f_aq_strength = 1.0;
param->rc.b_stat_write = 0;
param->rc.psz_stat_out = "x264_2pass.log";
param->rc.b_stat_read = 0;
param->rc.psz_stat_in = "x264_2pass.log";
- param->rc.psz_rc_eq = "blurCplx^(1-qComp)";
param->rc.f_qcompress = 0.6;
param->rc.f_qblur = 0.5;
param->rc.f_complexity_blur = 20;
@@ -117,8 +116,10 @@
| X264_ANALYSE_PSUB16x16 | X264_ANALYSE_BSUB16x16;
param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
param->analyse.i_me_method = X264_ME_HEX;
+ param->analyse.f_psy_rd = 1.0;
+ param->analyse.f_psy_trellis = 0;
param->analyse.i_me_range = 16;
- param->analyse.i_subpel_refine = 5;
+ param->analyse.i_subpel_refine = 6;
param->analyse.b_chroma_me = 1;
param->analyse.i_mv_range_thread = -1;
param->analyse.i_mv_range = -1; // set from level_idc
@@ -169,12 +170,12 @@
static int x264_atobool( const char *str, int *b_error )
{
- if( !strcmp(str, "1") ||
- !strcmp(str, "true") ||
+ if( !strcmp(str, "1") ||
+ !strcmp(str, "true") ||
!strcmp(str, "yes") )
return 1;
- if( !strcmp(str, "0") ||
- !strcmp(str, "false") ||
+ if( !strcmp(str, "0") ||
+ !strcmp(str, "false") ||
!strcmp(str, "no") )
return 0;
*b_error = 1;
@@ -329,7 +330,14 @@
OPT("bframes")
p->i_bframe = atoi(value);
OPT("b-adapt")
- p->b_bframe_adaptive = atobool(value);
+ {
+ p->i_bframe_adaptive = atobool(value);
+ if( b_error )
+ {
+ b_error = 0;
+ p->i_bframe_adaptive = atoi(value);
+ }
+ }
OPT("b-bias")
p->i_bframe_bias = atoi(value);
OPT("b-pyramid")
@@ -464,6 +472,21 @@
p->analyse.i_mv_range_thread = atoi(value);
OPT2("subme", "subq")
p->analyse.i_subpel_refine = atoi(value);
+ OPT("psy-rd")
+ {
+ if( 2 == sscanf( value, "%f:%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) ||
+ 2 == sscanf( value, "%f,%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) )
+ { }
+ else if( sscanf( value, "%f", &p->analyse.f_psy_rd ) )
+ {
+ p->analyse.f_psy_trellis = 0;
+ }
+ else
+ {
+ p->analyse.f_psy_rd = 0;
+ p->analyse.f_psy_trellis = 0;
+ }
+ }
OPT("bime")
p->analyse.b_bidir_me = atobool(value);
OPT("chroma-me")
@@ -532,8 +555,6 @@
p->rc.psz_stat_in = strdup(value);
p->rc.psz_stat_out = strdup(value);
}
- OPT("rceq")
- p->rc.psz_rc_eq = strdup(value);
OPT("qcomp")
p->rc.f_qcompress = atof(value);
OPT("qblur")
@@ -644,7 +665,6 @@
uint8_t *dst = p_data;
uint8_t *src = nal->p_payload;
uint8_t *end = &nal->p_payload[nal->i_payload];
-
int i_count = 0;
/* FIXME this code doesn't check overflow */
@@ -669,13 +689,9 @@
i_count = 0;
}
if( *src == 0 )
- {
i_count++;
- }
else
- {
i_count = 0;
- }
*dst++ = *src++;
}
*pi_data = dst - (uint8_t*)p_data;
@@ -683,37 +699,6 @@
return *pi_data;
}
-/****************************************************************************
- * x264_nal_decode:
- ****************************************************************************/
-int x264_nal_decode( x264_nal_t *nal, void *p_data, int i_data )
-{
- uint8_t *src = p_data;
- uint8_t *end = &src[i_data];
- uint8_t *dst = nal->p_payload;
-
- nal->i_type = src[0]&0x1f;
- nal->i_ref_idc = (src[0] >> 5)&0x03;
-
- src++;
-
- while( src < end )
- {
- if( src < end - 3 && src[0] == 0x00 && src[1] == 0x00 && src[2] == 0x03 )
- {
- *dst++ = 0x00;
- *dst++ = 0x00;
-
- src += 3;
- continue;
- }
- *dst++ = *src++;
- }
-
- nal->i_payload = dst - (uint8_t*)p_data;
- return 0;
-}
-
/****************************************************************************
@@ -856,6 +841,7 @@
s += sprintf( s, " analyse=%#x:%#x", p->analyse.intra, p->analyse.inter );
s += sprintf( s, " me=%s", x264_motion_est_names[ p->analyse.i_me_method ] );
s += sprintf( s, " subme=%d", p->analyse.i_subpel_refine );
+ s += sprintf( s, " psy_rd=%.1f:%.1f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis );
s += sprintf( s, " brdo=%d", p->analyse.b_bframe_rdo );
s += sprintf( s, " mixed_ref=%d", p->analyse.b_mixed_references );
s += sprintf( s, " me_range=%d", p->analyse.i_me_range );
@@ -874,7 +860,7 @@
if( p->i_bframe )
{
s += sprintf( s, " b_pyramid=%d b_adapt=%d b_bias=%d direct=%d wpredb=%d bime=%d",
- p->b_bframe_pyramid, p->b_bframe_adaptive, p->i_bframe_bias,
+ p->b_bframe_pyramid, p->i_bframe_adaptive, p->i_bframe_bias,
p->analyse.i_direct_mv_pred, p->analyse.b_weighted_bipred,
p->analyse.b_bidir_me );
}
@@ -893,9 +879,8 @@
else
s += sprintf( s, " bitrate=%d ratetol=%.1f",
p->rc.i_bitrate, p->rc.f_rate_tolerance );
- s += sprintf( s, " rceq='%s' qcomp=%.2f qpmin=%d qpmax=%d qpstep=%d",
- p->rc.psz_rc_eq, p->rc.f_qcompress,
- p->rc.i_qp_min, p->rc.i_qp_max, p->rc.i_qp_step );
+ s += sprintf( s, " qcomp=%.2f qpmin=%d qpmax=%d qpstep=%d",
+ p->rc.f_qcompress, p->rc.i_qp_min, p->rc.i_qp_max, p->rc.i_qp_step );
if( p->rc.b_stat_read )
s += sprintf( s, " cplxblur=%.1f qblur=%.1f",
p->rc.f_complexity_blur, p->rc.f_qblur );

Changed: x264-snapshot-20081001-2245.tar.bz2/common/common.h
@@ -262,6 +262,8 @@
int i_frame_size;
} out;
+ /**** thread synchronization starts here ****/
+
/* frame number/poc */
int i_frame;
@@ -294,6 +296,8 @@
uint16_t (*quant4_bias[4])[16]; /* [4][52][16] */
uint16_t (*quant8_bias[2])[64]; /* [2][52][64] */
+ const uint8_t *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */
+
DECLARE_ALIGNED_16( uint32_t nr_residual_sum[2][64] );
DECLARE_ALIGNED_16( uint16_t nr_offset[2][64] );
uint32_t nr_count[2];
@@ -307,11 +311,11 @@
struct
{
/* Frames to be encoded (whose types have been decided) */
- x264_frame_t *current[X264_BFRAME_MAX+3];
+ x264_frame_t *current[X264_BFRAME_MAX*4+3];
/* Temporary buffer (frames types not yet decided) */
- x264_frame_t *next[X264_BFRAME_MAX+3];
+ x264_frame_t *next[X264_BFRAME_MAX*4+3];
/* Unused frames */
- x264_frame_t *unused[X264_BFRAME_MAX + X264_THREAD_MAX*2 + 16+4];
+ x264_frame_t *unused[X264_BFRAME_MAX*4 + X264_THREAD_MAX*2 + 16+4];
/* For adaptive B decision */
x264_frame_t *last_nonb;
@@ -370,13 +374,15 @@
int i_mb_xy;
int i_b8_xy;
int i_b4_xy;
-
+
/* Search parameters */
int i_me_method;
int i_subpel_refine;
int b_chroma_me;
int b_trellis;
int b_noise_reduction;
+ int i_psy_rd; /* Psy RD strength--fixed point value*/
+ int i_psy_trellis; /* Psy trellis strength--fixed point value*/
int b_interlaced;
@@ -395,13 +401,17 @@
unsigned int i_neighbour;
unsigned int i_neighbour8[4]; /* neighbours of each 8x8 or 4x4 block that are available */
unsigned int i_neighbour4[16]; /* at the time the block is coded */
- int i_mb_type_top;
- int i_mb_type_left;
- int i_mb_type_topleft;
- int i_mb_type_topright;
+ int i_mb_type_top;
+ int i_mb_type_left;
+ int i_mb_type_topleft;
+ int i_mb_type_topright;
int i_mb_prev_xy;
int i_mb_top_xy;
+ /**** thread synchronization ends here ****/
+ /* subsequent variables are either thread-local or constant,
+ * and won't be copied from one thread to another */
+
/* mb table */
int8_t *type; /* mb type */
int8_t *qp; /* mb qp */
@@ -448,14 +458,26 @@
DECLARE_ALIGNED_16( uint8_t fenc_buf[24*FENC_STRIDE] );
DECLARE_ALIGNED_16( uint8_t fdec_buf[27*FDEC_STRIDE] );
- /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
+ /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
DECLARE_ALIGNED_16( uint8_t i4x4_fdec_buf[16*16] );
DECLARE_ALIGNED_16( uint8_t i8x8_fdec_buf[16*16] );
DECLARE_ALIGNED_16( int16_t i8x8_dct_buf[3][64] );
DECLARE_ALIGNED_16( int16_t i4x4_dct_buf[15][16] );
+ /* Psy trellis DCT data */
+ DECLARE_ALIGNED_16( int16_t fenc_dct8[4][64] );
+ DECLARE_ALIGNED_16( int16_t fenc_dct4[16][16] );
+
+ /* Psy RD SATD scores */
+ int fenc_satd[4][4];
+ int fenc_satd_sum;
+ int fenc_sa8d[2][2];
+ int fenc_sa8d_sum;
+
/* pointer over mb of the frame to be compressed */
uint8_t *p_fenc[3];
+ /* pointer to the actual source frame, not a block copy */
+ uint8_t *p_fenc_plane[3];
/* pointer over mb of the frame to be reconstructed */
uint8_t *p_fdec[3];
@@ -524,11 +546,10 @@
/* Current frame stats */
struct
{
- /* Headers bits (MV+Ref+MB Block Type */
- int i_hdr_bits;
- /* Texture bits (Intra/Predicted) */
- int i_itex_bits;
- int i_ptex_bits;
+ /* MV bits (MV+Ref+Block Type) */
+ int i_mv_bits;
+ /* Texture bits (DCT coefs) */
+ int i_tex_bits;
/* ? */
int i_misc_bits;
/* MB type counts */
@@ -559,7 +580,7 @@
double f_slice_qp[5];
int i_consecutive_bframes[X264_BFRAME_MAX+1];
/* */
- int64_t i_sqe_global[5];
+ int64_t i_ssd_global[5];
double f_psnr_average[5];
double f_psnr_mean_y[5];
double f_psnr_mean_u[5];

Changed: x264-snapshot-20081001-2245.tar.bz2/common/dct.c
@@ -460,45 +460,62 @@
// gcc pessimizes multi-dimensional arrays here, even with constant indices
#define ZIG(i,y,x) level[i] = dct[0][x*8+y];
+#define ZIGZAG8_FRAME\
+ ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
+ ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
+ ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
+ ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
+ ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
+ ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
+ ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
+ ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
+ ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
+ ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
+ ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
+ ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
+ ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
+ ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
+ ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
+ ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)\
+
+#define ZIGZAG8_FIELD\
+ ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
+ ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
+ ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
+ ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
+ ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
+ ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
+ ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
+ ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
+ ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
+ ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
+ ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
+ ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
+ ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
+ ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
+ ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
+ ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
+
+#define ZIGZAG4_FRAME\
+ ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
+ ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
+ ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
+ ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
+
+#define ZIGZAG4_FIELD\
+ ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
+ ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
+ ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
+ ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
{
- ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
- ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
- ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)
- ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)
- ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)
- ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)
- ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)
- ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)
- ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)
- ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)
- ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)
- ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)
- ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)
- ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)
- ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)
- ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)
+ ZIGZAG8_FRAME
}
static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
{
- ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)
- ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)
- ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)
- ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)
- ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)
- ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)
- ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)
- ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)
- ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)
- ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)
- ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)
- ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)
- ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)
- ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)
- ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)
- ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
+ ZIGZAG8_FIELD
}
#undef ZIG
@@ -506,10 +523,7 @@
static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
{
- ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
- ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
- ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)
- ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
+ ZIGZAG4_FRAME
}
static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
@@ -531,26 +545,40 @@
*(uint32_t*)(p_dst+0*FDEC_STRIDE) = *(uint32_t*)(p_src+0*FENC_STRIDE);\
*(uint32_t*)(p_dst+1*FDEC_STRIDE) = *(uint32_t*)(p_src+1*FENC_STRIDE);\
*(uint32_t*)(p_dst+2*FDEC_STRIDE) = *(uint32_t*)(p_src+2*FENC_STRIDE);\
- *(uint32_t*)(p_dst+3*FDEC_STRIDE) = *(uint32_t*)(p_src+3*FENC_STRIDE);\
+ *(uint32_t*)(p_dst+3*FDEC_STRIDE) = *(uint32_t*)(p_src+3*FENC_STRIDE);
+#define COPY8x8\
+ *(uint64_t*)(p_dst+0*FDEC_STRIDE) = *(uint64_t*)(p_src+0*FENC_STRIDE);\
+ *(uint64_t*)(p_dst+1*FDEC_STRIDE) = *(uint64_t*)(p_src+1*FENC_STRIDE);\
+ *(uint64_t*)(p_dst+2*FDEC_STRIDE) = *(uint64_t*)(p_src+2*FENC_STRIDE);\
+ *(uint64_t*)(p_dst+3*FDEC_STRIDE) = *(uint64_t*)(p_src+3*FENC_STRIDE);\
+ *(uint64_t*)(p_dst+4*FDEC_STRIDE) = *(uint64_t*)(p_src+4*FENC_STRIDE);\
+ *(uint64_t*)(p_dst+5*FDEC_STRIDE) = *(uint64_t*)(p_src+5*FENC_STRIDE);\
+ *(uint64_t*)(p_dst+6*FDEC_STRIDE) = *(uint64_t*)(p_src+6*FENC_STRIDE);\
+ *(uint64_t*)(p_dst+7*FDEC_STRIDE) = *(uint64_t*)(p_src+7*FENC_STRIDE);
static void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
{
- ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
- ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
- ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)
- ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
+ ZIGZAG4_FRAME
COPY4x4
}
static void zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
{
- ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)
- ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)
- ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)
- ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
+ ZIGZAG4_FIELD
COPY4x4
}
+static void zigzag_sub_8x8_frame( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
+{
+ ZIGZAG8_FRAME
+ COPY8x8
+}
+static void zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
+{
+ ZIGZAG8_FIELD
+ COPY8x8
+}
+
#undef ZIG
#undef COPY4x4
@@ -560,6 +588,7 @@
{
pf->scan_8x8 = zigzag_scan_8x8_field;
pf->scan_4x4 = zigzag_scan_4x4_field;
+ pf->sub_8x8 = zigzag_sub_8x8_field;
pf->sub_4x4 = zigzag_sub_4x4_field;
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMXEXT )
@@ -575,10 +604,22 @@
{
pf->scan_8x8 = zigzag_scan_8x8_frame;
pf->scan_4x4 = zigzag_scan_4x4_frame;
+ pf->sub_8x8 = zigzag_sub_8x8_frame;
pf->sub_4x4 = zigzag_sub_4x4_frame;
#ifdef HAVE_MMX
+ if( cpu&X264_CPU_MMX )
+ pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
+ if( cpu&X264_CPU_MMXEXT )
+ pf->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
+ if( cpu&X264_CPU_SSE2_IS_FAST )
+ pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
if( cpu&X264_CPU_SSSE3 )
- pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
+ {
+ pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
+ pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
+ }
+ if( cpu&X264_CPU_PHADD_IS_FAST )
+ pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
#endif
#ifdef ARCH_PPC

Changed: x264-snapshot-20081001-2245.tar.bz2/common/dct.h
@@ -41,6 +41,17 @@
};
#undef W
+#define W(i) (i==0 ? FIX8(1.76777) :\
+ i==1 ? FIX8(1.11803) :\
+ i==2 ? FIX8(0.70711) :0)
+static const uint16_t x264_dct4_weight_tab[16] = {
+ W(0), W(1), W(0), W(1),
+ W(1), W(2), W(1), W(2),
+ W(0), W(1), W(0), W(1),
+ W(1), W(2), W(1), W(2)
+};
+#undef W
+
/* inverse squared */
#define W(i) (i==0 ? FIX8(3.125) :\
i==1 ? FIX8(1.25) :\
@@ -107,6 +118,7 @@
{
void (*scan_8x8)( int16_t level[64], int16_t dct[8][8] );
void (*scan_4x4)( int16_t level[16], int16_t dct[4][4] );
+ void (*sub_8x8)( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst );
void (*sub_4x4)( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst );
} x264_zigzag_function_t;

Changed: x264-snapshot-20081001-2245.tar.bz2/common/frame.c
@@ -77,6 +77,14 @@
CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
for( i = 0; i < 4; i++ )
frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size;
+
+ for( j = 0; j <= !!h->param.i_bframe; j++ )
+ for( i = 0; i <= h->param.i_bframe; i++ )
+ {
+ CHECKED_MALLOC( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
+ memset( frame->lowres_mvs[j][i], 0, 2*h->mb.i_mb_count*sizeof(int16_t) );
+ CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
+ }
}
if( h->param.analyse.i_me_method >= X264_ME_ESA )
@@ -97,6 +105,7 @@
CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
+ CHECKED_MALLOC( frame->i_intra_cost, i_mb_count * sizeof(uint16_t) );
if( h->param.i_bframe )
{
CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
@@ -114,6 +123,9 @@
for( j = 0; j < h->param.i_bframe + 2; j++ )
CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
+ if( h->param.rc.i_aq_mode )
+ CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
+
x264_pthread_mutex_init( &frame->mutex, NULL );
x264_pthread_cond_init( &frame->cv, NULL );
@@ -134,6 +146,13 @@
for( i = 0; i < X264_BFRAME_MAX+2; i++ )
for( j = 0; j < X264_BFRAME_MAX+2; j++ )
x264_free( frame->i_row_satds[i][j] );
+ for( j = 0; j < 2; j++ )
+ for( i = 0; i <= X264_BFRAME_MAX; i++ )
+ {
+ x264_free( frame->lowres_mvs[j][i] );
+ x264_free( frame->lowres_mv_costs[j][i] );
+ }
+ x264_free( frame->f_qp_offset );
x264_free( frame->i_row_bits );
x264_free( frame->i_row_qp );
x264_free( frame->mb_type );
@@ -233,7 +252,7 @@
void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
{
/* during filtering, 8 extra pixels were filtered on each edge,
- * but up to 3 of the horizontal ones may be wrong.
+ * but up to 3 of the horizontal ones may be wrong.
we want to expand border from the last filtered pixel */
int b_start = !mb_y;
int stride = frame->i_stride[0];
@@ -297,7 +316,7 @@
/* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
* entropy coding, but per 64 coeffs for the purpose of deblocking */
-void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
+static void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
{
uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
@@ -338,82 +357,86 @@
/* Deblocking filter */
-
-static const int i_alpha_table[52] =
+static const uint8_t i_alpha_table[52+12*2] =
{
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 4, 4, 5, 6,
7, 8, 9, 10, 12, 13, 15, 17, 20, 22,
25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
80, 90,101,113,127,144,162,182,203,226,
- 255, 255
+ 255,255,
+ 255,255,255,255,255,255,255,255,255,255,255,255,
};
-static const int i_beta_table[52] =
+static const uint8_t i_beta_table[52+12*2] =
{
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 2, 2, 2, 3,
3, 3, 3, 4, 4, 4, 6, 6, 7, 7,
8, 8, 9, 9, 10, 10, 11, 11, 12, 12,
13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
- 18, 18
+ 18, 18,
+ 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
};
-static const int i_tc0_table[52][3] =
+static const int8_t i_tc0_table[52+12*2][4] =
{
- { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
- { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
- { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 1 },
- { 0, 0, 1 }, { 0, 0, 1 }, { 0, 0, 1 }, { 0, 1, 1 }, { 0, 1, 1 }, { 1, 1, 1 },
- { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 2 }, { 1, 1, 2 }, { 1, 1, 2 },
- { 1, 1, 2 }, { 1, 2, 3 }, { 1, 2, 3 }, { 2, 2, 3 }, { 2, 2, 4 }, { 2, 3, 4 },
- { 2, 3, 4 }, { 3, 3, 5 }, { 3, 4, 6 }, { 3, 4, 6 }, { 4, 5, 7 }, { 4, 5, 8 },
- { 4, 6, 9 }, { 5, 7,10 }, { 6, 8,11 }, { 6, 8,13 }, { 7,10,14 }, { 8,11,16 },
- { 9,12,18 }, {10,13,20 }, {11,15,23 }, {13,17,25 }
+ {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+ {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+ {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+ {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+ {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
+ {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
+ {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
+ {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
+ {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
+ {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
+ {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
+ {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+ {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
};
+#define alpha_table(x) i_alpha_table[(x)+12]
+#define beta_table(x) i_beta_table[(x)+12]
+#define tc0_table(x) i_tc0_table[(x)+12]
/* From ffmpeg */
-static inline int clip_uint8( int a )
-{
- if (a&(~255))
- return (-a)>>31;
- else
- return a;
-}
-
static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
int i, d;
- for( i = 0; i < 4; i++ ) {
- if( tc0[i] < 0 ) {
+ for( i = 0; i < 4; i++ )
+ {
+ if( tc0[i] < 0 )
+ {
pix += 4*ystride;
continue;
}
- for( d = 0; d < 4; d++ ) {
+ for( d = 0; d < 4; d++ )
+ {
const int p2 = pix[-3*xstride];
const int p1 = pix[-2*xstride];
const int p0 = pix[-1*xstride];
const int q0 = pix[ 0*xstride];
const int q1 = pix[ 1*xstride];
const int q2 = pix[ 2*xstride];
-
- if( abs( p0 - q0 ) < alpha &&
- abs( p1 - p0 ) < beta &&
- abs( q1 - q0 ) < beta ) {
-
+
+ if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+ {
int tc = tc0[i];
int delta;
-
- if( abs( p2 - p0 ) < beta ) {
+ if( abs( p2 - p0 ) < beta )
+ {
pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
- tc++;
+ tc++;
}
- if( abs( q2 - q0 ) < beta ) {
+ if( abs( q2 - q0 ) < beta )
+ {
pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
tc++;
}
-
+
delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
- pix[-1*xstride] = clip_uint8( p0 + delta ); /* p0' */
- pix[ 0*xstride] = clip_uint8( q0 - delta ); /* q0' */
+ pix[-1*xstride] = x264_clip_uint8( p0 + delta ); /* p0' */
+ pix[ 0*xstride] = x264_clip_uint8( q0 - delta ); /* q0' */
}
pix += ystride;
}
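An illustrative aside on the padded alpha/beta/tc0 tables above (not part of the patch): H.264 confines the slice alpha_c0/beta offsets to -12..12, so replicating the first and last table entries twelve times on each side makes the old x264_clip3() of the QP index unnecessary. A minimal standalone check of that equivalence, with placeholder table contents and a local clip3 helper:

#include <assert.h>
#include <string.h>

/* placeholder contents; only the 12-entry edge replication matters here */
static unsigned char base[52];
static unsigned char padded[52 + 12*2];
#define padded_lookup(x) padded[(x) + 12]

static int clip3( int v, int lo, int hi )
{
    return v < lo ? lo : v > hi ? hi : v;
}

int main( void )
{
    int i, qp, off;
    for( i = 0; i < 52; i++ )
        base[i] = (unsigned char)i;
    memset( padded, base[0], 12 );            /* replicate the first entry */
    memcpy( padded + 12, base, 52 );
    memset( padded + 12 + 52, base[51], 12 ); /* replicate the last entry */
    /* the slice offsets are confined to [-12,12], so no clamp is needed */
    for( qp = 0; qp <= 51; qp++ )
        for( off = -12; off <= 12; off++ )
            assert( padded_lookup( qp + off ) == base[clip3( qp + off, 0, 51 )] );
    return 0;
}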
@@ -421,7 +444,7 @@
}
static void deblock_v_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
- deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
+ deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
}
static void deblock_h_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
@@ -431,43 +454,45 @@
static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
int i, d;
- for( i = 0; i < 4; i++ ) {
+ for( i = 0; i < 4; i++ )
+ {
const int tc = tc0[i];
- if( tc <= 0 ) {
+ if( tc <= 0 )
+ {
pix += 2*ystride;
continue;
}
- for( d = 0; d < 2; d++ ) {
+ for( d = 0; d < 2; d++ )
+ {
const int p1 = pix[-2*xstride];
const int p0 = pix[-1*xstride];
const int q0 = pix[ 0*xstride];
const int q1 = pix[ 1*xstride];
- if( abs( p0 - q0 ) < alpha &&
- abs( p1 - p0 ) < beta &&
- abs( q1 - q0 ) < beta ) {
-
+ if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+ {
int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
- pix[-1*xstride] = clip_uint8( p0 + delta ); /* p0' */
- pix[ 0*xstride] = clip_uint8( q0 - delta ); /* q0' */
+ pix[-1*xstride] = x264_clip_uint8( p0 + delta ); /* p0' */
+ pix[ 0*xstride] = x264_clip_uint8( q0 - delta ); /* q0' */
}
pix += ystride;
}
}
}
static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
-{
+{
deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 );
}
static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
-{
+{
deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 );
}
static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
{
int d;
- for( d = 0; d < 16; d++ ) {
+ for( d = 0; d < 16; d++ )
+ {
const int p2 = pix[-3*xstride];
const int p1 = pix[-2*xstride];
const int p0 = pix[-1*xstride];
@@ -475,35 +500,31 @@
const int q1 = pix[ 1*xstride];
const int q2 = pix[ 2*xstride];
- if( abs( p0 - q0 ) < alpha &&
- abs( p1 - p0 ) < beta &&
- abs( q1 - q0 ) < beta ) {
-
- if(abs( p0 - q0 ) < ((alpha >> 2) + 2) ){
- if( abs( p2 - p0 ) < beta)
+ if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+ {
+ if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )
+ {
+ if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
{
const int p3 = pix[-4*xstride];
- /* p0', p1', p2' */
pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
- } else {
- /* p0' */
- pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
}
- if( abs( q2 - q0 ) < beta)
+ else /* p0' */
+ pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+ if( abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
{
const int q3 = pix[3*xstride];
- /* q0', q1', q2' */
pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
- } else {
- /* q0' */
- pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
}
- }else{
- /* p0', q0' */
+ else /* q0' */
+ pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+ }
+ else /* p0', q0' */
+ {
pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
}
@@ -512,59 +533,72 @@
}
}
static void deblock_v_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
-{
+{
deblock_luma_intra_c( pix, stride, 1, alpha, beta );
}
static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
-{
+{
deblock_luma_intra_c( pix, 1, stride, alpha, beta );
}
static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
-{
- int d;
- for( d = 0; d < 8; d++ ) {
+{
+ int d;
+ for( d = 0; d < 8; d++ )
+ {
const int p1 = pix[-2*xstride];
const int p0 = pix[-1*xstride];
const int q0 = pix[ 0*xstride];
const int q1 = pix[ 1*xstride];
- if( abs( p0 - q0 ) < alpha &&
- abs( p1 - p0 ) < beta &&
- abs( q1 - q0 ) < beta ) {
-
+ if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+ {
pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2; /* p0' */
pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */
}
-
pix += ystride;
}
}
static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
-{
+{
deblock_chroma_intra_c( pix, stride, 1, alpha, beta );
}
static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
-{
+{
deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
}
-static inline void deblock_edge( x264_t *h, uint8_t *pix, int i_stride, int bS[4], int i_qp, int b_chroma,
- x264_deblock_inter_t pf_inter, x264_deblock_intra_t pf_intra )
+static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
{
- int i;
- const int index_a = x264_clip3( i_qp + h->sh.i_alpha_c0_offset, 0, 51 );
- const int alpha = i_alpha_table[index_a];
- const int beta = i_beta_table[x264_clip3( i_qp + h->sh.i_beta_offset, 0, 51 )];
-
- if( bS[0] < 4 ) {
- int8_t tc[4];
- for(i=0; i<4; i++)
- tc[i] = (bS[i] ? i_tc0_table[index_a][bS[i] - 1] : -1) + b_chroma;
- pf_inter( pix, i_stride, alpha, beta, tc );
- } else {
- pf_intra( pix, i_stride, alpha, beta );
- }
+ const int index_a = i_qp + h->sh.i_alpha_c0_offset;
+ const int alpha = alpha_table(index_a);
+ const int beta = beta_table(i_qp + h->sh.i_beta_offset);
+ int8_t tc[4];
+
+ if( !alpha || !beta )
+ return;
+
+ tc[0] = tc0_table(index_a)[bS[0]] + b_chroma;
+ tc[1] = tc0_table(index_a)[bS[1]] + b_chroma;
+ tc[2] = tc0_table(index_a)[bS[2]] + b_chroma;
+ tc[3] = tc0_table(index_a)[bS[3]] + b_chroma;
+
+ pf_inter( pix1, i_stride, alpha, beta, tc );
+ if( b_chroma )
+ pf_inter( pix2, i_stride, alpha, beta, tc );
+}
+
+static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
+{
+ const int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset);
+ const int beta = beta_table(i_qp + h->sh.i_beta_offset);
+
+ if( !alpha || !beta )
+ return;
+
+ pf_intra( pix1, i_stride, alpha, beta );
+ if( b_chroma )
+ pf_intra( pix2, i_stride, alpha, beta );
}
void x264_frame_deblock_row( x264_t *h, int mb_y )
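On the extra leading column of the tc0 table: the old code branched on bS[i] and substituted -1 when the strength was zero; storing -1 in column 0 lets the new deblock_edge() index the table with bS[i] directly. A small standalone check of the equivalence, using made-up values for one QP row (not part of the patch):

#include <assert.h>
#include <stdint.h>

static const int8_t old_row[3] = { 1, 2, 3 };     /* indexed by bS-1, arbitrary values */
static const int8_t new_row[4] = { -1, 1, 2, 3 }; /* same row with -1 prepended, indexed by bS */

int main( void )
{
    int bS, b_chroma;
    for( b_chroma = 0; b_chroma <= 1; b_chroma++ )
        for( bS = 0; bS < 4; bS++ )
            assert( (bS ? old_row[bS-1] : -1) + b_chroma == new_row[bS] + b_chroma );
    return 0;
}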
@@ -573,152 +607,159 @@
const int s4x4 = 4 * h->mb.i_mb_stride;
const int b_interlaced = h->sh.b_mbaff;
const int mvy_limit = 4 >> b_interlaced;
+ const int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
int mb_x;
-
- int i_stride2[3] = { h->fdec->i_stride[0] << b_interlaced,
- h->fdec->i_stride[1] << b_interlaced,
- h->fdec->i_stride[2] << b_interlaced };
+ int stridey = h->fdec->i_stride[0];
+ int stride2y = stridey << b_interlaced;
+ int strideuv = h->fdec->i_stride[1];
+ int stride2uv = strideuv << b_interlaced;
if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, munge_cavlc_nnz_row );
- for( mb_x = 0; mb_x < h->sps->i_mb_width; )
+ for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
{
const int mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
const int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
- const int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
- int i_edge;
-
- int i_pix_y[3] = { 16*mb_y*h->fdec->i_stride[0] + 16*mb_x,
- 8*mb_y*h->fdec->i_stride[1] + 8*mb_x,
- 8*mb_y*h->fdec->i_stride[2] + 8*mb_x };
+ const int i_qp = h->mb.qp[mb_xy];
+ int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
+ uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
+ uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
+ uint8_t *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
if( b_interlaced && (mb_y&1) )
{
- i_pix_y[0] -= 15*h->fdec->i_stride[0];
- i_pix_y[1] -= 7*h->fdec->i_stride[1];
- i_pix_y[2] -= 7*h->fdec->i_stride[2];
+ pixy -= 15*stridey;
+ pixu -= 7*strideuv;
+ pixv -= 7*strideuv;
}
x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
- /* i_dir == 0 -> vertical edge
- * i_dir == 1 -> horizontal edge */
+ if( i_qp <= qp_thresh )
+ i_edge_end = 1;
- #define deblock_dir(i_dir)\
+ #define FILTER_DIR(intra, i_dir)\
{\
- int i_start = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
- int i_qp, i_qpn;\
- for( i_edge = i_start; i_edge < i_edge_end; i_edge++ )\
+ /* Y plane */\
+ i_qpn= h->mb.qp[mbn_xy];\
+ if( i_dir == 0 )\
{\
- int mbn_xy, mbn_8x8, mbn_4x4;\
- int bS[4]; /* filtering strength */\
- if( b_8x8_transform && (i_edge&1) )\
- continue;\
- mbn_xy = i_edge > 0 ? mb_xy : ( i_dir == 0 ? mb_xy - 1 : mb_xy - h->mb.i_mb_stride );\
- mbn_8x8 = i_edge > 0 ? mb_8x8 : ( i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8 );\
- mbn_4x4 = i_edge > 0 ? mb_4x4 : ( i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4 );\
- if( b_interlaced && i_edge == 0 && i_dir == 1 )\
+ /* vertical edge */\
+ deblock_edge##intra( h, pixy + 4*i_edge, NULL,\
+ stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
+ h->loopf.deblock_h_luma##intra );\
+ if( !(i_edge & 1) )\
{\
- mbn_xy -= h->mb.i_mb_stride;\
- mbn_8x8 -= 2 * s8x8;\
- mbn_4x4 -= 4 * s4x4;\
+ /* U/V planes */\
+ int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
+ deblock_edge##intra( h, pixu + 2*i_edge, pixv + 2*i_edge,\
+ stride2uv, bS, i_qpc, 1,\
+ h->loopf.deblock_h_chroma##intra );\
}\
- /* *** Get bS for each 4px for the current edge *** */\
- if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy] ) )\
- bS[0] = bS[1] = bS[2] = bS[3] = ( i_edge == 0 && !(b_interlaced && i_dir) ? 4 : 3 );\
- else\
+ }\
+ else\
+ {\
+ /* horizontal edge */\
+ deblock_edge##intra( h, pixy + 4*i_edge*stride2y, NULL,\
+ stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
+ h->loopf.deblock_v_luma##intra );\
+ /* U/V planes */\
+ if( !(i_edge & 1) )\
{\
- int i;\
- for( i = 0; i < 4; i++ )\
+ int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
+ deblock_edge##intra( h, pixu + 2*i_edge*stride2uv, pixv + 2*i_edge*stride2uv,\
+ stride2uv, bS, i_qpc, 1,\
+ h->loopf.deblock_v_chroma##intra );\
+ }\
+ }\
+ }
+
+ #define DEBLOCK_STRENGTH(i_dir)\
+ {\
+ /* *** Get bS for each 4px for the current edge *** */\
+ if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
+ *(uint32_t*)bS = 0x03030303;\
+ else\
+ {\
+ *(uint32_t*)bS = 0x00000000;\
+ for( i = 0; i < 4; i++ )\
+ {\
+ int x = i_dir == 0 ? i_edge : i;\
+ int y = i_dir == 0 ? i : i_edge;\
+ int xn = i_dir == 0 ? (x - 1)&0x03 : x;\
+ int yn = i_dir == 0 ? y : (y - 1)&0x03;\
+ if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
+ h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
+ bS[i] = 2;\
+ else\
{\
- int x = i_dir == 0 ? i_edge : i;\
- int y = i_dir == 0 ? i : i_edge;\
- int xn = (x - (i_dir == 0 ? 1 : 0 ))&0x03;\
- int yn = (y - (i_dir == 0 ? 0 : 1 ))&0x03;\
- if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
- h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
- {\
- bS[i] = 2;\
- }\
- else\
- {\
- /* FIXME: A given frame may occupy more than one position in\
- * the reference list. So we should compare the frame numbers,\
- * not the indices in the ref list.\
- * No harm yet, as we don't generate that case.*/\
- int i8p= mb_8x8+(x/2)+(y/2)*s8x8;\
- int i8q= mbn_8x8+(xn/2)+(yn/2)*s8x8;\
- int i4p= mb_4x4+x+y*s4x4;\
- int i4q= mbn_4x4+xn+yn*s4x4;\
- int l;\
- bS[i] = 0;\
- for( l = 0; l < 1 + (h->sh.i_type == SLICE_TYPE_B); l++ )\
+ /* FIXME: A given frame may occupy more than one position in\
+ * the reference list. So we should compare the frame numbers,\
+ * not the indices in the ref list.\
+ * No harm yet, as we don't generate that case.*/\
+ int i8p= mb_8x8+(x>>1)+(y>>1)*s8x8;\
+ int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
+ int i4p= mb_4x4+x+y*s4x4;\
+ int i4q= mbn_4x4+xn+yn*s4x4;\
+ for( l = 0; l < 1 + (h->sh.i_type == SLICE_TYPE_B); l++ )\
+ if( h->mb.ref[l][i8p] != h->mb.ref[l][i8q] ||\
+ abs( h->mb.mv[l][i4p][0] - h->mb.mv[l][i4q][0] ) >= 4 ||\
+ abs( h->mb.mv[l][i4p][1] - h->mb.mv[l][i4q][1] ) >= mvy_limit )\
{\
- if( h->mb.ref[l][i8p] != h->mb.ref[l][i8q] ||\
- abs( h->mb.mv[l][i4p][0] - h->mb.mv[l][i4q][0] ) >= 4 ||\
- abs( h->mb.mv[l][i4p][1] - h->mb.mv[l][i4q][1] ) >= mvy_limit )\
- {\
- bS[i] = 1;\
- break;\
- }\
+ bS[i] = 1;\
+ break;\
}\
- }\
}\
}\
- /* *** filter *** */\
- /* Y plane */\
- i_qp = h->mb.qp[mb_xy];\
- i_qpn= h->mb.qp[mbn_xy];\
- if( i_dir == 0 )\
+ }\
+ }
+
+ /* i_dir == 0 -> vertical edge
+ * i_dir == 1 -> horizontal edge */
+ #define DEBLOCK_DIR(i_dir)\
+ {\
+ int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
+ int i_qpn, i, l, mbn_xy, mbn_8x8, mbn_4x4;\
+ DECLARE_ALIGNED_4( uint8_t bS[4] ); /* filtering strength */\
+ if( i_edge )\
+ i_edge+= b_8x8_transform;\
+ else\
+ {\
+ mbn_xy = i_dir == 0 ? mb_xy - 1 : mb_xy - h->mb.i_mb_stride;\
+ mbn_8x8 = i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8;\
+ mbn_4x4 = i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4;\
+ if( b_interlaced && i_dir == 1 )\
{\
- /* vertical edge */\
- deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge],\
- i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0,\
- h->loopf.deblock_h_luma, h->loopf.deblock_h_luma_intra );\
- if( !(i_edge & 1) )\
- {\
- /* U/V planes */\
- int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +\
- i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;\
- deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge],\
- i_stride2[1], bS, i_qpc, 1,\
- h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra );\
- deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge],\
- i_stride2[2], bS, i_qpc, 1,\
- h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra );\
- }\
+ mbn_xy -= h->mb.i_mb_stride;\
+ mbn_8x8 -= 2 * s8x8;\
+ mbn_4x4 -= 4 * s4x4;\
}\
- else\
+ else if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
{\
- /* horizontal edge */\
- deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge*i_stride2[0]],\
- i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0,\
- h->loopf.deblock_v_luma, h->loopf.deblock_v_luma_intra );\
- /* U/V planes */\
- if( !(i_edge & 1) )\
- {\
- int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +\
- i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;\
- deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge*i_stride2[1]],\
- i_stride2[1], bS, i_qpc, 1,\
- h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra );\
- deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge*i_stride2[2]],\
- i_stride2[2], bS, i_qpc, 1,\
- h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra );\
- }\
+ FILTER_DIR( _intra, i_dir );\
+ goto end##i_dir;\
}\
+ DEBLOCK_STRENGTH(i_dir);\
+ if( *(uint32_t*)bS )\
+ FILTER_DIR( , i_dir);\
+ end##i_dir:\
+ i_edge += b_8x8_transform+1;\
+ }\
+ mbn_xy = mb_xy;\
+ mbn_8x8 = mb_8x8;\
+ mbn_4x4 = mb_4x4;\
+ for( ; i_edge < i_edge_end; i_edge+=b_8x8_transform+1 )\
+ {\
+ DEBLOCK_STRENGTH(i_dir);\
+ if( *(uint32_t*)bS )\
+ FILTER_DIR( , i_dir);\
}\
}
- deblock_dir(0);
- deblock_dir(1);
-
- /* next mb */
- if( !b_interlaced || (mb_y&1) )
- mb_x++;
- mb_y ^= b_interlaced;
+ DEBLOCK_DIR(0);
+ DEBLOCK_DIR(1);
}
if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
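The rewritten for() header above folds the old trailing "next mb" logic into the loop increment: without MBAFF, mb_x advances every iteration; with MBAFF, mb_x advances only on odd mb_y while mb_y toggles each pass. A standalone check that both traversals visit the same (mb_x, mb_y) sequence, starting both at mb_y = 0 for simplicity (illustrative only; the width is arbitrary):

#include <assert.h>

int main( void )
{
    int width = 4, b_interlaced;
    for( b_interlaced = 0; b_interlaced <= 1; b_interlaced++ )
    {
        int old_seq[16][2], new_seq[16][2];
        int n_old = 0, n_new = 0, mb_x, mb_y, i;
        /* old loop: increment at the bottom of the body */
        for( mb_x = 0, mb_y = 0; mb_x < width; )
        {
            old_seq[n_old][0] = mb_x;
            old_seq[n_old][1] = mb_y;
            n_old++;
            if( !b_interlaced || (mb_y&1) )
                mb_x++;
            mb_y ^= b_interlaced;
        }
        /* new loop: same logic folded into the for() header */
        for( mb_x = 0, mb_y = 0; mb_x < width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
        {
            new_seq[n_new][0] = mb_x;
            new_seq[n_new][1] = mb_y;
            n_new++;
        }
        assert( n_old == n_new );
        for( i = 0; i < n_old; i++ )
            assert( old_seq[i][0] == new_seq[i][0] && old_seq[i][1] == new_seq[i][1] );
    }
    return 0;
}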
@@ -748,12 +789,12 @@
void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
-void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
x264_deblock_v8_luma_mmxext( pix, stride, alpha, beta, tc0 );
x264_deblock_v8_luma_mmxext( pix+8, stride, alpha, beta, tc0+2 );
}
-void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
+static void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
{
x264_deblock_v8_luma_intra_mmxext( pix, stride, alpha, beta );
x264_deblock_v8_luma_intra_mmxext( pix+8, stride, alpha, beta );
@@ -884,6 +925,7 @@
frame = x264_frame_new( h );
assert( frame->i_reference_count == 0 );
frame->i_reference_count = 1;
+ frame->b_intra_calculated = 0;
return frame;
}
Changed | x264-snapshot-20081001-2245.tar.bz2/common/frame.h
@@ -62,6 +62,8 @@
/* motion data */
int8_t *mb_type;
int16_t (*mv[2])[2];
+ int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2];
+ int *lowres_mv_costs[2][X264_BFRAME_MAX+1];
int8_t *ref[2];
int i_ref[2];
int ref_poc[2][16];
@@ -71,17 +73,21 @@
* contains the SATD cost of the lowres frame encoded in various modes
* FIXME: how big an array do we need? */
int i_cost_est[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
+ int i_cost_est_aq[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
int i_satd; // the i_cost_est of the selected frametype
int i_intra_mbs[X264_BFRAME_MAX+2];
int *i_row_satds[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
int *i_row_satd;
int *i_row_bits;
int *i_row_qp;
+ float *f_qp_offset;
+ int b_intra_calculated;
+ uint16_t *i_intra_cost;
/* threading */
int i_lines_completed; /* in pixels */
int i_reference_count; /* number of threads using this frame (not necessarily the number of pointers) */
- x264_pthread_mutex_t mutex;
+ x264_pthread_mutex_t mutex;
x264_pthread_cond_t cv;
} x264_frame_t;
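For orientation, an illustrative sketch (not part of the patch) of how the new lowres fields are laid out: one array per prediction list and reference distance, with one entry per macroblock, matching the lowres_mvs[list][frame gap - 1] indexing used in macroblock.c. X264_BFRAME_MAX is assumed here to be the usual compile-time bound.

#include <stdint.h>

#define X264_BFRAME_MAX 16   /* assumption, for illustration only */

/* shape of the new lookahead MV storage */
typedef struct
{
    int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2]; /* [list][distance-1] -> per-MB {x,y} */
    int     *lowres_mv_costs[2][X264_BFRAME_MAX+1]; /* [list][distance-1] -> per-MB cost  */
} lowres_mv_storage_sketch;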
Changed | x264-snapshot-20081001-2245.tar.bz2/common/macroblock.c
@@ -24,71 +24,6 @@
#include "common.h"
-int x264_mb_predict_intra4x4_mode( x264_t *h, int idx )
-{
- const int ma = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 1];
- const int mb = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 8];
- const int m = X264_MIN( x264_mb_pred_mode4x4_fix(ma),
- x264_mb_pred_mode4x4_fix(mb) );
-
- if( m < 0 )
- return I_PRED_4x4_DC;
-
- return m;
-}
-
-int x264_mb_predict_non_zero_code( x264_t *h, int idx )
-{
- const int za = h->mb.cache.non_zero_count[x264_scan8[idx] - 1];
- const int zb = h->mb.cache.non_zero_count[x264_scan8[idx] - 8];
-
- int i_ret = za + zb;
-
- if( i_ret < 0x80 )
- {
- i_ret = ( i_ret + 1 ) >> 1;
- }
- return i_ret & 0x7f;
-}
-
-int x264_mb_transform_8x8_allowed( x264_t *h )
-{
- // intra and skip are disallowed
- // large partitions are allowed
- // direct and 8x8 are conditional
- static const uint8_t partition_tab[X264_MBTYPE_MAX] = {
- 0,0,0,0,1,2,0,2,1,1,1,1,1,1,1,1,1,2,0,
- };
- int p, i;
-
- if( !h->pps->b_transform_8x8_mode )
- return 0;
- p = partition_tab[h->mb.i_type];
- if( p < 2 )
- return p;
- else if( h->mb.i_type == B_DIRECT )
- return h->sps->b_direct8x8_inference;
- else if( h->mb.i_type == P_8x8 )
- {
- if( !(h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
- return 1;
- for( i=0; i<4; i++ )
- if( h->mb.i_sub_partition[i] != D_L0_8x8 )
- return 0;
- return 1;
- }
- else // B_8x8
- {
- // x264 currently doesn't use sub-8x8 B partitions, so don't check for them
- if( h->sps->b_direct8x8_inference )
- return 1;
- for( i=0; i<4; i++ )
- if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
- return 0;
- return 1;
- }
-}
-
void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] )
{
const int i8 = x264_scan8[idx];
@@ -223,9 +158,9 @@
int i8, i4;
int b8x8;
const int type_col = h->fref1[0]->mb_type[ h->mb.i_mb_xy ];
-
+
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 );
-
+
if( IS_INTRA( type_col ) )
{
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
@@ -338,7 +273,7 @@
if( ref[0] < 0 && ref[1] < 0 )
{
- ref[0] =
+ ref[0] =
ref[1] = 0;
*(uint64_t*)mv[0] = 0;
}
@@ -481,7 +416,7 @@
}
/* This just improves encoder performance, it's not part of the spec */
-void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[8][2], int *i_mvc )
+void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[9][2], int *i_mvc )
{
int16_t (*mvr)[2] = h->mb.mvr[i_list][i_ref];
int i = 0;
@@ -498,6 +433,13 @@
SET_MVP( h->mb.cache.mv[i_list][x264_scan8[12]] );
}
+ if( i_ref == 0 && h->frames.b_have_lowres )
+ {
+ int16_t (*lowres_mv)[2] = i_list ? h->fenc->lowres_mvs[1][h->fref1[0]->i_frame-h->fenc->i_frame-1]
+ : h->fenc->lowres_mvs[0][h->fenc->i_frame-h->fref0[0]->i_frame-1];
+ if( lowres_mv[0][0] != 0x7fff ) *(uint32_t*)mvc[i++] = (*(uint32_t*)lowres_mv[h->mb.i_mb_xy]*2)&0xfffeffff;
+ }
+
/* spatial predictors */
if( h->mb.i_neighbour & MB_LEFT )
{
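The lowres MV candidate added above scales a packed {x,y} pair of int16 components from half to full resolution with a single 32-bit multiply: doubling the word lets bit 15 of the x component spill into bit 16, so that bit is masked off (2*y is even, so its low bit carried no information anyway). A standalone check of the trick (illustrative, not part of the patch):

#include <assert.h>
#include <stdint.h>

static uint32_t pack( int16_t x, int16_t y )
{
    return (uint16_t)x | ((uint32_t)(uint16_t)y << 16);
}

int main( void )
{
    int x, y;
    for( x = -512; x <= 512; x++ )
        for( y = -512; y <= 512; y++ )
            assert( ((pack( x, y ) * 2) & 0xfffeffff) == pack( x*2, y*2 ) );
    return 0;
}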
@@ -612,48 +554,41 @@
static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
{
const int i8 = x264_scan8[0]+x+8*y;
-
+ const int i_ref0 = h->mb.cache.ref[0][i8];
const int i_ref1 = h->mb.cache.ref[1][i8];
+ const int weight = h->mb.bipred_weight[i_ref0][i_ref1];
+ const int mvx0 = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
const int mvx1 = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
+ int mvy0 = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
- DECLARE_ALIGNED_16( uint8_t tmp[16*16] );
- int i_mode = x264_size2pixel[height][width];
-
- x264_mb_mc_0xywh( h, x, y, width, height );
-
- h->mc.mc_luma( tmp, 16, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
- mvx1 + 4*4*x, mvy1 + 4*4*y, 4*width, 4*height );
+ int i_mode = x264_size2pixel[height][width];
+ int i_stride0 = 16, i_stride1 = 16;
+ DECLARE_ALIGNED_16( uint8_t tmp0[16*16] );
+ DECLARE_ALIGNED_16( uint8_t tmp1[16*16] );
+ uint8_t *src0, *src1;
+
+ src0 = h->mc.get_ref( tmp0, &i_stride0, h->mb.pic.p_fref[0][i_ref0], h->mb.pic.i_stride[0],
+ mvx0 + 4*4*x, mvy0 + 4*4*y, 4*width, 4*height );
+ src1 = h->mc.get_ref( tmp1, &i_stride1, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
+ mvx1 + 4*4*x, mvy1 + 4*4*y, 4*width, 4*height );
+ h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
+ src0, i_stride0, src1, i_stride1, weight );
+ if( h->mb.b_interlaced & i_ref0 )
+ mvy0 += (h->mb.i_mb_y & 1)*4 - 2;
if( h->mb.b_interlaced & i_ref1 )
mvy1 += (h->mb.i_mb_y & 1)*4 - 2;
- if( h->param.analyse.b_weighted_bipred )
- {
- const int i_ref0 = h->mb.cache.ref[0][i8];
- const int weight = h->mb.bipred_weight[i_ref0][i_ref1];
-
- h->mc.avg_weight[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, tmp, 16, weight );
-
- h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
- mvx1, mvy1, 2*width, 2*height );
- h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16, weight );
-
- h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
- mvx1, mvy1, 2*width, 2*height );
- h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16, weight );
- }
- else
- {
- h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, tmp, 16 );
-
- h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
- mvx1, mvy1, 2*width, 2*height );
- h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16 );
-
- h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
- mvx1, mvy1, 2*width, 2*height );
- h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16 );
- }
+ h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[0][i_ref0][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+ mvx0, mvy0, 2*width, 2*height );
+ h->mc.mc_chroma( tmp1, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+ mvx1, mvy1, 2*width, 2*height );
+ h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
+ h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[0][i_ref0][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+ mvx0, mvy0, 2*width, 2*height );
+ h->mc.mc_chroma( tmp1, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+ mvx1, mvy1, 2*width, 2*height );
+ h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
}
static void x264_mb_mc_direct8x8( x264_t *h, int x, int y )
@@ -885,6 +820,34 @@
memset( h->mb.cache.ref[0], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
memset( h->mb.cache.ref[1], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
+ /* fdec: fenc:
+ * yyyyyyy
+ * yYYYY YYYY
+ * yYYYY YYYY
+ * yYYYY YYYY
+ * yYYYY YYYY
+ * uuu vvv UUVV
+ * uUU vVV UUVV
+ * uUU vVV
+ */
+ h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
+ h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
+ h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
+ h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
+ h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
+ h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
+
+ h->mb.i_neighbour4[6] =
+ h->mb.i_neighbour4[9] =
+ h->mb.i_neighbour4[12] =
+ h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT;
+ h->mb.i_neighbour4[3] =
+ h->mb.i_neighbour4[7] =
+ h->mb.i_neighbour4[11] =
+ h->mb.i_neighbour4[13] =
+ h->mb.i_neighbour4[15] =
+ h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
+
return 0;
fail: return -1;
}
@@ -982,8 +945,9 @@
if( h->mb.b_interlaced )
ref_pix_offset[1] += (1-2*(i_mb_y&1)) * i_stride;
h->mb.pic.i_stride[i] = i_stride2;
+ h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
- &h->fenc->plane[i][i_pix_offset], i_stride2, w );
+ h->mb.pic.p_fenc_plane[i], i_stride2, w );
memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
if( h->mb.b_interlaced )
{
@@ -1150,23 +1114,6 @@
+ !!(h->mb.i_neighbour & MB_TOP);
}
- /* fdec: fenc:
- * yyyyyyy
- * yYYYY YYYY
- * yYYYY YYYY
- * yYYYY YYYY
- * yYYYY YYYY
- * uuu vvv UUVV
- * uUU vVV UUVV
- * uUU vVV
- */
- h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
- h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
- h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
- h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
- h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
- h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
-
if( !h->mb.b_interlaced )
{
copy_column8( h->mb.pic.p_fdec[0]-1, h->mb.pic.p_fdec[0]+15 );
@@ -1267,8 +1214,10 @@
h->mb.cache.ref[i_list][i8+2*8] =
h->mb.cache.ref[i_list][i8+3*8] = h->mb.ref[i_list][ir + 1*s8x8];
- for( i = 0; i < 4; i++ )
- *(uint32_t*)h->mb.cache.mv[i_list][i8+i*8] = *(uint32_t*)h->mb.mv[i_list][iv + i*s4x4];
+ *(uint32_t*)h->mb.cache.mv[i_list][i8+0*8] = *(uint32_t*)h->mb.mv[i_list][iv + 0*s4x4];
+ *(uint32_t*)h->mb.cache.mv[i_list][i8+1*8] = *(uint32_t*)h->mb.mv[i_list][iv + 1*s4x4];
+ *(uint32_t*)h->mb.cache.mv[i_list][i8+2*8] = *(uint32_t*)h->mb.mv[i_list][iv + 2*s4x4];
+ *(uint32_t*)h->mb.cache.mv[i_list][i8+3*8] = *(uint32_t*)h->mb.mv[i_list][iv + 3*s4x4];
}
else
{
@@ -1300,8 +1249,10 @@
{
const int i8 = x264_scan8[0] - 1;
const int iv = i_mb_4x4 - 1;
- for( i = 0; i < 4; i++ )
- *(uint32_t*)h->mb.cache.mvd[i_list][i8+i*8] = *(uint32_t*)h->mb.mvd[i_list][iv + i*s4x4];
+ *(uint32_t*)h->mb.cache.mvd[i_list][i8+0*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 0*s4x4];
+ *(uint32_t*)h->mb.cache.mvd[i_list][i8+1*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 1*s4x4];
+ *(uint32_t*)h->mb.cache.mvd[i_list][i8+2*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 2*s4x4];
+ *(uint32_t*)h->mb.cache.mvd[i_list][i8+3*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 3*s4x4];
}
else
{
@@ -1343,19 +1294,9 @@
h->mb.i_neighbour4[8] =
h->mb.i_neighbour4[10] =
h->mb.i_neighbour8[2] = MB_TOP|MB_TOPRIGHT | ((h->mb.i_neighbour & MB_LEFT) ? (MB_LEFT|MB_TOPLEFT) : 0);
- h->mb.i_neighbour4[3] =
- h->mb.i_neighbour4[7] =
- h->mb.i_neighbour4[11] =
- h->mb.i_neighbour4[13] =
- h->mb.i_neighbour4[15] =
- h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
h->mb.i_neighbour4[5] =
h->mb.i_neighbour8[1] = MB_LEFT | (h->mb.i_neighbour & MB_TOPRIGHT)
| ((h->mb.i_neighbour & MB_TOP) ? MB_TOP|MB_TOPLEFT : 0);
- h->mb.i_neighbour4[6] =
- h->mb.i_neighbour4[9] =
- h->mb.i_neighbour4[12] =
- h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT;
}
static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i)
@@ -1394,14 +1335,7 @@
x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y );
h->mb.type[i_mb_xy] = i_mb_type;
-
- if( h->mb.i_type == I_PCM || (h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0) )
- h->mb.i_qp = h->mb.i_last_qp;
- h->mb.qp[i_mb_xy] = i_mb_type != I_PCM ? h->mb.i_qp : 0;
-
- h->mb.i_last_dqp = h->mb.i_qp - h->mb.i_last_qp;
- h->mb.i_last_qp = h->mb.i_qp;
- h->mb.i_mb_prev_xy = h->mb.i_mb_xy;
+ h->mb.i_mb_prev_xy = i_mb_xy;
/* save intra4x4 */
if( i_mb_type == I_4x4 )
@@ -1416,6 +1350,8 @@
if( i_mb_type == I_PCM )
{
+ h->mb.qp[i_mb_xy] = 0;
+ h->mb.i_last_dqp = 0;
h->mb.i_cbp_chroma = 2;
h->mb.i_cbp_luma = 0xf;
h->mb.cbp[i_mb_xy] = 0x72f; /* all set */
@@ -1426,59 +1362,71 @@
else
{
/* save non zero count */
- for( y = 0; y < 4; y++ )
- *(uint32_t*)&non_zero_count[y*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+y*8];
- for( y = 0; y < 4; y++ )
- *(uint16_t*)&non_zero_count[16+y*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+y*2]-1] >> 8;
-
+ *(uint32_t*)&non_zero_count[0*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+0*8];
+ *(uint32_t*)&non_zero_count[1*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+1*8];
+ *(uint32_t*)&non_zero_count[2*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+2*8];
+ *(uint32_t*)&non_zero_count[3*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+3*8];
+ *(uint16_t*)&non_zero_count[16+0*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] >> 8;
+ *(uint16_t*)&non_zero_count[16+1*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] >> 8;
+ *(uint16_t*)&non_zero_count[16+2*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] >> 8;
+ *(uint16_t*)&non_zero_count[16+3*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] >> 8;
+
+ if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
+ h->mb.i_qp = h->mb.i_last_qp;
+ h->mb.qp[i_mb_xy] = h->mb.i_qp;
+ h->mb.i_last_dqp = h->mb.i_qp - h->mb.i_last_qp;
+ h->mb.i_last_qp = h->mb.i_qp;
}
if( h->mb.i_cbp_luma == 0 && h->mb.i_type != I_8x8 )
h->mb.b_transform_8x8 = 0;
h->mb.mb_transform_size[i_mb_xy] = h->mb.b_transform_8x8;
- if( !IS_INTRA( i_mb_type ) )
+ if( h->sh.i_type != SLICE_TYPE_I )
{
- h->mb.ref[0][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[0][x264_scan8[0]];
- h->mb.ref[0][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]];
- h->mb.ref[0][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]];
- h->mb.ref[0][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
- for( y = 0; y < 4; y++ )
- {
- *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+0];
- *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+2];
- }
- if(h->sh.i_type == SLICE_TYPE_B)
- {
- h->mb.ref[1][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]];
- h->mb.ref[1][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]];
- h->mb.ref[1][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]];
- h->mb.ref[1][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
+ if( !IS_INTRA( i_mb_type ) )
+ {
+ h->mb.ref[0][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[0][x264_scan8[0]];
+ h->mb.ref[0][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]];
+ h->mb.ref[0][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]];
+ h->mb.ref[0][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
for( y = 0; y < 4; y++ )
{
- *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+0];
- *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+2];
+ *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+0];
+ *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+2];
+ }
+ if( h->sh.i_type == SLICE_TYPE_B )
+ {
+ h->mb.ref[1][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]];
+ h->mb.ref[1][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]];
+ h->mb.ref[1][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]];
+ h->mb.ref[1][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
+ for( y = 0; y < 4; y++ )
+ {
+ *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+0];
+ *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+2];
+ }
}
}
- }
- else
- {
- int i_list;
- for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2 : 1 ); i_list++ )
+ else
{
- *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+0*s8x8] = (uint8_t)(-1) * 0x0101;
- *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+1*s8x8] = (uint8_t)(-1) * 0x0101;
- for( y = 0; y < 4; y++ )
+ int i_list;
+ for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2 : 1 ); i_list++ )
{
- *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] = 0;
- *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] = 0;
+ *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+0*s8x8] = (uint8_t)(-1) * 0x0101;
+ *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+1*s8x8] = (uint8_t)(-1) * 0x0101;
+ for( y = 0; y < 4; y++ )
+ {
+ *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] = 0;
+ *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] = 0;
+ }
}
}
}
if( h->param.b_cabac )
{
- if( i_mb_type == I_4x4 || i_mb_type == I_16x16 )
+ if( IS_INTRA(i_mb_type) && i_mb_type != I_PCM )
h->mb.chroma_pred_mode[i_mb_xy] = x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ];
else
h->mb.chroma_pred_mode[i_mb_xy] = I_PRED_CHROMA_DC;
@@ -1554,7 +1502,12 @@
if( h->param.analyse.b_weighted_bipred
&& dist_scale_factor >= -64
&& dist_scale_factor <= 128 )
+ {
h->mb.bipred_weight[i_ref0][i_ref1] = 64 - dist_scale_factor;
+ // ssse3 implementation of biweight doesn't support the extrema.
+ // if we ever generate them, we'll have to drop that optimization.
+ assert( dist_scale_factor >= -63 && dist_scale_factor <= 127 );
+ }
else
h->mb.bipred_weight[i_ref0][i_ref1] = 32;
}
Changed | x264-snapshot-20081001-2245.tar.bz2/common/macroblock.h
@@ -251,14 +251,16 @@
2*4 + 3*4*FDEC_STRIDE, 3*4 + 3*4*FDEC_STRIDE
};
-static const uint8_t i_chroma_qp_table[52] =
+static const uint8_t i_chroma_qp_table[52+12*2] =
{
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
29, 30, 31, 32, 32, 33, 34, 34, 35, 35,
36, 36, 37, 37, 37, 38, 38, 38, 39, 39,
- 39, 39
+ 39, 39,
+ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
};
enum cabac_ctx_block_cat_e
@@ -312,16 +314,6 @@
* h->mb. need only valid values from other blocks */
void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[8][2], int *i_mvc );
-
-int x264_mb_predict_intra4x4_mode( x264_t *h, int idx );
-int x264_mb_predict_non_zero_code( x264_t *h, int idx );
-
-/* x264_mb_transform_8x8_allowed:
- * check whether any partition is smaller than 8x8 (or at least
- * might be, according to just partition type.)
- * doesn't check for cbp */
-int x264_mb_transform_8x8_allowed( x264_t *h );
-
void x264_mb_mc( x264_t *h );
void x264_mb_mc_8x8( x264_t *h, int i8 );
@@ -444,6 +436,72 @@
return i_nz;
}
+static inline int x264_mb_predict_intra4x4_mode( x264_t *h, int idx )
+{
+ const int ma = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 1];
+ const int mb = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 8];
+ const int m = X264_MIN( x264_mb_pred_mode4x4_fix(ma),
+ x264_mb_pred_mode4x4_fix(mb) );
+
+ if( m < 0 )
+ return I_PRED_4x4_DC;
+
+ return m;
+}
+static inline int x264_mb_predict_non_zero_code( x264_t *h, int idx )
+{
+ const int za = h->mb.cache.non_zero_count[x264_scan8[idx] - 1];
+ const int zb = h->mb.cache.non_zero_count[x264_scan8[idx] - 8];
+
+ int i_ret = za + zb;
+
+ if( i_ret < 0x80 )
+ {
+ i_ret = ( i_ret + 1 ) >> 1;
+ }
+ return i_ret & 0x7f;
+}
+/* x264_mb_transform_8x8_allowed:
+ * check whether any partition is smaller than 8x8 (or at least
+ * might be, according to just partition type.)
+ * doesn't check for cbp */
+static inline int x264_mb_transform_8x8_allowed( x264_t *h )
+{
+ // intra and skip are disallowed
+ // large partitions are allowed
+ // direct and 8x8 are conditional
+ static const uint8_t partition_tab[X264_MBTYPE_MAX] = {
+ 0,0,0,0,1,2,0,2,1,1,1,1,1,1,1,1,1,2,0,
+ };
+ int p, i;
+
+ if( !h->pps->b_transform_8x8_mode )
+ return 0;
+ p = partition_tab[h->mb.i_type];
+ if( p < 2 )
+ return p;
+ else if( h->mb.i_type == B_DIRECT )
+ return h->sps->b_direct8x8_inference;
+ else if( h->mb.i_type == P_8x8 )
+ {
+ if( !(h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
+ return 1;
+ for( i=0; i<4; i++ )
+ if( h->mb.i_sub_partition[i] != D_L0_8x8 )
+ return 0;
+ return 1;
+ }
+ else // B_8x8
+ {
+ // x264 currently doesn't use sub-8x8 B partitions, so don't check for them
+ if( h->sps->b_direct8x8_inference )
+ return 1;
+ for( i=0; i<4; i++ )
+ if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
+ return 0;
+ return 1;
+ }
+}
#endif
Changed | x264-snapshot-20081001-2245.tar.bz2/common/mc.c
@@ -49,45 +49,30 @@
}
}
-static inline void pixel_avg_wxh( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int width, int height )
+static inline void pixel_avg_wxh( uint8_t *dst, int i_dst, uint8_t *src1, int i_src1, uint8_t *src2, int i_src2, int width, int height )
{
int x, y;
for( y = 0; y < height; y++ )
{
for( x = 0; x < width; x++ )
{
- dst[x] = ( dst[x] + src[x] + 1 ) >> 1;
+ dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
}
+ src1 += i_src1;
+ src2 += i_src2;
dst += i_dst;
- src += i_src;
}
}
-#define PIXEL_AVG_C( name, width, height ) \
-static void name( uint8_t *pix1, int i_stride_pix1, \
- uint8_t *pix2, int i_stride_pix2 ) \
-{ \
- pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height ); \
-}
-PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
-PIXEL_AVG_C( pixel_avg_16x8, 16, 8 )
-PIXEL_AVG_C( pixel_avg_8x16, 8, 16 )
-PIXEL_AVG_C( pixel_avg_8x8, 8, 8 )
-PIXEL_AVG_C( pixel_avg_8x4, 8, 4 )
-PIXEL_AVG_C( pixel_avg_4x8, 4, 8 )
-PIXEL_AVG_C( pixel_avg_4x4, 4, 4 )
-PIXEL_AVG_C( pixel_avg_4x2, 4, 2 )
-PIXEL_AVG_C( pixel_avg_2x4, 2, 4 )
-PIXEL_AVG_C( pixel_avg_2x2, 2, 2 )
-
-
/* Implicit weighted bipred only:
* assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 */
-#define op_scale2(x) dst[x] = x264_clip_uint8( (dst[x]*i_weight1 + src[x]*i_weight2 + (1<<5)) >> 6 )
-static inline void pixel_avg_weight_wxh( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int width, int height, int i_weight1 ){
+#define op_scale2(x) dst[x] = x264_clip_uint8( (src1[x]*i_weight1 + src2[x]*i_weight2 + (1<<5)) >> 6 )
+static inline void pixel_avg_weight_wxh( uint8_t *dst, int i_dst, uint8_t *src1, int i_src1, uint8_t *src2, int i_src2, int width, int height, int i_weight1 )
+{
int y;
const int i_weight2 = 64 - i_weight1;
- for(y=0; y<height; y++, dst += i_dst, src += i_src){
+ for( y = 0; y<height; y++, dst += i_dst, src1 += i_src1, src2 += i_src2 )
+ {
op_scale2(0);
op_scale2(1);
if(width==2) continue;
@@ -109,27 +94,28 @@
op_scale2(15);
}
}
+#undef op_scale2
-#define PIXEL_AVG_WEIGHT_C( width, height ) \
-static void pixel_avg_weight_##width##x##height( \
- uint8_t *pix1, int i_stride_pix1, \
- uint8_t *pix2, int i_stride_pix2, int i_weight1 ) \
+#define PIXEL_AVG_C( name, width, height ) \
+static void name( uint8_t *pix1, int i_stride_pix1, \
+ uint8_t *pix2, int i_stride_pix2, \
+ uint8_t *pix3, int i_stride_pix3, int weight ) \
{ \
- pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height, i_weight1 ); \
+ if( weight == 32 )\
+ pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
+ else\
+ pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height, weight ); \
}
-
-PIXEL_AVG_WEIGHT_C(16,16)
-PIXEL_AVG_WEIGHT_C(16,8)
-PIXEL_AVG_WEIGHT_C(8,16)
-PIXEL_AVG_WEIGHT_C(8,8)
-PIXEL_AVG_WEIGHT_C(8,4)
-PIXEL_AVG_WEIGHT_C(4,8)
-PIXEL_AVG_WEIGHT_C(4,4)
-PIXEL_AVG_WEIGHT_C(4,2)
-PIXEL_AVG_WEIGHT_C(2,4)
-PIXEL_AVG_WEIGHT_C(2,2)
-#undef op_scale2
-#undef PIXEL_AVG_WEIGHT_C
+PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
+PIXEL_AVG_C( pixel_avg_16x8, 16, 8 )
+PIXEL_AVG_C( pixel_avg_8x16, 8, 16 )
+PIXEL_AVG_C( pixel_avg_8x8, 8, 8 )
+PIXEL_AVG_C( pixel_avg_8x4, 8, 4 )
+PIXEL_AVG_C( pixel_avg_4x8, 4, 8 )
+PIXEL_AVG_C( pixel_avg_4x4, 4, 4 )
+PIXEL_AVG_C( pixel_avg_4x2, 4, 2 )
+PIXEL_AVG_C( pixel_avg_2x4, 2, 4 )
+PIXEL_AVG_C( pixel_avg_2x2, 2, 2 )
static void mc_copy( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
{
@@ -299,9 +285,15 @@
i_stride, frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres );
x264_frame_expand_border_lowres( frame );
- for( y=0; y<16; y++ )
- for( x=0; x<16; x++ )
- frame->i_cost_est[y][x] = -1;
+ memset( frame->i_cost_est, -1, sizeof(frame->i_cost_est) );
+
+ for( x = 0; x < h->param.i_bframe + 2; x++ )
+ for( y = 0; y < h->param.i_bframe + 2; y++ )
+ frame->i_row_satds[y][x][0] = -1;
+
+ for( y = 0; y <= !!h->param.i_bframe; y++ )
+ for( x = 0; x <= h->param.i_bframe; x++ )
+ frame->lowres_mvs[y][x][0][0] = 0x7FFF;
}
static void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
@@ -346,18 +338,8 @@
pf->avg[PIXEL_4x2] = pixel_avg_4x2;
pf->avg[PIXEL_2x4] = pixel_avg_2x4;
pf->avg[PIXEL_2x2] = pixel_avg_2x2;
-
- pf->avg_weight[PIXEL_16x16]= pixel_avg_weight_16x16;
- pf->avg_weight[PIXEL_16x8] = pixel_avg_weight_16x8;
- pf->avg_weight[PIXEL_8x16] = pixel_avg_weight_8x16;
- pf->avg_weight[PIXEL_8x8] = pixel_avg_weight_8x8;
- pf->avg_weight[PIXEL_8x4] = pixel_avg_weight_8x4;
- pf->avg_weight[PIXEL_4x8] = pixel_avg_weight_4x8;
- pf->avg_weight[PIXEL_4x4] = pixel_avg_weight_4x4;
- pf->avg_weight[PIXEL_4x2] = pixel_avg_weight_4x2;
- pf->avg_weight[PIXEL_2x4] = pixel_avg_weight_2x4;
- pf->avg_weight[PIXEL_2x2] = pixel_avg_weight_2x2;
+ pf->copy_16x16_unaligned = mc_copy_w16;
pf->copy[PIXEL_16x16] = mc_copy_w16;
pf->copy[PIXEL_8x8] = mc_copy_w8;
pf->copy[PIXEL_4x4] = mc_copy_w4;
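With the merged avg[] interface above, every blend takes two sources plus a weight, and the C dispatcher falls back to the plain rounded average when the weight is 32 because the implicit-bipred formula reduces to it exactly. A standalone check (illustrative, not part of the patch):

#include <assert.h>

int main( void )
{
    int a, b;
    for( a = 0; a < 256; a++ )
        for( b = 0; b < 256; b++ )
            /* op_scale2 with i_weight1 = i_weight2 = 32 */
            assert( ((a*32 + b*32 + (1<<5)) >> 6) == ((a + b + 1) >> 1) );
    return 0;
}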
Changed | x264-snapshot-20081001-2245.tar.bz2/common/mc.h
@@ -45,11 +45,11 @@
int mvx, int mvy,
int i_width, int i_height );
- void (*avg[10])( uint8_t *dst, int, uint8_t *src, int );
- void (*avg_weight[10])( uint8_t *dst, int, uint8_t *src, int, int i_weight );
+ void (*avg[10])( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight );
/* only 16x16, 8x8, and 4x4 defined */
void (*copy[7])( uint8_t *dst, int, uint8_t *src, int, int i_height );
+ void (*copy_16x16_unaligned)( uint8_t *dst, int, uint8_t *src, int, int i_height );
void (*plane_copy)( uint8_t *dst, int i_dst,
uint8_t *src, int i_src, int w, int h);
@@ -62,7 +62,7 @@
uint8_t *pix_uv, int stride_uv, int mb_x );
/* prefetch the next few macroblocks of a hpel reference frame */
void (*prefetch_ref)( uint8_t *pix, int stride, int parity );
-
+
void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
void (*memzero_aligned)( void *dst, int n );
Changed | x264-snapshot-20081001-2245.tar.bz2/common/mdate.c
@@ -26,6 +26,7 @@
#endif
#include <time.h>
+#include "common.h"
#include "osdep.h"
int64_t x264_mdate( void )
Changed | x264-snapshot-20081001-2245.tar.bz2/common/pixel.c
@@ -136,29 +136,49 @@
}
-static inline void pixel_sub_wxh( int16_t *diff, int i_size,
- uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
-{
- int y, x;
- for( y = 0; y < i_size; y++ )
- {
- for( x = 0; x < i_size; x++ )
- {
- diff[x + y*i_size] = pix1[x] - pix2[x];
- }
- pix1 += i_pix1;
- pix2 += i_pix2;
- }
+/****************************************************************************
+ * pixel_var_wxh
+ ****************************************************************************/
+#define PIXEL_VAR_C( name, w, shift ) \
+static int name( uint8_t *pix, int i_stride, uint32_t *sad ) \
+{ \
+ uint32_t var = 0, sum = 0, sqr = 0; \
+ int x, y; \
+ for( y = 0; y < w; y++ ) \
+ { \
+ for( x = 0; x < w; x++ ) \
+ { \
+ sum += pix[x]; \
+ sqr += pix[x] * pix[x]; \
+ } \
+ pix += i_stride; \
+ } \
+ var = sqr - (sum * sum >> shift); \
+ *sad = sum; \
+ return var; \
+}
+
+PIXEL_VAR_C( x264_pixel_var_16x16, 16, 8 )
+PIXEL_VAR_C( x264_pixel_var_8x8, 8, 6 )
+
+
+#define HADAMARD4(d0,d1,d2,d3,s0,s1,s2,s3) {\
+ int t0 = s0 + s1;\
+ int t1 = s0 - s1;\
+ int t2 = s2 + s3;\
+ int t3 = s2 - s3;\
+ d0 = t0 + t2;\
+ d2 = t0 - t2;\
+ d1 = t1 + t3;\
+ d3 = t1 - t3;\
}
-
/****************************************************************************
* pixel_satd_WxH: sum of 4x4 Hadamard transformed differences
****************************************************************************/
static int pixel_satd_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height )
{
int16_t tmp[4][4];
- int16_t diff[4][4];
int x, y;
int i_satd = 0;
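An aside on the new PIXEL_VAR_C above (not part of the patch): with N = 1<<shift pixels it returns sqr - (sum*sum >> shift), i.e. sum(x^2) - floor((sum x)^2 / N), which is N times the block variance up to the flooring, and the DC sum is handed back through *sad. The exact identity behind it, N*sum(x^2) - (sum x)^2 == sum_{i,j} (x_i - x_j)^2 / 2, checked on a pseudo-random 8x8 block:

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

int main( void )
{
    enum { N = 64 };             /* an 8x8 block, shift = 6 */
    int x[N], i, j;
    int64_t sum = 0, sqr = 0, pairs = 0;
    srand( 1 );
    for( i = 0; i < N; i++ )
    {
        x[i] = rand() & 0xff;
        sum += x[i];
        sqr += x[i] * x[i];
    }
    for( i = 0; i < N; i++ )
        for( j = 0; j < N; j++ )
            pairs += (int64_t)(x[i] - x[j]) * (x[i] - x[j]);
    assert( 2 * (N*sqr - sum*sum) == pairs );
    return 0;
}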
@@ -166,32 +186,22 @@
{
for( x = 0; x < i_width; x += 4 )
{
- int d;
-
- pixel_sub_wxh( (int16_t*)diff, 4, &pix1[x], i_pix1, &pix2[x], i_pix2 );
+ int i;
+ uint8_t *p1 = pix1+x, *p2 = pix2+x;
- for( d = 0; d < 4; d++ )
+ for( i=0; i<4; i++, p1+=i_pix1, p2+=i_pix2 )
{
- int s01, s23;
- int d01, d23;
-
- s01 = diff[d][0] + diff[d][1]; s23 = diff[d][2] + diff[d][3];
- d01 = diff[d][0] - diff[d][1]; d23 = diff[d][2] - diff[d][3];
-
- tmp[d][0] = s01 + s23;
- tmp[d][1] = s01 - s23;
- tmp[d][2] = d01 - d23;
- tmp[d][3] = d01 + d23;
+ int a0 = p1[0] - p2[0];
+ int a1 = p1[1] - p2[1];
+ int a2 = p1[2] - p2[2];
+ int a3 = p1[3] - p2[3];
+ HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0,a1,a2,a3 );
}
- for( d = 0; d < 4; d++ )
+ for( i=0; i<4; i++ )
{
- int s01, s23;
- int d01, d23;
-
- s01 = tmp[0][d] + tmp[1][d]; s23 = tmp[2][d] + tmp[3][d];
- d01 = tmp[0][d] - tmp[1][d]; d23 = tmp[2][d] - tmp[3][d];
-
- i_satd += abs( s01 + s23 ) + abs( s01 - s23 ) + abs( d01 - d23 ) + abs( d01 + d23 );
+ int a0,a1,a2,a3;
+ HADAMARD4( a0,a1,a2,a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
+ i_satd += abs(a0) + abs(a1) + abs(a2) + abs(a3);
}
}
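A quick sanity note on the HADAMARD4 butterfly that the rewritten satd loop above relies on (illustrative, not part of the patch): the 4-point transform it implements is symmetric and self-inverse up to a factor of 4, so applying it twice must return four times the input. A standalone check:

#include <assert.h>
#include <stdlib.h>

#define HADAMARD4(d0,d1,d2,d3,s0,s1,s2,s3) {\
    int t0 = s0 + s1;\
    int t1 = s0 - s1;\
    int t2 = s2 + s3;\
    int t3 = s2 - s3;\
    d0 = t0 + t2;\
    d2 = t0 - t2;\
    d1 = t1 + t3;\
    d3 = t1 - t3;\
}

int main( void )
{
    int n;
    srand( 2 );
    for( n = 0; n < 1000; n++ )
    {
        int s0 = rand()%511 - 255, s1 = rand()%511 - 255;
        int s2 = rand()%511 - 255, s3 = rand()%511 - 255;
        int d0, d1, d2, d3, e0, e1, e2, e3;
        HADAMARD4( d0, d1, d2, d3, s0, s1, s2, s3 );
        HADAMARD4( e0, e1, e2, e3, d0, d1, d2, d3 );
        assert( e0 == 4*s0 && e1 == 4*s1 && e2 == 4*s2 && e3 == 4*s3 );
    }
    return 0;
}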
@@ -220,30 +230,17 @@
* pixel_sa8d_WxH: sum of 8x8 Hadamard transformed differences
****************************************************************************/
#define SA8D_1D {\
- const int a0 = SRC(0) + SRC(4);\
- const int a4 = SRC(0) - SRC(4);\
- const int a1 = SRC(1) + SRC(5);\
- const int a5 = SRC(1) - SRC(5);\
- const int a2 = SRC(2) + SRC(6);\
- const int a6 = SRC(2) - SRC(6);\
- const int a3 = SRC(3) + SRC(7);\
- const int a7 = SRC(3) - SRC(7);\
- const int b0 = a0 + a2;\
- const int b2 = a0 - a2;\
- const int b1 = a1 + a3;\
- const int b3 = a1 - a3;\
- const int b4 = a4 + a6;\
- const int b6 = a4 - a6;\
- const int b5 = a5 + a7;\
- const int b7 = a5 - a7;\
- DST(0, b0 + b1);\
- DST(1, b0 - b1);\
- DST(2, b2 + b3);\
- DST(3, b2 - b3);\
- DST(4, b4 + b5);\
- DST(5, b4 - b5);\
- DST(6, b6 + b7);\
- DST(7, b6 - b7);\
+ int b0,b1,b2,b3,b4,b5,b6,b7;\
+ HADAMARD4( b0,b1,b2,b3, SRC(0), SRC(1), SRC(2), SRC(3) );\
+ HADAMARD4( b4,b5,b6,b7, SRC(4), SRC(5), SRC(6), SRC(7) );\
+ DST(0, b0 + b4);\
+ DST(4, b0 - b4);\
+ DST(1, b1 + b5);\
+ DST(5, b1 - b5);\
+ DST(2, b2 + b6);\
+ DST(6, b2 - b6);\
+ DST(3, b3 + b7);\
+ DST(7, b3 - b7);\
}
static inline int pixel_sa8d_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2,
@@ -258,18 +255,28 @@
for( x = 0; x < i_width; x += 8 )
{
int i;
- pixel_sub_wxh( (int16_t*)diff, 8, pix1+x, i_pix1, pix2+x, i_pix2 );
+ uint8_t *p1 = pix1+x, *p2 = pix2+x;
-#define SRC(x) diff[i][x]
+#define SRC(x) a##x
#define DST(x,rhs) diff[i][x] = (rhs)
- for( i = 0; i < 8; i++ )
+ for( i=0; i<8; i++, p1+=i_pix1, p2+=i_pix2 )
+ {
+ int a0 = p1[0] - p2[0];
+ int a1 = p1[1] - p2[1];
+ int a2 = p1[2] - p2[2];
+ int a3 = p1[3] - p2[3];
+ int a4 = p1[4] - p2[4];
+ int a5 = p1[5] - p2[5];
+ int a6 = p1[6] - p2[6];
+ int a7 = p1[7] - p2[7];
SA8D_1D
+ }
#undef SRC
#undef DST
#define SRC(x) diff[x][i]
#define DST(x,rhs) i_satd += abs(rhs)
- for( i = 0; i < 8; i++ )
+ for( i=0; i<8; i++ )
SA8D_1D
#undef SRC
#undef DST
@@ -292,6 +299,69 @@
PIXEL_SA8D_C( 8, 16 )
PIXEL_SA8D_C( 8, 8 )
+
+static uint64_t pixel_hadamard_ac( uint8_t *pix, int stride )
+{
+ int16_t tmp[8][8];
+ int sum4=0, sum8=0;
+ int i;
+ for( i=0; i<8; i++, pix+=stride )
+ {
+ HADAMARD4( tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i],
+ pix[0], pix[1], pix[2], pix[3] );
+ HADAMARD4( tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i],
+ pix[4], pix[5], pix[6], pix[7] );
+ }
+ for( i=0; i<8; i++ )
+ {
+ int a0,a1,a2,a3,a4,a5,a6,a7;
+ HADAMARD4( a0,a1,a2,a3, tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3] );
+ sum4 += abs(a0) + abs(a1) + abs(a2) + abs(a3);
+ HADAMARD4( a4,a5,a6,a7, tmp[i][4], tmp[i][5], tmp[i][6], tmp[i][7] );
+ sum4 += abs(a4) + abs(a5) + abs(a6) + abs(a7);
+ tmp[i][0] = a0 + a4;
+ tmp[i][4] = a0 - a4;
+ tmp[i][1] = a1 + a5;
+ tmp[i][5] = a1 - a5;
+ tmp[i][2] = a2 + a6;
+ tmp[i][6] = a2 - a6;
+ tmp[i][3] = a3 + a7;
+ tmp[i][7] = a3 - a7;
+ }
+ for( i=0; i<8; i++ )
+ {
+ sum8 += abs( tmp[0][i] + tmp[4][i] )
+ + abs( tmp[0][i] - tmp[4][i] )
+ + abs( tmp[1][i] + tmp[5][i] )
+ + abs( tmp[1][i] - tmp[5][i] )
+ + abs( tmp[2][i] + tmp[6][i] )
+ + abs( tmp[2][i] - tmp[6][i] )
+ + abs( tmp[3][i] + tmp[7][i] )
+ + abs( tmp[3][i] - tmp[7][i] );
+ }
+ sum4 -= tmp[0][0]+tmp[4][0];
+ sum8 -= tmp[0][0]+tmp[4][0];
+ return ((uint64_t)sum8<<32) + sum4;
+}
+
+#define HADAMARD_AC(w,h) \
+static uint64_t x264_pixel_hadamard_ac_##w##x##h( uint8_t *pix, int stride )\
+{\
+ uint64_t sum = pixel_hadamard_ac( pix, stride );\
+ if( w==16 )\
+ sum += pixel_hadamard_ac( pix+8, stride );\
+ if( h==16 )\
+ sum += pixel_hadamard_ac( pix+8*stride, stride );\
+ if( w==16 && h==16 )\
+ sum += pixel_hadamard_ac( pix+8*stride+8, stride );\
+ return ((sum>>34)<<32) + ((uint32_t)sum>>1);\
+}
+HADAMARD_AC( 16, 16 )
+HADAMARD_AC( 16, 8 )
+HADAMARD_AC( 8, 16 )
+HADAMARD_AC( 8, 8 )
+
+
/****************************************************************************
* pixel_sad_x4
****************************************************************************/
@@ -502,20 +572,24 @@
{
memset( pixf, 0, sizeof(*pixf) );
-#define INIT2( name, cpu ) \
- pixf->name[PIXEL_16x16] = x264_pixel_##name##_16x16##cpu;\
- pixf->name[PIXEL_16x8] = x264_pixel_##name##_16x8##cpu;
-#define INIT4( name, cpu ) \
- INIT2( name, cpu ) \
- pixf->name[PIXEL_8x16] = x264_pixel_##name##_8x16##cpu;\
- pixf->name[PIXEL_8x8] = x264_pixel_##name##_8x8##cpu;
-#define INIT5( name, cpu ) \
- INIT4( name, cpu ) \
- pixf->name[PIXEL_8x4] = x264_pixel_##name##_8x4##cpu;
-#define INIT7( name, cpu ) \
- INIT5( name, cpu ) \
- pixf->name[PIXEL_4x8] = x264_pixel_##name##_4x8##cpu;\
- pixf->name[PIXEL_4x4] = x264_pixel_##name##_4x4##cpu;
+#define INIT2_NAME( name1, name2, cpu ) \
+ pixf->name1[PIXEL_16x16] = x264_pixel_##name2##_16x16##cpu;\
+ pixf->name1[PIXEL_16x8] = x264_pixel_##name2##_16x8##cpu;
+#define INIT4_NAME( name1, name2, cpu ) \
+ INIT2_NAME( name1, name2, cpu ) \
+ pixf->name1[PIXEL_8x16] = x264_pixel_##name2##_8x16##cpu;\
+ pixf->name1[PIXEL_8x8] = x264_pixel_##name2##_8x8##cpu;
+#define INIT5_NAME( name1, name2, cpu ) \
+ INIT4_NAME( name1, name2, cpu ) \
+ pixf->name1[PIXEL_8x4] = x264_pixel_##name2##_8x4##cpu;
+#define INIT7_NAME( name1, name2, cpu ) \
+ INIT5_NAME( name1, name2, cpu ) \
+ pixf->name1[PIXEL_4x8] = x264_pixel_##name2##_4x8##cpu;\
+ pixf->name1[PIXEL_4x4] = x264_pixel_##name2##_4x4##cpu;
+#define INIT2( name, cpu ) INIT2_NAME( name, name, cpu )
+#define INIT4( name, cpu ) INIT4_NAME( name, name, cpu )
+#define INIT5( name, cpu ) INIT5_NAME( name, name, cpu )
+#define INIT7( name, cpu ) INIT7_NAME( name, name, cpu )
#define INIT_ADS( cpu ) \
pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\
@@ -523,6 +597,7 @@
pixf->ads[PIXEL_8x8] = x264_pixel_ads1##cpu;
INIT7( sad, );
+ INIT7_NAME( sad_aligned, sad, );
INIT7( sad_x3, );
INIT7( sad_x4, );
INIT7( ssd, );
@@ -530,8 +605,12 @@
INIT7( satd_x3, );
INIT7( satd_x4, );
INIT4( sa8d, );
+ INIT4( hadamard_ac, );
INIT_ADS( );
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16;
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8;
+
pixf->ssim_4x4x2_core = ssim_4x4x2_core;
pixf->ssim_end4 = ssim_end4;
@@ -544,13 +623,16 @@
if( cpu&X264_CPU_MMXEXT )
{
INIT7( sad, _mmxext );
+ INIT7_NAME( sad_aligned, sad, _mmxext );
INIT7( sad_x3, _mmxext );
INIT7( sad_x4, _mmxext );
INIT7( satd, _mmxext );
INIT7( satd_x3, _mmxext );
INIT7( satd_x4, _mmxext );
+ INIT4( hadamard_ac, _mmxext );
INIT_ADS( _mmxext );
-
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmxext;
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmxext;
#ifdef ARCH_X86
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext;
@@ -582,6 +664,7 @@
}
#endif
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext;
+ pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmxext;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext;
}
@@ -591,8 +674,10 @@
INIT2( sad, _sse2 );
INIT2( sad_x3, _sse2 );
INIT2( sad_x4, _sse2 );
+ INIT4( hadamard_ac, _sse2 );
INIT_ADS( _sse2 );
-
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
+ pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
#ifdef ARCH_X86
if( cpu&X264_CPU_CACHELINE_64 )
{
@@ -608,6 +693,8 @@
INIT5( satd, _sse2 );
INIT5( satd_x3, _sse2 );
INIT5( satd_x4, _sse2 );
+ INIT2_NAME( sad_aligned, sad, _sse2_aligned );
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2;
pixf->ssim_end4 = x264_pixel_ssim_end4_sse2;
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
@@ -629,10 +716,12 @@
INIT7( satd, _ssse3 );
INIT7( satd_x3, _ssse3 );
INIT7( satd_x4, _ssse3 );
+ INIT4( hadamard_ac, _ssse3 );
INIT_ADS( _ssse3 );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
+ pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_ssse3;
#ifdef ARCH_X86_64
|
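Note on the hadamard_ac addition above: pixel_hadamard_ac packs two AC sums into one uint64_t (high 32 bits = 8x8-transform sum, low 32 bits = 4x4-transform sum), and the HADAMARD_AC wrapper rescales them (>>2 and >>1) before repacking. A minimal sketch of how a caller could unpack the packed return value; the helper name is illustrative, not from the patch:

    #include <stdint.h>

    /* Unpack the value returned by the hadamard_ac functions: the high 32 bits
     * carry the 8x8 Hadamard AC sum, the low 32 bits the 4x4 Hadamard AC sum,
     * already scaled by the shifts in the HADAMARD_AC macro. */
    static inline void unpack_hadamard_ac( uint64_t packed, uint32_t *ac8, uint32_t *ac4 )
    {
        *ac8 = (uint32_t)(packed >> 32);
        *ac4 = (uint32_t)packed;
    }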
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/pixel.h
^
|
@@ -68,9 +68,14 @@
x264_pixel_cmp_t ssim[7];
x264_pixel_cmp_t sa8d[4];
x264_pixel_cmp_t mbcmp[7]; /* either satd or sad for subpel refine and mode decision */
+ x264_pixel_cmp_t mbcmp_unaligned[7]; /* unaligned mbcmp for subpel */
x264_pixel_cmp_t fpelcmp[7]; /* either satd or sad for fullpel motion search */
x264_pixel_cmp_x3_t fpelcmp_x3[7];
x264_pixel_cmp_x4_t fpelcmp_x4[7];
+ x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
+
+ int (*var[4])( uint8_t *pix, int stride, uint32_t *sad );
+ uint64_t (*hadamard_ac[4])( uint8_t *pix, int stride );
void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1,
const uint8_t *pix2, int stride2, int sums[2][4] );
@@ -87,12 +92,14 @@
int (*ads[7])( int enc_dc[4], uint16_t *sums, int delta,
uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );
- /* calculate satd of V, H, and DC modes.
+ /* calculate satd or sad of V, H, and DC modes.
* may be NULL, in which case just use pred+satd instead. */
- void (*intra_satd_x3_16x16)( uint8_t *fenc, uint8_t *fdec, int res[3] );
- void (*intra_satd_x3_8x8c)( uint8_t *fenc, uint8_t *fdec, int res[3] );
- void (*intra_satd_x3_4x4)( uint8_t *fenc, uint8_t *fdec, int res[3] );
- void (*intra_sa8d_x3_8x8)( uint8_t *fenc, uint8_t edge[33], int res[3] );
+ void (*intra_mbcmp_x3_16x16)( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_satd_x3_16x16) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_sad_x3_16x16) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_satd_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_satd_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_sa8d_x3_8x8) ( uint8_t *fenc, uint8_t edge[33], int res[3] );
} x264_pixel_function_t;
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/predict.c
^
|
@@ -27,9 +27,6 @@
#include "common.h"
-#ifdef _MSC_VER
-#undef HAVE_MMX /* not finished now */
-#endif
#ifdef HAVE_MMX
# include "x86/predict.h"
#endif
@@ -646,7 +643,7 @@
SRC(5,0)=SRC(6,1)=SRC(7,2)= F2(t3,t4,t5);
SRC(6,0)=SRC(7,1)= F2(t4,t5,t6);
SRC(7,0)= F2(t5,t6,t7);
-
+
}
static void predict_8x8_vr( uint8_t *src, uint8_t edge[33] )
{
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/quant.c
^
|
@@ -194,7 +194,7 @@
}
}
-void x264_denoise_dct_core( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
+static void x264_denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
{
int i;
for( i=1; i<size; i++ )
@@ -218,7 +218,7 @@
pf->dequant_4x4 = dequant_4x4;
pf->dequant_8x8 = dequant_8x8;
- pf->denoise_dct_core = x264_denoise_dct_core;
+ pf->denoise_dct = x264_denoise_dct;
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMX )
@@ -233,7 +233,7 @@
pf->dequant_4x4 = x264_dequant_4x4_flat16_mmx;
pf->dequant_8x8 = x264_dequant_8x8_flat16_mmx;
}
- pf->denoise_dct_core = x264_denoise_dct_core_mmx;
+ pf->denoise_dct = x264_denoise_dct_mmx;
#endif
}
@@ -257,7 +257,7 @@
pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2;
pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
}
- pf->denoise_dct_core = x264_denoise_dct_core_sse2;
+ pf->denoise_dct = x264_denoise_dct_sse2;
}
if( cpu&X264_CPU_SSSE3 )
@@ -266,7 +266,7 @@
pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
pf->quant_4x4 = x264_quant_4x4_ssse3;
pf->quant_8x8 = x264_quant_8x8_ssse3;
- pf->denoise_dct_core = x264_denoise_dct_core_ssse3;
+ pf->denoise_dct = x264_denoise_dct_ssse3;
}
#endif // HAVE_MMX
|
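The quant.c hunk above only shows the rename (denoise_dct_core becomes a static denoise_dct) and the start of its loop. For orientation, a rough sketch of what a dead-zone DCT denoiser with this signature does; the body below is an illustration, not copied from the patch:

    #include <stdint.h>
    #include <stdlib.h>

    /* Illustrative sketch: accumulate each AC coefficient's magnitude into sum[]
     * and shrink it toward zero by offset[].  DC (i=0) is skipped, matching the
     * i=1 loop start visible in the hunk. */
    static void denoise_dct_sketch( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
    {
        int i;
        for( i = 1; i < size; i++ )
        {
            int level = abs( dct[i] );
            sum[i] += level;
            level -= offset[i];
            if( level < 0 )
                level = 0;
            dct[i] = dct[i] < 0 ? -level : level;
        }
    }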
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/quant.h
^
|
@@ -33,7 +33,7 @@
void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
- void (*denoise_dct_core)( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
+ void (*denoise_dct)( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
} x264_quant_function_t;
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/set.c
^
|
@@ -75,7 +75,7 @@
int quant8_mf[2][6][8][8];
int q, i, j, i_list;
int deadzone[4] = { 32 - h->param.analyse.i_luma_deadzone[1],
- 32 - h->param.analyse.i_luma_deadzone[0],
+ 32 - h->param.analyse.i_luma_deadzone[0],
32 - 11, 32 - 21 };
int max_qp_err = -1;
@@ -195,7 +195,7 @@
}
}
-int x264_cqm_parse_jmlist( x264_t *h, const char *buf, const char *name,
+static int x264_cqm_parse_jmlist( x264_t *h, const char *buf, const char *name,
uint8_t *cqm, const uint8_t *jvt, int length )
{
char *p;
@@ -247,7 +247,7 @@
int b_error = 0;
h->param.i_cqm_preset = X264_CQM_CUSTOM;
-
+
buf = x264_slurp_file( filename );
if( !buf )
{
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/set.h
^
|
@@ -28,11 +28,12 @@
{
PROFILE_BASELINE = 66,
PROFILE_MAIN = 77,
- PROFILE_EXTENTED = 88,
+ PROFILE_EXTENDED = 88,
PROFILE_HIGH = 100,
PROFILE_HIGH10 = 110,
PROFILE_HIGH422 = 122,
- PROFILE_HIGH444 = 144
+ PROFILE_HIGH444 = 144,
+ PROFILE_HIGH444_PREDICTIVE = 244,
};
enum cqm4_e
@@ -94,7 +95,7 @@
int b_aspect_ratio_info_present;
int i_sar_width;
int i_sar_height;
-
+
int b_overscan_info_present;
int b_overscan_info;
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/cabac-a.asm
^
|
@@ -63,20 +63,13 @@
endstruc
%macro LOAD_GLOBAL 4
-%ifdef PIC64
+%ifdef PIC
; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
lea r11, [%2 GLOBAL]
%ifnidn %3, 0
add r11, %3
%endif
movzx %1, byte [r11+%4]
-%elifdef PIC32
- %ifnidn %3, 0
- lea %1, [%3+%4]
- movzx %1, byte [%2+%1 GLOBAL]
- %else
- movzx %1, byte [%2+%3+%4 GLOBAL]
- %endif
%else
movzx %1, byte [%2+%3+%4]
%endif
@@ -85,7 +78,6 @@
cglobal x264_cabac_encode_decision_asm, 0,7
movifnidn t0d, r0m
movifnidn t1d, r1m
- picgetgot t2
mov t5d, [r0+cb.range]
movzx t3d, byte [r0+cb.state+t1]
mov t4d, t5d
@@ -95,22 +87,13 @@
sub t4d, t5d
mov t6d, t3d
shr t6d, 6
-%ifdef PIC32
- cmp t6d, r2m
-%else
movifnidn t2d, r2m
cmp t6d, t2d
-%endif
mov t6d, [r0+cb.low]
lea t7, [t6+t4]
cmovne t4d, t5d
cmovne t6d, t7d
-%ifdef PIC32
- mov t1, r2m
- LOAD_GLOBAL t3d, x264_cabac_transition, t1, t3*2
-%else
LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2
-%endif
movifnidn t1d, r1m
mov [r0+cb.state+t1], t3b
.renorm:
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/dct-32.asm
^
|
@@ -32,21 +32,6 @@
SECTION .text
-%macro SBUTTERFLY 4
- mova m%4, m%2
- punpckl%1 m%2, m%3
- punpckh%1 m%4, m%3
- SWAP %3, %4
-%endmacro
-
-%macro TRANSPOSE4x4W 5
- SBUTTERFLY wd, %1, %2, %5
- SBUTTERFLY wd, %3, %4, %5
- SBUTTERFLY dq, %1, %3, %5
- SBUTTERFLY dq, %2, %4, %5
- SWAP %2, %3
-%endmacro
-
; in: m0..m7
; out: 0,4,6 in mem, rest in regs
%macro DCT8_1D 9
@@ -359,31 +344,6 @@
INIT_XMM
-; in: m0..m7, except m6 which is in [%9+0x60]
-; out: m0..m7, except m4 which is in [%9+0x40]
-%macro TRANSPOSE8x8W 9
- SBUTTERFLY wd, %1, %2, %7
- movdqa [%9+16], m%2
- movdqa m%7, [%9+0x60]
- SBUTTERFLY wd, %3, %4, %2
- SBUTTERFLY wd, %5, %6, %2
- SBUTTERFLY wd, %7, %8, %2
- SBUTTERFLY dq, %1, %3, %2
- movdqa [%9], m%3
- movdqa m%2, [%9+16]
- SBUTTERFLY dq, %2, %4, %3
- SBUTTERFLY dq, %5, %7, %3
- SBUTTERFLY dq, %6, %8, %3
- SBUTTERFLY qdq, %1, %5, %3
- SBUTTERFLY qdq, %2, %6, %3
- movdqa [%9+0x40], m%2
- movdqa m%3, [%9]
- SBUTTERFLY qdq, %3, %7, %2
- SBUTTERFLY qdq, %4, %8, %2
- SWAP %2, %5
- SWAP %4, %7
-%endmacro
-
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
@@ -402,7 +362,7 @@
UNSPILL r0, 0
DCT8_1D 0,1,2,3,4,5,6,7,r0
UNSPILL r0, 0,4
- TRANSPOSE8x8W 0,1,2,3,4,5,6,7,r0
+ TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r0+0x60],[r0+0x40],1
UNSPILL r0, 4
DCT8_1D 0,1,2,3,4,5,6,7,r0
SPILL r0, 1,2,3,5,7
@@ -417,8 +377,7 @@
UNSPILL r1, 1,2,3,5,6,7
IDCT8_1D 0,1,2,3,4,5,6,7,r1
SPILL r1, 6
- TRANSPOSE8x8W 0,1,2,3,4,5,6,7,r1
- picgetgot edx
+ TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r1+0x60],[r1+0x40],1
paddw m0, [pw_32 GLOBAL]
SPILL r1, 0
IDCT8_1D 0,1,2,3,4,5,6,7,r1
|
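The local SBUTTERFLY/TRANSPOSE macros are dropped from dct-32.asm here, presumably consolidated into the shared x86util.asm include. As a reminder of what one SBUTTERFLY step does, a rough C model for the 4-word (MMX) case; the type and function names are illustrative only:

    #include <stdint.h>

    typedef struct { uint16_t w[4]; } mmreg;   /* one 64-bit MMX register as 4 words */

    /* Model of SBUTTERFLY wd: punpcklwd/punpckhwd interleave two registers; after
     * the SWAP, 'a' holds the interleaved low words and 'b' the interleaved high
     * words.  Two wd passes followed by two dq passes build TRANSPOSE4x4W. */
    static void sbutterfly_wd( mmreg *a, mmreg *b )
    {
        mmreg lo = {{ a->w[0], b->w[0], a->w[1], b->w[1] }};
        mmreg hi = {{ a->w[2], b->w[2], a->w[3], b->w[3] }};
        *a = lo;
        *b = hi;
    }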
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/dct-64.asm
^
|
@@ -29,35 +29,8 @@
pw_32: times 8 dw 32
SECTION .text
-
INIT_XMM
-%macro SBUTTERFLY 4
- mova m%4, m%2
- punpckl%1 m%2, m%3
- punpckh%1 m%4, m%3
- SWAP %3, %4
-%endmacro
-
-%macro TRANSPOSE8x8W 9
- SBUTTERFLY wd, %1, %2, %9
- SBUTTERFLY wd, %3, %4, %9
- SBUTTERFLY wd, %5, %6, %9
- SBUTTERFLY wd, %7, %8, %9
- SBUTTERFLY dq, %1, %3, %9
- SBUTTERFLY dq, %2, %4, %9
- SBUTTERFLY dq, %5, %7, %9
- SBUTTERFLY dq, %6, %8, %9
- SBUTTERFLY qdq, %1, %5, %9
- SBUTTERFLY qdq, %2, %6, %9
- SBUTTERFLY qdq, %3, %7, %9
- SBUTTERFLY qdq, %4, %8, %9
- SWAP %2, %5
- SWAP %4, %7
-%endmacro
-
-SECTION .text
-
%macro DCT8_1D 10
SUMSUB_BA m%8, m%1 ; %8=s07, %1=d07
SUMSUB_BA m%7, m%2 ; %7=s16, %2=d16
@@ -151,7 +124,7 @@
paddw m%9, m%2
paddw m%9, m%4
paddw m%9, m%6 ; %9=a7
-
+
movdqa m%10, m%6
psraw m%10, 1
paddw m%10, m%6
@@ -208,7 +181,7 @@
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
paddw m0, [pw_32 GLOBAL] ; rounding for the >>6 at the end
IDCT8_1D 0,1,2,3,4,5,6,7,8,9
-
+
pxor m9, m9
STORE_DIFF m0, m8, m9, [r0+0*FDEC_STRIDE]
STORE_DIFF m1, m8, m9, [r0+1*FDEC_STRIDE]
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/dct-a.asm
^
|
@@ -28,34 +28,12 @@
SECTION_RODATA
pw_1: times 8 dw 1
pw_32: times 8 dw 32
-pb_zigzag4: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
+pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
+pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
+pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
SECTION .text
-%macro SBUTTERFLY 4
- mova m%4, m%2
- punpckl%1 m%2, m%3
- punpckh%1 m%4, m%3
- SWAP %3, %4
-%endmacro
-
-%macro TRANSPOSE4x4W 5
- SBUTTERFLY wd, %1, %2, %5
- SBUTTERFLY wd, %3, %4, %5
- SBUTTERFLY dq, %1, %3, %5
- SBUTTERFLY dq, %2, %4, %5
- SWAP %2, %3
-%endmacro
-
-%macro TRANSPOSE2x4x4W 5
- SBUTTERFLY wd, %1, %2, %5
- SBUTTERFLY wd, %3, %4, %5
- SBUTTERFLY dq, %1, %3, %5
- SBUTTERFLY dq, %2, %4, %5
- SBUTTERFLY qdq, %1, %2, %5
- SBUTTERFLY qdq, %3, %4, %5
-%endmacro
-
%macro HADAMARD4_1D 4
SUMSUB_BADC m%2, m%1, m%4, m%3
SUMSUB_BADC m%4, m%2, m%3, m%1
@@ -65,7 +43,7 @@
;-----------------------------------------------------------------------------
; void x264_dct4x4dc_mmx( int16_t d[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_dct4x4dc_mmx, 1,1,1
+cglobal x264_dct4x4dc_mmx, 1,1
movq m0, [r0+ 0]
movq m1, [r0+ 8]
movq m2, [r0+16]
@@ -143,7 +121,7 @@
;-----------------------------------------------------------------------------
; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_add4x4_idct_mmx, 2,2,1
+cglobal x264_add4x4_idct_mmx, 2,2
.skip_prologue:
movq m0, [r1+ 0]
movq m1, [r1+ 8]
@@ -179,7 +157,7 @@
movhps [r0+56], m3
ret
-cglobal x264_add8x8_idct_sse2, 2,2,1
+cglobal x264_add8x8_idct_sse2, 2,2
.skip_prologue:
call .8x4
add r1, 64
@@ -221,7 +199,7 @@
; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6
-cglobal %1, 2,2,1
+cglobal %1, 2,2
.skip_prologue:
call %2
add r0, %4-%5-%6*FDEC_STRIDE
@@ -257,7 +235,264 @@
SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 0
+;-----------------------------------------------------------------------------
+; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
+;-----------------------------------------------------------------------------
+%macro SCAN_8x8 1
+cglobal x264_zigzag_scan_8x8_frame_%1, 2,2
+ movdqa xmm0, [r1]
+ movdqa xmm1, [r1+16]
+ movdq2q mm0, xmm0
+ PALIGNR xmm1, xmm1, 14, xmm2
+ movdq2q mm1, xmm1
+
+ movdqa xmm2, [r1+32]
+ movdqa xmm3, [r1+48]
+ PALIGNR xmm2, xmm2, 12, xmm4
+ movdq2q mm2, xmm2
+ PALIGNR xmm3, xmm3, 10, xmm4
+ movdq2q mm3, xmm3
+
+ punpckhwd xmm0, xmm1
+ punpckhwd xmm2, xmm3
+
+ movq mm4, mm1
+ movq mm5, mm1
+ movq mm6, mm2
+ movq mm7, mm3
+ punpckhwd mm1, mm0
+ psllq mm0, 16
+ psrlq mm3, 16
+ punpckhdq mm1, mm1
+ punpckhdq mm2, mm0
+ punpcklwd mm0, mm4
+ punpckhwd mm4, mm3
+ punpcklwd mm4, mm2
+ punpckhdq mm0, mm2
+ punpcklwd mm6, mm3
+ punpcklwd mm5, mm7
+ punpcklwd mm5, mm6
+
+ movdqa xmm4, [r1+64]
+ movdqa xmm5, [r1+80]
+ movdqa xmm6, [r1+96]
+ movdqa xmm7, [r1+112]
+
+ movq [r0+2*00], mm0
+ movq [r0+2*04], mm4
+ movd [r0+2*08], mm1
+ movq [r0+2*36], mm5
+ movq [r0+2*46], mm6
+
+ PALIGNR xmm4, xmm4, 14, xmm3
+ movdq2q mm4, xmm4
+ PALIGNR xmm5, xmm5, 12, xmm3
+ movdq2q mm5, xmm5
+ PALIGNR xmm6, xmm6, 10, xmm3
+ movdq2q mm6, xmm6
+%ifidn %1, ssse3
+ PALIGNR xmm7, xmm7, 8, xmm3
+ movdq2q mm7, xmm7
+%else
+ movhlps xmm3, xmm7
+ movlhps xmm7, xmm7
+ movdq2q mm7, xmm3
+%endif
+
+ punpckhwd xmm4, xmm5
+ punpckhwd xmm6, xmm7
+ movq mm0, mm4
+ movq mm1, mm5
+ movq mm3, mm7
+ punpcklwd mm7, mm6
+ psrlq mm6, 16
+ punpcklwd mm4, mm6
+ punpcklwd mm5, mm4
+ punpckhdq mm4, mm3
+ punpcklwd mm3, mm6
+ punpckhwd mm3, mm4
+ punpckhwd mm0, mm1
+ punpckldq mm4, mm0
+ punpckhdq mm0, mm6
+ pshufw mm4, mm4, 0x6c
+
+ movq [r0+2*14], mm4
+ movq [r0+2*25], mm0
+ movd [r0+2*54], mm7
+ movq [r0+2*56], mm5
+ movq [r0+2*60], mm3
+
+ movdqa xmm3, xmm0
+ movdqa xmm7, xmm4
+ punpckldq xmm0, xmm2
+ punpckldq xmm4, xmm6
+ punpckhdq xmm3, xmm2
+ punpckhdq xmm7, xmm6
+ pshufhw xmm0, xmm0, 0x1b
+ pshuflw xmm4, xmm4, 0x1b
+ pshufhw xmm3, xmm3, 0x1b
+ pshuflw xmm7, xmm7, 0x1b
+
+ movlps [r0+2*10], xmm0
+ movhps [r0+2*17], xmm0
+ movlps [r0+2*21], xmm3
+ movlps [r0+2*28], xmm4
+ movhps [r0+2*32], xmm3
+ movhps [r0+2*39], xmm4
+ movlps [r0+2*43], xmm7
+ movhps [r0+2*50], xmm7
+
+ RET
+%endmacro
+
+INIT_XMM
+%define PALIGNR PALIGNR_MMX
+SCAN_8x8 sse2
+%define PALIGNR PALIGNR_SSSE3
+SCAN_8x8 ssse3
+
+;-----------------------------------------------------------------------------
+; void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[8][8] )
+;-----------------------------------------------------------------------------
+cglobal x264_zigzag_scan_8x8_frame_mmxext, 2,2
+ movq mm0, [r1]
+ movq mm1, [r1+2*8]
+ movq mm2, [r1+2*14]
+ movq mm3, [r1+2*21]
+ movq mm4, [r1+2*28]
+ movq mm5, mm0
+ movq mm6, mm1
+ psrlq mm0, 16
+ punpckldq mm1, mm1
+ punpcklwd mm5, mm6
+ punpckhwd mm1, mm3
+ punpckhwd mm6, mm0
+ punpckldq mm5, mm0
+ movq mm7, [r1+2*52]
+ movq mm0, [r1+2*60]
+ punpckhwd mm1, mm2
+ punpcklwd mm2, mm4
+ punpckhwd mm4, mm3
+ punpckldq mm3, mm3
+ punpckhwd mm3, mm2
+ movq [r0], mm5
+ movq [r0+2*4], mm1
+ movq [r0+2*8], mm6
+ punpcklwd mm6, mm0
+ punpcklwd mm6, mm7
+ movq mm1, [r1+2*32]
+ movq mm5, [r1+2*39]
+ movq mm2, [r1+2*46]
+ movq [r0+2*35], mm3
+ movq [r0+2*47], mm4
+ punpckhwd mm7, mm0
+ psllq mm0, 16
+ movq mm3, mm5
+ punpcklwd mm5, mm1
+ punpckhwd mm1, mm2
+ punpckhdq mm3, mm3
+ movq [r0+2*52], mm6
+ movq [r0+2*13], mm5
+ movq mm4, [r1+2*11]
+ movq mm6, [r1+2*25]
+ punpcklwd mm5, mm7
+ punpcklwd mm1, mm3
+ punpckhdq mm0, mm7
+ movq mm3, [r1+2*4]
+ movq mm7, [r1+2*18]
+ punpcklwd mm2, mm5
+ movq [r0+2*25], mm1
+ movq mm1, mm4
+ movq mm5, mm6
+ punpcklwd mm4, mm3
+ punpcklwd mm6, mm7
+ punpckhwd mm1, mm3
+ punpckhwd mm5, mm7
+ movq mm3, mm6
+ movq mm7, mm5
+ punpckldq mm6, mm4
+ punpckldq mm5, mm1
+ punpckhdq mm3, mm4
+ punpckhdq mm7, mm1
+ movq mm4, [r1+2*35]
+ movq mm1, [r1+2*49]
+ pshufw mm6, mm6, 0x1b
+ pshufw mm5, mm5, 0x1b
+ movq [r0+2*60], mm0
+ movq [r0+2*56], mm2
+ movq mm0, [r1+2*42]
+ movq mm2, [r1+2*56]
+ movq [r0+2*17], mm3
+ movq [r0+2*32], mm7
+ movq [r0+2*10], mm6
+ movq [r0+2*21], mm5
+ movq mm3, mm0
+ movq mm7, mm2
+ punpcklwd mm0, mm4
+ punpcklwd mm2, mm1
+ punpckhwd mm3, mm4
+ punpckhwd mm7, mm1
+ movq mm4, mm2
+ movq mm1, mm7
+ punpckhdq mm2, mm0
+ punpckhdq mm7, mm3
+ punpckldq mm4, mm0
+ punpckldq mm1, mm3
+ pshufw mm2, mm2, 0x1b
+ pshufw mm7, mm7, 0x1b
+ movq [r0+2*28], mm4
+ movq [r0+2*43], mm1
+ movq [r0+2*39], mm2
+ movq [r0+2*50], mm7
+ RET
+
+;-----------------------------------------------------------------------------
+; void x264_zigzag_scan_4x4_frame_mmx( int16_t level[16], int16_t dct[4][4] )
+;-----------------------------------------------------------------------------
+cglobal x264_zigzag_scan_4x4_frame_mmx, 2,2
+ movq mm0, [r1]
+ movq mm1, [r1+8]
+ movq mm2, [r1+16]
+ movq mm3, [r1+24]
+ movq mm4, mm0
+ movq mm5, mm1
+ movq mm6, mm2
+ movq mm7, mm3
+ psllq mm3, 16
+ psrlq mm0, 16
+ punpckldq mm2, mm2
+ punpckhdq mm1, mm1
+ punpcklwd mm4, mm5
+ punpcklwd mm5, mm3
+ punpckldq mm4, mm0
+ punpckhwd mm5, mm2
+ punpckhwd mm0, mm6
+ punpckhwd mm6, mm7
+ punpcklwd mm1, mm0
+ punpckhdq mm3, mm6
+ movq [r0], mm4
+ movq [r0+8], mm5
+ movq [r0+16], mm1
+ movq [r0+24], mm3
+ RET
+
+;-----------------------------------------------------------------------------
+; void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[4][4] )
+;-----------------------------------------------------------------------------
+cglobal x264_zigzag_scan_4x4_frame_ssse3, 2,2
+ movdqa xmm1, [r1+16]
+ movdqa xmm0, [r1]
+ pshufb xmm1, [pb_scan4frameb GLOBAL]
+ pshufb xmm0, [pb_scan4framea GLOBAL]
+ movdqa xmm2, xmm1
+ psrldq xmm1, 6
+ palignr xmm2, xmm0, 6
+ pslldq xmm0, 10
+ palignr xmm1, xmm0, 10
+ movdqa [r0], xmm2
+ movdqa [r0+16], xmm1
+ RET
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] )
@@ -292,14 +527,13 @@
movd [r2+1*FDEC_STRIDE], xmm1
movd [r2+2*FDEC_STRIDE], xmm2
movd [r2+3*FDEC_STRIDE], xmm3
- picgetgot r1
punpckldq xmm0, xmm1
punpckldq xmm2, xmm3
punpckldq xmm4, xmm5
punpckldq xmm6, xmm7
movlhps xmm0, xmm2
movlhps xmm4, xmm6
- movdqa xmm7, [pb_zigzag4 GLOBAL]
+ movdqa xmm7, [pb_sub4frame GLOBAL]
pshufb xmm0, xmm7
pshufb xmm4, xmm7
pxor xmm6, xmm6
|
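For reference when reading the new x264_zigzag_scan_4x4_frame_{mmx,ssse3} routines above: the permutation they implement is the standard 4x4 frame zigzag, the same table that appears as pb_sub4frame. A scalar sketch of the scan (flat-array indexing used for clarity; x264's own C reference is written with a ZIG() macro):

    #include <stdint.h>

    static const uint8_t zigzag4x4_frame[16] =
        { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };

    /* level[i] receives the coefficient at raster position zigzag4x4_frame[i]. */
    static void zigzag_scan_4x4_frame_ref( int16_t level[16], const int16_t dct[16] )
    {
        int i;
        for( i = 0; i < 16; i++ )
            level[i] = dct[ zigzag4x4_frame[i] ];
    }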
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/dct.h
^
|
@@ -24,32 +24,37 @@
#ifndef X264_I386_DCT_H
#define X264_I386_DCT_H
-void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_mmx( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_sse2( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_sse2( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
-
-void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] );
-void x264_add8x8_idct_mmx( uint8_t *p_dst, int16_t dct[4][4][4] );
-void x264_add16x16_idct_mmx( uint8_t *p_dst, int16_t dct[16][4][4] );
-void x264_add8x8_idct_sse2( uint8_t *p_dst, int16_t dct[4][4][4] );
-void x264_add16x16_idct_sse2( uint8_t *p_dst, int16_t dct[16][4][4] );
-
-void x264_dct4x4dc_mmx( int16_t d[4][4] );
-void x264_idct4x4dc_mmx( int16_t d[4][4] );
-
-void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_mmx( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_sse2( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
-
-void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] );
-void x264_add16x16_idct8_mmx( uint8_t *dst, int16_t dct[4][8][8] );
-void x264_add8x8_idct8_sse2( uint8_t *dst, int16_t dct[8][8] );
+void x264_sub4x4_dct_mmx ( int16_t dct[ 4][4] , uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_mmx ( int16_t dct[ 4][4][4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_mmx ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][4][4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_sse2 ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
+
+void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4] );
+void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
+void x264_add16x16_idct_mmx ( uint8_t *p_dst, int16_t dct[16][4][4] );
+void x264_add8x8_idct_sse2 ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
+void x264_add16x16_idct_sse2 ( uint8_t *p_dst, int16_t dct[16][4][4] );
+
+void x264_dct4x4dc_mmx ( int16_t d[4][4] );
+void x264_idct4x4dc_mmx ( int16_t d[4][4] );
+
+void x264_sub8x8_dct8_mmx ( int16_t dct[8][8] , uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_mmx ( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct8_sse2 ( int16_t dct[8][8] , uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_sse2 ( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
+
+void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct[8][8] );
+void x264_add16x16_idct8_mmx ( uint8_t *dst, int16_t dct[4][8][8] );
+void x264_add8x8_idct8_sse2 ( uint8_t *dst, int16_t dct[8][8] );
void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][8][8] );
+void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[8][8] );
+void x264_zigzag_scan_8x8_frame_sse2 ( int16_t level[64], int16_t dct[8][8] );
+void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[8][8] );
+void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[4][4] );
+void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] );
-void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst );
+void x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
#endif
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/deblock-a.asm
^
|
@@ -373,7 +373,7 @@
;-----------------------------------------------------------------------------
; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_%1, 5,5,1
+cglobal x264_deblock_%2_luma_%1, 5,5
lea r4, [r1*3]
dec r2 ; alpha-1
neg r4
@@ -609,7 +609,7 @@
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_intra_%1, 4,6,1
+cglobal x264_deblock_%2_luma_intra_%1, 4,6
%ifndef ARCH_X86_64
sub esp, 0x60
%endif
@@ -813,7 +813,6 @@
movd m6, [r4] ; tc0
punpcklbw m6, m6
pand m7, m6
- picgetgot r4
DEBLOCK_P0_Q0
ret
@@ -862,7 +861,6 @@
LOAD_MASK r2d, r3d
movq m5, m1
movq m6, m2
- picgetgot r2
CHROMA_INTRA_P0 m1, m0, m3
CHROMA_INTRA_P0 m2, m3, m0
psubb m1, m5
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/mc-a.asm
^
|
@@ -36,33 +36,21 @@
SECTION .text
;=============================================================================
-; pixel avg
+; weighted prediction
;=============================================================================
-
-;-----------------------------------------------------------------------------
-; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride,
-; uint8_t *src, int src_stride );
-;-----------------------------------------------------------------------------
-%macro AVGH 3
-%assign function_align 8 ; the whole function fits in 8 bytes, so a larger align just wastes space
-cglobal x264_pixel_avg_%1x%2_%3
- mov eax, %2
- jmp x264_pixel_avg_w%1_%3
-%assign function_align 16
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride,
-; uint8_t *src, int src_stride,
-; int height );
-;-----------------------------------------------------------------------------
+; implicit bipred only:
+; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
%ifdef ARCH_X86_64
%define t0 r0
%define t1 r1
%define t2 r2
%define t3 r3
- %macro AVG_START 1
- cglobal %1, 4,5
+ %define t4 r4
+ %define t5 r5
+ %define t6d r10d
+ %define t7d r11d
+ %macro AVG_START 0
+ PROLOGUE 6,7
.height_loop:
%endmacro
%else
@@ -70,79 +58,228 @@
%define t1 r2
%define t2 r3
%define t3 r4
- %macro AVG_START 1
- cglobal %1, 0,5
+ %define t4 r5
+ %define t5 r6
+ %define t6d r1d
+ %define t7d r2d
+ %macro AVG_START 0
+ PROLOGUE 0,7
mov t0, r0m
mov t1, r1m
mov t2, r2m
mov t3, r3m
+ mov t4, r4m
+ mov t5, r5m
.height_loop:
%endmacro
%endif
+%macro SPLATW 2
+%if mmsize==16
+ pshuflw %1, %2, 0
+ movlhps %1, %1
+%else
+ pshufw %1, %2, 0
+%endif
+%endmacro
+
+%macro BIWEIGHT_MMX 2
+ movh m0, %1
+ movh m1, %2
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m1
+ paddw m0, m6
+ psraw m0, 6
+%endmacro
+
+%macro BIWEIGHT_START_MMX 0
+ movd m4, r6m
+ SPLATW m4, m4 ; weight_dst
+ mova m5, [pw_64 GLOBAL]
+ psubw m5, m4 ; weight_src
+ mova m6, [pw_32 GLOBAL] ; rounding
+ pxor m7, m7
+%endmacro
+
+%macro BIWEIGHT_SSSE3 2
+ movh m0, %1
+ movh m1, %2
+ punpcklbw m0, m1
+ pmaddubsw m0, m5
+ paddw m0, m6
+ psraw m0, 6
+%endmacro
+
+%macro BIWEIGHT_START_SSSE3 0
+ movzx t6d, byte r6m ; FIXME x86_64
+ mov t7d, 64
+ sub t7d, t6d
+ shl t7d, 8
+ add t6d, t7d
+ movd m5, t6d
+ mova m6, [pw_32 GLOBAL]
+ SPLATW m5, m5 ; weight_dst,src
+%endmacro
+
+%macro BIWEIGHT_ROW 4
+ BIWEIGHT [%2], [%3]
+%if %4==mmsize/2
+ packuswb m0, m0
+ movh [%1], m0
+%else
+ SWAP 0, 2
+ BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
+ packuswb m2, m0
+ mova [%1], m2
+%endif
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight )
+;-----------------------------------------------------------------------------
+%macro AVG_WEIGHT 2
+cglobal x264_pixel_avg_weight_w%2_%1, 0,0
+ BIWEIGHT_START
+ AVG_START
+%if %2==8 && mmsize==16
+ BIWEIGHT [t2], [t4]
+ SWAP 0, 2
+ BIWEIGHT [t2+t3], [t4+t5]
+ packuswb m2, m0
+ movlps [t0], m2
+ movhps [t0+t1], m2
+%else
+%assign x 0
+%rep 1+%2/(mmsize*2)
+ BIWEIGHT_ROW t0+x, t2+x, t4+x, %2
+ BIWEIGHT_ROW t0+x+t1, t2+x+t3, t4+x+t5, %2
+%assign x x+mmsize
+%endrep
+%endif
+ lea t0, [t0+t1*2]
+ lea t2, [t2+t3*2]
+ lea t4, [t4+t5*2]
+ sub eax, 2
+ jg .height_loop
+ REP_RET
+%endmacro
+
+%define BIWEIGHT BIWEIGHT_MMX
+%define BIWEIGHT_START BIWEIGHT_START_MMX
+INIT_MMX
+AVG_WEIGHT mmxext, 4
+AVG_WEIGHT mmxext, 8
+AVG_WEIGHT mmxext, 16
+INIT_XMM
+%define x264_pixel_avg_weight_w4_sse2 x264_pixel_avg_weight_w4_mmxext
+AVG_WEIGHT sse2, 8
+AVG_WEIGHT sse2, 16
+%define BIWEIGHT BIWEIGHT_SSSE3
+%define BIWEIGHT_START BIWEIGHT_START_SSSE3
+INIT_MMX
+AVG_WEIGHT ssse3, 4
+INIT_XMM
+AVG_WEIGHT ssse3, 8
+AVG_WEIGHT ssse3, 16
+
+
+
+;=============================================================================
+; pixel avg
+;=============================================================================
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride,
+; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, int weight );
+;-----------------------------------------------------------------------------
+%macro AVGH 3
+cglobal x264_pixel_avg_%1x%2_%3,0,0
+ mov eax, %2
+ cmp dword r6m, 32
+ jne x264_pixel_avg_weight_w%1_%3
+%if mmsize == 16 && %1 == 16
+ test dword r4m, 15
+ jz x264_pixel_avg_w%1_sse2
+%endif
+ jmp x264_pixel_avg_w%1_mmxext
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride,
+; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride,
+; int height, int weight );
+;-----------------------------------------------------------------------------
+
%macro AVG_END 0
sub eax, 2
+ lea t4, [t4+t5*2]
lea t2, [t2+t3*2]
lea t0, [t0+t1*2]
jg .height_loop
REP_RET
%endmacro
-AVG_START x264_pixel_avg_w4_mmxext
- movd mm0, [t2]
- movd mm1, [t2+t3]
- pavgb mm0, [t0]
- pavgb mm1, [t0+t1]
- movd [t0], mm0
- movd [t0+t1], mm1
-AVG_END
+%macro AVG_FUNC 3
+cglobal %1
+ AVG_START
+ %2 m0, [t2]
+ %2 m1, [t2+t3]
+ pavgb m0, [t4]
+ pavgb m1, [t4+t5]
+ %3 [t0], m0
+ %3 [t0+t1], m1
+ AVG_END
+%endmacro
+INIT_MMX
+AVG_FUNC x264_pixel_avg_w4_mmxext, movd, movd
AVGH 4, 8, mmxext
AVGH 4, 4, mmxext
AVGH 4, 2, mmxext
-AVG_START x264_pixel_avg_w8_mmxext
- movq mm0, [t2]
- movq mm1, [t2+t3]
- pavgb mm0, [t0]
- pavgb mm1, [t0+t1]
- movq [t0], mm0
- movq [t0+t1], mm1
-AVG_END
-
+AVG_FUNC x264_pixel_avg_w8_mmxext, movq, movq
AVGH 8, 16, mmxext
AVGH 8, 8, mmxext
AVGH 8, 4, mmxext
-AVG_START x264_pixel_avg_w16_mmxext
+cglobal x264_pixel_avg_w16_mmxext
+ AVG_START
movq mm0, [t2 ]
movq mm1, [t2+8]
movq mm2, [t2+t3 ]
movq mm3, [t2+t3+8]
- pavgb mm0, [t0 ]
- pavgb mm1, [t0+8]
- pavgb mm2, [t0+t1 ]
- pavgb mm3, [t0+t1+8]
+ pavgb mm0, [t4 ]
+ pavgb mm1, [t4+8]
+ pavgb mm2, [t4+t5 ]
+ pavgb mm3, [t4+t5+8]
movq [t0 ], mm0
movq [t0+8], mm1
movq [t0+t1 ], mm2
movq [t0+t1+8], mm3
-AVG_END
+ AVG_END
AVGH 16, 16, mmxext
AVGH 16, 8, mmxext
-AVG_START x264_pixel_avg_w16_sse2
- movdqu xmm0, [t2]
- movdqu xmm1, [t2+t3]
- pavgb xmm0, [t0]
- pavgb xmm1, [t0+t1]
- movdqa [t0], xmm0
- movdqa [t0+t1], xmm1
-AVG_END
-
+INIT_XMM
+AVG_FUNC x264_pixel_avg_w16_sse2, movdqu, movdqa
AVGH 16, 16, sse2
-AVGH 16, 8, sse2
+AVGH 16, 8, sse2
+AVGH 8, 16, sse2
+AVGH 8, 8, sse2
+AVGH 8, 4, sse2
+AVGH 16, 16, ssse3
+AVGH 16, 8, ssse3
+AVGH 8, 16, ssse3
+AVGH 8, 8, ssse3
+AVGH 8, 4, ssse3
+INIT_MMX
+AVGH 4, 8, ssse3
+AVGH 4, 4, ssse3
+AVGH 4, 2, ssse3
@@ -284,17 +421,9 @@
%macro INIT_SHIFT 2
and eax, 7
shl eax, 3
-%ifdef PIC32
- ; both versions work, but picgetgot is slower than gpr->mmx is slower than mem->mmx
- mov r2, 64
- sub r2, eax
- movd %2, eax
- movd %1, r2
-%else
movd %1, [sw_64 GLOBAL]
movd %2, eax
psubw %1, %2
-%endif
%endmacro
%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
@@ -316,7 +445,7 @@
INIT_SHIFT mm6, mm7
mov eax, r4m
INIT_SHIFT mm4, mm5
- PROLOGUE 6,6,0
+ PROLOGUE 6,6
and r2, ~7
and r4, ~7
sub r4, r2
@@ -474,102 +603,12 @@
;=============================================================================
-; weighted prediction
-;=============================================================================
-; implicit bipred only:
-; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
-
-%macro SPLATW 2
-%if mmsize==16
- pshuflw %1, %2, 0
- movlhps %1, %1
-%else
- pshufw %1, %2, 0
-%endif
-%endmacro
-
-%macro BIWEIGHT 2
- movh m0, %1
- movh m1, %2
- punpcklbw m0, m7
- punpcklbw m1, m7
- pmullw m0, m4
- pmullw m1, m5
- paddw m0, m1
- paddw m0, m6
- psraw m0, 6
- pmaxsw m0, m7
- packuswb m0, m0
- movh %1, m0
-%endmacro
-
-%macro BIWEIGHT_START 1
-%ifidn r4m, r4d
- movd m4, r4m
- SPLATW m4, m4 ; weight_dst
-%else
- SPLATW m4, r4m
-%endif
- picgetgot r4
- mova m5, [pw_64 GLOBAL]
- psubw m5, m4 ; weight_src
- mova m6, [pw_32 GLOBAL] ; rounding
- pxor m7, m7
-%if %1
-%ifidn r5m, r5d
- %define t0 r5d
-%else
- %define t0 r4d
- mov r4d, r5m
-%endif
-%endif
-.height_loop:
-%endmacro
-
-INIT_MMX
-;-----------------------------------------------------------------------------
-; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src, int, int i_weight, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_avg_weight_4x4_mmxext, 4,4,1
- BIWEIGHT_START 0
- BIWEIGHT [r0 ], [r2 ]
- BIWEIGHT [r0+r1 ], [r2+r3 ]
- BIWEIGHT [r0+r1*2], [r2+r3*2]
- add r0, r1
- add r2, r3
- BIWEIGHT [r0+r1*2], [r2+r3*2]
- RET
-
-%macro AVG_WEIGHT 2
-cglobal x264_pixel_avg_weight_w%2_%1, 4,5
- BIWEIGHT_START 1
-%assign x 0
-%rep %2*2/mmsize
- BIWEIGHT [r0+x], [r2+x]
-%assign x x+mmsize/2
-%endrep
- add r0, r1
- add r2, r3
- dec t0
- jg .height_loop
- REP_RET
-%endmacro
-
-AVG_WEIGHT mmxext, 8
-AVG_WEIGHT mmxext, 16
-INIT_XMM
-AVG_WEIGHT sse2, 8
-AVG_WEIGHT sse2, 16
-
-
-
-;=============================================================================
; prefetch
;=============================================================================
; FIXME assumes 64 byte cachelines
;-----------------------------------------------------------------------------
-; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
+; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
; uint8_t *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
%ifdef ARCH_X86_64
@@ -671,7 +710,7 @@
; int width, int height )
;-----------------------------------------------------------------------------
%macro MC_CHROMA 1
-cglobal x264_mc_chroma_%1, 0,6,1
+cglobal x264_mc_chroma_%1, 0,6
%if mmsize == 16
cmp dword r6m, 4
jle x264_mc_chroma_mmxext %+ .skip_prologue
@@ -833,7 +872,7 @@
MC_CHROMA sse2
INIT_MMX
-cglobal x264_mc_chroma_ssse3, 0,6,1
+cglobal x264_mc_chroma_ssse3, 0,6
MC_CHROMA_START
and r4d, 7
and r5d, 7
|
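A scalar picture of what the new BIWEIGHT path computes may help when reading the macros above. Per the comment in the asm this is the implicit-bipred special case (log2_denom = 5, offset = 0, the two weights sum to 64); the helper below is only a sketch, its name is not from the patch:

    #include <stdint.h>

    /* One output pixel of the weighted average: pw_32 supplies the +32 rounding
     * term, psraw 6 is the >>6, and packuswb provides the final clamp to [0,255]. */
    static inline uint8_t avg_weight_pixel( uint8_t p1, uint8_t p2, int w1 )
    {
        int w2 = 64 - w1;                       /* weights always sum to 64 */
        int v  = ( p1 * w1 + p2 * w2 + 32 ) >> 6;
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }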
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/mc-a2.asm
^
|
@@ -32,12 +32,25 @@
SECTION .text
-%macro LOAD_ADD 3
+%macro LOAD_ADD 4
+ movh %4, %3
movh %1, %2
- movh m7, %3
+ punpcklbw %4, m0
punpcklbw %1, m0
- punpcklbw m7, m0
- paddw %1, m7
+ paddw %1, %4
+%endmacro
+
+%macro LOAD_ADD_2 6
+ mova %5, %3
+ mova %1, %4
+ mova %6, %5
+ mova %2, %1
+ punpcklbw %5, m0
+ punpcklbw %1, m0
+ punpckhbw %6, m0
+ punpckhbw %2, m0
+ paddw %1, %5
+ paddw %2, %6
%endmacro
%macro FILT_V2 0
@@ -64,27 +77,27 @@
paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
%endmacro
-%macro FILT_H2 0
- psubw m1, m2
- psubw m4, m5
- psraw m1, 2
- psraw m4, 2
- psubw m1, m2
- psubw m4, m5
- paddw m1, m3
- paddw m4, m6
- psraw m1, 2
- psraw m4, 2
- paddw m1, m3
- paddw m4, m6
+%macro FILT_H2 6
+ psubw %1, %2
+ psubw %4, %5
+ psraw %1, 2
+ psraw %4, 2
+ psubw %1, %2
+ psubw %4, %5
+ paddw %1, %3
+ paddw %4, %6
+ psraw %1, 2
+ psraw %4, 2
+ paddw %1, %3
+ paddw %4, %6
%endmacro
-%macro FILT_PACK 1
- paddw m1, m7
- paddw m4, m7
- psraw m1, %1
- psraw m4, %1
- packuswb m1, m4
+%macro FILT_PACK 3
+ paddw %1, m7
+ paddw %2, m7
+ psraw %1, %3
+ psraw %2, %3
+ packuswb %1, %2
%endmacro
%macro PALIGNR_MMX 4
@@ -111,7 +124,7 @@
;-----------------------------------------------------------------------------
; void x264_hpel_filter_v_mmxext( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_v_%1, 5,6,1
+cglobal x264_hpel_filter_v_%1, 5,6
lea r5, [r1+r3]
sub r1, r3
sub r1, r3
@@ -120,13 +133,10 @@
neg r4
pxor m0, m0
.loop:
- prefetcht0 [r5+r3*2+64]
- LOAD_ADD m1, [r1 ], [r5+r3*2] ; a0
- LOAD_ADD m2, [r1+r3 ], [r5+r3 ] ; b0
- LOAD_ADD m3, [r1+r3*2], [r5 ] ; c0
- LOAD_ADD m4, [r1 +mmsize/2], [r5+r3*2+mmsize/2] ; a1
- LOAD_ADD m5, [r1+r3 +mmsize/2], [r5+r3 +mmsize/2] ; b1
- LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5 +mmsize/2] ; c1
+ LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1
+ LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1
+ LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0
+ LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
FILT_V2
mova m7, [pw_16 GLOBAL]
mova [r2+r4*2], m1
@@ -136,7 +146,7 @@
psraw m1, 5
psraw m4, 5
packuswb m1, m4
- movnt [r0+r4], m1
+ mova [r0+r4], m1
add r1, mmsize
add r5, mmsize
add r4, mmsize
@@ -148,7 +158,7 @@
;-----------------------------------------------------------------------------
; void x264_hpel_filter_c_mmxext( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_c_mmxext, 3,3,1
+cglobal x264_hpel_filter_c_mmxext, 3,3
add r0, r2
lea r1, [r1+r2*2]
neg r2
@@ -167,8 +177,8 @@
paddw m4, [src+14] ; a1
paddw m5, [src+12] ; b1
paddw m6, [src+10] ; c1
- FILT_H2
- FILT_PACK 6
+ FILT_H2 m1, m2, m3, m4, m5, m6
+ FILT_PACK m1, m4, 6
movntq [r0+r2], m1
add r2, 8
jl .loop
@@ -177,7 +187,7 @@
;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_mmxext( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_h_mmxext, 3,3,1
+cglobal x264_hpel_filter_h_mmxext, 3,3
add r0, r2
add r1, r2
neg r2
@@ -211,8 +221,8 @@
punpcklbw m6, m0
paddw m6, m7 ; a1
movq m7, [pw_1 GLOBAL]
- FILT_H2
- FILT_PACK 1
+ FILT_H2 m1, m2, m3, m4, m5, m6
+ FILT_PACK m1, m4, 1
movntq [r0+r2], m1
add r2, 8
jl .loop
@@ -224,7 +234,7 @@
;-----------------------------------------------------------------------------
; void x264_hpel_filter_c_sse2( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_c_%1, 3,3,1
+cglobal x264_hpel_filter_c_%1, 3,3
add r0, r2
lea r1, [r1+r2*2]
neg r2
@@ -267,7 +277,7 @@
;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_sse2( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_h_sse2, 3,3,1
+cglobal x264_hpel_filter_h_sse2, 3,3
add r0, r2
add r1, r2
neg r2
@@ -305,24 +315,217 @@
punpcklbw m7, m0
paddw m6, m7 ; c1
mova m7, [pw_1 GLOBAL] ; FIXME xmm8
- FILT_H2
- FILT_PACK 1
+ FILT_H2 m1, m2, m3, m4, m5, m6
+ FILT_PACK m1, m4, 1
movntdq [r0+r2], m1
add r2, 16
jl .loop
REP_RET
+;-----------------------------------------------------------------------------
+; void x264_hpel_filter_h_ssse3( uint8_t *dst, uint8_t *src, int width );
+;-----------------------------------------------------------------------------
+cglobal x264_hpel_filter_h_ssse3, 3,3
+ add r0, r2
+ add r1, r2
+ neg r2
+ %define src r1+r2
+ pxor m0, m0
+ movh m1, [src-8]
+ punpcklbw m1, m0 ; 00 -1 00 -2 00 -3 00 -4 00 -5 00 -6 00 -7 00 -8
+ movh m2, [src]
+ punpcklbw m2, m0
+ mova m7, [pw_1 GLOBAL]
+.loop:
+ movh m3, [src+8]
+ punpcklbw m3, m0
+
+ mova m4, m2
+ palignr m2, m1, 14
+ mova m5, m3
+ palignr m3, m4, 4
+ paddw m3, m2
+
+ mova m2, m4
+ palignr m4, m1, 12
+ mova m1, m5
+ palignr m5, m2, 6
+ paddw m5, m4
+
+ mova m4, m1
+ palignr m1, m2, 2
+ paddw m1, m2
+
+ FILT_H m5, m3, m1
+
+ movh m1, [src+16]
+ punpcklbw m1, m0
+
+ mova m3, m4
+ palignr m4, m2, 14
+ mova m6, m1
+ palignr m1, m3, 4
+ paddw m1, m4
+
+ mova m4, m3
+ palignr m3, m2, 12
+ mova m2, m6
+ palignr m6, m4, 6
+ paddw m6, m3
+
+ mova m3, m2
+ palignr m2, m4, 2
+ paddw m2, m4
+
+ FILT_H m6, m1, m2
+ FILT_PACK m5, m6, 1
+ movdqa [r0+r2], m5
+
+ add r2, 16
+ mova m2, m3
+ mova m1, m4
+
+ jl .loop
+ REP_RET
+
+
%define PALIGNR PALIGNR_MMX
HPEL_V sse2
HPEL_C sse2
%define PALIGNR PALIGNR_SSSE3
HPEL_C ssse3
-cglobal x264_sfence
+%ifdef ARCH_X86_64
+
+%macro DO_FILT_V 5
+ LOAD_ADD_2 m1, m4, [r3 ], [r1+r2*2], m2, m5 ; a0 / a1
+ LOAD_ADD_2 m2, m5, [r3+r2 ], [r1+r2 ], m3, m6 ; b0 / b1
+ LOAD_ADD_2 m3, m6, [r3+r2*2], [r1 ], %3, %4 ; c0 / c1
+ FILT_V2
+ mova %1, m1
+ mova %2, m4
+ paddw m1, m15
+ paddw m4, m15
+ add r3, 16
+ add r1, 16
+ psraw m1, 5
+ psraw m4, 5
+ packuswb m1, m4
+ movntps [r11+r4+%5], m1
+%endmacro
+
+%macro DO_FILT_H 4
+ mova m1, %2
+ PALIGNR m1, %1, 12, m4
+ mova m2, %2
+ PALIGNR m2, %1, 14, m4
+ mova %1, %3
+ PALIGNR %3, %2, 6, m4
+ mova m3, %1
+ PALIGNR m3, %2, 4, m4
+ mova m4, %1
+ paddw %3, m1
+ PALIGNR m4, %2, 2, m1
+ paddw m3, m2
+ paddw m4, %2
+ FILT_H %3, m3, m4
+ paddw %3, m15
+ psraw %3, %4
+%endmacro
+
+%macro DO_FILT_CC 4
+ DO_FILT_H %1, %2, %3, 6
+ DO_FILT_H %2, %1, %4, 6
+ packuswb %3, %4
+ movntps [r5+r4], %3
+%endmacro
+
+%macro DO_FILT_HH 4
+ DO_FILT_H %1, %2, %3, 1
+ DO_FILT_H %2, %1, %4, 1
+ packuswb %3, %4
+ movntps [r0+r4], %3
+%endmacro
+
+%macro DO_FILT_H2 6
+ DO_FILT_H %1, %2, %3, 6
+ psrlw m15, 5
+ DO_FILT_H %4, %5, %6, 1
+ packuswb %6, %3
+%endmacro
+
+%macro HPEL 1
+;-----------------------------------------------------------------------------
+; void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
+; uint8_t *src, int stride, int width, int height)
+;-----------------------------------------------------------------------------
+cglobal x264_hpel_filter_%1, 7,7
+ mov r10, r3
+ sub r5, 16
+ mov r11, r1
+ and r10, 15
+ sub r3, r10
+ add r0, r5
+ add r11, r5
+ add r10, r5
+ add r5, r2
+ mov r2, r4
+ neg r10
+ lea r1, [r3+r2]
+ sub r3, r2
+ sub r3, r2
+ mov r4, r10
+ pxor m0, m0
+ pcmpeqw m15, m15
+ psrlw m15, 15 ; pw_1
+ psllw m15, 4
+;ALIGN 16
+.loopy:
+; first filter_v
+; prefetching does not help here! lots of variants tested, all slower
+ DO_FILT_V m8, m7, m13, m12, 0
+;ALIGN 16
+.loopx:
+ DO_FILT_V m6, m5, m11, m10, 16
+.lastx:
+ paddw m15, m15
+ DO_FILT_CC m9, m8, m7, m6
+ movdqa m7, m12 ; not really necessary, but seems free and
+ movdqa m6, m11 ; gives far shorter code
+ psrlw m15, 5
+ DO_FILT_HH m14, m13, m7, m6
+ psllw m15, 4 ; pw_16
+ movdqa m7, m5
+ movdqa m12, m10
+ add r4, 16
+ jl .loopx
+ cmp r4, 16
+ jl .lastx
+; setup regs for next y
+ sub r4, r10
+ sub r4, r2
+ sub r1, r4
+ sub r3, r4
+ add r0, r2
+ add r11, r2
+ add r5, r2
+ mov r4, r10
+ sub r6d, 1
+ jg .loopy
sfence
- ret
+ RET
+%endmacro
+%define PALIGNR PALIGNR_MMX
+HPEL sse2
+%define PALIGNR PALIGNR_SSSE3
+HPEL ssse3
+
+%endif
+cglobal x264_sfence
+ sfence
+ ret
;-----------------------------------------------------------------------------
; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
|
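The half-pel filter rework above (FILT_V2, FILT_H, FILT_PACK and the new unified x264_hpel_filter_{sse2,ssse3}) all implement the usual 6-tap (1,-5,20,20,-5,1) interpolation; the asm factors it as ((a-b)/4-b+c)/4+c per its own comment and splits the rounding/shift between stages. A scalar sketch of one horizontal pass, with names that are illustrative only:

    #include <stdint.h>

    /* 6-tap luma half-pel interpolation for one output sample; the final clamp
     * corresponds to the packuswb saturation in the asm. */
    static inline uint8_t hpel_tap6( const uint8_t *src )
    {
        int v = src[-2] - 5*src[-1] + 20*src[0] + 20*src[1] - 5*src[2] + src[3];
        v = ( v + 16 ) >> 5;
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }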
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/mc-c.c
^
|
@@ -27,29 +27,27 @@
#include <string.h>
#include "common/common.h"
+#include "mc.h"
-/* NASM functions */
-extern void x264_pixel_avg_16x16_sse2( uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_16x8_sse2( uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_16x16_mmxext( uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_16x8_mmxext( uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_8x16_mmxext( uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_8x8_mmxext( uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_8x4_mmxext( uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_4x8_mmxext( uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_4x4_mmxext( uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_4x2_mmxext( uint8_t *, int, uint8_t *, int );
+#define DECL_SUF( func, args )\
+ void func##_mmxext args;\
+ void func##_sse2 args;\
+ void func##_ssse3 args;
+
+DECL_SUF( x264_pixel_avg_16x16, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
+DECL_SUF( x264_pixel_avg_16x8, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
+DECL_SUF( x264_pixel_avg_8x16, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
+DECL_SUF( x264_pixel_avg_8x8, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
+DECL_SUF( x264_pixel_avg_8x4, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
+DECL_SUF( x264_pixel_avg_4x8, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
+DECL_SUF( x264_pixel_avg_4x4, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
+DECL_SUF( x264_pixel_avg_4x2, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int );
-extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );
-extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );
-extern void x264_pixel_avg_weight_w8_sse2( uint8_t *, int, uint8_t *, int, int, int );
-extern void x264_pixel_avg_weight_w16_sse2( uint8_t *, int, uint8_t *, int, int, int );
-extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int );
extern void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_prefetch_ref_mmxext( uint8_t *, int, int );
extern void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
@@ -86,23 +84,6 @@
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
-#define AVG_WEIGHT(W,H,name) \
-void x264_pixel_avg_weight_ ## W ## x ## H ## _##name( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
-{ \
- x264_pixel_avg_weight_w ## W ## _##name( dst, i_dst, src, i_src, i_weight_dst, H ); \
-}
-
-AVG_WEIGHT(16,16,mmxext)
-AVG_WEIGHT(16,8,mmxext)
-AVG_WEIGHT(8,16,mmxext)
-AVG_WEIGHT(8,8,mmxext)
-AVG_WEIGHT(8,4,mmxext)
-AVG_WEIGHT(16,16,sse2)
-AVG_WEIGHT(16,8,sse2)
-AVG_WEIGHT(8,16,sse2)
-AVG_WEIGHT(8,8,sse2)
-AVG_WEIGHT(8,4,sse2)
-
#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\
{\
@@ -143,7 +124,7 @@
static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
#define MC_LUMA(name,instr1,instr2)\
-void mc_luma_##name( uint8_t *dst, int i_dst_stride,\
+static void mc_luma_##name( uint8_t *dst, int i_dst_stride,\
uint8_t *src[4], int i_src_stride,\
int mvx, int mvy,\
int i_width, int i_height )\
@@ -174,7 +155,7 @@
MC_LUMA(cache64_sse2,cache64_sse2,sse2)
#define GET_REF(name)\
-uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
+static uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
uint8_t *src[4], int i_src_stride,\
int mvx, int mvy,\
int i_width, int i_height )\
@@ -210,7 +191,7 @@
void x264_hpel_filter_c_##cpuc( uint8_t *dst, int16_t *buf, int width );\
void x264_hpel_filter_h_##cpuh( uint8_t *dst, uint8_t *src, int width );\
void x264_sfence( void );\
-void x264_hpel_filter_##cpu( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,\
+static void x264_hpel_filter_##cpu( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,\
int stride, int width, int height )\
{\
int16_t *buf;\
@@ -237,14 +218,20 @@
HPEL(8, mmxext, mmxext, mmxext, mmxext)
HPEL(16, sse2_amd, mmxext, mmxext, sse2)
+#ifdef ARCH_X86_64
+void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height );
+void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height );
+#else
HPEL(16, sse2, sse2, sse2, sse2)
-HPEL(16, ssse3, sse2, ssse3, sse2)
+HPEL(16, ssse3, sse2, ssse3, ssse3)
+#endif
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
if( !(cpu&X264_CPU_MMX) )
return;
+ pf->copy_16x16_unaligned = x264_mc_copy_w16_mmx;
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
pf->copy[PIXEL_8x8] = x264_mc_copy_w8_mmx;
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx;
@@ -267,14 +254,6 @@
pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_mmxext;
pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_mmxext;
- pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_mmxext;
- pf->avg_weight[PIXEL_16x8] = x264_pixel_avg_weight_16x8_mmxext;
- pf->avg_weight[PIXEL_8x16] = x264_pixel_avg_weight_8x16_mmxext;
- pf->avg_weight[PIXEL_8x8] = x264_pixel_avg_weight_8x8_mmxext;
- pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_mmxext;
- pf->avg_weight[PIXEL_4x4] = x264_pixel_avg_weight_4x4_mmxext;
- // avg_weight_4x8 is rare and 4x2 is not used
-
pf->plane_copy = x264_plane_copy_mmxext;
pf->hpel_filter = x264_hpel_filter_mmxext;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmxext;
@@ -310,14 +289,9 @@
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
- if( !(cpu&X264_CPU_STACK_MOD4) )
- {
- pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_sse2;
- pf->avg_weight[PIXEL_16x8] = x264_pixel_avg_weight_16x8_sse2;
- pf->avg_weight[PIXEL_8x16] = x264_pixel_avg_weight_8x16_sse2;
- pf->avg_weight[PIXEL_8x8] = x264_pixel_avg_weight_8x8_sse2;
- pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_sse2;
- }
+ pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2;
+ pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2;
+ pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2;
pf->hpel_filter = x264_hpel_filter_sse2;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
pf->mc_chroma = x264_mc_chroma_sse2;
@@ -336,6 +310,15 @@
if( !(cpu&X264_CPU_SSSE3) )
return;
+ pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_ssse3;
+ pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_ssse3;
+ pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_ssse3;
+ pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_ssse3;
+ pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_ssse3;
+ pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_ssse3;
+ pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3;
+ pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3;
+
pf->hpel_filter = x264_hpel_filter_ssse3;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
pf->mc_chroma = x264_mc_chroma_ssse3;
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/mc.h
^
|
@@ -26,7 +26,4 @@
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf );
-void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
- uint8_t *dst, int i_dst_stride,
- int dx, int dy, int i_width, int i_height );
#endif
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/pixel-32.asm
^
|
@@ -25,21 +25,9 @@
%include "x86util.asm"
SECTION .text
+INIT_MMX
-%macro SBUTTERFLY 5
- mov%1 %5, %3
- punpckl%2 %3, %4
- punpckh%2 %5, %4
-%endmacro
-
-%macro TRANSPOSE4x4W 5 ; abcd-t -> adtc
- SBUTTERFLY q, wd, %1, %2, %5
- SBUTTERFLY q, wd, %3, %4, %2
- SBUTTERFLY q, dq, %1, %3, %4
- SBUTTERFLY q, dq, %5, %2, %3
-%endmacro
-
-%macro LOAD_DIFF_4P 4 ; mmp, mmt, dx, dy
+%macro LOAD_DIFF_4P 4 ; mp, mt, dx, dy
movd %1, [eax+ebx*%4+%3]
movd %2, [ecx+edx*%4+%3]
punpcklbw %1, %2
@@ -48,40 +36,40 @@
%endmacro
%macro LOAD_DIFF_4x8P 1 ; dx
- LOAD_DIFF_4P mm0, mm7, %1, 0
- LOAD_DIFF_4P mm1, mm7, %1, 1
+ LOAD_DIFF_4P m0, m7, %1, 0
+ LOAD_DIFF_4P m1, m7, %1, 1
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
- LOAD_DIFF_4P mm2, mm7, %1, 0
- LOAD_DIFF_4P mm3, mm7, %1, 1
+ LOAD_DIFF_4P m2, m7, %1, 0
+ LOAD_DIFF_4P m3, m7, %1, 1
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
- LOAD_DIFF_4P mm4, mm7, %1, 0
- LOAD_DIFF_4P mm5, mm7, %1, 1
+ LOAD_DIFF_4P m4, m7, %1, 0
+ LOAD_DIFF_4P m5, m7, %1, 1
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
- LOAD_DIFF_4P mm6, mm7, %1, 0
- movq [spill], mm6
- LOAD_DIFF_4P mm7, mm6, %1, 1
- movq mm6, [spill]
+ LOAD_DIFF_4P m6, m7, %1, 0
+ movq [spill], m6
+ LOAD_DIFF_4P m7, m6, %1, 1
+ movq m6, [spill]
%endmacro
%macro SUM4x8_MM 0
- movq [spill], mm6
- movq [spill+8], mm7
- ABS2 mm0, mm1, mm6, mm7
- ABS2 mm2, mm3, mm6, mm7
- paddw mm0, mm2
- paddw mm1, mm3
- movq mm6, [spill]
- movq mm7, [spill+8]
- ABS2 mm4, mm5, mm2, mm3
- ABS2 mm6, mm7, mm2, mm3
- paddw mm4, mm6
- paddw mm5, mm7
- paddw mm0, mm4
- paddw mm1, mm5
- paddw mm0, mm1
+ movq [spill], m6
+ movq [spill+8], m7
+ ABS2 m0, m1, m6, m7
+ ABS2 m2, m3, m6, m7
+ paddw m0, m2
+ paddw m1, m3
+ movq m6, [spill]
+ movq m7, [spill+8]
+ ABS2 m4, m5, m2, m3
+ ABS2 m6, m7, m2, m3
+ paddw m4, m6
+ paddw m5, m7
+ paddw m0, m4
+ paddw m1, m5
+ paddw m0, m1
%endmacro
;-----------------------------------------------------------------------------
@@ -98,67 +86,67 @@
%define spill esp+0x60 ; +16
%define trans esp+0 ; +96
LOAD_DIFF_4x8P 0
- HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
- movq [spill], mm0
- TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
- movq [trans+0x00], mm4
- movq [trans+0x08], mm7
- movq [trans+0x10], mm0
- movq [trans+0x18], mm6
- movq mm0, [spill]
- TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
- movq [trans+0x20], mm0
- movq [trans+0x28], mm3
- movq [trans+0x30], mm4
- movq [trans+0x38], mm2
+ movq [spill], m0
+ TRANSPOSE4x4W 4, 5, 6, 7, 0
+ movq [trans+0x00], m4
+ movq [trans+0x08], m5
+ movq [trans+0x10], m6
+ movq [trans+0x18], m7
+ movq m0, [spill]
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ movq [trans+0x20], m0
+ movq [trans+0x28], m1
+ movq [trans+0x30], m2
+ movq [trans+0x38], m3
mov eax, [args+4]
mov ecx, [args+12]
LOAD_DIFF_4x8P 4
- HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
- movq [spill], mm7
- TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm7
- movq [trans+0x40], mm0
- movq [trans+0x48], mm3
- movq [trans+0x50], mm7
- movq [trans+0x58], mm2
- movq mm7, [spill]
- TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
- movq mm5, [trans+0x00]
- movq mm1, [trans+0x08]
- movq mm2, [trans+0x10]
- movq mm3, [trans+0x18]
+ movq [spill], m7
+ TRANSPOSE4x4W 0, 1, 2, 3, 7
+ movq [trans+0x40], m0
+ movq [trans+0x48], m1
+ movq [trans+0x50], m2
+ movq [trans+0x58], m3
+ movq m7, [spill]
+ TRANSPOSE4x4W 4, 5, 6, 7, 0
+ movq m0, [trans+0x00]
+ movq m1, [trans+0x08]
+ movq m2, [trans+0x10]
+ movq m3, [trans+0x18]
- HADAMARD8_1D mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
SUM4x8_MM
- movq [trans], mm0
+ movq [trans], m0
+
+ movq m0, [trans+0x20]
+ movq m1, [trans+0x28]
+ movq m2, [trans+0x30]
+ movq m3, [trans+0x38]
+ movq m4, [trans+0x40]
+ movq m5, [trans+0x48]
+ movq m6, [trans+0x50]
+ movq m7, [trans+0x58]
- movq mm0, [trans+0x20]
- movq mm1, [trans+0x28]
- movq mm2, [trans+0x30]
- movq mm3, [trans+0x38]
- movq mm4, [trans+0x40]
- movq mm5, [trans+0x48]
- movq mm6, [trans+0x50]
- movq mm7, [trans+0x58]
-
- HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
SUM4x8_MM
- pavgw mm0, [esp]
- pshufw mm1, mm0, 01001110b
- paddw mm0, mm1
- pshufw mm1, mm0, 10110001b
- paddw mm0, mm1
- movd eax, mm0
- and eax, 0xffff
- mov ecx, eax ; preserve rounding for 16x16
- add eax, 1
- shr eax, 1
- add esp, 0x70
- pop ebx
+ pavgw m0, [trans]
+ pshufw m1, m0, 01001110b
+ paddw m0, m1
+ pshufw m1, m0, 10110001b
+ paddw m0, m1
+ movd eax, m0
+ and eax, 0xffff
+ mov ecx, eax ; preserve rounding for 16x16
+ add eax, 1
+ shr eax, 1
+ add esp, 0x70
+ pop ebx
ret
%undef args
%undef spill
@@ -184,25 +172,25 @@
%endmacro
%macro LOAD_4x8P 1 ; dx
- pxor mm7, mm7
- movd mm6, [eax+%1+7*FENC_STRIDE]
- movd mm0, [eax+%1+0*FENC_STRIDE]
- movd mm1, [eax+%1+1*FENC_STRIDE]
- movd mm2, [eax+%1+2*FENC_STRIDE]
- movd mm3, [eax+%1+3*FENC_STRIDE]
- movd mm4, [eax+%1+4*FENC_STRIDE]
- movd mm5, [eax+%1+5*FENC_STRIDE]
- punpcklbw mm6, mm7
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- movq [spill], mm6
- punpcklbw mm2, mm7
- punpcklbw mm3, mm7
- movd mm6, [eax+%1+6*FENC_STRIDE]
- punpcklbw mm4, mm7
- punpcklbw mm5, mm7
- punpcklbw mm6, mm7
- movq mm7, [spill]
+ pxor m7, m7
+ movd m6, [eax+%1+7*FENC_STRIDE]
+ movd m0, [eax+%1+0*FENC_STRIDE]
+ movd m1, [eax+%1+1*FENC_STRIDE]
+ movd m2, [eax+%1+2*FENC_STRIDE]
+ movd m3, [eax+%1+3*FENC_STRIDE]
+ movd m4, [eax+%1+4*FENC_STRIDE]
+ movd m5, [eax+%1+5*FENC_STRIDE]
+ punpcklbw m6, m7
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ movq [spill], m6
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ movd m6, [eax+%1+6*FENC_STRIDE]
+ punpcklbw m4, m7
+ punpcklbw m5, m7
+ punpcklbw m6, m7
+ movq m7, [spill]
%endmacro
;-----------------------------------------------------------------------------
@@ -217,146 +205,146 @@
%define trans esp+0 ; +96
%define sum esp+0 ; +32
LOAD_4x8P 0
- HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
- movq [spill], mm0
- TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
- movq [trans+0x00], mm4
- movq [trans+0x08], mm7
- movq [trans+0x10], mm0
- movq [trans+0x18], mm6
- movq mm0, [spill]
- TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
- movq [trans+0x20], mm0
- movq [trans+0x28], mm3
- movq [trans+0x30], mm4
- movq [trans+0x38], mm2
+ movq [spill], m0
+ TRANSPOSE4x4W 4, 5, 6, 7, 0
+ movq [trans+0x00], m4
+ movq [trans+0x08], m5
+ movq [trans+0x10], m6
+ movq [trans+0x18], m7
+ movq m0, [spill]
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ movq [trans+0x20], m0
+ movq [trans+0x28], m1
+ movq [trans+0x30], m2
+ movq [trans+0x38], m3
LOAD_4x8P 4
- HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
- movq [spill], mm7
- TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm7
- movq [trans+0x40], mm0
- movq [trans+0x48], mm3
- movq [trans+0x50], mm7
- movq [trans+0x58], mm2
- movq mm7, [spill]
- TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
- movq mm5, [trans+0x00]
- movq mm1, [trans+0x08]
- movq mm2, [trans+0x10]
- movq mm3, [trans+0x18]
-
- HADAMARD8_1D mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6
-
- movq [spill+0], mm5
- movq [spill+8], mm7
- ABS2 mm0, mm1, mm5, mm7
- ABS2 mm2, mm3, mm5, mm7
- paddw mm0, mm2
- paddw mm1, mm3
- paddw mm0, mm1
- ABS2 mm4, mm6, mm2, mm3
- movq mm5, [spill+0]
- movq mm7, [spill+8]
- paddw mm0, mm4
- paddw mm0, mm6
- ABS1 mm7, mm1
- paddw mm0, mm7 ; 7x4 sum
- movq mm6, mm5
- movq mm7, [ecx+8] ; left bottom
- psllw mm7, 3
- psubw mm6, mm7
- ABS2 mm5, mm6, mm2, mm3
- paddw mm5, mm0
- paddw mm6, mm0
- movq [sum+0], mm5 ; dc
- movq [sum+8], mm6 ; left
-
- movq mm0, [trans+0x20]
- movq mm1, [trans+0x28]
- movq mm2, [trans+0x30]
- movq mm3, [trans+0x38]
- movq mm4, [trans+0x40]
- movq mm5, [trans+0x48]
- movq mm6, [trans+0x50]
- movq mm7, [trans+0x58]
-
- HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
-
- movd [sum+0x10], mm0
- movd [sum+0x12], mm1
- movd [sum+0x14], mm2
- movd [sum+0x16], mm3
- movd [sum+0x18], mm4
- movd [sum+0x1a], mm5
- movd [sum+0x1c], mm6
- movd [sum+0x1e], mm7
-
- movq [spill], mm0
- movq [spill+8], mm1
- ABS2 mm2, mm3, mm0, mm1
- ABS2 mm4, mm5, mm0, mm1
- paddw mm2, mm3
- paddw mm4, mm5
- paddw mm2, mm4
- movq mm0, [spill]
- movq mm1, [spill+8]
- ABS2 mm6, mm7, mm4, mm5
- ABS1 mm1, mm4
- paddw mm2, mm7
- paddw mm1, mm6
- paddw mm2, mm1 ; 7x4 sum
- movq mm1, mm0
+ movq [spill], m7
+ TRANSPOSE4x4W 0, 1, 2, 3, 7
+ movq [trans+0x40], m0
+ movq [trans+0x48], m1
+ movq [trans+0x50], m2
+ movq [trans+0x58], m3
+ movq m7, [spill]
+ TRANSPOSE4x4W 4, 5, 6, 7, 0
+ movq m0, [trans+0x00]
+ movq m1, [trans+0x08]
+ movq m2, [trans+0x10]
+ movq m3, [trans+0x18]
+
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+
+ movq [spill+0], m0
+ movq [spill+8], m1
+ ABS2 m2, m3, m0, m1
+ ABS2 m4, m5, m0, m1
+ paddw m2, m4
+ paddw m3, m5
+ ABS2 m6, m7, m4, m5
+ movq m0, [spill+0]
+ movq m1, [spill+8]
+ paddw m2, m6
+ paddw m3, m7
+ paddw m2, m3
+ ABS1 m1, m4
+ paddw m2, m1 ; 7x4 sum
+ movq m7, m0
+ movq m1, [ecx+8] ; left bottom
+ psllw m1, 3
+ psubw m7, m1
+ ABS2 m0, m7, m5, m3
+ paddw m0, m2
+ paddw m7, m2
+ movq [sum+0], m0 ; dc
+ movq [sum+8], m7 ; left
+
+ movq m0, [trans+0x20]
+ movq m1, [trans+0x28]
+ movq m2, [trans+0x30]
+ movq m3, [trans+0x38]
+ movq m4, [trans+0x40]
+ movq m5, [trans+0x48]
+ movq m6, [trans+0x50]
+ movq m7, [trans+0x58]
+
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+
+ movd [sum+0x10], m0
+ movd [sum+0x12], m1
+ movd [sum+0x14], m2
+ movd [sum+0x16], m3
+ movd [sum+0x18], m4
+ movd [sum+0x1a], m5
+ movd [sum+0x1c], m6
+ movd [sum+0x1e], m7
+
+ movq [spill], m0
+ movq [spill+8], m1
+ ABS2 m2, m3, m0, m1
+ ABS2 m4, m5, m0, m1
+ paddw m2, m4
+ paddw m3, m5
+ paddw m2, m3
+ movq m0, [spill]
+ movq m1, [spill+8]
+ ABS2 m6, m7, m4, m5
+ ABS1 m1, m3
+ paddw m2, m7
+ paddw m1, m6
+ paddw m2, m1 ; 7x4 sum
+ movq m1, m0
- movq mm7, [ecx+0]
- psllw mm7, 3 ; left top
+ movq m7, [ecx+0]
+ psllw m7, 3 ; left top
- movzx edx, word [ecx+0]
+ movzx edx, word [ecx+0]
add dx, [ecx+16]
- lea edx, [4*edx+32]
- and edx, -64
- movd mm6, edx ; dc
-
- psubw mm1, mm7
- psubw mm0, mm6
- ABS2 mm0, mm1, mm5, mm6
- movq mm3, [sum+0] ; dc
- paddw mm0, mm2
- paddw mm1, mm2
- movq mm2, mm0
- paddw mm0, mm3
- paddw mm1, [sum+8] ; h
- psrlq mm2, 16
- paddw mm2, mm3
-
- movq mm3, [ecx+16] ; top left
- movq mm4, [ecx+24] ; top right
- psllw mm3, 3
- psllw mm4, 3
- psubw mm3, [sum+16]
- psubw mm4, [sum+24]
- ABS2 mm3, mm4, mm5, mm6
- paddw mm2, mm3
- paddw mm2, mm4 ; v
-
- SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
- mov eax, [args+8]
- movd ecx, mm2
- movd edx, mm1
- add ecx, 2
- add edx, 2
- shr ecx, 2
- shr edx, 2
- mov [eax+0], ecx ; i8x8_v satd
- mov [eax+4], edx ; i8x8_h satd
- movd ecx, mm0
- add ecx, 2
- shr ecx, 2
- mov [eax+8], ecx ; i8x8_dc satd
+ lea edx, [4*edx+32]
+ and edx, -64
+ movd m6, edx ; dc
+
+ psubw m1, m7
+ psubw m0, m6
+ ABS2 m0, m1, m5, m6
+ movq m3, [sum+0] ; dc
+ paddw m0, m2
+ paddw m1, m2
+ movq m2, m0
+ paddw m0, m3
+ paddw m1, [sum+8] ; h
+ psrlq m2, 16
+ paddw m2, m3
+
+ movq m3, [ecx+16] ; top left
+ movq m4, [ecx+24] ; top right
+ psllw m3, 3
+ psllw m4, 3
+ psubw m3, [sum+16]
+ psubw m4, [sum+24]
+ ABS2 m3, m4, m5, m6
+ paddw m2, m3
+ paddw m2, m4 ; v
+
+ SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
+ mov eax, [args+8]
+ movd ecx, m2
+ movd edx, m1
+ add ecx, 2
+ add edx, 2
+ shr ecx, 2
+ shr edx, 2
+ mov [eax+0], ecx ; i8x8_v satd
+ mov [eax+4], edx ; i8x8_h satd
+ movd ecx, m0
+ add ecx, 2
+ shr ecx, 2
+ mov [eax+8], ecx ; i8x8_dc satd
- add esp, 0x70
+ add esp, 0x70
ret
%undef args
%undef spill
@@ -370,57 +358,57 @@
; const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssim_4x4x2_core_mmxext
- push ebx
- push edi
- mov ebx, [esp+16]
- mov edx, [esp+24]
- mov edi, 4
- pxor mm0, mm0
+ push ebx
+ push edi
+ mov ebx, [esp+16]
+ mov edx, [esp+24]
+ mov edi, 4
+ pxor m0, m0
.loop:
- mov eax, [esp+12]
- mov ecx, [esp+20]
- add eax, edi
- add ecx, edi
- pxor mm1, mm1
- pxor mm2, mm2
- pxor mm3, mm3
- pxor mm4, mm4
+ mov eax, [esp+12]
+ mov ecx, [esp+20]
+ add eax, edi
+ add ecx, edi
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+ pxor m4, m4
%rep 4
- movd mm5, [eax]
- movd mm6, [ecx]
- punpcklbw mm5, mm0
- punpcklbw mm6, mm0
- paddw mm1, mm5
- paddw mm2, mm6
- movq mm7, mm5
- pmaddwd mm5, mm5
- pmaddwd mm7, mm6
- pmaddwd mm6, mm6
- paddd mm3, mm5
- paddd mm4, mm7
- paddd mm3, mm6
- add eax, ebx
- add ecx, edx
+ movd m5, [eax]
+ movd m6, [ecx]
+ punpcklbw m5, m0
+ punpcklbw m6, m0
+ paddw m1, m5
+ paddw m2, m6
+ movq m7, m5
+ pmaddwd m5, m5
+ pmaddwd m7, m6
+ pmaddwd m6, m6
+ paddd m3, m5
+ paddd m4, m7
+ paddd m3, m6
+ add eax, ebx
+ add ecx, edx
%endrep
- mov eax, [esp+28]
- lea eax, [eax+edi*4]
- pshufw mm5, mm1, 0xE
- pshufw mm6, mm2, 0xE
- paddusw mm1, mm5
- paddusw mm2, mm6
- punpcklwd mm1, mm2
- pshufw mm2, mm1, 0xE
- pshufw mm5, mm3, 0xE
- pshufw mm6, mm4, 0xE
- paddusw mm1, mm2
- paddd mm3, mm5
- paddd mm4, mm6
- punpcklwd mm1, mm0
- punpckldq mm3, mm4
- movq [eax+0], mm1
- movq [eax+8], mm3
- sub edi, 4
- jge .loop
+ mov eax, [esp+28]
+ lea eax, [eax+edi*4]
+ pshufw m5, m1, 0xE
+ pshufw m6, m2, 0xE
+ paddusw m1, m5
+ paddusw m2, m6
+ punpcklwd m1, m2
+ pshufw m2, m1, 0xE
+ pshufw m5, m3, 0xE
+ pshufw m6, m4, 0xE
+ paddusw m1, m2
+ paddd m3, m5
+ paddd m4, m6
+ punpcklwd m1, m0
+ punpckldq m3, m4
+ movq [eax+0], m1
+ movq [eax+8], m3
+ sub edi, 4
+ jge .loop
pop edi
pop ebx
emms
Changed: x264-snapshot-20081001-2245.tar.bz2/common/x86/pixel-a.asm
@@ -31,6 +31,8 @@
ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
mask_ff: times 16 db 0xff
times 16 db 0
+mask_ac4: dw 0,-1,-1,-1, 0,-1,-1,-1
+mask_ac8: dw 0,-1,-1,-1,-1,-1,-1,-1
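+; (a note on the two masks above: they are used by the hadamard_ac routines below
+;  to zero the DC coefficient(s) so that only AC energy is summed)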
SECTION .text
@@ -162,6 +164,112 @@
SSD 8, 4, sse2
+;=============================================================================
+; variance
+;=============================================================================
+
+%macro VAR_START 0
+ pxor m5, m5 ; sum
+ pxor m6, m6 ; sum squared
+ pxor m7, m7 ; zero
+%ifdef ARCH_X86_64
+ %define t3d r3d
+%else
+ %define t3d r2d
+%endif
+%endmacro
+
+%macro VAR_END 1
+%if mmsize == 16
+ movhlps m0, m5
+ paddw m5, m0
+%endif
+ movifnidn r2d, r2m
+ movd r1d, m5
+ movd [r2], m5 ; return sum
+ imul r1d, r1d
+ HADDD m6, m1
+ shr r1d, %1
+ movd eax, m6
+ sub eax, r1d ; sqr - (sum * sum >> shift)
+ RET
+%endmacro
+
+%macro VAR_2ROW 2
+ mov t3d, %2
+.loop:
+ mova m0, [r0]
+ mova m1, m0
+ mova m3, [r0+%1]
+ mova m2, m0
+ punpcklbw m0, m7
+ mova m4, m3
+ punpckhbw m1, m7
+%ifidn %1, r1
+ lea r0, [r0+%1*2]
+%else
+ add r0, r1
+%endif
+ punpckhbw m4, m7
+ psadbw m2, m7
+ paddw m5, m2
+ mova m2, m3
+ punpcklbw m3, m7
+ dec t3d
+ psadbw m2, m7
+ pmaddwd m0, m0
+ paddw m5, m2
+ pmaddwd m1, m1
+ paddd m6, m0
+ pmaddwd m3, m3
+ paddd m6, m1
+ pmaddwd m4, m4
+ paddd m6, m3
+ paddd m6, m4
+ jg .loop
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int x264_pixel_var_wxh_mmxext( uint8_t *, int, int * )
+;-----------------------------------------------------------------------------
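+; rough C equivalent of the variance routines below (a sketch; parameter and
+; loop-bound names are illustrative, shift matches the VAR_END argument):
+; int x264_pixel_var_wxh( uint8_t *pix, int i_stride, uint32_t *sum_out )
+; {
+;     uint32_t sum = 0, sqr = 0;
+;     int x, y;
+;     for( y = 0; y < h; y++, pix += i_stride )
+;         for( x = 0; x < w; x++ )
+;         {
+;             sum += pix[x];
+;             sqr += pix[x] * pix[x];
+;         }
+;     *sum_out = sum;
+;     return sqr - (sum * sum >> shift); /* shift = 8 for 16x16, 6 for 8x8 */
+; }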
+INIT_MMX
+cglobal x264_pixel_var_16x16_mmxext, 2,3
+ VAR_START
+ VAR_2ROW 8, 16
+ VAR_END 8
+
+cglobal x264_pixel_var_8x8_mmxext, 2,3
+ VAR_START
+ VAR_2ROW r1, 4
+ VAR_END 6
+
+INIT_XMM
+cglobal x264_pixel_var_16x16_sse2, 2,3
+ VAR_START
+ VAR_2ROW r1, 8
+ VAR_END 8
+
+cglobal x264_pixel_var_8x8_sse2, 2,3
+ VAR_START
+ mov t3d, 4
+.loop:
+ movh m0, [r0]
+ movhps m0, [r0+r1]
+ lea r0, [r0+r1*2]
+ mova m1, m0
+ punpcklbw m0, m7
+ mova m2, m1
+ punpckhbw m1, m7
+ dec t3d
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ psadbw m2, m7
+ paddw m5, m2
+ paddd m6, m0
+ paddd m6, m1
+ jnz .loop
+ VAR_END 6
+
;=============================================================================
; SATD
@@ -173,16 +281,18 @@
; whereas phaddw-based transform doesn't care what order the coefs end up in.
%macro PHSUMSUB 3
- movdqa %3, %1
- phaddw %1, %2
- phsubw %3, %2
+ movdqa m%3, m%1
+ phaddw m%1, m%2
+ phsubw m%3, m%2
+ SWAP %2, %3
%endmacro
-%macro HADAMARD4_ROW_PHADD 5 ; abcd-t -> adtc
- PHSUMSUB %1, %2, %5
- PHSUMSUB %3, %4, %2
- PHSUMSUB %1, %3, %4
- PHSUMSUB %5, %2, %3
+%macro HADAMARD4_ROW_PHADD 5
+ PHSUMSUB %1, %2, %5
+ PHSUMSUB %3, %4, %5
+ PHSUMSUB %1, %3, %5
+ PHSUMSUB %2, %4, %5
+ SWAP %3, %4
%endmacro
%macro HADAMARD4_1D 4
@@ -190,102 +300,29 @@
SUMSUB_BADC %1, %3, %2, %4
%endmacro
-%macro SBUTTERFLY 5
- mov%1 %5, %3
- punpckl%2 %3, %4
- punpckh%2 %5, %4
-%endmacro
-
-%macro SBUTTERFLY2 5 ; not really needed, but allows transpose4x4x2 to not shuffle registers
- mov%1 %5, %3
- punpckh%2 %3, %4
- punpckl%2 %5, %4
-%endmacro
-
-%macro TRANSPOSE4x4W 5 ; abcd-t -> adtc
- SBUTTERFLY q, wd, %1, %2, %5
- SBUTTERFLY q, wd, %3, %4, %2
- SBUTTERFLY q, dq, %1, %3, %4
- SBUTTERFLY q, dq, %5, %2, %3
-%endmacro
-
-%macro TRANSPOSE4x4D 5 ; abcd-t -> adtc
- SBUTTERFLY dqa, dq, %1, %2, %5
- SBUTTERFLY dqa, dq, %3, %4, %2
- SBUTTERFLY dqa, qdq, %1, %3, %4
- SBUTTERFLY dqa, qdq, %5, %2, %3
-%endmacro
-
-%macro TRANSPOSE2x4x4W 5 ; abcd-t -> abcd
- SBUTTERFLY dqa, wd, %1, %2, %5
- SBUTTERFLY dqa, wd, %3, %4, %2
- SBUTTERFLY dqa, dq, %1, %3, %4
- SBUTTERFLY2 dqa, dq, %5, %2, %3
- SBUTTERFLY dqa, qdq, %1, %3, %2
- SBUTTERFLY2 dqa, qdq, %4, %5, %3
-%endmacro
-
-%ifdef ARCH_X86_64
-%macro TRANSPOSE8x8W 9 ; abcdefgh-t -> afhdtecb
- SBUTTERFLY dqa, wd, %1, %2, %9
- SBUTTERFLY dqa, wd, %3, %4, %2
- SBUTTERFLY dqa, wd, %5, %6, %4
- SBUTTERFLY dqa, wd, %7, %8, %6
- SBUTTERFLY dqa, dq, %1, %3, %8
- SBUTTERFLY dqa, dq, %9, %2, %3
- SBUTTERFLY dqa, dq, %5, %7, %2
- SBUTTERFLY dqa, dq, %4, %6, %7
- SBUTTERFLY dqa, qdq, %1, %5, %6
- SBUTTERFLY dqa, qdq, %9, %4, %5
- SBUTTERFLY dqa, qdq, %8, %2, %4
- SBUTTERFLY dqa, qdq, %3, %7, %2
-%endmacro
-%else
-%macro TRANSPOSE8x8W 9 ; abcdefgh -> afhdgecb
- movdqa [%9], %8
- SBUTTERFLY dqa, wd, %1, %2, %8
- movdqa [%9+16], %8
- movdqa %8, [%9]
- SBUTTERFLY dqa, wd, %3, %4, %2
- SBUTTERFLY dqa, wd, %5, %6, %4
- SBUTTERFLY dqa, wd, %7, %8, %6
- SBUTTERFLY dqa, dq, %1, %3, %8
- movdqa [%9], %8
- movdqa %8, [16+%9]
- SBUTTERFLY dqa, dq, %8, %2, %3
- SBUTTERFLY dqa, dq, %5, %7, %2
- SBUTTERFLY dqa, dq, %4, %6, %7
- SBUTTERFLY dqa, qdq, %1, %5, %6
- SBUTTERFLY dqa, qdq, %8, %4, %5
- movdqa [%9+16], %8
- movdqa %8, [%9]
- SBUTTERFLY dqa, qdq, %8, %2, %4
- SBUTTERFLY dqa, qdq, %3, %7, %2
- movdqa %7, [%9+16]
-%endmacro
-%endif
-
%macro HADAMARD4x4_SUM 1 ; %1 = dest (row sum of one block)
- HADAMARD4_1D mm4, mm5, mm6, mm7
- TRANSPOSE4x4W mm4, mm5, mm6, mm7, %1
- HADAMARD4_1D mm4, mm7, %1, mm6
- ABS2 mm4, mm7, mm3, mm5
- ABS2 %1, mm6, mm3, mm5
- paddw %1, mm4
- paddw mm6, mm7
- pavgw %1, mm6
+ %xdefine %%n n%1
+ HADAMARD4_1D m4, m5, m6, m7
+ TRANSPOSE4x4W 4, 5, 6, 7, %%n
+ HADAMARD4_1D m4, m5, m6, m7
+ ABS2 m4, m5, m3, m %+ %%n
+ ABS2 m6, m7, m3, m %+ %%n
+ paddw m6, m4
+ paddw m7, m5
+ pavgw m6, m7
+ SWAP %%n, 6
%endmacro
; in: r4=3*stride1, r5=3*stride2
; in: %2 = horizontal offset
; in: %3 = whether we need to increment pix1 and pix2
-; clobber: mm3..mm7
+; clobber: m3..m7
; out: %1 = satd
%macro SATD_4x4_MMX 3
- LOAD_DIFF mm4, mm3, none, [r0+%2], [r2+%2]
- LOAD_DIFF mm5, mm3, none, [r0+r1+%2], [r2+r3+%2]
- LOAD_DIFF mm6, mm3, none, [r0+2*r1+%2], [r2+2*r3+%2]
- LOAD_DIFF mm7, mm3, none, [r0+r4+%2], [r2+r5+%2]
+ LOAD_DIFF m4, m3, none, [r0+%2], [r2+%2]
+ LOAD_DIFF m5, m3, none, [r0+r1+%2], [r2+r3+%2]
+ LOAD_DIFF m6, m3, none, [r0+2*r1+%2], [r2+2*r3+%2]
+ LOAD_DIFF m7, m3, none, [r0+r4+%2], [r2+r5+%2]
%if %3
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
@@ -294,66 +331,66 @@
%endmacro
%macro SATD_8x4_START 1
- SATD_4x4_MMX mm0, 0, 0
- SATD_4x4_MMX mm1, 4, %1
+ SATD_4x4_MMX m0, 0, 0
+ SATD_4x4_MMX m1, 4, %1
%endmacro
%macro SATD_8x4_INC 1
- SATD_4x4_MMX mm2, 0, 0
- paddw mm0, mm1
- SATD_4x4_MMX mm1, 4, %1
- paddw mm0, mm2
+ SATD_4x4_MMX m2, 0, 0
+ paddw m0, m1
+ SATD_4x4_MMX m1, 4, %1
+ paddw m0, m2
%endmacro
%macro SATD_16x4_START 1
- SATD_4x4_MMX mm0, 0, 0
- SATD_4x4_MMX mm1, 4, 0
- SATD_4x4_MMX mm2, 8, 0
- paddw mm0, mm1
- SATD_4x4_MMX mm1, 12, %1
- paddw mm0, mm2
+ SATD_4x4_MMX m0, 0, 0
+ SATD_4x4_MMX m1, 4, 0
+ SATD_4x4_MMX m2, 8, 0
+ paddw m0, m1
+ SATD_4x4_MMX m1, 12, %1
+ paddw m0, m2
%endmacro
%macro SATD_16x4_INC 1
- SATD_4x4_MMX mm2, 0, 0
- paddw mm0, mm1
- SATD_4x4_MMX mm1, 4, 0
- paddw mm0, mm2
- SATD_4x4_MMX mm2, 8, 0
- paddw mm0, mm1
- SATD_4x4_MMX mm1, 12, %1
- paddw mm0, mm2
+ SATD_4x4_MMX m2, 0, 0
+ paddw m0, m1
+ SATD_4x4_MMX m1, 4, 0
+ paddw m0, m2
+ SATD_4x4_MMX m2, 8, 0
+ paddw m0, m1
+ SATD_4x4_MMX m1, 12, %1
+ paddw m0, m2
%endmacro
%macro SATD_8x4_SSE2 1
- LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5
%if %1
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%endif
- HADAMARD4_1D xmm0, xmm1, xmm2, xmm3
- TRANSPOSE2x4x4W xmm0, xmm1, xmm2, xmm3, xmm4
- HADAMARD4_1D xmm0, xmm1, xmm2, xmm3
- ABS4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- paddusw xmm0, xmm1
- paddusw xmm2, xmm3
- paddusw xmm6, xmm0
- paddusw xmm6, xmm2
+ HADAMARD4_1D m0, m1, m2, m3
+ TRANSPOSE2x4x4W 0, 1, 2, 3, 4
+ HADAMARD4_1D m0, m1, m2, m3
+ ABS4 m0, m1, m2, m3, m4, m5
+ paddusw m0, m1
+ paddusw m2, m3
+ paddusw m6, m0
+ paddusw m6, m2
%endmacro
%macro SATD_8x4_PHADD 1
- LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5
%if %1
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%endif
- HADAMARD4_1D xmm0, xmm1, xmm2, xmm3
- HADAMARD4_ROW_PHADD xmm0, xmm1, xmm2, xmm3, xmm4
- ABS4 xmm0, xmm3, xmm4, xmm2, xmm1, xmm5
- paddusw xmm0, xmm3
- paddusw xmm2, xmm4
- paddusw xmm6, xmm0
- paddusw xmm6, xmm2
+ HADAMARD4_1D m0, m1, m2, m3
+ HADAMARD4_ROW_PHADD 0, 1, 2, 3, 4
+ ABS4 m0, m1, m2, m3, m4, m5
+ paddusw m0, m1
+ paddusw m2, m3
+ paddusw m6, m0
+ paddusw m6, m2
%endmacro
%macro SATD_START_MMX 0
@@ -362,12 +399,12 @@
%endmacro
%macro SATD_END_MMX 0
- pshufw mm1, mm0, 01001110b
- paddw mm0, mm1
- pshufw mm1, mm0, 10110001b
- paddw mm0, mm1
- movd eax, mm0
- and eax, 0xffff
+ pshufw m1, m0, 01001110b
+ paddw m0, m1
+ pshufw m1, m0, 10110001b
+ paddw m0, m1
+ movd eax, m0
+ and eax, 0xffff
RET
%endmacro
@@ -377,27 +414,28 @@
;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
+INIT_MMX
cglobal x264_pixel_satd_16x16_mmxext, 4,6
SATD_START_MMX
SATD_16x4_START 1
SATD_16x4_INC 1
SATD_16x4_INC 1
SATD_16x4_INC 0
- paddw mm0, mm1
- pxor mm3, mm3
- pshufw mm1, mm0, 01001110b
- paddw mm0, mm1
- punpcklwd mm0, mm3
- pshufw mm1, mm0, 01001110b
- paddd mm0, mm1
- movd eax, mm0
+ paddw m0, m1
+ pxor m3, m3
+ pshufw m1, m0, 01001110b
+ paddw m0, m1
+ punpcklwd m0, m3
+ pshufw m1, m0, 01001110b
+ paddd m0, m1
+ movd eax, m0
RET
cglobal x264_pixel_satd_16x8_mmxext, 4,6
SATD_START_MMX
SATD_16x4_START 1
SATD_16x4_INC 0
- paddw mm0, mm1
+ paddw m0, m1
SATD_END_MMX
cglobal x264_pixel_satd_8x16_mmxext, 4,6
@@ -406,56 +444,56 @@
SATD_8x4_INC 1
SATD_8x4_INC 1
SATD_8x4_INC 0
- paddw mm0, mm1
+ paddw m0, m1
SATD_END_MMX
cglobal x264_pixel_satd_8x8_mmxext, 4,6
SATD_START_MMX
SATD_8x4_START 1
SATD_8x4_INC 0
- paddw mm0, mm1
+ paddw m0, m1
SATD_END_MMX
cglobal x264_pixel_satd_8x4_mmxext, 4,6
SATD_START_MMX
SATD_8x4_START 0
- paddw mm0, mm1
+ paddw m0, m1
SATD_END_MMX
%macro SATD_W4 1
+INIT_MMX
cglobal x264_pixel_satd_4x8_%1, 4,6
SATD_START_MMX
- SATD_4x4_MMX mm0, 0, 1
- SATD_4x4_MMX mm1, 0, 0
- paddw mm0, mm1
+ SATD_4x4_MMX m0, 0, 1
+ SATD_4x4_MMX m1, 0, 0
+ paddw m0, m1
SATD_END_MMX
cglobal x264_pixel_satd_4x4_%1, 4,6
SATD_START_MMX
- SATD_4x4_MMX mm0, 0, 0
+ SATD_4x4_MMX m0, 0, 0
SATD_END_MMX
%endmacro
SATD_W4 mmxext
%macro SATD_START_SSE2 0
- pxor xmm6, xmm6
- lea r4, [3*r1]
- lea r5, [3*r3]
+ pxor m6, m6
+ lea r4, [3*r1]
+ lea r5, [3*r3]
%endmacro
%macro SATD_END_SSE2 0
- picgetgot ebx
- psrlw xmm6, 1
- HADDW xmm6, xmm7
- movd eax, xmm6
+ psrlw m6, 1
+ HADDW m6, m7
+ movd eax, m6
RET
%endmacro
%macro BACKUP_POINTERS 0
%ifdef ARCH_X86_64
- mov r10, r0
- mov r11, r2
+ mov r10, r0
+ mov r11, r2
%endif
%endmacro
@@ -475,6 +513,7 @@
; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SATDS_SSE2 1
+INIT_XMM
cglobal x264_pixel_satd_16x16_%1, 4,6
SATD_START_SSE2
BACKUP_POINTERS
@@ -526,26 +565,26 @@
lea r4, [3*r1]
lea r5, [3*r3]
.skip_lea:
- LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm8, xmm9
+ LOAD_DIFF_8x4P m0, m1, m2, m3, m8, m9
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
- LOAD_DIFF_8x4P xmm4, xmm5, xmm6, xmm7, xmm8, xmm9
+ LOAD_DIFF_8x4P m4, m5, m6, m7, m8, m9
- HADAMARD8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
- TRANSPOSE8x8W xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
- HADAMARD8_1D xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1
-
- ABS4 xmm0, xmm1, xmm2, xmm3, xmm6, xmm9
- ABS4 xmm4, xmm5, xmm7, xmm8, xmm6, xmm9
- paddusw xmm0, xmm1
- paddusw xmm2, xmm3
- paddusw xmm4, xmm5
- paddusw xmm7, xmm8
- paddusw xmm0, xmm2
- paddusw xmm4, xmm7
- pavgw xmm0, xmm4
- HADDW xmm0, xmm1
- movd eax, xmm0
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+
+ ABS4 m0, m1, m2, m3, m8, m9
+ ABS4 m4, m5, m6, m7, m8, m9
+ paddusw m0, m1
+ paddusw m2, m3
+ paddusw m4, m5
+ paddusw m6, m7
+ paddusw m0, m2
+ paddusw m4, m6
+ pavgw m0, m4
+ HADDW m0, m1
+ movd eax, m0
add r10d, eax ; preserve rounding for 16x16
add eax, 1
shr eax, 1
@@ -576,39 +615,38 @@
sub esp, 32
lea r4, [3*r1]
lea r5, [3*r3]
- LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm6, xmm7
- movdqa [esp], xmm2
+ LOAD_DIFF_8x4P m0, m1, m2, m3, m6, m7
+ movdqa [esp], m2
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
- LOAD_DIFF_8x4P xmm4, xmm5, xmm6, xmm7, xmm2, xmm2
- movdqa xmm2, [esp]
+ LOAD_DIFF_8x4P m4, m5, m6, m7, m2, m2
+ movdqa m2, [esp]
- HADAMARD8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
- TRANSPOSE8x8W xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, esp
- HADAMARD8_1D xmm0, xmm5, xmm7, xmm3, xmm6, xmm4, xmm2, xmm1
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [esp], [esp+16]
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
%ifidn %1, sse2
- movdqa [esp], xmm6
- movdqa [esp+16], xmm7
+ movdqa [esp], m4
+ movdqa [esp+16], m2
%endif
- ABS2 xmm2, xmm3, xmm6, xmm7
- ABS2 xmm0, xmm1, xmm6, xmm7
- paddusw xmm0, xmm2
- paddusw xmm1, xmm3
+ ABS2 m6, m3, m4, m2
+ ABS2 m0, m7, m4, m2
+ paddusw m0, m6
+ paddusw m7, m3
%ifidn %1, sse2
- movdqa xmm6, [esp]
- movdqa xmm7, [esp+16]
+ movdqa m4, [esp]
+ movdqa m2, [esp+16]
%endif
- ABS2 xmm4, xmm5, xmm2, xmm3
- ABS2 xmm6, xmm7, xmm2, xmm3
- paddusw xmm4, xmm5
- paddusw xmm6, xmm7
- paddusw xmm0, xmm1
- paddusw xmm4, xmm6
- pavgw xmm0, xmm4
- picgetgot ebx
- HADDW xmm0, xmm1
- movd eax, xmm0
+ ABS2 m5, m1, m6, m3
+ ABS2 m4, m2, m6, m3
+ paddusw m5, m1
+ paddusw m4, m2
+ paddusw m0, m7
+ paddusw m5, m4
+ pavgw m0, m5
+ HADDW m0, m7
+ movd eax, m0
mov ecx, eax ; preserve rounding for 16x16
add eax, 1
shr eax, 1
@@ -658,31 +696,32 @@
%macro INTRA_SA8D_SSE2 1
%ifdef ARCH_X86_64
+INIT_XMM
;-----------------------------------------------------------------------------
; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_sa8d_x3_8x8_core_%1
; 8x8 hadamard
- pxor xmm4, xmm4
- movq xmm0, [r0+0*FENC_STRIDE]
- movq xmm7, [r0+1*FENC_STRIDE]
- movq xmm6, [r0+2*FENC_STRIDE]
- movq xmm3, [r0+3*FENC_STRIDE]
- movq xmm5, [r0+4*FENC_STRIDE]
- movq xmm1, [r0+5*FENC_STRIDE]
- movq xmm8, [r0+6*FENC_STRIDE]
- movq xmm2, [r0+7*FENC_STRIDE]
- punpcklbw xmm0, xmm4
- punpcklbw xmm7, xmm4
- punpcklbw xmm6, xmm4
- punpcklbw xmm3, xmm4
- punpcklbw xmm5, xmm4
- punpcklbw xmm1, xmm4
- punpcklbw xmm8, xmm4
- punpcklbw xmm2, xmm4
- HADAMARD8_1D xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2
- TRANSPOSE8x8W xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2, xmm4
- HADAMARD8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+ pxor m8, m8
+ movq m0, [r0+0*FENC_STRIDE]
+ movq m1, [r0+1*FENC_STRIDE]
+ movq m2, [r0+2*FENC_STRIDE]
+ movq m3, [r0+3*FENC_STRIDE]
+ movq m4, [r0+4*FENC_STRIDE]
+ movq m5, [r0+5*FENC_STRIDE]
+ movq m6, [r0+6*FENC_STRIDE]
+ movq m7, [r0+7*FENC_STRIDE]
+ punpcklbw m0, m8
+ punpcklbw m1, m8
+ punpcklbw m2, m8
+ punpcklbw m3, m8
+ punpcklbw m4, m8
+ punpcklbw m5, m8
+ punpcklbw m6, m8
+ punpcklbw m7, m8
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
; dc
movzx edi, word [r1+0]
@@ -691,95 +730,97 @@
and edi, -16
shl edi, 2
- pxor xmm15, xmm15
- movdqa xmm8, xmm2
- movdqa xmm9, xmm3
- movdqa xmm10, xmm4
- movdqa xmm11, xmm5
- ABS4 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13
- paddusw xmm8, xmm10
- paddusw xmm9, xmm11
+ pxor m15, m15
+ movdqa m8, m2
+ movdqa m9, m3
+ movdqa m10, m4
+ movdqa m11, m5
+ ABS4 m8, m9, m10, m11, m12, m13
+ paddusw m8, m10
+ paddusw m9, m11
%ifidn %1, ssse3
- pabsw xmm10, xmm6
- pabsw xmm11, xmm7
- pabsw xmm15, xmm1
+ pabsw m10, m6
+ pabsw m11, m7
+ pabsw m15, m1
%else
- movdqa xmm10, xmm6
- movdqa xmm11, xmm7
- movdqa xmm15, xmm1
- ABS2 xmm10, xmm11, xmm13, xmm14
- ABS1 xmm15, xmm13
-%endif
- paddusw xmm10, xmm11
- paddusw xmm8, xmm9
- paddusw xmm15, xmm10
- paddusw xmm15, xmm8
- movdqa xmm14, xmm15 ; 7x8 sum
-
- movdqa xmm8, [r1+0] ; left edge
- movd xmm9, edi
- psllw xmm8, 3
- psubw xmm8, xmm0
- psubw xmm9, xmm0
- ABS1 xmm8, xmm10
- ABS1 xmm9, xmm11 ; 1x8 sum
- paddusw xmm14, xmm8
- paddusw xmm15, xmm9
- punpcklwd xmm0, xmm1
- punpcklwd xmm2, xmm3
- punpcklwd xmm4, xmm5
- punpcklwd xmm6, xmm7
- punpckldq xmm0, xmm2
- punpckldq xmm4, xmm6
- punpcklqdq xmm0, xmm4 ; transpose
- movdqa xmm1, [r1+16] ; top edge
- movdqa xmm2, xmm15
- psllw xmm1, 3
- psrldq xmm2, 2 ; 8x7 sum
- psubw xmm0, xmm1 ; 8x1 sum
- ABS1 xmm0, xmm1
- paddusw xmm2, xmm0
+ movdqa m10, m6
+ movdqa m11, m7
+ movdqa m15, m1
+ ABS2 m10, m11, m13, m14
+ ABS1 m15, m13
+%endif
+ paddusw m10, m11
+ paddusw m8, m9
+ paddusw m15, m10
+ paddusw m15, m8
+ movdqa m14, m15 ; 7x8 sum
+
+ movdqa m8, [r1+0] ; left edge
+ movd m9, edi
+ psllw m8, 3
+ psubw m8, m0
+ psubw m9, m0
+ ABS1 m8, m10
+ ABS1 m9, m11 ; 1x8 sum
+ paddusw m14, m8
+ paddusw m15, m9
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ punpcklwd m4, m5
+ punpcklwd m6, m7
+ punpckldq m0, m2
+ punpckldq m4, m6
+ punpcklqdq m0, m4 ; transpose
+ movdqa m1, [r1+16] ; top edge
+ movdqa m2, m15
+ psllw m1, 3
+ psrldq m2, 2 ; 8x7 sum
+ psubw m0, m1 ; 8x1 sum
+ ABS1 m0, m1
+ paddusw m2, m0
; 3x HADDW
- movdqa xmm7, [pw_1 GLOBAL]
- pmaddwd xmm2, xmm7
- pmaddwd xmm14, xmm7
- pmaddwd xmm15, xmm7
- movdqa xmm3, xmm2
- punpckldq xmm2, xmm14
- punpckhdq xmm3, xmm14
- pshufd xmm5, xmm15, 0xf5
- paddd xmm2, xmm3
- paddd xmm5, xmm15
- movdqa xmm3, xmm2
- punpcklqdq xmm2, xmm5
- punpckhqdq xmm3, xmm5
- pavgw xmm3, xmm2
- pxor xmm0, xmm0
- pavgw xmm3, xmm0
- movq [r2], xmm3 ; i8x8_v, i8x8_h
- psrldq xmm3, 8
- movd [r2+8], xmm3 ; i8x8_dc
+ movdqa m7, [pw_1 GLOBAL]
+ pmaddwd m2, m7
+ pmaddwd m14, m7
+ pmaddwd m15, m7
+ movdqa m3, m2
+ punpckldq m2, m14
+ punpckhdq m3, m14
+ pshufd m5, m15, 0xf5
+ paddd m2, m3
+ paddd m5, m15
+ movdqa m3, m2
+ punpcklqdq m2, m5
+ punpckhqdq m3, m5
+ pavgw m3, m2
+ pxor m0, m0
+ pavgw m3, m0
+ movq [r2], m3 ; i8x8_v, i8x8_h
+ psrldq m3, 8
+ movd [r2+8], m3 ; i8x8_dc
ret
%endif ; ARCH_X86_64
-%endmacro ; INTRA_SATDS
+%endmacro ; INTRA_SA8D_SSE2
; in: r0 = fenc
-; out: mm0..mm3 = hadamard coefs
+; out: m0..m3 = hadamard coefs
+INIT_MMX
ALIGN 16
load_hadamard:
- pxor mm7, mm7
- movd mm0, [r0+0*FENC_STRIDE]
- movd mm4, [r0+1*FENC_STRIDE]
- movd mm3, [r0+2*FENC_STRIDE]
- movd mm1, [r0+3*FENC_STRIDE]
- punpcklbw mm0, mm7
- punpcklbw mm4, mm7
- punpcklbw mm3, mm7
- punpcklbw mm1, mm7
- HADAMARD4_1D mm0, mm4, mm3, mm1
- TRANSPOSE4x4W mm0, mm4, mm3, mm1, mm2
- HADAMARD4_1D mm0, mm1, mm2, mm3
+ pxor m7, m7
+ movd m0, [r0+0*FENC_STRIDE]
+ movd m1, [r0+1*FENC_STRIDE]
+ movd m2, [r0+2*FENC_STRIDE]
+ movd m3, [r0+3*FENC_STRIDE]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ HADAMARD4_1D m0, m1, m2, m3
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ HADAMARD4_1D m0, m1, m2, m3
+ SAVE_MM_PERMUTATION load_hadamard
ret
%macro SCALAR_SUMSUB 4
@@ -848,53 +889,54 @@
mov qword [sums+8], 0
mov qword [sums+16], 0
%else
- pxor mm7, mm7
- movq [sums+0], mm7
- movq [sums+8], mm7
- movq [sums+16], mm7
+ pxor m7, m7
+ movq [sums+0], m7
+ movq [sums+8], m7
+ movq [sums+16], m7
%endif
%endmacro
-; in: mm1..mm3
-; out: mm7
-; clobber: mm4..mm6
+; in: m1..m3
+; out: m7
+; clobber: m4..m6
%macro SUM3x4 1
%ifidn %1, ssse3
- pabsw mm4, mm1
- pabsw mm5, mm2
- pabsw mm7, mm3
- paddw mm4, mm5
+ pabsw m4, m1
+ pabsw m5, m2
+ pabsw m7, m3
+ paddw m4, m5
%else
- movq mm4, mm1
- movq mm5, mm2
- ABS2 mm4, mm5, mm6, mm7
- movq mm7, mm3
- paddw mm4, mm5
- ABS1 mm7, mm6
-%endif
- paddw mm7, mm4
+ movq m4, m1
+ movq m5, m2
+ ABS2 m4, m5, m6, m7
+ movq m7, m3
+ paddw m4, m5
+ ABS1 m7, m6
+%endif
+ paddw m7, m4
%endmacro
-; in: mm0..mm3 (4x4), mm7 (3x4)
-; out: mm0 v, mm4 h, mm5 dc
-; clobber: mm6
+; in: m0..m3 (4x4), m7 (3x4)
+; out: m0 v, m4 h, m5 dc
+; clobber: m6
%macro SUM4x3 3 ; dc, left, top
- movq mm4, %2
- movd mm5, %1
- psllw mm4, 2
- psubw mm4, mm0
- psubw mm5, mm0
- punpcklwd mm0, mm1
- punpcklwd mm2, mm3
- punpckldq mm0, mm2 ; transpose
- movq mm1, %3
- psllw mm1, 2
- psubw mm0, mm1
- ABS2 mm4, mm5, mm2, mm3 ; 1x4 sum
- ABS1 mm0, mm1 ; 4x1 sum
+ movq m4, %2
+ movd m5, %1
+ psllw m4, 2
+ psubw m4, m0
+ psubw m5, m0
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ punpckldq m0, m2 ; transpose
+ movq m1, %3
+ psllw m1, 2
+ psubw m0, m1
+ ABS2 m4, m5, m2, m3 ; 1x4 sum
+ ABS1 m0, m1 ; 4x1 sum
%endmacro
%macro INTRA_SATDS_MMX 1
+INIT_MMX
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
@@ -924,19 +966,19 @@
SUM3x4 %1
SUM4x3 t0d, [left_1d], [top_1d]
- paddw mm4, mm7
- paddw mm5, mm7
- movq mm1, mm5
- psrlq mm1, 16 ; 4x3 sum
- paddw mm0, mm1
+ paddw m4, m7
+ paddw m5, m7
+ movq m1, m5
+ psrlq m1, 16 ; 4x3 sum
+ paddw m0, m1
- SUM_MM_X3 mm0, mm4, mm5, mm1, mm2, mm3, mm6, pavgw
+ SUM_MM_X3 m0, m4, m5, m1, m2, m3, m6, pavgw
%ifndef ARCH_X86_64
mov r2, r2m
%endif
- movd [r2+0], mm0 ; i4x4_v satd
- movd [r2+4], mm4 ; i4x4_h satd
- movd [r2+8], mm5 ; i4x4_dc satd
+ movd [r2+0], m0 ; i4x4_v satd
+ movd [r2+4], m4 ; i4x4_h satd
+ movd [r2+8], m5 ; i4x4_dc satd
%ifndef ARCH_X86_64
ADD esp, 16
%endif
@@ -966,10 +1008,10 @@
%assign stack_pad 88 + ((stack_offset+88+4)&15)
%endif
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
- SUB rsp, stack_pad
-%define sums rsp+64 ; size 24
-%define top_1d rsp+32 ; size 32
-%define left_1d rsp ; size 32
+ SUB rsp, stack_pad
+%define sums rsp+64 ; size 24
+%define top_1d rsp+32 ; size 32
+%define left_1d rsp ; size 32
movifnidn r1d, r1m
CLEAR_SUMS
@@ -997,14 +1039,14 @@
SUM3x4 %1
SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4]
- pavgw mm4, mm7
- pavgw mm5, mm7
- paddw mm0, [sums+0] ; i16x16_v satd
- paddw mm4, [sums+8] ; i16x16_h satd
- paddw mm5, [sums+16] ; i16x16_dc satd
- movq [sums+0], mm0
- movq [sums+8], mm4
- movq [sums+16], mm5
+ pavgw m4, m7
+ pavgw m5, m7
+ paddw m0, [sums+0] ; i16x16_v satd
+ paddw m4, [sums+8] ; i16x16_h satd
+ paddw m5, [sums+16] ; i16x16_dc satd
+ movq [sums+0], m0
+ movq [sums+8], m4
+ movq [sums+16], m5
add r0, 4
inc r4d
@@ -1017,19 +1059,19 @@
; horizontal sum
movifnidn r2d, r2m
- movq mm2, [sums+16]
- movq mm1, [sums+8]
- movq mm0, [sums+0]
- movq mm7, mm2
- SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
- psrld mm0, 1
- pslld mm7, 16
- psrld mm7, 16
- paddd mm0, mm2
- psubd mm0, mm7
- movd [r2+8], mm2 ; i16x16_dc satd
- movd [r2+4], mm1 ; i16x16_h satd
- movd [r2+0], mm0 ; i16x16_v satd
+ movq m2, [sums+16]
+ movq m1, [sums+8]
+ movq m0, [sums+0]
+ movq m7, m2
+ SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
+ psrld m0, 1
+ pslld m7, 16
+ psrld m7, 16
+ paddd m0, m2
+ psubd m0, m7
+ movd [r2+8], m2 ; i16x16_dc satd
+ movd [r2+4], m1 ; i16x16_h satd
+ movd [r2+0], m0 ; i16x16_v satd
ADD rsp, stack_pad
RET
@@ -1086,14 +1128,14 @@
SUM3x4 %1
SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4]
- pavgw mm4, mm7
- pavgw mm5, mm7
- paddw mm0, [sums+16] ; i4x4_v satd
- paddw mm4, [sums+8] ; i4x4_h satd
- paddw mm5, [sums+0] ; i4x4_dc satd
- movq [sums+16], mm0
- movq [sums+8], mm4
- movq [sums+0], mm5
+ pavgw m4, m7
+ pavgw m5, m7
+ paddw m0, [sums+16] ; i4x4_v satd
+ paddw m4, [sums+8] ; i4x4_h satd
+ paddw m5, [sums+0] ; i4x4_dc satd
+ movq [sums+16], m0
+ movq [sums+8], m4
+ movq [sums+0], m5
add r0, 4
inc r4d
@@ -1106,21 +1148,336 @@
jl .loop_y
; horizontal sum
- movq mm0, [sums+0]
- movq mm1, [sums+8]
- movq mm2, [sums+16]
- movq mm7, mm0
- psrlq mm7, 15
- paddw mm2, mm7
- SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
- psrld mm2, 1
- movd [r2+0], mm0 ; i8x8c_dc satd
- movd [r2+4], mm1 ; i8x8c_h satd
- movd [r2+8], mm2 ; i8x8c_v satd
+ movq m0, [sums+0]
+ movq m1, [sums+8]
+ movq m2, [sums+16]
+ movq m7, m0
+ psrlq m7, 15
+ paddw m2, m7
+ SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
+ psrld m2, 1
+ movd [r2+0], m0 ; i8x8c_dc satd
+ movd [r2+4], m1 ; i8x8c_h satd
+ movd [r2+8], m2 ; i8x8c_v satd
ADD rsp, 72
RET
+%endmacro ; INTRA_SATDS_MMX
+
+
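+; ABS_MOV %1, %2: writes |%2| (packed signed words) into %1, leaving %2 intact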
+%macro ABS_MOV_SSSE3 2
+ pabsw %1, %2
+%endmacro
+
+%macro ABS_MOV_MMX 2
+ pxor %1, %1
+ psubw %1, %2
+ pmaxsw %1, %2
%endmacro
+%define ABS_MOV ABS_MOV_MMX
+
+; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
+; out: [tmp]=hadamard4, m0=satd
+cglobal x264_hadamard_ac_4x4_mmxext
+ movh m0, [r0]
+ movh m1, [r0+r1]
+ movh m2, [r0+r1*2]
+ movh m3, [r0+r2]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ HADAMARD4_1D m0, m1, m2, m3
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ HADAMARD4_1D m0, m1, m2, m3
+ mova [r3], m0
+ mova [r3+8], m1
+ mova [r3+16], m2
+ mova [r3+24], m3
+ ABS1 m0, m4
+ ABS1 m1, m4
+ pand m0, m6
+ ABS1 m2, m4
+ ABS1 m3, m4
+ paddw m0, m1
+ paddw m2, m3
+ paddw m0, m2
+ SAVE_MM_PERMUTATION x264_hadamard_ac_4x4_mmxext
+ ret
+
+cglobal x264_hadamard_ac_2x2_mmxext
+ mova m0, [r3+0x00]
+ mova m1, [r3+0x20]
+ mova m2, [r3+0x40]
+ mova m3, [r3+0x60]
+ HADAMARD4_1D m0, m1, m2, m3
+ ABS2 m0, m1, m4, m5
+ ABS2 m2, m3, m4, m5
+ SAVE_MM_PERMUTATION x264_hadamard_ac_2x2_mmxext
+ ret
+
+cglobal x264_hadamard_ac_8x8_mmxext
+ mova m6, [mask_ac4 GLOBAL]
+ pxor m7, m7
+ call x264_hadamard_ac_4x4_mmxext
+ add r0, 4
+ add r3, 32
+ mova m5, m0
+ call x264_hadamard_ac_4x4_mmxext
+ lea r0, [r0+4*r1]
+ add r3, 64
+ paddw m5, m0
+ call x264_hadamard_ac_4x4_mmxext
+ sub r0, 4
+ sub r3, 32
+ paddw m5, m0
+ call x264_hadamard_ac_4x4_mmxext
+ paddw m5, m0
+ sub r3, 64
+ mova [rsp+gprsize+8], m5 ; save satd
+ call x264_hadamard_ac_2x2_mmxext
+ add r3, 8
+ pand m6, m0
+ mova m7, m1
+ paddw m6, m2
+ paddw m7, m3
+%rep 2
+ call x264_hadamard_ac_2x2_mmxext
+ add r3, 8
+ paddw m6, m0
+ paddw m7, m1
+ paddw m6, m2
+ paddw m7, m3
+%endrep
+ call x264_hadamard_ac_2x2_mmxext
+ sub r3, 24
+ paddw m6, m0
+ paddw m7, m1
+ paddw m6, m2
+ paddw m7, m3
+ paddw m6, m7
+ mova [rsp+gprsize], m6 ; save sa8d
+ SWAP m0, m6
+ SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_mmxext
+ ret
+
+%macro HADAMARD_AC_WXH_MMX 2
+cglobal x264_pixel_hadamard_ac_%1x%2_mmxext, 2,4
+ %assign pad 16-gprsize-(stack_offset&15)
+ %define ysub r1
+ sub rsp, 16+128+pad
+ lea r2, [r1*3]
+ lea r3, [rsp+16]
+ call x264_hadamard_ac_8x8_mmxext
+%if %2==16
+ %define ysub r2
+ lea r0, [r0+r1*4]
+ sub rsp, 16
+ call x264_hadamard_ac_8x8_mmxext
+%endif
+%if %1==16
+ neg ysub
+ sub rsp, 16
+ lea r0, [r0+ysub*4+8]
+ neg ysub
+ call x264_hadamard_ac_8x8_mmxext
+%if %2==16
+ lea r0, [r0+r1*4]
+ sub rsp, 16
+ call x264_hadamard_ac_8x8_mmxext
+%endif
+%endif
+ mova m1, [rsp+0x08]
+%if %1*%2 >= 128
+ paddusw m0, [rsp+0x10]
+ paddusw m1, [rsp+0x18]
+%endif
+%if %1*%2 == 256
+ paddusw m0, [rsp+0x20]
+ paddusw m1, [rsp+0x28]
+ paddusw m0, [rsp+0x30]
+ paddusw m1, [rsp+0x38]
+%endif
+ psrlw m0, 1
+ psrlw m1, 1
+ HADDW m0, m2
+ HADDW m1, m3
+ movd edx, m0
+ movd eax, m1
+ shr edx, 1
+%ifdef ARCH_X86_64
+ shl rdx, 32
+ add rax, rdx
+%endif
+ add rsp, 128+%1*%2/4+pad
+ RET
+%endmacro ; HADAMARD_AC_WXH_MMX
+
+HADAMARD_AC_WXH_MMX 16, 16
+HADAMARD_AC_WXH_MMX 8, 16
+HADAMARD_AC_WXH_MMX 16, 8
+HADAMARD_AC_WXH_MMX 8, 8
+
+%macro HADAMARD_AC_SSE2 1
+INIT_XMM
+; in: r0=pix, r1=stride, r2=stride*3
+; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
+cglobal x264_hadamard_ac_8x8_%1
+%ifdef ARCH_X86_64
+ %define spill0 m8
+ %define spill1 m9
+ %define spill2 m10
+%else
+ %define spill0 [rsp+gprsize]
+ %define spill1 [rsp+gprsize+16]
+ %define spill2 [rsp+gprsize+32]
+%endif
+ pxor m7, m7
+ movh m0, [r0]
+ movh m1, [r0+r1]
+ movh m2, [r0+r1*2]
+ movh m3, [r0+r2]
+ lea r0, [r0+r1*4]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ HADAMARD4_1D m0, m1, m2, m3
+ mova spill0, m3
+ SWAP m3, m7
+ movh m4, [r0]
+ movh m5, [r0+r1]
+ movh m6, [r0+r1*2]
+ movh m7, [r0+r2]
+ punpcklbw m4, m3
+ punpcklbw m5, m3
+ punpcklbw m6, m3
+ punpcklbw m7, m3
+ HADAMARD4_1D m4, m5, m6, m7
+ mova m3, spill0
+%ifdef ARCH_X86_64
+ TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
+%else
+ TRANSPOSE8x8W 0,1,2,3,4,5,6,7,spill0,spill1
+%endif
+ HADAMARD4_1D m0, m1, m2, m3
+ HADAMARD4_1D m4, m5, m6, m7
+ mova spill0, m1
+ mova spill1, m2
+ mova spill2, m3
+ ABS_MOV m1, m0
+ ABS_MOV m2, m4
+ ABS_MOV m3, m5
+ paddw m1, m2
+ SUMSUB_BA m0, m4
+ pand m1, [mask_ac4 GLOBAL]
+ ABS_MOV m2, spill0
+ paddw m1, m3
+ ABS_MOV m3, spill1
+ paddw m1, m2
+ ABS_MOV m2, spill2
+ paddw m1, m3
+ ABS_MOV m3, m6
+ paddw m1, m2
+ ABS_MOV m2, m7
+ paddw m1, m3
+ mova m3, m7
+ paddw m1, m2
+ mova m2, m6
+ psubw m7, spill2
+ paddw m3, spill2
+ mova [rsp+gprsize+32], m1 ; save satd
+ mova m1, m5
+ psubw m6, spill1
+ paddw m2, spill1
+ psubw m5, spill0
+ paddw m1, spill0
+ mova spill1, m7
+ SBUTTERFLY qdq, 0, 4, 7
+ SBUTTERFLY qdq, 1, 5, 7
+ SBUTTERFLY qdq, 2, 6, 7
+ SUMSUB_BADC m0, m4, m1, m5
+ SUMSUB_BA m2, m6
+ ABS1 m0, m7
+ ABS1 m1, m7
+ pand m0, [mask_ac8 GLOBAL]
+ ABS1 m2, m7
+ ABS1 m4, m7
+ ABS1 m5, m7
+ ABS1 m6, m7
+ mova m7, spill1
+ paddw m0, m4
+ SBUTTERFLY qdq, 3, 7, 4
+ SUMSUB_BA m3, m7
+ paddw m1, m5
+ ABS1 m3, m4
+ ABS1 m7, m4
+ paddw m2, m6
+ paddw m3, m7
+ paddw m0, m1
+ paddw m2, m3
+ paddw m0, m2
+ mova [rsp+gprsize+16], m0 ; save sa8d
+ SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_%1
+ ret
+
+HADAMARD_AC_WXH_SSE2 16, 16, %1
+HADAMARD_AC_WXH_SSE2 8, 16, %1
+HADAMARD_AC_WXH_SSE2 16, 8, %1
+HADAMARD_AC_WXH_SSE2 8, 8, %1
+%endmacro ; HADAMARD_AC_SSE2
+
+; struct { int satd, int sa8d; } x264_pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
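+; (the pair is packed into the uint64_t return value declared in pixel.h; a caller
+;  sketch, assuming the low 32 bits hold the first member:
+;      uint64_t v = x264_pixel_hadamard_ac_16x16_sse2( pix, stride );
+;      int satd = (uint32_t)v, sa8d = (int)(v >> 32); )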
+%macro HADAMARD_AC_WXH_SSE2 3
+cglobal x264_pixel_hadamard_ac_%1x%2_%3, 2,3
+ %assign pad 16-gprsize-(stack_offset&15)
+ %define ysub r1
+ sub rsp, 48+pad
+ lea r2, [r1*3]
+ call x264_hadamard_ac_8x8_%3
+%if %2==16
+ %define ysub r2
+ lea r0, [r0+r1*4]
+ sub rsp, 32
+ call x264_hadamard_ac_8x8_%3
+%endif
+%if %1==16
+ neg ysub
+ sub rsp, 32
+ lea r0, [r0+ysub*4+8]
+ neg ysub
+ call x264_hadamard_ac_8x8_%3
+%if %2==16
+ lea r0, [r0+r1*4]
+ sub rsp, 32
+ call x264_hadamard_ac_8x8_%3
+%endif
+%endif
+ mova m1, [rsp+0x20]
+%if %1*%2 >= 128
+ paddusw m0, [rsp+0x30]
+ paddusw m1, [rsp+0x40]
+%endif
+%if %1*%2 == 256
+ paddusw m0, [rsp+0x50]
+ paddusw m1, [rsp+0x60]
+ paddusw m0, [rsp+0x70]
+ paddusw m1, [rsp+0x80]
+%endif
+ HADDW m0, m2
+ HADDW m1, m3
+ movd edx, m0
+ movd eax, m1
+ shr edx, 2
+ shr eax, 1
+%ifdef ARCH_X86_64
+ shl rdx, 32
+ add rax, rdx
+%endif
+ add rsp, 16+%1*%2/2+pad
+ RET
+%endmacro ; HADAMARD_AC_WXH_SSE2
+
; instantiate satds
%ifndef ARCH_X86_64
@@ -1134,13 +1491,16 @@
SA8D_16x16_32 sse2
INTRA_SA8D_SSE2 sse2
INTRA_SATDS_MMX mmxext
+HADAMARD_AC_SSE2 sse2
%define ABS1 ABS1_SSSE3
%define ABS2 ABS2_SSSE3
+%define ABS_MOV ABS_MOV_SSSE3
+SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3.
SATDS_SSE2 ssse3
SA8D_16x16_32 ssse3
INTRA_SA8D_SSE2 ssse3
INTRA_SATDS_MMX ssse3
-SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3.
+HADAMARD_AC_SSE2 ssse3
%define SATD_8x4_SSE2 SATD_8x4_PHADD
SATDS_SSE2 ssse3_phadd
@@ -1155,44 +1515,43 @@
; const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
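; a rough C sketch of what the core accumulates for two horizontally adjacent
; 4x4 blocks (names illustrative):
;     for( z = 0; z < 2; z++, pix1 += 4, pix2 += 4 )
;     {
;         uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0;
;         for( y = 0; y < 4; y++ )
;             for( x = 0; x < 4; x++ )
;             {
;                 int a = pix1[x + y*stride1], b = pix2[x + y*stride2];
;                 s1 += a; s2 += b; ss += a*a + b*b; s12 += a*b;
;             }
;         sums[z][0] = s1; sums[z][1] = s2; sums[z][2] = ss; sums[z][3] = s12;
;     }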
cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4
- pxor xmm0, xmm0
- pxor xmm1, xmm1
- pxor xmm2, xmm2
- pxor xmm3, xmm3
- pxor xmm4, xmm4
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+ pxor m4, m4
%rep 4
- movq xmm5, [r0]
- movq xmm6, [r2]
- punpcklbw xmm5, xmm0
- punpcklbw xmm6, xmm0
- paddw xmm1, xmm5
- paddw xmm2, xmm6
- movdqa xmm7, xmm5
- pmaddwd xmm5, xmm5
- pmaddwd xmm7, xmm6
- pmaddwd xmm6, xmm6
- paddd xmm3, xmm5
- paddd xmm4, xmm7
- paddd xmm3, xmm6
+ movq m5, [r0]
+ movq m6, [r2]
+ punpcklbw m5, m0
+ punpcklbw m6, m0
+ paddw m1, m5
+ paddw m2, m6
+ movdqa m7, m5
+ pmaddwd m5, m5
+ pmaddwd m7, m6
+ pmaddwd m6, m6
+ paddd m3, m5
+ paddd m4, m7
+ paddd m3, m6
add r0, r1
add r2, r3
%endrep
- ; PHADDW xmm1, xmm2
- ; PHADDD xmm3, xmm4
- picgetgot eax
- movdqa xmm7, [pw_1 GLOBAL]
- pshufd xmm5, xmm3, 0xb1
- pmaddwd xmm1, xmm7
- pmaddwd xmm2, xmm7
- pshufd xmm6, xmm4, 0xb1
- packssdw xmm1, xmm2
- paddd xmm3, xmm5
- pshufd xmm1, xmm1, 0xd8
- paddd xmm4, xmm6
- pmaddwd xmm1, xmm7
- movdqa xmm5, xmm3
- punpckldq xmm3, xmm4
- punpckhdq xmm5, xmm4
+ ; PHADDW m1, m2
+ ; PHADDD m3, m4
+ movdqa m7, [pw_1 GLOBAL]
+ pshufd m5, m3, 0xb1
+ pmaddwd m1, m7
+ pmaddwd m2, m7
+ pshufd m6, m4, 0xb1
+ packssdw m1, m2
+ paddd m3, m5
+ pshufd m1, m1, 0xd8
+ paddd m4, m6
+ pmaddwd m1, m7
+ movdqa m5, m3
+ punpckldq m3, m4
+ punpckhdq m5, m4
%ifdef ARCH_X86_64
%define t0 r4
@@ -1201,77 +1560,76 @@
mov t0, r4m
%endif
- movq [t0+ 0], xmm1
- movq [t0+ 8], xmm3
- psrldq xmm1, 8
- movq [t0+16], xmm1
- movq [t0+24], xmm5
+ movq [t0+ 0], m1
+ movq [t0+ 8], m3
+ psrldq m1, 8
+ movq [t0+16], m1
+ movq [t0+24], m5
RET
;-----------------------------------------------------------------------------
; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
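; roughly, per 4x4 window (see the cvtdq2ps comments below), this evaluates
;     ssim = ((2*s1*s2 + ssim_c1) * (2*covar + ssim_c2))
;          / ((s1*s1 + s2*s2 + ssim_c1) * (vars + ssim_c2))
; with vars = 64*ss - (s1*s1 + s2*s2) and covar = 64*s12 - s1*s2; the four lanes
; are then masked to 'width' entries, summed, and returned as a float.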
cglobal x264_pixel_ssim_end4_sse2, 3,3
- movdqa xmm0, [r0+ 0]
- movdqa xmm1, [r0+16]
- movdqa xmm2, [r0+32]
- movdqa xmm3, [r0+48]
- movdqa xmm4, [r0+64]
- paddd xmm0, [r1+ 0]
- paddd xmm1, [r1+16]
- paddd xmm2, [r1+32]
- paddd xmm3, [r1+48]
- paddd xmm4, [r1+64]
- paddd xmm0, xmm1
- paddd xmm1, xmm2
- paddd xmm2, xmm3
- paddd xmm3, xmm4
- picgetgot r1
- movdqa xmm5, [ssim_c1 GLOBAL]
- movdqa xmm6, [ssim_c2 GLOBAL]
- TRANSPOSE4x4D xmm0, xmm1, xmm2, xmm3, xmm4
-
-; s1=mm0, s2=mm3, ss=mm4, s12=mm2
- movdqa xmm1, xmm3
- pslld xmm3, 16
- pmaddwd xmm1, xmm0 ; s1*s2
- por xmm0, xmm3
- pmaddwd xmm0, xmm0 ; s1*s1 + s2*s2
- pslld xmm1, 1
- pslld xmm2, 7
- pslld xmm4, 6
- psubd xmm2, xmm1 ; covar*2
- psubd xmm4, xmm0 ; vars
- paddd xmm0, xmm5
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm4, xmm6
- cvtdq2ps xmm0, xmm0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
- cvtdq2ps xmm1, xmm1 ; (float)(s1*s2*2 + ssim_c1)
- cvtdq2ps xmm2, xmm2 ; (float)(covar*2 + ssim_c2)
- cvtdq2ps xmm4, xmm4 ; (float)(vars + ssim_c2)
- mulps xmm1, xmm2
- mulps xmm0, xmm4
- divps xmm1, xmm0 ; ssim
+ movdqa m0, [r0+ 0]
+ movdqa m1, [r0+16]
+ movdqa m2, [r0+32]
+ movdqa m3, [r0+48]
+ movdqa m4, [r0+64]
+ paddd m0, [r1+ 0]
+ paddd m1, [r1+16]
+ paddd m2, [r1+32]
+ paddd m3, [r1+48]
+ paddd m4, [r1+64]
+ paddd m0, m1
+ paddd m1, m2
+ paddd m2, m3
+ paddd m3, m4
+ movdqa m5, [ssim_c1 GLOBAL]
+ movdqa m6, [ssim_c2 GLOBAL]
+ TRANSPOSE4x4D 0, 1, 2, 3, 4
+
+; s1=m0, s2=m1, ss=m2, s12=m3
+ movdqa m4, m1
+ pslld m1, 16
+ pmaddwd m4, m0 ; s1*s2
+ por m0, m1
+ pmaddwd m0, m0 ; s1*s1 + s2*s2
+ pslld m4, 1
+ pslld m3, 7
+ pslld m2, 6
+ psubd m3, m4 ; covar*2
+ psubd m2, m0 ; vars
+ paddd m0, m5
+ paddd m4, m5
+ paddd m3, m6
+ paddd m2, m6
+ cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
+ cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
+ cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
+ cvtdq2ps m2, m2 ; (float)(vars + ssim_c2)
+ mulps m4, m3
+ mulps m0, m2
+ divps m4, m0 ; ssim
- cmp r2d, 4
+ cmp r2d, 4
je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
- neg r2
-%ifdef PIC64
- lea r3, [mask_ff + 16 GLOBAL]
- movdqu xmm3, [r3 + r2*4]
+ neg r2
+%ifdef PIC
+ lea r3, [mask_ff + 16 GLOBAL]
+ movdqu m1, [r3 + r2*4]
%else
- movdqu xmm3, [mask_ff + r2*4 + 16 GLOBAL]
+ movdqu m1, [mask_ff + r2*4 + 16 GLOBAL]
%endif
- pand xmm1, xmm3
+ pand m4, m1
.skip:
- movhlps xmm0, xmm1
- addps xmm0, xmm1
- pshuflw xmm1, xmm0, 0xE
- addss xmm0, xmm1
+ movhlps m0, m4
+ addps m0, m4
+ pshuflw m4, m0, 0xE
+ addss m0, m4
%ifndef ARCH_X86_64
- movd r0m, xmm0
- fld dword r0m
+ movd r0m, m0
+ fld dword r0m
%endif
RET
@@ -1281,7 +1639,7 @@
; Successive Elimination ADS
;=============================================================================
-%macro ADS_START 1 ; unroll_size
+%macro ADS_START 1 ; unroll_size
%ifdef ARCH_X86_64
%define t0 r6
mov r10, rsp
@@ -1295,7 +1653,7 @@
and rsp, ~15
mov t0, rsp
shl r2d, 1
-%endmacro
+%endmacro
%macro ADS_END 1
add r1, 8*%1
Changed: x264-snapshot-20081001-2245.tar.bz2/common/x86/pixel.h
@@ -43,6 +43,7 @@
DECL_X1( sad, mmxext )
DECL_X1( sad, sse2 )
DECL_X1( sad, sse3 )
+DECL_X1( sad, sse2_aligned )
DECL_X4( sad, mmxext )
DECL_X4( sad, sse2 )
DECL_X4( sad, sse3 )
@@ -64,22 +65,27 @@
DECL_X4( sad, cache64_sse2 );
DECL_X4( sad, cache64_ssse3 );
-#undef DECL_PIXELS
-#undef DECL_X1
-#undef DECL_X4
+DECL_PIXELS( int, var, mmxext, ( uint8_t *pix, int i_stride, uint32_t *sad ))
+DECL_PIXELS( int, var, sse2, ( uint8_t *pix, int i_stride, uint32_t *sad ))
+DECL_PIXELS( uint64_t, hadamard_ac, mmxext, ( uint8_t *pix, int i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( uint8_t *pix, int i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( uint8_t *pix, int i_stride ))
-void x264_intra_satd_x3_4x4_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_satd_x3_4x4_ssse3( uint8_t *, uint8_t *, int * );
-void x264_intra_satd_x3_8x8c_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_satd_x3_8x8c_ssse3( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_16x16_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_satd_x3_16x16_ssse3( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_sse2( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_ssse3( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_16x16_ssse3 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_16x16_mmxext ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_16x16_sse2 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_16x16_ssse3 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_mmxext ( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_sse2 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *, int16_t [2][8], int * );
-void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *, int16_t [2][8], int * );
-void x264_intra_sa8d_x3_8x8_core_ssse3( uint8_t *, int16_t [2][8], int * );
+void x264_intra_sa8d_x3_8x8_core_sse2 ( uint8_t *, int16_t [2][8], int * );
+void x264_intra_sa8d_x3_8x8_core_ssse3 ( uint8_t *, int16_t [2][8], int * );
void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
const uint8_t *pix2, int stride2, int sums[2][4] );
@@ -99,6 +105,10 @@
DECL_ADS( 4, ssse3 )
DECL_ADS( 2, ssse3 )
DECL_ADS( 1, ssse3 )
+
+#undef DECL_PIXELS
+#undef DECL_X1
+#undef DECL_X4
#undef DECL_ADS
#endif
Changed: x264-snapshot-20081001-2245.tar.bz2/common/x86/predict-a.asm
@@ -22,6 +22,7 @@
;*****************************************************************************
%include "x86inc.asm"
+%include "x86util.asm"
%macro STORE8x8 2
movq [r0 + 0*FDEC_STRIDE], %1
@@ -66,13 +67,14 @@
ALIGN 16
pb_1: times 16 db 1
+pb_3: times 16 db 3
pw_2: times 4 dw 2
pw_4: times 4 dw 4
pw_8: times 8 dw 8
pw_76543210:
pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7
pb_00s_ff: times 8 db 0
-pb_0s_ff: times 7 db 0
+pb_0s_ff: times 7 db 0
db 0xff
SECTION .text
@@ -99,7 +101,7 @@
;-----------------------------------------------------------------------------
; void predict_4x4_ddl_mmxext( uint8_t *src )
;-----------------------------------------------------------------------------
-cglobal predict_4x4_ddl_mmxext, 1,1,1
+cglobal predict_4x4_ddl_mmxext, 1,1
sub r0, FDEC_STRIDE
movq mm3, [r0]
movq mm1, [r0-1]
@@ -123,7 +125,7 @@
;-----------------------------------------------------------------------------
; void predict_4x4_vl_mmxext( uint8_t *src )
;-----------------------------------------------------------------------------
-cglobal predict_4x4_vl_mmxext, 1,1,1
+cglobal predict_4x4_vl_mmxext, 1,1
movq mm1, [r0-FDEC_STRIDE]
movq mm3, mm1
movq mm2, mm1
@@ -144,6 +146,31 @@
RET
;-----------------------------------------------------------------------------
+; void predict_4x4_dc( uint8_t *src )
+;-----------------------------------------------------------------------------
+
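+; rough C equivalent (a sketch):
+; {
+;     int dc = 4, i;
+;     for( i = 0; i < 4; i++ )
+;         dc += src[i - FDEC_STRIDE] + src[i*FDEC_STRIDE - 1];
+;     dc = (dc >> 3) * 0x01010101;
+;     for( i = 0; i < 4; i++ )
+;         *(uint32_t*)&src[i*FDEC_STRIDE] = dc;
+; }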
+cglobal predict_4x4_dc_mmxext, 1,4
+ pxor mm7, mm7
+ movd mm0, [r0-FDEC_STRIDE]
+ psadbw mm0, mm7
+ movd r3d, mm0
+ movzx r1d, byte [r0-1]
+%assign n 1
+%rep 3
+ movzx r2d, byte [r0+FDEC_STRIDE*n-1]
+ add r1d, r2d
+%assign n n+1
+%endrep
+ lea r1d, [r1+r3+4]
+ shr r1d, 3
+ imul r1d, 0x01010101
+ mov [r0+FDEC_STRIDE*0], r1d
+ mov [r0+FDEC_STRIDE*1], r1d
+ mov [r0+FDEC_STRIDE*2], r1d
+ mov [r0+FDEC_STRIDE*3], r1d
+ RET
+
+;-----------------------------------------------------------------------------
; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_v_mmxext, 2,2
@@ -152,9 +179,34 @@
RET
;-----------------------------------------------------------------------------
+; void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] )
+;-----------------------------------------------------------------------------
+
+INIT_MMX
+cglobal predict_8x8_h_mmxext, 2,2
+ movu m3, [r1+7]
+ mova m7, m3
+ punpckhbw m3, m3
+ punpcklbw m7, m7
+ pshufw m0, m3, 0xff
+ pshufw m1, m3, 0xaa
+ pshufw m2, m3, 0x55
+ pshufw m3, m3, 0x00
+ pshufw m4, m7, 0xff
+ pshufw m5, m7, 0xaa
+ pshufw m6, m7, 0x55
+ pshufw m7, m7, 0x00
+%assign n 0
+%rep 8
+ mova [r0+n*FDEC_STRIDE], m %+ n
+%assign n n+1
+%endrep
+ RET
+
+;-----------------------------------------------------------------------------
; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
-cglobal predict_8x8_dc_mmxext, 2,2,1
+cglobal predict_8x8_dc_mmxext, 2,2
pxor mm0, mm0
pxor mm1, mm1
psadbw mm0, [r1+7]
@@ -171,7 +223,7 @@
; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
%macro PRED8x8_DC 2
-cglobal %1, 2,2,1
+cglobal %1, 2,2
pxor mm0, mm0
psadbw mm0, [r1+%2]
paddw mm0, [pw_4 GLOBAL]
@@ -192,7 +244,7 @@
;-----------------------------------------------------------------------------
; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddl_mmxext, 2,2,1
+cglobal predict_8x8_ddl_mmxext, 2,2
movq mm5, [r1+16]
movq mm2, [r1+17]
movq mm3, [r1+23]
@@ -223,7 +275,7 @@
;-----------------------------------------------------------------------------
; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddr_mmxext, 2,2,1
+cglobal predict_8x8_ddr_mmxext, 2,2
movq mm1, [r1+7]
movq mm2, [r1+9]
movq mm3, [r1+15]
@@ -254,7 +306,7 @@
;-----------------------------------------------------------------------------
; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddl_sse2, 2,2,1
+cglobal predict_8x8_ddl_sse2, 2,2
movdqa xmm3, [r1+16]
movdqu xmm2, [r1+17]
movdqa xmm1, xmm3
@@ -272,7 +324,7 @@
;-----------------------------------------------------------------------------
; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddr_sse2, 2,2,1
+cglobal predict_8x8_ddr_sse2, 2,2
movdqu xmm3, [r1+8]
movdqu xmm1, [r1+7]
movdqa xmm2, xmm3
@@ -297,7 +349,7 @@
;-----------------------------------------------------------------------------
; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_vl_sse2, 2,2,1
+cglobal predict_8x8_vl_sse2, 2,2
movdqa xmm4, [r1+16]
movdqa xmm2, xmm4
movdqa xmm1, xmm4
@@ -338,7 +390,7 @@
; 6 .....
; 7 ,,,,,
-cglobal predict_8x8_vr_core_mmxext, 2,2,1
+cglobal predict_8x8_vr_core_mmxext, 2,2
movq mm2, [r1+16]
movq mm3, [r1+15]
movq mm1, [r1+14]
@@ -368,9 +420,33 @@
RET
;-----------------------------------------------------------------------------
+; void predict_8x8c_h_mmxext( uint8_t *src )
+;-----------------------------------------------------------------------------
+
+%macro PRED_8x8C_H 1
+cglobal predict_8x8c_h_%1, 1,1
+%ifidn %1, ssse3
+ mova m1, [pb_3 GLOBAL]
+%endif
+%assign n 0
+%rep 8
+ SPLATB m0, r0+FDEC_STRIDE*n-1, m1
+ mova [r0+FDEC_STRIDE*n], m0
+%assign n n+1
+%endrep
+ REP_RET
+%endmacro
+
+INIT_MMX
+%define SPLATB SPLATB_MMX
+PRED_8x8C_H mmxext
+%define SPLATB SPLATB_SSSE3
+PRED_8x8C_H ssse3
+
+;-----------------------------------------------------------------------------
; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
;-----------------------------------------------------------------------------
-cglobal predict_8x8c_dc_core_mmxext, 1,1,1
+cglobal predict_8x8c_dc_core_mmxext, 1,1
movq mm0, [r0 - FDEC_STRIDE]
pxor mm1, mm1
pxor mm2, mm2
@@ -422,7 +498,7 @@
;-----------------------------------------------------------------------------
; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
-cglobal predict_8x8c_p_core_mmxext, 1,2,1
+cglobal predict_8x8c_p_core_mmxext, 1,2
LOAD_PLANE_ARGS
movq mm1, mm2
pmullw mm2, [pw_3210 GLOBAL]
@@ -450,7 +526,7 @@
;-----------------------------------------------------------------------------
; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
-cglobal predict_16x16_p_core_mmxext, 1,2,1
+cglobal predict_16x16_p_core_mmxext, 1,2
LOAD_PLANE_ARGS
movq mm5, mm2
movq mm1, mm2
@@ -492,7 +568,7 @@
;-----------------------------------------------------------------------------
; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
-cglobal predict_16x16_p_core_sse2, 1,2,1
+cglobal predict_16x16_p_core_sse2, 1,2
movd xmm0, r1m
movd xmm1, r2m
movd xmm2, r3m
@@ -543,6 +619,39 @@
REP_RET
;-----------------------------------------------------------------------------
+; void predict_16x16_h_mmxext( uint8_t *src )
+;-----------------------------------------------------------------------------
+
+%macro PRED_16x16_H 1
+cglobal predict_16x16_h_%1, 1,2
+ mov r1, FDEC_STRIDE*12
+%ifidn %1, ssse3
+ mova m1, [pb_3 GLOBAL]
+%endif
+.vloop:
+%assign n 0
+%rep 4
+ SPLATB m0, r0+r1+FDEC_STRIDE*n-1, m1
+ mova [r0+r1+FDEC_STRIDE*n], m0
+%if mmsize==8
+ mova [r0+r1+FDEC_STRIDE*n+8], m0
+%endif
+%assign n n+1
+%endrep
+ add r1, -FDEC_STRIDE*4
+ jge .vloop
+ REP_RET
+%endmacro
+
+; no SSE2, it's slower than MMX on all systems that don't support SSSE3
+INIT_MMX
+%define SPLATB SPLATB_MMX
+PRED_16x16_H mmxext
+INIT_XMM
+%define SPLATB SPLATB_SSSE3
+PRED_16x16_H ssse3
+
+;-----------------------------------------------------------------------------
; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
;-----------------------------------------------------------------------------
@@ -568,7 +677,7 @@
%endif
REP_RET
-cglobal predict_16x16_dc_top_mmxext, 1,2,1
+cglobal predict_16x16_dc_top_mmxext, 1,2
PRED16x16_DC [pw_8 GLOBAL], 4
REP_RET
@@ -594,7 +703,7 @@
PRED16x16_DC_SSE2 xmm2, 5
REP_RET
-cglobal predict_16x16_dc_top_sse2, 1,2,1
+cglobal predict_16x16_dc_top_sse2, 1,2
PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4
REP_RET
Changed: x264-snapshot-20081001-2245.tar.bz2/common/x86/predict-c.c
@@ -26,13 +26,18 @@
#include "pixel.h"
extern void predict_16x16_v_mmx( uint8_t *src );
+extern void predict_16x16_h_mmxext( uint8_t *src );
+extern void predict_16x16_h_ssse3( uint8_t *src );
extern void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left );
extern void predict_16x16_dc_top_mmxext( uint8_t *src );
extern void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );
extern void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
extern void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 );
extern void predict_8x8c_v_mmx( uint8_t *src );
+extern void predict_8x8c_h_mmxext( uint8_t *src );
+extern void predict_8x8c_h_ssse3( uint8_t *src );
extern void predict_8x8_v_mmxext( uint8_t *src, uint8_t edge[33] );
+extern void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_dc_mmxext( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t edge[33] );
@@ -44,6 +49,7 @@
extern void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t edge[33] );
extern void predict_4x4_ddl_mmxext( uint8_t *src );
extern void predict_4x4_vl_mmxext( uint8_t *src );
+extern void predict_4x4_dc_mmxext( uint8_t *src );
extern void predict_16x16_dc_top_sse2( uint8_t *src );
extern void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left );
extern void predict_16x16_v_sse2( uint8_t *src );
@@ -126,40 +132,18 @@
}
#ifdef ARCH_X86_64
-static void predict_16x16_h( uint8_t *src )
-{
- int y;
- for( y = 0; y < 16; y++ )
- {
- const uint64_t v = 0x0101010101010101ULL * src[-1];
- uint64_t *p = (uint64_t*)src;
- p[0] = p[1] = v;
- src += FDEC_STRIDE;
- }
-}
-
-static void predict_8x8c_h( uint8_t *src )
-{
- int y;
- for( y = 0; y < 8; y++ )
- {
- *(uint64_t*)src = 0x0101010101010101ULL * src[-1];
- src += FDEC_STRIDE;
- }
-}
-
static void predict_16x16_dc_left( uint8_t *src )
{
uint32_t s = 0;
- uint64_t dc;
+ uint64_t dc;
int y;
-
+
for( y = 0; y < 16; y++ )
{
s += src[-1 + y * FDEC_STRIDE];
- }
+ }
dc = (( s + 8 ) >> 4) * 0x0101010101010101ULL;
-
+
for( y = 0; y < 16; y++ )
{
uint64_t *p = (uint64_t*)src;
@@ -496,7 +480,6 @@
if( !(cpu&X264_CPU_MMX) )
return;
#ifdef ARCH_X86_64
- pf[I_PRED_16x16_H] = predict_16x16_h;
pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left;
#endif
pf[I_PRED_16x16_V] = predict_16x16_v_mmx;
@@ -505,6 +488,7 @@
pf[I_PRED_16x16_DC] = predict_16x16_dc_mmxext;
pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_mmxext;
pf[I_PRED_16x16_P] = predict_16x16_p_mmxext;
+ pf[I_PRED_16x16_H] = predict_16x16_h_mmxext;
if( !(cpu&X264_CPU_SSE2) )
return;
pf[I_PRED_16x16_DC] = predict_16x16_dc_sse2;
@@ -513,6 +497,9 @@
return;
pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_sse2;
pf[I_PRED_16x16_P] = predict_16x16_p_sse2;
+ if( !(cpu&X264_CPU_SSSE3) )
+ return;
+ pf[I_PRED_16x16_H] = predict_16x16_h_ssse3;
}
void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
@@ -520,15 +507,18 @@
if( !(cpu&X264_CPU_MMX) )
return;
#ifdef ARCH_X86_64
- pf[I_PRED_CHROMA_H] = predict_8x8c_h;
pf[I_PRED_CHROMA_DC_LEFT] = predict_8x8c_dc_left;
pf[I_PRED_CHROMA_DC_TOP] = predict_8x8c_dc_top;
#endif
pf[I_PRED_CHROMA_V] = predict_8x8c_v_mmx;
if( !(cpu&X264_CPU_MMXEXT) )
return;
+ pf[I_PRED_CHROMA_H] = predict_8x8c_h_mmxext;
pf[I_PRED_CHROMA_P] = predict_8x8c_p_mmxext;
pf[I_PRED_CHROMA_DC] = predict_8x8c_dc_mmxext;
+ if( !(cpu&X264_CPU_SSSE3) )
+ return;
+ pf[I_PRED_CHROMA_H] = predict_8x8c_h_ssse3;
}
void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12] )
@@ -536,6 +526,7 @@
if( !(cpu&X264_CPU_MMXEXT) )
return;
pf[I_PRED_8x8_V] = predict_8x8_v_mmxext;
+ pf[I_PRED_8x8_H] = predict_8x8_h_mmxext;
pf[I_PRED_8x8_DC] = predict_8x8_dc_mmxext;
pf[I_PRED_8x8_DC_TOP] = predict_8x8_dc_top_mmxext;
pf[I_PRED_8x8_DC_LEFT]= predict_8x8_dc_left_mmxext;
@@ -565,4 +556,5 @@
return;
pf[I_PRED_4x4_DDL] = predict_4x4_ddl_mmxext;
pf[I_PRED_4x4_VL] = predict_4x4_vl_mmxext;
+ pf[I_PRED_4x4_DC] = predict_4x4_dc_mmxext;
}
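The predict-c.c hunk above also shows how x264 chooses between the new assembly predictors: each *_init_mmx routine fills the function-pointer table from slowest to fastest implementation and returns early as soon as a required CPU flag is missing, so the SSSE3 entries only replace the MMXEXT ones on CPUs that have both. A minimal sketch of that dispatch pattern in C (the flag names and predictor functions here are illustrative, not the x264 API):

#include <stdint.h>

typedef void (*predict_fn)( uint8_t *src );

enum { CPU_MMX = 1, CPU_MMXEXT = 2, CPU_SSE2 = 4, CPU_SSSE3 = 8 };

static void predict_h_c( uint8_t *src )      { (void)src; /* portable fallback */ }
static void predict_h_mmxext( uint8_t *src ) { (void)src; /* faster version */ }
static void predict_h_ssse3( uint8_t *src )  { (void)src; /* fastest version */ }

/* Fill the table from slowest to fastest: later assignments override earlier
 * ones, and a missing capability bit ends the upgrade chain early. */
static void predict_init( unsigned cpu, predict_fn pf[1] )
{
    pf[0] = predict_h_c;
    if( !(cpu & CPU_MMXEXT) )
        return;
    pf[0] = predict_h_mmxext;
    if( !(cpu & CPU_SSSE3) )
        return;
    pf[0] = predict_h_ssse3;
}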
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/quant-a.asm
@@ -270,7 +270,6 @@
.rshift32:
neg t0d
movd m5, t0d
- picgetgot t0d
mova m6, [pd_1 GLOBAL]
pxor m7, m7
pslld m6, m5
@@ -290,12 +289,11 @@
sub t2d, t1d
sub t2d, t1d ; i_mf = i_qp % 6
shl t2d, %3
-%ifdef PIC64
+%ifdef PIC
lea r1, [dequant%2_scale GLOBAL]
add r1, t2
%else
- picgetgot r0
- lea r1, [t2 + dequant%2_scale GLOBAL]
+ lea r1, [dequant%2_scale + t2 GLOBAL]
%endif
movifnidn r0d, r0m
movd m7, t0d
@@ -331,10 +329,10 @@
;-----------------------------------------------------------------------------
-; void x264_denoise_dct_core_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
+; void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
;-----------------------------------------------------------------------------
%macro DENOISE_DCT 1
-cglobal x264_denoise_dct_core_%1, 4,5
+cglobal x264_denoise_dct_%1, 4,5
movzx r4d, word [r0] ; backup DC coefficient
pxor m7, m7
.loop:
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/quant.h
@@ -43,8 +43,8 @@
void x264_dequant_8x8_flat16_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
void x264_dequant_4x4_flat16_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void x264_dequant_8x8_flat16_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
-void x264_denoise_dct_core_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
-void x264_denoise_dct_core_sse2( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
-void x264_denoise_dct_core_ssse3( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
+void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
+void x264_denoise_dct_sse2( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
+void x264_denoise_dct_ssse3( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
#endif
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/sad-a.asm
@@ -23,8 +23,10 @@
;*****************************************************************************
%include "x86inc.asm"
+%include "x86util.asm"
SECTION_RODATA
+pb_3: times 16 db 3
sw_64: dd 64
SECTION .text
@@ -80,7 +82,7 @@
pxor mm0, mm0
%rep %2/2
SAD_INC_2x%1P
-%endrep
+%endrep
movd eax, mm0
RET
%endmacro
@@ -215,7 +217,99 @@
SAD_W16 sse2
%define movdqu lddqu
SAD_W16 sse3
-%undef movdqu
+%define movdqu movdqa
+SAD_W16 sse2_aligned
+%define movdqu movups
+
+
+
+;-----------------------------------------------------------------------------
+; void intra_sad_x3_16x16 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
+;-----------------------------------------------------------------------------
+
+;xmm7: DC prediction xmm6: H prediction xmm5: V prediction
+;xmm4: DC pred score xmm3: H pred score xmm2: V pred score
+%macro INTRA_SAD16 1
+cglobal x264_intra_sad_x3_16x16_%1,3,5
+ pxor mm0, mm0
+ pxor mm1, mm1
+ psadbw mm0, [r1-FDEC_STRIDE+0]
+ psadbw mm1, [r1-FDEC_STRIDE+8]
+ paddw mm0, mm1
+ movd r3d, mm0
+%ifidn %1, ssse3
+ mova m1, [pb_3 GLOBAL]
+%endif
+%assign n 0
+%rep 16
+ movzx r4d, byte [r1-1+FDEC_STRIDE*n]
+ add r3d, r4d
+%assign n n+1
+%endrep
+ add r3d, 16
+ shr r3d, 5
+ imul r3d, 0x01010101
+ movd m7, r3d
+ mova m5, [r1-FDEC_STRIDE]
+%if mmsize==16
+ pshufd m7, m7, 0
+%else
+ mova m1, [r1-FDEC_STRIDE+8]
+ punpckldq m7, m7
+%endif
+ pxor m4, m4
+ pxor m3, m3
+ pxor m2, m2
+ mov r3d, 15*FENC_STRIDE
+.vloop:
+ SPLATB m6, r1+r3*2-1, m1
+ mova m0, [r0+r3]
+ psadbw m0, m7
+ paddw m4, m0
+ mova m0, [r0+r3]
+ psadbw m0, m5
+ paddw m2, m0
+%if mmsize==8
+ mova m0, [r0+r3]
+ psadbw m0, m6
+ paddw m3, m0
+ mova m0, [r0+r3+8]
+ psadbw m0, m7
+ paddw m4, m0
+ mova m0, [r0+r3+8]
+ psadbw m0, m1
+ paddw m2, m0
+ psadbw m6, [r0+r3+8]
+ paddw m3, m6
+%else
+ psadbw m6, [r0+r3]
+ paddw m3, m6
+%endif
+ add r3d, -FENC_STRIDE
+ jge .vloop
+%if mmsize==16
+ pslldq m3, 4
+ por m3, m2
+ movhlps m1, m3
+ paddw m3, m1
+ movq [r2+0], m3
+ movhlps m1, m4
+ paddw m4, m1
+%else
+ movd [r2+0], m2
+ movd [r2+4], m3
+%endif
+ movd [r2+8], m4
+ RET
+%endmacro
+
+INIT_MMX
+%define SPLATB SPLATB_MMX
+INTRA_SAD16 mmxext
+INIT_XMM
+INTRA_SAD16 sse2
+%define SPLATB SPLATB_SSSE3
+INTRA_SAD16 ssse3
@@ -694,7 +788,7 @@
and eax, 0x37
cmp eax, 0x30
jle x264_pixel_sad_16x%2_sse2
- PROLOGUE 4,6,0
+ PROLOGUE 4,6
mov r4d, r2d
and r4d, 15
%ifidn %1, ssse3
@@ -704,11 +798,10 @@
shl r4d, 4 ; code size = 80
%endif
%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
-%ifdef PIC64
+%ifdef PIC
lea r5, [sad_w16_addr GLOBAL]
add r5, r4
%else
- picgetgot r5
lea r5, [sad_w16_addr + r4 GLOBAL]
%endif
and r2, ~15
@@ -728,18 +821,10 @@
jle x264_pixel_sad_%1x%2_mmxext
and eax, 7
shl eax, 3
-%ifdef PIC32
- ; both versions work, but picgetgot is slower than gpr->mmx is slower than mem->mmx
- mov r2, 64
- sub r2, eax
- movd mm7, eax
- movd mm6, r2
-%else
movd mm6, [sw_64 GLOBAL]
movd mm7, eax
psubw mm6, mm7
-%endif
- PROLOGUE 4,5,0
+ PROLOGUE 4,5
and r2, ~7
mov r4d, %3
pxor mm0, mm0
@@ -825,11 +910,11 @@
call x264_pixel_sad_%1x%2_cache%3_%5
mov [r11], eax
pop r2
- mov r0, r10
+ mov r0, r10
call x264_pixel_sad_%1x%2_cache%3_%5
mov [r11+4], eax
pop r2
- mov r0, r10
+ mov r0, r10
call x264_pixel_sad_%1x%2_cache%3_%5
mov [r11+8], eax
%else
@@ -875,15 +960,15 @@
call x264_pixel_sad_%1x%2_cache%3_%5
mov [r11], eax
pop r2
- mov r0, r10
+ mov r0, r10
call x264_pixel_sad_%1x%2_cache%3_%5
mov [r11+4], eax
pop r2
- mov r0, r10
+ mov r0, r10
call x264_pixel_sad_%1x%2_cache%3_%5
mov [r11+8], eax
pop r2
- mov r0, r10
+ mov r0, r10
call x264_pixel_sad_%1x%2_cache%3_%5
mov [r11+12], eax
%else
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/util.h
@@ -69,7 +69,7 @@
"jg 1b \n"
"movq %%mm4, %0 \n"
:"=m"(output), "+r"(i_mvc)
- :"r"(mvc)
+ :"r"(mvc), "m"(*(struct {int16_t x[4];} *)mvc)
);
sum += output[0] + output[1] + output[2] + output[3];
return sum;
@@ -90,7 +90,7 @@
"psadbw %%mm7, %%mm1 \n"
"movd %%mm1, %0 \n"
:"=r"(count)
- :"r"(v)
+ :"r"(v), "m"(*(struct {int16_t x[16];} *)v)
);
return (count+0x10)&0xff;
}
@@ -100,7 +100,7 @@
{
if(i_count == 128)
{
- int nonzero;
+ int nonzero = 0;
asm(
"movq (%1), %%mm0 \n"
"por 8(%1), %%mm0 \n"
@@ -121,7 +121,7 @@
"packsswb %%mm0, %%mm0 \n"
"movd %%mm0, %0 \n"
:"=r"(nonzero)
- :"r"(v)
+ :"r"(v), "m"(*(struct {int16_t x[64];} *)v)
);
return !!nonzero;
}
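The util.h hunk above adds dummy "m" inputs of the form *(struct {int16_t x[N];} *)ptr to each asm statement. Without such an operand (or a blanket "memory" clobber) GCC only sees the pointer value being used, so it may reorder or even discard the stores that filled the array before the asm runs. A minimal sketch of the idiom, assuming GCC/Clang extended asm on x86; the function itself is made up for illustration:

#include <stdint.h>

/* Returns v[0] + v[1].  The anonymous-struct cast passes "the 4 bytes behind
 * the pointer" as a memory input, so the compiler knows the asm reads them. */
static inline int sum2_asm( const int16_t *v )
{
    int out;
    asm( "movswl (%1), %0     \n"
         "movswl 2(%1), %%eax \n"
         "addl   %%eax, %0    \n"
         : "=&r"(out)
         : "r"(v), "m"(*(const struct { int16_t x[2]; } *)v)
         : "eax" );
    return out;
}

int main( void )
{
    int16_t v[2] = { 3, 4 };
    return sum2_asm( v ) == 7 ? 0 : 1;
}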
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/x86inc.asm
@@ -37,57 +37,26 @@
%endif
%endmacro
-; PIC support macros. All these macros are totally harmless when PIC is
-; not defined but can ruin everything if misused in PIC mode. On x86_32, shared
-; objects cannot directly access global variables by address, they need to
-; go through the GOT (global offset table). Most OSes do not care about it
-; and let you load non-shared .so objects (Linux, Win32...). However, OS X
-; requires PIC code in its .dylib objects.
-;
-; - GLOBAL should be used as a suffix for global addressing, eg.
-; picgetgot ebx
+; PIC support macros.
+; x86_64 can't fit 64bit address literals in most instruction types,
+; so shared objects (under the assumption that they might be anywhere
+; in memory) must use an address mode that does fit.
+; So all accesses to global variables must use this macro, e.g.
; mov eax, [foo GLOBAL]
-; instead of
+; instead of
; mov eax, [foo]
;
-; - picgetgot computes the GOT address into the given register in PIC
-; mode, otherwise does nothing. You need to do this before using GLOBAL.
-; Before in both execution order and compiled code order (so GLOBAL knows
-; which register the GOT is in).
+; x86_32 doesn't require PIC.
+; Some distros prefer shared objects to be PIC, but nothing breaks if
+; the code contains a few textrels, so we'll skip that complexity.
-%ifndef PIC
- %define GLOBAL
- %macro picgetgot 1
- %endmacro
-%elifdef ARCH_X86_64
- %define PIC64
+%ifndef ARCH_X86_64
+ %undef PIC
+%endif
+%ifdef PIC
%define GLOBAL wrt rip
- %macro picgetgot 1
- %endmacro
%else
- %define PIC32
- %ifidn __OUTPUT_FORMAT__,macho
- ; There is no real global offset table on OS X, but we still
- ; need to reference our variables by offset.
- %macro picgetgot 1
- call %%getgot
- %%getgot:
- pop %1
- add %1, $$ - %%getgot
- %undef GLOBAL
- %define GLOBAL + %1 - fakegot
- %endmacro
- %else ; elf
- extern _GLOBAL_OFFSET_TABLE_
- %macro picgetgot 1
- call %%getgot
- %%getgot:
- pop %1
- add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc
- %undef GLOBAL
- %define GLOBAL + %1 wrt ..gotoff
- %endmacro
- %endif
+ %define GLOBAL
%endif
; Macros to eliminate most code duplication between x86_32 and x86_64:
@@ -97,14 +66,13 @@
; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
-; %2 = number of registers used, not including PIC. pushes callee-saved regs if needed.
-; %3 = whether global constants are used in this function. inits x86_32 PIC if needed.
-; %4 = list of names to define to registers
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal
; e.g.
-; cglobal foo, 2,3,0, dst, src, tmp
-; declares a function (foo), taking two args (dst and src), one local variable (tmp), and not using globals
+; cglobal foo, 2,3, dst, src, tmp
+; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
@@ -240,12 +208,12 @@
%endif
%endmacro
-%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
+%macro PROLOGUE 2-3+ ; #args, #regs, arg_names...
ASSERT %2 >= %1
ASSERT %2 <= 7
%assign stack_offset 0
LOAD_IF_USED 6, %1
- DEFINE_ARGS %4
+ DEFINE_ARGS %3
%endmacro
%macro RET 0
@@ -288,15 +256,10 @@
%endif
%endmacro
-%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
+%macro PROLOGUE 2-3+ ; #args, #regs, arg_names...
ASSERT %2 >= %1
%assign stack_offset 0
%assign regs_used %2
- %ifdef PIC
- %if %3
- %assign regs_used regs_used+1
- %endif
- %endif
ASSERT regs_used <= 7
PUSH_IF_USED 3
PUSH_IF_USED 4
@@ -309,10 +272,7 @@
LOAD_IF_USED 4, %1
LOAD_IF_USED 5, %1
LOAD_IF_USED 6, %1
- %if %3
- picgetgot r%2
- %endif
- DEFINE_ARGS %4
+ DEFINE_ARGS %3
%endmacro
%macro RET 0
@@ -502,6 +462,7 @@
%assign %%i 0
%rep num_mmregs
CAT_XDEFINE m, %%i, %1_m %+ %%i
+ CAT_XDEFINE n, m %+ %%i, %%i
%assign %%i %%i+1
%endrep
%endmacro
Changed |
x264-snapshot-20081001-2245.tar.bz2/common/x86/x86util.asm
@@ -18,6 +18,87 @@
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
+%macro SBUTTERFLY 4
+ mova m%4, m%2
+ punpckl%1 m%2, m%3
+ punpckh%1 m%4, m%3
+ SWAP %3, %4
+%endmacro
+
+%macro TRANSPOSE4x4W 5
+ SBUTTERFLY wd, %1, %2, %5
+ SBUTTERFLY wd, %3, %4, %5
+ SBUTTERFLY dq, %1, %3, %5
+ SBUTTERFLY dq, %2, %4, %5
+ SWAP %2, %3
+%endmacro
+
+%macro TRANSPOSE2x4x4W 5
+ SBUTTERFLY wd, %1, %2, %5
+ SBUTTERFLY wd, %3, %4, %5
+ SBUTTERFLY dq, %1, %3, %5
+ SBUTTERFLY dq, %2, %4, %5
+ SBUTTERFLY qdq, %1, %2, %5
+ SBUTTERFLY qdq, %3, %4, %5
+%endmacro
+
+%macro TRANSPOSE4x4D 5
+ SBUTTERFLY dq, %1, %2, %5
+ SBUTTERFLY dq, %3, %4, %5
+ SBUTTERFLY qdq, %1, %3, %5
+ SBUTTERFLY qdq, %2, %4, %5
+ SWAP %2, %3
+%endmacro
+
+%macro TRANSPOSE8x8W 9-11
+%ifdef ARCH_X86_64
+ SBUTTERFLY wd, %1, %2, %9
+ SBUTTERFLY wd, %3, %4, %9
+ SBUTTERFLY wd, %5, %6, %9
+ SBUTTERFLY wd, %7, %8, %9
+ SBUTTERFLY dq, %1, %3, %9
+ SBUTTERFLY dq, %2, %4, %9
+ SBUTTERFLY dq, %5, %7, %9
+ SBUTTERFLY dq, %6, %8, %9
+ SBUTTERFLY qdq, %1, %5, %9
+ SBUTTERFLY qdq, %2, %6, %9
+ SBUTTERFLY qdq, %3, %7, %9
+ SBUTTERFLY qdq, %4, %8, %9
+ SWAP %2, %5
+ SWAP %4, %7
+%else
+; in: m0..m7, unless %11 in which case m6 is in %9
+; out: m0..m7, unless %11 in which case m4 is in %10
+; spills into %9 and %10
+%if %0<11
+ movdqa %9, m%7
+%endif
+ SBUTTERFLY wd, %1, %2, %7
+ movdqa %10, m%2
+ movdqa m%7, %9
+ SBUTTERFLY wd, %3, %4, %2
+ SBUTTERFLY wd, %5, %6, %2
+ SBUTTERFLY wd, %7, %8, %2
+ SBUTTERFLY dq, %1, %3, %2
+ movdqa %9, m%3
+ movdqa m%2, %10
+ SBUTTERFLY dq, %2, %4, %3
+ SBUTTERFLY dq, %5, %7, %3
+ SBUTTERFLY dq, %6, %8, %3
+ SBUTTERFLY qdq, %1, %5, %3
+ SBUTTERFLY qdq, %2, %6, %3
+ movdqa %10, m%2
+ movdqa m%3, %9
+ SBUTTERFLY qdq, %3, %7, %2
+ SBUTTERFLY qdq, %4, %8, %2
+ SWAP %2, %5
+ SWAP %4, %7
+%if %0<11
+ movdqa m%5, %10
+%endif
+%endif
+%endmacro
+
%macro ABS1_MMX 2 ; a, tmp
pxor %2, %2
psubw %2, %1
@@ -50,6 +131,40 @@
ABS2 %3, %4, %5, %6
%endmacro
+%macro SPLATB_MMX 3
+ movd %1, [%2-3] ;to avoid crossing a cacheline
+ punpcklbw %1, %1
+%if mmsize==16
+ pshuflw %1, %1, 0xff
+ movlhps %1, %1
+%else
+ pshufw %1, %1, 0xff
+%endif
+%endmacro
+
+%macro SPLATB_SSSE3 3
+ movd %1, [%2-3]
+ pshufb %1, %3
+%endmacro
+
+%macro PALIGNR_MMX 4
+ %ifnidn %4, %2
+ mova %4, %2
+ %endif
+ %if mmsize == 8
+ psllq %1, (8-%3)*8
+ psrlq %4, %3*8
+ %else
+ pslldq %1, 16-%3
+ psrldq %4, %3
+ %endif
+ por %1, %4
+%endmacro
+
+%macro PALIGNR_SSSE3 4
+ palignr %1, %2, %3
+%endmacro
+
%macro SUMSUB_BA 2
paddw %1, %2
paddw %2, %2
@@ -122,3 +237,4 @@
packuswb %1, %1
movh %4, %1
%endmacro
+
Changed |
x264-snapshot-20081001-2245.tar.bz2/configure
@@ -243,7 +243,6 @@
case $host_cpu in
i*86)
ARCH="X86"
- AS="yasm"
ASFLAGS="-O2"
if [ "$SYS" = MACOSX ]; then
ASFLAGS="$ASFLAGS -f macho -DPREFIX"
@@ -256,7 +255,6 @@
;;
x86_64)
ARCH="X86_64"
- AS="yasm"
if [ "$SYS" = MACOSX ];then
ASFLAGS="-f macho64 -m amd64 -DPIC -DPREFIX"
CFLAGS="$CFLAGS -arch x86_64"
@@ -309,15 +307,12 @@
fi
if [ $asm = yes -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
- if [ $ARCH = X86 -a $pic = yes -a x$AS = xyasm -a\
- "`yasm --version 2>$DEVNULL | head -n 1`" "<" "yasm 0.6.2" ] ; then
- echo "yasm prior to 0.6.2 miscompiles PIC. trying nasm instead..."
- AS=nasm
- fi
if as_check "pabsw xmm0, xmm0" ; then
CFLAGS="$CFLAGS -DHAVE_MMX"
else
- echo "No suitable assembler found. Install 'yasm' to get MMX/SSE optimized code."
+ VER=`([ $AS == nasm ] && nasm -v || $AS --version || echo no assembler) 2>$DEVNULL | head -n 1`
+ echo "Found $VER"
+ echo "Minimum version is yasm-0.6.1 or nasm-2.0"
echo "If you really want to compile without asm, configure with --disable-asm."
exit 1
fi
@@ -455,15 +450,25 @@
echo 'IMPLIBNAME=libx264.dll.a' >> config.mak
echo 'SOFLAGS=-Wl,--out-implib,$(IMPLIBNAME) -Wl,--enable-auto-image-base' >> config.mak
elif [ "$SYS" = "MACOSX" ]; then
+ echo "SOSUFFIX=dylib" >> config.mak
echo "SONAME=libx264.$API.dylib" >> config.mak
- echo 'SOFLAGS=-dynamiclib -Wl,-single_module -Wl,-read_only_relocs,suppress' >> config.mak
+ echo 'SOFLAGS=-dynamiclib -Wl,-single_module -Wl,-read_only_relocs,suppress -install_name $(DESTDIR)$(libdir)/$(SONAME)' >> config.mak
else
+ echo "SOSUFFIX=so" >> config.mak
echo "SONAME=libx264.so.$API" >> config.mak
echo 'SOFLAGS=-Wl,-soname,$(SONAME)' >> config.mak
fi
echo 'default: $(SONAME)' >> config.mak
if [ "$gtk" = "yes" ]; then
- echo "SONAMEGTK=libx264gtk.so.$API" >> gtk/config.mak
+ if [ "$SYS" = "MACOSX" ]; then
+ echo "SOSUFFIX=dylib" >> gtk/config.mak
+ echo "SONAMEGTK=libx264gtk.$API.dylib" >> gtk/config.mak
+ echo 'SOFLAGS=-dynamiclib -Wl,-single_module -Wl,-read_only_relocs,suppress -install_name $(DESTDIR)$(libdir)/$(SONAMEGTK)' >> gtk/config.mak
+ else
+ echo "SOSUFFIX=so" >> gtk/config.mak
+ echo "SONAMEGTK=libx264gtk.so.$API" >> gtk/config.mak
+ echo 'SOFLAGS=-Wl,-soname,$(SONAMEGTK)' >> gtk/config.mak
+ fi
fi
fi
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/analyse.c
@@ -205,7 +205,7 @@
{
/* conduct the analysis using this lambda and QP */
a->i_qp = h->mb.i_qp = i_qp;
- h->mb.i_chroma_qp = i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
+ h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
a->i_lambda = x264_lambda_tab[i_qp];
a->i_lambda2 = x264_lambda2_tab[i_qp];
a->b_mbrd = h->param.analyse.i_subpel_refine >= 6 &&
@@ -225,8 +225,8 @@
a->i_satd_i4x4 =
a->i_satd_i8x8chroma = COST_MAX;
- /* non-RD PCM decision is inaccurate, so don't do it */
- a->i_satd_pcm = a->b_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
+ /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
+ a->i_satd_pcm = !h->mb.i_psy_rd && a->b_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
a->b_fast_intra = 0;
h->mb.i_skip_intra =
@@ -467,6 +467,58 @@
}
}
+/* For trellis=2, we need to do this for both sizes of DCT; for trellis=1 we only need to use it on the chosen mode. */
+static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
+{
+ DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] );
+ DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
+ DECLARE_ALIGNED_16( uint8_t zero[16*FDEC_STRIDE] ) = {0};
+ int i;
+
+ if( do_both_dct || h->mb.b_transform_8x8 )
+ {
+ h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], zero );
+ for( i = 0; i < 4; i++ )
+ h->zigzagf.scan_8x8( h->mb.pic.fenc_dct8[i], dct8x8[i] );
+ }
+ if( do_both_dct || !h->mb.b_transform_8x8 )
+ {
+ h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], zero );
+ for( i = 0; i < 16; i++ )
+ h->zigzagf.scan_4x4( h->mb.pic.fenc_dct4[i], dct4x4[i] );
+ }
+}
+
+/* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
+static inline void x264_mb_cache_fenc_satd( x264_t *h )
+{
+ DECLARE_ALIGNED_16(uint8_t zero[16]) = {0};
+ uint8_t *fenc;
+ int x, y, satd_sum = 0, sa8d_sum = 0;
+ if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
+ x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
+ if( !h->mb.i_psy_rd )
+ return;
+ for( y = 0; y < 4; y++ )
+ for( x = 0; x < 4; x++ )
+ {
+ fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
+ h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
+ - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
+ satd_sum += h->mb.pic.fenc_satd[y][x];
+ }
+ for( y = 0; y < 2; y++ )
+ for( x = 0; x < 2; x++ )
+ {
+ fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
+ h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
+ - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
+ sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
+ }
+ h->mb.pic.fenc_satd_sum = satd_sum;
+ h->mb.pic.fenc_sa8d_sum = sa8d_sum;
+}
+
static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
{
int i;
@@ -498,7 +550,7 @@
h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE, p_srcc[0], FENC_STRIDE );
satdv[I_PRED_CHROMA_P] =
h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE, p_srcc[1], FENC_STRIDE );
-
+
for( i=0; i<i_max; i++ )
{
int i_mode = predict_mode[i];
@@ -517,8 +569,13 @@
int i_mode = predict_mode[i];
/* we do the prediction */
- h->predict_8x8c[i_mode]( p_dstc[0] );
- h->predict_8x8c[i_mode]( p_dstc[1] );
+ if( h->mb.b_lossless )
+ x264_predict_lossless_8x8_chroma( h, i_mode );
+ else
+ {
+ h->predict_8x8c[i_mode]( p_dstc[0] );
+ h->predict_8x8c[i_mode]( p_dstc[1] );
+ }
/* we calculate the cost */
i_satd = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE,
@@ -544,7 +601,7 @@
int i, idx;
int i_max;
int predict_mode[9];
- int b_merged_satd = h->pixf.intra_satd_x3_16x16 && h->pixf.mbcmp[0] == h->pixf.satd[0];
+ int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
/*---------------- Try all mode and calculate their score ---------------*/
@@ -553,7 +610,7 @@
if( b_merged_satd && i_max == 4 )
{
- h->pixf.intra_satd_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
+ h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
h->predict_16x16[I_PRED_16x16_P]( p_dst );
a->i_satd_i16x16_dir[I_PRED_16x16_P] =
h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
@@ -569,7 +626,11 @@
{
int i_satd;
int i_mode = predict_mode[i];
- h->predict_16x16[i_mode]( p_dst );
+
+ if( h->mb.b_lossless )
+ x264_predict_lossless_16x16( h, i_mode );
+ else
+ h->predict_16x16[i_mode]( p_dst );
i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
@@ -588,7 +649,7 @@
if( flags & X264_ANALYSE_I8x8 )
{
DECLARE_ALIGNED_16( uint8_t edge[33] );
- x264_pixel_cmp_t sa8d = (*h->pixf.mbcmp == *h->pixf.sad) ? h->pixf.sad[PIXEL_8x8] : h->pixf.sa8d[PIXEL_8x8];
+ x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
int i_satd_thresh = a->b_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
int i_cost = 0;
b_merged_satd = h->pixf.intra_sa8d_x3_8x8 && h->pixf.mbcmp[0] == h->pixf.satd[0];
@@ -629,7 +690,10 @@
int i_satd;
int i_mode = predict_mode[i];
- h->predict_8x8[i_mode]( p_dst_by, edge );
+ if( h->mb.b_lossless )
+ x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
+ else
+ h->predict_8x8[i_mode]( p_dst_by, edge );
i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE )
+ a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
@@ -711,8 +775,10 @@
{
int i_satd;
int i_mode = predict_mode[i];
-
- h->predict_4x4[i_mode]( p_dst_by );
+ if( h->mb.b_lossless )
+ x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
+ else
+ h->predict_4x4[i_mode]( p_dst_by );
i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE,
p_src_by, FENC_STRIDE )
@@ -824,7 +890,10 @@
for( i = 0; i < i_max; i++ )
{
i_mode = predict_mode[i];
- h->predict_4x4[i_mode]( p_dst_by );
+ if( h->mb.b_lossless )
+ x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
+ else
+ h->predict_4x4[i_mode]( p_dst_by );
i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
if( i_best > i_satd )
@@ -876,7 +945,10 @@
i_mode = predict_mode[i];
if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
continue;
- h->predict_8x8[i_mode]( p_dst_by, edge );
+ if( h->mb.b_lossless )
+ x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
+ else
+ h->predict_8x8[i_mode]( p_dst_by, edge );
i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
if( i_best > i_satd )
@@ -928,8 +1000,13 @@
for( i = 0; i < i_max; i++ )
{
i_mode = predict_mode[i];
- h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
- h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
+ if( h->mb.b_lossless )
+ x264_predict_lossless_8x8_chroma( h, i_mode );
+ else
+ {
+ h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
+ h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
+ }
/* if we've already found a mode that needs no residual, then
* probably any mode with a residual will be worse.
* so avoid dct on the remaining modes to improve speed. */
@@ -964,7 +1041,7 @@
{
x264_me_t m;
int i_ref, i_mvc;
- DECLARE_ALIGNED_4( int16_t mvc[7][2] );
+ DECLARE_ALIGNED_4( int16_t mvc[8][2] );
int i_halfpel_thresh = INT_MAX;
int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
@@ -1009,7 +1086,7 @@
h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
/* save mv for predicting neighbors */
- *(uint32_t*)a->l0.mvc[i_ref][0] =
+ *(uint32_t*)a->l0.mvc[i_ref][0] =
*(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
}
@@ -1017,12 +1094,15 @@
assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
h->mb.i_type = P_L0;
- if( a->b_mbrd && a->l0.me16x16.i_ref == 0
- && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
+ if( a->b_mbrd )
{
- h->mb.i_partition = D_16x16;
- x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
- a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
+ x264_mb_cache_fenc_satd( h );
+ if( a->l0.me16x16.i_ref == 0 && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
+ {
+ h->mb.i_partition = D_16x16;
+ x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
+ a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
+ }
}
}
@@ -1419,26 +1499,21 @@
}
}
-#define WEIGHTED_AVG( size, pix1, stride1, src2, stride2 ) \
- { \
- if( h->param.analyse.b_weighted_bipred ) \
- h->mc.avg_weight[size]( pix1, stride1, src2, stride2, \
- h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
- else \
- h->mc.avg[size]( pix1, stride1, src2, stride2 ); \
- }
+#define WEIGHTED_AVG( size, pix, stride, src1, stride1, src2, stride2 ) \
+{ \
+ h->mc.avg[size]( pix, stride, src1, stride1, src2, stride2, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
+}
static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{
+ DECLARE_ALIGNED_16( uint8_t pix0[16*16] );
DECLARE_ALIGNED_16( uint8_t pix1[16*16] );
- DECLARE_ALIGNED_16( uint8_t pix2[16*16] );
- uint8_t *src2;
- int stride2 = 16;
- int weight;
+ uint8_t *src0, *src1;
+ int stride0 = 16, stride1 = 16;
x264_me_t m;
int i_ref, i_mvc;
- DECLARE_ALIGNED_4( int16_t mvc[8][2] );
+ DECLARE_ALIGNED_4( int16_t mvc[9][2] );
int i_halfpel_thresh = INT_MAX;
int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
@@ -1504,41 +1579,16 @@
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
/* get cost of BI mode */
- weight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
- if ( (*(uint32_t*)a->l0.me16x16.mv & 0x10001) == 0 )
- {
- /* l0 reference is halfpel, so get_ref on it will make it faster */
- src2 =
- h->mc.get_ref( pix2, &stride2,
- h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
- a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
- 16, 16 );
- h->mc.mc_luma( pix1, 16,
- h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
- a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
- 16, 16 );
- weight = 64 - weight;
- }
- else
- {
- /* if l0 was qpel, we'll use get_ref on l1 instead */
- h->mc.mc_luma( pix1, 16,
- h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
- a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
- 16, 16 );
- src2 =
- h->mc.get_ref( pix2, &stride2,
- h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
- a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
- 16, 16 );
- }
+ src0 = h->mc.get_ref( pix0, &stride0,
+ h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
+ a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16 );
+ src1 = h->mc.get_ref( pix1, &stride1,
+ h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
+ a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16 );
- if( h->param.analyse.b_weighted_bipred )
- h->mc.avg_weight[PIXEL_16x16]( pix1, 16, src2, stride2, weight );
- else
- h->mc.avg[PIXEL_16x16]( pix1, 16, src2, stride2 );
+ h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
- a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix1, 16 )
+ a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
+ REF_COST( 0, a->l0.i_ref )
+ REF_COST( 1, a->l1.i_ref )
+ a->l0.me16x16.cost_mv
@@ -1654,6 +1704,8 @@
const int y8 = i/2;
int i_part_cost;
int i_part_cost_bi = 0;
+ int stride[2] = {8,8};
+ uint8_t *src[2];
for( l = 0; l < 2; l++ )
{
@@ -1672,13 +1724,12 @@
x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
/* BI mode */
- h->mc.mc_luma( pix[l], 8, m->p_fref, m->i_stride[0],
- m->mv[0], m->mv[1], 8, 8 );
+ src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
+ m->mv[0], m->mv[1], 8, 8 );
i_part_cost_bi += m->cost_mv;
/* FIXME: ref cost */
}
-
- WEIGHTED_AVG( PIXEL_8x8, pix[0], 8, pix[1], 8 );
+ h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
+ a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
@@ -1704,7 +1755,7 @@
uint8_t **p_fref[2] =
{ h->mb.pic.p_fref[0][a->l0.i_ref],
h->mb.pic.p_fref[1][a->l1.i_ref] };
- DECLARE_ALIGNED_16( uint8_t pix[2][16*8] );
+ DECLARE_ALIGNED_16( uint8_t pix[2][16*8] );
DECLARE_ALIGNED_4( int16_t mvc[2][2] );
int i, l;
@@ -1715,6 +1766,8 @@
{
int i_part_cost;
int i_part_cost_bi = 0;
+ int stride[2] = {16,16};
+ uint8_t *src[2];
/* TODO: check only the list(s) that were used in b8x8? */
for( l = 0; l < 2; l++ )
@@ -1735,13 +1788,12 @@
x264_me_search( h, m, mvc, 2 );
/* BI mode */
- h->mc.mc_luma( pix[l], 16, m->p_fref, m->i_stride[0],
- m->mv[0], m->mv[1], 16, 8 );
+ src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
+ m->mv[0], m->mv[1], 16, 8 );
/* FIXME: ref cost */
i_part_cost_bi += m->cost_mv;
}
-
- WEIGHTED_AVG( PIXEL_16x8, pix[0], 16, pix[1], 16 );
+ h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
i_part_cost = a->l0.me16x8[i].cost;
@@ -1784,6 +1836,8 @@
{
int i_part_cost;
int i_part_cost_bi = 0;
+ int stride[2] = {8,8};
+ uint8_t *src[2];
for( l = 0; l < 2; l++ )
{
@@ -1803,13 +1857,13 @@
x264_me_search( h, m, mvc, 2 );
/* BI mode */
- h->mc.mc_luma( pix[l], 8, m->p_fref, m->i_stride[0],
- m->mv[0], m->mv[1], 8, 16 );
+ src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
+ m->mv[0], m->mv[1], 8, 16 );
/* FIXME: ref cost */
i_part_cost_bi += m->cost_mv;
}
- WEIGHTED_AVG( PIXEL_8x16, pix[0], 8, pix[1], 8 );
+ h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
i_part_cost = a->l0.me8x16[i].cost;
@@ -1907,7 +1961,7 @@
static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
{
- int thresh = i_satd_inter * 17/16;
+ int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
{
@@ -2006,7 +2060,7 @@
static inline void x264_mb_analyse_transform( x264_t *h )
{
- if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
+ if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
{
int i_cost4, i_cost8;
/* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
@@ -2066,6 +2120,8 @@
/*--------------------------- Do the analysis ---------------------------*/
if( h->sh.i_type == SLICE_TYPE_I )
{
+ if( analysis.b_mbrd )
+ x264_mb_cache_fenc_satd( h );
x264_mb_analyse_intra( h, &analysis, COST_MAX );
if( analysis.b_mbrd )
x264_intra_rd( h, &analysis, COST_MAX );
@@ -2344,6 +2400,9 @@
int i_bskip_cost = COST_MAX;
int b_skip = 0;
+ if( analysis.b_mbrd )
+ x264_mb_cache_fenc_satd( h );
+
h->mb.i_type = B_SKIP;
if( h->mb.b_direct_auto_write )
{
@@ -2558,7 +2617,7 @@
h->mb.i_type = i_type;
h->mb.i_partition = i_partition;
}
-
+
x264_mb_analyse_intra( h, &analysis, i_satd_inter );
if( analysis.b_mbrd )
@@ -2589,6 +2648,8 @@
h->mb.b_trellis = h->param.analyse.i_trellis;
h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
+ if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
+ x264_psy_trellis_init( h, 0 );
if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
h->mb.i_skip_intra = 0;
}
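The largest functional change in the analyse.c hunks is the bipred path: instead of mc_luma into a buffer plus a separate avg/avg_weight pair, both references now go through get_ref and a single h->mc.avg call that takes the bipred weight directly. The per-pixel operation behind that call is the H.264 implicit weighted average with 6-bit weights summing to 64; a scalar sketch of it, with strides and the clip helper written out here purely for illustration:

#include <stdint.h>

static inline uint8_t clip_uint8( int x )
{
    return x < 0 ? 0 : x > 255 ? 255 : x;
}

/* Weighted bipred average, weight1 = 64 - weight2, rounded as in H.264:
 * (a*w1 + b*w2 + 32) >> 6.  The unweighted case is just weight2 == 32. */
static void pixel_avg_weight( uint8_t *dst, int i_dst,
                              const uint8_t *src1, int i_src1,
                              const uint8_t *src2, int i_src2,
                              int width, int height, int i_weight2 )
{
    int i_weight1 = 64 - i_weight2;
    int x, y;
    for( y = 0; y < height; y++ )
    {
        for( x = 0; x < width; x++ )
            dst[x] = clip_uint8( ( src1[x]*i_weight1 + src2[x]*i_weight2 + 32 ) >> 6 );
        dst  += i_dst;
        src1 += i_src1;
        src2 += i_src2;
    }
}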
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/cabac.c
@@ -790,7 +790,7 @@
if( i_mb_type == I_PCM )
{
i_mb_pos_tex = x264_cabac_pos( cb );
- h->stat.frame.i_hdr_bits += i_mb_pos_tex - i_mb_pos_start;
+ h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
memcpy( cb->p, h->mb.pic.p_fenc[0], 256 );
cb->p += 256;
@@ -811,7 +811,7 @@
h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 );
h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, 8 );
- h->stat.frame.i_itex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex;
+ h->stat.frame.i_tex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex;
return;
}
#endif
@@ -963,7 +963,7 @@
#ifndef RDO_SKIP_BS
i_mb_pos_tex = x264_cabac_pos( cb );
- h->stat.frame.i_hdr_bits += i_mb_pos_tex - i_mb_pos_start;
+ h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
#endif
if( i_mb_type != I_16x16 )
@@ -1018,10 +1018,7 @@
}
#ifndef RDO_SKIP_BS
- if( IS_INTRA( i_mb_type ) )
- h->stat.frame.i_itex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex;
- else
- h->stat.frame.i_ptex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex;
+ h->stat.frame.i_tex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex;
#endif
}
@@ -1032,7 +1029,7 @@
* works on all partition sizes except 16x16
* for sub8x8, call once per 8x8 block
*****************************************************************************/
-void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int i_pixel )
+static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int i_pixel )
{
const int i_mb_type = h->mb.i_type;
int j;
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/cavlc.c
@@ -116,9 +116,7 @@
/* total/trailing */
if( i_idx == BLOCK_INDEX_CHROMA_DC )
- {
bs_write_vlc( s, x264_coeff_token[4][i_total*4+i_trailing] );
- }
else
{
/* x264_mb_predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */
@@ -132,9 +130,7 @@
i_suffix_length = i_total > 10 && i_trailing < 3 ? 1 : 0;
if( i_trailing > 0 )
- {
bs_write( s, i_trailing, i_sign );
- }
for( i = i_trailing; i < i_total; i++ )
{
int mask = level[i] >> 15;
@@ -145,19 +141,13 @@
i_level_code -= 2; /* as level[i] can't be 1 for the first one if i_trailing < 3 */
if( ( i_level_code >> i_suffix_length ) < 14 )
- {
bs_write( s, (i_level_code >> i_suffix_length) + 1 + i_suffix_length,
(1<<i_suffix_length) + (i_level_code & ((1<<i_suffix_length)-1)) );
- }
else if( i_suffix_length == 0 && i_level_code < 30 )
- {
bs_write( s, 19, (1<<4) + (i_level_code - 14) );
- }
else if( i_suffix_length > 0 && ( i_level_code >> i_suffix_length ) == 14 )
- {
bs_write( s, 15 + i_suffix_length,
(1<<i_suffix_length) + (i_level_code & ((1<<i_suffix_length)-1)) );
- }
else
{
int i_level_prefix = 15;
@@ -344,7 +334,7 @@
{
bs_write_ue( s, i_mb_i_offset + 25 );
i_mb_pos_tex = bs_pos( s );
- h->stat.frame.i_hdr_bits += i_mb_pos_tex - i_mb_pos_start;
+ h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
bs_align_0( s );
@@ -362,7 +352,7 @@
h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 );
h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, 8 );
- h->stat.frame.i_itex_bits += bs_pos(s) - i_mb_pos_tex;
+ h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex;
return;
}
#endif
@@ -384,16 +374,10 @@
int i_pred = x264_mb_predict_intra4x4_mode( h, i );
int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] );
- if( i_pred == i_mode)
- {
+ if( i_pred == i_mode )
bs_write1( s, 1 ); /* b_prev_intra4x4_pred_mode */
- }
else
- {
- if( i_mode >= i_pred )
- i_mode--;
- bs_write( s, 4, i_mode );
- }
+ bs_write( s, 4, i_mode - (i_mode > i_pred) );
}
bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
}
@@ -412,9 +396,7 @@
bs_write_ue( s, 0 );
if( h->mb.pic.i_fref[0] > 1 )
- {
bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
- }
x264_mb_predict_mv( h, 0, 0, 4, mvp );
bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][0] - mvp[0] );
bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][1] - mvp[1] );
@@ -457,9 +439,8 @@
else if( i_mb_type == P_8x8 )
{
int b_sub_ref0;
-
- if( h->mb.cache.ref[0][x264_scan8[0]] == 0 && h->mb.cache.ref[0][x264_scan8[4]] == 0 &&
- h->mb.cache.ref[0][x264_scan8[8]] == 0 && h->mb.cache.ref[0][x264_scan8[12]] == 0 )
+ if( (h->mb.cache.ref[0][x264_scan8[0]] | h->mb.cache.ref[0][x264_scan8[ 4]] |
+ h->mb.cache.ref[0][x264_scan8[8]] | h->mb.cache.ref[0][x264_scan8[12]]) == 0 )
{
bs_write_ue( s, 4 );
b_sub_ref0 = 0;
@@ -469,11 +450,14 @@
bs_write_ue( s, 3 );
b_sub_ref0 = 1;
}
+
/* sub mb type */
- for( i = 0; i < 4; i++ )
- {
- bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i] ] );
- }
+ if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
+ for( i = 0; i < 4; i++ )
+ bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i] ] );
+ else
+ bs_write( s, 4, 0xf );
+
/* ref0 */
if( h->mb.pic.i_fref[0] > 1 && b_sub_ref0 )
{
@@ -492,24 +476,16 @@
/* sub mb type */
for( i = 0; i < 4; i++ )
- {
bs_write_ue( s, sub_mb_type_b_to_golomb[ h->mb.i_sub_partition[i] ] );
- }
+
/* ref */
for( i = 0; i < 4; i++ )
- {
if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
- {
bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[i*4]] );
- }
- }
for( i = 0; i < 4; i++ )
- {
if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
- {
bs_write_te( s, h->mb.pic.i_fref[1] - 1, h->mb.cache.ref[1][x264_scan8[i*4]] );
- }
- }
+
/* mvd */
for( i = 0; i < 4; i++ )
cavlc_mb8x8_mvd( h, s, 0, i );
@@ -532,30 +508,27 @@
b_list[1][i] = x264_mb_type_list1_table[i_mb_type][i];
}
-
bs_write_ue( s, mb_type_b_to_golomb[ h->mb.i_partition - D_16x8 ][ i_mb_type - B_L0_L0 ] );
for( i_list = 0; i_list < 2; i_list++ )
{
- const int i_ref_max = i_list == 0 ? h->mb.pic.i_fref[0] : h->mb.pic.i_fref[1];
+ const int i_ref_max = (i_list == 0 ? h->mb.pic.i_fref[0] : h->mb.pic.i_fref[1]) - 1;
- if( i_ref_max > 1 )
- {
+ if( i_ref_max )
switch( h->mb.i_partition )
{
case D_16x16:
- if( b_list[i_list][0] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[0]] );
+ if( b_list[i_list][0] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[0]] );
break;
case D_16x8:
- if( b_list[i_list][0] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[0]] );
- if( b_list[i_list][1] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[8]] );
+ if( b_list[i_list][0] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[0]] );
+ if( b_list[i_list][1] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[8]] );
break;
case D_8x16:
- if( b_list[i_list][0] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[0]] );
- if( b_list[i_list][1] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[4]] );
+ if( b_list[i_list][0] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[0]] );
+ if( b_list[i_list][1] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[4]] );
break;
}
- }
}
for( i_list = 0; i_list < 2; i_list++ )
{
@@ -601,9 +574,7 @@
}
}
else if( i_mb_type == B_DIRECT )
- {
bs_write_ue( s, 0 );
- }
else
{
x264_log(h, X264_LOG_ERROR, "invalid/unhandled mb_type\n" );
@@ -612,24 +583,18 @@
#ifndef RDO_SKIP_BS
i_mb_pos_tex = bs_pos( s );
- h->stat.frame.i_hdr_bits += i_mb_pos_tex - i_mb_pos_start;
+ h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
#endif
/* Coded block pattern */
if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
- {
bs_write_ue( s, intra4x4_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
- }
else if( i_mb_type != I_16x16 )
- {
bs_write_ue( s, inter_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
- }
/* transform size 8x8 flag */
if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma )
- {
bs_write1( s, h->mb.b_transform_8x8 );
- }
/* write residual */
if( i_mb_type == I_16x16 )
@@ -640,19 +605,19 @@
block_residual_write_cavlc( h, s, BLOCK_INDEX_LUMA_DC , h->dct.luma16x16_dc, 16 );
/* AC Luma */
- if( h->mb.i_cbp_luma != 0 )
+ if( h->mb.i_cbp_luma )
for( i = 0; i < 16; i++ )
{
h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
block_residual_write_cavlc( h, s, i, h->dct.luma4x4[i]+1, 15 );
}
}
- else if( h->mb.i_cbp_luma != 0 || h->mb.i_cbp_chroma != 0 )
+ else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )
{
cavlc_qp_delta( h, s );
x264_macroblock_luma_write_cavlc( h, s, 0, 3 );
}
- if( h->mb.i_cbp_chroma != 0 )
+ if( h->mb.i_cbp_chroma )
{
/* Chroma DC residual present */
block_residual_write_cavlc( h, s, BLOCK_INDEX_CHROMA_DC, h->dct.chroma_dc[0], 4 );
@@ -666,10 +631,7 @@
}
#ifndef RDO_SKIP_BS
- if( IS_INTRA( i_mb_type ) )
- h->stat.frame.i_itex_bits += bs_pos(s) - i_mb_pos_tex;
- else
- h->stat.frame.i_ptex_bits += bs_pos(s) - i_mb_pos_tex;
+ h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex;
#endif
}
@@ -680,7 +642,7 @@
* works on all partition sizes except 16x16
* for sub8x8, call once per 8x8 block
*****************************************************************************/
-int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
+static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
{
bs_t s;
const int i_mb_type = h->mb.i_type;
@@ -770,7 +732,7 @@
static int x264_i8x8_chroma_size_cavlc( x264_t *h )
{
h->out.bs.i_bits_encoded = bs_size_ue( x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
- if( h->mb.i_cbp_chroma != 0 )
+ if( h->mb.i_cbp_chroma )
{
block_residual_write_cavlc( h, &h->out.bs, BLOCK_INDEX_CHROMA_DC, h->dct.chroma_dc[0], 4 );
block_residual_write_cavlc( h, &h->out.bs, BLOCK_INDEX_CHROMA_DC, h->dct.chroma_dc[1], 4 );
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/encoder.c
@@ -148,7 +148,7 @@
/* If effective qp <= 15, deblocking would have no effect anyway */
if( param->b_deblocking_filter
&& ( h->mb.b_variable_qp
- || 15 < i_qp + 2 * X264_MAX(param->i_deblocking_filter_alphac0, param->i_deblocking_filter_beta) ) )
+ || 15 < i_qp + 2 * X264_MIN(param->i_deblocking_filter_alphac0, param->i_deblocking_filter_beta) ) )
{
sh->i_disable_deblocking_filter_idc = 0;
}
@@ -237,7 +237,7 @@
{
bs_write_ue( s, sh->ref_pic_list_order[0][i].idc );
bs_write_ue( s, sh->ref_pic_list_order[0][i].arg );
-
+
}
bs_write_ue( s, 3 );
}
@@ -403,13 +403,16 @@
h->param.rc.i_rc_method = X264_RC_CQP;
h->param.rc.f_ip_factor = 1;
h->param.rc.f_pb_factor = 1;
- h->param.analyse.b_transform_8x8 = 0;
h->param.analyse.b_psnr = 0;
h->param.analyse.b_ssim = 0;
h->param.analyse.i_chroma_qp_offset = 0;
h->param.analyse.i_trellis = 0;
h->param.analyse.b_fast_pskip = 0;
h->param.analyse.i_noise_reduction = 0;
+ h->param.analyse.f_psy_rd = 0;
+ /* 8x8dct is not useful at all in CAVLC lossless */
+ if( !h->param.b_cabac )
+ h->param.analyse.b_transform_8x8 = 0;
}
if( h->param.rc.i_rc_method == X264_RC_CQP )
{
@@ -429,7 +432,7 @@
// There's nothing special about 1080 in that the warning still applies to it,
// but chances are the user can't help it if his content is already 1080p,
// so there's no point in warning in that case.
- x264_log( h, X264_LOG_WARNING,
+ x264_log( h, X264_LOG_WARNING,
"width or height not divisible by 16 (%dx%d), compression will suffer.\n",
h->param.i_width, h->param.i_height );
}
@@ -442,7 +445,8 @@
h->param.i_bframe = x264_clip3( h->param.i_bframe, 0, X264_BFRAME_MAX );
h->param.i_bframe_bias = x264_clip3( h->param.i_bframe_bias, -90, 100 );
h->param.b_bframe_pyramid = h->param.b_bframe_pyramid && h->param.i_bframe > 1;
- h->param.b_bframe_adaptive = h->param.b_bframe_adaptive && h->param.i_bframe > 0;
+ if( !h->param.i_bframe )
+ h->param.i_bframe_adaptive = X264_B_ADAPT_NONE;
h->param.analyse.b_weighted_bipred = h->param.analyse.b_weighted_bipred && h->param.i_bframe > 0;
h->mb.b_direct_auto_write = h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
&& h->param.i_bframe
@@ -487,12 +491,29 @@
if( !h->param.b_cabac )
h->param.analyse.i_trellis = 0;
h->param.analyse.i_trellis = x264_clip3( h->param.analyse.i_trellis, 0, 2 );
- h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 2 );
+ if( !h->param.analyse.i_trellis )
+ h->param.analyse.f_psy_trellis = 0;
+ h->param.analyse.f_psy_rd = x264_clip3f( h->param.analyse.f_psy_rd, 0, 10 );
+ h->param.analyse.f_psy_trellis = x264_clip3f( h->param.analyse.f_psy_trellis, 0, 10 );
+ if( h->param.analyse.i_subpel_refine < 6 )
+ h->param.analyse.f_psy_rd = 0;
+ h->mb.i_psy_rd = FIX8( h->param.analyse.f_psy_rd );
+ /* Psy RDO increases overall quantizers to improve the quality of luma--this indirectly hurts chroma quality */
+ /* so we lower the chroma QP offset to compensate */
+ /* This can be triggered repeatedly on multiple calls to parameter_validate, but since encoding
+ * uses the pps chroma qp offset not the param chroma qp offset, this is not a problem. */
+ if( h->mb.i_psy_rd )
+ h->param.analyse.i_chroma_qp_offset -= h->param.analyse.f_psy_rd < 0.25 ? 1 : 2;
+ h->mb.i_psy_trellis = FIX8( h->param.analyse.f_psy_trellis / 4 );
+ /* Psy trellis has a similar effect. */
+ if( h->mb.i_psy_trellis )
+ h->param.analyse.i_chroma_qp_offset -= h->param.analyse.f_psy_trellis < 0.25 ? 1 : 2;
+ else
+ h->mb.i_psy_trellis = 0;
+ h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -12, 12);
+ h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 1 );
if( h->param.rc.f_aq_strength <= 0 )
h->param.rc.i_aq_mode = 0;
- /* VAQ effectively replaces qcomp, so qcomp is raised towards 1 to compensate. */
- if( h->param.rc.i_aq_mode == X264_AQ_GLOBAL )
- h->param.rc.f_qcompress = x264_clip3f(h->param.rc.f_qcompress + h->param.rc.f_aq_strength / 0.7, 0, 1);
h->param.analyse.i_noise_reduction = x264_clip3( h->param.analyse.i_noise_reduction, 0, 1<<16 );
{
@@ -583,7 +604,9 @@
static void mbcmp_init( x264_t *h )
{
int satd = !h->mb.b_lossless && h->param.analyse.i_subpel_refine > 1;
- memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp) );
+ memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad_aligned, sizeof(h->pixf.mbcmp) );
+ memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
+ h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
satd &= h->param.analyse.i_me_method == X264_ME_TESA;
memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) );
@@ -621,8 +644,6 @@
h->param.rc.psz_stat_out = strdup( h->param.rc.psz_stat_out );
if( h->param.rc.psz_stat_in )
h->param.rc.psz_stat_in = strdup( h->param.rc.psz_stat_in );
- if( h->param.rc.psz_rc_eq )
- h->param.rc.psz_rc_eq = strdup( h->param.rc.psz_rc_eq );
/* VUI */
if( h->param.vui.i_sar_width > 0 && h->param.vui.i_sar_height > 0 )
@@ -672,18 +693,21 @@
x264_free( h );
return NULL;
}
-
+
h->mb.i_mb_count = h->sps->i_mb_width * h->sps->i_mb_height;
/* Init frames. */
- h->frames.i_delay = h->param.i_bframe + h->param.i_threads - 1;
+ if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
+ h->frames.i_delay = X264_MAX(h->param.i_bframe,3)*4 + h->param.i_threads - 1;
+ else
+ h->frames.i_delay = h->param.i_bframe + h->param.i_threads - 1;
h->frames.i_max_ref0 = h->param.i_frame_reference;
h->frames.i_max_ref1 = h->sps->vui.i_num_reorder_frames;
h->frames.i_max_dpb = h->sps->vui.i_max_dec_frame_buffering;
h->frames.b_have_lowres = !h->param.rc.b_stat_read
&& ( h->param.rc.i_rc_method == X264_RC_ABR
|| h->param.rc.i_rc_method == X264_RC_CRF
- || h->param.b_bframe_adaptive
+ || h->param.i_bframe_adaptive
|| h->param.b_pre_scenecut );
h->frames.b_have_lowres |= (h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0);
@@ -694,6 +718,8 @@
h->i_ref0 = 0;
h->i_ref1 = 0;
+ h->chroma_qp_table = i_chroma_qp_table + 12 + h->pps->i_chroma_qp_index_offset;
+
x264_rdo_init( );
/* init CPU functions */
@@ -1235,9 +1261,8 @@
/* Compute misc bits */
h->stat.frame.i_misc_bits = bs_pos( &h->out.bs )
+ NALU_OVERHEAD * 8
- - h->stat.frame.i_itex_bits
- - h->stat.frame.i_ptex_bits
- - h->stat.frame.i_hdr_bits;
+ - h->stat.frame.i_tex_bits
+ - h->stat.frame.i_mv_bits;
}
static void x264_thread_sync_context( x264_t *dst, x264_t *src )
@@ -1256,7 +1281,6 @@
// copy everything except the per-thread pointers and the constants.
memcpy( &dst->i_frame, &src->i_frame, offsetof(x264_t, mb.type) - offsetof(x264_t, i_frame) );
- memcpy( &dst->mb.i_type, &src->mb.i_type, offsetof(x264_t, rc) - offsetof(x264_t, mb.i_type) );
dst->stat = src->stat;
}
@@ -1362,6 +1386,9 @@
if( h->frames.b_have_lowres )
x264_frame_init_lowres( h, fenc );
+ if( h->param.rc.i_aq_mode )
+ x264_adaptive_quant_frame( h, fenc );
+
if( h->frames.i_input <= h->frames.i_delay + 1 - h->param.i_threads )
{
/* Nothing yet to encode */
@@ -1542,7 +1569,7 @@
/* restore CPU state (before using float again) */
x264_emms();
- if( h->sh.i_type == SLICE_TYPE_P && !h->param.rc.b_stat_read
+ if( h->sh.i_type == SLICE_TYPE_P && !h->param.rc.b_stat_read
&& h->param.i_scenecut_threshold >= 0
&& !h->param.b_pre_scenecut )
{
@@ -1603,12 +1630,12 @@
/* If using B-frames, force GOP to be closed.
* Even if this frame is going to be I and not IDR, forcing a
* P-frame before the scenecut will probably help compression.
- *
+ *
* We don't yet know exactly which frame is the scene cut, so
* we can't assign an I-frame. Instead, change the previous
* B-frame to P, and rearrange coding order. */
- if( h->param.b_bframe_adaptive || b > 1 )
+ if( h->param.i_bframe_adaptive || b > 1 )
h->fenc->i_type = X264_TYPE_AUTO;
x264_frame_sort_pts( h->frames.current );
x264_frame_unshift( h->frames.next, h->fenc );
@@ -1734,22 +1761,22 @@
psz_message[0] = '\0';
if( h->param.analyse.b_psnr )
{
- int64_t sqe[3] = {
+ int64_t ssd[3] = {
h->stat.frame.i_ssd[0],
h->stat.frame.i_ssd[1],
h->stat.frame.i_ssd[2],
};
- h->stat.i_sqe_global[h->sh.i_type] += sqe[0] + sqe[1] + sqe[2];
- h->stat.f_psnr_average[h->sh.i_type] += x264_psnr( sqe[0] + sqe[1] + sqe[2], 3 * h->param.i_width * h->param.i_height / 2 );
- h->stat.f_psnr_mean_y[h->sh.i_type] += x264_psnr( sqe[0], h->param.i_width * h->param.i_height );
- h->stat.f_psnr_mean_u[h->sh.i_type] += x264_psnr( sqe[1], h->param.i_width * h->param.i_height / 4 );
- h->stat.f_psnr_mean_v[h->sh.i_type] += x264_psnr( sqe[2], h->param.i_width * h->param.i_height / 4 );
+ h->stat.i_ssd_global[h->sh.i_type] += ssd[0] + ssd[1] + ssd[2];
+ h->stat.f_psnr_average[h->sh.i_type] += x264_psnr( ssd[0] + ssd[1] + ssd[2], 3 * h->param.i_width * h->param.i_height / 2 );
+ h->stat.f_psnr_mean_y[h->sh.i_type] += x264_psnr( ssd[0], h->param.i_width * h->param.i_height );
+ h->stat.f_psnr_mean_u[h->sh.i_type] += x264_psnr( ssd[1], h->param.i_width * h->param.i_height / 4 );
+ h->stat.f_psnr_mean_v[h->sh.i_type] += x264_psnr( ssd[2], h->param.i_width * h->param.i_height / 4 );
snprintf( psz_message, 80, " PSNR Y:%5.2f U:%5.2f V:%5.2f",
- x264_psnr( sqe[0], h->param.i_width * h->param.i_height ),
- x264_psnr( sqe[1], h->param.i_width * h->param.i_height / 4),
- x264_psnr( sqe[2], h->param.i_width * h->param.i_height / 4) );
+ x264_psnr( ssd[0], h->param.i_width * h->param.i_height ),
+ x264_psnr( ssd[1], h->param.i_width * h->param.i_height / 4),
+ x264_psnr( ssd[2], h->param.i_width * h->param.i_height / 4) );
}
if( h->param.analyse.b_ssim )
@@ -1761,7 +1788,7 @@
" SSIM Y:%.5f", ssim_y );
}
psz_message[79] = '\0';
-
+
x264_log( h, X264_LOG_DEBUG,
"frame=%4d QP=%.2f NAL=%d Slice:%c Poc:%-3d I:%-4d P:%-4d SKIP:%-4d size=%d bytes%s\n",
h->i_frame,
@@ -1857,7 +1884,7 @@
(double)h->stat.i_slice_size[i_slice] / i_count,
h->stat.f_psnr_mean_y[i_slice] / i_count, h->stat.f_psnr_mean_u[i_slice] / i_count, h->stat.f_psnr_mean_v[i_slice] / i_count,
h->stat.f_psnr_average[i_slice] / i_count,
- x264_psnr( h->stat.i_sqe_global[i_slice], i_count * i_yuv_size ) );
+ x264_psnr( h->stat.i_ssd_global[i_slice], i_count * i_yuv_size ) );
}
else
{
@@ -2013,7 +2040,7 @@
SUM3( h->stat.f_psnr_mean_u ) / i_count,
SUM3( h->stat.f_psnr_mean_v ) / i_count,
SUM3( h->stat.f_psnr_average ) / i_count,
- x264_psnr( SUM3( h->stat.i_sqe_global ), i_count * i_yuv_size ),
+ x264_psnr( SUM3( h->stat.i_ssd_global ), i_count * i_yuv_size ),
f_bitrate );
}
else
@@ -2028,8 +2055,6 @@
free( h->param.rc.psz_stat_out );
if( h->param.rc.psz_stat_in )
free( h->param.rc.psz_stat_in );
- if( h->param.rc.psz_rc_eq )
- free( h->param.rc.psz_rc_eq );
x264_cqm_delete( h );
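encoder.c renames the squared-error accumulators from sqe to ssd, but the derived statistic is unchanged: frame and global PSNR come from the summed squared error over the number of 8-bit samples involved, PSNR = 10*log10(255^2 * N / SSD). A small sketch of that calculation; the cap for a zero SSD follows the usual convention rather than any specific x264 constant:

#include <math.h>
#include <stdint.h>

/* PSNR in dB from a sum of squared errors over `size` 8-bit samples. */
static double psnr_from_ssd( int64_t ssd, int64_t size )
{
    double mse;
    if( ssd <= 0 )
        return 100.0;                                 /* identical content: report a cap */
    mse = (double)ssd / ( 65025.0 * (double)size );   /* 65025 = 255*255 */
    return -10.0 * log10( mse );
}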
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/macroblock.c
@@ -79,7 +79,25 @@
return i_score;
}
-void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
+static ALWAYS_INLINE void x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx )
+{
+ int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY;
+ if( h->mb.b_trellis )
+ x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, i_ctxBlockCat, b_intra, idx );
+ else
+ h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
+}
+
+static ALWAYS_INLINE void x264_quant_8x8( x264_t *h, int16_t dct[8][8], int i_qp, int b_intra, int idx )
+{
+ int i_quant_cat = b_intra ? CQM_8IY : CQM_8PY;
+ if( h->mb.b_trellis )
+ x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, b_intra, idx );
+ else
+ h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] );
+}
+
+void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
{
uint8_t *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]];
uint8_t *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]];
@@ -93,15 +111,12 @@
h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );
- if( h->mb.b_trellis )
- x264_quant_4x4_trellis( h, dct4x4, CQM_4IY, i_qscale, DCT_LUMA_4x4, 1 );
- else
- h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );
+ x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 1, idx );
if( array_non_zero( dct4x4 ) )
{
h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4 );
- h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
+ h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qp );
/* output samples to fdec */
h->dctf.add4x4_idct( p_dst, dct4x4 );
@@ -110,7 +125,7 @@
memset( h->dct.luma4x4[idx], 0, sizeof(h->dct.luma4x4[idx]));
}
-void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale )
+void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
{
int x = 8 * (idx&1);
int y = 8 * (idx>>1);
@@ -118,19 +133,22 @@
uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );
+ if( h->mb.b_lossless )
+ {
+ h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst );
+ return;
+ }
+
h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );
- if( h->mb.b_trellis )
- x264_quant_8x8_trellis( h, dct8x8, CQM_8IY, i_qscale, 1 );
- else
- h->quantf.quant_8x8( dct8x8, h->quant8_mf[CQM_8IY][i_qscale], h->quant8_bias[CQM_8IY][i_qscale] );
+ x264_quant_8x8( h, dct8x8, i_qp, 1, idx );
h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
- h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qscale );
+ h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp );
h->dctf.add8x8_idct8( p_dst, dct8x8 );
}
-static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
+static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
{
uint8_t *p_src = h->mb.pic.p_fenc[0];
uint8_t *p_dst = h->mb.pic.p_fdec[0];
@@ -162,22 +180,19 @@
dct4x4[i][0][0] = 0;
/* quant/scan/dequant */
- if( h->mb.b_trellis )
- x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IY, i_qscale, DCT_LUMA_AC, 1 );
- else
- h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );
+ x264_quant_4x4( h, dct4x4[i], i_qp, DCT_LUMA_AC, 1, i );
h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[i] );
- h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qscale );
+ h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qp );
}
h->dctf.dct4x4dc( dct_dc4x4 );
- h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qscale][0]>>1, h->quant4_bias[CQM_4IY][i_qscale][0]<<1 );
+ h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qp][0]>>1, h->quant4_bias[CQM_4IY][i_qp][0]<<1 );
h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
/* output samples to fdec */
h->dctf.idct4x4dc( dct_dc4x4 );
- x264_mb_dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qscale ); /* XXX not inversed */
+ x264_mb_dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp ); /* XXX not inversed */
/* calculate dct coeffs */
for( i = 0; i < 16; i++ )
@@ -189,7 +204,7 @@
h->dctf.add16x16_idct( p_dst, dct4x4 );
}
-void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
+void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
{
int i, ch;
int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
@@ -215,7 +230,7 @@
}
continue;
}
-
+
h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
/* calculate dct coeffs */
for( i = 0; i < 4; i++ )
@@ -225,22 +240,20 @@
dct4x4[i][0][0] = 0;
/* no trellis; it doesn't seem to help chroma noticeably */
- h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qscale], h->quant4_bias[CQM_4IC+b_inter][i_qscale] );
+ h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] );
h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*4], dct4x4[i] );
if( b_decimate )
- {
i_decimate_score += x264_mb_decimate_score( h->dct.luma4x4[16+i+ch*4]+1, 15 );
- }
}
h->dctf.dct2x2dc( dct2x2 );
- h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qscale][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qscale][0]<<1 );
+ h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );
zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
/* output samples to fdec */
h->dctf.idct2x2dc( dct2x2 );
- x264_mb_dequant_2x2_dc( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qscale ); /* XXX not inversed */
+ x264_mb_dequant_2x2_dc( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ); /* XXX not inversed */
if( b_decimate && i_decimate_score < 7 )
{
@@ -253,7 +266,7 @@
else
{
for( i = 0; i < 4; i++ )
- h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qscale );
+ h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp );
}
dct4x4[0][0][0] = dct2x2[0][0];
dct4x4[1][0][0] = dct2x2[0][1];
@@ -289,7 +302,7 @@
* x264_macroblock_encode_pskip:
* Encode an already marked skip block
*****************************************************************************/
-void x264_macroblock_encode_pskip( x264_t *h )
+static void x264_macroblock_encode_pskip( x264_t *h )
{
const int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
h->mb.mv_min[0], h->mb.mv_max[0] );
@@ -316,6 +329,74 @@
}
/*****************************************************************************
+ * Intra prediction for predictive lossless mode.
+ *****************************************************************************/
+
+/* Note that these functions take a shortcut (mc.copy instead of actual pixel prediction) which assumes
+ * that the edge pixels of the reconstructed frame are the same as those of the source frame. This means
+ * they will only work correctly if the neighboring blocks are losslessly coded. In practice, this means
+ * lossless mode cannot be mixed with lossy mode within a frame. */
+/* This can be resolved by explicitly copying the edge pixels after doing the mc.copy, but this doesn't
+ * need to be done unless we decide to allow mixing lossless and lossy compression. */
+
+void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode )
+{
+ int stride = h->fenc->i_stride[1] << h->mb.b_interlaced;
+ if( i_mode == I_PRED_CHROMA_V )
+ {
+ h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc_plane[1]-stride, stride, 8 );
+ h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc_plane[2]-stride, stride, 8 );
+ }
+ else if( i_mode == I_PRED_CHROMA_H )
+ {
+ h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc_plane[1]-1, stride, 8 );
+ h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc_plane[2]-1, stride, 8 );
+ }
+ else
+ {
+ h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
+ h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
+ }
+}
+
+void x264_predict_lossless_4x4( x264_t *h, uint8_t *p_dst, int idx, int i_mode )
+{
+ int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
+ uint8_t *p_src = h->mb.pic.p_fenc_plane[0] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride;
+
+ if( i_mode == I_PRED_4x4_V )
+ h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-stride, stride, 4 );
+ else if( i_mode == I_PRED_4x4_H )
+ h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-1, stride, 4 );
+ else
+ h->predict_4x4[i_mode]( p_dst );
+}
+
+void x264_predict_lossless_8x8( x264_t *h, uint8_t *p_dst, int idx, int i_mode, uint8_t edge[33] )
+{
+ int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
+ uint8_t *p_src = h->mb.pic.p_fenc_plane[0] + (idx&1)*8 + (idx>>1)*8*stride;
+
+ if( i_mode == I_PRED_8x8_V )
+ h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-stride, stride, 8 );
+ else if( i_mode == I_PRED_8x8_H )
+ h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-1, stride, 8 );
+ else
+ h->predict_8x8[i_mode]( p_dst, edge );
+}
+
+void x264_predict_lossless_16x16( x264_t *h, int i_mode )
+{
+ int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
+ if( i_mode == I_PRED_16x16_V )
+ h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc_plane[0]-stride, stride, 16 );
+ else if( i_mode == I_PRED_16x16_H )
+ h->mc.copy_16x16_unaligned( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc_plane[0]-1, stride, 16 );
+ else
+ h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0] );
+}
+
+/*****************************************************************************
* x264_macroblock_encode:
*****************************************************************************/
void x264_macroblock_encode( x264_t *h )
@@ -363,8 +444,11 @@
{
const int i_mode = h->mb.i_intra16x16_pred_mode;
h->mb.b_transform_8x8 = 0;
- /* do the right prediction */
- h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0] );
+
+ if( h->mb.b_lossless )
+ x264_predict_lossless_16x16( h, i_mode );
+ else
+ h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0] );
/* encode the 16x16 macroblock */
x264_mb_encode_i16x16( h, i_qp );
@@ -385,9 +469,13 @@
{
uint8_t *p_dst = &h->mb.pic.p_fdec[0][8 * (i&1) + 8 * (i>>1) * FDEC_STRIDE];
int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
-
x264_predict_8x8_filter( p_dst, edge, h->mb.i_neighbour8[i], x264_pred_i4x4_neighbors[i_mode] );
- h->predict_8x8[i_mode]( p_dst, edge );
+
+ if( h->mb.b_lossless )
+ x264_predict_lossless_8x8( h, p_dst, i, i_mode, edge );
+ else
+ h->predict_8x8[i_mode]( p_dst, edge );
+
x264_mb_encode_i8x8( h, i, i_qp );
}
for( i = 0; i < 4; i++ )
@@ -413,7 +501,10 @@
/* emulate missing topright samples */
*(uint32_t*) &p_dst[4-FDEC_STRIDE] = p_dst[3-FDEC_STRIDE] * 0x01010101U;
- h->predict_4x4[i_mode]( p_dst );
+ if( h->mb.b_lossless )
+ x264_predict_lossless_4x4( h, p_dst, i, i_mode );
+ else
+ h->predict_4x4[i_mode]( p_dst );
x264_mb_encode_i4x4( h, i, i_qp );
}
}
@@ -428,12 +519,23 @@
if( h->mb.b_lossless )
{
- for( i4x4 = 0; i4x4 < 16; i4x4++ )
- {
- h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4],
- h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4x4],
- h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4x4] );
- }
+ if( h->mb.b_transform_8x8 )
+ for( i8x8 = 0; i8x8 < 4; i8x8++ )
+ {
+ int x = 8*(i8x8&1);
+ int y = 8*(i8x8>>1);
+ h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8],
+ h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
+ h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
+ nnz8x8[i8x8] = array_non_zero( h->dct.luma8x8[i8x8] );
+ }
+ else
+ for( i4x4 = 0; i4x4 < 16; i4x4++ )
+ {
+ h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4],
+ h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4x4],
+ h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4x4] );
+ }
}
else if( h->mb.b_transform_8x8 )
{
@@ -445,11 +547,8 @@
for( idx = 0; idx < 4; idx++ )
{
if( h->mb.b_noise_reduction )
- h->quantf.denoise_dct_core( *dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 );
- if( h->mb.b_trellis )
- x264_quant_8x8_trellis( h, dct8x8[idx], CQM_8PY, i_qp, 0 );
- else
- h->quantf.quant_8x8( dct8x8[idx], h->quant8_mf[CQM_8PY][i_qp], h->quant8_bias[CQM_8PY][i_qp] );
+ h->quantf.denoise_dct( *dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 );
+ x264_quant_8x8( h, dct8x8[idx], i_qp, 0, idx );
h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8[idx] );
@@ -493,11 +592,8 @@
idx = i8x8 * 4 + i4x4;
if( h->mb.b_noise_reduction )
- h->quantf.denoise_dct_core( *dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
- if( h->mb.b_trellis )
- x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, DCT_LUMA_4x4, 0 );
- else
- h->quantf.quant_4x4( dct4x4[idx], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
+ h->quantf.denoise_dct( *dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
+ x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0, idx );
h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] );
@@ -530,8 +626,13 @@
if( IS_INTRA( h->mb.i_type ) )
{
const int i_mode = h->mb.i_chroma_pred_mode;
- h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
- h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
+ if( h->mb.b_lossless )
+ x264_predict_lossless_8x8_chroma( h, i_mode );
+ else
+ {
+ h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
+ h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
+ }
}
/* encode the 8x8 blocks */
@@ -594,7 +695,7 @@
if( !b_force_no_skip )
{
if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
- !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
+ !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
*(uint32_t*)h->mb.cache.mv[0][x264_scan8[0]] == *(uint32_t*)h->mb.cache.pskip_mv
&& h->mb.cache.ref[0][x264_scan8[0]] == 0 )
{
@@ -755,12 +856,20 @@
if( h->mb.b_lossless )
{
int i4;
- for( i4 = i8*4; i4 < i8*4+4; i4++ )
+ if( h->mb.b_transform_8x8 )
+ {
+ h->zigzagf.sub_4x4( h->dct.luma4x4[i8], p_fenc, p_fdec );
+ nnz8x8 = array_non_zero( h->dct.luma8x8[i8] );
+ }
+ else
{
- h->zigzagf.sub_4x4( h->dct.luma4x4[i4],
- h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4],
- h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4] );
- nnz8x8 |= array_non_zero( h->dct.luma4x4[i4] );
+ for( i4 = i8*4; i4 < i8*4+4; i4++ )
+ {
+ h->zigzagf.sub_4x4( h->dct.luma4x4[i4],
+ h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4],
+ h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4] );
+ nnz8x8 |= array_non_zero( h->dct.luma4x4[i4] );
+ }
}
for( ch = 0; ch < 2; ch++ )
{
@@ -776,10 +885,10 @@
{
DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );
h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
- h->quantf.quant_8x8( dct8x8, h->quant8_mf[CQM_8PY][i_qp], h->quant8_bias[CQM_8PY][i_qp] );
+ x264_quant_8x8( h, dct8x8, i_qp, 0, i8 );
h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 );
- if( b_decimate )
+ if( b_decimate && !h->mb.b_trellis )
nnz8x8 = 4 <= x264_mb_decimate_score( h->dct.luma8x8[i8], 64 );
else
nnz8x8 = array_non_zero( dct8x8 );
@@ -796,7 +905,8 @@
DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
for( i4 = 0; i4 < 4; i4++ )
- h->quantf.quant_4x4( dct4x4[i4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
+ x264_quant_4x4( h, dct4x4[i4], i_qp, DCT_LUMA_4x4, 0, i8*4+i4 );
+
for( i4 = 0; i4 < 4; i4++ )
h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] );
|
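A note on the lossless-prediction shortcut added to encoder/macroblock.c above: because lossless coding makes the reconstructed neighbours bit-exact copies of the source, the per-sample vertical/horizontal prediction used in lossless mode can be produced by copying the source block shifted up (or left) by one pixel, which is what the mc.copy( ..., p_src - stride, ... ) calls do. Below is a minimal sketch of the vertical case; the helper name and parameters are hypothetical and not part of the patch.

#include <stdint.h>

/* dst: prediction/reconstruction block (dst_stride bytes per row)
 * src: top-left of the co-located source block; the row above it
 *      (src - src_stride) must be valid, i.e. losslessly reconstructed. */
void lossless_vertical_pred( uint8_t *dst, int dst_stride,
                             const uint8_t *src, int src_stride,
                             int width, int height )
{
    int x, y;
    for( y = 0; y < height; y++ )
        for( x = 0; x < width; x++ )
            /* Each row is predicted from the row directly above it, and in
             * lossless mode that row equals the source, so the whole
             * prediction is just a block copy starting one row up. */
            dst[y*dst_stride + x] = src[(y-1)*src_stride + x];
}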
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/macroblock.h
|
@@ -29,7 +29,7 @@
extern const int x264_lambda2_tab[52];
extern const int x264_lambda_tab[52];
-void x264_rdo_init( );
+void x264_rdo_init( void );
int x264_macroblock_probe_skip( x264_t *h, int b_bidir );
@@ -38,21 +38,26 @@
static inline int x264_macroblock_probe_bskip( x264_t *h )
{ return x264_macroblock_probe_skip( h, 1 ); }
+void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode );
+void x264_predict_lossless_4x4( x264_t *h, uint8_t *p_dst, int idx, int i_mode );
+void x264_predict_lossless_8x8( x264_t *h, uint8_t *p_dst, int idx, int i_mode, uint8_t edge[33] );
+void x264_predict_lossless_16x16( x264_t *h, int i_mode );
+
void x264_macroblock_encode ( x264_t *h );
void x264_macroblock_write_cabac ( x264_t *h, x264_cabac_t *cb );
void x264_macroblock_write_cavlc ( x264_t *h, bs_t *s );
void x264_macroblock_encode_p8x8( x264_t *h, int i8 );
-void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale );
-void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale );
-void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale );
+void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp );
+void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp );
+void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp );
void x264_cabac_mb_skip( x264_t *h, int b_skip );
void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
- int i_qp, int i_ctxBlockCat, int b_intra );
+ int i_qp, int i_ctxBlockCat, int b_intra, int idx );
void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
- int i_qp, int b_intra );
+ int i_qp, int b_intra, int idx );
void x264_noise_reduction_update( x264_t *h );
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/me.c
|
@@ -31,7 +31,7 @@
* and refine_* are run only on the winner.
* the subme=7 values are much higher because any amount of satd search makes
* up its time by reducing the number of rd iterations. */
-static const int subpel_iterations[][4] =
+static const int subpel_iterations[][4] =
{{1,0,0,0},
{1,1,0,0},
{0,1,1,0},
@@ -162,7 +162,7 @@
int omx, omy, pmx, pmy;
uint8_t *p_fref = m->p_fref[0];
DECLARE_ALIGNED_16( uint8_t pix[16*16] );
-
+
int i = 0, j;
int dir;
int costs[6];
@@ -663,7 +663,7 @@
{ \
int stride = 16; \
uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
- int cost = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ int cost = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
if( b_chroma_me && cost < bcost ) \
{ \
@@ -787,8 +787,10 @@
#define BIME_CACHE( dx, dy ) \
{ \
int i = 4 + 3*dx + dy; \
- h->mc.mc_luma( pix0[i], bw, m0->p_fref, m0->i_stride[0], om0x+dx, om0y+dy, bw, bh ); \
- h->mc.mc_luma( pix1[i], bw, m1->p_fref, m1->i_stride[0], om1x+dx, om1y+dy, bw, bh ); \
+ stride0[i] = bw;\
+ stride1[i] = bw;\
+ src0[i] = h->mc.get_ref( pix0[i], &stride0[i], m0->p_fref, m0->i_stride[0], om0x+dx, om0y+dy, bw, bh ); \
+ src1[i] = h->mc.get_ref( pix1[i], &stride1[i], m1->p_fref, m1->i_stride[0], om1x+dx, om1y+dy, bw, bh ); \
}
#define BIME_CACHE2(a,b) \
@@ -802,11 +804,7 @@
int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y); \
int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y); \
visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));\
- h->mc.memcpy_aligned( pix, pix0[i0], bs ); \
- if( i_weight == 32 ) \
- h->mc.avg[i_pixel]( pix, bw, pix1[i1], bw ); \
- else \
- h->mc.avg_weight[i_pixel]( pix, bw, pix1[i1], bw, i_weight ); \
+ h->mc.avg[i_pixel]( pix, bw, src0[i0], stride0[i0], src1[i1], stride1[i1], i_weight ); \
cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, bw ) \
+ p_cost_m0x[ m0x ] + p_cost_m0y[ m0y ] \
+ p_cost_m1x[ m1x ] + p_cost_m1y[ m1y ]; \
@@ -838,7 +836,6 @@
const int i_pixel = m0->i_pixel;
const int bw = x264_pixel_size[i_pixel].w;
const int bh = x264_pixel_size[i_pixel].h;
- const int bs = bw*bh;
const int16_t *p_cost_m0x = m0->p_cost_mv - x264_clip3( m0->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
const int16_t *p_cost_m0y = m0->p_cost_mv - x264_clip3( m0->mvp[1], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
const int16_t *p_cost_m1x = m1->p_cost_mv - x264_clip3( m1->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
@@ -846,6 +843,10 @@
DECLARE_ALIGNED_16( uint8_t pix0[9][16*16] );
DECLARE_ALIGNED_16( uint8_t pix1[9][16*16] );
DECLARE_ALIGNED_16( uint8_t pix[16*16] );
+ uint8_t *src0[9];
+ uint8_t *src1[9];
+ int stride0[9];
+ int stride1[9];
int bm0x = m0->mv[0], om0x = bm0x;
int bm0y = m0->mv[1], om0y = bm0y;
int bm1x = m1->mv[0], om1x = bm1x;
@@ -853,7 +854,7 @@
int bcost = COST_MAX;
int pass = 0;
/* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
- uint8_t visited[8][8][8];
+ DECLARE_ALIGNED_16( uint8_t visited[8][8][8] );
h->mc.memzero_aligned( visited, sizeof(visited) );
BIME_CACHE( 0, 0 );
@@ -904,7 +905,7 @@
{ \
int stride = 16; \
uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw*4, bh*4 ); \
- dst = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ dst = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[mx] + p_cost_mvy[my]; \
COPY1_IF_LT( bsatd, dst ); \
}
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/ratecontrol.c
|
@@ -40,8 +40,7 @@
int kept_as_ref;
float qscale;
int mv_bits;
- int i_tex_bits;
- int p_tex_bits;
+ int tex_bits;
int misc_bits;
uint64_t expected_bits;
double expected_vbv;
@@ -118,10 +117,6 @@
double lmin[5]; /* min qscale by frame type */
double lmax[5];
double lstep; /* max change (multiply) in qscale per frame */
- double i_cplx_sum[5]; /* estimated total texture bits in intra MBs at qscale=1 */
- double p_cplx_sum[5];
- double mv_bits_sum[5];
- int frame_count[5]; /* number of frames of each type */
/* MBRC stuff */
double frame_size_estimated;
@@ -132,10 +127,6 @@
int bframes; /* # consecutive B-frames before this P-frame */
int bframe_bits; /* total cost of those frames */
- /* AQ stuff */
- float aq_threshold;
- int *ac_energy;
-
int i_zones;
x264_zone_t *zones;
x264_zone_t *prev_zone;
@@ -149,7 +140,6 @@
static void update_vbv_plan( x264_t *h );
static double predict_size( predictor_t *p, double q, double var );
static void update_predictor( predictor_t *p, double q, double var, double bits );
-int x264_rc_analyse_slice( x264_t *h );
/* Terminology:
* qp = h.264's quantizer
@@ -172,72 +162,46 @@
{
if(qscale<0.1)
qscale = 0.1;
- return (rce->i_tex_bits + rce->p_tex_bits + .1) * pow( rce->qscale / qscale, 1.1 )
+ return (rce->tex_bits + .1) * pow( rce->qscale / qscale, 1.1 )
+ rce->mv_bits * pow( X264_MAX(rce->qscale, 1) / X264_MAX(qscale, 1), 0.5 )
+ rce->misc_bits;
}
// Find the total AC energy of the block in all planes.
-static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, int *satd )
+static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
{
/* This function contains annoying hacks because GCC has a habit of reordering emms
* and putting it after floating point ops. As a result, we put the emms at the end of the
* function and make sure that its always called before the float math. Noinline makes
* sure no reordering goes on. */
- /* FIXME: This array is larger than necessary because a bug in GCC causes an all-zero
- * array to be placed in .bss despite .bss not being correctly aligned on some platforms (win32?) */
- DECLARE_ALIGNED_16( static uint8_t zero[17] ) = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1};
- unsigned int var=0, sad, ssd, i;
- if( satd || h->param.rc.i_aq_mode == X264_AQ_GLOBAL )
- {
- for( i=0; i<3; i++ )
- {
- int w = i ? 8 : 16;
- int stride = h->fenc->i_stride[i];
- int offset = h->mb.b_interlaced
- ? w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride
- : w * (mb_x + mb_y * stride);
- int pix = i ? PIXEL_8x8 : PIXEL_16x16;
- stride <<= h->mb.b_interlaced;
- sad = h->pixf.sad[pix]( zero, 0, h->fenc->plane[i]+offset, stride );
- ssd = h->pixf.ssd[pix]( zero, 0, h->fenc->plane[i]+offset, stride );
- var += ssd - (sad * sad >> (i?6:8));
- // SATD to represent the block's overall complexity (bit cost) for intra encoding.
- // exclude the DC coef, because nothing short of an actual intra prediction will estimate DC cost.
- if( var && satd )
- *satd += h->pixf.satd[pix]( zero, 0, h->fenc->plane[i]+offset, stride ) - sad/2;
- }
- var = X264_MAX(var,1);
+ unsigned int var=0, sad, i;
+ for( i=0; i<3; i++ )
+ {
+ int w = i ? 8 : 16;
+ int stride = frame->i_stride[i];
+ int offset = h->mb.b_interlaced
+ ? w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride
+ : w * (mb_x + mb_y * stride);
+ int pix = i ? PIXEL_8x8 : PIXEL_16x16;
+ stride <<= h->mb.b_interlaced;
+ var += h->pixf.var[pix]( frame->plane[i]+offset, stride, &sad );
}
- else var = h->rc->ac_energy[h->mb.i_mb_xy];
+ var = X264_MAX(var,1);
x264_emms();
return var;
}
-void x264_autosense_aq( x264_t *h )
+void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
{
- double total = 0;
- double n = 0;
int mb_x, mb_y;
- // FIXME: Some of the SATDs might be already calculated elsewhere (ratecontrol?). Can we reuse them?
- // FIXME: Is chroma SATD necessary?
for( mb_y=0; mb_y<h->sps->i_mb_height; mb_y++ )
for( mb_x=0; mb_x<h->sps->i_mb_width; mb_x++ )
{
- int satd=0;
- int energy = ac_energy_mb( h, mb_x, mb_y, &satd );
- h->rc->ac_energy[mb_x + mb_y * h->sps->i_mb_width] = energy;
- /* Weight the energy value by the SATD value of the MB.
- * This represents the fact that the more complex blocks in a frame should
- * be weighted more when calculating the optimal threshold. This also helps
- * diminish the negative effect of large numbers of simple blocks in a frame,
- * such as in the case of a letterboxed film. */
- total += logf(energy) * satd;
- n += satd;
+ int energy = ac_energy_mb( h, mb_x, mb_y, frame );
+ /* 10 constant chosen to result in approximately the same overall bitrate as without AQ. */
+ float qp_adj = h->param.rc.f_aq_strength * 1.5 * (logf(energy) - 10.0);
+ frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
}
- x264_emms();
- /* Calculate and store the threshold. */
- h->rc->aq_threshold = n ? total/n : 15;
}
/*****************************************************************************
@@ -249,18 +213,16 @@
*****************************************************************************/
void x264_adaptive_quant( x264_t *h )
{
- int energy = ac_energy_mb( h, h->mb.i_mb_x, h->mb.i_mb_y, NULL );
- /* Adjust the QP based on the AC energy of the macroblock. */
- float qp = h->rc->f_qpm;
- float qp_adj = 1.5 * (logf(energy) - h->rc->aq_threshold);
- if( h->param.rc.i_aq_mode == X264_AQ_LOCAL )
- qp_adj = x264_clip3f( qp_adj, -5, 5 );
- h->mb.i_qp = x264_clip3( qp + qp_adj * h->param.rc.f_aq_strength + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
+ float qp, qp_adj;
+ x264_emms();
+ qp = h->rc->f_qpm;
+ qp_adj = h->fenc->f_qp_offset[h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride];
+ h->mb.i_qp = x264_clip3( qp + qp_adj + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
/* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
* to lower the bit cost of the qp_delta. */
if( abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
h->mb.i_qp = h->mb.i_last_qp;
- h->mb.i_chroma_qp = i_chroma_qp_table[x264_clip3( h->mb.i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
+ h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
}
int x264_ratecontrol_new( x264_t *h )
@@ -275,7 +237,7 @@
rc->b_abr = h->param.rc.i_rc_method != X264_RC_CQP && !h->param.rc.b_stat_read;
rc->b_2pass = h->param.rc.i_rc_method == X264_RC_ABR && h->param.rc.b_stat_read;
-
+
/* FIXME: use integers */
if(h->param.i_fps_num > 0 && h->param.i_fps_den > 0)
rc->fps = (float) h->param.i_fps_num / h->param.i_fps_den;
@@ -435,6 +397,25 @@
if( strstr( opts, "qp=0" ) && h->param.rc.i_rc_method == X264_RC_ABR )
x264_log( h, X264_LOG_WARNING, "1st pass was lossless, bitrate prediction will be inaccurate\n" );
+
+ if( ( p = strstr( opts, "b_adapt=" ) ) && sscanf( p, "b_adapt=%d", &i ) && i >= X264_B_ADAPT_NONE && i <= X264_B_ADAPT_TRELLIS )
+ h->param.i_bframe_adaptive = i;
+ else if( h->param.i_bframe )
+ {
+ x264_log( h, X264_LOG_ERROR, "b_adapt method specified in stats file not valid\n" );
+ return -1;
+ }
+
+ if( ( p = strstr( opts, "scenecut=" ) ) && sscanf( p, "scenecut=%d", &i ) && i >= -1 && i <= 100 )
+ {
+ h->param.i_scenecut_threshold = i;
+ h->param.b_pre_scenecut = !!strstr( p, "(pre)" );
+ }
+ else
+ {
+ x264_log( h, X264_LOG_ERROR, "scenecut method specified in stats file not valid\n" );
+ return -1;
+ }
}
/* find number of pics */
@@ -503,8 +484,8 @@
rce = &rc->entry[frame_number];
rce->direct_mode = 0;
- e += sscanf(p, " in:%*d out:%*d type:%c q:%f itex:%d ptex:%d mv:%d misc:%d imb:%d pmb:%d smb:%d d:%c",
- &pict_type, &qp, &rce->i_tex_bits, &rce->p_tex_bits,
+ e += sscanf(p, " in:%*d out:%*d type:%c q:%f tex:%d mv:%d misc:%d imb:%d pmb:%d smb:%d d:%c",
+ &pict_type, &qp, &rce->tex_bits,
&rce->mv_bits, &rce->misc_bits, &rce->i_count, &rce->p_count,
&rce->s_count, &rce->direct_mode);
@@ -561,9 +542,11 @@
{
h->thread[i]->rc = rc+i;
if( i )
+ {
rc[i] = rc[0];
- if( h->param.rc.i_aq_mode == X264_AQ_LOCAL )
- rc[i].ac_energy = x264_malloc( h->mb.i_mb_count * sizeof(int) );
+ memcpy( &h->thread[i]->param, &h->param, sizeof( x264_param_t ) );
+ h->thread[i]->mb.b_variable_qp = h->mb.b_variable_qp;
+ }
}
return 0;
@@ -673,7 +656,7 @@
return 0;
}
-x264_zone_t *get_zone( x264_t *h, int frame_num )
+static x264_zone_t *get_zone( x264_t *h, int frame_num )
{
int i;
for( i = h->rc->i_zones-1; i >= 0; i-- )
@@ -691,7 +674,7 @@
if( rc->b_abr && h->param.rc.i_rc_method == X264_RC_ABR && rc->cbr_decay > .9999 )
{
double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
- x264_log( h, X264_LOG_INFO, "final ratefactor: %.2f\n",
+ x264_log( h, X264_LOG_INFO, "final ratefactor: %.2f\n",
qscale2qp( pow( base_cplx, 1 - h->param.rc.f_qcompress )
* rc->cplxr_sum / rc->wanted_bits_window ) );
}
@@ -725,8 +708,6 @@
x264_free( rc->zones[i].param );
x264_free( rc->zones );
}
- for( i=0; i<h->param.i_threads; i++ )
- x264_free( rc[i].ac_energy );
x264_free( rc );
}
@@ -850,26 +831,19 @@
if( h->sh.i_type != SLICE_TYPE_B )
rc->last_non_b_pict_type = h->sh.i_type;
-
- /* Adaptive AQ thresholding algorithm. */
- if( h->param.rc.i_aq_mode == X264_AQ_GLOBAL )
- /* Arbitrary value for "center" of the AQ curve.
- * Chosen so that any given value of CRF has on average similar bitrate with and without AQ. */
- h->rc->aq_threshold = logf(5000);
- else if( h->param.rc.i_aq_mode == X264_AQ_LOCAL )
- x264_autosense_aq(h);
}
-double predict_row_size( x264_t *h, int y, int qp )
+static double predict_row_size( x264_t *h, int y, int qp )
{
/* average between two predictors:
* absolute SATD, and scaled bit cost of the colocated row in the previous frame */
x264_ratecontrol_t *rc = h->rc;
double pred_s = predict_size( rc->row_pred, qp2qscale(qp), h->fdec->i_row_satd[y] );
double pred_t = 0;
- if( h->sh.i_type != SLICE_TYPE_I
+ if( h->sh.i_type != SLICE_TYPE_I
&& h->fref0[0]->i_type == h->fdec->i_type
- && h->fref0[0]->i_row_satd[y] > 0 )
+ && h->fref0[0]->i_row_satd[y] > 0
+ && (abs(h->fref0[0]->i_row_satd[y] - h->fdec->i_row_satd[y]) < h->fdec->i_row_satd[y]/2))
{
pred_t = h->fref0[0]->i_row_bits[y] * h->fdec->i_row_satd[y] / h->fref0[0]->i_row_satd[y]
* qp2qscale(h->fref0[0]->i_row_qp[y]) / qp2qscale(qp);
@@ -880,7 +854,7 @@
return (pred_s + pred_t) / 2;
}
-double row_bits_so_far( x264_t *h, int y )
+static double row_bits_so_far( x264_t *h, int y )
{
int i;
double bits = 0;
@@ -889,7 +863,7 @@
return bits;
}
-double predict_row_size_sum( x264_t *h, int y, int qp )
+static double predict_row_size_sum( x264_t *h, int y, int qp )
{
int i;
double bits = row_bits_so_far(h, y);
@@ -1016,14 +990,14 @@
x264_log(h, X264_LOG_ERROR, "2nd pass has more frames than 1st pass (%d)\n", rc->num_entries);
x264_log(h, X264_LOG_ERROR, "continuing anyway, at constant QP=%d\n", h->param.rc.i_qp_constant);
- if( h->param.b_bframe_adaptive )
+ if( h->param.i_bframe_adaptive )
x264_log(h, X264_LOG_ERROR, "disabling adaptive B-frames\n");
rc->b_abr = 0;
rc->b_2pass = 0;
h->param.rc.i_rc_method = X264_RC_CQP;
h->param.rc.b_stat_read = 0;
- h->param.b_bframe_adaptive = 0;
+ h->param.i_bframe_adaptive = 0;
if( h->param.i_bframe > 1 )
h->param.i_bframe = 1;
return X264_TYPE_P;
@@ -1073,15 +1047,16 @@
int dir_frame = h->stat.frame.i_direct_score[1] - h->stat.frame.i_direct_score[0];
int dir_avg = h->stat.i_direct_score[1] - h->stat.i_direct_score[0];
char c_direct = h->mb.b_direct_auto_write ?
- ( dir_frame>0 ? 's' : dir_frame<0 ? 't' :
+ ( dir_frame>0 ? 's' : dir_frame<0 ? 't' :
dir_avg>0 ? 's' : dir_avg<0 ? 't' : '-' )
: '-';
fprintf( rc->p_stat_file_out,
- "in:%d out:%d type:%c q:%.2f itex:%d ptex:%d mv:%d misc:%d imb:%d pmb:%d smb:%d d:%c;\n",
+ "in:%d out:%d type:%c q:%.2f tex:%d mv:%d misc:%d imb:%d pmb:%d smb:%d d:%c;\n",
h->fenc->i_frame, h->i_frame,
c_type, rc->qpa_rc,
- h->stat.frame.i_itex_bits, h->stat.frame.i_ptex_bits,
- h->stat.frame.i_hdr_bits, h->stat.frame.i_misc_bits,
+ h->stat.frame.i_tex_bits,
+ h->stat.frame.i_mv_bits,
+ h->stat.frame.i_misc_bits,
h->stat.frame.i_mb_count_i,
h->stat.frame.i_mb_count_p,
h->stat.frame.i_mb_count_skip,
@@ -1132,76 +1107,19 @@
* 2 pass functions
***************************************************************************/
-double x264_eval( char *s, double *const_value, const char **const_name,
- double (**func1)(void *, double), const char **func1_name,
- double (**func2)(void *, double, double), char **func2_name,
- void *opaque );
-
/**
* modify the bitrate curve from pass1 for one frame
*/
static double get_qscale(x264_t *h, ratecontrol_entry_t *rce, double rate_factor, int frame_num)
{
x264_ratecontrol_t *rcc= h->rc;
- const int pict_type = rce->pict_type;
double q;
x264_zone_t *zone = get_zone( h, frame_num );
- double const_values[]={
- rce->i_tex_bits * rce->qscale,
- rce->p_tex_bits * rce->qscale,
- (rce->i_tex_bits + rce->p_tex_bits) * rce->qscale,
- rce->mv_bits * rce->qscale,
- (double)rce->i_count / rcc->nmb,
- (double)rce->p_count / rcc->nmb,
- (double)rce->s_count / rcc->nmb,
- rce->pict_type == SLICE_TYPE_I,
- rce->pict_type == SLICE_TYPE_P,
- rce->pict_type == SLICE_TYPE_B,
- h->param.rc.f_qcompress,
- rcc->i_cplx_sum[SLICE_TYPE_I] / rcc->frame_count[SLICE_TYPE_I],
- rcc->i_cplx_sum[SLICE_TYPE_P] / rcc->frame_count[SLICE_TYPE_P],
- rcc->p_cplx_sum[SLICE_TYPE_P] / rcc->frame_count[SLICE_TYPE_P],
- rcc->p_cplx_sum[SLICE_TYPE_B] / rcc->frame_count[SLICE_TYPE_B],
- (rcc->i_cplx_sum[pict_type] + rcc->p_cplx_sum[pict_type]) / rcc->frame_count[pict_type],
- rce->blurred_complexity,
- 0
- };
- static const char *const_names[]={
- "iTex",
- "pTex",
- "tex",
- "mv",
- "iCount",
- "pCount",
- "sCount",
- "isI",
- "isP",
- "isB",
- "qComp",
- "avgIITex",
- "avgPITex",
- "avgPPTex",
- "avgBPTex",
- "avgTex",
- "blurCplx",
- NULL
- };
- static double (*func1[])(void *, double)={
-// (void *)bits2qscale,
- (void *)qscale2bits,
- NULL
- };
- static const char *func1_names[]={
-// "bits2qp",
- "qp2bits",
- NULL
- };
-
- q = x264_eval((char*)h->param.rc.psz_rc_eq, const_values, const_names, func1, func1_names, NULL, NULL, rce);
+ q = pow( rce->blurred_complexity, 1 - h->param.rc.f_qcompress );
// avoid NaN's in the rc_eq
- if(!isfinite(q) || rce->i_tex_bits + rce->p_tex_bits + rce->mv_bits == 0)
+ if(!isfinite(q) || rce->tex_bits + rce->mv_bits == 0)
q = rcc->last_qscale;
else
{
@@ -1253,7 +1171,7 @@
}
else if( pict_type == SLICE_TYPE_P
&& rcc->last_non_b_pict_type == SLICE_TYPE_P
- && rce->i_tex_bits + rce->p_tex_bits == 0 )
+ && rce->tex_bits == 0 )
{
q = last_p_q;
}
@@ -1431,6 +1349,21 @@
+ h->stat.i_slice_size[SLICE_TYPE_P]
+ h->stat.i_slice_size[SLICE_TYPE_B]);
+ if( h->param.i_threads > 1 )
+ {
+ int j = h->rc - h->thread[0]->rc;
+ int i;
+ for( i=1; i<h->param.i_threads; i++ )
+ {
+ x264_t *t = h->thread[ (j+i)%h->param.i_threads ];
+ double bits = t->rc->frame_size_planned;
+ if( !t->b_thread_active )
+ continue;
+ bits = X264_MAX(bits, x264_ratecontrol_get_estimated_size(t));
+ total_bits += (int64_t)bits;
+ }
+ }
+
if( rcc->b_2pass )
{
rce = *rcc->rce;
@@ -1501,10 +1434,12 @@
double expected_fullness = rce.expected_vbv / rcc->buffer_size;
double qmax = q*(2 - expected_fullness);
double size_constraint = 1 + expected_fullness;
+ qmax = X264_MAX(qmax, rce.new_qscale);
if (expected_fullness < .05)
qmax = lmax;
qmax = X264_MIN(qmax, lmax);
- while( (expected_vbv < rce.expected_vbv/size_constraint) && (q < qmax) )
+ while( ((expected_vbv < rce.expected_vbv/size_constraint) && (q < qmax)) ||
+ ((expected_vbv < 0) && (q < lmax)))
{
q *= 1.05;
expected_size = qscale2bits(&rce, q);
@@ -1534,9 +1469,8 @@
rcc->short_term_cplxsum += rcc->last_satd;
rcc->short_term_cplxcount ++;
- rce.p_tex_bits = rcc->last_satd;
+ rce.tex_bits = rcc->last_satd;
rce.blurred_complexity = rcc->short_term_cplxsum / rcc->short_term_cplxcount;
- rce.i_tex_bits = 0;
rce.mv_bits = 0;
rce.p_count = rcc->nmb;
rce.i_count = 0;
@@ -1789,10 +1723,6 @@
{
ratecontrol_entry_t *rce = &rcc->entry[i];
all_const_bits += rce->misc_bits;
- rcc->i_cplx_sum[rce->pict_type] += rce->i_tex_bits * rce->qscale;
- rcc->p_cplx_sum[rce->pict_type] += rce->p_tex_bits * rce->qscale;
- rcc->mv_bits_sum[rce->pict_type] += rce->mv_bits * rce->qscale;
- rcc->frame_count[rce->pict_type] ++;
}
if( all_available_bits < all_const_bits)
|
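The ratecontrol.c hunks above replace the old "autosense" AQ threshold with a direct mapping from each macroblock's AC energy (the summed per-plane variance) to a QP offset: qp_adj = f_aq_strength * 1.5 * (ln(energy) - 10.0), where the constant 10 is the anchor the comment says was chosen to keep overall bitrate roughly unchanged. The offset is later clipped into [i_qp_min, i_qp_max] when applied in x264_adaptive_quant(). A small self-contained sketch of that mapping follows; the default strength of 1.0 is an assumption here.

#include <math.h>
#include <stdio.h>

int main( void )
{
    float aq_strength = 1.0f;  /* assumed default for h->param.rc.f_aq_strength */
    unsigned energies[] = { 1000, 22026, 500000 };  /* arbitrary sample AC energies */
    int i;
    for( i = 0; i < 3; i++ )
    {
        /* Same formula as x264_adaptive_quant_frame() above: blocks with
         * energy below ~e^10 (~22026) get a negative offset, i.e. a lower QP. */
        float qp_adj = aq_strength * 1.5f * (logf( energies[i] ) - 10.0f);
        printf( "energy %7u -> qp offset %+6.2f\n", energies[i], qp_adj );
    }
    return 0;
}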
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/ratecontrol.h
|
@@ -27,6 +27,8 @@
int x264_ratecontrol_new ( x264_t * );
void x264_ratecontrol_delete( x264_t * );
+void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame );
+void x264_adaptive_quant( x264_t * );
void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next );
void x264_ratecontrol_start( x264_t *, int i_force_qp );
int x264_ratecontrol_slice_type( x264_t *, int i_frame );
@@ -34,9 +36,9 @@
int x264_ratecontrol_qp( x264_t * );
void x264_ratecontrol_end( x264_t *, int bits );
void x264_ratecontrol_summary( x264_t * );
-void x264_adaptive_quant( x264_t * );
void x264_ratecontrol_set_estimated_size( x264_t *, int bits );
int x264_ratecontrol_get_estimated_size( x264_t const *);
+int x264_rc_analyse_slice( x264_t *h );
#endif
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/rdo.c
|
@@ -34,7 +34,7 @@
#define bs_write_ue(s,v) ((s)->i_bits_encoded += bs_size_ue(v))
#define bs_write_se(s,v) ((s)->i_bits_encoded += bs_size_se(v))
#define bs_write_te(s,v,l) ((s)->i_bits_encoded += bs_size_te(v,l))
-#define x264_macroblock_write_cavlc x264_macroblock_size_cavlc
+#define x264_macroblock_write_cavlc static x264_macroblock_size_cavlc
#include "cavlc.c"
/* CABAC: not exactly the same. x264_cabac_size_decision() keeps track of
@@ -45,26 +45,84 @@
#define x264_cabac_encode_bypass(c,v) ((c)->f8_bits_encoded += 256)
#define x264_cabac_encode_ue_bypass(c,e,v) ((c)->f8_bits_encoded += (bs_size_ue_big(v+(1<<e)-1)-e)<<8)
#define x264_cabac_encode_flush(h,c)
-#define x264_macroblock_write_cabac x264_macroblock_size_cabac
+#define x264_macroblock_write_cabac static x264_macroblock_size_cabac
#include "cabac.c"
#define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) )
-
-static int ssd_mb( x264_t *h )
+
+
+/* Sum the cached SATDs to avoid repeating them. */
+static inline int sum_satd( x264_t *h, int pixel, int x, int y )
{
- return h->pixf.ssd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
- h->mb.pic.p_fdec[0], FDEC_STRIDE )
- + h->pixf.ssd[PIXEL_8x8]( h->mb.pic.p_fenc[1], FENC_STRIDE,
- h->mb.pic.p_fdec[1], FDEC_STRIDE )
- + h->pixf.ssd[PIXEL_8x8]( h->mb.pic.p_fenc[2], FENC_STRIDE,
- h->mb.pic.p_fdec[2], FDEC_STRIDE );
+ int satd = 0;
+ int min_x = x>>2;
+ int min_y = y>>2;
+ int max_x = (x>>2) + (x264_pixel_size[pixel].w>>2);
+ int max_y = (y>>2) + (x264_pixel_size[pixel].h>>2);
+ if( pixel == PIXEL_16x16 )
+ return h->mb.pic.fenc_satd_sum;
+ for( y = min_y; y < max_y; y++ )
+ for( x = min_x; x < max_x; x++ )
+ satd += h->mb.pic.fenc_satd[y][x];
+ return satd;
+}
+
+static inline int sum_sa8d( x264_t *h, int pixel, int x, int y )
+{
+ int sa8d = 0;
+ int min_x = x>>3;
+ int min_y = y>>3;
+ int max_x = (x>>3) + (x264_pixel_size[pixel].w>>3);
+ int max_y = (y>>3) + (x264_pixel_size[pixel].h>>3);
+ if( pixel == PIXEL_16x16 )
+ return h->mb.pic.fenc_sa8d_sum;
+ for( y = min_y; y < max_y; y++ )
+ for( x = min_x; x < max_x; x++ )
+ sa8d += h->mb.pic.fenc_sa8d[y][x];
+ return sa8d;
+}
+
+/* Psy RD distortion metric: SSD plus "Absolute Difference of Complexities" */
+/* SATD and SA8D are used to measure block complexity. */
+/* The difference between SATD and SA8D scores are both used to avoid bias from the DCT size. Using SATD */
+/* only, for example, results in overusage of 8x8dct, while the opposite occurs when using SA8D. */
+
+/* FIXME: Is there a better metric than averaged SATD/SA8D difference for complexity difference? */
+/* Hadamard transform is recursive, so a SATD+SA8D can be done faster by taking advantage of this fact. */
+/* This optimization can also be used in non-RD transform decision. */
+
+static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
+{
+ DECLARE_ALIGNED_16(static uint8_t zero[16]);
+ int satd = 0;
+ uint8_t *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE;
+ uint8_t *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE;
+ if( p == 0 && h->mb.i_psy_rd )
+ {
+ /* If the plane is smaller than 8x8, we can't do an SA8D; this probably isn't a big problem. */
+ if( size <= PIXEL_8x8 )
+ {
+ uint64_t acs = h->pixf.hadamard_ac[size]( fdec, FDEC_STRIDE );
+ satd = abs((int32_t)acs - sum_satd( h, size, x, y ))
+ + abs((int32_t)(acs>>32) - sum_sa8d( h, size, x, y ));
+ satd >>= 1;
+ }
+ else
+ {
+ int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, zero, 0 ) >> 1;
+ satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - sum_satd( h, size, x, y ));
+ }
+ satd = (satd * h->mb.i_psy_rd * x264_lambda_tab[h->mb.i_qp] + 128) >> 8;
+ }
+ return h->pixf.ssd[size](fenc, FENC_STRIDE, fdec, FDEC_STRIDE) + satd;
}
-static int ssd_plane( x264_t *h, int size, int p, int x, int y )
+static inline int ssd_mb( x264_t *h )
{
- return h->pixf.ssd[size]( h->mb.pic.p_fenc[p] + x+y*FENC_STRIDE, FENC_STRIDE,
- h->mb.pic.p_fdec[p] + x+y*FDEC_STRIDE, FDEC_STRIDE );
+ return ssd_plane(h, PIXEL_16x16, 0, 0, 0)
+ + ssd_plane(h, PIXEL_8x8, 1, 0, 0)
+ + ssd_plane(h, PIXEL_8x8, 2, 0, 0);
}
static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
@@ -140,7 +198,7 @@
return (i_ssd<<8) + i_bits;
}
-uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
+static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
{
uint64_t i_ssd, i_bits;
@@ -162,7 +220,7 @@
return (i_ssd<<8) + i_bits;
}
-uint64_t x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
+static uint64_t x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
{
uint64_t i_ssd, i_bits;
@@ -184,7 +242,7 @@
return (i_ssd<<8) + i_bits;
}
-uint64_t x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
+static uint64_t x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
{
uint64_t i_ssd, i_bits;
@@ -219,7 +277,7 @@
#define LAMBDA_BITS 4
/* precalculate the cost of coding abs_level_m1 */
-void x264_rdo_init( )
+void x264_rdo_init( void )
{
int i_prefix;
int i_ctx;
@@ -247,29 +305,29 @@
// I'm just matching the behaviour of deadzone quant.
static const int lambda2_tab[2][52] = {
// inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
- { 46, 58, 73, 92, 117, 147,
- 185, 233, 294, 370, 466, 587,
- 740, 932, 1174, 1480, 1864, 2349,
- 2959, 3728, 4697, 5918, 7457, 9395,
- 11837, 14914, 18790, 23674, 29828, 37581,
- 47349, 59656, 75163, 94699, 119313, 150326,
- 189399, 238627, 300652, 378798, 477255, 601304,
- 757596, 954511, 1202608, 1515192, 1909022, 2405217,
+ { 46, 58, 73, 92, 117, 147,
+ 185, 233, 294, 370, 466, 587,
+ 740, 932, 1174, 1480, 1864, 2349,
+ 2959, 3728, 4697, 5918, 7457, 9395,
+ 11837, 14914, 18790, 23674, 29828, 37581,
+ 47349, 59656, 75163, 94699, 119313, 150326,
+ 189399, 238627, 300652, 378798, 477255, 601304,
+ 757596, 954511, 1202608, 1515192, 1909022, 2405217,
3030384, 3818045, 4810435, 6060769 },
// intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
- { 27, 34, 43, 54, 68, 86,
- 108, 136, 172, 216, 273, 343,
- 433, 545, 687, 865, 1090, 1374,
- 1731, 2180, 2747, 3461, 4361, 5494,
- 6922, 8721, 10988, 13844, 17442, 21976,
- 27688, 34885, 43953, 55377, 69771, 87906,
- 110755, 139543, 175813, 221511, 279087, 351627,
- 443023, 558174, 703255, 886046, 1116348, 1406511,
+ { 27, 34, 43, 54, 68, 86,
+ 108, 136, 172, 216, 273, 343,
+ 433, 545, 687, 865, 1090, 1374,
+ 1731, 2180, 2747, 3461, 4361, 5494,
+ 6922, 8721, 10988, 13844, 17442, 21976,
+ 27688, 34885, 43953, 55377, 69771, 87906,
+ 110755, 139543, 175813, 221511, 279087, 351627,
+ 443023, 558174, 703255, 886046, 1116348, 1406511,
1772093, 2232697, 2813022, 3544186 }
};
typedef struct {
- uint64_t score;
+ int64_t score;
int level_idx; // index into level_tree[]
uint8_t cabac_state[10]; //just the contexts relevant to coding abs_level_m1
} trellis_node_t;
@@ -298,7 +356,7 @@
static inline void quant_trellis_cabac( x264_t *h, int16_t *dct,
const uint16_t *quant_mf, const int *unquant_mf,
const int *coef_weight, const uint8_t *zigzag,
- int i_ctxBlockCat, int i_lambda2, int b_ac, int i_coefs )
+ int i_ctxBlockCat, int i_lambda2, int b_ac, int i_coefs, int idx )
{
int abs_coefs[64], signs[64];
trellis_node_t nodes[2][8];
@@ -430,8 +488,20 @@
// that are better left coded, especially at QP > 40.
for( abs_level = q; abs_level >= q-1; abs_level-- )
{
- int d = i_coef - ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8);
- uint64_t ssd = (int64_t)d*d * coef_weight[i];
+ int unquant_abs_level = ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8);
+ int d = i_coef - unquant_abs_level;
+ int64_t ssd;
+ /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */
+ if( h->mb.i_psy_trellis && i )
+ {
+ int orig_coef = (i_coefs == 64) ? h->mb.pic.fenc_dct8[idx][i] : h->mb.pic.fenc_dct4[idx][i];
+ int predicted_coef = orig_coef - i_coef * signs[i];
+ int psy_value = h->mb.i_psy_trellis * abs(predicted_coef + unquant_abs_level * signs[i]);
+ int psy_weight = (i_coefs == 64) ? x264_dct8_weight_tab[zigzag[i]] : x264_dct4_weight_tab[zigzag[i]];
+ ssd = (int64_t)d*d * coef_weight[i] - psy_weight * psy_value;
+ }
+ else
+ ssd = (int64_t)d*d * coef_weight[i];
for( j = 0; j < 8; j++ )
{
@@ -495,24 +565,24 @@
void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
- int i_qp, int i_ctxBlockCat, int b_intra )
+ int i_qp, int i_ctxBlockCat, int b_intra, int idx )
{
int b_ac = (i_ctxBlockCat == DCT_LUMA_AC);
quant_trellis_cabac( h, (int16_t*)dct,
h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
x264_dct4_weight2_zigzag[h->mb.b_interlaced],
x264_zigzag_scan4[h->mb.b_interlaced],
- i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 16 );
+ i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 16, idx );
}
void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
- int i_qp, int b_intra )
+ int i_qp, int b_intra, int idx )
{
quant_trellis_cabac( h, (int16_t*)dct,
h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
x264_dct8_weight2_zigzag[h->mb.b_interlaced],
x264_zigzag_scan8[h->mb.b_interlaced],
- DCT_LUMA_8x8, lambda2_tab[b_intra][i_qp], 0, 64 );
+ DCT_LUMA_8x8, lambda2_tab[b_intra][i_qp], 0, 64, idx );
}
|
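The rdo.c hunks above add the psy-RD distortion described in the comments: plain SSD plus a penalty proportional to how far the reconstructed luma block's AC complexity drifts from the cached complexity of the source block (for blocks of 8x8 and larger both the SATD and SA8D differences are used; smaller blocks fall back to SATD minus DC), scaled by i_psy_rd and the QP lambda with the rounding (x * psy_rd * lambda + 128) >> 8. The trellis hunk applies a similar per-coefficient bias. Below is a stripped-down sketch of the metric's shape, with hypothetical names rather than the actual x264 code path; in the patch the penalty is applied only to the luma plane and only when psy-RD is enabled.

/* ssd:               plain sum of squared differences for the block
 * recon_complexity:  SATD/SA8D-style AC complexity of the reconstruction
 * source_complexity: the same measure cached for the source block
 * psy_strength:      corresponds to h->mb.i_psy_rd in the patch */
int psy_rd_distortion( int ssd, int recon_complexity, int source_complexity,
                       int psy_strength, int lambda )
{
    int psy = recon_complexity - source_complexity;
    if( psy < 0 )
        psy = -psy;                                   /* absolute difference of complexities */
    psy = ( psy * psy_strength * lambda + 128 ) >> 8; /* same scaling/rounding as the diff */
    return ssd + psy;
}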
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/set.c
|
@@ -27,6 +27,7 @@
#ifndef _MSC_VER
#include "config.h"
#endif
+#include "set.h"
#define bs_write_ue bs_write_ue_big
@@ -79,7 +80,7 @@
sps->b_qpprime_y_zero_transform_bypass = param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant == 0;
if( sps->b_qpprime_y_zero_transform_bypass )
- sps->i_profile_idc = PROFILE_HIGH444;
+ sps->i_profile_idc = PROFILE_HIGH444_PREDICTIVE;
else if( param->analyse.b_transform_8x8 || param->i_cqm_preset != X264_CQM_FLAT )
sps->i_profile_idc = PROFILE_HIGH;
else if( param->b_cabac || param->i_bframe > 0 )
@@ -150,11 +151,11 @@
sps->vui.i_sar_width = param->vui.i_sar_width;
sps->vui.i_sar_height= param->vui.i_sar_height;
}
-
+
sps->vui.b_overscan_info_present = ( param->vui.i_overscan ? 1 : 0 );
if( sps->vui.b_overscan_info_present )
sps->vui.b_overscan_info = ( param->vui.i_overscan == 2 ? 1 : 0 );
-
+
sps->vui.b_signal_type_present = 0;
sps->vui.i_vidformat = ( param->vui.i_vidformat <= 5 ? param->vui.i_vidformat : 5 );
sps->vui.b_fullrange = ( param->vui.b_fullrange ? 1 : 0 );
@@ -176,7 +177,7 @@
{
sps->vui.b_signal_type_present = 1;
}
-
+
/* FIXME: not sufficient for interlaced video */
sps->vui.b_chroma_loc_info_present = ( param->vui.i_chroma_loc ? 1 : 0 );
if( sps->vui.b_chroma_loc_info_present )
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/encoder/slicetype.c
|
@@ -37,9 +37,9 @@
h->mb.b_chroma_me = 0;
}
-int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
+static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
x264_frame_t **frames, int p0, int p1, int b,
- int dist_scale_factor )
+ int dist_scale_factor, int do_search[2] )
{
x264_frame_t *fref0 = frames[p0];
x264_frame_t *fref1 = frames[p1];
@@ -51,6 +51,9 @@
const int i_mb_xy = i_mb_x + i_mb_y * i_mb_stride;
const int i_stride = fenc->i_stride_lowres;
const int i_pel_offset = 8 * ( i_mb_x + i_mb_y * i_stride );
+ const int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32;
+ int16_t (*fenc_mvs[2])[2] = { &frames[b]->lowres_mvs[0][b-p0-1][i_mb_xy], &frames[b]->lowres_mvs[1][p1-b-1][i_mb_xy] };
+ int (*fenc_costs[2]) = { &frames[b]->lowres_mv_costs[0][b-p0-1][i_mb_xy], &frames[b]->lowres_mv_costs[1][p1-b-1][i_mb_xy] };
DECLARE_ALIGNED_8( uint8_t pix1[9*FDEC_STRIDE] );
uint8_t *pix2 = pix1+8;
@@ -70,7 +73,7 @@
h->mb.mv_max_fpel[0] = 8*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 4;
h->mb.mv_min_spel[0] = 4*( h->mb.mv_min_fpel[0] - 8 );
h->mb.mv_max_spel[0] = 4*( h->mb.mv_max_fpel[0] + 8 );
- if( h->mb.i_mb_x <= 1 )
+ if( h->mb.i_mb_x >= h->sps->i_mb_width - 2 )
{
h->mb.mv_min_fpel[1] = -8*h->mb.i_mb_y - 4;
h->mb.mv_max_fpel[1] = 8*( h->sps->i_mb_height - h->mb.i_mb_y - 1 ) + 4;
@@ -85,12 +88,6 @@
(dst)[2] = &(src)[2][i_pel_offset]; \
(dst)[3] = &(src)[3][i_pel_offset]; \
}
-#define SAVE_MVS( mv0, mv1 ) \
- { \
- *(uint32_t*)fenc->mv[0][i_mb_xy] = *(uint32_t*)mv0; \
- if( b_bidir ) \
- *(uint32_t*)fenc->mv[1][i_mb_xy] = *(uint32_t*)mv1; \
- }
#define CLIP_MV( mv ) \
{ \
mv[0] = x264_clip3( mv[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] ); \
@@ -98,21 +95,18 @@
}
#define TRY_BIDIR( mv0, mv1, penalty ) \
{ \
- int stride2 = 16; \
- uint8_t *src2; \
+ int stride1 = 16, stride2 = 16; \
+ uint8_t *src1, *src2; \
int i_cost; \
- h->mc.mc_luma( pix1, 16, m[0].p_fref, m[0].i_stride[0], \
- (mv0)[0], (mv0)[1], 8, 8 ); \
+ src1 = h->mc.get_ref( pix1, &stride1, m[0].p_fref, m[0].i_stride[0], \
+ (mv0)[0], (mv0)[1], 8, 8 ); \
src2 = h->mc.get_ref( pix2, &stride2, m[1].p_fref, m[1].i_stride[0], \
- (mv1)[0], (mv1)[1], 8, 8 ); \
- h->mc.avg[PIXEL_8x8]( pix1, 16, src2, stride2 ); \
+ (mv1)[0], (mv1)[1], 8, 8 ); \
+ h->mc.avg[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2, i_bipred_weight ); \
i_cost = penalty + h->pixf.mbcmp[PIXEL_8x8]( \
m[0].p_fenc[0], FENC_STRIDE, pix1, 16 ); \
if( i_bcost > i_cost ) \
- { \
i_bcost = i_cost; \
- SAVE_MVS( mv0, mv1 ); \
- } \
}
m[0].i_pixel = PIXEL_8x8;
@@ -123,7 +117,7 @@
if( b_bidir )
{
- int16_t *mvr = fref1->mv[0][i_mb_xy];
+ int16_t *mvr = fref1->lowres_mvs[0][p1-p0-1][i_mb_xy];
int dmv[2][2];
int mv0[2] = {0,0};
@@ -149,68 +143,79 @@
{
DECLARE_ALIGNED_4(int16_t mvc[4][2]) = {{0}};
int i_mvc = 0;
- int16_t (*fenc_mv)[2] = &fenc->mv[l][i_mb_xy];
-#define MVC(mv) { *(uint32_t*)mvc[i_mvc] = *(uint32_t*)mv; i_mvc++; }
- if( i_mb_x > 0 )
- MVC(fenc_mv[-1]);
- if( i_mb_y > 0 )
+ int16_t (*fenc_mv)[2] = fenc_mvs[l];
+
+ if( do_search[l] )
{
- MVC(fenc_mv[-i_mb_stride]);
+ /* Reverse-order MV prediction. */
+#define MVC(mv) { *(uint32_t*)mvc[i_mvc] = *(uint32_t*)mv; i_mvc++; }
if( i_mb_x < h->sps->i_mb_width - 1 )
- MVC(fenc_mv[-i_mb_stride+1]);
- if( i_mb_x > 0 )
- MVC(fenc_mv[-i_mb_stride-1]);
- }
+ MVC(fenc_mv[1]);
+ if( i_mb_y < h->sps->i_mb_height - 1 )
+ {
+ MVC(fenc_mv[i_mb_stride]);
+ if( i_mb_x > 0 )
+ MVC(fenc_mv[i_mb_stride-1]);
+ if( i_mb_x < h->sps->i_mb_width - 1 )
+ MVC(fenc_mv[i_mb_stride+1]);
+ }
#undef MVC
- x264_median_mv( m[l].mvp, mvc[0], mvc[1], mvc[2] );
- x264_me_search( h, &m[l], mvc, i_mvc );
+ x264_median_mv( m[l].mvp, mvc[0], mvc[1], mvc[2] );
+ x264_me_search( h, &m[l], mvc, i_mvc );
- m[l].cost -= 2; // remove mvcost from skip mbs
- if( *(uint32_t*)m[l].mv )
- m[l].cost += 5;
+ m[l].cost -= 2; // remove mvcost from skip mbs
+ if( *(uint32_t*)m[l].mv )
+ m[l].cost += 5;
+ *(uint32_t*)fenc_mvs[l] = *(uint32_t*)m[l].mv;
+ *fenc_costs[l] = m[l].cost;
+ }
+ else
+ {
+ *(uint32_t*)m[l].mv = *(uint32_t*)fenc_mvs[l];
+ m[l].cost = *fenc_costs[l];
+ }
i_bcost = X264_MIN( i_bcost, m[l].cost );
}
if( b_bidir && ( *(uint32_t*)m[0].mv || *(uint32_t*)m[1].mv ) )
TRY_BIDIR( m[0].mv, m[1].mv, 5 );
- if( i_bcost < i_cost_bak )
- SAVE_MVS( m[0].mv, m[1].mv );
-
- //FIXME intra part could be shared across multiple encodings of the frame
lowres_intra_mb:
- if( !b_bidir ) // forbid intra-mbs in B-frames, because it's rare and not worth checking
+ /* forbid intra-mbs in B-frames, because it's rare and not worth checking */
+ /* FIXME: Should we still forbid them now that we cache intra scores? */
+ if( !b_bidir )
{
- uint8_t *pix = &pix1[8+FDEC_STRIDE - 1];
- uint8_t *src = &fenc->lowres[0][i_pel_offset - 1];
- const int intra_penalty = 5;
- int satds[4], i_icost, b_intra;
-
- memcpy( pix-FDEC_STRIDE, src-i_stride, 17 );
- for( i=0; i<8; i++ )
- pix[i*FDEC_STRIDE] = src[i*i_stride];
- pix++;
-
- if( h->pixf.intra_satd_x3_8x8c && h->pixf.mbcmp[0] == h->pixf.satd[0] )
- {
- h->pixf.intra_satd_x3_8x8c( h->mb.pic.p_fenc[0], pix, satds );
- h->predict_8x8c[I_PRED_CHROMA_P]( pix );
- satds[I_PRED_CHROMA_P] =
- h->pixf.satd[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
- }
- else
+ int i_icost, b_intra;
+ if( !fenc->b_intra_calculated )
{
- for( i=0; i<4; i++ )
+ DECLARE_ALIGNED_16( uint8_t edge[33] );
+ uint8_t *pix = &pix1[8+FDEC_STRIDE - 1];
+ uint8_t *src = &fenc->lowres[0][i_pel_offset - 1];
+ const int intra_penalty = 5;
+ int satds[4];
+
+ memcpy( pix-FDEC_STRIDE, src-i_stride, 17 );
+ for( i=0; i<8; i++ )
+ pix[i*FDEC_STRIDE] = src[i*i_stride];
+ pix++;
+
+ if( h->pixf.intra_satd_x3_8x8c && h->pixf.mbcmp[0] == h->pixf.satd[0] )
{
- h->predict_8x8c[i]( pix );
- satds[i] = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
+ h->pixf.intra_satd_x3_8x8c( h->mb.pic.p_fenc[0], pix, satds );
+ h->predict_8x8c[I_PRED_CHROMA_P]( pix );
+ satds[I_PRED_CHROMA_P] =
+ h->pixf.satd[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
}
- }
- i_icost = X264_MIN4( satds[0], satds[1], satds[2], satds[3] );
+ else
+ {
+ for( i=0; i<4; i++ )
+ {
+ h->predict_8x8c[i]( pix );
+ satds[i] = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
+ }
+ }
+ i_icost = X264_MIN4( satds[0], satds[1], satds[2], satds[3] );
- if( i_icost < i_bcost * 2 )
- {
- DECLARE_ALIGNED_16( uint8_t edge[33] );
x264_predict_8x8_filter( pix, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
for( i=3; i<9; i++ )
{
@@ -219,9 +224,12 @@
satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
i_icost = X264_MIN( i_icost, satd );
}
- }
- i_icost += intra_penalty;
+ i_icost += intra_penalty;
+ fenc->i_intra_cost[i_mb_xy] = i_icost;
+ }
+ else
+ i_icost = fenc->i_intra_cost[i_mb_xy];
b_intra = i_icost < i_bcost;
if( b_intra )
i_bcost = i_icost;
@@ -236,18 +244,26 @@
return i_bcost;
}
#undef TRY_BIDIR
-#undef SAVE_MVS
-int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
+#define NUM_MBS\
+ (h->sps->i_mb_width > 2 && h->sps->i_mb_height > 2 ?\
+ (h->sps->i_mb_width - 2) * (h->sps->i_mb_height - 2) :\
+ h->sps->i_mb_width * h->sps->i_mb_height)
+
+static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
x264_frame_t **frames, int p0, int p1, int b,
int b_intra_penalty )
{
int i_score = 0;
+ /* Don't use the AQ'd scores for slicetype decision. */
+ int i_score_aq = 0;
+ int do_search[2];
/* Check whether we already evaluated this frame
* If we have tried this frame as P, then we have also tried
* the preceding frames as B. (is this still true?) */
- if( frames[b]->i_cost_est[b-p0][p1-b] >= 0 )
+ /* Also check that we already calculated the row SATDs for the current frame. */
+ if( frames[b]->i_cost_est[b-p0][p1-b] >= 0 && (!h->param.rc.i_vbv_buffer_size || frames[b]->i_row_satds[b-p0][p1-b][0] != -1) )
{
i_score = frames[b]->i_cost_est[b-p0][p1-b];
}
@@ -256,11 +272,11 @@
int dist_scale_factor = 128;
int *row_satd = frames[b]->i_row_satds[b-p0][p1-b];
- /* Init MVs so that we don't have to check edge conditions when loading predictors. */
- /* FIXME: not needed every time */
- memset( frames[b]->mv[0], 0, h->sps->i_mb_height * h->sps->i_mb_width * 2*sizeof(int16_t) );
- if( b != p1 )
- memset( frames[b]->mv[1], 0, h->sps->i_mb_height * h->sps->i_mb_width * 2*sizeof(int16_t) );
+ /* For each list, check to see whether we have lowres motion-searched this reference frame before. */
+ do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
+ do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
+ if( do_search[0] ) frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
+ if( do_search[1] ) frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
if( b == p1 )
{
@@ -270,50 +286,157 @@
if( p1 != p0 )
dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
+ /* Lowres lookahead goes backwards because the MVs are used as predictors in the main encode. */
+ /* This considerably improves MV prediction overall. */
+ if( h->sps->i_mb_width <= 2 || h->sps->i_mb_height <= 2 )
+ {
+ for( h->mb.i_mb_y = h->sps->i_mb_height - 1; h->mb.i_mb_y >= 0 ; h->mb.i_mb_y-- )
+ for( h->mb.i_mb_x = h->sps->i_mb_width - 1; h->mb.i_mb_x >= 0 ; h->mb.i_mb_x-- )
+ i_score += x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search );
+ }
/* the edge mbs seem to reduce the predictive quality of the
* whole frame's score, but are needed for a spatial distribution. */
- if( h->param.rc.i_vbv_buffer_size )
+ else if( h->param.rc.i_vbv_buffer_size )
{
- for( h->mb.i_mb_y = 0; h->mb.i_mb_y < h->sps->i_mb_height; h->mb.i_mb_y++ )
+ for( h->mb.i_mb_y = h->sps->i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- )
{
row_satd[ h->mb.i_mb_y ] = 0;
- for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->sps->i_mb_width; h->mb.i_mb_x++ )
+ for( h->mb.i_mb_x = h->sps->i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
{
- int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor );
- row_satd[ h->mb.i_mb_y ] += i_mb_cost;
+ int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search );
+ int i_mb_cost_aq = i_mb_cost;
+ if( h->param.rc.i_aq_mode )
+ {
+ x264_emms();
+ i_mb_cost_aq *= pow(2.0,-(frames[b]->f_qp_offset[h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride])/6.0);
+ }
+ row_satd[ h->mb.i_mb_y ] += i_mb_cost_aq;
if( h->mb.i_mb_y > 0 && h->mb.i_mb_y < h->sps->i_mb_height - 1 &&
h->mb.i_mb_x > 0 && h->mb.i_mb_x < h->sps->i_mb_width - 1 )
{
+ /* Don't use AQ-weighted costs for slicetype decision, only for ratecontrol. */
i_score += i_mb_cost;
+ i_score_aq += i_mb_cost_aq;
}
}
}
}
else
{
- for( h->mb.i_mb_y = 1; h->mb.i_mb_y < h->sps->i_mb_height - 1; h->mb.i_mb_y++ )
- for( h->mb.i_mb_x = 1; h->mb.i_mb_x < h->sps->i_mb_width - 1; h->mb.i_mb_x++ )
- i_score += x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor );
+ for( h->mb.i_mb_y = h->sps->i_mb_height - 2; h->mb.i_mb_y > 0; h->mb.i_mb_y-- )
+ for( h->mb.i_mb_x = h->sps->i_mb_width - 2; h->mb.i_mb_x > 0; h->mb.i_mb_x-- )
+ {
+ int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search );
+ int i_mb_cost_aq = i_mb_cost;
+ if( h->param.rc.i_aq_mode )
+ {
+ x264_emms();
+ i_mb_cost_aq *= pow(2.0,-(frames[b]->f_qp_offset[h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride])/6.0);
+ }
+ i_score += i_mb_cost;
+ i_score_aq += i_mb_cost_aq;
+ }
}
if( b != p1 )
i_score = i_score * 100 / (120 + h->param.i_bframe_bias);
+ else
+ frames[b]->b_intra_calculated = 1;
frames[b]->i_cost_est[b-p0][p1-b] = i_score;
-// fprintf( stderr, "frm %d %c(%d,%d): %6d %6d imb:%d \n", frames[b]->i_frame,
-// (p1==0?'I':b<p1?'B':'P'), b-p0, p1-b, i_score, frames[b]->i_cost_est[0][0], frames[b]->i_intra_mbs[b-p0] );
+ frames[b]->i_cost_est_aq[b-p0][p1-b] = i_score_aq;
x264_emms();
}
if( b_intra_penalty )
{
// arbitrary penalty for I-blocks after B-frames
- int nmb = (h->sps->i_mb_width - 2) * (h->sps->i_mb_height - 2);
+ int nmb = NUM_MBS;
i_score += i_score * frames[b]->i_intra_mbs[b-p0] / (nmb * 8);
}
return i_score;
}
+#define MAX_LENGTH (X264_BFRAME_MAX*4)
+
+static int x264_slicetype_path_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, char *path, int threshold )
+{
+ int loc = 1;
+ int cost = 0;
+ int cur_p = 0;
+ path--; /* Since the 1st path element is really the second frame */
+ while( path[loc] )
+ {
+ int next_p = loc;
+ int next_b;
+ /* Find the location of the next P-frame. */
+ while( path[next_p] && path[next_p] != 'P' )
+ next_p++;
+ /* Return if the path doesn't end on a P-frame. */
+ if( path[next_p] != 'P' )
+ return cost;
+
+ /* Add the cost of the P-frame found above */
+ cost += x264_slicetype_frame_cost( h, a, frames, cur_p, next_p, next_p, 0 );
+ /* Early terminate if the cost we have found is larger than the best path cost so far */
+ if( cost > threshold )
+ break;
+
+ for( next_b = loc; next_b < next_p && cost < threshold; next_b++ )
+ cost += x264_slicetype_frame_cost( h, a, frames, cur_p, next_p, next_b, 0 );
+
+ loc = next_p + 1;
+ cur_p = next_p;
+ }
+ return cost;
+}
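
x264_slicetype_path_cost above walks a candidate path string such as "BBP": each 'P' closes a mini-GOP, the P-frame is costed first against the previous P/I, then every 'B' in between is costed against the same reference pair, with early termination once the running total exceeds the best cost found so far. A rough sketch of the traversal with a stub in place of x264_slicetype_frame_cost (the stub and the printed triples are illustrative only):

    /* Sketch: which (p0, p1, b) triples a path string expands to. */
    #include <stdio.h>

    static int stub_frame_cost( int p0, int p1, int b )
    {
        printf( "frame_cost( p0=%d, p1=%d, b=%d )\n", p0, p1, b );
        return 100; /* arbitrary placeholder cost */
    }

    static int path_cost( const char *path )
    {
        /* The real code does "path--" so that path[loc] names frame loc; here
         * path[loc-1] is used instead to stay strictly inside the string. */
        int loc = 1, cost = 0, cur_p = 0;
        while( path[loc-1] )
        {
            int next_p = loc, next_b;
            while( path[next_p-1] && path[next_p-1] != 'P' )
                next_p++;
            if( path[next_p-1] != 'P' )
                return cost;                            /* path must end on a P */
            cost += stub_frame_cost( cur_p, next_p, next_p );
            for( next_b = loc; next_b < next_p; next_b++ )
                cost += stub_frame_cost( cur_p, next_p, next_b );
            loc = next_p + 1;
            cur_p = next_p;
        }
        return cost;
    }

    int main(void)
    {
        /* "BBP" -> P(0,3,3), then B(0,3,1) and B(0,3,2). */
        path_cost( "BBP" );
        return 0;
    }
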
+
+/* Viterbi/trellis slicetype decision algorithm. */
+/* Uses strings because the speed of the control functions is negligible
+   compared to the cost of running slicetype_frame_cost, and because it
+   makes debugging easier. */
+static void x264_slicetype_path( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, int max_bframes, int buffer_size, char (*best_paths)[MAX_LENGTH] )
+{
+ char paths[X264_BFRAME_MAX+2][MAX_LENGTH] = {{0}};
+ int num_paths = X264_MIN(max_bframes+1, length);
+ int suffix_size, loc, path;
+ int best_cost = COST_MAX;
+ int best_path_index = 0;
+ length = X264_MIN(length,MAX_LENGTH);
+
+ /* Iterate over all currently possible paths and add suffixes to each one */
+ for( suffix_size = 0; suffix_size < num_paths; suffix_size++ )
+ {
+ memcpy( paths[suffix_size], best_paths[length - (suffix_size + 1)], length - (suffix_size + 1) );
+ for( loc = 0; loc < suffix_size; loc++ )
+ strcat( paths[suffix_size], "B" );
+ strcat( paths[suffix_size], "P" );
+ }
+
+ /* Calculate the actual cost of each of the current paths */
+ for( path = 0; path < num_paths; path++ )
+ {
+ int cost = x264_slicetype_path_cost( h, a, frames, paths[path], best_cost );
+ if( cost < best_cost )
+ {
+ best_cost = cost;
+ best_path_index = path;
+ }
+ }
+
+ /* Store the best path. */
+ memcpy( best_paths[length], paths[best_path_index], length );
+}
+
+static int x264_slicetype_path_search( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, int bframes, int buffer )
+{
+ char best_paths[MAX_LENGTH][MAX_LENGTH] = {"","P"};
+ int n;
+ for( n = 2; n < length-1; n++ )
+ x264_slicetype_path( h, a, frames, n, bframes, buffer, best_paths );
+ return strspn( best_paths[length-2], "B" );
+}
+
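
x264_slicetype_path above grows the trellis one frame at a time: every candidate for length n is the best already-known path that is k+1 frames shorter, extended with k B-frames and a terminating P (k = 0..max_bframes), and the cheapest candidate becomes best_paths[n]. A small sketch of just the string construction, with the cost comparison faked (buffer sizes and the final pick are placeholders, not the encoder's):

    /* Sketch of the suffix construction used by the trellis above. */
    #include <stdio.h>
    #include <string.h>

    #define MAX_LEN 16

    static void build_candidates( char best_paths[][MAX_LEN], int length, int max_bframes )
    {
        char paths[MAX_LEN][MAX_LEN] = {{0}};
        int num_paths = max_bframes + 1 < length ? max_bframes + 1 : length;
        int suffix_size, loc;

        for( suffix_size = 0; suffix_size < num_paths; suffix_size++ )
        {
            /* best path that is (suffix_size+1) frames shorter ... */
            memcpy( paths[suffix_size], best_paths[length - (suffix_size + 1)],
                    length - (suffix_size + 1) );
            /* ... plus suffix_size B-frames and one closing P-frame */
            for( loc = 0; loc < suffix_size; loc++ )
                strcat( paths[suffix_size], "B" );
            strcat( paths[suffix_size], "P" );
            printf( "length %d, candidate %d: %s\n", length, suffix_size, paths[suffix_size] );
        }
        /* The real code costs each candidate with x264_slicetype_path_cost and
         * keeps the cheapest; here the last candidate is kept just to continue. */
        memcpy( best_paths[length], paths[num_paths-1], length );
    }

    int main(void)
    {
        char best_paths[MAX_LEN][MAX_LEN] = { "", "P" };
        int n;
        for( n = 2; n < 6; n++ )
            build_candidates( best_paths, n, 3 );
        return 0;
    }
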
static int scenecut( x264_t *h, x264_frame_t *frame, int pdist )
{
int icost = frame->i_cost_est[0][0];
@@ -336,15 +459,15 @@
{
f_bias = f_thresh_min
+ ( f_thresh_max - f_thresh_min )
- * ( i_gop_size - h->param.i_keyint_min )
- / ( h->param.i_keyint_max - h->param.i_keyint_min );
+ * ( i_gop_size - h->param.i_keyint_min )
+ / ( h->param.i_keyint_max - h->param.i_keyint_min ) ;
}
res = pcost >= (1.0 - f_bias) * icost;
if( res )
{
int imb = frame->i_intra_mbs[pdist];
- int pmb = (h->sps->i_mb_width - 2) * (h->sps->i_mb_height - 2) - imb;
+ int pmb = NUM_MBS - imb;
x264_log( h, X264_LOG_DEBUG, "scene cut at %d Icost:%d Pcost:%d ratio:%.4f bias:%.4f gop:%d (imb:%d pmb:%d)\n",
frame->i_frame,
icost, pcost, 1. - (double)pcost / icost,
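
The reindented block above interpolates the scenecut bias linearly between f_thresh_min and f_thresh_max according to how far the current GOP has progressed between keyint_min and keyint_max; a frame is then flagged as a scene cut when its P cost is no longer sufficiently below its I cost. A worked numeric sketch of that decision (all thresholds and costs here are made up):

    /* Worked example of the scenecut test: pcost >= (1 - bias) * icost. */
    #include <stdio.h>

    int main(void)
    {
        double f_thresh_min = 0.10, f_thresh_max = 0.40;   /* hypothetical */
        int    keyint_min = 25, keyint_max = 250;
        int    i_gop_size = 100;                           /* frames since the last IDR */
        int    icost = 40000, pcost = 31000;               /* lowres I/P frame costs */

        double f_bias = f_thresh_min
                      + ( f_thresh_max - f_thresh_min )
                      * ( i_gop_size - keyint_min )
                      / ( keyint_max - keyint_min );       /* 0.10 + 0.30*75/225 = 0.20 */

        int b_scenecut = pcost >= (1.0 - f_bias) * icost;  /* 31000 >= 32000 -> no cut */
        printf( "bias=%.2f -> %s\n", f_bias, b_scenecut ? "scene cut" : "no cut" );
        return 0;
    }
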
@@ -353,14 +476,14 @@
return res;
}
-void x264_slicetype_analyse( x264_t *h )
+static void x264_slicetype_analyse( x264_t *h )
{
x264_mb_analysis_t a;
- x264_frame_t *frames[X264_BFRAME_MAX+3] = { NULL, };
+ x264_frame_t *frames[X264_BFRAME_MAX*4+3] = { NULL, };
int num_frames;
int keyint_limit;
int j;
- int i_mb_count = (h->sps->i_mb_width - 2) * (h->sps->i_mb_height - 2);
+ int i_mb_count = NUM_MBS;
int cost1p0, cost2p0, cost1b1, cost2p1;
int idr_frame_type;
@@ -392,37 +515,65 @@
return;
}
- cost2p1 = x264_slicetype_frame_cost( h, &a, frames, 0, 2, 2, 1 );
- if( frames[2]->i_intra_mbs[2] > i_mb_count / 2 )
- goto no_b_frames;
-
- cost1b1 = x264_slicetype_frame_cost( h, &a, frames, 0, 2, 1, 0 );
- cost1p0 = x264_slicetype_frame_cost( h, &a, frames, 0, 1, 1, 0 );
- cost2p0 = x264_slicetype_frame_cost( h, &a, frames, 1, 2, 2, 0 );
-// fprintf( stderr, "PP: %d + %d <=> BP: %d + %d \n",
-// cost1p0, cost2p0, cost1b1, cost2p1 );
- if( cost1p0 + cost2p0 < cost1b1 + cost2p1 )
- goto no_b_frames;
-
-// arbitrary and untuned
-#define INTER_THRESH 300
-#define P_SENS_BIAS (50 - h->param.i_bframe_bias)
- frames[1]->i_type = X264_TYPE_B;
-
- for( j = 2; j <= X264_MIN( h->param.i_bframe, num_frames-1 ); j++ )
- {
- int pthresh = X264_MAX(INTER_THRESH - P_SENS_BIAS * (j-1), INTER_THRESH/10);
- int pcost = x264_slicetype_frame_cost( h, &a, frames, 0, j+1, j+1, 1 );
-// fprintf( stderr, "frm%d+%d: %d <=> %d, I:%d/%d \n",
-// frames[0]->i_frame, j-1, pthresh, pcost/i_mb_count,
-// frames[j+1]->i_intra_mbs[j+1], i_mb_count );
- if( pcost > pthresh*i_mb_count || frames[j+1]->i_intra_mbs[j+1] > i_mb_count/3 )
+ if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
+ {
+ int num_bframes;
+ int max_bframes = X264_MIN(num_frames-1, h->param.i_bframe);
+ if( h->param.b_pre_scenecut )
{
- frames[j]->i_type = X264_TYPE_P;
- break;
+ x264_slicetype_frame_cost( h, &a, frames, 0, 1, 1, 0 );
+ if( scenecut( h, frames[1], 1 ) )
+ {
+ frames[1]->i_type = idr_frame_type;
+ return;
+ }
}
- else
+ num_bframes = x264_slicetype_path_search( h, &a, frames, num_frames, max_bframes, num_frames-max_bframes );
+ assert(num_bframes < num_frames);
+
+ for( j = 1; j < num_bframes+1; j++ )
+ {
+ if( h->param.b_pre_scenecut && scenecut( h, frames[j+1], j+1 ) )
+ {
+ frames[j]->i_type = X264_TYPE_P;
+ frames[j+1]->i_type = idr_frame_type;
+ return;
+ }
frames[j]->i_type = X264_TYPE_B;
+ }
+ frames[num_bframes+1]->i_type = X264_TYPE_P;
+ }
+ else
+ {
+ cost2p1 = x264_slicetype_frame_cost( h, &a, frames, 0, 2, 2, 1 );
+ if( frames[2]->i_intra_mbs[2] > i_mb_count / 2 )
+ goto no_b_frames;
+
+ cost1b1 = x264_slicetype_frame_cost( h, &a, frames, 0, 2, 1, 0 );
+ cost1p0 = x264_slicetype_frame_cost( h, &a, frames, 0, 1, 1, 0 );
+ cost2p0 = x264_slicetype_frame_cost( h, &a, frames, 1, 2, 2, 0 );
+
+ if( cost1p0 + cost2p0 < cost1b1 + cost2p1 )
+ goto no_b_frames;
+
+ // arbitrary and untuned
+ #define INTER_THRESH 300
+ #define P_SENS_BIAS (50 - h->param.i_bframe_bias)
+ frames[1]->i_type = X264_TYPE_B;
+
+ for( j = 2; j <= X264_MIN( h->param.i_bframe, num_frames-1 ); j++ )
+ {
+ int pthresh = X264_MAX(INTER_THRESH - P_SENS_BIAS * (j-1), INTER_THRESH/10);
+ int pcost = x264_slicetype_frame_cost( h, &a, frames, 0, j+1, j+1, 1 );
+
+ if( pcost > pthresh*i_mb_count || frames[j+1]->i_intra_mbs[j+1] > i_mb_count/3 )
+ {
+ frames[j]->i_type = X264_TYPE_P;
+ break;
+ }
+ else
+ frames[j]->i_type = X264_TYPE_B;
+ }
}
}
@@ -442,7 +593,7 @@
h->frames.next[i]->i_type =
x264_ratecontrol_slice_type( h, h->frames.next[i]->i_frame );
}
- else if( (h->param.i_bframe && h->param.b_bframe_adaptive)
+ else if( (h->param.i_bframe && h->param.i_bframe_adaptive)
|| h->param.b_pre_scenecut )
x264_slicetype_analyse( h );
@@ -492,7 +643,7 @@
int x264_rc_analyse_slice( x264_t *h )
{
x264_mb_analysis_t a;
- x264_frame_t *frames[X264_BFRAME_MAX+2] = { NULL, };
+ x264_frame_t *frames[X264_BFRAME_MAX*4+2] = { NULL, };
int p0=0, p1, b;
int cost;
@@ -520,6 +671,11 @@
frames[b] = h->fenc;
cost = x264_slicetype_frame_cost( h, &a, frames, p0, p1, b, 0 );
+
+ /* In AQ, use the weighted score instead. */
+ if( h->param.rc.i_aq_mode )
+ cost = frames[b]->i_cost_est[b-p0][p1-b];
+
h->fenc->i_row_satd = h->fenc->i_row_satds[b-p0][p1-b];
h->fdec->i_row_satd = h->fdec->i_row_satds[b-p0][p1-b];
h->fdec->i_satd = cost;
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/gtk/Makefile
^
|
@@ -93,7 +93,7 @@
$(SONAMEGTK): $(OBJECTS_LIB)
@echo " L: $(@F)"
- @$(CC) -shared -o $@ $(OBJECTS_LIB) -Wl,-soname,$(SONAMEGTK) $(LDFLAGS)
+ @$(CC) -shared -o $@ $(OBJECTS_LIB) $(SOFLAGS) $(LDFLAGS)
# Program : test
$(TEST_BIN): $(OBJECTS_LIB) $(OBJECTS_TEST)
@@ -142,8 +142,8 @@
@install -d "$(DESTDIR)$(libdir)"
@echo " I: $(DESTDIR)$(libdir)/libx264gtk.a"
@install -m 644 libx264gtk.a "$(DESTDIR)$(libdir)"
- @echo " I: $(DESTDIR)$(libdir)/libx264gtk.so"
- @$(if $(SONAMEGTK), ln -sf $(SONAMEGTK) $(DESTDIR)$(libdir)/libx264gtk.so)
+ @echo " I: $(DESTDIR)$(libdir)/libx264gtk.$(SOSUFFIX)"
+ @$(if $(SONAMEGTK), ln -sf $(SONAMEGTK) $(DESTDIR)$(libdir)/libx264gtk.$(SOSUFFIX))
@$(if $(SONAMEGTK), install -m 755 $(SONAMEGTK) $(DESTDIR)$(libdir))
@echo " D: $(DESTDIR)$(bindir)"
@install -d "$(DESTDIR)$(bindir)"
@@ -172,9 +172,9 @@
@rm -f "$(DESTDIR)$(includedir)/x264_gtk_enum.h"
@echo " U: $(DESTDIR)$(libdir)/libx264gtk.a"
@rm -f "$(DESTDIR)$(libdir)/libx264gtk.a"
- @echo " U: $(DESTDIR)$(libdir)/libx264gtk.so"
+ @echo " U: $(DESTDIR)$(libdir)/$(SONAMEGTK)"
@$(if $(SONAMEGTK), rm -f "$(DESTDIR)$(libdir)/$(SONAMEGTK)")
- @rm -f "$(DESTDIR)$(libdir)/libx264gtk.so"
+ @rm -f "$(DESTDIR)$(libdir)/libx264gtk.$(SOSUFFIX)"
@echo " U: $(DESTDIR)$(bindir)/$(ENCODE_BIN)"
@rm -f "$(DESTDIR)$(bindir)/$(ENCODE_BIN)"
@echo " U: $(DESTDIR)${datadir}/x264"
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/gtk/x264_gtk.c
^
|
@@ -115,7 +115,7 @@
param->b_bframe_pyramid = x264_gtk->bframe_pyramid && x264_gtk->bframe;
param->analyse.b_bidir_me = x264_gtk->bidir_me;
- param->b_bframe_adaptive = x264_gtk->bframe_adaptive;
+ param->i_bframe_adaptive = x264_gtk->bframe_adaptive;
param->analyse.b_weighted_bipred = x264_gtk->weighted_bipred;
param->i_bframe = x264_gtk->bframe;
param->i_bframe_bias = x264_gtk->bframe_bias;
@@ -470,7 +470,7 @@
gtk_toggle_button_set_active (GTK_TOGGLE_BUTTON (config->mb.bframes.bframe_pyramid), param.b_bframe_pyramid);
gtk_toggle_button_set_active (GTK_TOGGLE_BUTTON (config->mb.bframes.bidir_me), param.analyse.b_bidir_me);
- gtk_toggle_button_set_active (GTK_TOGGLE_BUTTON (config->mb.bframes.bframe_adaptive), param.b_bframe_adaptive);
+ gtk_toggle_button_set_active (GTK_TOGGLE_BUTTON (config->mb.bframes.bframe_adaptive), param.i_bframe_adaptive);
gtk_toggle_button_set_active (GTK_TOGGLE_BUTTON (config->mb.bframes.weighted_bipred), param.analyse.b_weighted_bipred);
g_snprintf (buf, 64, "%d", param.i_bframe);
gtk_entry_set_text (GTK_ENTRY (config->mb.bframes.bframe), buf);
@@ -602,7 +602,7 @@
x264_gtk->bframe_pyramid = param.b_bframe_pyramid;
x264_gtk->bidir_me = param.analyse.b_bidir_me;
- x264_gtk->bframe_adaptive = param.b_bframe_adaptive;
+ x264_gtk->bframe_adaptive = param.i_bframe_adaptive;
x264_gtk->weighted_bipred = param.analyse.b_weighted_bipred;
x264_gtk->bframe = param.i_bframe;
x264_gtk->bframe_bias = param.i_bframe_bias;
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/gtk/x264_gtk.h
^
|
@@ -76,7 +76,7 @@
gint threads;
guint trellis;
gint noise_reduction;
-
+
gint strength;
gint threshold;
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/gtk/x264_gtk_encode_main_window.c
^
|
@@ -753,7 +753,7 @@
gtk_entry_set_text (GTK_ENTRY (thread_data->video_rendering_rate),
str);
- snprintf (str, 128, "%lld:%02lld:%02lld",
+ snprintf (str, 128, "%" PRId64 ":%02" PRId64 ":%02" PRId64,
(pipe_data.elapsed / 1000000) / 3600,
((pipe_data.elapsed / 1000000) / 60) % 60,
(pipe_data.elapsed / 1000000) % 60);
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/gtk/x264_gtk_encode_private.h
^
|
@@ -54,7 +54,7 @@
GIOChannel *io_write; /* use it with write */
};
-struct X264_Pipe_Data_
+struct X264_Pipe_Data_
{
int frame;
int frame_total;
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/matroska.c
^
|
@@ -407,7 +407,7 @@
return 0;
}
-int mk_flushFrame(mk_Writer *w) {
+static int mk_flushFrame(mk_Writer *w) {
int64_t delta, ref = 0;
unsigned fsize, bgsize;
unsigned char c_delta_flags[3];
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/muxers.c
^
|
@@ -216,7 +216,8 @@
tokstart = strchr(tokstart, 0x20);
break;
case 'A': /* Pixel aspect - 0:0 if unknown */
- if( sscanf(tokstart, "%d:%d", &n, &d) == 2 && n && d )
+ /* Don't override the aspect ratio if sar has been explicitly set on the commandline. */
+ if( sscanf(tokstart, "%d:%d", &n, &d) == 2 && n && d && !p_param->vui.i_sar_width && !p_param->vui.i_sar_height )
{
x264_reduce_fraction( &n, &d );
p_param->vui.i_sar_width = n;
@@ -285,7 +286,7 @@
/* Read frame header - without terminating '\n' */
if (fread(header, 1, slen, h->fh) != slen)
return -1;
-
+
header[slen] = 0;
if (strncmp(header, Y4M_FRAME_MAGIC, slen))
{
@@ -293,7 +294,7 @@
*((uint32_t*)header), header);
return -1;
}
-
+
/* Skip most of it */
while (i<MAX_FRAME_HEADER && fgetc(h->fh) != '\n')
i++;
@@ -426,6 +427,7 @@
x264_pthread_t tid;
int next_frame;
int frame_total;
+ int in_progress;
struct thread_input_arg_t *next_args;
} thread_input_t;
@@ -443,6 +445,7 @@
h->p_read_frame = p_read_frame;
h->p_close_infile = p_close_infile;
h->p_handle = *p_handle;
+ h->in_progress = 0;
h->next_frame = -1;
h->next_args = malloc(sizeof(thread_input_arg_t));
h->next_args->h = h;
@@ -459,7 +462,7 @@
return h->frame_total;
}
-void read_frame_thread_int( thread_input_arg_t *i )
+static void read_frame_thread_int( thread_input_arg_t *i )
{
i->status = i->h->p_read_frame( i->pic, i->h->p_handle, i->i_frame );
}
@@ -474,6 +477,7 @@
{
x264_pthread_join( h->tid, &stuff );
ret |= h->next_args->status;
+ h->in_progress = 0;
}
if( h->next_frame == i_frame )
@@ -491,6 +495,7 @@
h->next_args->i_frame = i_frame+1;
h->next_args->pic = &h->pic;
x264_pthread_create( &h->tid, NULL, (void*)read_frame_thread_int, h->next_args );
+ h->in_progress = 1;
}
else
h->next_frame = -1;
@@ -503,7 +508,8 @@
thread_input_t *h = handle;
h->p_close_infile( h->p_handle );
x264_picture_clean( &h->pic );
- x264_pthread_join( h->tid, NULL );
+ if( h->in_progress )
+ x264_pthread_join( h->tid, NULL );
free( h->next_args );
free( h );
return 0;
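
The muxers.c hunks above add an in_progress flag to the threaded input wrapper: it is set only when a read-ahead thread has actually been created, cleared after every join, and checked before the final join when the input is closed, so pthread_join is never called on a thread id that was never initialized. A minimal sketch of the same pattern, independent of the x264 wrappers (all names below are illustrative):

    /* Sketch: only join a read-ahead thread if one was actually started. */
    #include <pthread.h>

    typedef struct
    {
        pthread_t tid;
        int       in_progress;   /* 1 while a prefetch thread is outstanding */
    } reader_t;

    static void *read_ahead( void *arg )
    {
        (void)arg;               /* pretend to prefetch the next frame here */
        return NULL;
    }

    static void maybe_prefetch( reader_t *r )
    {
        if( !pthread_create( &r->tid, NULL, read_ahead, NULL ) )
            r->in_progress = 1;
    }

    static void close_reader( reader_t *r )
    {
        if( r->in_progress )     /* without the flag this could join a garbage tid */
            pthread_join( r->tid, NULL );
        r->in_progress = 0;
    }

    int main(void)
    {
        reader_t r = { .in_progress = 0 };
        close_reader( &r );      /* safe: no thread was ever started */
        maybe_prefetch( &r );
        close_reader( &r );
        return 0;
    }
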
@@ -563,7 +569,7 @@
} mp4_t;
-void recompute_bitrate_mp4(GF_ISOFile *p_file, int i_track)
+static void recompute_bitrate_mp4(GF_ISOFile *p_file, int i_track)
{
u32 i, count, di, timescale, time_wnd, rate;
u64 offset;
@@ -806,7 +812,7 @@
char b_writing_frame;
} mkv_t;
-int write_header_mkv( mkv_t *p_mkv )
+static int write_header_mkv( mkv_t *p_mkv )
{
int ret;
uint8_t *avcC;
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/tools/avc2avi.c
^
|
@@ -499,7 +499,7 @@
/* skip i_offset_for_top_to_bottom_field */
bs_read_se( &s );
/* read i_num_ref_frames_in_poc_cycle */
- i_cycle = bs_read_ue( &s );
+ i_cycle = bs_read_ue( &s );
if( i_cycle > 256 ) i_cycle = 256;
while( i_cycle > 0 )
{
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/tools/checkasm-a.asm
^
|
@@ -61,7 +61,6 @@
or r3, r5
jz .ok
mov r3, eax
- picgetgot r1
lea r1, [error_message GLOBAL]
push r1
xor eax, eax
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/tools/checkasm.c
^
|
@@ -100,12 +100,12 @@
return &benchs[i].vers[j];
}
-int cmp_nop( const void *a, const void *b )
+static int cmp_nop( const void *a, const void *b )
{
return *(uint16_t*)a - *(uint16_t*)b;
}
-int cmp_bench( const void *a, const void *b )
+static int cmp_bench( const void *a, const void *b )
{
// asciibetical sort except preserving numbers
const char *sa = ((bench_func_t*)a)->name;
@@ -258,6 +258,7 @@
report( "pixel " #name " :" );
TEST_PIXEL( sad, 0 );
+ TEST_PIXEL( sad_aligned, 1 );
TEST_PIXEL( ssd, 1 );
TEST_PIXEL( satd, 0 );
TEST_PIXEL( sa8d, 0 );
@@ -302,7 +303,45 @@
TEST_PIXEL_X(3);
TEST_PIXEL_X(4);
-#define TEST_INTRA_SATD( name, pred, satd, i8x8, ... ) \
+#define TEST_PIXEL_VAR( i ) \
+ if( pixel_asm.var[i] != pixel_ref.var[i] ) \
+ { \
+ uint32_t res_c, res_asm; \
+ uint32_t sad_c, sad_asm; \
+ set_func_name( "%s_%s", "var", pixel_names[i] ); \
+ used_asm = 1; \
+ res_c = call_c( pixel_c.var[i], buf1, 16, &sad_c ); \
+ res_asm = call_a( pixel_asm.var[i], buf1, 16, &sad_asm ); \
+ if( (res_c != res_asm) || (sad_c != sad_asm) ) \
+ { \
+ ok = 0; \
+ fprintf( stderr, "var[%d]: %d,%d != %d,%d [FAILED]\n", i, res_c, sad_c, res_asm, sad_asm ); \
+ } \
+ }
+
+ ok = 1; used_asm = 0;
+ TEST_PIXEL_VAR( PIXEL_16x16 );
+ TEST_PIXEL_VAR( PIXEL_8x8 );
+ report( "pixel var :" );
+
+ for( i=0, ok=1, used_asm=0; i<4; i++ )
+ if( pixel_asm.hadamard_ac[i] != pixel_ref.hadamard_ac[i] )
+ {
+ set_func_name( "hadamard_ac_%s", pixel_names[i] );
+ used_asm = 1;
+ uint64_t rc = pixel_c.hadamard_ac[i]( buf1, 16 );
+ uint64_t ra = pixel_asm.hadamard_ac[i]( buf1, 16 );
+ if( rc != ra )
+ {
+ ok = 0;
+ fprintf( stderr, "hadamard_ac[%d]: %d,%d != %d,%d\n", i, (int)rc, (int)(rc>>32), (int)ra, (int)(ra>>32) );
+ }
+ call_c2( pixel_c.hadamard_ac[i], buf1, 16 );
+ call_a2( pixel_asm.hadamard_ac[i], buf1, 16 );
+ }
+ report( "pixel hadamard_ac :" );
+
+#define TEST_INTRA_MBCMP( name, pred, satd, i8x8, ... ) \
if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
{ \
int res_c[3], res_asm[3]; \
@@ -311,10 +350,10 @@
memcpy( buf3, buf2, 1024 ); \
for( i=0; i<3; i++ ) \
{ \
- pred[i]( buf3+40, ##__VA_ARGS__ ); \
- res_c[i] = pixel_c.satd( buf1+40, 16, buf3+40, 32 ); \
+ pred[i]( buf3+48, ##__VA_ARGS__ ); \
+ res_c[i] = pixel_c.satd( buf1+48, 16, buf3+48, 32 ); \
} \
- call_a( pixel_asm.name, buf1+40, i8x8 ? edge : buf3+40, res_asm ); \
+ call_a( pixel_asm.name, buf1+48, i8x8 ? edge : buf3+48, res_asm ); \
if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
{ \
ok = 0; \
@@ -325,11 +364,13 @@
}
ok = 1; used_asm = 0;
- TEST_INTRA_SATD( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16], 0 );
- TEST_INTRA_SATD( intra_satd_x3_8x8c, predict_8x8c, satd[PIXEL_8x8], 0 );
- TEST_INTRA_SATD( intra_satd_x3_4x4, predict_4x4, satd[PIXEL_4x4], 0 );
- TEST_INTRA_SATD( intra_sa8d_x3_8x8, predict_8x8, sa8d[PIXEL_8x8], 1, edge );
+ TEST_INTRA_MBCMP( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16], 0 );
+ TEST_INTRA_MBCMP( intra_satd_x3_8x8c , predict_8x8c , satd[PIXEL_8x8] , 0 );
+ TEST_INTRA_MBCMP( intra_satd_x3_4x4 , predict_4x4 , satd[PIXEL_4x4] , 0 );
+ TEST_INTRA_MBCMP( intra_sa8d_x3_8x8 , predict_8x8 , sa8d[PIXEL_8x8] , 1, edge );
report( "intra satd_x3 :" );
+ TEST_INTRA_MBCMP( intra_sad_x3_16x16 , predict_16x16, sad [PIXEL_16x16], 0 );
+ report( "intra sad_x3 :" );
if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core ||
pixel_asm.ssim_end4 != pixel_ref.ssim_end4 )
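
Among the checkasm additions above, the hadamard_ac test compares the C and assembly versions by value: each call returns two 32-bit results packed into a single uint64_t, which is why a mismatch is reported as separate low and high halves. A tiny sketch of that packing convention with a dummy function and hypothetical values (the meaning of the two halves is an assumption here):

    /* Sketch of the packed 64-bit result convention checked above. */
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t dummy_hadamard_ac( uint32_t sum_lo, uint32_t sum_hi )
    {
        /* hypothetical packing: low 32 bits and high 32 bits carry two sums */
        return ( (uint64_t)sum_hi << 32 ) | sum_lo;
    }

    int main(void)
    {
        uint64_t rc = dummy_hadamard_ac( 12345, 678 );
        /* mirrors the failure message: low half first, then the high half */
        printf( "low=%d high=%d\n", (int)rc, (int)(rc >> 32) );
        return 0;
    }
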
@@ -568,6 +609,7 @@
{ \
set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
used_asm = 1; \
+ memcpy(dct, buf1, size*sizeof(int16_t));\
call_c( zigzag_c.name, t1, dct ); \
call_a( zigzag_asm.name, t2, dct ); \
if( memcmp( t1, t2, size*sizeof(int16_t) ) ) \
@@ -729,31 +771,29 @@
#undef MC_TEST_LUMA
#undef MC_TEST_CHROMA
-#define MC_TEST_AVG( name, ... ) \
+#define MC_TEST_AVG( name, weight ) \
for( i = 0, ok = 1, used_asm = 0; i < 10; i++ ) \
{ \
- memcpy( buf3, buf1, 1024 ); \
- memcpy( buf4, buf1, 1024 ); \
+ memcpy( buf3, buf1+320, 320 ); \
+ memcpy( buf4, buf1+320, 320 ); \
if( mc_a.name[i] != mc_ref.name[i] ) \
{ \
set_func_name( "%s_%s", #name, pixel_names[i] );\
used_asm = 1; \
- call_c1( mc_c.name[i], buf3, 32, buf2, 16, ##__VA_ARGS__ ); \
- call_a1( mc_a.name[i], buf4, 32, buf2, 16, ##__VA_ARGS__ ); \
- if( memcmp( buf3, buf4, 1024 ) ) \
+ call_c1( mc_c.name[i], buf3, 16, buf2+1, 16, buf1+18, 16, weight ); \
+ call_a1( mc_a.name[i], buf4, 16, buf2+1, 16, buf1+18, 16, weight ); \
+ if( memcmp( buf3, buf4, 320 ) ) \
{ \
ok = 0; \
fprintf( stderr, #name "[%d]: [FAILED]\n", i ); \
} \
- call_c2( mc_c.name[i], buf3, 32, buf2, 16, ##__VA_ARGS__ ); \
- call_a2( mc_a.name[i], buf4, 32, buf2, 16, ##__VA_ARGS__ ); \
+ call_c2( mc_c.name[i], buf3, 16, buf2+1, 16, buf1+18, 16, weight ); \
+ call_a2( mc_a.name[i], buf4, 16, buf2+1, 16, buf1+18, 16, weight ); \
} \
}
- MC_TEST_AVG( avg );
- report( "mc avg :" );
ok = 1; used_asm = 0;
- for( w = -64; w <= 128 && ok; w++ )
- MC_TEST_AVG( avg_weight, w );
+ for( w = -63; w <= 127 && ok; w++ )
+ MC_TEST_AVG( avg, w );
report( "mc wpredb :" );
if( mc_a.hpel_filter != mc_ref.hpel_filter )
@@ -1048,7 +1088,7 @@
report( "dequant :" );
- if( qf_a.denoise_dct_core != qf_ref.denoise_dct_core )
+ if( qf_a.denoise_dct != qf_ref.denoise_dct )
{
int size;
for( size = 16; size <= 64; size += 48 )
@@ -1058,12 +1098,12 @@
memcpy(dct1, buf1, size*2);
memcpy(dct2, buf1, size*2);
memcpy(buf3+256, buf3, 256);
- call_c1( qf_c.denoise_dct_core, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size );
- call_a1( qf_a.denoise_dct_core, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size );
+ call_c1( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size );
+ call_a1( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size );
if( memcmp( dct1, dct2, size*2 ) || memcmp( buf3+4, buf3+256+4, (size-1)*sizeof(uint32_t) ) )
ok = 0;
- call_c2( qf_c.denoise_dct_core, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size );
- call_a2( qf_a.denoise_dct_core, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size );
+ call_c2( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size );
+ call_a2( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size );
}
}
report( "denoise dct :" );
@@ -1178,7 +1218,7 @@
return ret;
}
-int check_all_funcs( int cpu_ref, int cpu_new )
+static int check_all_funcs( int cpu_ref, int cpu_new )
{
return check_pixel( cpu_ref, cpu_new )
+ check_dct( cpu_ref, cpu_new )
@@ -1189,7 +1229,7 @@
+ check_cabac( cpu_ref, cpu_new );
}
-int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
+static int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
{
*cpu_ref = *cpu_new;
*cpu_new |= flags;
@@ -1200,7 +1240,7 @@
return check_all_funcs( *cpu_ref, *cpu_new );
}
-int check_all_flags( void )
+static int check_all_flags( void )
{
int ret = 0;
int cpu0 = 0, cpu1 = 0;
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/x264.c
^
|
@@ -37,6 +37,12 @@
#include "config.h"
#endif
+#ifdef _WIN32
+#include <windows.h>
+#else
+#define SetConsoleTitle(t)
+#endif
+
uint8_t *mux_buffer = NULL;
int mux_buffer_size = 0;
@@ -163,7 +169,11 @@
H1( " --pre-scenecut Faster, less precise scenecut detection.\n"
" Required and implied by multi-threading.\n" );
H0( " -b, --bframes <integer> Number of B-frames between I and P [%d]\n", defaults->i_bframe );
- H1( " --no-b-adapt Disable adaptive B-frame decision\n" );
+ H1( " --b-adapt Adaptive B-frame decision method [%d]\n"
+ " Higher values may lower threading efficiency.\n"
+ " - 0: Disabled\n"
+ " - 1: Fast\n"
+ " - 2: Optimal (slow with high --bframes)\n", defaults->i_bframe_adaptive );
H1( " --b-bias <integer> Influences how often B-frames are used [%d]\n", defaults->i_bframe_bias );
H0( " --b-pyramid Keep some B-frames as references\n" );
H0( " --no-cabac Disable CABAC\n" );
@@ -188,10 +198,9 @@
H0( " --ipratio <float> QP factor between I and P [%.2f]\n", defaults->rc.f_ip_factor );
H0( " --pbratio <float> QP factor between P and B [%.2f]\n", defaults->rc.f_pb_factor );
H1( " --chroma-qp-offset <integer> QP difference between chroma and luma [%d]\n", defaults->analyse.i_chroma_qp_offset );
- H0( " --aq-mode <integer> How AQ distributes bits [%d]\n"
+ H1( " --aq-mode <integer> AQ method [%d]\n"
" - 0: Disabled\n"
- " - 1: Avoid moving bits between frames\n"
- " - 2: Move bits between frames\n", defaults->rc.i_aq_mode );
+ " - 1: Variance AQ (complexity mask)\n", defaults->rc.i_aq_mode );
H0( " --aq-strength <float> Reduces blocking and blurring in flat and\n"
" textured areas. [%.1f]\n"
" - 0.5: weak AQ\n"
@@ -202,7 +211,6 @@
" - 2: Last pass, does not overwrite stats file\n"
" - 3: Nth pass, overwrites stats file\n" );
H0( " --stats <string> Filename for 2 pass stats [\"%s\"]\n", defaults->rc.psz_stat_out );
- H1( " --rceq <string> Ratecontrol equation [\"%s\"]\n", defaults->rc.psz_rc_eq );
H0( " --qcomp <float> QP curve compression: 0.0 => CBR, 1.0 => CQP [%.2f]\n", defaults->rc.f_qcompress );
H1( " --cplxblur <float> Reduce fluctuations in QP (before curve compression) [%.1f]\n", defaults->rc.f_complexity_blur );
H1( " --qblur <float> Reduce fluctuations in QP (after curve compression) [%.1f]\n", defaults->rc.f_qblur );
@@ -243,6 +251,10 @@
H0( " -m, --subme <integer> Subpixel motion estimation and partition\n"
" decision quality: 1=fast, 7=best. [%d]\n", defaults->analyse.i_subpel_refine );
H0( " --b-rdo RD based mode decision for B-frames. Requires subme 6.\n" );
+ H0( " --psy-rd Strength of psychovisual optimization [\"%.1f:%.1f\"]\n"
+ " #1: RDO (requires subme>=6)\n"
+ " #2: Trellis (requires trellis, experimental)\n",
+ defaults->analyse.f_psy_rd, defaults->analyse.f_psy_trellis );
H0( " --mixed-refs Decide references on a per partition basis\n" );
H1( " --no-chroma-me Ignore chroma in motion estimation\n" );
H1( " --bime Jointly optimize both MVs in B-frames\n" );
@@ -375,6 +387,7 @@
{ "version", no_argument, NULL, 'V' },
{ "bitrate", required_argument, NULL, 'B' },
{ "bframes", required_argument, NULL, 'b' },
+ { "b-adapt", required_argument, NULL, 0 },
{ "no-b-adapt", no_argument, NULL, 0 },
{ "b-bias", required_argument, NULL, 0 },
{ "b-pyramid", no_argument, NULL, 0 },
@@ -411,6 +424,7 @@
{ "mvrange", required_argument, NULL, 0 },
{ "mvrange-thread", required_argument, NULL, 0 },
{ "subme", required_argument, NULL, 'm' },
+ { "psy-rd", required_argument, NULL, 0 },
{ "b-rdo", no_argument, NULL, 0 },
{ "mixed-refs", no_argument, NULL, 0 },
{ "no-chroma-me", no_argument, NULL, 0 },
@@ -433,7 +447,6 @@
{ "chroma-qp-offset", required_argument, NULL, 0 },
{ "pass", required_argument, NULL, 'p' },
{ "stats", required_argument, NULL, 0 },
- { "rceq", required_argument, NULL, 0 },
{ "qcomp", required_argument, NULL, 0 },
{ "qblur", required_argument, NULL, 0 },
{ "cplxblur",required_argument, NULL, 0 },
@@ -541,7 +554,7 @@
return -1;
}
param->i_scenecut_threshold = -1;
- param->b_bframe_adaptive = 0;
+ param->i_bframe_adaptive = X264_B_ADAPT_NONE;
break;
case OPT_THREAD_INPUT:
b_thread_input = 1;
@@ -632,7 +645,7 @@
sscanf( argv[optind++], "%ux%u", &param->i_width, &param->i_height );
}
}
-
+
if( !(b_avis || b_y4m) && ( !param->i_width || !param->i_height ) )
{
fprintf( stderr, "x264 [error]: Rawyuv input requires a resolution.\n" );
@@ -772,14 +785,17 @@
int64_t i_start, i_end;
int64_t i_file;
int i_frame_size;
- int i_progress;
+ int i_update_interval;
+ char buf[200];
+ opt->b_progress &= param->i_log_level < X264_LOG_DEBUG;
i_frame_total = p_get_frame_total( opt->hin );
i_frame_total -= opt->i_seek;
if( ( i_frame_total == 0 || param->i_frame_total < i_frame_total )
&& param->i_frame_total > 0 )
i_frame_total = param->i_frame_total;
param->i_frame_total = i_frame_total;
+ i_update_interval = i_frame_total ? x264_clip3( i_frame_total / 1000, 1, 10 ) : 10;
if( ( h = x264_encoder_open( param ) ) == NULL )
{
@@ -802,8 +818,7 @@
i_start = x264_mdate();
/* Encode frames */
- for( i_frame = 0, i_file = 0, i_progress = 0;
- b_ctrl_c == 0 && (i_frame < i_frame_total || i_frame_total == 0); )
+ for( i_frame = 0, i_file = 0; b_ctrl_c == 0 && (i_frame < i_frame_total || i_frame_total == 0); )
{
if( p_read_frame( &pic, opt->hin, i_frame + opt->i_seek ) )
break;
@@ -824,22 +839,24 @@
i_frame++;
/* update status line (up to 1000 times per input file) */
- if( opt->b_progress && param->i_log_level < X264_LOG_DEBUG &&
- ( i_frame_total ? i_frame * 1000 / i_frame_total > i_progress
- : i_frame % 10 == 0 ) )
+ if( opt->b_progress && i_frame % i_update_interval == 0 )
{
int64_t i_elapsed = x264_mdate() - i_start;
double fps = i_elapsed > 0 ? i_frame * 1000000. / i_elapsed : 0;
+ double bitrate = (double) i_file * 8 * param->i_fps_num / ( (double) param->i_fps_den * i_frame * 1000 );
if( i_frame_total )
{
int eta = i_elapsed * (i_frame_total - i_frame) / ((int64_t)i_frame * 1000000);
- i_progress = i_frame * 1000 / i_frame_total;
- fprintf( stderr, "encoded frames: %d/%d (%.1f%%), %.2f fps, eta %d:%02d:%02d \r",
- i_frame, i_frame_total, (float)i_progress / 10, fps,
+ sprintf( buf, "x264 [%.1f%%] %d/%d frames, %.2f fps, %.2f kb/s, eta %d:%02d:%02d",
+ 100. * i_frame / i_frame_total, i_frame, i_frame_total, fps, bitrate,
eta/3600, (eta/60)%60, eta%60 );
}
else
- fprintf( stderr, "encoded frames: %d, %.2f fps \r", i_frame, fps );
+ {
+ sprintf( buf, "x264 %d frames: %.2f fps, %.2f kb/s", i_frame, fps, bitrate );
+ }
+ fprintf( stderr, "%s \r", buf+5 );
+ SetConsoleTitle( buf );
fflush( stderr ); // needed in windows
}
}
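
The x264.c hunk above replaces the per-mille progress counter with a fixed refresh interval, clamped between 1 and 10 frames depending on the clip length, and adds an average-bitrate figure derived from the bytes muxed so far and the stream's frame rate. A hedged sketch of the two computations with made-up numbers:

    /* Sketch of the progress interval and bitrate math used above. */
    #include <stdint.h>
    #include <stdio.h>

    static int clip3( int v, int lo, int hi )
    {
        return v < lo ? lo : v > hi ? hi : v;
    }

    int main(void)
    {
        int     i_frame_total = 500;        /* frames in the input, if known */
        int64_t i_file        = 2500000;    /* bytes written so far */
        int     i_frame       = 200;        /* frames encoded so far */
        int     fps_num = 25, fps_den = 1;

        /* refresh the status line every 1..10 frames */
        int i_update_interval = i_frame_total ? clip3( i_frame_total / 1000, 1, 10 ) : 10;

        /* average bitrate in kbit/s: bytes -> bits, scaled by the frame rate */
        double bitrate = (double)i_file * 8 * fps_num / ( (double)fps_den * i_frame * 1000 );

        printf( "interval=%d frames, %.2f kb/s\n", i_update_interval, bitrate );
        return 0;
    }
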
@@ -851,6 +868,9 @@
i_end = x264_mdate();
x264_picture_clean( &pic );
+ /* Erase progress indicator before printing encoding stats. */
+ if( opt->b_progress )
+ fprintf( stderr, " \r" );
x264_encoder_close( h );
x264_free( mux_buffer );
fprintf( stderr, "\n" );
|
[-]
[+]
|
Changed |
x264-snapshot-20081001-2245.tar.bz2/x264.h
^
|
@@ -35,7 +35,7 @@
#include <stdarg.h>
-#define X264_BUILD 60
+#define X264_BUILD 64
/* x264_t:
* opaque handler for encoder */
@@ -85,8 +85,10 @@
#define X264_RC_CRF 1
#define X264_RC_ABR 2
#define X264_AQ_NONE 0
-#define X264_AQ_LOCAL 1
-#define X264_AQ_GLOBAL 2
+#define X264_AQ_VARIANCE 1
+#define X264_B_ADAPT_NONE 0
+#define X264_B_ADAPT_FAST 1
+#define X264_B_ADAPT_TRELLIS 2
static const char * const x264_direct_pred_names[] = { "none", "spatial", "temporal", "auto", 0 };
static const char * const x264_motion_est_names[] = { "dia", "hex", "umh", "esa", "tesa", 0 };
@@ -155,7 +157,7 @@
int i_width;
int i_height;
int i_csp; /* CSP of encoded bitstream, only i420 supported */
- int i_level_idc;
+ int i_level_idc;
int i_frame_total; /* number of frames to encode if known, else 0 */
struct
@@ -165,7 +167,7 @@
int i_sar_width;
int i_overscan; /* 0=undef, 1=no overscan, 2=overscan */
-
+
/* see h264 annex E for the values of the following */
int i_vidformat;
int b_fullrange;
@@ -185,7 +187,7 @@
int i_scenecut_threshold; /* how aggressively to insert extra I frames */
int b_pre_scenecut; /* compute scenecut on lowres frames */
int i_bframe; /* how many b-frame between 2 references pictures */
- int b_bframe_adaptive;
+ int i_bframe_adaptive;
int i_bframe_bias;
int b_bframe_pyramid; /* Keep some B-frames as references */
@@ -239,6 +241,8 @@
int b_fast_pskip; /* early SKIP detection on P-frames */
int b_dct_decimate; /* transform coefficient thresholding on P-frames */
int i_noise_reduction; /* adaptive pseudo-deadzone */
+ float f_psy_rd; /* Psy RD strength */
+ float f_psy_trellis; /* Psy trellis strength */
/* the deadzone size that will be used in luma quantization */
int i_luma_deadzone[2]; /* {inter, intra} */
@@ -276,7 +280,6 @@
char *psz_stat_in;
/* 2pass params (same as ffmpeg ones) */
- char *psz_rc_eq; /* 2 pass rate control equation */
float f_qcompress; /* 0.0 => cbr, 1.0 => constant qp */
float f_qblur; /* temporally blur quants */
float f_complexity_blur; /* temporally blur complexity */
|
|
Changed |
x264-snapshot-20081218-2245.tar.bz2
^
|
[-]
[+]
|
Deleted |
x264.spec.old
^
|
@@ -1,132 +0,0 @@
-# norootforbuild
-
-%define binname x264
-%define realname libx264
-%define soname 59
-%define svn 20080607
-
-Name: %{binname}
-Summary: A free h264/avc encoder - encoder binary.
-Version: 0.0svn%{svn}
-Release: 1
-License: GPL
-Group: Productivity/Multimedia/Video
-Url: http://developers.videolan.org/x264.html
-
-Source: ftp://ftp.videolan.org/pub/videolan/x264/snapshots/x264-snapshot-%{svn}-2245.tar.bz2
-
-BuildRoot: %{_tmppath}/build-root-%{name}
-
-Requires: %{realname}-%{soname}
-
-%ifarch x86_64
-BuildRequires: nasm yasm
-%else
-BuildRequires: nasm
-%endif
-
-%description
-x264 is a free library for encoding next-generation H264/AVC video
-streams. The code is written from scratch by Laurent Aimar, Loren
-Merritt, Eric Petit (OS X), Min Chen (vfw/asm), Justin Clay (vfw), Mans
-Rullgard, Radek Czyz, Christian Heine (asm), Alex Izvorski (asm), and
-Alex Wright. It is released under the terms of the GPL license. This
-package contains a shared library and a commandline tool for encoding
-H264 streams. This library is needed for mplayer/mencoder for H264
-encoding support.
-
-Encoder features:
-- CAVLC/CABAC
-- Multi-references
-- Intra: all macroblock types (16x16, 8x8, and 4x4 with all predictions)
-- Inter P: all partitions (from 16x16 down to 4x4)
-- Inter B: partitions from 16x16 down to 8x8 (including skip/direct)
-- Ratecontrol: constant quantizer, single or multipass ABR, optional VBV
-- Scene cut detection
-- Adaptive B-frame placement
-- B-frames as references / arbitrary frame order
-- 8x8 and 4x4 adaptive spatial transform
-- Lossless mode
-- Custom quantization matrices
-- Parallel encoding of multiple slices (currently disabled)
-
-Be aware that the x264 library is still in early development stage. The
-command line tool x264 can handle only raw YUV 4:2:0 streams at the
-moment so please use mencoder or another tool that supports x264 library
-for all other file types.
-
-%package -n %{realname}-%{soname}
-Summary: A free h264/avc encoder - encoder binary
-Group: Productivity/Multimedia/Video
-
-%description -n %{realname}-%{soname}
-x264 is a free library for encoding next-generation H264/AVC video
-streams. The code is written from scratch by Laurent Aimar, Loren
-Merritt, Eric Petit (OS X), Min Chen (vfw/asm), Justin Clay (vfw), Mans
-Rullgard, Radek Czyz, Christian Heine (asm), Alex Izvorski (asm), and
-Alex Wright. It is released under the terms of the GPL license. This
-package contains a static library and a header needed for the
-development with libx264. This library is needed to build
-mplayer/mencoder with H264 encoding support.
-
-
-%package -n %{realname}-devel
-Summary: Libraries and include file for the x264 encoder.
-Group: Development/Libraries/C and C++
-Requires: %{realname}-%{soname} = %{version}-%{release}
-Requires: %{buildrequires}
-
-%description -n %{realname}-devel
-x264 is a free library for encoding next-generation H264/AVC video
-streams. The code is written from scratch by Laurent Aimar, Loren
-Merritt, Eric Petit (OS X), Min Chen (vfw/asm), Justin Clay (vfw), Mans
-Rullgard, Radek Czyz, Christian Heine (asm), Alex Izvorski (asm), and
-Alex Wright. It is released under the terms of the GPL license. This
-package contains a static library and a header needed for the
-development with libx264. This library is needed to build
-mplayer/mencoder with H264 encoding support.
-
-%prep
-%setup -q -n x264-snapshot-%{svn}-2245
-
-
-%build
-%{configure} --enable-shared --enable-pic
-#TODO: to compile with --enable-mp4-output gpac is needed, this should be added in the future...
-%{__make} %{?jobs:-j%{jobs}}
-
-
-%install
-%makeinstall
-
-rm $RPM_BUILD_ROOT/%{_libdir}/libx264.so
-cd $RPM_BUILD_ROOT/%{_libdir}
-ln -s libx264.so.%{soname} libx264.so
-
-
-%clean
-%__rm -rf %{buildroot}
-
-
-%files
-%defattr(755,root,root)
-%doc doc/*.txt
-%{_bindir}/x264
-
-%files -n %{realname}-%{soname}
-%defattr(755,root,root)
-%{_libdir}/libx264.so.%{soname}
-
-
-%files -n %{realname}-devel
-%defattr(755,root,root)
-%{_libdir}/pkgconfig/x264.pc
-%{_includedir}/x264.h
-%{_libdir}/libx264.so
-%{_libdir}/libx264.a
-
-
-%changelog
-* Sun Sep 30 2007 Carsten Schoene <cs@linux-administrator.com>
-- import for SLE_10 build
-
|