Changes of Revision 6
Changed: x264.spec
 # norootforbuild

-%define soname 94
-%define svn 20100425
+%define soname 95
+%define svn 20100517
 %define binname x264
 %define realname libx264

 Name: %{binname}
 Summary: A free h264/avc encoder - encoder binary
 Version: 0.%{soname}svn%{svn}
-Release: 3
+Release: 1
 License: GNU General Public License (GPL)
 Group: Productivity/Multimedia/Video/Editors and Convertors
 URL: http://developers.videolan.org/x264.html

 BuildRoot: %{_tmppath}/build-root-%{name}
 Requires: %{realname}-%{soname} = %{version}-%{release}
 BuildRequires: nasm
-BuildRequires: yasm >= 0.7.1
+BuildRequires: yasm >= 1.0.1

 %description
 x264 is a free library for encoding next-generation H264/AVC video

 %__make %{?jobs:-j%{jobs}}

 %install
-%makeinstall
+%__make DESTDIR=%{buildroot} install

 %{__rm} -f "%{buildroot}/%{_libdir}/libx264.so"
 %{__rm} -f "%{buildroot}/%{_libdir}/libx264.a"

 %{_libdir}/libx264.so

 %changelog
-* Wed May 05 2010 Carsten Schoene <cs@linux-administrator.com> - 0.94svn20100425
+* Wed May 05 2010 Carsten Schoene <cs@linux-administrator.com> - 0.95svn20100517-1
 - EL5 import
-
Changed: x264-snapshot-20100517-2245.tar.bz2/Makefile
@@ -8,6 +8,7 @@
        common/frame.c common/dct.c common/cpu.c common/cabac.c \
        common/common.c common/mdate.c common/rectangle.c \
        common/set.c common/quant.c common/deblock.c common/vlc.c \
+       common/mvpred.c \
        encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
        encoder/set.c encoder/macroblock.c encoder/cabac.c \
        encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
@@ -49,8 +50,8 @@
 # MMX/SSE optims
 ifneq ($(AS),)
-X86SRC0 = cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm mc-a2.asm \
-          pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
+X86SRC0 = const-a.asm cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm \
+          mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
           cpu-a.asm dct-32.asm
 X86SRC = $(X86SRC0:%=common/x86/%)
Changed: x264-snapshot-20100517-2245.tar.bz2/common/arm/mc-c.c
@@ -112,8 +112,8 @@
     x264_mc_copy_w16_neon,
 };

-static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
-static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
+static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};

 static void mc_luma_neon( uint8_t *dst, int i_dst_stride,
                           uint8_t *src[4], int i_src_stride,
Changed: x264-snapshot-20100517-2245.tar.bz2/common/cabac.c
@@ -664,75 +664,44 @@
     }
 };

-/* FIXME could avoid this duplication by reversing the order of states
- * with MPS=0, but that would uglify the other tables */
-const uint8_t x264_cabac_range_lps[128][4] =
-{
-    { 2, 2, 2, 2 },
-    { 6, 7, 8, 9 }, { 6, 7, 9, 10 }, { 6, 8, 9, 11 },
-    { 7, 8, 10, 11 }, { 7, 9, 10, 12 }, { 7, 9, 11, 12 },
-    { 8, 9, 11, 13 }, { 8, 10, 12, 14 }, { 9, 11, 12, 14 },
-    { 9, 11, 13, 15 }, { 10, 12, 14, 16 }, { 10, 12, 15, 17 },
-    { 11, 13, 15, 18 }, { 11, 14, 16, 19 }, { 12, 14, 17, 20 },
-    { 12, 15, 18, 21 }, { 13, 16, 19, 22 }, { 14, 17, 20, 23 },
-    { 14, 18, 21, 24 }, { 15, 19, 22, 25 }, { 16, 20, 23, 27 },
-    { 17, 21, 25, 28 }, { 18, 22, 26, 30 }, { 19, 23, 27, 31 },
-    { 20, 24, 29, 33 }, { 21, 26, 30, 35 }, { 22, 27, 32, 37 },
-    { 23, 28, 33, 39 }, { 24, 30, 35, 41 }, { 26, 31, 37, 43 },
-    { 27, 33, 39, 45 }, { 29, 35, 41, 48 }, { 30, 37, 43, 50 },
-    { 32, 39, 46, 53 }, { 33, 41, 48, 56 }, { 35, 43, 51, 59 },
-    { 37, 45, 54, 62 }, { 39, 48, 56, 65 }, { 41, 50, 59, 69 },
-    { 43, 53, 63, 72 }, { 46, 56, 66, 76 }, { 48, 59, 69, 80 },
-    { 51, 62, 73, 85 }, { 53, 65, 77, 89 }, { 56, 69, 81, 94 },
-    { 59, 72, 86, 99 }, { 62, 76, 90, 104 }, { 66, 80, 95, 110 },
-    { 69, 85, 100, 116 }, { 73, 89, 105, 122 }, { 77, 94, 111, 128 },
-    { 81, 99, 117, 135 }, { 85, 104, 123, 142 }, { 90, 110, 130, 150 },
-    { 95, 116, 137, 158 }, { 100, 122, 144, 166 }, { 105, 128, 152, 175 },
-    { 111, 135, 160, 185 }, { 116, 142, 169, 195 }, { 123, 150, 178, 205 },
-    { 128, 158, 187, 216 }, { 128, 167, 197, 227 }, { 128, 176, 208, 240 },
-
-    { 128, 176, 208, 240 }, { 128, 167, 197, 227 }, { 128, 158, 187, 216 },
-    { 123, 150, 178, 205 }, { 116, 142, 169, 195 }, { 111, 135, 160, 185 },
-    { 105, 128, 152, 175 }, { 100, 122, 144, 166 }, { 95, 116, 137, 158 },
-    { 90, 110, 130, 150 }, { 85, 104, 123, 142 }, { 81, 99, 117, 135 },
-    { 77, 94, 111, 128 }, { 73, 89, 105, 122 }, { 69, 85, 100, 116 },
-    { 66, 80, 95, 110 }, { 62, 76, 90, 104 }, { 59, 72, 86, 99 },
-    { 56, 69, 81, 94 }, { 53, 65, 77, 89 }, { 51, 62, 73, 85 },
-    { 48, 59, 69, 80 }, { 46, 56, 66, 76 }, { 43, 53, 63, 72 },
-    { 41, 50, 59, 69 }, { 39, 48, 56, 65 }, { 37, 45, 54, 62 },
-    { 35, 43, 51, 59 }, { 33, 41, 48, 56 }, { 32, 39, 46, 53 },
-    { 30, 37, 43, 50 }, { 29, 35, 41, 48 }, { 27, 33, 39, 45 },
-    { 26, 31, 37, 43 }, { 24, 30, 35, 41 }, { 23, 28, 33, 39 },
-    { 22, 27, 32, 37 }, { 21, 26, 30, 35 }, { 20, 24, 29, 33 },
-    { 19, 23, 27, 31 }, { 18, 22, 26, 30 }, { 17, 21, 25, 28 },
-    { 16, 20, 23, 27 }, { 15, 19, 22, 25 }, { 14, 18, 21, 24 },
-    { 14, 17, 20, 23 }, { 13, 16, 19, 22 }, { 12, 15, 18, 21 },
-    { 12, 14, 17, 20 }, { 11, 14, 16, 19 }, { 11, 13, 15, 18 },
-    { 10, 12, 15, 17 }, { 10, 12, 14, 16 }, { 9, 11, 13, 15 },
-    { 9, 11, 12, 14 }, { 8, 10, 12, 14 }, { 8, 9, 11, 13 },
-    { 7, 9, 11, 12 }, { 7, 9, 10, 12 }, { 7, 8, 10, 11 },
-    { 6, 8, 9, 11 }, { 6, 7, 9, 10 }, { 6, 7, 8, 9 },
-    { 2, 2, 2, 2 },
+const uint8_t x264_cabac_range_lps[64][4] =
+{
+    {  2,  2,  2,  2}, {  6,  7,  8,  9}, {  6,  7,  9, 10}, {  6,  8,  9, 11},
+    {  7,  8, 10, 11}, {  7,  9, 10, 12}, {  7,  9, 11, 12}, {  8,  9, 11, 13},
+    {  8, 10, 12, 14}, {  9, 11, 12, 14}, {  9, 11, 13, 15}, { 10, 12, 14, 16},
+    { 10, 12, 15, 17}, { 11, 13, 15, 18}, { 11, 14, 16, 19}, { 12, 14, 17, 20},
+    { 12, 15, 18, 21}, { 13, 16, 19, 22}, { 14, 17, 20, 23}, { 14, 18, 21, 24},
+    { 15, 19, 22, 25}, { 16, 20, 23, 27}, { 17, 21, 25, 28}, { 18, 22, 26, 30},
+    { 19, 23, 27, 31}, { 20, 24, 29, 33}, { 21, 26, 30, 35}, { 22, 27, 32, 37},
+    { 23, 28, 33, 39}, { 24, 30, 35, 41}, { 26, 31, 37, 43}, { 27, 33, 39, 45},
+    { 29, 35, 41, 48}, { 30, 37, 43, 50}, { 32, 39, 46, 53}, { 33, 41, 48, 56},
+    { 35, 43, 51, 59}, { 37, 45, 54, 62}, { 39, 48, 56, 65}, { 41, 50, 59, 69},
+    { 43, 53, 63, 72}, { 46, 56, 66, 76}, { 48, 59, 69, 80}, { 51, 62, 73, 85},
+    { 53, 65, 77, 89}, { 56, 69, 81, 94}, { 59, 72, 86, 99}, { 62, 76, 90, 104},
+    { 66, 80, 95, 110}, { 69, 85, 100, 116}, { 73, 89, 105, 122}, { 77, 94, 111, 128},
+    { 81, 99, 117, 135}, { 85, 104, 123, 142}, { 90, 110, 130, 150}, { 95, 116, 137, 158},
+    {100, 122, 144, 166}, {105, 128, 152, 175}, {111, 135, 160, 185}, {116, 142, 169, 195},
+    {123, 150, 178, 205}, {128, 158, 187, 216}, {128, 167, 197, 227}, {128, 176, 208, 240}
 };

 const uint8_t x264_cabac_transition[128][2] =
 {
-    {  0,  0}, {  1, 25}, {  1, 25}, {  2, 26}, {  3, 26}, {  4, 26}, {  5, 27}, {  6, 27},
-    {  7, 27}, {  8, 28}, {  9, 28}, { 10, 28}, { 11, 29}, { 12, 29}, { 13, 30}, { 14, 30},
-    { 15, 30}, { 16, 31}, { 17, 31}, { 18, 32}, { 19, 33}, { 20, 33}, { 21, 33}, { 22, 34},
-    { 23, 34}, { 24, 35}, { 25, 36}, { 26, 36}, { 27, 37}, { 28, 37}, { 29, 38}, { 30, 39},
-    { 31, 39}, { 32, 40}, { 33, 41}, { 34, 41}, { 35, 42}, { 36, 42}, { 37, 44}, { 38, 44},
-    { 39, 45}, { 40, 45}, { 41, 47}, { 42, 47}, { 43, 48}, { 44, 48}, { 45, 50}, { 46, 50},
-    { 47, 51}, { 48, 52}, { 49, 52}, { 50, 54}, { 51, 54}, { 52, 55}, { 53, 56}, { 54, 57},
-    { 55, 58}, { 56, 59}, { 57, 59}, { 58, 61}, { 59, 61}, { 60, 62}, { 61, 63}, { 62, 64},
-    { 63, 65}, { 64, 66}, { 65, 67}, { 66, 68}, { 66, 69}, { 68, 70}, { 68, 71}, { 69, 72},
-    { 70, 73}, { 71, 74}, { 72, 75}, { 73, 76}, { 73, 77}, { 75, 78}, { 75, 79}, { 76, 80},
-    { 77, 81}, { 77, 82}, { 79, 83}, { 79, 84}, { 80, 85}, { 80, 86}, { 82, 87}, { 82, 88},
-    { 83, 89}, { 83, 90}, { 85, 91}, { 85, 92}, { 86, 93}, { 86, 94}, { 87, 95}, { 88, 96},
-    { 88, 97}, { 89, 98}, { 90, 99}, { 90,100}, { 91,101}, { 91,102}, { 92,103}, { 93,104},
-    { 93,105}, { 94,106}, { 94,107}, { 94,108}, { 95,109}, { 96,110}, { 96,111}, { 97,112},
-    { 97,113}, { 97,114}, { 98,115}, { 98,116}, { 99,117}, { 99,118}, { 99,119}, {100,120},
-    {100,121}, {100,122}, {101,123}, {101,124}, {101,125}, {102,126}, {102,126}, {127,127},
+    {  0,  0}, {  1,  1}, {  2, 50}, { 51,  3}, {  2, 50}, { 51,  3}, {  4, 52}, { 53,  5},
+    {  6, 52}, { 53,  7}, {  8, 52}, { 53,  9}, { 10, 54}, { 55, 11}, { 12, 54}, { 55, 13},
+    { 14, 54}, { 55, 15}, { 16, 56}, { 57, 17}, { 18, 56}, { 57, 19}, { 20, 56}, { 57, 21},
+    { 22, 58}, { 59, 23}, { 24, 58}, { 59, 25}, { 26, 60}, { 61, 27}, { 28, 60}, { 61, 29},
+    { 30, 60}, { 61, 31}, { 32, 62}, { 63, 33}, { 34, 62}, { 63, 35}, { 36, 64}, { 65, 37},
+    { 38, 66}, { 67, 39}, { 40, 66}, { 67, 41}, { 42, 66}, { 67, 43}, { 44, 68}, { 69, 45},
+    { 46, 68}, { 69, 47}, { 48, 70}, { 71, 49}, { 50, 72}, { 73, 51}, { 52, 72}, { 73, 53},
+    { 54, 74}, { 75, 55}, { 56, 74}, { 75, 57}, { 58, 76}, { 77, 59}, { 60, 78}, { 79, 61},
+    { 62, 78}, { 79, 63}, { 64, 80}, { 81, 65}, { 66, 82}, { 83, 67}, { 68, 82}, { 83, 69},
+    { 70, 84}, { 85, 71}, { 72, 84}, { 85, 73}, { 74, 88}, { 89, 75}, { 76, 88}, { 89, 77},
+    { 78, 90}, { 91, 79}, { 80, 90}, { 91, 81}, { 82, 94}, { 95, 83}, { 84, 94}, { 95, 85},
+    { 86, 96}, { 97, 87}, { 88, 96}, { 97, 89}, { 90, 100}, {101, 91}, { 92, 100}, {101, 93},
+    { 94, 102}, {103, 95}, { 96, 104}, {105, 97}, { 98, 104}, {105, 99}, {100, 108}, {109, 101},
+    {102, 108}, {109, 103}, {104, 110}, {111, 105}, {106, 112}, {113, 107}, {108, 114}, {115, 109},
+    {110, 116}, {117, 111}, {112, 118}, {119, 113}, {114, 118}, {119, 115}, {116, 122}, {123, 117},
+    {118, 122}, {123, 119}, {120, 124}, {125, 121}, {122, 126}, {127, 123}, {124, 127}, {126, 125}
 };

 const uint8_t x264_cabac_renorm_shift[64]= {
@@ -743,41 +712,40 @@
 };

 /* -ln2(probability) */
-#define F(a,b) {FIX8(a),FIX8(b)}
-const uint16_t x264_cabac_entropy[128][2] =
+const uint16_t x264_cabac_entropy[128] =
 {
-    F(0.0273,5.7370), F(0.0288,5.6618), F(0.0303,5.5866), F(0.0320,5.5114),
-    F(0.0337,5.4362), F(0.0355,5.3610), F(0.0375,5.2859), F(0.0395,5.2106),
-    F(0.0416,5.1354), F(0.0439,5.0602), F(0.0463,4.9851), F(0.0488,4.9099),
-    F(0.0515,4.8347), F(0.0543,4.7595), F(0.0572,4.6843), F(0.0604,4.6091),
-    F(0.0637,4.5339), F(0.0671,4.4588), F(0.0708,4.3836), F(0.0747,4.3083),
-    F(0.0788,4.2332), F(0.0832,4.1580), F(0.0878,4.0828), F(0.0926,4.0076),
-    F(0.0977,3.9324), F(0.1032,3.8572), F(0.1089,3.7820), F(0.1149,3.7068),
-    F(0.1214,3.6316), F(0.1282,3.5565), F(0.1353,3.4813), F(0.1429,3.4061),
-    F(0.1510,3.3309), F(0.1596,3.2557), F(0.1686,3.1805), F(0.1782,3.1053),
-    F(0.1884,3.0301), F(0.1992,2.9549), F(0.2107,2.8797), F(0.2229,2.8046),
-    F(0.2358,2.7294), F(0.2496,2.6542), F(0.2642,2.5790), F(0.2798,2.5038),
-    F(0.2964,2.4286), F(0.3142,2.3534), F(0.3331,2.2782), F(0.3532,2.2030),
-    F(0.3748,2.1278), F(0.3979,2.0527), F(0.4226,1.9775), F(0.4491,1.9023),
-    F(0.4776,1.8271), F(0.5082,1.7519), F(0.5412,1.6767), F(0.5768,1.6015),
-    F(0.6152,1.5263), F(0.6568,1.4511), F(0.7020,1.3759), F(0.7513,1.3008),
-    F(0.8050,1.2256), F(0.8638,1.1504), F(0.9285,1.0752), F(1.0000,1.0000),
-    F(1.0000,1.0000), F(1.0752,0.9285), F(1.1504,0.8638), F(1.2256,0.8050),
-    F(1.3008,0.7513), F(1.3759,0.7020), F(1.4511,0.6568), F(1.5263,0.6152),
-    F(1.6015,0.5768), F(1.6767,0.5412), F(1.7519,0.5082), F(1.8271,0.4776),
-    F(1.9023,0.4491), F(1.9775,0.4226), F(2.0527,0.3979), F(2.1278,0.3748),
-    F(2.2030,0.3532), F(2.2782,0.3331), F(2.3534,0.3142), F(2.4286,0.2964),
-    F(2.5038,0.2798), F(2.5790,0.2642), F(2.6542,0.2496), F(2.7294,0.2358),
-    F(2.8046,0.2229), F(2.8797,0.2107), F(2.9549,0.1992), F(3.0301,0.1884),
-    F(3.1053,0.1782), F(3.1805,0.1686), F(3.2557,0.1596), F(3.3309,0.1510),
-    F(3.4061,0.1429), F(3.4813,0.1353), F(3.5565,0.1282), F(3.6316,0.1214),
-    F(3.7068,0.1149), F(3.7820,0.1089), F(3.8572,0.1032), F(3.9324,0.0977),
-    F(4.0076,0.0926), F(4.0828,0.0878), F(4.1580,0.0832), F(4.2332,0.0788),
-    F(4.3083,0.0747), F(4.3836,0.0708), F(4.4588,0.0671), F(4.5339,0.0637),
-    F(4.6091,0.0604), F(4.6843,0.0572), F(4.7595,0.0543), F(4.8347,0.0515),
-    F(4.9099,0.0488), F(4.9851,0.0463), F(5.0602,0.0439), F(5.1354,0.0416),
-    F(5.2106,0.0395), F(5.2859,0.0375), F(5.3610,0.0355), F(5.4362,0.0337),
-    F(5.5114,0.0320), F(5.5866,0.0303), F(5.6618,0.0288), F(5.7370,0.0273),
+    FIX8(0.0273), FIX8(5.7370), FIX8(0.0288), FIX8(5.6618),
+    FIX8(0.0303), FIX8(5.5866), FIX8(0.0320), FIX8(5.5114),
+    FIX8(0.0337), FIX8(5.4362), FIX8(0.0355), FIX8(5.3610),
+    FIX8(0.0375), FIX8(5.2859), FIX8(0.0395), FIX8(5.2106),
+    FIX8(0.0416), FIX8(5.1354), FIX8(0.0439), FIX8(5.0602),
+    FIX8(0.0463), FIX8(4.9851), FIX8(0.0488), FIX8(4.9099),
+    FIX8(0.0515), FIX8(4.8347), FIX8(0.0543), FIX8(4.7595),
+    FIX8(0.0572), FIX8(4.6843), FIX8(0.0604), FIX8(4.6091),
+    FIX8(0.0637), FIX8(4.5339), FIX8(0.0671), FIX8(4.4588),
+    FIX8(0.0708), FIX8(4.3836), FIX8(0.0747), FIX8(4.3083),
+    FIX8(0.0788), FIX8(4.2332), FIX8(0.0832), FIX8(4.1580),
+    FIX8(0.0878), FIX8(4.0828), FIX8(0.0926), FIX8(4.0076),
+    FIX8(0.0977), FIX8(3.9324), FIX8(0.1032), FIX8(3.8572),
+    FIX8(0.1089), FIX8(3.7820), FIX8(0.1149), FIX8(3.7068),
+    FIX8(0.1214), FIX8(3.6316), FIX8(0.1282), FIX8(3.5565),
+    FIX8(0.1353), FIX8(3.4813), FIX8(0.1429), FIX8(3.4061),
+    FIX8(0.1510), FIX8(3.3309), FIX8(0.1596), FIX8(3.2557),
+    FIX8(0.1686), FIX8(3.1805), FIX8(0.1782), FIX8(3.1053),
+    FIX8(0.1884), FIX8(3.0301), FIX8(0.1992), FIX8(2.9549),
+    FIX8(0.2107), FIX8(2.8797), FIX8(0.2229), FIX8(2.8046),
+    FIX8(0.2358), FIX8(2.7294), FIX8(0.2496), FIX8(2.6542),
+    FIX8(0.2642), FIX8(2.5790), FIX8(0.2798), FIX8(2.5038),
+    FIX8(0.2964), FIX8(2.4286), FIX8(0.3142), FIX8(2.3534),
+    FIX8(0.3331), FIX8(2.2782), FIX8(0.3532), FIX8(2.2030),
+    FIX8(0.3748), FIX8(2.1278), FIX8(0.3979), FIX8(2.0527),
+    FIX8(0.4226), FIX8(1.9775), FIX8(0.4491), FIX8(1.9023),
+    FIX8(0.4776), FIX8(1.8271), FIX8(0.5082), FIX8(1.7519),
+    FIX8(0.5412), FIX8(1.6767), FIX8(0.5768), FIX8(1.6015),
+    FIX8(0.6152), FIX8(1.5263), FIX8(0.6568), FIX8(1.4511),
+    FIX8(0.7020), FIX8(1.3759), FIX8(0.7513), FIX8(1.3008),
+    FIX8(0.8050), FIX8(1.2256), FIX8(0.8638), FIX8(1.1504),
+    FIX8(0.9285), FIX8(1.0752), FIX8(1.0000), FIX8(1.0000)
 };
@@ -794,14 +762,17 @@
         cabac_context_init = &x264_cabac_context_init_PB[i_model];

     for( int i = 0; i < 460; i++ )
-        cb->state[i] = x264_clip3( (((*cabac_context_init)[i][0] * i_qp) >> 4) + (*cabac_context_init)[i][1], 1, 126 );
+    {
+        int state = x264_clip3( (((*cabac_context_init)[i][0] * i_qp) >> 4) + (*cabac_context_init)[i][1], 1, 126 );
+        cb->state[i] = (X264_MIN( state, 127-state ) << 1) | (state >> 6);
+    }
 }

 void x264_cabac_encode_init( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end )
 {
     cb->i_low = 0;
     cb->i_range = 0x01FE;
-    cb->i_queue = -1; // the first bit will be shifted away and not written
+    cb->i_queue = -9; // the first bit will be shifted away and not written
     cb->i_bytes_outstanding = 0;
     cb->p_start = p_data;
     cb->p = p_data;
@@ -810,10 +781,10 @@

 static inline void x264_cabac_putbyte( x264_cabac_t *cb )
 {
-    if( cb->i_queue >= 8 )
+    if( cb->i_queue >= 0 )
     {
-        int out = cb->i_low >> (cb->i_queue+2);
-        cb->i_low &= (4<<cb->i_queue)-1;
+        int out = cb->i_low >> (cb->i_queue+10);
+        cb->i_low &= (0x400<<cb->i_queue)-1;
         cb->i_queue -= 8;

         if( (out & 0xff) == 0xff )
@@ -855,9 +826,9 @@
 void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b )
 {
     int i_state = cb->state[i_ctx];
-    int i_range_lps = x264_cabac_range_lps[i_state][(cb->i_range>>6)-4];
+    int i_range_lps = x264_cabac_range_lps[i_state>>1][(cb->i_range>>6)-4];
     cb->i_range -= i_range_lps;
-    if( b != (i_state >> 6) )
+    if( b != (i_state & 1) )
     {
         cb->i_low += cb->i_range;
         cb->i_range = i_range_lps;
@@ -866,7 +837,7 @@
     x264_cabac_encode_renorm( cb );
 }

-void x264_cabac_encode_bypass( x264_cabac_t *cb, int b )
+void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b )
 {
     cb->i_low <<= 1;
     cb->i_low += -b & cb->i_range;
@@ -892,7 +863,7 @@
     } while( k > 0 );
 }

-void x264_cabac_encode_terminal( x264_cabac_t *cb )
+void x264_cabac_encode_terminal_c( x264_cabac_t *cb )
 {
     cb->i_range -= 2;
     x264_cabac_encode_renorm( cb );
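Note on the cabac.c change above: it repacks the 7-bit CABAC state. Previously the state ran 0..127 with the MPS in bit 6, which forced the mirrored 128-row LPS table the removed FIXME complains about; now x264_cabac_context_init stores the MPS in bit 0 and the probability index in bits 1..6, so the LPS table shrinks to 64 rows, the MPS test in x264_cabac_encode_decision_c becomes "state & 1", and the table row is "state >> 1". A minimal sketch of the conversion (the helper name is illustrative, not from the patch):

#include <stdint.h>

/* Old layout: s in 0..127, MPS = s>>6, LPS table mirrored so that
 * rows s and 127-s are identical.
 * New layout: MPS in bit 0, probability index in bits 1..6. */
static uint8_t pack_cabac_state( int s ) /* old-style state, 1..126 */
{
    int prob = s <= 63 ? s : 127 - s;   /* == X264_MIN( s, 127-s ) */
    int mps  = s >> 6;
    return (uint8_t)( (prob << 1) | mps );
}

This is also the packing the new cabac-a.asm variants (const-a.asm in the Makefile change) appear to rely on: testing and updating the state touch only the low bits.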
Changed: x264-snapshot-20100517-2245.tar.bz2/common/cabac.h
@@ -31,7 +31,7 @@
     int i_range;

     /* bit stream */
-    int i_queue;
+    int i_queue; //stored with an offset of -8 for faster asm
     int i_bytes_outstanding;

     uint8_t *p_start;
@@ -46,7 +46,7 @@
 } x264_cabac_t;

 extern const uint8_t x264_cabac_transition[128][2];
-extern const uint16_t x264_cabac_entropy[128][2];
+extern const uint16_t x264_cabac_entropy[128];

 /* init the contexts given i_slice_type, the quantif and the model */
 void x264_cabac_context_init( x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model );
@@ -55,15 +55,21 @@
 void x264_cabac_encode_init ( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end );
 void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b );
 void x264_cabac_encode_decision_asm( x264_cabac_t *cb, int i_ctx, int b );
-void x264_cabac_encode_bypass( x264_cabac_t *cb, int b );
+void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b );
+void x264_cabac_encode_bypass_asm( x264_cabac_t *cb, int b );
+void x264_cabac_encode_terminal_c( x264_cabac_t *cb );
+void x264_cabac_encode_terminal_asm( x264_cabac_t *cb );
 void x264_cabac_encode_ue_bypass( x264_cabac_t *cb, int exp_bits, int val );
-void x264_cabac_encode_terminal( x264_cabac_t *cb );
 void x264_cabac_encode_flush( x264_t *h, x264_cabac_t *cb );

 #ifdef HAVE_MMX
 #define x264_cabac_encode_decision x264_cabac_encode_decision_asm
+#define x264_cabac_encode_bypass x264_cabac_encode_bypass_asm
+#define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm
 #else
 #define x264_cabac_encode_decision x264_cabac_encode_decision_c
+#define x264_cabac_encode_bypass x264_cabac_encode_bypass_c
+#define x264_cabac_encode_terminal x264_cabac_encode_terminal_c
 #endif
 #define x264_cabac_encode_decision_noup x264_cabac_encode_decision
@@ -78,25 +84,25 @@
 {
     int i_state = cb->state[i_ctx];
     cb->state[i_ctx] = x264_cabac_transition[i_state][b];
-    cb->f8_bits_encoded += x264_cabac_entropy[i_state][b];
+    cb->f8_bits_encoded += x264_cabac_entropy[i_state^b];
 }

 static ALWAYS_INLINE int x264_cabac_size_decision2( uint8_t *state, long b )
 {
     int i_state = *state;
     *state = x264_cabac_transition[i_state][b];
-    return x264_cabac_entropy[i_state][b];
+    return x264_cabac_entropy[i_state^b];
 }

 static ALWAYS_INLINE void x264_cabac_size_decision_noup( x264_cabac_t *cb, long i_ctx, long b )
 {
     int i_state = cb->state[i_ctx];
-    cb->f8_bits_encoded += x264_cabac_entropy[i_state][b];
+    cb->f8_bits_encoded += x264_cabac_entropy[i_state^b];
 }

 static ALWAYS_INLINE int x264_cabac_size_decision_noup2( uint8_t *state, long b )
 {
-    return x264_cabac_entropy[*state^b];
+    return x264_cabac_entropy[*state^b];
 }

 #endif
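The entropy table loses a dimension for the same reason: with the MPS in bit 0 of the packed state, x264_cabac_entropy interleaves MPS and LPS bit costs, and "i_state^b" selects between them with no second index. A toy check of that property, using the first two probability indices (values copied from the new table above; everything else is made up for illustration):

#include <stdint.h>
#include <assert.h>

#define FIX8(f) ((int)((f)*256.0 + 0.5)) /* same Q8 fixed point as x264 */

/* entropy[2*prob]   = cost of coding the MPS (cheap)
 * entropy[2*prob+1] = cost of coding the LPS (expensive) */
static const uint16_t entropy[4] =
{
    FIX8(0.0273), FIX8(5.7370),
    FIX8(0.0288), FIX8(5.6618),
};

int main( void )
{
    int i_state = (1 << 1) | 1; /* packed state: probability index 1, MPS = 1 */
    /* b == MPS: the XOR clears bit 0, landing on the cheap even entry */
    assert( entropy[i_state^1] == FIX8(0.0288) );
    /* b != MPS: the XOR sets bit 0, landing on the expensive odd entry */
    assert( entropy[i_state^0] == FIX8(5.6618) );
    return 0;
}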
Changed: x264-snapshot-20100517-2245.tar.bz2/common/common.c
@@ -22,7 +22,6 @@
 *****************************************************************************/

 #include "common.h"
-#include "cpu.h"

 #include <stdarg.h>
 #include <ctype.h>
@@ -1225,11 +1224,11 @@
     s += sprintf( s, " bframes=%d", p->i_bframe );
     if( p->i_bframe )
     {
-        s += sprintf( s, " b_pyramid=%d b_adapt=%d b_bias=%d direct=%d wpredb=%d",
+        s += sprintf( s, " b_pyramid=%d b_adapt=%d b_bias=%d direct=%d weightb=%d",
                       p->i_bframe_pyramid, p->i_bframe_adaptive, p->i_bframe_bias,
                       p->analyse.i_direct_mv_pred, p->analyse.b_weighted_bipred );
     }
-    s += sprintf( s, " wpredp=%d", p->analyse.i_weighted_pred > 0 ? p->analyse.i_weighted_pred : 0 );
+    s += sprintf( s, " weightp=%d", p->analyse.i_weighted_pred > 0 ? p->analyse.i_weighted_pred : 0 );

     s += sprintf( s, " keyint=%d keyint_min=%d scenecut=%d intra_refresh=%d",
                   p->i_keyint_max, p->i_keyint_min, p->i_scenecut_threshold, p->b_intra_refresh );
@@ -1238,7 +1237,7 @@
         s += sprintf( s, " rc_lookahead=%d", p->rc.i_lookahead );

     s += sprintf( s, " rc=%s mbtree=%d", p->rc.i_rc_method == X264_RC_ABR ?
-                  ( p->rc.b_stat_read ? "2pass" : p->rc.i_vbv_buffer_size == p->rc.i_bitrate ? "cbr" : "abr" )
+                  ( p->rc.b_stat_read ? "2pass" : p->rc.i_vbv_max_bitrate == p->rc.i_bitrate ? "cbr" : "abr" )
                  : p->rc.i_rc_method == X264_RC_CRF ? "crf" : "cqp", p->rc.b_mb_tree );
     if( p->rc.i_rc_method == X264_RC_ABR || p->rc.i_rc_method == X264_RC_CRF )
     {
@@ -1257,7 +1256,7 @@
             s += sprintf( s, " vbv_maxrate=%d vbv_bufsize=%d",
                           p->rc.i_vbv_max_bitrate, p->rc.i_vbv_buffer_size );
         if( p->rc.i_rc_method == X264_RC_CRF )
-            s += sprintf( s, " crf-max=%.1f", p->rc.f_rf_constant_max );
+            s += sprintf( s, " crf_max=%.1f", p->rc.f_rf_constant_max );
     }
     else if( p->rc.i_rc_method == X264_RC_CQP )
Changed: x264-snapshot-20100517-2245.tar.bz2/common/common.h
@@ -110,6 +110,7 @@
 #include "dct.h"
 #include "cabac.h"
 #include "quant.h"
+#include "cpu.h"

 /****************************************************************************
 * General functions
@@ -188,14 +189,14 @@
     return amvd0 + (amvd1<<8);
 }

-static void ALWAYS_INLINE x264_predictor_roundclip( int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
+static void ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
 {
     for( int i = 0; i < i_mvc; i++ )
     {
         int mx = (mvc[i][0] + 2) >> 2;
         int my = (mvc[i][1] + 2) >> 2;
-        mvc[i][0] = x264_clip3( mx, mv_x_min, mv_x_max );
-        mvc[i][1] = x264_clip3( my, mv_y_min, mv_y_max );
+        dst[i][0] = x264_clip3( mx, mv_x_min, mv_x_max );
+        dst[i][1] = x264_clip3( my, mv_y_min, mv_y_max );
     }
 }

@@ -407,6 +408,8 @@
     int i_coded_fields_lookahead; /* Use separate counters for lookahead */
     int i_cpb_delay_lookahead;

+    int b_queued_intra_refresh;
+
     /* We use only one SPS and one PPS */
     x264_sps_t sps_array[1];
     x264_sps_t *sps;
@@ -658,7 +661,7 @@
         ALIGNED_8( int8_t intra4x4_pred_mode[X264_SCAN8_LUMA_SIZE] );

         /* i_non_zero_count if available else 0x80 */
-        ALIGNED_4( uint8_t non_zero_count[X264_SCAN8_SIZE] );
+        ALIGNED_16( uint8_t non_zero_count[X264_SCAN8_SIZE] );

         /* -1 if unused, -2 if unavailable */
         ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
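The x264_predictor_roundclip change is an in-place-to-out-of-place rewrite: candidates are now written to a separate dst array, which lets callers keep the original quarter-pel predictors as well (the diff alone does not show the call sites, so that motivation is an inference). The rounding itself is unchanged; a quick sanity check of what (mv + 2) >> 2 does, assuming quarter-pel units:

#include <stdio.h>

int main( void )
{
    /* quarter-pel -> full-pel, rounding to nearest (ties toward +inf) */
    int qpel[5] = { -3, -2, 0, 2, 5 };
    for( int i = 0; i < 5; i++ )
        printf( "%d qpel -> %d fpel\n", qpel[i], (qpel[i] + 2) >> 2 );
    /* -3 (-0.75 pel) -> -1;  2 (0.5 pel) -> 1;  5 (1.25 pel) -> 1 */
    return 0;
}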
Changed: x264-snapshot-20100517-2245.tar.bz2/common/cpu.c
@@ -87,8 +87,8 @@
 #endif

 #ifdef HAVE_MMX
-extern int x264_cpu_cpuid_test( void );
-extern uint32_t x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
+int x264_cpu_cpuid_test( void );
+uint32_t x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );

 uint32_t x264_cpu_detect( void )
 {
@@ -324,13 +324,6 @@

 #endif

-#ifndef HAVE_MMX
-void x264_emms( void )
-{
-}
-#endif
-
-
 int x264_cpu_num_processors( void )
 {
 #if !defined(HAVE_PTHREAD)
Changed: x264-snapshot-20100517-2245.tar.bz2/common/cpu.h
@@ -23,7 +23,14 @@

 uint32_t x264_cpu_detect( void );
 int x264_cpu_num_processors( void );
-void x264_emms( void );
+void x264_cpu_emms( void );
+void x264_cpu_sfence( void );
+#ifdef HAVE_MMX
+#define x264_emms() x264_cpu_emms()
+#else
+#define x264_emms()
+#endif
+#define x264_sfence x264_cpu_sfence
 void x264_cpu_mask_misalign_sse( void );

 /* kluge:
Changed: x264-snapshot-20100517-2245.tar.bz2/common/frame.c
@@ -105,6 +105,7 @@
     CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
     CHECKED_MALLOC( frame->mb_partition, i_mb_count * sizeof(uint8_t));
     CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
+    CHECKED_MALLOC( frame->mv16x16, 2*i_mb_count * sizeof(int16_t) );
     CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
     if( h->param.i_bframe )
     {
@@ -117,7 +118,7 @@
         frame->ref[1] = NULL;
     }
     CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
-    CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
+    CHECKED_MALLOC( frame->f_row_qp, i_lines/16 * sizeof(float) );
     if( h->param.analyse.i_me_method >= X264_ME_ESA )
     {
         CHECKED_MALLOC( frame->buffer[3],
@@ -148,10 +149,7 @@
         CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
         for( int j = 0; j <= h->param.i_bframe+1; j++ )
             for( int i = 0; i <= h->param.i_bframe+1; i++ )
-            {
                 CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
-                CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
-            }
         frame->i_intra_cost = frame->lowres_costs[0][0];
         memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
     }
@@ -199,19 +197,17 @@
         x264_free( frame->i_propagate_cost );
         for( int j = 0; j <= X264_BFRAME_MAX+1; j++ )
             for( int i = 0; i <= X264_BFRAME_MAX+1; i++ )
-            {
                 x264_free( frame->lowres_costs[j][i] );
-                x264_free( frame->lowres_inter_types[j][i] );
-            }
         x264_free( frame->f_qp_offset );
         x264_free( frame->f_qp_offset_aq );
         x264_free( frame->i_inv_qscale_factor );
         x264_free( frame->i_row_bits );
-        x264_free( frame->i_row_qp );
+        x264_free( frame->f_row_qp );
         x264_free( frame->mb_type );
         x264_free( frame->mb_partition );
         x264_free( frame->mv[0] );
         x264_free( frame->mv[1] );
+        x264_free( frame->mv16x16 );
         x264_free( frame->ref[0] );
         x264_free( frame->ref[1] );
         x264_pthread_mutex_destroy( &frame->mutex );
@@ -225,7 +221,7 @@
     int i_csp = src->img.i_csp & X264_CSP_MASK;
     if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 )
     {
-        x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
+        x264_log( h, X264_LOG_ERROR, "Invalid input colorspace\n" );
         return -1;
     }

@@ -247,6 +243,11 @@
             plane += (height-1)*stride;
             stride = -stride;
         }
+        if( width > abs(stride) )
+        {
+            x264_log( h, X264_LOG_ERROR, "Input picture width is greater than stride\n" );
+            return -1;
+        }
         h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height );
     }
     return 0;
Changed: x264-snapshot-20100517-2245.tar.bz2/common/frame.h
@@ -83,15 +83,21 @@
     int8_t *mb_type;
     uint8_t *mb_partition;
     int16_t (*mv[2])[2];
+    int16_t (*mv16x16)[2];
     int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2];
+
+    /* Stored as (lists_used << LOWRES_COST_SHIFT) + (cost).
+     * Doesn't need special addressing for intra cost because
+     * lists_used is guaranteed to be zero in that case. */
     uint16_t (*lowres_costs[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
-    /* Actually a width-2 bitfield with 4 values per uint8_t. */
-    uint8_t (*lowres_inter_types[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
+    #define LOWRES_COST_MASK ((1<<14)-1)
+    #define LOWRES_COST_SHIFT 14
+
     int *lowres_mv_costs[2][X264_BFRAME_MAX+1];
     int8_t *ref[2];
     int i_ref[2];
     int ref_poc[2][16];
-    int16_t inv_ref_poc[2][32]; // inverse values (list0 only) to avoid divisions in MB encoding
+    int16_t inv_ref_poc[2]; // inverse values of ref0 poc to avoid divisions in temporal MV prediction

     /* for adaptive B-frame decision.
      * contains the SATD cost of the lowres frame encoded in various modes
@@ -103,7 +109,7 @@
     int *i_row_satds[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
     int *i_row_satd;
     int *i_row_bits;
-    int *i_row_qp;
+    float *f_row_qp;
     float *f_qp_offset;
     float *f_qp_offset_aq;
     int b_intra_calculated;
@@ -136,6 +142,7 @@
     float f_pir_position;
     int i_pir_start_col;
     int i_pir_end_col;
+    int i_frames_since_pir;
 } x264_frame_t;

 /* synchronized frame list */
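frame.h replaces the separate lowres_inter_types bitfield with a packed representation: per the new comment, each lookahead cost carries its lists_used flags in the top two bits of the same uint16_t. A sketch of the packing under those definitions (helper names are illustrative, not from the patch):

#include <stdint.h>

#define LOWRES_COST_MASK  ((1<<14)-1)
#define LOWRES_COST_SHIFT 14

static uint16_t pack_lowres_cost( int cost, int lists_used )
{
    /* cost must fit in 14 bits; lists_used is a 2-bit mask
     * (bit 0 = list 0, bit 1 = list 1; 0 means intra). */
    return (uint16_t)( (lists_used << LOWRES_COST_SHIFT) + cost );
}

static int cost_of( uint16_t packed )       { return packed & LOWRES_COST_MASK; }
static int lists_used_of( uint16_t packed ) { return packed >> LOWRES_COST_SHIFT; }

This is why mbtree_propagate_cost in mc.c (further down in this revision) masks inter_costs[i] with LOWRES_COST_MASK before comparing against the intra cost, and why intra entries need no special handling: their lists_used is zero, so the packed value equals the raw cost.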
Changed: x264-snapshot-20100517-2245.tar.bz2/common/macroblock.c
@@ -3,9 +3,9 @@
 *****************************************************************************
 * Copyright (C) 2003-2008 x264 project
 *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -25,469 +25,6 @@
 #include "common.h"
 #include "encoder/me.h"

-void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] )
-{
-    const int i8 = x264_scan8[idx];
-    const int i_ref= h->mb.cache.ref[i_list][i8];
-    int i_refa = h->mb.cache.ref[i_list][i8 - 1];
-    int16_t *mv_a = h->mb.cache.mv[i_list][i8 - 1];
-    int i_refb = h->mb.cache.ref[i_list][i8 - 8];
-    int16_t *mv_b = h->mb.cache.mv[i_list][i8 - 8];
-    int i_refc = h->mb.cache.ref[i_list][i8 - 8 + i_width];
-    int16_t *mv_c = h->mb.cache.mv[i_list][i8 - 8 + i_width];
-
-    if( (idx&3) >= 2 + (i_width&1) || i_refc == -2 )
-    {
-        i_refc = h->mb.cache.ref[i_list][i8 - 8 - 1];
-        mv_c = h->mb.cache.mv[i_list][i8 - 8 - 1];
-    }
-
-    if( h->mb.i_partition == D_16x8 )
-    {
-        if( idx == 0 )
-        {
-            if( i_refb == i_ref )
-            {
-                CP32( mvp, mv_b );
-                return;
-            }
-        }
-        else
-        {
-            if( i_refa == i_ref )
-            {
-                CP32( mvp, mv_a );
-                return;
-            }
-        }
-    }
-    else if( h->mb.i_partition == D_8x16 )
-    {
-        if( idx == 0 )
-        {
-            if( i_refa == i_ref )
-            {
-                CP32( mvp, mv_a );
-                return;
-            }
-        }
-        else
-        {
-            if( i_refc == i_ref )
-            {
-                CP32( mvp, mv_c );
-                return;
-            }
-        }
-    }
-
-    int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref);
-
-    if( i_count > 1 )
-    {
-median:
-        x264_median_mv( mvp, mv_a, mv_b, mv_c );
-    }
-    else if( i_count == 1 )
-    {
-        if( i_refa == i_ref )
-            CP32( mvp, mv_a );
-        else if( i_refb == i_ref )
-            CP32( mvp, mv_b );
-        else
-            CP32( mvp, mv_c );
-    }
-    else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
-        CP32( mvp, mv_a );
-    else
-        goto median;
-}
-
-void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int16_t mvp[2] )
-{
-    int i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1];
-    int16_t *mv_a = h->mb.cache.mv[i_list][X264_SCAN8_0 - 1];
-    int i_refb = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8];
-    int16_t *mv_b = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8];
-    int i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 + 4];
-    int16_t *mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 + 4];
-    if( i_refc == -2 )
-    {
-        i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 - 1];
-        mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 - 1];
-    }
-
-    int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref);
-
-    if( i_count > 1 )
-    {
-median:
-        x264_median_mv( mvp, mv_a, mv_b, mv_c );
-    }
-    else if( i_count == 1 )
-    {
-        if( i_refa == i_ref )
-            CP32( mvp, mv_a );
-        else if( i_refb == i_ref )
-            CP32( mvp, mv_b );
-        else
-            CP32( mvp, mv_c );
-    }
-    else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
-        CP32( mvp, mv_a );
-    else
-        goto median;
-}
-
-
-void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] )
-{
-    int i_refa = h->mb.cache.ref[0][X264_SCAN8_0 - 1];
-    int i_refb = h->mb.cache.ref[0][X264_SCAN8_0 - 8];
-    int16_t *mv_a = h->mb.cache.mv[0][X264_SCAN8_0 - 1];
-    int16_t *mv_b = h->mb.cache.mv[0][X264_SCAN8_0 - 8];
-
-    if( i_refa == -2 || i_refb == -2 ||
-        !( i_refa | M32( mv_a ) ) ||
-        !( i_refb | M32( mv_b ) ) )
-    {
-        M32( mv ) = 0;
-    }
-    else
-        x264_mb_predict_mv_16x16( h, 0, 0, mv );
-}
-
-static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
-{
-    int i_mb_4x4 = 16 * h->mb.i_mb_stride * h->mb.i_mb_y + 4 * h->mb.i_mb_x;
-    int i_mb_8x8 = 4 * h->mb.i_mb_stride * h->mb.i_mb_y + 2 * h->mb.i_mb_x;
-    const int type_col = h->fref1[0]->mb_type[h->mb.i_mb_xy];
-    const int partition_col = h->fref1[0]->mb_partition[h->mb.i_mb_xy];
-
-    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 );
-
-    h->mb.i_partition = partition_col;
-
-    if( IS_INTRA( type_col ) )
-    {
-        x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
-        x264_macroblock_cache_mv( h, 0, 0, 4, 4, 0, 0 );
-        x264_macroblock_cache_mv( h, 0, 0, 4, 4, 1, 0 );
-        return 1;
-    }
-
-    /* Don't do any checks other than the ones we have to, based
-     * on the size of the colocated partitions.
-     * Depends on the enum order: D_8x8, D_16x8, D_8x16, D_16x16 */
-    int max_i8 = (D_16x16 - partition_col) + 1;
-    int step = (partition_col == D_16x8) + 1;
-    int width = 4 >> ((D_16x16 - partition_col)&1);
-    int height = 4 >> ((D_16x16 - partition_col)>>1);
-
-    for( int i8 = 0; i8 < max_i8; i8 += step )
-    {
-        int x8 = i8&1;
-        int y8 = i8>>1;
-        int i_part_8x8 = i_mb_8x8 + x8 + y8 * h->mb.i_b8_stride;
-        int i_ref1_ref = h->fref1[0]->ref[0][i_part_8x8];
-        int i_ref = (map_col_to_list0(i_ref1_ref>>h->sh.b_mbaff) << h->sh.b_mbaff) + (i_ref1_ref&h->sh.b_mbaff);
-
-        if( i_ref >= 0 )
-        {
-            int dist_scale_factor = h->mb.dist_scale_factor[i_ref][0];
-            int16_t *mv_col = h->fref1[0]->mv[0][i_mb_4x4 + 3*x8 + 3*y8 * h->mb.i_b4_stride];
-            int l0x = ( dist_scale_factor * mv_col[0] + 128 ) >> 8;
-            int l0y = ( dist_scale_factor * mv_col[1] + 128 ) >> 8;
-            if( h->param.i_threads > 1 && (l0y > h->mb.mv_max_spel[1] || l0y-mv_col[1] > h->mb.mv_max_spel[1]) )
-                return 0;
-            x264_macroblock_cache_ref( h, 2*x8, 2*y8, width, height, 0, i_ref );
-            x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, pack16to32_mask(l0x, l0y) );
-            x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, pack16to32_mask(l0x-mv_col[0], l0y-mv_col[1]) );
-        }
-        else
-        {
-            /* the collocated ref isn't in the current list0 */
-            /* FIXME: we might still be able to use direct_8x8 on some partitions */
-            /* FIXME: with B-pyramid + extensive ref list reordering
-             * (not currently used), we would also have to check
-             * l1mv1 like in spatial mode */
-            return 0;
-        }
-    }
-
-    return 1;
-}
-
-static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
-{
-    int8_t ref[2];
-    ALIGNED_ARRAY_8( int16_t, mv,[2],[2] );
-    const int8_t *l1ref0 = &h->fref1[0]->ref[0][h->mb.i_b8_xy];
-    const int8_t *l1ref1 = &h->fref1[0]->ref[1][h->mb.i_b8_xy];
-    const int16_t (*l1mv[2])[2] = { (const int16_t (*)[2]) &h->fref1[0]->mv[0][h->mb.i_b4_xy],
-                                    (const int16_t (*)[2]) &h->fref1[0]->mv[1][h->mb.i_b4_xy] };
-    const int type_col = h->fref1[0]->mb_type[h->mb.i_mb_xy];
-    const int partition_col = h->fref1[0]->mb_partition[h->mb.i_mb_xy];
-
-    h->mb.i_partition = partition_col;
-
-    for( int i_list = 0; i_list < 2; i_list++ )
-    {
-        int i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1];
-        int16_t *mv_a = h->mb.cache.mv[i_list][X264_SCAN8_0 - 1];
-        int i_refb = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8];
-        int16_t *mv_b = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8];
-        int i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 + 4];
-        int16_t *mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 + 4];
-        if( i_refc == -2 )
-        {
-            i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 - 1];
-            mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 - 1];
-        }
-
-        int i_ref = X264_MIN3( (unsigned)i_refa, (unsigned)i_refb, (unsigned)i_refc );
-        if( i_ref < 0 )
-        {
-            i_ref = -1;
-            M32( mv[i_list] ) = 0;
-        }
-        else
-        {
-            /* Same as x264_mb_predict_mv_16x16, but simplified to eliminate cases
-             * not relevant to spatial direct. */
-            int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref);
-
-            if( i_count > 1 )
-                x264_median_mv( mv[i_list], mv_a, mv_b, mv_c );
-            else
-            {
-                if( i_refa == i_ref )
-                    CP32( mv[i_list], mv_a );
-                else if( i_refb == i_ref )
-                    CP32( mv[i_list], mv_b );
-                else
-                    CP32( mv[i_list], mv_c );
-            }
-        }
-
-        x264_macroblock_cache_ref( h, 0, 0, 4, 4, i_list, i_ref );
-        x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, i_list, mv[i_list] );
-        ref[i_list] = i_ref;
-    }
-
-    if( (M16( ref ) & 0x8080) == 0x8080 ) /* if( ref[0] < 0 && ref[1] < 0 ) */
-    {
-        x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
-        x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 );
-        return 1;
-    }
-
-    if( h->param.i_threads > 1
-        && ( mv[0][1] > h->mb.mv_max_spel[1]
-          || mv[1][1] > h->mb.mv_max_spel[1] ) )
-    {
-#if 0
-        fprintf(stderr, "direct_spatial: (%d,%d) (%d,%d) > %d \n",
-                mv[0][0], mv[0][1], mv[1][0], mv[1][1],
-                h->mb.mv_max_spel[1]);
-#endif
-        return 0;
-    }
-
-    if( !M64( mv ) || IS_INTRA( type_col ) || (ref[0]&&ref[1]) )
-        return 1;
-
-    /* Don't do any checks other than the ones we have to, based
-     * on the size of the colocated partitions.
-     * Depends on the enum order: D_8x8, D_16x8, D_8x16, D_16x16 */
-    int max_i8 = (D_16x16 - partition_col) + 1;
-    int step = (partition_col == D_16x8) + 1;
-    int width = 4 >> ((D_16x16 - partition_col)&1);
-    int height = 4 >> ((D_16x16 - partition_col)>>1);
-
-    /* col_zero_flag */
-    for( int i8 = 0; i8 < max_i8; i8 += step )
-    {
-        const int x8 = i8&1;
-        const int y8 = i8>>1;
-        const int o8 = x8 + y8 * h->mb.i_b8_stride;
-        const int o4 = 3*(x8 + y8 * h->mb.i_b4_stride);
-        int idx;
-        if( l1ref0[o8] == 0 )
-            idx = 0;
-        else if( l1ref0[o8] < 0 && l1ref1[o8] == 0 )
-            idx = 1;
-        else
-            continue;
-
-        if( abs( l1mv[idx][o4][0] ) <= 1 && abs( l1mv[idx][o4][1] ) <= 1 )
-        {
-            if( ref[0] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, 0 );
-            if( ref[1] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, 0 );
-        }
-    }
-
-    return 1;
-}
-
-int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed )
-{
-    int b_available;
-    if( h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_NONE )
-        return 0;
-    else if( h->sh.b_direct_spatial_mv_pred )
-        b_available = x264_mb_predict_mv_direct16x16_spatial( h );
-    else
-        b_available = x264_mb_predict_mv_direct16x16_temporal( h );
-
-    if( b_changed != NULL && b_available )
-    {
-        int changed;
-
-        changed = M32( h->mb.cache.direct_mv[0][0] ) ^ M32( h->mb.cache.mv[0][x264_scan8[0]] );
-        changed |= M32( h->mb.cache.direct_mv[1][0] ) ^ M32( h->mb.cache.mv[1][x264_scan8[0]] );
-        changed |= h->mb.cache.direct_ref[0][0] ^ h->mb.cache.ref[0][x264_scan8[0]];
-        changed |= h->mb.cache.direct_ref[1][0] ^ h->mb.cache.ref[1][x264_scan8[0]];
-        if( !changed && h->mb.i_partition != D_16x16 )
-        {
-            changed |= M32( h->mb.cache.direct_mv[0][3] ) ^ M32( h->mb.cache.mv[0][x264_scan8[12]] );
-            changed |= M32( h->mb.cache.direct_mv[1][3] ) ^ M32( h->mb.cache.mv[1][x264_scan8[12]] );
-            changed |= h->mb.cache.direct_ref[0][3] ^ h->mb.cache.ref[0][x264_scan8[12]];
-            changed |= h->mb.cache.direct_ref[1][3] ^ h->mb.cache.ref[1][x264_scan8[12]];
-        }
-        if( !changed && h->mb.i_partition == D_8x8 )
-        {
-            changed |= M32( h->mb.cache.direct_mv[0][1] ) ^ M32( h->mb.cache.mv[0][x264_scan8[4]] );
-            changed |= M32( h->mb.cache.direct_mv[1][1] ) ^ M32( h->mb.cache.mv[1][x264_scan8[4]] );
-            changed |= M32( h->mb.cache.direct_mv[0][2] ) ^ M32( h->mb.cache.mv[0][x264_scan8[8]] );
-            changed |= M32( h->mb.cache.direct_mv[1][2] ) ^ M32( h->mb.cache.mv[1][x264_scan8[8]] );
-            changed |= h->mb.cache.direct_ref[0][1] ^ h->mb.cache.ref[0][x264_scan8[4]];
-            changed |= h->mb.cache.direct_ref[1][1] ^ h->mb.cache.ref[1][x264_scan8[4]];
-            changed |= h->mb.cache.direct_ref[0][2] ^ h->mb.cache.ref[0][x264_scan8[8]];
-            changed |= h->mb.cache.direct_ref[1][2] ^ h->mb.cache.ref[1][x264_scan8[8]];
-        }
-        *b_changed = changed;
-        if( !changed )
-            return b_available;
-    }
-
-    /* cache ref & mv */
-    if( b_available )
-        for( int l = 0; l < 2; l++ )
-        {
-            CP32( h->mb.cache.direct_mv[l][0], h->mb.cache.mv[l][x264_scan8[ 0]] );
-            CP32( h->mb.cache.direct_mv[l][1], h->mb.cache.mv[l][x264_scan8[ 4]] );
-            CP32( h->mb.cache.direct_mv[l][2], h->mb.cache.mv[l][x264_scan8[ 8]] );
-            CP32( h->mb.cache.direct_mv[l][3], h->mb.cache.mv[l][x264_scan8[12]] );
-            h->mb.cache.direct_ref[l][0] = h->mb.cache.ref[l][x264_scan8[ 0]];
-            h->mb.cache.direct_ref[l][1] = h->mb.cache.ref[l][x264_scan8[ 4]];
-            h->mb.cache.direct_ref[l][2] = h->mb.cache.ref[l][x264_scan8[ 8]];
-            h->mb.cache.direct_ref[l][3] = h->mb.cache.ref[l][x264_scan8[12]];
-            h->mb.cache.direct_partition = h->mb.i_partition;
-        }
-
-    return b_available;
-}
-
-/* This just improves encoder performance, it's not part of the spec */
-void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[9][2], int *i_mvc )
-{
-    int16_t (*mvr)[2] = h->mb.mvr[i_list][i_ref];
-    int i = 0;
-
-#define SET_MVP(mvp)\
-    { \
-        CP32( mvc[i], mvp ); \
-        i++; \
-    }
-
-    /* b_direct */
-    if( h->sh.i_type == SLICE_TYPE_B
-        && h->mb.cache.ref[i_list][x264_scan8[12]] == i_ref )
-    {
-        SET_MVP( h->mb.cache.mv[i_list][x264_scan8[12]] );
-    }
-
-    if( i_ref == 0 && h->frames.b_have_lowres )
-    {
-        int16_t (*lowres_mv)[2] = i_list ? h->fenc->lowres_mvs[1][h->fref1[0]->i_frame-h->fenc->i_frame-1]
-                                         : h->fenc->lowres_mvs[0][h->fenc->i_frame-h->fref0[0]->i_frame-1];
-        if( lowres_mv[0][0] != 0x7fff )
-        {
-            M32( mvc[i] ) = (M32( lowres_mv[h->mb.i_mb_xy] )*2)&0xfffeffff;
-            i++;
-        }
-    }
-
-    /* spatial predictors */
-    if( h->mb.i_neighbour_frame & MB_LEFT )
-    {
-        SET_MVP( mvr[h->mb.i_mb_left_xy] );
-    }
-    if( h->mb.i_neighbour_frame & MB_TOP )
-    {
-        SET_MVP( mvr[h->mb.i_mb_top_xy] );
-
-        if( h->mb.i_neighbour_frame & MB_TOPLEFT )
-            SET_MVP( mvr[h->mb.i_mb_topleft_xy] );
-        if( h->mb.i_neighbour_frame & MB_TOPRIGHT )
-            SET_MVP( mvr[h->mb.i_mb_topright_xy] );
-    }
-#undef SET_MVP
-
-    /* temporal predictors */
-    if( h->fref0[0]->i_ref[0] > 0 )
-    {
-        x264_frame_t *l0 = h->fref0[0];
-        x264_frame_t **fref = i_list ? h->fref1 : h->fref0;
-        int field = h->mb.i_mb_y&1;
-        int curpoc = h->fdec->i_poc + field*h->sh.i_delta_poc_bottom;
-        int refpoc = fref[i_ref>>h->sh.b_mbaff]->i_poc;
-        if( h->sh.b_mbaff && field^(i_ref&1) )
-            refpoc += h->sh.i_delta_poc_bottom;
-
-#define SET_TMVP(dx, dy) { \
-            int i_b4 = h->mb.i_b4_xy + dx*4 + dy*4*h->mb.i_b4_stride; \
-            int i_b8 = h->mb.i_b8_xy + dx*2 + dy*2*h->mb.i_b8_stride; \
-            int ref_col = l0->ref[0][i_b8]; \
-            if( ref_col >= 0 ) \
-            { \
-                int scale = (curpoc - refpoc) * l0->inv_ref_poc[h->mb.b_interlaced&field][ref_col];\
-                mvc[i][0] = (l0->mv[0][i_b4][0]*scale + 128) >> 8;\
-                mvc[i][1] = (l0->mv[0][i_b4][1]*scale + 128) >> 8;\
-                i++; \
-            } \
-        }
-
-        SET_TMVP(0,0);
-        if( h->mb.i_mb_x < h->sps->i_mb_width-1 )
-            SET_TMVP(1,0);
-        if( h->mb.i_mb_y < h->sps->i_mb_height-1 )
-            SET_TMVP(0,1);
-#undef SET_TMVP
-    }
-
-    *i_mvc = i;
-}
-
-/* Set up a lookup table for delta pocs to reduce an IDIV to an IMUL */
-static void setup_inverse_delta_pocs( x264_t *h )
-{
-    for( int field = 0; field <= h->sh.b_mbaff; field++ )
-    {
-        int curpoc = h->fdec->i_poc + field*h->sh.i_delta_poc_bottom;
-        for( int i = 0; i < (h->i_ref0<<h->sh.b_mbaff); i++ )
-        {
-            int refpoc = h->fref0[i>>h->sh.b_mbaff]->i_poc;
-            if( h->sh.b_mbaff && field^(i&1) )
-                refpoc += h->sh.i_delta_poc_bottom;
-            int delta = curpoc - refpoc;
-
-            h->fdec->inv_ref_poc[field][i] = (256 + delta/2) / delta;
-        }
-    }
-}
-
 static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int height )
 {
     int i8 = x264_scan8[0]+x+8*y;
@@ -713,7 +250,7 @@
     else if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_BLIND )
         i_refs = X264_MIN(16, i_refs + 1); //blind weights add one duplicate frame

-    for( int j = 0; j < i_refs; j++ )
+    for( int j = !i; j < i_refs; j++ )
        CHECKED_MALLOC( h->mb.mvr[i][j], 2 * i_mb_count * sizeof(int16_t) );
 }
@@ -758,12 +295,13 @@
     }
     return 0;
-fail: return -1;
+fail:
+    return -1;
 }

 void x264_macroblock_cache_free( x264_t *h )
 {
     for( int i = 0; i < 2; i++ )
-        for( int j = 0; j < 32; j++ )
+        for( int j = !i; j < 32; j++ )
            x264_free( h->mb.mvr[i][j] );
     for( int i = 0; i < 16; i++ )
         x264_free( h->mb.p_weight_buf[i] );
@@ -811,7 +349,8 @@
     CHECKED_MALLOC( h->scratch_buffer, scratch_size );

     return 0;
-fail: return -1;
+fail:
+    return -1;
 }

 void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
@@ -827,6 +366,7 @@
 {
     h->mb.mv[0] = h->fdec->mv[0];
     h->mb.mv[1] = h->fdec->mv[1];
+    h->mb.mvr[0][0] = h->fdec->mv16x16;
     h->mb.ref[0] = h->fdec->ref[0];
     h->mb.ref[1] = h->fdec->ref[1];
     h->mb.type = h->fdec->mb_type;
@@ -861,7 +401,17 @@
     /* init with not available (for top right idx=7,15) */
     memset( h->mb.cache.ref, -2, sizeof( h->mb.cache.ref ) );

-    setup_inverse_delta_pocs( h );
+    if( h->i_ref0 > 0 )
+        for( int field = 0; field <= h->sh.b_mbaff; field++ )
+        {
+            int curpoc = h->fdec->i_poc + field*h->sh.i_delta_poc_bottom;
+            int refpoc = h->fref0[0]->i_poc;
+            if( h->sh.b_mbaff && field )
+                refpoc += h->sh.i_delta_poc_bottom;
+            int delta = curpoc - refpoc;
+
+            h->fdec->inv_ref_poc[field] = (256 + delta/2) / delta;
+        }

     h->mb.i_neighbour4[6] =
     h->mb.i_neighbour4[9] =
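With setup_inverse_delta_pocs gone, inv_ref_poc holds a single Q8 reciprocal per field (for ref0 only), computed once per slice in the last hunk above, and the temporal predictors in the new mvpred.c (added below) scale the per-MB 16x16 MVs with a multiply instead of a division. A worked example of the arithmetic, with made-up POC distances:

#include <stdio.h>

int main( void )
{
    int delta = 4;                        /* POC distance of fref0[0] */
    int inv   = (256 + delta/2) / delta;  /* Q8 reciprocal: 64 */
    int dist  = 2;                        /* POC distance we scale the MV to */
    int mv    = -21;                      /* quarter-pel MV component */
    int scale = dist * inv;               /* 128, i.e. 0.5 in Q8 */
    printf( "%d\n", (mv*scale + 128) >> 8 ); /* -10; exact value is -10.5 */
    return 0;
}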
Changed: x264-snapshot-20100517-2245.tar.bz2/common/mc.c
@@ -97,9 +97,9 @@
                          uint8_t *pix2, int i_stride_pix2, \
                          uint8_t *pix3, int i_stride_pix3, int weight ) \
 { \
-    if( weight == 32 )\
+    if( weight == 32 ) \
         pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
-    else\
+    else \
         pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height, weight ); \
 }
 PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
@@ -203,8 +203,8 @@
     }
 }

-static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
-static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
+static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};

 static void mc_luma( uint8_t *dst, int i_dst_stride,
                      uint8_t *src[4], int i_src_stride,
@@ -427,7 +427,7 @@
     for( int i = 0; i < len; i++ )
     {
         int propagate_amount = propagate_in[i] + ((intra_costs[i] * inv_qscales[i] + 128)>>8);
-        dst[i] = div_64_32((int64_t)propagate_amount * (intra_costs[i] - inter_costs[i]), intra_costs[i]);
+        dst[i] = div_64_32((int64_t)propagate_amount * (intra_costs[i] - (inter_costs[i] & LOWRES_COST_MASK)), intra_costs[i]);
     }
 }
Added: x264-snapshot-20100517-2245.tar.bz2/common/mvpred.c
@@ -0,0 +1,466 @@ +/***************************************************************************** + * mvpred.c: h264 encoder library + ***************************************************************************** + * Copyright (C) 2003-2008 x264 project + * + * Authors: Loren Merritt <lorenm@u.washington.edu> + * Jason Garrett-Glaser <darkshikari@gmail.com> + * Laurent Aimar <fenrir@via.ecp.fr> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + *****************************************************************************/ + +#include "common.h" + +void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] ) +{ + const int i8 = x264_scan8[idx]; + const int i_ref= h->mb.cache.ref[i_list][i8]; + int i_refa = h->mb.cache.ref[i_list][i8 - 1]; + int16_t *mv_a = h->mb.cache.mv[i_list][i8 - 1]; + int i_refb = h->mb.cache.ref[i_list][i8 - 8]; + int16_t *mv_b = h->mb.cache.mv[i_list][i8 - 8]; + int i_refc = h->mb.cache.ref[i_list][i8 - 8 + i_width]; + int16_t *mv_c = h->mb.cache.mv[i_list][i8 - 8 + i_width]; + + if( (idx&3) >= 2 + (i_width&1) || i_refc == -2 ) + { + i_refc = h->mb.cache.ref[i_list][i8 - 8 - 1]; + mv_c = h->mb.cache.mv[i_list][i8 - 8 - 1]; + } + + if( h->mb.i_partition == D_16x8 ) + { + if( idx == 0 ) + { + if( i_refb == i_ref ) + { + CP32( mvp, mv_b ); + return; + } + } + else + { + if( i_refa == i_ref ) + { + CP32( mvp, mv_a ); + return; + } + } + } + else if( h->mb.i_partition == D_8x16 ) + { + if( idx == 0 ) + { + if( i_refa == i_ref ) + { + CP32( mvp, mv_a ); + return; + } + } + else + { + if( i_refc == i_ref ) + { + CP32( mvp, mv_c ); + return; + } + } + } + + int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref); + + if( i_count > 1 ) + { +median: + x264_median_mv( mvp, mv_a, mv_b, mv_c ); + } + else if( i_count == 1 ) + { + if( i_refa == i_ref ) + CP32( mvp, mv_a ); + else if( i_refb == i_ref ) + CP32( mvp, mv_b ); + else + CP32( mvp, mv_c ); + } + else if( i_refb == -2 && i_refc == -2 && i_refa != -2 ) + CP32( mvp, mv_a ); + else + goto median; +} + +void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int16_t mvp[2] ) +{ + int i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1]; + int16_t *mv_a = h->mb.cache.mv[i_list][X264_SCAN8_0 - 1]; + int i_refb = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8]; + int16_t *mv_b = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8]; + int i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 + 4]; + int16_t *mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 + 4]; + if( i_refc == -2 ) + { + i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 - 1]; + mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 - 1]; + } + + int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref); + + if( i_count > 1 ) + { +median: + x264_median_mv( mvp, mv_a, mv_b, mv_c ); + } + else if( i_count == 1 ) + { + if( i_refa == i_ref ) + CP32( mvp, mv_a ); + else 
if( i_refb == i_ref ) + CP32( mvp, mv_b ); + else + CP32( mvp, mv_c ); + } + else if( i_refb == -2 && i_refc == -2 && i_refa != -2 ) + CP32( mvp, mv_a ); + else + goto median; +} + + +void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] ) +{ + int i_refa = h->mb.cache.ref[0][X264_SCAN8_0 - 1]; + int i_refb = h->mb.cache.ref[0][X264_SCAN8_0 - 8]; + int16_t *mv_a = h->mb.cache.mv[0][X264_SCAN8_0 - 1]; + int16_t *mv_b = h->mb.cache.mv[0][X264_SCAN8_0 - 8]; + + if( i_refa == -2 || i_refb == -2 || + !( i_refa | M32( mv_a ) ) || + !( i_refb | M32( mv_b ) ) ) + { + M32( mv ) = 0; + } + else + x264_mb_predict_mv_16x16( h, 0, 0, mv ); +} + +static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h ) +{ + int i_mb_4x4 = 16 * h->mb.i_mb_stride * h->mb.i_mb_y + 4 * h->mb.i_mb_x; + int i_mb_8x8 = 4 * h->mb.i_mb_stride * h->mb.i_mb_y + 2 * h->mb.i_mb_x; + const int type_col = h->fref1[0]->mb_type[h->mb.i_mb_xy]; + const int partition_col = h->fref1[0]->mb_partition[h->mb.i_mb_xy]; + + x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 ); + + h->mb.i_partition = partition_col; + + if( IS_INTRA( type_col ) ) + { + x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 ); + x264_macroblock_cache_mv( h, 0, 0, 4, 4, 0, 0 ); + x264_macroblock_cache_mv( h, 0, 0, 4, 4, 1, 0 ); + return 1; + } + + /* Don't do any checks other than the ones we have to, based + * on the size of the colocated partitions. + * Depends on the enum order: D_8x8, D_16x8, D_8x16, D_16x16 */ + int max_i8 = (D_16x16 - partition_col) + 1; + int step = (partition_col == D_16x8) + 1; + int width = 4 >> ((D_16x16 - partition_col)&1); + int height = 4 >> ((D_16x16 - partition_col)>>1); + + for( int i8 = 0; i8 < max_i8; i8 += step ) + { + int x8 = i8&1; + int y8 = i8>>1; + int i_part_8x8 = i_mb_8x8 + x8 + y8 * h->mb.i_b8_stride; + int i_ref1_ref = h->fref1[0]->ref[0][i_part_8x8]; + int i_ref = (map_col_to_list0(i_ref1_ref>>h->sh.b_mbaff) << h->sh.b_mbaff) + (i_ref1_ref&h->sh.b_mbaff); + + if( i_ref >= 0 ) + { + int dist_scale_factor = h->mb.dist_scale_factor[i_ref][0]; + int16_t *mv_col = h->fref1[0]->mv[0][i_mb_4x4 + 3*x8 + 3*y8 * h->mb.i_b4_stride]; + int l0x = ( dist_scale_factor * mv_col[0] + 128 ) >> 8; + int l0y = ( dist_scale_factor * mv_col[1] + 128 ) >> 8; + if( h->param.i_threads > 1 && (l0y > h->mb.mv_max_spel[1] || l0y-mv_col[1] > h->mb.mv_max_spel[1]) ) + return 0; + x264_macroblock_cache_ref( h, 2*x8, 2*y8, width, height, 0, i_ref ); + x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, pack16to32_mask(l0x, l0y) ); + x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, pack16to32_mask(l0x-mv_col[0], l0y-mv_col[1]) ); + } + else + { + /* the collocated ref isn't in the current list0 */ + /* FIXME: we might still be able to use direct_8x8 on some partitions */ + /* FIXME: with B-pyramid + extensive ref list reordering + * (not currently used), we would also have to check + * l1mv1 like in spatial mode */ + return 0; + } + } + + return 1; +} + +static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h ) +{ + int8_t ref[2]; + ALIGNED_ARRAY_8( int16_t, mv,[2],[2] ); + const int8_t *l1ref0 = &h->fref1[0]->ref[0][h->mb.i_b8_xy]; + const int8_t *l1ref1 = &h->fref1[0]->ref[1][h->mb.i_b8_xy]; + const int16_t (*l1mv[2])[2] = { (const int16_t (*)[2]) &h->fref1[0]->mv[0][h->mb.i_b4_xy], + (const int16_t (*)[2]) &h->fref1[0]->mv[1][h->mb.i_b4_xy] }; + const int type_col = h->fref1[0]->mb_type[h->mb.i_mb_xy]; + const int partition_col = h->fref1[0]->mb_partition[h->mb.i_mb_xy]; + + h->mb.i_partition = partition_col; + + for( int i_list 
= 0; i_list < 2; i_list++ ) + { + int i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1]; + int16_t *mv_a = h->mb.cache.mv[i_list][X264_SCAN8_0 - 1]; + int i_refb = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8]; + int16_t *mv_b = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8]; + int i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 + 4]; + int16_t *mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 + 4]; + if( i_refc == -2 ) + { + i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 - 1]; + mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 - 1]; + } + + int i_ref = X264_MIN3( (unsigned)i_refa, (unsigned)i_refb, (unsigned)i_refc ); + if( i_ref < 0 ) + { + i_ref = -1; + M32( mv[i_list] ) = 0; + } + else + { + /* Same as x264_mb_predict_mv_16x16, but simplified to eliminate cases + * not relevant to spatial direct. */ + int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref); + + if( i_count > 1 ) + x264_median_mv( mv[i_list], mv_a, mv_b, mv_c ); + else + { + if( i_refa == i_ref ) + CP32( mv[i_list], mv_a ); + else if( i_refb == i_ref ) + CP32( mv[i_list], mv_b ); + else + CP32( mv[i_list], mv_c ); + } + } + + x264_macroblock_cache_ref( h, 0, 0, 4, 4, i_list, i_ref ); + x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, i_list, mv[i_list] ); + ref[i_list] = i_ref; + } + + if( (M16( ref ) & 0x8080) == 0x8080 ) /* if( ref[0] < 0 && ref[1] < 0 ) */ + { + x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 ); + x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 ); + return 1; + } + + if( h->param.i_threads > 1 + && ( mv[0][1] > h->mb.mv_max_spel[1] + || mv[1][1] > h->mb.mv_max_spel[1] ) ) + { +#if 0 + fprintf(stderr, "direct_spatial: (%d,%d) (%d,%d) > %d \n", + mv[0][0], mv[0][1], mv[1][0], mv[1][1], + h->mb.mv_max_spel[1]); +#endif + return 0; + } + + if( !M64( mv ) || IS_INTRA( type_col ) || (ref[0]&&ref[1]) ) + return 1; + + /* Don't do any checks other than the ones we have to, based + * on the size of the colocated partitions. 
+ * Depends on the enum order: D_8x8, D_16x8, D_8x16, D_16x16 */ + int max_i8 = (D_16x16 - partition_col) + 1; + int step = (partition_col == D_16x8) + 1; + int width = 4 >> ((D_16x16 - partition_col)&1); + int height = 4 >> ((D_16x16 - partition_col)>>1); + + /* col_zero_flag */ + for( int i8 = 0; i8 < max_i8; i8 += step ) + { + const int x8 = i8&1; + const int y8 = i8>>1; + const int o8 = x8 + y8 * h->mb.i_b8_stride; + const int o4 = 3*(x8 + y8 * h->mb.i_b4_stride); + int idx; + if( l1ref0[o8] == 0 ) + idx = 0; + else if( l1ref0[o8] < 0 && l1ref1[o8] == 0 ) + idx = 1; + else + continue; + + if( abs( l1mv[idx][o4][0] ) <= 1 && abs( l1mv[idx][o4][1] ) <= 1 ) + { + if( ref[0] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, 0 ); + if( ref[1] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, 0 ); + } + } + + return 1; +} + +int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed ) +{ + int b_available; + if( h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_NONE ) + return 0; + else if( h->sh.b_direct_spatial_mv_pred ) + b_available = x264_mb_predict_mv_direct16x16_spatial( h ); + else + b_available = x264_mb_predict_mv_direct16x16_temporal( h ); + + if( b_changed != NULL && b_available ) + { + int changed; + + changed = M32( h->mb.cache.direct_mv[0][0] ) ^ M32( h->mb.cache.mv[0][x264_scan8[0]] ); + changed |= M32( h->mb.cache.direct_mv[1][0] ) ^ M32( h->mb.cache.mv[1][x264_scan8[0]] ); + changed |= h->mb.cache.direct_ref[0][0] ^ h->mb.cache.ref[0][x264_scan8[0]]; + changed |= h->mb.cache.direct_ref[1][0] ^ h->mb.cache.ref[1][x264_scan8[0]]; + if( !changed && h->mb.i_partition != D_16x16 ) + { + changed |= M32( h->mb.cache.direct_mv[0][3] ) ^ M32( h->mb.cache.mv[0][x264_scan8[12]] ); + changed |= M32( h->mb.cache.direct_mv[1][3] ) ^ M32( h->mb.cache.mv[1][x264_scan8[12]] ); + changed |= h->mb.cache.direct_ref[0][3] ^ h->mb.cache.ref[0][x264_scan8[12]]; + changed |= h->mb.cache.direct_ref[1][3] ^ h->mb.cache.ref[1][x264_scan8[12]]; + } + if( !changed && h->mb.i_partition == D_8x8 ) + { + changed |= M32( h->mb.cache.direct_mv[0][1] ) ^ M32( h->mb.cache.mv[0][x264_scan8[4]] ); + changed |= M32( h->mb.cache.direct_mv[1][1] ) ^ M32( h->mb.cache.mv[1][x264_scan8[4]] ); + changed |= M32( h->mb.cache.direct_mv[0][2] ) ^ M32( h->mb.cache.mv[0][x264_scan8[8]] ); + changed |= M32( h->mb.cache.direct_mv[1][2] ) ^ M32( h->mb.cache.mv[1][x264_scan8[8]] ); + changed |= h->mb.cache.direct_ref[0][1] ^ h->mb.cache.ref[0][x264_scan8[4]]; + changed |= h->mb.cache.direct_ref[1][1] ^ h->mb.cache.ref[1][x264_scan8[4]]; + changed |= h->mb.cache.direct_ref[0][2] ^ h->mb.cache.ref[0][x264_scan8[8]]; + changed |= h->mb.cache.direct_ref[1][2] ^ h->mb.cache.ref[1][x264_scan8[8]]; + } + *b_changed = changed; + if( !changed ) + return b_available; + } + + /* cache ref & mv */ + if( b_available ) + for( int l = 0; l < 2; l++ ) + { + CP32( h->mb.cache.direct_mv[l][0], h->mb.cache.mv[l][x264_scan8[ 0]] ); + CP32( h->mb.cache.direct_mv[l][1], h->mb.cache.mv[l][x264_scan8[ 4]] ); + CP32( h->mb.cache.direct_mv[l][2], h->mb.cache.mv[l][x264_scan8[ 8]] ); + CP32( h->mb.cache.direct_mv[l][3], h->mb.cache.mv[l][x264_scan8[12]] ); + h->mb.cache.direct_ref[l][0] = h->mb.cache.ref[l][x264_scan8[ 0]]; + h->mb.cache.direct_ref[l][1] = h->mb.cache.ref[l][x264_scan8[ 4]]; + h->mb.cache.direct_ref[l][2] = h->mb.cache.ref[l][x264_scan8[ 8]]; + h->mb.cache.direct_ref[l][3] = h->mb.cache.ref[l][x264_scan8[12]]; + h->mb.cache.direct_partition = h->mb.i_partition; + } + + return b_available; +} + +/* 
This just improves encoder performance, it's not part of the spec */ +void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[9][2], int *i_mvc ) +{ + int16_t (*mvr)[2] = h->mb.mvr[i_list][i_ref]; + int i = 0; + +#define SET_MVP(mvp) \ + { \ + CP32( mvc[i], mvp ); \ + i++; \ + } + + /* b_direct */ + if( h->sh.i_type == SLICE_TYPE_B + && h->mb.cache.ref[i_list][x264_scan8[12]] == i_ref ) + { + SET_MVP( h->mb.cache.mv[i_list][x264_scan8[12]] ); + } + + if( i_ref == 0 && h->frames.b_have_lowres ) + { + int16_t (*lowres_mv)[2] = i_list ? h->fenc->lowres_mvs[1][h->fref1[0]->i_frame-h->fenc->i_frame-1] + : h->fenc->lowres_mvs[0][h->fenc->i_frame-h->fref0[0]->i_frame-1]; + if( lowres_mv[0][0] != 0x7fff ) + { + M32( mvc[i] ) = (M32( lowres_mv[h->mb.i_mb_xy] )*2)&0xfffeffff; + i++; + } + } + + /* spatial predictors */ + if( h->mb.i_neighbour_frame & MB_LEFT ) + { + SET_MVP( mvr[h->mb.i_mb_left_xy] ); + } + if( h->mb.i_neighbour_frame & MB_TOP ) + { + SET_MVP( mvr[h->mb.i_mb_top_xy] ); + + if( h->mb.i_neighbour_frame & MB_TOPLEFT ) + SET_MVP( mvr[h->mb.i_mb_topleft_xy] ); + if( h->mb.i_neighbour_frame & MB_TOPRIGHT ) + SET_MVP( mvr[h->mb.i_mb_topright_xy] ); + } +#undef SET_MVP + + /* temporal predictors */ + if( h->fref0[0]->i_ref[0] > 0 ) + { + x264_frame_t *l0 = h->fref0[0]; + x264_frame_t **fref = i_list ? h->fref1 : h->fref0; + int field = h->mb.i_mb_y&1; + int curpoc = h->fdec->i_poc + field*h->sh.i_delta_poc_bottom; + int refpoc = fref[i_ref>>h->sh.b_mbaff]->i_poc; + if( h->sh.b_mbaff && field^(i_ref&1) ) + refpoc += h->sh.i_delta_poc_bottom; + +#define SET_TMVP( dx, dy ) \ + { \ + int mb_index = h->mb.i_mb_xy + dx + dy*h->mb.i_mb_stride; \ + int scale = (curpoc - refpoc) * l0->inv_ref_poc[h->mb.b_interlaced&field]; \ + mvc[i][0] = (l0->mv16x16[mb_index][0]*scale + 128) >> 8; \ + mvc[i][1] = (l0->mv16x16[mb_index][1]*scale + 128) >> 8; \ + i++; \ + } + + SET_TMVP(0,0); + if( h->mb.i_mb_x < h->sps->i_mb_width-1 ) + SET_TMVP(1,0); + if( h->mb.i_mb_y < h->sps->i_mb_height-1 ) + SET_TMVP(0,1); +#undef SET_TMVP + } + + *i_mvc = i; +} | ||
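Note on the new common/mvpred.c above: when two or more of the three neighbours (A, B, C) share the chosen reference, the predictor is the component-wise median of their motion vectors, and the packed test (M16( ref ) & 0x8080) == 0x8080 simply checks the sign bit of both int8_t reference indices at once. A minimal C sketch of both ideas, with illustrative names (not x264's actual helpers):

    #include <stdint.h>
    #include <string.h>

    static inline int median3( int a, int b, int c )
    {
        int min_ab = a < b ? a : b;
        int max_ab = a < b ? b : a;
        int m      = max_ab < c ? max_ab : c;     /* min( max(a,b), c ) */
        return min_ab > m ? min_ab : m;           /* max( min(a,b), min(max(a,b),c) ) */
    }

    static void median_mv( int16_t dst[2], const int16_t a[2],
                           const int16_t b[2], const int16_t c[2] )
    {
        dst[0] = median3( a[0], b[0], c[0] );     /* x and y are filtered independently */
        dst[1] = median3( a[1], b[1], c[1] );
    }

    /* branch-free "both refs are negative": bit 7 of each signed byte is its sign */
    static int both_refs_negative( const int8_t ref[2] )
    {
        uint16_t packed;
        memcpy( &packed, ref, 2 );                /* the unaligned load M16() performs */
        return ( packed & 0x8080 ) == 0x8080;
    }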
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/common/ppc/dct.c ^ |
@@ -205,7 +205,7 @@ vec_st( dct_tr1v, 16, (signed short *)dct ); vec_st( dct_tr2v, 32, (signed short *)dct ); vec_st( dct_tr3v, 48, (signed short *)dct ); - + vec_st( dct_tr4v, 64, (signed short *)dct ); vec_st( dct_tr5v, 80, (signed short *)dct ); vec_st( dct_tr6v, 96, (signed short *)dct ); | ||
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/common/ppc/mc.c ^ |
@@ -37,8 +37,8 @@ uint8_t *dst, int i_dst, int i_height ); -static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; -static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; +static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; +static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; static inline int x264_tapfilter( uint8_t *pix, int i_pix_next ) @@ -291,8 +291,8 @@ } -#define DO_PROCESS_W4( a ) \ - dstv_16A = vec_mladd( src##a##v_16A, coeff##a##v, dstv_16A ); \ +#define DO_PROCESS_W4( a ) \ + dstv_16A = vec_mladd( src##a##v_16A, coeff##a##v, dstv_16A ); \ dstv_16B = vec_mladd( src##a##v_16B, coeff##a##v, dstv_16B ) static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride, @@ -369,10 +369,10 @@ } } -#define DO_PROCESS_W8( a ) \ - src##a##v_16A = vec_u8_to_u16( src##a##v_8A ); \ - src##a##v_16B = vec_u8_to_u16( src##a##v_8B ); \ - dstv_16A = vec_mladd( src##a##v_16A, coeff##a##v, dstv_16A ); \ +#define DO_PROCESS_W8( a ) \ + src##a##v_16A = vec_u8_to_u16( src##a##v_8A ); \ + src##a##v_16B = vec_u8_to_u16( src##a##v_8B ); \ + dstv_16A = vec_mladd( src##a##v_16A, coeff##a##v, dstv_16A ); \ dstv_16B = vec_mladd( src##a##v_16B, coeff##a##v, dstv_16B ) static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride, | ||
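For context on the tables shrunk above (their values only span 0..3, so uint8_t suffices and saves cache): hpel_ref0/hpel_ref1 map a quarter-pel position code to the pre-interpolated half-pel plane(s) to read from, where plane 0 is full-pel, 1 horizontal half-pel, 2 vertical and 3 diagonal. A hedged sketch of the lookup as mc_luma-style code performs it (pixel_avg and mc_copy stand in for the real kernels):

    static void mc_luma_sketch( uint8_t *dst, int i_dst_stride,
                                uint8_t *src[4], int i_src_stride,
                                int mvx, int mvy, int i_width, int i_height )
    {
        int qpel_idx = ((mvy&3)<<2) + (mvx&3);            /* 16 fractional positions */
        int offset   = (mvy>>2)*i_src_stride + (mvx>>2);  /* integer part of the MV */
        uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3)*i_src_stride;

        if( qpel_idx & 5 )  /* odd x or y: average two half-pel planes */
        {
            uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
            pixel_avg( dst, i_dst_stride, src1, i_src_stride,
                       src2, i_src_stride, i_width, i_height );
        }
        else                /* even x and y: one plane already holds the answer */
            mc_copy( src1, i_src_stride, dst, i_dst_stride, i_width, i_height );
    }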
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/common/ppc/ppccommon.h ^ |
@@ -113,13 +113,13 @@ vec_u8_t _hv, _lv #define PREP_LOAD_SRC( src ) \ - vec_u8_t _##src##_ = vec_lvsl(0, src) + vec_u8_t _##src##_ = vec_lvsl(0, src) #define VEC_LOAD_G( p, v, n, t ) \ _hv = vec_ld( 0, p ); \ v = (t) vec_lvsl( 0, p ); \ _lv = vec_ld( n - 1, p ); \ - v = (t) vec_perm( _hv, _lv, (vec_u8_t) v ) + v = (t) vec_perm( _hv, _lv, (vec_u8_t) v ) #define VEC_LOAD( p, v, n, t, g ) \ _hv = vec_ld( 0, p ); \ @@ -134,7 +134,7 @@ #define VEC_LOAD_PARTIAL( p, v, n, t, g) \ _hv = vec_ld( 0, p); \ v = (t) vec_perm( _hv, _hv, (vec_u8_t) _##g##_ ) - + /*********************************************************************** * PREP_STORE##n: declares required vectors to store n bytes to a @@ -155,7 +155,7 @@ _lv = vec_perm( (vec_u8_t) v, _tmp1v, _##o##r_ ); \ vec_st( _lv, 15, (uint8_t *) p ); \ _hv = vec_perm( _tmp1v, (vec_u8_t) v, _##o##r_ ); \ - vec_st( _hv, 0, (uint8_t *) p ) + vec_st( _hv, 0, (uint8_t *) p ) #define PREP_STORE8 \ | ||
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/common/ppc/quant.c ^ |
@@ -20,7 +20,7 @@ #include "common/common.h" #include "ppccommon.h" -#include "quant.h" +#include "quant.h" // quant of a whole 4x4 block, unrolled 2x and "pre-scheduled" #define QUANT_16_U( idx0, idx1 ) \ @@ -55,7 +55,7 @@ nz = vec_or(nz, vec_or(temp1v, temp2v)); \ vec_st(temp2v, (idx1), (int16_t*)dct); \ } - + int x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ) { LOAD_ZERO; @@ -220,7 +220,7 @@ vec_u16_t biasvB; vec_s16_t temp1v, temp2v; - + vec_u32_u qbits_u; qbits_u.s[0]=16; i_qbitsv = vec_splat(qbits_u.v, 0); | ||
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/common/predict.c ^ |
@@ -41,7 +41,7 @@ * 16x16 prediction for intra luma block ****************************************************************************/ -#define PREDICT_16x16_DC(v) \ +#define PREDICT_16x16_DC(v)\ for( int i = 0; i < 16; i++ )\ {\ M32( src+ 0 ) = v;\ | ||
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/common/set.c ^ |
@@ -23,7 +23,7 @@ #define SHIFT(x,s) ((s)<=0 ? (x)<<-(s) : ((x)+(1<<((s)-1)))>>(s)) #define DIV(n,d) (((n) + ((d)>>1)) / (d)) -static const int dequant4_scale[6][3] = +static const uint8_t dequant4_scale[6][3] = { { 10, 13, 16 }, { 11, 14, 18 }, @@ -32,7 +32,7 @@ { 16, 20, 25 }, { 18, 23, 29 } }; -static const int quant4_scale[6][3] = +static const uint16_t quant4_scale[6][3] = { { 13107, 8066, 5243 }, { 11916, 7490, 4660 }, @@ -42,11 +42,11 @@ { 7282, 4559, 2893 }, }; -static const int quant8_scan[16] = +static const uint8_t quant8_scan[16] = { 0,3,4,3, 3,1,5,1, 4,5,2,5, 3,1,5,1 }; -static const int dequant8_scale[6][6] = +static const uint8_t dequant8_scale[6][6] = { { 20, 18, 32, 19, 25, 24 }, { 22, 19, 35, 21, 28, 26 }, @@ -55,7 +55,7 @@ { 32, 28, 51, 30, 40, 38 }, { 36, 32, 58, 34, 46, 43 }, }; -static const int quant8_scale[6][6] = +static const uint16_t quant8_scale[6][6] = { { 13107, 11428, 20972, 12222, 16777, 15481 }, { 11916, 10826, 19174, 11058, 14980, 14290 }, | ||
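The retyping above works because the table values fit in 8/16 bits, and only six rows are needed at all: H.264 quantisation is periodic in QP with period 6, every +6 in QP doubling the step size, so only qp%6 is tabulated and qp/6 becomes a shift. Illustrative only — x264 folds the scaling lists and this shift into precomputed per-QP multiplier tables — but the underlying relation is:

    /* k picks one of the 3 distinct coefficient classes of the 4x4 scan */
    extern const uint8_t dequant4_scale[6][3];   /* the table shown above */

    static int dequant4_coef( int coef, int qp, int k )
    {
        return coef * ( dequant4_scale[qp % 6][k] << (qp / 6) );
    }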
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/common/x86/cabac-a.asm ^ |
@@ -24,23 +24,21 @@ %include "x86inc.asm" -SECTION_RODATA - SECTION .text -cextern x264_cabac_range_lps -cextern x264_cabac_transition -cextern x264_cabac_renorm_shift +cextern cabac_range_lps +cextern cabac_transition +cextern cabac_renorm_shift ; t3 must be ecx, since it's used for shift. %ifdef WIN64 - DECLARE_REG_TMP 3,1,2,0,4,5,6,10 + DECLARE_REG_TMP 3,1,2,0,4,5,6,10,2 %define pointer resq %elifdef ARCH_X86_64 - DECLARE_REG_TMP 0,1,2,3,4,5,6,10 + DECLARE_REG_TMP 0,1,2,3,4,5,6,10,6 %define pointer resq %else - DECLARE_REG_TMP 0,4,2,1,3,5,6,2 + DECLARE_REG_TMP 0,4,2,1,3,5,6,2,2 %define pointer resd %endif @@ -70,17 +68,19 @@ %endif %endmacro -cglobal x264_cabac_encode_decision_asm, 0,7 +cglobal cabac_encode_decision_asm, 0,7 movifnidn t0, r0mp movifnidn t1d, r1m mov t5d, [t0+cb.range] - movzx t6d, byte [t0+cb.state+t1] + movzx t4d, byte [t0+cb.state+t1] mov t3d, t5d + mov t6d, t4d shr t5d, 6 + shr t4d, 1 movifnidn t2d, r2m - LOAD_GLOBAL t5d, x264_cabac_range_lps-4, t5, t6*4 - LOAD_GLOBAL t4d, x264_cabac_transition, t2, t6*2 - shr t6d, 6 + LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*4 + LOAD_GLOBAL t4d, cabac_transition, t2, t6*2 + and t6d, 1 sub t3d, t5d cmp t6d, t2d mov t6d, [t0+cb.low] @@ -88,28 +88,74 @@ cmovne t3d, t5d cmovne t6d, t7d mov [t0+cb.state+t1], t4b -;x264_cabac_encode_renorm +;cabac_encode_renorm mov t4d, t3d shr t3d, 3 - LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3 + LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3 shl t4d, t3b shl t6d, t3b add t3d, [t0+cb.queue] mov [t0+cb.range], t4d - cmp t3d, 8 - jl .update_queue_low -;x264_cabac_putbyte + jge cabac_putbyte +.update_queue_low: + mov [t0+cb.low], t6d + mov [t0+cb.queue], t3d + RET + +cglobal cabac_encode_bypass_asm, 0,3 + movifnidn t0, r0mp + movifnidn t3d, r1m + neg t3d + mov t8d, [t0+cb.low] + and t3d, [t0+cb.range] + lea t8d, [t8*2+t3] + mov t3d, [t0+cb.queue] + inc t3d +%ifdef UNIX64 ; .putbyte compiles to nothing but a jmp + jge cabac_putbyte +%else + jge .putbyte +%endif + mov [t0+cb.low], t8d + mov [t0+cb.queue], t3d + RET +.putbyte: + PROLOGUE 0,7 + movifnidn t6d, t8d + jmp cabac_putbyte + +cglobal cabac_encode_terminal_asm, 0,3 + movifnidn t0, r0mp + sub dword [t0+cb.range], 2 +; shortcut: the renormalization shift in terminal +; can only be 0 or 1 and is zero over 99% of the time. + test dword [t0+cb.range], 0x100 + je .renorm + REP_RET +.renorm: + shl dword [t0+cb.low], 1 + shl dword [t0+cb.range], 1 + inc dword [t0+cb.queue] + jge .putbyte + REP_RET +.putbyte: + PROLOGUE 0,7 + mov t3d, [t0+cb.queue] + mov t6d, [t0+cb.low] + jmp cabac_putbyte + +cabac_putbyte: ; alive: t0=cb t3=queue t6=low %ifdef WIN64 DECLARE_REG_TMP 3,4,1,0,2,5,6,10 %endif mov t1d, -1 - add t3d, 2 + add t3d, 10 mov t2d, t6d shl t1d, t3b shr t2d, t3b ; out not t1d - sub t3d, 10 + sub t3d, 18 and t6d, t1d mov t5d, [t0+cb.bytes_outstanding] cmp t2b, 0xff ; FIXME is a 32bit op faster? @@ -127,8 +173,4 @@ .postpone: inc t5d mov [t0+cb.bytes_outstanding], t5d -.update_queue_low: - mov [t0+cb.low], t6d - mov [t0+cb.queue], t3d - RET - + jmp mangle(x264_cabac_encode_decision_asm.update_queue_low) | ||
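Two things changed at once in this file. First, cglobal now applies the x264_ symbol prefix itself, which is why the explicit prefixes are dropped here and in all the following asm files (note the one remaining mangle(x264_...) for a label referenced by its full name). Second, the context-state layout changed: each state byte now packs the 6-bit probability state in bits 1..7 and the MPS in bit 0 — hence the new shr t4d, 1 to index the halved range-LPS table and the and t6d, 1 to recover the MPS. A hedged C rendering of the decision path, with renormalisation and byte output elided:

    #include <stdint.h>

    extern const uint8_t cabac_range_lps[64][4];
    extern const uint8_t cabac_transition[128][2];

    typedef struct { int i_low, i_range; uint8_t state[460]; } cabac_sketch_t; /* illustrative */

    static void encode_decision_sketch( cabac_sketch_t *cb, int ctx, int b )
    {
        int state     = cb->state[ctx];               /* (prob_state << 1) | mps */
        int range_lps = cabac_range_lps[state >> 1][(cb->i_range >> 6) & 3];
        cb->state[ctx] = cabac_transition[state][b];
        cb->i_range  -= range_lps;                    /* tentatively take the MPS subrange */
        if( b != (state & 1) )                        /* coding the LPS instead */
        {
            cb->i_low  += cb->i_range;
            cb->i_range = range_lps;
        }
        /* renormalise i_range back into [256,512), shifting bits out through i_low */
    }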
[+] | Added | x264-snapshot-20100517-2245.tar.bz2/common/x86/const-a.asm ^ |
@@ -0,0 +1,54 @@ +;***************************************************************************** +;* const-a.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2010 x264 project +;* +;* Author: Loren Merritt <lorenm@u.washington.edu> +;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;***************************************************************************** + +%include "x86inc.asm" + +SECTION_RODATA + +const pb_01, times 8 db 0,1 +const pb_0, times 16 db 0 +const pb_a1, times 16 db 0xa1 +const pb_1, times 16 db 1 +const pb_3, times 16 db 3 +const hsub_mul, times 8 db 1, -1 +const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6 + +const pw_1, times 8 dw 1 +const pw_2, times 8 dw 2 +const pw_4, times 8 dw 4 +const pw_8, times 8 dw 8 +const pw_16, times 8 dw 16 +const pw_32, times 8 dw 32 +const pw_64, times 8 dw 64 +const pw_32_0, times 4 dw 32, + times 4 dw 0 +const pw_8000, times 8 dw 0x8000 +const pw_3fff, times 8 dw 0x3fff + +const pd_1, times 4 dd 1 +const pd_128, times 4 dd 128 +const pw_00ff, times 8 dw 0x00ff +const pw_ff00, times 8 dw 0xff00 + +const pb_reverse, db 7, 6, 5, 4, 3, 2, 1, 0 +const sw_64, dd 64 | ||
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/common/x86/cpu-a.asm ^ |
@@ -29,9 +29,9 @@ %ifdef ARCH_X86_64 ;----------------------------------------------------------------------------- -; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx ) +; int cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx ) ;----------------------------------------------------------------------------- -cglobal x264_cpu_cpuid, 5,7 +cglobal cpu_cpuid, 5,7 push rbx mov r11, r1 mov r10, r2 @@ -49,10 +49,10 @@ %else ;----------------------------------------------------------------------------- -; int x264_cpu_cpuid_test( void ) +; int cpu_cpuid_test( void ) ; return 0 if unsupported ;----------------------------------------------------------------------------- -cglobal x264_cpu_cpuid_test +cglobal cpu_cpuid_test pushfd push ebx push ebp @@ -75,9 +75,9 @@ ret ;----------------------------------------------------------------------------- -; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx ) +; int cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx ) ;----------------------------------------------------------------------------- -cglobal x264_cpu_cpuid, 0,6 +cglobal cpu_cpuid, 0,6 mov eax, r0m cpuid mov esi, r1m @@ -91,9 +91,9 @@ RET ;----------------------------------------------------------------------------- -; void x264_stack_align( void (*func)(void*), void *arg ); +; void stack_align( void (*func)(void*), void *arg ); ;----------------------------------------------------------------------------- -cglobal x264_stack_align +cglobal stack_align push ebp mov ebp, esp sub esp, 8 @@ -110,16 +110,23 @@ %endif ;----------------------------------------------------------------------------- -; void x264_emms( void ) +; void cpu_emms( void ) ;----------------------------------------------------------------------------- -cglobal x264_emms +cglobal cpu_emms emms ret ;----------------------------------------------------------------------------- -; void x264_cpu_mask_misalign_sse(void) +; void cpu_sfence( void ) ;----------------------------------------------------------------------------- -cglobal x264_cpu_mask_misalign_sse +cglobal cpu_sfence + sfence + ret + +;----------------------------------------------------------------------------- +; void cpu_mask_misalign_sse( void ) +;----------------------------------------------------------------------------- +cglobal cpu_mask_misalign_sse sub rsp, 4 stmxcsr [rsp] or dword [rsp], 1<<17 | ||
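A usage sketch for the cpuid wrapper above — the leaf-1 feature bits tested are the architectural ones, while the flag constants here are illustrative:

    int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx );

    static unsigned detect_simd( void )
    {
        int eax, ebx, ecx, edx;
        unsigned flags = 0;
        x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );  /* standard feature leaf */
        if( edx & (1 << 23) ) flags |= 1 << 0;        /* MMX   */
        if( edx & (1 << 25) ) flags |= 1 << 1;        /* SSE   */
        if( edx & (1 << 26) ) flags |= 1 << 2;        /* SSE2  */
        if( ecx & (1 << 0)  ) flags |= 1 << 3;        /* SSE3  */
        if( ecx & (1 << 9)  ) flags |= 1 << 4;        /* SSSE3 */
        return flags;
    }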
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/common/x86/dct-32.asm ^ |
@@ -27,13 +27,11 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA - -pw_32: times 8 dw 32 -hsub_mul: times 8 db 1, -1 - SECTION .text +cextern pw_32 +cextern hsub_mul + ; in: m0..m7 ; out: 0,4,6 in mem, rest in regs %macro DCT8_1D 9 @@ -188,10 +186,10 @@ %endmacro ;----------------------------------------------------------------------------- -; void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) +; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- -cglobal x264_sub8x8_dct8_mmx, 3,3 -global x264_sub8x8_dct8_mmx.skip_prologue +cglobal sub8x8_dct8_mmx, 3,3 +global sub8x8_dct8_mmx.skip_prologue .skip_prologue: INIT_MMX call load_diff_4x8_mmx @@ -254,10 +252,10 @@ %endmacro ;----------------------------------------------------------------------------- -; void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] ) +; void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] ) ;----------------------------------------------------------------------------- -cglobal x264_add8x8_idct8_mmx, 2,2 -global x264_add8x8_idct8_mmx.skip_prologue +cglobal add8x8_idct8_mmx, 2,2 +global add8x8_idct8_mmx.skip_prologue .skip_prologue: INIT_MMX add word [r1], 32 @@ -344,9 +342,9 @@ INIT_XMM %macro DCT_SUB8 1 -cglobal x264_sub8x8_dct_%1, 3,3 +cglobal sub8x8_dct_%1, 3,3 add r2, 4*FDEC_STRIDE -global x264_sub8x8_dct_%1.skip_prologue +global sub8x8_dct_%1.skip_prologue .skip_prologue: %ifnidn %1, sse2 mova m7, [hsub_mul] @@ -375,11 +373,11 @@ ret ;----------------------------------------------------------------------------- -; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) +; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- -cglobal x264_sub8x8_dct8_%1, 3,3 +cglobal sub8x8_dct8_%1, 3,3 add r2, 4*FDEC_STRIDE -global x264_sub8x8_dct8_%1.skip_prologue +global sub8x8_dct8_%1.skip_prologue .skip_prologue: %ifidn %1, sse2 LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2-4*FDEC_STRIDE] @@ -419,11 +417,11 @@ DCT_SUB8 ssse3 ;----------------------------------------------------------------------------- -; void x264_add8x8_idct_sse2( uint8_t *pix, int16_t dct[4][4][4] ) +; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] ) ;----------------------------------------------------------------------------- -cglobal x264_add8x8_idct_sse2, 2,2 +cglobal add8x8_idct_sse2, 2,2 add r0, 4*FDEC_STRIDE -global x264_add8x8_idct_sse2.skip_prologue +global add8x8_idct_sse2.skip_prologue .skip_prologue: UNSPILL_SHUFFLE r1, 0,2,1,3, 0,1,2,3 SBUTTERFLY qdq, 0, 1, 4 @@ -456,11 +454,11 @@ ret ;----------------------------------------------------------------------------- -; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] ) +; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] ) ;----------------------------------------------------------------------------- -cglobal x264_add8x8_idct8_sse2, 2,2 +cglobal add8x8_idct8_sse2, 2,2 add r0, 4*FDEC_STRIDE -global x264_add8x8_idct8_sse2.skip_prologue +global add8x8_idct8_sse2.skip_prologue .skip_prologue: UNSPILL r1, 1,2,3,5,6,7 IDCT8_1D 0,1,2,3,4,5,6,7,r1 | ||
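pw_32 and hsub_mul are no longer defined per-file; they now come from the shared const-a.asm added above via cextern, which is what most of the deduplication in these asm hunks amounts to. The hsub_mul constant (interleaved +1/-1 bytes) lets SSSE3's pmaddubsw turn interleaved pixels p1[0],p2[0],p1[1],p2[1],... into eight signed differences in one instruction; in scalar terms the DCT input it produces is just:

    /* what the LOAD_DIFF step feeds the transform: widened residuals.
     * pmaddubsw against hsub_mul computes p1*(+1) + p2*(-1) per pair. */
    static void load_diff_8( int16_t diff[8], const uint8_t *pix1, const uint8_t *pix2 )
    {
        for( int i = 0; i < 8; i++ )
            diff[i] = pix1[i] - pix2[i];
    }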
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/common/x86/dct-64.asm ^ |
@@ -26,11 +26,10 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA -pw_32: times 8 dw 32 -hsub_mul: times 8 db 1, -1 - SECTION .text + +cextern pw_32 +cextern hsub_mul INIT_XMM %macro DCT8_1D 10 @@ -140,7 +139,7 @@ %endmacro %macro DCT_SUB8 1 -cglobal x264_sub8x8_dct_%1, 3,3,11 +cglobal sub8x8_dct_%1, 3,3,11 add r2, 4*FDEC_STRIDE %ifnidn %1, sse2 mova m7, [hsub_mul] @@ -149,7 +148,7 @@ call .skip_prologue RET %endif -global x264_sub8x8_dct_%1.skip_prologue +global sub8x8_dct_%1.skip_prologue .skip_prologue: SWAP 7, 9 LOAD_DIFF8x4 0, 1, 2, 3, 8, 9, r1, r2-4*FDEC_STRIDE @@ -165,9 +164,9 @@ ret ;----------------------------------------------------------------------------- -; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) +; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- -cglobal x264_sub8x8_dct8_%1, 3,3,11 +cglobal sub8x8_dct8_%1, 3,3,11 add r2, 4*FDEC_STRIDE %ifnidn %1, sse2 mova m7, [hsub_mul] @@ -176,7 +175,7 @@ call .skip_prologue RET %endif -global x264_sub8x8_dct8_%1.skip_prologue +global sub8x8_dct8_%1.skip_prologue .skip_prologue: SWAP 7, 10 LOAD_DIFF8x4 0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE @@ -205,16 +204,16 @@ DCT_SUB8 ssse3 ;----------------------------------------------------------------------------- -; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] ) +; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] ) ;----------------------------------------------------------------------------- -cglobal x264_add8x8_idct8_sse2, 2,2,11 +cglobal add8x8_idct8_sse2, 2,2,11 add r0, 4*FDEC_STRIDE pxor m7, m7 %ifdef WIN64 call .skip_prologue RET %endif -global x264_add8x8_idct8_sse2.skip_prologue +global add8x8_idct8_sse2.skip_prologue .skip_prologue: SWAP 7, 9 movdqa m0, [r1+0x00] @@ -237,16 +236,16 @@ ret ;----------------------------------------------------------------------------- -; void x264_add8x8_idct_sse2( uint8_t *pix, int16_t dct[4][4][4] ) +; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] ) ;----------------------------------------------------------------------------- -cglobal x264_add8x8_idct_sse2, 2,2,11 +cglobal add8x8_idct_sse2, 2,2,11 add r0, 4*FDEC_STRIDE pxor m7, m7 %ifdef WIN64 call .skip_prologue RET %endif -global x264_add8x8_idct_sse2.skip_prologue +global add8x8_idct_sse2.skip_prologue .skip_prologue: SWAP 7, 9 mova m0, [r1+ 0] | ||
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/common/x86/dct-a.asm ^ |
@@ -35,12 +35,6 @@ %endmacro SECTION_RODATA -pw_32_0: times 4 dw 32 - times 4 dw 0 -pw_32: times 8 dw 32 -pw_8000: times 8 dw 0x8000 -hsub_mul: times 8 db 1, -1 - pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15 pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15 pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1 @@ -48,11 +42,16 @@ pb_scan4frameb: SHUFFLE_16BIT 0,4,1,2,5,6,3,7 pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7 -pb_1: times 16 db 1 -pw_1: times 8 dw 1 SECTION .text +cextern pw_32_0 +cextern pw_32 +cextern pw_8000 +cextern hsub_mul +cextern pb_1 +cextern pw_1 + %macro WALSH4_1D 5 SUMSUB_BADC m%4, m%3, m%2, m%1, m%5 SUMSUB_BADC m%4, m%2, m%3, m%1, m%5 @@ -73,9 +72,9 @@ INIT_MMX ;----------------------------------------------------------------------------- -; void x264_dct4x4dc_mmx( int16_t d[4][4] ) +; void dct4x4dc( int16_t d[4][4] ) ;----------------------------------------------------------------------------- -cglobal x264_dct4x4dc_mmx, 1,1 +cglobal dct4x4dc_mmx, 1,1 movq m3, [r0+24] movq m2, [r0+16] movq m1, [r0+ 8] @@ -95,9 +94,9 @@ RET ;----------------------------------------------------------------------------- -; void x264_idct4x4dc_mmx( int16_t d[4][4] ) +; void idct4x4dc( int16_t d[4][4] ) ;----------------------------------------------------------------------------- -cglobal x264_idct4x4dc_mmx, 1,1 +cglobal idct4x4dc_mmx, 1,1 movq m3, [r0+24] movq m2, [r0+16] movq m1, [r0+ 8] @@ -113,9 +112,9 @@ %macro SUB_DCT4 1 ;----------------------------------------------------------------------------- -; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 ) +; void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- -cglobal x264_sub4x4_dct_%1, 3,3 +cglobal sub4x4_dct_%1, 3,3 %ifidn %1, mmx .skip_prologue: LOAD_DIFF m0, m4, m5, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE] @@ -140,9 +139,9 @@ SUB_DCT4 ssse3 ;----------------------------------------------------------------------------- -; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] ) +; void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] ) ;----------------------------------------------------------------------------- -cglobal x264_add4x4_idct_mmx, 2,2 +cglobal add4x4_idct_mmx, 2,2 pxor m7, m7 .skip_prologue: movq m1, [r1+ 8] @@ -160,7 +159,7 @@ RET INIT_XMM -cglobal x264_add4x4_idct_sse4, 2,2,6 +cglobal add4x4_idct_sse4, 2,2,6 mova m0, [r1+0x00] ; row1/row0 mova m2, [r1+0x10] ; row3/row2 mova m1, m0 ; row1/row0 @@ -213,7 +212,7 @@ INIT_MMX ;----------------------------------------------------------------------------- -; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 ) +; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- %macro SUB_NxN_DCT 6 cglobal %1, 3,3,11 @@ -249,7 +248,7 @@ %endmacro ;----------------------------------------------------------------------------- -; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] ) +; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] ) ;----------------------------------------------------------------------------- %macro ADD_NxN_IDCT 6-7 cglobal %1, 2,2,11 @@ -280,33 +279,33 @@ %endmacro %ifndef ARCH_X86_64 -SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0 -ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0 
-SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx.skip_prologue, 32, 8, 4, 4 -ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4 - -cextern x264_sub8x8_dct8_mmx.skip_prologue -cextern x264_add8x8_idct8_mmx.skip_prologue -SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0 -ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0 +SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0 +ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0 +SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx.skip_prologue, 32, 8, 4, 4 +ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4 + +cextern sub8x8_dct8_mmx.skip_prologue +cextern add8x8_idct8_mmx.skip_prologue +SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0 +ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0 %endif INIT_XMM -cextern x264_sub8x8_dct_sse2.skip_prologue -cextern x264_sub8x8_dct_ssse3.skip_prologue -SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2.skip_prologue, 128, 8, 0, 0 -SUB_NxN_DCT x264_sub16x16_dct_ssse3, x264_sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0 -cextern x264_add8x8_idct_sse2.skip_prologue -ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2.skip_prologue, 2*64, 8, 0, 0 - -cextern x264_sub8x8_dct8_sse2.skip_prologue -cextern x264_add8x8_idct8_sse2.skip_prologue -SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2.skip_prologue, 128, 8, 0, 0 -ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0 +cextern sub8x8_dct_sse2.skip_prologue +cextern sub8x8_dct_ssse3.skip_prologue +SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2.skip_prologue, 128, 8, 0, 0 +SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0 +cextern add8x8_idct_sse2.skip_prologue +ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2.skip_prologue, 2*64, 8, 0, 0 + +cextern sub8x8_dct8_sse2.skip_prologue +cextern add8x8_idct8_sse2.skip_prologue +SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2.skip_prologue, 128, 8, 0, 0 +ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0 -cextern x264_sub8x8_dct8_ssse3.skip_prologue -SUB_NxN_DCT x264_sub16x16_dct8_ssse3, x264_sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0 +cextern sub8x8_dct8_ssse3.skip_prologue +SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0 ;----------------------------------------------------------------------------- @@ -331,7 +330,7 @@ movq [%3+FDEC_STRIDE*3], %1 %endmacro -cglobal x264_add8x8_idct_dc_mmx, 2,2 +cglobal add8x8_idct_dc_mmx, 2,2 movq mm0, [r1] pxor mm1, mm1 add r0, FDEC_STRIDE*4 @@ -350,7 +349,7 @@ ADD_DC mm2, mm3, r0 RET -cglobal x264_add8x8_idct_dc_ssse3, 2,2 +cglobal add8x8_idct_dc_ssse3, 2,2 movq xmm0, [r1] pxor xmm1, xmm1 add r0, FDEC_STRIDE*4 @@ -388,7 +387,7 @@ movhps [r0+FDEC_STRIDE* 3], xmm5 RET -cglobal x264_add16x16_idct_dc_mmx, 2,3 +cglobal add16x16_idct_dc_mmx, 2,3 mov r2, 4 .loop: movq mm0, [r1] @@ -431,7 +430,7 @@ movdqa [r0+%1+FDEC_STRIDE*3], xmm7 %endmacro -cglobal x264_add16x16_idct_dc_sse2, 2,2,8 +cglobal add16x16_idct_dc_sse2, 2,2,8 call .loop add r0, FDEC_STRIDE*4 %ifdef WIN64 @@ -465,7 +464,7 @@ IDCT_DC_STORE 0, xmm2, xmm3 ret -cglobal x264_add16x16_idct_dc_ssse3, 2,2,8 +cglobal add16x16_idct_dc_ssse3, 2,2,8 call .loop add r0, FDEC_STRIDE*4 %ifdef WIN64 @@ -531,7 +530,7 @@ %endmacro INIT_MMX -cglobal x264_sub8x8_dct_dc_mmxext, 3,3 
+cglobal sub8x8_dct_dc_mmxext, 3,3 DCTDC_2ROW_MMX m0, m4, 0 DCTDC_2ROW_MMX m5, m6, 2 paddw m0, m5 @@ -567,7 +566,7 @@ %endif %endmacro -cglobal x264_sub8x8_dct_dc_sse2, 3,3,8 +cglobal sub8x8_dct_dc_sse2, 3,3,8 pxor m7, m7 DCTDC_2ROW_SSE2 0, 0, m4 DCTDC_2ROW_SSE2 2, 1, m4 @@ -586,10 +585,10 @@ RET ;----------------------------------------------------------------------------- -; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] ) +; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] ) ;----------------------------------------------------------------------------- %macro SCAN_8x8 1 -cglobal x264_zigzag_scan_8x8_frame_%1, 2,2,8 +cglobal zigzag_scan_8x8_frame_%1, 2,2,8 movdqa xmm0, [r1] movdqa xmm1, [r1+16] movdq2q mm0, xmm0 @@ -703,9 +702,9 @@ SCAN_8x8 ssse3 ;----------------------------------------------------------------------------- -; void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[8][8] ) +; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] ) ;----------------------------------------------------------------------------- -cglobal x264_zigzag_scan_8x8_frame_mmxext, 2,2 +cglobal zigzag_scan_8x8_frame_mmxext, 2,2 movq mm0, [r1] movq mm1, [r1+2*8] movq mm2, [r1+2*14] @@ -798,9 +797,9 @@ RET ;----------------------------------------------------------------------------- -; void x264_zigzag_scan_4x4_frame_mmx( int16_t level[16], int16_t dct[4][4] ) +; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] ) ;----------------------------------------------------------------------------- -cglobal x264_zigzag_scan_4x4_frame_mmx, 2,2 +cglobal zigzag_scan_4x4_frame_mmx, 2,2 movq mm0, [r1] movq mm1, [r1+8] movq mm2, [r1+16] @@ -828,9 +827,9 @@ RET ;----------------------------------------------------------------------------- -; void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[4][4] ) +; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] ) ;----------------------------------------------------------------------------- -cglobal x264_zigzag_scan_4x4_frame_ssse3, 2,2 +cglobal zigzag_scan_4x4_frame_ssse3, 2,2 movdqa xmm1, [r1+16] movdqa xmm0, [r1] pshufb xmm1, [pb_scan4frameb] @@ -845,10 +844,10 @@ RET ;----------------------------------------------------------------------------- -; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] ) +; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] ) ;----------------------------------------------------------------------------- ; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2 -cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3 +cglobal zigzag_scan_4x4_field_mmxext, 2,3 pshufw mm0, [r1+4], 0xd2 movq mm1, [r1+16] movq mm2, [r1+24] @@ -862,7 +861,7 @@ RET ;----------------------------------------------------------------------------- -; void x264_zigzag_scan_8x8_field_mmxext( int16_t level[64], int16_t dct[8][8] ) +; void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] ) ;----------------------------------------------------------------------------- ; Output order: @@ -875,7 +874,7 @@ ; 45 46 47 51 56 57 52 53 ; 54 55 58 59 60 61 62 63 -cglobal x264_zigzag_scan_8x8_field_mmxext, 2,3 +cglobal zigzag_scan_8x8_field_mmxext, 2,3 movq mm0, [r1+2*0] ; 03 02 01 00 movq mm1, [r1+2*4] ; 07 06 05 04 movq mm2, [r1+2*8] ; 11 10 09 08 @@ -954,13 +953,13 @@ RET ;----------------------------------------------------------------------------- -; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const 
uint8_t *src, uint8_t *dst ) +; void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst ) ;----------------------------------------------------------------------------- %macro ZIGZAG_SUB_4x4 2 %ifidn %1, ac -cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 4,4,8 +cglobal zigzag_sub_4x4%1_%2_ssse3, 4,4,8 %else -cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 3,3,8 +cglobal zigzag_sub_4x4%1_%2_ssse3, 3,3,8 %endif movd xmm0, [r1+0*FENC_STRIDE] movd xmm1, [r1+1*FENC_STRIDE] @@ -1020,7 +1019,7 @@ ZIGZAG_SUB_4x4 ac, field ;----------------------------------------------------------------------------- -; void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz ) +; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz ) ;----------------------------------------------------------------------------- %macro INTERLEAVE 1 @@ -1047,7 +1046,7 @@ %endmacro INIT_MMX -cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 3,3 +cglobal zigzag_interleave_8x8_cavlc_mmx, 3,3 INTERLEAVE 0 INTERLEAVE 8 INTERLEAVE 16 @@ -1095,7 +1094,7 @@ %endmacro INIT_XMM -cglobal x264_zigzag_interleave_8x8_cavlc_sse2, 3,3,8 +cglobal zigzag_interleave_8x8_cavlc_sse2, 3,3,8 INTERLEAVE_XMM 0 INTERLEAVE_XMM 16 packsswb m2, m3 | ||
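Among the routines renamed above, the *_idct_dc family covers the common DC-only case: each 4x4 block contributes a single DC coefficient, which is rounded to a pixel delta and splatted over the block. A scalar sketch of add8x8_idct_dc's effect (clip3 is the usual three-operand clamp; the (dc+32)>>6 rounding is the standard inverse-transform scaling):

    static inline int clip3( int v, int lo, int hi ) { return v < lo ? lo : v > hi ? hi : v; }

    static void add8x8_idct_dc_sketch( uint8_t *p_dst, int stride, const int16_t dct_dc[4] )
    {
        for( int blk = 0; blk < 4; blk++ )
        {
            int dc = ( dct_dc[blk] + 32 ) >> 6;
            uint8_t *p = p_dst + (blk&1)*4 + (blk>>1)*4*stride;
            for( int y = 0; y < 4; y++ )
                for( int x = 0; x < 4; x++ )
                    p[y*stride + x] = (uint8_t)clip3( p[y*stride + x] + dc, 0, 255 );
        }
    }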
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/common/x86/deblock-a.asm ^ |
@@ -22,14 +22,13 @@ %include "x86inc.asm" -SECTION_RODATA -pb_00: times 16 db 0x00 -pb_01: times 16 db 0x01 -pb_03: times 16 db 0x03 -pb_a1: times 16 db 0xa1 - SECTION .text +cextern pb_0 +cextern pb_1 +cextern pb_3 +cextern pb_a1 + ; expands to [base],...,[base+7*stride] %define PASS8ROWS(base, base3, stride, stride3) \ [base], [base+stride], [base+stride*2], [base3], \ @@ -234,11 +233,11 @@ %macro DEBLOCK_P0_Q0 0 mova m5, m1 pxor m5, m2 ; p0^q0 - pand m5, [pb_01] ; (p0^q0)&1 + pand m5, [pb_1] ; (p0^q0)&1 pcmpeqb m4, m4 pxor m3, m4 pavgb m3, m0 ; (p1 - q1 + 256)>>1 - pavgb m3, [pb_03] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 + pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 pxor m4, m1 pavgb m4, m2 ; (q0 - p0 + 256)>>1 pavgb m3, m5 @@ -263,7 +262,7 @@ pavgb %6, m2 pavgb %2, %6 ; avg(p2,avg(p0,q0)) pxor %6, %3 - pand %6, [pb_01] ; (p2^avg(p0,q0))&1 + pand %6, [pb_1] ; (p2^avg(p0,q0))&1 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 mova %6, %1 psubusb %6, %5 @@ -275,10 +274,10 @@ %ifdef ARCH_X86_64 ;----------------------------------------------------------------------------- -; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_XMM -cglobal x264_deblock_v_luma_sse2, 5,5,10 +cglobal deblock_v_luma_sse2, 5,5,10 movd m8, [r4] ; tc0 lea r4, [r1*3] dec r2d ; alpha-1 @@ -321,10 +320,10 @@ RET ;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX -cglobal x264_deblock_h_luma_sse2, 5,7 +cglobal deblock_h_luma_sse2, 5,7 movsxd r10, r1d lea r11, [r10+r10*2] lea r6, [r0-4] @@ -345,13 +344,13 @@ ; vertical filter ; alpha, beta, tc0 are still in r2d, r3d, r4 - ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them + ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them lea r0, [pix_tmp+0x30] mov r1d, 0x10 %ifdef WIN64 mov [rsp+0x20], r4 %endif - call x264_deblock_v_luma_sse2 + call deblock_v_luma_sse2 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) add r6, 2 @@ -383,9 +382,9 @@ %macro DEBLOCK_LUMA 3 ;----------------------------------------------------------------------------- -; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_%2_luma_%1, 5,5 +cglobal deblock_%2_luma_%1, 5,5 lea r4, [r1*3] dec r2 ; alpha-1 neg r4 @@ -436,10 +435,10 @@ RET ;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX -cglobal x264_deblock_h_luma_%1, 0,5 +cglobal deblock_h_luma_%1, 0,5 mov r0, r0mp mov r3, r1m lea r4, [r3*3] @@ -462,11 +461,11 @@ PUSH dword r2m PUSH dword 16 PUSH dword r0 - call 
x264_deblock_%2_luma_%1 + call deblock_%2_luma_%1 %ifidn %2, v8 add dword [esp ], 8 ; pix_tmp+0x38 add dword [esp+16], 2 ; tc0+2 - call x264_deblock_%2_luma_%1 + call deblock_%2_luma_%1 %endif ADD esp, 20 @@ -517,9 +516,9 @@ mova t3, t2 mova t4, t2 psrlw t2, 1 - pavgb t2, mpb_00 + pavgb t2, mpb_0 pxor t2, t0 - pand t2, mpb_01 + pand t2, mpb_1 psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4; mova t1, p2 @@ -528,21 +527,21 @@ psubb t2, q1 paddb t3, t3 psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1 - pand t2, mpb_01 + pand t2, mpb_1 psubb t1, t2 pavgb t1, p1 pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2 psrlw t3, 2 - pavgb t3, mpb_00 + pavgb t3, mpb_0 pxor t3, t1 - pand t3, mpb_01 + pand t3, mpb_1 psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8 mova t3, p0 mova t2, p0 pxor t3, q1 pavgb t2, q1 - pand t3, mpb_01 + pand t3, mpb_1 psubb t2, t3 pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4 @@ -562,9 +561,9 @@ paddb t2, t2 paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0 psrlw t2, 2 - pavgb t2, mpb_00 + pavgb t2, mpb_0 pxor t2, t1 - pand t2, mpb_01 + pand t2, mpb_1 psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8 pxor t0, p1 @@ -603,8 +602,8 @@ %define mask0 m12 %define mask1p m13 %define mask1q [rsp-24] - %define mpb_00 m14 - %define mpb_01 m15 + %define mpb_0 m14 + %define mpb_1 m15 %else %define spill(x) [esp+16*x+((stack_offset+4)&15)] %define p2 [r4+r1] @@ -614,14 +613,14 @@ %define mask0 spill(2) %define mask1p spill(3) %define mask1q spill(4) - %define mpb_00 [pb_00] - %define mpb_01 [pb_01] + %define mpb_0 [pb_0] + %define mpb_1 [pb_1] %endif ;----------------------------------------------------------------------------- -; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 +cglobal deblock_%2_luma_intra_%1, 4,6,16 %ifndef ARCH_X86_64 sub esp, 0x60 %endif @@ -638,12 +637,12 @@ mova q0, [r0] mova q1, [r0+r1] %ifdef ARCH_X86_64 - pxor mpb_00, mpb_00 - mova mpb_01, [pb_01] + pxor mpb_0, mpb_0 + mova mpb_1, [pb_1] LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 SWAP 7, 12 ; m12=mask0 - pavgb t5, mpb_00 - pavgb t5, mpb_01 ; alpha/4+1 + pavgb t5, mpb_0 + pavgb t5, mpb_1 ; alpha/4+1 movdqa p2, [r4+r1] movdqa q2, [r0+2*r1] DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1 @@ -658,8 +657,8 @@ LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 mova m4, t5 mova mask0, m7 - pavgb m4, [pb_00] - pavgb m4, [pb_01] ; alpha/4+1 + pavgb m4, [pb_0] + pavgb m4, [pb_1] ; alpha/4+1 DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 pand m6, mask0 DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1 @@ -681,9 +680,9 @@ INIT_MMX %ifdef ARCH_X86_64 ;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_h_luma_intra_%1, 4,7 +cglobal deblock_h_luma_intra_%1, 4,7 movsxd r10, r1d lea r11, [r10*3] lea r6, [r0-4] @@ -699,7 +698,7 @@ lea r0, [pix_tmp+0x40] mov r1, 0x10 - call x264_deblock_v_luma_intra_%1 + call deblock_v_luma_intra_%1 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) lea r5, [r6+r11] @@ -712,7 +711,7 @@ add rsp, 0x88 RET %else -cglobal 
x264_deblock_h_luma_intra_%1, 2,4 +cglobal deblock_h_luma_intra_%1, 2,4 lea r3, [r1*3] sub r0, 4 lea r2, [r0+r3] @@ -731,10 +730,10 @@ PUSH dword r2m PUSH dword 16 PUSH r0 - call x264_deblock_%2_luma_intra_%1 + call deblock_%2_luma_intra_%1 %ifidn %2, v8 add dword [rsp], 8 ; pix_tmp+8 - call x264_deblock_%2_luma_intra_%1 + call deblock_%2_luma_intra_%1 %endif ADD esp, 16 @@ -785,9 +784,9 @@ %define t6 r6 ;----------------------------------------------------------------------------- -; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_v_chroma_mmxext, 5,6 +cglobal deblock_v_chroma_mmxext, 5,6 CHROMA_V_START movq m0, [t5] movq m1, [t5+r1] @@ -799,9 +798,9 @@ RET ;----------------------------------------------------------------------------- -; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_h_chroma_mmxext, 5,7 +cglobal deblock_h_chroma_mmxext, 5,7 %ifdef ARCH_X86_64 %define buf0 [rsp-24] %define buf1 [rsp-16] @@ -835,7 +834,7 @@ %macro CHROMA_INTRA_P0 3 movq m4, %1 pxor m4, %3 - pand m4, [pb_01] ; m4 = (p0^q1)&1 + pand m4, [pb_1] ; m4 = (p0^q1)&1 pavgb %1, %3 psubusb %1, m4 pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) @@ -845,9 +844,9 @@ %define t6 r5 ;----------------------------------------------------------------------------- -; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_v_chroma_intra_mmxext, 4,5 +cglobal deblock_v_chroma_intra_mmxext, 4,5 CHROMA_V_START movq m0, [t5] movq m1, [t5+r1] @@ -859,9 +858,9 @@ RET ;----------------------------------------------------------------------------- -; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_h_chroma_intra_mmxext, 4,6 +cglobal deblock_h_chroma_intra_mmxext, 4,6 CHROMA_H_START TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) call chroma_intra_body_mmxext | ||
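For orientation, the normal (bS < 4) luma edge filter these kernels vectorise computes one clipped delta from the four pixels straddling the edge; the pavgb arithmetic in DEBLOCK_P0_Q0 is a branch-free bytewise reformulation of this scalar form (simplified: the real filter also conditionally updates p1/q1 and widens tc accordingly):

    #include <stdlib.h>   /* abs */

    static inline int clip3( int v, int lo, int hi ) { return v < lo ? lo : v > hi ? hi : v; }

    static void deblock_edge_sketch( uint8_t *pix, int xstride, int alpha, int beta, int tc )
    {
        int p1 = pix[-2*xstride], p0 = pix[-1*xstride];
        int q0 = pix[ 0*xstride], q1 = pix[ 1*xstride];

        if( abs(p0-q0) < alpha && abs(p1-p0) < beta && abs(q1-q0) < beta )
        {
            int delta = clip3( ((q0 - p0)*4 + (p1 - q1) + 4) >> 3, -tc, tc );
            pix[-1*xstride] = (uint8_t)clip3( p0 + delta, 0, 255 );
            pix[ 0*xstride] = (uint8_t)clip3( q0 - delta, 0, 255 );
        }
    }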
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/common/x86/mc-a.asm ^ |
@@ -29,15 +29,16 @@ SECTION_RODATA 32 ch_shuffle: db 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,0,0 -pw_1: times 8 dw 1 -pw_4: times 8 dw 4 -pw_8: times 8 dw 8 -pw_32: times 8 dw 32 -pw_64: times 8 dw 64 -sw_64: dd 64 SECTION .text +cextern pw_1 +cextern pw_4 +cextern pw_8 +cextern pw_32 +cextern pw_64 +cextern sw_64 + ;============================================================================= ; implicit weighted biprediction ;============================================================================= @@ -129,10 +130,10 @@ %endmacro ;----------------------------------------------------------------------------- -; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight ) +; int pixel_avg_weight_w16( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight ) ;----------------------------------------------------------------------------- %macro AVG_WEIGHT 2-3 0 -cglobal x264_pixel_avg_weight_w%2_%1 +cglobal pixel_avg_weight_w%2_%1 BIWEIGHT_START AVG_START %3 %if %2==8 && mmsize==16 @@ -165,7 +166,7 @@ AVG_WEIGHT mmxext, 8 AVG_WEIGHT mmxext, 16 INIT_XMM -%define x264_pixel_avg_weight_w4_sse2 x264_pixel_avg_weight_w4_mmxext +%define pixel_avg_weight_w4_sse2 pixel_avg_weight_w4_mmxext AVG_WEIGHT sse2, 8, 7 AVG_WEIGHT sse2, 16, 7 %define BIWEIGHT BIWEIGHT_SSSE3 @@ -293,8 +294,9 @@ %endrep %endmacro - -;void x264_mc_weight_wX( uint8_t *dst, int i_dst_stride, uint8_t *src,int i_src_stride, x264_weight_t *weight,int h) +;----------------------------------------------------------------------------- +;void mc_weight_wX( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, weight_t *weight, int h ) +;----------------------------------------------------------------------------- %ifdef ARCH_X86_64 %define NUMREGS 6 @@ -307,7 +309,7 @@ %endif %macro WEIGHTER 2 - cglobal x264_mc_weight_w%1_%2, NUMREGS, NUMREGS, 7 + cglobal mc_weight_w%1_%2, NUMREGS, NUMREGS, 7 WEIGHT_START %1 LOAD_HEIGHT .loop: @@ -363,9 +365,11 @@ %endrep %endmacro -;void x264_mc_offset_wX( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, x264_weight_t *w, int h ) +;----------------------------------------------------------------------------- +;void mc_offset_wX( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, weight_t *w, int h ) +;----------------------------------------------------------------------------- %macro OFFSET 3 - cglobal x264_mc_offset%3_w%1_%2, NUMREGS, NUMREGS + cglobal mc_offset%3_w%1_%2, NUMREGS, NUMREGS mova m2, [r4] LOAD_HEIGHT .loop: @@ -402,25 +406,25 @@ ;============================================================================= ;----------------------------------------------------------------------------- -; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride, -; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, int weight ); +; void pixel_avg_4x4( uint8_t *dst, int dst_stride, +; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, int weight ); ;----------------------------------------------------------------------------- %macro AVGH 3 -cglobal x264_pixel_avg_%1x%2_%3 +cglobal pixel_avg_%1x%2_%3 mov eax, %2 cmp dword r6m, 32 - jne x264_pixel_avg_weight_w%1_%3 + jne pixel_avg_weight_w%1_%3 %if mmsize == 16 && %1 == 16 test dword r4m, 15 - jz x264_pixel_avg_w%1_sse2 + jz pixel_avg_w%1_sse2 %endif - jmp x264_pixel_avg_w%1_mmxext + jmp pixel_avg_w%1_mmxext %endmacro 
;----------------------------------------------------------------------------- -; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride, -; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, -; int height, int weight ); +; void pixel_avg_w4( uint8_t *dst, int dst_stride, +; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, +; int height, int weight ); ;----------------------------------------------------------------------------- %macro AVG_END 0 @@ -445,17 +449,17 @@ %endmacro INIT_MMX -AVG_FUNC x264_pixel_avg_w4_mmxext, movd, movd +AVG_FUNC pixel_avg_w4_mmxext, movd, movd AVGH 4, 8, mmxext AVGH 4, 4, mmxext AVGH 4, 2, mmxext -AVG_FUNC x264_pixel_avg_w8_mmxext, movq, movq +AVG_FUNC pixel_avg_w8_mmxext, movq, movq AVGH 8, 16, mmxext AVGH 8, 8, mmxext AVGH 8, 4, mmxext -cglobal x264_pixel_avg_w16_mmxext +cglobal pixel_avg_w16_mmxext AVG_START movq mm0, [t2 ] movq mm1, [t2+8] @@ -475,7 +479,7 @@ AVGH 16, 8, mmxext INIT_XMM -AVG_FUNC x264_pixel_avg_w16_sse2, movdqu, movdqa +AVG_FUNC pixel_avg_w16_sse2, movdqu, movdqa AVGH 16, 16, sse2 AVGH 16, 8, sse2 AVGH 8, 16, sse2 @@ -498,12 +502,12 @@ ;============================================================================= ;----------------------------------------------------------------------------- -; void x264_pixel_avg2_w4_mmxext( uint8_t *dst, int dst_stride, -; uint8_t *src1, int src_stride, -; uint8_t *src2, int height ); +; void pixel_avg2_w4( uint8_t *dst, int dst_stride, +; uint8_t *src1, int src_stride, +; uint8_t *src2, int height ); ;----------------------------------------------------------------------------- %macro AVG2_W8 2 -cglobal x264_pixel_avg2_w%1_mmxext, 6,7 +cglobal pixel_avg2_w%1_mmxext, 6,7 sub r4, r2 lea r6, [r4+r3] .height_loop: @@ -524,7 +528,7 @@ AVG2_W8 8, movq %macro AVG2_W16 2 -cglobal x264_pixel_avg2_w%1_mmxext, 6,7 +cglobal pixel_avg2_w%1_mmxext, 6,7 sub r4, r2 lea r6, [r4+r3] .height_loop: @@ -550,7 +554,7 @@ AVG2_W16 12, movd AVG2_W16 16, movq -cglobal x264_pixel_avg2_w20_mmxext, 6,7 +cglobal pixel_avg2_w20_mmxext, 6,7 sub r4, r2 lea r6, [r4+r3] .height_loop: @@ -578,7 +582,7 @@ jg .height_loop REP_RET -cglobal x264_pixel_avg2_w16_sse2, 6,7 +cglobal pixel_avg2_w16_sse2, 6,7 sub r4, r2 lea r6, [r4+r3] .height_loop: @@ -597,7 +601,7 @@ REP_RET %macro AVG2_W20 1 -cglobal x264_pixel_avg2_w20_%1, 6,7 +cglobal pixel_avg2_w20_%1, 6,7 sub r4, r2 lea r6, [r4+r3] .height_loop: @@ -647,16 +651,16 @@ %endmacro %macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set -cglobal x264_pixel_avg2_w%1_cache%2_%3 +cglobal pixel_avg2_w%1_cache%2_%3 mov eax, r2m and eax, 0x1f|(%2>>1) cmp eax, (32-%1)|(%2>>1) - jle x264_pixel_avg2_w%1_%3 + jle pixel_avg2_w%1_%3 ;w12 isn't needed because w16 is just as fast if there's no cacheline split %if %1 == 12 - jmp x264_pixel_avg2_w16_cache_mmxext + jmp pixel_avg2_w16_cache_mmxext %else - jmp x264_pixel_avg2_w%1_cache_mmxext + jmp pixel_avg2_w%1_cache_mmxext %endif %endmacro @@ -687,7 +691,7 @@ %2 [r0+%1], mm0 %endmacro -x264_pixel_avg2_w8_cache_mmxext: +pixel_avg2_w8_cache_mmxext: AVG_CACHELINE_START AVG_CACHELINE_LOOP 0, movq add r2, r3 @@ -696,7 +700,7 @@ jg .height_loop REP_RET -x264_pixel_avg2_w16_cache_mmxext: +pixel_avg2_w16_cache_mmxext: AVG_CACHELINE_START AVG_CACHELINE_LOOP 0, movq AVG_CACHELINE_LOOP 8, movq @@ -706,7 +710,7 @@ jg .height_loop REP_RET -x264_pixel_avg2_w20_cache_mmxext: +pixel_avg2_w20_cache_mmxext: AVG_CACHELINE_START AVG_CACHELINE_LOOP 0, movq AVG_CACHELINE_LOOP 8, movq @@ -754,11 +758,11 @@ rep ret %endmacro -cglobal 
x264_pixel_avg2_w16_cache64_ssse3 +cglobal pixel_avg2_w16_cache64_ssse3 mov eax, r2m and eax, 0x3f cmp eax, 0x30 - jle x264_pixel_avg2_w16_sse2 + jle pixel_avg2_w16_sse2 PROLOGUE 6,7 lea r6, [r4+r2] and r4, ~0xf @@ -807,10 +811,10 @@ INIT_MMX ;----------------------------------------------------------------------------- -; void x264_mc_copy_w4_mmx( uint8_t *dst, int i_dst_stride, -; uint8_t *src, int i_src_stride, int i_height ) +; void mc_copy_w4( uint8_t *dst, int i_dst_stride, +; uint8_t *src, int i_src_stride, int i_height ) ;----------------------------------------------------------------------------- -cglobal x264_mc_copy_w4_mmx, 4,6 +cglobal mc_copy_w4_mmx, 4,6 cmp dword r4m, 4 lea r5, [r3*3] lea r4, [r1*3] @@ -822,7 +826,7 @@ COPY4 movd, movd, r4, r5 RET -cglobal x264_mc_copy_w8_mmx, 5,7 +cglobal mc_copy_w8_mmx, 5,7 lea r6, [r3*3] lea r5, [r1*3] .height_loop: @@ -833,7 +837,7 @@ jg .height_loop REP_RET -cglobal x264_mc_copy_w16_mmx, 5,7 +cglobal mc_copy_w16_mmx, 5,7 lea r6, [r3*3] lea r5, [r1*3] .height_loop: @@ -873,11 +877,11 @@ REP_RET %endmacro -COPY_W16_SSE2 x264_mc_copy_w16_sse2, movdqu +COPY_W16_SSE2 mc_copy_w16_sse2, movdqu ; cacheline split with mmx has too much overhead; the speed benefit is near-zero. ; but with SSE3 the overhead is zero, so there's no reason not to include it. -COPY_W16_SSE2 x264_mc_copy_w16_sse3, lddqu -COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa +COPY_W16_SSE2 mc_copy_w16_sse3, lddqu +COPY_W16_SSE2 mc_copy_w16_aligned_sse2, movdqa @@ -887,11 +891,11 @@ ; FIXME assumes 64 byte cachelines ;----------------------------------------------------------------------------- -; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y, -; uint8_t *pix_uv, int stride_uv, int mb_x ) +; void prefetch_fenc( uint8_t *pix_y, int stride_y, +; uint8_t *pix_uv, int stride_uv, int mb_x ) ;----------------------------------------------------------------------------- %ifdef ARCH_X86_64 -cglobal x264_prefetch_fenc_mmxext, 5,5 +cglobal prefetch_fenc_mmxext, 5,5 mov eax, r4d and eax, 3 imul eax, r1d @@ -910,7 +914,7 @@ RET %else -cglobal x264_prefetch_fenc_mmxext +cglobal prefetch_fenc_mmxext mov r2, [esp+20] mov r1, [esp+8] mov r0, [esp+4] @@ -935,9 +939,9 @@ %endif ; ARCH_X86_64 ;----------------------------------------------------------------------------- -; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity ) +; void prefetch_ref( uint8_t *pix, int stride, int parity ) ;----------------------------------------------------------------------------- -cglobal x264_prefetch_ref_mmxext, 3,3 +cglobal prefetch_ref_mmxext, 3,3 dec r2d and r2d, r1d lea r0, [r0+r2*8+64] @@ -982,16 +986,16 @@ %endmacro ;----------------------------------------------------------------------------- -; void x264_mc_chroma_mmxext( uint8_t *dst, int dst_stride, -; uint8_t *src, int src_stride, -; int dx, int dy, -; int width, int height ) +; void mc_chroma( uint8_t *dst, int dst_stride, +; uint8_t *src, int src_stride, +; int dx, int dy, +; int width, int height ) ;----------------------------------------------------------------------------- %macro MC_CHROMA 1-2 0 -cglobal x264_mc_chroma_%1 +cglobal mc_chroma_%1 %if mmsize == 16 cmp dword r6m, 4 - jle x264_mc_chroma_mmxext + jle mc_chroma_mmxext %endif PROLOGUE 0,6,%2 MC_CHROMA_START @@ -1151,7 +1155,7 @@ %macro MC_CHROMA_SSSE3 2 INIT_MMX -cglobal x264_mc_chroma_ssse3%1, 0,6,%2 +cglobal mc_chroma_ssse3%1, 0,6,%2 MC_CHROMA_START and r4d, 7 and r5d, 7 | ||
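The mc_weight_wX/mc_offset* kernels above implement explicit weighted prediction; the add/sub offset-only variants exist so the multiply can be skipped when the scale is unity. In scalar terms (struct field names here are illustrative, not x264_weight_t's actual layout):

    static inline int clip3( int v, int lo, int hi ) { return v < lo ? lo : v > hi ? hi : v; }

    typedef struct { int scale, denom, offset; } weight_sketch_t;

    static void mc_weight_sketch( uint8_t *dst, int i_dst, const uint8_t *src, int i_src,
                                  const weight_sketch_t *w, int width, int height )
    {
        int round = w->denom ? 1 << (w->denom - 1) : 0;
        for( int y = 0; y < height; y++, dst += i_dst, src += i_src )
            for( int x = 0; x < width; x++ )
                dst[x] = (uint8_t)clip3( ((src[x]*w->scale + round) >> w->denom) + w->offset,
                                         0, 255 );
    }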
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/common/x86/mc-a2.asm ^ |
@@ -33,13 +33,14 @@
 filt_mul51: times 8 db -5, 1
 hpel_shuf: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
-pw_1:   times 8 dw 1
-pw_16:  times 8 dw 16
-pw_32:  times 8 dw 32
-pd_128: times 4 dd 128
-
 SECTION .text
+cextern pw_1
+cextern pw_16
+cextern pw_32
+cextern pd_128
+cextern pw_3fff
+
 %macro LOAD_ADD 4
     movh %4, %3
     movh %1, %2
@@ -121,9 +122,9 @@
 %macro HPEL_V 1-2 0
 ;-----------------------------------------------------------------------------
-; void x264_hpel_filter_v_mmxext( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
+; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
 ;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_v_%1, 5,6,%2
+cglobal hpel_filter_v_%1, 5,6,%2
 %ifdef WIN64
     movsxd r4, r4d
 %endif
@@ -180,9 +181,9 @@
 HPEL_V mmxext
 ;-----------------------------------------------------------------------------
-; void x264_hpel_filter_c_mmxext( uint8_t *dst, int16_t *buf, int width );
+; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
 ;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_c_mmxext, 3,3
+cglobal hpel_filter_c_mmxext, 3,3
     add r0, r2
     lea r1, [r1+r2*2]
     neg r2
@@ -209,9 +210,9 @@
     REP_RET
 ;-----------------------------------------------------------------------------
-; void x264_hpel_filter_h_mmxext( uint8_t *dst, uint8_t *src, int width );
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
 ;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_h_mmxext, 3,3
+cglobal hpel_filter_h_mmxext, 3,3
     add r0, r2
     add r1, r2
     neg r2
@@ -256,9 +257,9 @@
 %macro HPEL_C 1
 ;-----------------------------------------------------------------------------
-; void x264_hpel_filter_c_sse2( uint8_t *dst, int16_t *buf, int width );
+; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
 ;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_c_%1, 3,3,9
+cglobal hpel_filter_c_%1, 3,3,9
     add r0, r2
     lea r1, [r1+r2*2]
     neg r2
@@ -331,9 +332,9 @@
 %endmacro
 ;-----------------------------------------------------------------------------
-; void x264_hpel_filter_h_sse2( uint8_t *dst, uint8_t *src, int width );
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
 ;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_h_sse2, 3,3,8
+cglobal hpel_filter_h_sse2, 3,3,8
     add r0, r2
     add r1, r2
     neg r2
@@ -380,9 +381,9 @@
 %ifndef ARCH_X86_64
 ;-----------------------------------------------------------------------------
-; void x264_hpel_filter_h_ssse3( uint8_t *dst, uint8_t *src, int width );
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
 ;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_h_ssse3, 3,3
+cglobal hpel_filter_h_ssse3, 3,3
     add r0, r2
     add r1, r2
     neg r2
@@ -557,10 +558,10 @@
 %macro HPEL 1
 ;-----------------------------------------------------------------------------
-; void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
-;                             uint8_t *src, int stride, int width, int height)
+; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
+;                   uint8_t *src, int stride, int width, int height)
 ;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_%1, 7,7,16
+cglobal hpel_filter_%1, 7,7,16
 %ifdef WIN64
     movsxd r4, r4d
     movsxd r5, r5d
@@ -626,20 +627,16 @@
 HPEL ssse3
 %endif
-cglobal x264_sfence
-    sfence
-    ret
-
 %undef movntq
 %undef movntps
 %undef sfence
 ;-----------------------------------------------------------------------------
-; void x264_plane_copy_core_mmxext( uint8_t *dst, int i_dst,
-;                                   uint8_t *src, int i_src, int w, int h)
+; void plane_copy_core( uint8_t *dst, int i_dst,
+;                       uint8_t *src, int i_src, int w, int h)
 ;-----------------------------------------------------------------------------
 ; assumes i_dst and w are multiples of 16, and i_dst>w
-cglobal x264_plane_copy_core_mmxext, 6,7
+cglobal plane_copy_core_mmxext, 6,7
     movsxdifnidn r1, r1d
     movsxdifnidn r3, r3d
     movsxdifnidn r4, r4d
@@ -698,9 +695,9 @@
 ; memzero SSE will fail for non-mod128.
 ;-----------------------------------------------------------------------------
-; void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
+; void *memcpy_aligned( void *dst, const void *src, size_t n );
 ;-----------------------------------------------------------------------------
-cglobal x264_memcpy_aligned_mmx, 3,3
+cglobal memcpy_aligned_mmx, 3,3
     test r2d, 16
     jz .copy32
     sub r2d, 16
@@ -722,9 +719,9 @@
     REP_RET
 ;-----------------------------------------------------------------------------
-; void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n );
+; void *memcpy_aligned( void *dst, const void *src, size_t n );
 ;-----------------------------------------------------------------------------
-cglobal x264_memcpy_aligned_sse2, 3,3
+cglobal memcpy_aligned_sse2, 3,3
     test r2d, 16
     jz .copy32
     sub r2d, 16
@@ -752,10 +749,10 @@
     REP_RET
 ;-----------------------------------------------------------------------------
-; void *x264_memzero_aligned( void *dst, size_t n );
+; void *memzero_aligned( void *dst, size_t n );
 ;-----------------------------------------------------------------------------
 %macro MEMZERO 1
-cglobal x264_memzero_aligned_%1, 2,2
+cglobal memzero_aligned_%1, 2,2
     add r0, r1
     neg r1
     pxor m0, m0
@@ -778,9 +775,9 @@

 ;-----------------------------------------------------------------------------
-; void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride )
+; void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
 ;-----------------------------------------------------------------------------
-cglobal x264_integral_init4h_sse4, 3,4
+cglobal integral_init4h_sse4, 3,4
     lea r3, [r0+r2*2]
     add r1, r2
     neg r2
@@ -799,7 +796,7 @@
     jl .loop
     REP_RET

-cglobal x264_integral_init8h_sse4, 3,4
+cglobal integral_init8h_sse4, 3,4
     lea r3, [r0+r2*2]
     add r1, r2
     neg r2
@@ -826,9 +823,9 @@
 %macro INTEGRAL_INIT_8V 1
 ;-----------------------------------------------------------------------------
-; void x264_integral_init8v_mmx( uint16_t *sum8, int stride )
+; void integral_init8v( uint16_t *sum8, int stride )
 ;-----------------------------------------------------------------------------
-cglobal x264_integral_init8v_%1, 3,3
+cglobal integral_init8v_%1, 3,3
     shl r1, 1
     add r0, r1
     lea r2, [r0+r1*8]
@@ -851,10 +848,10 @@
 INTEGRAL_INIT_8V sse2
 ;-----------------------------------------------------------------------------
-; void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride )
+; void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
 ;-----------------------------------------------------------------------------
 INIT_MMX
-cglobal x264_integral_init4v_mmx, 3,5
+cglobal integral_init4v_mmx, 3,5
     shl r2, 1
     lea r3, [r0+r2*4]
     lea r4, [r0+r2*8]
@@ -876,7 +873,7 @@
     REP_RET

 INIT_XMM
-cglobal x264_integral_init4v_sse2, 3,5
+cglobal integral_init4v_sse2, 3,5
     shl r2, 1
     add r0, r2
     add r1, r2
@@ -901,7 +898,7 @@
     jl .loop
     REP_RET

-cglobal x264_integral_init4v_ssse3, 3,5
+cglobal integral_init4v_ssse3, 3,5
     shl r2, 1
     add r0, r2
     add r1, r2
@@ -993,7 +990,7 @@
 ;                                 int src_stride, int dst_stride, int width, int height )
 ;-----------------------------------------------------------------------------
 %macro FRAME_INIT_LOWRES 1-2 0 ; FIXME
-cglobal x264_frame_init_lowres_core_%1, 6,7,%2
+cglobal frame_init_lowres_core_%1, 6,7,%2
 %ifdef WIN64
     movsxd r5, r5d
 %endif
@@ -1114,7 +1111,7 @@
 ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
 ;                             uint16_t *inter_costs, uint16_t *inv_qscales, int len )
 ;-----------------------------------------------------------------------------
-cglobal x264_mbtree_propagate_cost_sse2, 6,6
+cglobal mbtree_propagate_cost_sse2, 6,6
     shl r5d, 1
     lea r0, [r0+r5*2]
     add r1, r5
@@ -1132,8 +1129,9 @@
     pmaddwd xmm0, xmm2
     paddd   xmm0, xmm4
     psrld   xmm0, 8       ; intra*invq>>8
-    movq    xmm1, [r1+r5] ; prop
     movq    xmm3, [r3+r5] ; inter
+    movq    xmm1, [r1+r5] ; prop
+    pand    xmm3, [pw_3fff]
     punpcklwd xmm1, xmm5
     punpcklwd xmm3, xmm5
     paddd   xmm0, xmm1    ; prop + (intra*invq>>8)
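Aside from the constant deduplication (cextern) and the dropped x264_ label prefixes, the only functional change in this file is the new `pand xmm3, [pw_3fff]` in mbtree_propagate_cost, which clamps each inter cost to its low 14 bits. A rough scalar sketch, in C, of what the quoted part of that loop computes; the names here are hypothetical and only the operations visible in the hunk are modeled:

#include <stdint.h>

static void propagate_cost_sketch( int32_t *dst, const uint16_t *prop,
                                   const uint16_t *intra, const uint16_t *inter,
                                   const uint16_t *invq, int len )
{
    for( int i = 0; i < len; i++ )
    {
        int icost        = (intra[i] * invq[i]) >> 8; /* psrld xmm0, 8: intra*invq>>8 */
        int inter_masked = inter[i] & 0x3fff;         /* the new pand against pw_3fff */
        dst[i] = prop[i] + icost;                     /* prop + (intra*invq>>8)       */
        (void)inter_masked; /* consumed by the tail of the kernel, which this hunk
                               does not show */
    }
}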
Changed: x264-snapshot-20100517-2245.tar.bz2/common/x86/mc-c.c
@@ -44,11 +44,11 @@
 DECL_SUF( x264_pixel_avg_4x2,  ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))

 #define MC_WEIGHT(w,type) \
-    extern void x264_mc_weight_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int );
+    void x264_mc_weight_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int );

 #define MC_WEIGHT_OFFSET(w,type) \
-    extern void x264_mc_offsetadd_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
-    extern void x264_mc_offsetsub_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
+    void x264_mc_offsetadd_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
+    void x264_mc_offsetsub_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
     MC_WEIGHT(w,type)

 MC_WEIGHT_OFFSET( 4, mmxext )
@@ -68,51 +68,51 @@
 #undef MC_OFFSET
 #undef MC_WEIGHT

-extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
-extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
-extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
-extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
-extern void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int );
-extern void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int );
-extern void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
-extern void x264_prefetch_ref_mmxext( uint8_t *, int, int );
-extern void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
+void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int );
+void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
+void x264_prefetch_ref_mmxext( uint8_t *, int, int );
+void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
+                            uint8_t *dst, int i_dst_stride,
+                            int dx, int dy, int i_width, int i_height );
+void x264_mc_chroma_sse2( uint8_t *src, int i_src_stride,
+                          uint8_t *dst, int i_dst_stride,
+                          int dx, int dy, int i_width, int i_height );
+void x264_mc_chroma_ssse3( uint8_t *src, int i_src_stride,
+                           uint8_t *dst, int i_dst_stride,
+                           int dx, int dy, int i_width, int i_height );
+void x264_mc_chroma_ssse3_cache64( uint8_t *src, int i_src_stride,
                                    uint8_t *dst, int i_dst_stride,
                                    int dx, int dy, int i_width, int i_height );
-extern void x264_mc_chroma_sse2( uint8_t *src, int i_src_stride,
-                                 uint8_t *dst, int i_dst_stride,
-                                 int dx, int dy, int i_width, int i_height );
-extern void x264_mc_chroma_ssse3( uint8_t *src, int i_src_stride,
-                                  uint8_t *dst, int i_dst_stride,
-                                  int dx, int dy, int i_width, int i_height );
-extern void x264_mc_chroma_ssse3_cache64( uint8_t *src, int i_src_stride,
-                                          uint8_t *dst, int i_dst_stride,
-                                          int dx, int dy, int i_width, int i_height );
-extern void x264_plane_copy_core_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
-extern void x264_plane_copy_c( uint8_t *, int, uint8_t *, int, int w, int h);
-extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
-extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
-extern void x264_memzero_aligned_mmx( void * dst, int n );
-extern void x264_memzero_aligned_sse2( void * dst, int n );
-extern void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride );
-extern void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, int stride );
-extern void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride );
-extern void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride );
-extern void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
-extern void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
-extern void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
-extern void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-                                             uint16_t *inter_costs, uint16_t *inv_qscales, int len );
-#define LOWRES(cpu) \
-extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
-                                               int src_stride, int dst_stride, int width, int height );
+void x264_plane_copy_core_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
+void x264_plane_copy_c( uint8_t *, int, uint8_t *, int, int w, int h);
+void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
+void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
+void x264_memzero_aligned_mmx( void * dst, int n );
+void x264_memzero_aligned_sse2( void * dst, int n );
+void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride );
+void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, int stride );
+void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride );
+void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride );
+void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
+void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
+void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
+void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+                                      uint16_t *inter_costs, uint16_t *inv_qscales, int len );
+#define LOWRES(cpu)\
+void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
+                                        int src_stride, int dst_stride, int width, int height );
 LOWRES(mmxext)
 LOWRES(cache32_mmxext)
 LOWRES(sse2)
 LOWRES(ssse3)

 #define PIXEL_AVG_W(width,cpu)\
-extern void x264_pixel_avg2_w##width##_##cpu( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+void x264_pixel_avg2_w##width##_##cpu( uint8_t *, int, uint8_t *, int, uint8_t *, int );
 /* This declares some functions that don't exist, but that isn't a problem. */
 #define PIXEL_AVG_WALL(cpu)\
 PIXEL_AVG_W(4,cpu); PIXEL_AVG_W(8,cpu); PIXEL_AVG_W(12,cpu); PIXEL_AVG_W(16,cpu); PIXEL_AVG_W(20,cpu);
@@ -228,8 +228,8 @@
     }
 }

-static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
-static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
+static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};

 #define MC_LUMA(name,instr1,instr2)\
 static void mc_luma_##name( uint8_t *dst, int i_dst_stride,\
@@ -309,7 +309,6 @@
 void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\
 void x264_hpel_filter_c_##cpuc( uint8_t *dst, int16_t *buf, int width );\
 void x264_hpel_filter_h_##cpuh( uint8_t *dst, uint8_t *src, int width );\
-void x264_sfence( void );\
 static void x264_hpel_filter_##cpu( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,\
                                     int stride, int width, int height, int16_t *buf )\
 {\
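Two details in this hunk are worth spelling out. Dropping `extern` from the prototypes is purely cosmetic: a file-scope function declaration in C has external linkage by default, so nothing changes for callers. And narrowing hpel_ref0/hpel_ref1 from int to uint8_t shrinks each table from 64 to 16 bytes, since every entry fits in a byte. A minimal illustration of the first point:

#include <stdint.h>

/* These two declarations are exactly equivalent in C; repeating a compatible
   declaration of the same function is also legal. */
extern void x264_prefetch_ref_mmxext( uint8_t *, int, int );
void        x264_prefetch_ref_mmxext( uint8_t *, int, int );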
Changed: x264-snapshot-20100517-2245.tar.bz2/common/x86/pixel-32.asm
@@ -61,9 +61,9 @@
 %endmacro

 ;-----------------------------------------------------------------------------
-; int x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int )
+; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-cglobal x264_pixel_sa8d_8x8_internal_mmxext
+cglobal pixel_sa8d_8x8_internal_mmxext
     push r0
     push r2
     sub esp, 0x74
@@ -169,9 +169,9 @@
 %endmacro

 ;-----------------------------------------------------------------------------
-; void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *fenc, int16_t edges[2][8], int *res )
+; void intra_sa8d_x3_8x8_core( uint8_t *fenc, int16_t edges[2][8], int *res )
 ;-----------------------------------------------------------------------------
-cglobal x264_intra_sa8d_x3_8x8_core_mmxext
+cglobal intra_sa8d_x3_8x8_core_mmxext
     mov eax, [esp+4]
     mov ecx, [esp+8]
     sub esp, 0x70
@@ -329,10 +329,10 @@

 ;-----------------------------------------------------------------------------
-; void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
-;                                         const uint8_t *pix2, int stride2, int sums[2][4] )
+; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
+;                             const uint8_t *pix2, int stride2, int sums[2][4] )
 ;-----------------------------------------------------------------------------
-cglobal x264_pixel_ssim_4x4x2_core_mmxext
+cglobal pixel_ssim_4x4x2_core_mmxext
     push ebx
     push edi
     mov ebx, [esp+16]
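The renames across these asm files touch only the source-level labels: cglobal in x264's x86inc.asm mangles the name it is given, and (assuming the updated x86inc.asm shipped with this snapshot, which adds automatic prefixing) it prepends the x264_ prefix itself, so the exported symbols are unchanged. The C side therefore keeps declaring the prefixed names, e.g.:

#include <stdint.h>

/* Matches `cglobal pixel_ssim_4x4x2_core_mmxext` above: the assembled symbol
   is still x264_pixel_ssim_4x4x2_core_mmxext. */
void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
                                        const uint8_t *pix2, int stride2,
                                        int sums[2][4] );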
Changed: x264-snapshot-20100517-2245.tar.bz2/common/x86/pixel-a.asm
@@ -27,17 +27,14 @@
 %include "x86inc.asm"
 %include "x86util.asm"

-SECTION_RODATA
-pw_1:    times 8 dw 1
-pw_00ff: times 8 dw 0xff
-ssim_c1: times 4 dd 416    ; .01*.01*255*255*64
-ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
+SECTION_RODATA 32
 mask_ff: times 16 db 0xff
          times 16 db 0
+ssim_c1: times 4 dd 416    ; .01*.01*255*255*64
+ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
 mask_ac4:  dw 0, -1, -1, -1, 0, -1, -1, -1
 mask_ac4b: dw 0, -1, 0, -1, -1, -1, -1, -1
 mask_ac8:  dw 0, -1, -1, -1, -1, -1, -1, -1
-hsub_mul: times 8 db 1, -1
 hmul_4p:  times 2 db 1, 1, 1, 1, 1, -1, 1, -1
 hmul_8p:  times 8 db 1
           times 4 db 1, -1
@@ -46,6 +43,11 @@

 SECTION .text

+cextern pw_1
+cextern pw_00ff
+
+cextern hsub_mul
+
 %macro HADDD 2 ; sum junk
 %if mmsize == 16
     movhlps %2, %1
@@ -213,7 +215,7 @@
 %endmacro

 ;-----------------------------------------------------------------------------
-; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int )
+; int pixel_ssd_16x16( uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
 %macro SSD 3-4 0
 %if %1 != %2
 %assign function_align 8
 %else
 %assign function_align 16
 %endif
-cglobal x264_pixel_ssd_%1x%2_%3, 0,0,0
+cglobal pixel_ssd_%1x%2_%3, 0,0,0
     mov al, %1*%2/mmsize/2
 %if %1 != %2
@@ -365,21 +367,21 @@
 %endmacro

 ;-----------------------------------------------------------------------------
-; int x264_pixel_var_wxh_mmxext( uint8_t *, int )
+; int pixel_var_wxh( uint8_t *, int )
 ;-----------------------------------------------------------------------------
 INIT_MMX
-cglobal x264_pixel_var_16x16_mmxext, 2,3
+cglobal pixel_var_16x16_mmxext, 2,3
     VAR_START 0
     VAR_2ROW 8, 16
     VAR_END

-cglobal x264_pixel_var_8x8_mmxext, 2,3
+cglobal pixel_var_8x8_mmxext, 2,3
     VAR_START 0
     VAR_2ROW r1, 4
     VAR_END

 INIT_XMM
-cglobal x264_pixel_var_16x16_sse2, 2,3,8
+cglobal pixel_var_16x16_sse2, 2,3,8
     VAR_START 1
     mov r2d, 8
 .loop:
@@ -392,7 +394,7 @@
     jg .loop
     VAR_END

-cglobal x264_pixel_var_8x8_sse2, 2,4,8
+cglobal pixel_var_8x8_sse2, 2,4,8
     VAR_START 1
     mov r2d, 2
     lea r3, [r1*3]
@@ -421,11 +423,11 @@
 %endmacro

 ;-----------------------------------------------------------------------------
-; int x264_pixel_var2_8x8_mmxext( uint8_t *, int, uint8_t *, int, int * )
+; int pixel_var2_8x8( uint8_t *, int, uint8_t *, int, int * )
 ;-----------------------------------------------------------------------------
 %ifndef ARCH_X86_64
 INIT_MMX
-cglobal x264_pixel_var2_8x8_mmxext, 5,6
+cglobal pixel_var2_8x8_mmxext, 5,6
     VAR_START 0
     mov r5d, 8
 .loop:
@@ -455,7 +457,7 @@
 %endif

 INIT_XMM
-cglobal x264_pixel_var2_8x8_sse2, 5,6,8
+cglobal pixel_var2_8x8_sse2, 5,6,8
     VAR_START 1
     mov r5d, 4
 .loop:
@@ -479,7 +481,7 @@
     VAR2_END
     RET

-cglobal x264_pixel_var2_8x8_ssse3, 5,6,8
+cglobal pixel_var2_8x8_ssse3, 5,6,8
     pxor m5, m5 ; sum
     pxor m6, m6 ; sum squared
     mova m7, [hsub_mul]
@@ -692,10 +694,10 @@
 ; for small blocks on x86_32, modify pixel pointer instead.
 ;-----------------------------------------------------------------------------
-; int x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
+; int pixel_satd_16x16( uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
 INIT_MMX
-cglobal x264_pixel_satd_16x4_internal_mmxext
+cglobal pixel_satd_16x4_internal_mmxext
     SATD_4x4_MMX m2, 0, 0
     SATD_4x4_MMX m1, 4, 0
     paddw m0, m2
@@ -706,69 +708,69 @@
     paddw m0, m1
     ret

-cglobal x264_pixel_satd_8x8_internal_mmxext
+cglobal pixel_satd_8x8_internal_mmxext
     SATD_4x4_MMX m2, 0, 0
     SATD_4x4_MMX m1, 4, 1
     paddw m0, m2
     paddw m0, m1
-x264_pixel_satd_8x4_internal_mmxext:
+pixel_satd_8x4_internal_mmxext:
     SATD_4x4_MMX m2, 0, 0
     SATD_4x4_MMX m1, 4, 0
     paddw m0, m2
     paddw m0, m1
     ret

-cglobal x264_pixel_satd_16x16_mmxext, 4,6
+cglobal pixel_satd_16x16_mmxext, 4,6
     SATD_START_MMX
     pxor m0, m0
 %rep 3
-    call x264_pixel_satd_16x4_internal_mmxext
+    call pixel_satd_16x4_internal_mmxext
     lea r0, [r0+4*r1]
     lea r2, [r2+4*r3]
 %endrep
-    call x264_pixel_satd_16x4_internal_mmxext
+    call pixel_satd_16x4_internal_mmxext
     HADDUW m0, m1
     movd eax, m0
     RET

-cglobal x264_pixel_satd_16x8_mmxext, 4,6
+cglobal pixel_satd_16x8_mmxext, 4,6
     SATD_START_MMX
     pxor m0, m0
-    call x264_pixel_satd_16x4_internal_mmxext
+    call pixel_satd_16x4_internal_mmxext
     lea r0, [r0+4*r1]
     lea r2, [r2+4*r3]
-    call x264_pixel_satd_16x4_internal_mmxext
+    call pixel_satd_16x4_internal_mmxext
     SATD_END_MMX

-cglobal x264_pixel_satd_8x16_mmxext, 4,6
+cglobal pixel_satd_8x16_mmxext, 4,6
     SATD_START_MMX
     pxor m0, m0
-    call x264_pixel_satd_8x8_internal_mmxext
+    call pixel_satd_8x8_internal_mmxext
     lea r0, [r0+4*r1]
     lea r2, [r2+4*r3]
-    call x264_pixel_satd_8x8_internal_mmxext
+    call pixel_satd_8x8_internal_mmxext
     SATD_END_MMX

-cglobal x264_pixel_satd_8x8_mmxext, 4,6
+cglobal pixel_satd_8x8_mmxext, 4,6
     SATD_START_MMX
     pxor m0, m0
-    call x264_pixel_satd_8x8_internal_mmxext
+    call pixel_satd_8x8_internal_mmxext
     SATD_END_MMX

-cglobal x264_pixel_satd_8x4_mmxext, 4,6
+cglobal pixel_satd_8x4_mmxext, 4,6
     SATD_START_MMX
     pxor m0, m0
-    call x264_pixel_satd_8x4_internal_mmxext
+    call pixel_satd_8x4_internal_mmxext
     SATD_END_MMX

-cglobal x264_pixel_satd_4x8_mmxext, 4,6
+cglobal pixel_satd_4x8_mmxext, 4,6
     SATD_START_MMX
     SATD_4x4_MMX m0, 0, 1
     SATD_4x4_MMX m1, 0, 0
     paddw m0, m1
     SATD_END_MMX

-cglobal x264_pixel_satd_4x4_mmxext, 4,6
+cglobal pixel_satd_4x4_mmxext, 4,6
     SATD_START_MMX
     SATD_4x4_MMX m0, 0, 0
     SATD_END_MMX
@@ -808,12 +810,12 @@
 %endmacro

 ;-----------------------------------------------------------------------------
-; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
+; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
 %macro SATDS_SSE2 1
 INIT_XMM
 %ifnidn %1, sse2
-cglobal x264_pixel_satd_4x4_%1, 4, 6, 6
+cglobal pixel_satd_4x4_%1, 4, 6, 6
     SATD_START_MMX
     mova m4, [hmul_4p]
     LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
@@ -829,7 +831,7 @@
     RET
 %endif

-cglobal x264_pixel_satd_4x8_%1, 4, 6, 8
+cglobal pixel_satd_4x8_%1, 4, 6, 8
     SATD_START_MMX
 %ifnidn %1, sse2
     mova m7, [hmul_4p]
@@ -869,16 +871,16 @@
     movd eax, m6
     RET

-cglobal x264_pixel_satd_8x8_internal_%1
+cglobal pixel_satd_8x8_internal_%1
     LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
     SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6
-x264_pixel_satd_8x4_internal_%1:
+pixel_satd_8x4_internal_%1:
     LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
     SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6
     ret

 %ifdef UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
-cglobal x264_pixel_satd_16x4_internal_%1
+cglobal pixel_satd_16x4_internal_%1
     LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
     lea r2, [r2+4*r3]
     lea r0, [r0+4*r1]
@@ -886,67 +888,67 @@
     SATD_8x4_SSE ssse3, 4, 8, 5, 9, 6, 3, 10
     ret

-cglobal x264_pixel_satd_16x8_%1, 4,6,12
+cglobal pixel_satd_16x8_%1, 4,6,12
     SATD_START_SSE2 %1, m10, m7
 %ifidn %1, sse2
     mova m7, [pw_00ff]
 %endif
-    jmp x264_pixel_satd_16x8_internal_%1
+    jmp pixel_satd_16x8_internal_%1

-cglobal x264_pixel_satd_16x16_%1, 4,6,12
+cglobal pixel_satd_16x16_%1, 4,6,12
     SATD_START_SSE2 %1, m10, m7
 %ifidn %1, sse2
     mova m7, [pw_00ff]
 %endif
-    call x264_pixel_satd_16x4_internal_%1
-    call x264_pixel_satd_16x4_internal_%1
-x264_pixel_satd_16x8_internal_%1:
-    call x264_pixel_satd_16x4_internal_%1
-    call x264_pixel_satd_16x4_internal_%1
+    call pixel_satd_16x4_internal_%1
+    call pixel_satd_16x4_internal_%1
+pixel_satd_16x8_internal_%1:
+    call pixel_satd_16x4_internal_%1
+    call pixel_satd_16x4_internal_%1
     SATD_END_SSE2 %1, m10
 %else
-cglobal x264_pixel_satd_16x8_%1, 4,6,8
+cglobal pixel_satd_16x8_%1, 4,6,8
     SATD_START_SSE2 %1, m6, m7
     BACKUP_POINTERS
-    call x264_pixel_satd_8x8_internal_%1
+    call pixel_satd_8x8_internal_%1
     RESTORE_AND_INC_POINTERS
-    call x264_pixel_satd_8x8_internal_%1
+    call pixel_satd_8x8_internal_%1
     SATD_END_SSE2 %1, m6

-cglobal x264_pixel_satd_16x16_%1, 4,6,8
+cglobal pixel_satd_16x16_%1, 4,6,8
     SATD_START_SSE2 %1, m6, m7
     BACKUP_POINTERS
-    call x264_pixel_satd_8x8_internal_%1
-    call x264_pixel_satd_8x8_internal_%1
+    call pixel_satd_8x8_internal_%1
+    call pixel_satd_8x8_internal_%1
     RESTORE_AND_INC_POINTERS
-    call x264_pixel_satd_8x8_internal_%1
-    call x264_pixel_satd_8x8_internal_%1
+    call pixel_satd_8x8_internal_%1
+    call pixel_satd_8x8_internal_%1
     SATD_END_SSE2 %1, m6
 %endif

-cglobal x264_pixel_satd_8x16_%1, 4,6,8
+cglobal pixel_satd_8x16_%1, 4,6,8
     SATD_START_SSE2 %1, m6, m7
-    call x264_pixel_satd_8x8_internal_%1
-    call x264_pixel_satd_8x8_internal_%1
+    call pixel_satd_8x8_internal_%1
+    call pixel_satd_8x8_internal_%1
     SATD_END_SSE2 %1, m6

-cglobal x264_pixel_satd_8x8_%1, 4,6,8
+cglobal pixel_satd_8x8_%1, 4,6,8
     SATD_START_SSE2 %1, m6, m7
-    call x264_pixel_satd_8x8_internal_%1
+    call pixel_satd_8x8_internal_%1
     SATD_END_SSE2 %1, m6

-cglobal x264_pixel_satd_8x4_%1, 4,6,8
+cglobal pixel_satd_8x4_%1, 4,6,8
     SATD_START_SSE2 %1, m6, m7
-    call x264_pixel_satd_8x4_internal_%1
+    call pixel_satd_8x4_internal_%1
     SATD_END_SSE2 %1, m6
 %endmacro ; SATDS_SSE2

 %macro SA8D 1
 %ifdef ARCH_X86_64
 ;-----------------------------------------------------------------------------
-; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
+; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-cglobal x264_pixel_sa8d_8x8_internal_%1
+cglobal pixel_sa8d_8x8_internal_%1
     lea r10, [r0+4*r1]
     lea r11, [r2+4*r3]
     LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
@@ -970,41 +972,41 @@
     paddw m0, m1
     paddw m0, m2
     paddw m0, m8
-    SAVE_MM_PERMUTATION x264_pixel_sa8d_8x8_internal_%1
+    SAVE_MM_PERMUTATION pixel_sa8d_8x8_internal_%1
     ret

-cglobal x264_pixel_sa8d_8x8_%1, 4,6,12
+cglobal pixel_sa8d_8x8_%1, 4,6,12
     lea r4, [3*r1]
     lea r5, [3*r3]
 %ifnidn %1, sse2
     mova m7, [hmul_8p]
 %endif
-    call x264_pixel_sa8d_8x8_internal_%1
+    call pixel_sa8d_8x8_internal_%1
     HADDW m0, m1
     movd eax, m0
     add eax, 1
     shr eax, 1
     RET

-cglobal x264_pixel_sa8d_16x16_%1, 4,6,12
+cglobal pixel_sa8d_16x16_%1, 4,6,12
     lea r4, [3*r1]
     lea r5, [3*r3]
 %ifnidn %1, sse2
     mova m7, [hmul_8p]
 %endif
-    call x264_pixel_sa8d_8x8_internal_%1 ; pix[0]
+    call pixel_sa8d_8x8_internal_%1 ; pix[0]
     add r2, 8
     add r0, 8
     mova m10, m0
-    call x264_pixel_sa8d_8x8_internal_%1 ; pix[8]
+    call pixel_sa8d_8x8_internal_%1 ; pix[8]
     lea r2, [r2+8*r3]
     lea r0, [r0+8*r1]
     paddusw m10, m0
-    call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride+8]
+    call pixel_sa8d_8x8_internal_%1 ; pix[8*stride+8]
     sub r2, 8
     sub r0, 8
     paddusw m10, m0
-    call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride]
+    call pixel_sa8d_8x8_internal_%1 ; pix[8*stride]
     paddusw m0, m10
     HADDUW m0, m1
     movd eax, m0
@@ -1014,7 +1016,7 @@
 %else ; ARCH_X86_32
 %ifnidn %1, mmxext
-cglobal x264_pixel_sa8d_8x8_internal_%1
+cglobal pixel_sa8d_8x8_internal_%1
     %define spill0 [esp+4]
     %define spill1 [esp+20]
     %define spill2 [esp+36]
@@ -1064,13 +1066,13 @@
     ret
 %endif ; ifndef mmxext

-cglobal x264_pixel_sa8d_8x8_%1, 4,7
+cglobal pixel_sa8d_8x8_%1, 4,7
     mov r6, esp
     and esp, ~15
     sub esp, 48
     lea r4, [3*r1]
     lea r5, [3*r3]
-    call x264_pixel_sa8d_8x8_internal_%1
+    call pixel_sa8d_8x8_internal_%1
     HADDW m0, m1
     movd eax, m0
     add eax, 1
@@ -1078,26 +1080,26 @@
     mov esp, r6
     RET

-cglobal x264_pixel_sa8d_16x16_%1, 4,7
+cglobal pixel_sa8d_16x16_%1, 4,7
     mov r6, esp
     and esp, ~15
     sub esp, 64
     lea r4, [3*r1]
     lea r5, [3*r3]
-    call x264_pixel_sa8d_8x8_internal_%1
+    call pixel_sa8d_8x8_internal_%1
 %ifidn %1, mmxext
     lea r0, [r0+4*r1]
     lea r2, [r2+4*r3]
 %endif
     mova [esp+48], m0
-    call x264_pixel_sa8d_8x8_internal_%1
+    call pixel_sa8d_8x8_internal_%1
     mov r0, [r6+20]
     mov r2, [r6+28]
     add r0, 8
     add r2, 8
     paddusw m0, [esp+48]
     mova [esp+48], m0
-    call x264_pixel_sa8d_8x8_internal_%1
+    call pixel_sa8d_8x8_internal_%1
 %ifidn %1, mmxext
     lea r0, [r0+4*r1]
     lea r2, [r2+4*r3]
@@ -1106,7 +1108,7 @@
     paddusw m0, [esp+48]
 %endif
     mova [esp+64-mmsize], m0
-    call x264_pixel_sa8d_8x8_internal_%1
+    call pixel_sa8d_8x8_internal_%1
     paddusw m0, [esp+64-mmsize]
 %if mmsize == 16
     HADDUW m0, m1
@@ -1140,9 +1142,9 @@
 %ifdef ARCH_X86_64
 INIT_XMM
 ;-----------------------------------------------------------------------------
-; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
+; void intra_sa8d_x3_8x8_core( uint8_t *fenc, int16_t edges[2][8], int *res )
 ;-----------------------------------------------------------------------------
-cglobal x264_intra_sa8d_x3_8x8_core_%1, 3,3,16
+cglobal intra_sa8d_x3_8x8_core_%1, 3,3,16
     ; 8x8 hadamard
     pxor m8, m8
     movq m0, [r0+0*FENC_STRIDE]
@@ -1247,7 +1249,7 @@
 ; in: r0 = fenc
 ; out: m0..m3 = hadamard coefs
 INIT_MMX
-cglobal x264_hadamard_load
+cglobal hadamard_load
 ; not really a global, but otherwise cycles get attributed to the wrong function in profiling
     pxor m7, m7
     movd m0, [r0+0*FENC_STRIDE]
@@ -1259,7 +1261,7 @@
     punpcklbw m2, m7
     punpcklbw m3, m7
     HADAMARD4_2D 0, 1, 2, 3, 4
-    SAVE_MM_PERMUTATION x264_hadamard_load
+    SAVE_MM_PERMUTATION hadamard_load
     ret

 %macro SCALAR_SUMSUB 4
@@ -1377,9 +1379,9 @@
 %macro INTRA_SATDS_MMX 1
 INIT_MMX
 ;-----------------------------------------------------------------------------
-; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
+; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res )
 ;-----------------------------------------------------------------------------
-cglobal x264_intra_satd_x3_4x4_%1, 2,6
+cglobal intra_satd_x3_4x4_%1, 2,6
 %ifdef ARCH_X86_64
     ; stack is 16 byte aligned because abi says so
     %define top_1d rsp-8 ; size 8
@@ -1393,7 +1395,7 @@
     %define t0 r2
 %endif

-    call x264_hadamard_load
+    call hadamard_load
     SCALAR_HADAMARD_LEFT 0, r0, r3, r4, r5
     mov t0d, r0d
     SCALAR_HADAMARD_TOP 0, r0, r3, r4, r5
@@ -1430,9 +1432,9 @@
 %endif

 ;-----------------------------------------------------------------------------
-; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
+; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res )
 ;-----------------------------------------------------------------------------
-cglobal x264_intra_satd_x3_16x16_%1, 0,7
+cglobal intra_satd_x3_16x16_%1, 0,7
 %ifdef ARCH_X86_64
     %assign stack_pad 88
 %else
@@ -1466,7 +1468,7 @@
 .loop_y:
     xor r4d, r4d
 .loop_x:
-    call x264_hadamard_load
+    call hadamard_load

     SUM3x4 %1
     SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4]
@@ -1507,9 +1509,9 @@
     RET

 ;-----------------------------------------------------------------------------
-; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
+; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res )
 ;-----------------------------------------------------------------------------
-cglobal x264_intra_satd_x3_8x8c_%1, 0,6
+cglobal intra_satd_x3_8x8c_%1, 0,6
     ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
     SUB rsp, 72
 %define sums rsp+48 ; size 24
@@ -1555,7 +1557,7 @@
 .loop_y:
     xor r4d, r4d
 .loop_x:
-    call x264_hadamard_load
+    call hadamard_load

     SUM3x4 %1
     SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4]
@@ -1609,7 +1611,7 @@
 ; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
 ; out: [tmp]=hadamard4, m0=satd
-cglobal x264_hadamard_ac_4x4_mmxext
+cglobal hadamard_ac_4x4_mmxext
     movh m0, [r0]
     movh m1, [r0+r1]
     movh m2, [r0+r1*2]
@@ -1631,10 +1633,10 @@
     paddw m0, m1
     paddw m2, m3
     paddw m0, m2
-    SAVE_MM_PERMUTATION x264_hadamard_ac_4x4_mmxext
+    SAVE_MM_PERMUTATION hadamard_ac_4x4_mmxext
     ret

-cglobal x264_hadamard_ac_2x2max_mmxext
+cglobal hadamard_ac_2x2max_mmxext
     mova m0, [r3+0x00]
     mova m1, [r3+0x20]
     mova m2, [r3+0x40]
@@ -1646,30 +1648,30 @@
     HADAMARD 0, max, 1, 3, 4, 5
     paddw m7, m0
     paddw m7, m1
-    SAVE_MM_PERMUTATION x264_hadamard_ac_2x2max_mmxext
+    SAVE_MM_PERMUTATION hadamard_ac_2x2max_mmxext
     ret

-cglobal x264_hadamard_ac_8x8_mmxext
+cglobal hadamard_ac_8x8_mmxext
     mova m6, [mask_ac4]
     pxor m7, m7
-    call x264_hadamard_ac_4x4_mmxext
+    call hadamard_ac_4x4_mmxext
     add r0, 4
     add r3, 32
     mova m5, m0
-    call x264_hadamard_ac_4x4_mmxext
+    call hadamard_ac_4x4_mmxext
     lea r0, [r0+4*r1]
     add r3, 64
     paddw m5, m0
-    call x264_hadamard_ac_4x4_mmxext
+    call hadamard_ac_4x4_mmxext
     sub r0, 4
     sub r3, 32
     paddw m5, m0
-    call x264_hadamard_ac_4x4_mmxext
+    call hadamard_ac_4x4_mmxext
     paddw m5, m0
     sub r3, 40
     mova [rsp+gprsize+8], m5 ; save satd
 %rep 3
-    call x264_hadamard_ac_2x2max_mmxext
+    call hadamard_ac_2x2max_mmxext
 %endrep
     mova m0, [r3+0x00]
     mova m1, [r3+0x20]
@@ -1686,33 +1688,33 @@
     paddw m6, m7
     mova [rsp+gprsize], m6 ; save sa8d
     SWAP m0, m6
-    SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_mmxext
+    SAVE_MM_PERMUTATION hadamard_ac_8x8_mmxext
     ret

 %macro HADAMARD_AC_WXH_MMX 2
-cglobal x264_pixel_hadamard_ac_%1x%2_mmxext, 2,4
+cglobal pixel_hadamard_ac_%1x%2_mmxext, 2,4
     %assign pad 16-gprsize-(stack_offset&15)
     %define ysub r1
     sub rsp, 16+128+pad
     lea r2, [r1*3]
     lea r3, [rsp+16]
-    call x264_hadamard_ac_8x8_mmxext
+    call hadamard_ac_8x8_mmxext
 %if %2==16
     %define ysub r2
     lea r0, [r0+r1*4]
     sub rsp, 16
-    call x264_hadamard_ac_8x8_mmxext
+    call hadamard_ac_8x8_mmxext
 %endif
 %if %1==16
     neg ysub
     sub rsp, 16
     lea r0, [r0+ysub*4+8]
     neg ysub
-    call x264_hadamard_ac_8x8_mmxext
+    call hadamard_ac_8x8_mmxext
 %if %2==16
     lea r0, [r0+r1*4]
     sub rsp, 16
-    call x264_hadamard_ac_8x8_mmxext
+    call hadamard_ac_8x8_mmxext
 %endif
 %endif
     mova m1, [rsp+0x08]
@@ -1779,7 +1781,7 @@
 INIT_XMM
 ; in: r0=pix, r1=stride, r2=stride*3
 ; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
-cglobal x264_hadamard_ac_8x8_%1
+cglobal hadamard_ac_8x8_%1
 %ifdef ARCH_X86_64
     %define spill0 m8
     %define spill1 m9
@@ -1883,7 +1885,7 @@
     paddw m2, m4
     paddw m0, m2
     mova [rsp+gprsize+16], m0 ; save sa8d
-    SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_%1
+    SAVE_MM_PERMUTATION hadamard_ac_8x8_%1
     ret

 HADAMARD_AC_WXH_SSE2 16, 16, %1
@@ -1892,30 +1894,30 @@
 HADAMARD_AC_WXH_SSE2 8, 8, %1
 %endmacro ; HADAMARD_AC_SSE2

-; struct { int satd, int sa8d; } x264_pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
+; struct { int satd, int sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
 %macro HADAMARD_AC_WXH_SSE2 3
-cglobal x264_pixel_hadamard_ac_%1x%2_%3, 2,3,11
+cglobal pixel_hadamard_ac_%1x%2_%3, 2,3,11
     %assign pad 16-gprsize-(stack_offset&15)
     %define ysub r1
     sub rsp, 48+pad
     lea r2, [r1*3]
-    call x264_hadamard_ac_8x8_%3
+    call hadamard_ac_8x8_%3
 %if %2==16
     %define ysub r2
     lea r0, [r0+r1*4]
     sub rsp, 32
-    call x264_hadamard_ac_8x8_%3
+    call hadamard_ac_8x8_%3
 %endif
 %if %1==16
     neg ysub
     sub rsp, 32
     lea r0, [r0+ysub*4+8]
     neg ysub
-    call x264_hadamard_ac_8x8_%3
+    call hadamard_ac_8x8_%3
 %if %2==16
     lea r0, [r0+r1*4]
     sub rsp, 32
-    call x264_hadamard_ac_8x8_%3
+    call hadamard_ac_8x8_%3
 %endif
 %endif
     mova m1, [rsp+0x20]
@@ -1947,7 +1949,7 @@
 ; instantiate satds

 %ifndef ARCH_X86_64
-cextern x264_pixel_sa8d_8x8_internal_mmxext
+cextern pixel_sa8d_8x8_internal_mmxext
 SA8D mmxext
 %endif
@@ -1999,8 +2001,8 @@
 ;=============================================================================

 ;-----------------------------------------------------------------------------
-; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
-;                                       const uint8_t *pix2, int stride2, int sums[2][4] )
+; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
+;                             const uint8_t *pix2, int stride2, int sums[2][4] )
 ;-----------------------------------------------------------------------------

 %macro SSIM_ITER 1
@@ -2033,7 +2035,7 @@
     paddd m3, m6
 %endmacro

-cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4,8
+cglobal pixel_ssim_4x4x2_core_sse2, 4,4,8
     pxor m0, m0
     SSIM_ITER 0
     SSIM_ITER 1
@@ -2069,9 +2071,9 @@
     RET

 ;-----------------------------------------------------------------------------
-; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width )
+; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width )
 ;-----------------------------------------------------------------------------
-cglobal x264_pixel_ssim_end4_sse2, 3,3,7
+cglobal pixel_ssim_end4_sse2, 3,3,7
     movdqa m0, [r0+ 0]
     movdqa m1, [r0+16]
     movdqa m2, [r0+32]
@@ -2175,10 +2177,10 @@
 %define ABS1 ABS1_MMX

 ;-----------------------------------------------------------------------------
-; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
-;                             uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
+; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
+;                 uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
 ;-----------------------------------------------------------------------------
-cglobal x264_pixel_ads4_mmxext, 4,7
+cglobal pixel_ads4_mmxext, 4,7
     movq mm6, [r0]
     movq mm4, [r0+8]
     pshufw mm7, mm6, 0
@@ -2215,7 +2217,7 @@
     movd [t0], mm1
     ADS_END 1

-cglobal x264_pixel_ads2_mmxext, 4,7
+cglobal pixel_ads2_mmxext, 4,7
     movq mm6, [r0]
     pshufw mm5, r6m, 0
     pshufw mm7, mm6, 0
@@ -2236,7 +2238,7 @@
     movd [t0], mm4
     ADS_END 1

-cglobal x264_pixel_ads1_mmxext, 4,7
+cglobal pixel_ads1_mmxext, 4,7
     pshufw mm7, [r0], 0
     pshufw mm6, r6m, 0
     ADS_START 2
@@ -2258,7 +2260,7 @@
     ADS_END 2

 %macro ADS_SSE2 1
-cglobal x264_pixel_ads4_%1, 4,7,12
+cglobal pixel_ads4_%1, 4,7,12
     movdqa xmm4, [r0]
     pshuflw xmm7, xmm4, 0
     pshuflw xmm6, xmm4, 0xAA
@@ -2327,7 +2329,7 @@
 %endif ; ARCH
     ADS_END 2

-cglobal x264_pixel_ads2_%1, 4,7,8
+cglobal pixel_ads2_%1, 4,7,8
     movq xmm6, [r0]
     movd xmm5, r6m
     pshuflw xmm7, xmm6, 0
@@ -2353,7 +2355,7 @@
     movq [t0], xmm1
     ADS_END 2

-cglobal x264_pixel_ads1_%1, 4,7,8
+cglobal pixel_ads1_%1, 4,7,8
     movd xmm7, [r0]
     movd xmm6, r6m
     pshuflw xmm7, xmm7, 0
@@ -2385,7 +2387,7 @@
 %define ABS1 ABS1_SSSE3
 ADS_SSE2 ssse3

-; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
+; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
 ; {
 ;     int nmv=0, i, j;
 ;     *(uint32_t*)(masks+width) = 0;
@@ -2399,7 +2401,7 @@
 ;     }
 ;     return nmv;
 ; }
-cglobal x264_pixel_ads_mvs, 0,7,0
+cglobal pixel_ads_mvs, 0,7,0
 ads_mvs:
 %ifdef ARCH_X86_64
     ; mvs = r4
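Beyond the renames, the structure of the hadamard_ac wrappers in this file is easiest to see in C: pixel_hadamard_ac_16x16 calls the 8x8 kernel once per 8x8 quadrant and accumulates the two partial sums the kernel leaves on the stack. A simplified model with hypothetical names (the real code also post-processes the sa8d sum, which is omitted here):

#include <stdint.h>

typedef struct { int satd, sa8d; } hadamard_ac_t;

/* stand-in for the asm hadamard_ac_8x8 kernel */
typedef hadamard_ac_t (*hadamard_ac_8x8_fn)( const uint8_t *pix, int stride );

static hadamard_ac_t hadamard_ac_16x16_sketch( const uint8_t *pix, int stride,
                                               hadamard_ac_8x8_fn ac_8x8 )
{
    hadamard_ac_t sum = { 0, 0 };
    for( int y = 0; y < 16; y += 8 )         /* the %if %2==16 blocks */
        for( int x = 0; x < 16; x += 8 )     /* the %if %1==16 blocks */
        {
            hadamard_ac_t part = ac_8x8( pix + y*stride + x, stride );
            sum.satd += part.satd;           /* accumulated from the stack slots */
            sum.sa8d += part.sa8d;
        }
    return sum;
}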
Changed: x264-snapshot-20100517-2245.tar.bz2/common/x86/predict-a.asm
@@ -25,6 +25,24 @@
 %include "x86inc.asm"
 %include "x86util.asm"

+SECTION_RODATA
+
+pw_76543210:
+pw_3210:    dw 0, 1, 2, 3, 4, 5, 6, 7
+pb_00s_ff:  times 8 db 0
+pb_0s_ff:   times 7 db 0
+            db 0xff
+
+SECTION .text
+
+cextern pb_1
+cextern pb_3
+cextern pw_2
+cextern pw_4
+cextern pw_8
+cextern pw_ff00
+cextern pb_reverse
+
 %macro STORE8x8 2
     add r0, 4*FDEC_STRIDE
     movq [r0 + -4*FDEC_STRIDE], %1
@@ -74,24 +92,6 @@
     movdqa [r0 + 3*FDEC_STRIDE], %1
 %endmacro

-SECTION_RODATA
-
-ALIGN 16
-pb_1:       times 16 db 1
-pb_3:       times 16 db 3
-pw_2:       times 4 dw 2
-pw_4:       times 4 dw 4
-pw_8:       times 8 dw 8
-pw_76543210:
-pw_3210:    dw 0, 1, 2, 3, 4, 5, 6, 7
-pb_00s_ff:  times 8 db 0
-pb_0s_ff:   times 7 db 0
-            db 0xff
-pw_ff00:    times 8 dw 0xff00
-pb_reverse: db 7, 6, 5, 4, 3, 2, 1, 0
-
-SECTION .text
-
 ; dest, left, right, src, tmp
 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
 %macro PRED8x8_LOWPASS0 6
@@ -126,7 +126,7 @@
 %endmacro

 ;-----------------------------------------------------------------------------
-; void predict_4x4_ddl_mmxext( uint8_t *src )
+; void predict_4x4_ddl( uint8_t *src )
 ;-----------------------------------------------------------------------------
 cglobal predict_4x4_ddl_mmxext, 1,1
     movq mm1, [r0-FDEC_STRIDE]
@@ -149,7 +149,7 @@
     RET

 ;-----------------------------------------------------------------------------
-; void predict_4x4_ddr_mmxext( uint8_t *src )
+; void predict_4x4_ddr( uint8_t *src )
 ;-----------------------------------------------------------------------------
 %macro PREDICT_4x4 1
 cglobal predict_4x4_ddr_%1, 1,1
@@ -233,7 +233,7 @@
 PREDICT_4x4 ssse3

 ;-----------------------------------------------------------------------------
-; void predict_4x4_hu_mmxext( uint8_t *src )
+; void predict_4x4_hu( uint8_t *src )
 ;-----------------------------------------------------------------------------
 cglobal predict_4x4_hu_mmxext, 1,1
     movq mm0, [r0+0*FDEC_STRIDE-8]
@@ -264,7 +264,7 @@
     RET

 ;-----------------------------------------------------------------------------
-; void predict_4x4_vl_mmxext( uint8_t *src )
+; void predict_4x4_vl( uint8_t *src )
 ;-----------------------------------------------------------------------------
 cglobal predict_4x4_vl_mmxext, 1,1
     movq mm1, [r0-FDEC_STRIDE]
@@ -426,7 +426,7 @@
 PREDICT_FILTER ssse3

 ;-----------------------------------------------------------------------------
-; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_v( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------
 cglobal predict_8x8_v_mmxext, 2,2
     movq mm0, [r1+16]
@@ -434,7 +434,7 @@
     RET

 ;-----------------------------------------------------------------------------
-; void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] )
+; void predict_8x8_h( uint8_t *src, uint8_t edge[33] )
 ;-----------------------------------------------------------------------------
 INIT_MMX
@@ -459,7 +459,7 @@
     RET

 ;-----------------------------------------------------------------------------
-; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge );
+; void predict_8x8_dc( uint8_t *src, uint8_t *edge );
 ;-----------------------------------------------------------------------------
 cglobal predict_8x8_dc_mmxext, 2,2
     pxor mm0, mm0
@@ -475,7 +475,7 @@
     RET

 ;-----------------------------------------------------------------------------
-; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge );
+; void predict_8x8_dc_top( uint8_t *src, uint8_t *edge );
 ;-----------------------------------------------------------------------------
 %macro PRED8x8_DC 2
 cglobal %1, 2,2
@@ -497,7 +497,7 @@
 ; functions if we know sse2 is available.
 ;-----------------------------------------------------------------------------
-; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------
 cglobal predict_8x8_ddl_mmxext, 2,2
     movq mm5, [r1+16]
@@ -506,10 +506,10 @@
     movq mm4, [r1+25]
     movq mm1, mm5
     psllq mm1, 8
+    add r0, FDEC_STRIDE*4
     PRED8x8_LOWPASS mm0, mm1, mm2, mm5, mm7
     PRED8x8_LOWPASS mm1, mm3, mm4, [r1+24], mm6
-
-%assign Y 7
+%assign Y 3
 %rep 6
     movq [r0+Y*FDEC_STRIDE], mm1
     movq mm2, mm0
@@ -528,17 +528,17 @@
     RET

 ;-----------------------------------------------------------------------------
-; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------
 cglobal predict_8x8_ddr_mmxext, 2,2
     movq mm1, [r1+7]
     movq mm2, [r1+9]
     movq mm3, [r1+15]
     movq mm4, [r1+17]
+    add r0, FDEC_STRIDE*4
     PRED8x8_LOWPASS mm0, mm1, mm2, [r1+8], mm7
     PRED8x8_LOWPASS mm1, mm3, mm4, [r1+16], mm6
-
-%assign Y 7
+%assign Y 3
 %rep 6
     movq [r0+Y*FDEC_STRIDE], mm0
     movq mm2, mm1
@@ -557,7 +557,7 @@
     RET

 ;-----------------------------------------------------------------------------
-; void predict_8x8_hu_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------
 %define PALIGNR PALIGNR_MMX
 cglobal predict_8x8_hu_mmxext, 2,2
@@ -602,7 +602,7 @@
     RET

 ;-----------------------------------------------------------------------------
-; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_vr_core( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------

 ; fills only some pixels:
@@ -622,9 +622,10 @@
     movq mm1, [r1+14]
     movq mm4, mm3
     pavgb mm3, mm2
+    add r0, FDEC_STRIDE*4
     PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7

-%assign Y 0
+%assign Y -4
 %rep 3
     movq [r0+ Y *FDEC_STRIDE], mm3
     movq [r0+(Y+1)*FDEC_STRIDE], mm0
@@ -638,7 +639,7 @@
     RET

 ;-----------------------------------------------------------------------------
-; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
+; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
 ;-----------------------------------------------------------------------------
 cglobal predict_8x8c_p_core_mmxext, 1,2
     LOAD_PLANE_ARGS
@@ -666,7 +667,7 @@
     REP_RET

 ;-----------------------------------------------------------------------------
-; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
+; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
 ;-----------------------------------------------------------------------------
 cglobal predict_16x16_p_core_mmxext, 1,2
     LOAD_PLANE_ARGS
@@ -710,16 +711,17 @@
 %endif ; !ARCH_X86_64

 ;-----------------------------------------------------------------------------
-; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge )
+; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------
 cglobal predict_8x8_ddl_sse2, 2,2
     movdqa xmm3, [r1+16]
     movdqu xmm2, [r1+17]
     movdqa xmm1, xmm3
     pslldq xmm1, 1
+    add r0, FDEC_STRIDE*4
     PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4

-%assign Y 0
+%assign Y -4
 %rep 8
     psrldq xmm0, 1
     movq [r0+Y*FDEC_STRIDE], xmm0
@@ -728,18 +730,19 @@
     RET

 ;-----------------------------------------------------------------------------
-; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge )
+; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------
 cglobal predict_8x8_ddr_sse2, 2,2
     movdqu xmm3, [r1+8]
     movdqu xmm1, [r1+7]
     movdqa xmm2, xmm3
     psrldq xmm2, 1
+    add r0, FDEC_STRIDE*4
     PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
     movdqa xmm1, xmm0
     psrldq xmm1, 1
-%assign Y 7
+%assign Y 3
 %rep 3
     movq [r0+Y*FDEC_STRIDE], xmm0
     movq [r0+(Y-1)*FDEC_STRIDE], xmm1
@@ -747,13 +750,13 @@
     psrldq xmm1, 2
 %assign Y (Y-2)
 %endrep
-    movq [r0+1*FDEC_STRIDE], xmm0
-    movq [r0+0*FDEC_STRIDE], xmm1
+    movq [r0-3*FDEC_STRIDE], xmm0
+    movq [r0-4*FDEC_STRIDE], xmm1
     RET

 ;-----------------------------------------------------------------------------
-; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge )
+; void predict_8x8_vl( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------
 cglobal predict_8x8_vl_sse2, 2,2
     movdqa xmm4, [r1+16]
@@ -763,11 +766,12 @@
     psrldq xmm2, 1
     pslldq xmm1, 1
     pavgb xmm3, xmm2
+    add r0, FDEC_STRIDE*4
     PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm4, xmm5
 ; xmm0: (t0 + 2*t1 + t2 + 2) >> 2
 ; xmm3: (t0 + t1 + 1) >> 1

-%assign Y 0
+%assign Y -4
 %rep 3
     psrldq xmm0, 1
     movq [r0+ Y *FDEC_STRIDE], xmm3
@@ -782,7 +786,7 @@
     RET

 ;-----------------------------------------------------------------------------
-; void predict_8x8_vr_sse2( uint8_t *src, uint8_t *edge )
+; void predict_8x8_vr( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------
 cglobal predict_8x8_vr_sse2, 2,2,7
     movdqu xmm0, [r1+8]
@@ -817,7 +821,7 @@
     RET

 ;-----------------------------------------------------------------------------
-; void predict_8x8_hd_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------
 %define PALIGNR PALIGNR_MMX
 cglobal predict_8x8_hd_mmxext, 2,2
@@ -864,7 +868,7 @@
     RET

 ;-----------------------------------------------------------------------------
-; void predict_8x8_hd_ssse3( uint8_t *src, uint8_t *edge )
+; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------
 %macro PREDICT_8x8_HD 1
 cglobal predict_8x8_hd_%1, 2,2
@@ -903,7 +907,7 @@
 %define PALIGNR PALIGNR_MMX

 ;-----------------------------------------------------------------------------
-; void predict_8x8_hu_sse2( uint8_t *src, uint8_t *edge )
+; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------
 %macro PREDICT_8x8_HU 1
 cglobal predict_8x8_hu_%1, 2,2
@@ -965,7 +969,7 @@
 PREDICT_8x8_HU ssse3

 ;-----------------------------------------------------------------------------
-; void predict_8x8c_v_mmx( uint8_t *src )
+; void predict_8x8c_v( uint8_t *src )
 ;-----------------------------------------------------------------------------
 cglobal predict_8x8c_v_mmx, 1,1
     movq mm0, [r0 - FDEC_STRIDE]
@@ -973,7 +977,7 @@
     RET

 ;-----------------------------------------------------------------------------
-; void predict_8x8c_h_mmxext( uint8_t *src )
+; void predict_8x8c_h( uint8_t *src )
 ;-----------------------------------------------------------------------------

 %macro PRED_8x8C_H 1
@@ -997,7 +1001,7 @@
 PRED_8x8C_H ssse3

 ;-----------------------------------------------------------------------------
-; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
+; void predict_8x8c_dc_core( uint8_t *src, int s2, int s3 )
 ;-----------------------------------------------------------------------------
 cglobal predict_8x8c_dc_core_mmxext, 1,1
     movq mm0, [r0 - FDEC_STRIDE]
@@ -1052,7 +1056,7 @@
     RET

 ;-----------------------------------------------------------------------------
-; void predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c )
+; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
 ;-----------------------------------------------------------------------------
 cglobal predict_8x8c_p_core_sse2, 1,1
@@ -1094,7 +1098,7 @@
     RET

 ;-----------------------------------------------------------------------------
-; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c )
+; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
 ;-----------------------------------------------------------------------------
 cglobal predict_16x16_p_core_sse2, 1,2,8
     movd xmm0, r1m
@@ -1138,7 +1142,7 @@
     REP_RET

 ;-----------------------------------------------------------------------------
-; void predict_16x16_v_mmx( uint8_t *src )
+; void predict_16x16_v( uint8_t *src )
 ;-----------------------------------------------------------------------------
 cglobal predict_16x16_v_mmx, 1,2
     movq mm0, [r0 - FDEC_STRIDE]
@@ -1147,7 +1151,7 @@
     REP_RET

 ;-----------------------------------------------------------------------------
-; void predict_16x16_v_sse2( uint8_t *src )
+; void predict_16x16_v( uint8_t *src )
 ;-----------------------------------------------------------------------------
 cglobal predict_16x16_v_sse2, 1,1
     movdqa xmm0, [r0 - FDEC_STRIDE]
@@ -1155,7 +1159,7 @@
     RET

 ;-----------------------------------------------------------------------------
-; void predict_16x16_h_mmxext( uint8_t *src )
+; void predict_16x16_h( uint8_t *src )
 ;-----------------------------------------------------------------------------

 %macro PRED_16x16_H 1
@@ -1188,7 +1192,7 @@
 PRED_16x16_H ssse3

 ;-----------------------------------------------------------------------------
-; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
+; void predict_16x16_dc_core( uint8_t *src, int i_dc_left )
 ;-----------------------------------------------------------------------------

 %macro PRED16x16_DC 2
@@ -1225,7 +1229,7 @@
     REP_RET

 ;-----------------------------------------------------------------------------
-; void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left )
+; void predict_16x16_dc_core( uint8_t *src, int i_dc_left )
 ;-----------------------------------------------------------------------------

 %macro PRED16x16_DC_SSE2 2
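The recurring pattern in these hunks, adding `add r0, FDEC_STRIDE*4` and rebasing the row counter from 0..7 (or 7..0) to -4..3, looks like an instruction-encoding optimization: FDEC_STRIDE is 32 in x264, and an x86 memory operand can encode its offset in a signed 8-bit displacement only when it lies in -128..127. The arithmetic, as a small self-contained C check:

/* Rows 0..7 from an un-rebased pointer give byte offsets 0..224, so rows
 * 4..7 need a 4-byte displacement. After `add r0, FDEC_STRIDE*4` the rows
 * run -4..3, i.e. offsets -128..96, and every access fits in one byte. */
enum { FDEC_STRIDE = 32 };

static int fits_in_disp8( int row )
{
    int byte_offset = row * FDEC_STRIDE;
    return byte_offset >= -128 && byte_offset <= 127;
}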
Changed: x264-snapshot-20100517-2245.tar.bz2/common/x86/predict-c.c
@@ -25,55 +25,55 @@
 #include "predict.h"
 #include "pixel.h"

-extern void predict_16x16_v_mmx( uint8_t *src );
-extern void predict_16x16_h_mmxext( uint8_t *src );
-extern void predict_16x16_h_ssse3( uint8_t *src );
-extern void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left );
-extern void predict_16x16_dc_left_core_mmxext( uint8_t *src, int i_dc_left );
-extern void predict_16x16_dc_top_mmxext( uint8_t *src );
-extern void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );
-extern void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
-extern void predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c );
-extern void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 );
-extern void predict_8x8c_dc_top_mmxext( uint8_t *src );
-extern void predict_8x8c_v_mmx( uint8_t *src );
-extern void predict_8x8c_h_mmxext( uint8_t *src );
-extern void predict_8x8c_h_ssse3( uint8_t *src );
-extern void predict_8x8_v_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hd_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hu_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_dc_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_ddl_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_ddr_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_vr_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hu_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hd_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hd_ssse3( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hu_ssse3( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_filter_mmxext ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
-extern void predict_8x8_filter_ssse3 ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
-extern void predict_4x4_ddl_mmxext( uint8_t *src );
-extern void predict_4x4_ddr_mmxext( uint8_t *src );
-extern void predict_4x4_vl_mmxext( uint8_t *src );
-extern void predict_4x4_vr_mmxext( uint8_t *src );
-extern void predict_4x4_vr_ssse3( uint8_t *src );
-extern void predict_4x4_hd_mmxext( uint8_t *src );
-extern void predict_4x4_hd_ssse3( uint8_t *src );
-extern void predict_4x4_dc_mmxext( uint8_t *src );
-extern void predict_4x4_ddr_ssse3( uint8_t *src );
-extern void predict_4x4_hu_mmxext( uint8_t *src );
-extern void predict_16x16_dc_top_sse2( uint8_t *src );
-extern void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left );
-extern void predict_16x16_dc_left_core_sse2( uint8_t *src, int i_dc_left );
-extern void predict_16x16_v_sse2( uint8_t *src );
-extern void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );
+void x264_predict_16x16_v_mmx( uint8_t *src );
+void x264_predict_16x16_h_mmxext( uint8_t *src );
+void x264_predict_16x16_h_ssse3( uint8_t *src );
+void x264_predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left );
+void x264_predict_16x16_dc_left_core_mmxext( uint8_t *src, int i_dc_left );
+void x264_predict_16x16_dc_top_mmxext( uint8_t *src );
+void x264_predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );
+void x264_predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
+void x264_predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c );
+void x264_predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 );
+void x264_predict_8x8c_dc_top_mmxext( uint8_t *src );
+void x264_predict_8x8c_v_mmx( uint8_t *src );
+void x264_predict_8x8c_h_mmxext( uint8_t *src );
+void x264_predict_8x8c_h_ssse3( uint8_t *src );
+void x264_predict_8x8_v_mmxext( uint8_t *src, uint8_t edge[33] );
+void x264_predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] );
+void x264_predict_8x8_hd_mmxext( uint8_t *src, uint8_t edge[33] );
+void x264_predict_8x8_hu_mmxext( uint8_t *src, uint8_t edge[33] );
+void x264_predict_8x8_dc_mmxext( uint8_t *src, uint8_t edge[33] );
+void x264_predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t edge[33] );
+void x264_predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t edge[33] );
+void x264_predict_8x8_ddl_mmxext( uint8_t *src, uint8_t edge[33] );
+void x264_predict_8x8_ddr_mmxext( uint8_t *src, uint8_t edge[33] );
+void x264_predict_8x8_ddl_sse2( uint8_t *src, uint8_t edge[33] );
+void x264_predict_8x8_ddr_sse2( uint8_t *src, uint8_t edge[33] );
+void x264_predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[33] );
+void x264_predict_8x8_vr_sse2( uint8_t *src, uint8_t edge[33] );
+void x264_predict_8x8_hu_sse2( uint8_t *src, uint8_t edge[33] );
+void x264_predict_8x8_hd_sse2( uint8_t *src, uint8_t edge[33] );
+void x264_predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t edge[33] );
+void x264_predict_8x8_hd_ssse3( uint8_t *src, uint8_t edge[33] );
+void x264_predict_8x8_hu_ssse3( uint8_t *src, uint8_t edge[33] );
+void x264_predict_8x8_filter_mmxext( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
+void x264_predict_8x8_filter_ssse3( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
+void x264_predict_4x4_ddl_mmxext( uint8_t *src );
+void x264_predict_4x4_ddr_mmxext( uint8_t *src );
+void x264_predict_4x4_vl_mmxext( uint8_t *src );
+void x264_predict_4x4_vr_mmxext( uint8_t *src );
+void x264_predict_4x4_vr_ssse3( uint8_t *src );
+void x264_predict_4x4_hd_mmxext( uint8_t *src );
+void x264_predict_4x4_hd_ssse3( uint8_t *src );
+void x264_predict_4x4_dc_mmxext( uint8_t *src );
+void x264_predict_4x4_ddr_ssse3( uint8_t *src );
+void x264_predict_4x4_hu_mmxext( uint8_t *src );
+void x264_predict_16x16_dc_top_sse2( uint8_t *src );
+void x264_predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left );
+void x264_predict_16x16_dc_left_core_sse2( uint8_t *src, int i_dc_left );
+void x264_predict_16x16_v_sse2( uint8_t *src );
+void x264_predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );

 ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
 ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
@@ -84,7 +84,7 @@
     V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );\

 #define PREDICT_16x16_P(name)\
-static void predict_16x16_p_##name( uint8_t *src )\
+static void x264_predict_16x16_p_##name( uint8_t *src )\
 {\
     int a, b, c;\
     int H = 0;\
@@ -102,7 +102,7 @@
     b = ( 5 * H + 32 ) >> 6;\
     c = ( 5 * V + 32 ) >> 6;\
     i00 = a - b * 7 - c * 7 + 16;\
-    predict_16x16_p_core_##name( src, i00, b, c );\
+    x264_predict_16x16_p_core_##name( src, i00, b, c );\
 }

 #ifndef ARCH_X86_64
@@ -111,7 +111,7 @@
 PREDICT_16x16_P( sse2 )

 #ifdef __GNUC__
-static void predict_16x16_p_ssse3( uint8_t *src )
+static void x264_predict_16x16_p_ssse3( uint8_t *src )
 {
     int a, b, c, i00;
     int H, V;
@@ -143,12 +143,12 @@
     b = ( 5 * H + 32 ) >> 6;
     c = ( 5 * V + 32 ) >> 6;
     i00 = a - b * 7 - c * 7 + 16;
-    predict_16x16_p_core_sse2( src, i00, b, c );
+    x264_predict_16x16_p_core_sse2( src, i00, b, c );
 }
 #endif

 #define PREDICT_8x8_P(name)\
-static void predict_8x8c_p_##name( uint8_t *src )\
+static void x264_predict_8x8c_p_##name( uint8_t *src )\
 {\
     int a, b, c;\
     int H = 0;\
@@ -162,7 +162,7 @@
     b = ( 17 * H + 16 ) >> 5;\
     c = ( 17 * V + 16 ) >> 5;\
     i00 = a -3*b -3*c + 16;\
-    predict_8x8c_p_core_##name( src, i00, b, c );\
+    x264_predict_8x8c_p_core_##name( src, i00, b, c );\
 }

 #ifndef ARCH_X86_64
@@ -171,7 +171,7 @@
 PREDICT_8x8_P( sse2 )

 #ifdef __GNUC__
-static void predict_8x8c_p_ssse3( uint8_t *src )
+static void x264_predict_8x8c_p_ssse3( uint8_t *src )
 {
     int a, b, c, i00;
     int H, V;
@@ -196,12 +196,12 @@
     b = ( 17 * H + 16 ) >> 5;
     c = ( 17 * V + 16 ) >> 5;
     i00 = a -3*b -3*c + 16;
-    predict_8x8c_p_core_sse2( src, i00, b, c );
+    x264_predict_8x8c_p_core_sse2( src, i00, b, c );
 }
 #endif

 #define PREDICT_16x16_DC(name)\
-static void predict_16x16_dc_##name( uint8_t *src )\
+static void x264_predict_16x16_dc_##name( uint8_t *src )\
 {\
     uint32_t dc=16;\
     int i;\
@@ -210,14 +210,14 @@
         dc += src[-1 + i * FDEC_STRIDE];\
         dc += src[-1 + (i+1) * FDEC_STRIDE];\
     }\
-    predict_16x16_dc_core_##name( src, dc );\
+    x264_predict_16x16_dc_core_##name( src, dc );\
 }

 PREDICT_16x16_DC( mmxext )
 PREDICT_16x16_DC( sse2 )

 #define PREDICT_16x16_DC_LEFT(name)\
-static void predict_16x16_dc_left_##name( uint8_t *src )\
+static void x264_predict_16x16_dc_left_##name( uint8_t *src )\
 {\
     uint32_t dc=8;\
     int i;\
@@ -226,13 +226,13 @@
         dc += src[-1 + i * FDEC_STRIDE];\
         dc += src[-1 + (i+1) * FDEC_STRIDE];\
     }\
-    predict_16x16_dc_left_core_##name( src, dc>>4 );\
+    x264_predict_16x16_dc_left_core_##name( src, dc>>4 );\
 }

 PREDICT_16x16_DC_LEFT( mmxext )
 PREDICT_16x16_DC_LEFT( sse2 )

-static void predict_8x8c_dc_mmxext( uint8_t *src )
+static void x264_predict_8x8c_dc_mmxext( uint8_t *src )
 {
     int s2 = 4
        + src[-1 + 0*FDEC_STRIDE]
@@ -246,11 +246,11 @@
        + src[-1 + 6*FDEC_STRIDE]
        + src[-1 + 7*FDEC_STRIDE];

-    predict_8x8c_dc_core_mmxext( src, s2, s3 );
+    x264_predict_8x8c_dc_core_mmxext( src, s2, s3 );
 }

 #ifdef ARCH_X86_64
-static void predict_8x8c_dc_left( uint8_t *src )
+static void x264_predict_8x8c_dc_left( uint8_t *src )
 {
     int y;
     uint32_t s0 = 0, s1 = 0;
@@ -304,9 +304,9 @@
 #define SRC(x,y) src[(x)+(y)*FDEC_STRIDE]

 #ifndef ARCH_X86_64
-static void predict_8x8_vr_mmxext( uint8_t *src, uint8_t edge[33] )
+static void x264_predict_8x8_vr_mmxext( uint8_t *src, uint8_t edge[33] )
 {
-    predict_8x8_vr_core_mmxext( src, edge );
+    x264_predict_8x8_vr_core_mmxext( src, edge );
     {
         PREDICT_8x8_LOAD_TOPLEFT
         PREDICT_8x8_LOAD_LEFT
@@ -326,7 +326,7 @@
     t=e; e+=f; f-=t;\
     t=g; g+=h; h-=t;

-#define INTRA_SA8D_X3(cpu) \
+#define INTRA_SA8D_X3(cpu)\
 void x264_intra_sa8d_x3_8x8_##cpu( uint8_t *fenc, uint8_t edge[33], int res[3] )\
 {\
     PREDICT_8x8_LOAD_TOP\
@@ -372,30 +372,30 @@
 {
     if( !(cpu&X264_CPU_MMX) )
         return;
-    pf[I_PRED_16x16_V] = predict_16x16_v_mmx;
+    pf[I_PRED_16x16_V] = x264_predict_16x16_v_mmx;
     if( !(cpu&X264_CPU_MMXEXT) )
         return;
-    pf[I_PRED_16x16_DC] = predict_16x16_dc_mmxext;
-    pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_mmxext;
-    pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_mmxext;
+    pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_mmxext;
+    pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_mmxext;
+    pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_mmxext;
 #ifndef ARCH_X86_64
-    pf[I_PRED_16x16_P] = predict_16x16_p_mmxext;
+    pf[I_PRED_16x16_P] = x264_predict_16x16_p_mmxext;
 #endif
-    pf[I_PRED_16x16_H] = predict_16x16_h_mmxext;
+    pf[I_PRED_16x16_H] = x264_predict_16x16_h_mmxext;
     if( !(cpu&X264_CPU_SSE2) )
         return;
-    pf[I_PRED_16x16_DC] = predict_16x16_dc_sse2;
-    pf[I_PRED_16x16_V] = predict_16x16_v_sse2;
+    pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_sse2;
+    pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse2;
     if( cpu&X264_CPU_SSE2_IS_SLOW )
         return;
-    pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_sse2;
-    pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_sse2;
-    pf[I_PRED_16x16_P] = predict_16x16_p_sse2;
+    pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_sse2;
+    pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_sse2;
+    pf[I_PRED_16x16_P] = x264_predict_16x16_p_sse2;
     if( !(cpu&X264_CPU_SSSE3) )
         return;
-    pf[I_PRED_16x16_H] = predict_16x16_h_ssse3;
+    pf[I_PRED_16x16_H] = x264_predict_16x16_h_ssse3;
 #ifdef __GNUC__
-    pf[I_PRED_16x16_P] = predict_16x16_p_ssse3;
+    pf[I_PRED_16x16_P] = x264_predict_16x16_p_ssse3;
 #endif
 }

@@ -404,25 +404,25 @@
     if( !(cpu&X264_CPU_MMX) )
         return;
 #ifdef ARCH_X86_64
-    pf[I_PRED_CHROMA_DC_LEFT] = predict_8x8c_dc_left;
+    pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left;
 #endif
-    pf[I_PRED_CHROMA_V] = predict_8x8c_v_mmx;
+    pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_mmx;
     if( !(cpu&X264_CPU_MMXEXT) )
         return;
-    pf[I_PRED_CHROMA_DC_TOP] = predict_8x8c_dc_top_mmxext;
-    pf[I_PRED_CHROMA_H] = predict_8x8c_h_mmxext;
+    pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_mmxext;
+    pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_mmxext;
 #ifndef ARCH_X86_64
-    pf[I_PRED_CHROMA_P] = predict_8x8c_p_mmxext;
+    pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_mmxext;
 #endif
-    pf[I_PRED_CHROMA_DC] = predict_8x8c_dc_mmxext;
+    pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_mmxext;
     if( !(cpu&X264_CPU_SSE2) )
         return;
-    pf[I_PRED_CHROMA_P] = predict_8x8c_p_sse2;
+    pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_sse2;
     if( !(cpu&X264_CPU_SSSE3) )
         return;
-    pf[I_PRED_CHROMA_H] = predict_8x8c_h_ssse3;
+    pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_ssse3;
 #ifdef __GNUC__
-    pf[I_PRED_CHROMA_P] = predict_8x8c_p_ssse3;
+    pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_ssse3;
 #endif
 }

@@ -430,48 +430,48 @@
 {
     if( !(cpu&X264_CPU_MMXEXT) )
         return;
-    pf[I_PRED_8x8_V] = predict_8x8_v_mmxext;
-    pf[I_PRED_8x8_H] = predict_8x8_h_mmxext;
-    pf[I_PRED_8x8_DC] = predict_8x8_dc_mmxext;
-    pf[I_PRED_8x8_DC_TOP] = predict_8x8_dc_top_mmxext;
-    pf[I_PRED_8x8_DC_LEFT]= predict_8x8_dc_left_mmxext;
-    pf[I_PRED_8x8_HD] = predict_8x8_hd_mmxext;
-    *predict_8x8_filter = predict_8x8_filter_mmxext;
+    pf[I_PRED_8x8_V] = x264_predict_8x8_v_mmxext;
+    pf[I_PRED_8x8_H] = x264_predict_8x8_h_mmxext;
+    pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_mmxext;
+    pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_mmxext;
+    pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_mmxext;
+    pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_mmxext;
+    *predict_8x8_filter = x264_predict_8x8_filter_mmxext;
 #ifdef ARCH_X86
-    pf[I_PRED_8x8_DDL] = predict_8x8_ddl_mmxext;
-    pf[I_PRED_8x8_DDR] = predict_8x8_ddr_mmxext;
-    pf[I_PRED_8x8_VR] = predict_8x8_vr_mmxext;
-    pf[I_PRED_8x8_HU] = predict_8x8_hu_mmxext;
+    pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_mmxext;
+    pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_mmxext;
+    pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_mmxext;
+    pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_mmxext;
 #endif
     if( !(cpu&X264_CPU_SSE2) )
         return;
-    pf[I_PRED_8x8_DDL] = predict_8x8_ddl_sse2;
-    pf[I_PRED_8x8_VL] = predict_8x8_vl_sse2;
-    pf[I_PRED_8x8_VR] = predict_8x8_vr_sse2;
-    pf[I_PRED_8x8_DDR] = predict_8x8_ddr_sse2;
-    pf[I_PRED_8x8_HD] = predict_8x8_hd_sse2;
-    pf[I_PRED_8x8_HU] = predict_8x8_hu_sse2;
+    pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_sse2;
+    pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_sse2;
+    pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_sse2;
+    pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_sse2;
+    pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_sse2;
+    pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_sse2;
     if( !(cpu&X264_CPU_SSSE3) )
         return;
-    pf[I_PRED_8x8_HD] = predict_8x8_hd_ssse3;
-    pf[I_PRED_8x8_HU] = predict_8x8_hu_ssse3;
-    *predict_8x8_filter = predict_8x8_filter_ssse3;
+    pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_ssse3;
+    pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3;
+    *predict_8x8_filter = x264_predict_8x8_filter_ssse3;
 }

 void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
 {
     if( !(cpu&X264_CPU_MMXEXT) )
         return;
-    pf[I_PRED_4x4_VR] = predict_4x4_vr_mmxext;
-    pf[I_PRED_4x4_DDL] = predict_4x4_ddl_mmxext;
-    pf[I_PRED_4x4_VL] = predict_4x4_vl_mmxext;
-    pf[I_PRED_4x4_DC] = predict_4x4_dc_mmxext;
-    pf[I_PRED_4x4_DDR] = predict_4x4_ddr_mmxext;
-    pf[I_PRED_4x4_HD] = predict_4x4_hd_mmxext;
-    pf[I_PRED_4x4_HU] = predict_4x4_hu_mmxext;
+    pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_mmxext;
+    pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_mmxext;
+    pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_mmxext;
+    pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_mmxext;
+    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_mmxext;
+    pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_mmxext;
+    pf[I_PRED_4x4_HU] = x264_predict_4x4_hu_mmxext;
     if( !(cpu&X264_CPU_SSSE3) )
         return;
-    pf[I_PRED_4x4_DDR] = predict_4x4_ddr_ssse3;
-    pf[I_PRED_4x4_VR] = predict_4x4_vr_ssse3;
-    pf[I_PRED_4x4_HD] = predict_4x4_hd_ssse3;
+    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3;
+    pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3;
+    pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_ssse3;
 }
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/common/x86/quant-a.asm ^ |
@@ -26,10 +26,6 @@ %include "x86util.asm" SECTION_RODATA -pb_1: times 16 db 1 -pw_1: times 8 dw 1 -pd_1: times 4 dd 1 -pb_01: times 8 db 0, 1 %macro DQM4 3 dw %1, %2, %1, %2, %2, %3, %2, %3 @@ -71,6 +67,11 @@ SECTION .text +cextern pb_1 +cextern pw_1 +cextern pd_1 +cextern pb_01 + %macro QUANT_DC_START_MMX 0 movd m6, r1m ; mf movd m7, r2m ; bias @@ -183,7 +184,7 @@ %endmacro ;----------------------------------------------------------------------------- -; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias ) +; void quant_4x4_dc( int16_t dct[16], int mf, int bias ) ;----------------------------------------------------------------------------- %macro QUANT_DC 2-3 0 cglobal %1, 1,1,%3 @@ -202,7 +203,7 @@ %endmacro ;----------------------------------------------------------------------------- -; int x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] ) +; int quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] ) ;----------------------------------------------------------------------------- %macro QUANT_AC 2 cglobal %1, 3,3 @@ -220,33 +221,33 @@ %define PABSW PABSW_MMX %define PSIGNW PSIGNW_MMX %define QUANT_DC_START QUANT_DC_START_MMX -QUANT_DC x264_quant_2x2_dc_mmxext, 1 +QUANT_DC quant_2x2_dc_mmxext, 1 %ifndef ARCH_X86_64 ; not needed because sse2 is faster -QUANT_DC x264_quant_4x4_dc_mmxext, 4 -QUANT_AC x264_quant_4x4_mmx, 4 -QUANT_AC x264_quant_8x8_mmx, 16 +QUANT_DC quant_4x4_dc_mmxext, 4 +QUANT_AC quant_4x4_mmx, 4 +QUANT_AC quant_8x8_mmx, 16 %endif INIT_XMM -QUANT_DC x264_quant_4x4_dc_sse2, 2, 8 -QUANT_AC x264_quant_4x4_sse2, 2 -QUANT_AC x264_quant_8x8_sse2, 8 +QUANT_DC quant_4x4_dc_sse2, 2, 8 +QUANT_AC quant_4x4_sse2, 2 +QUANT_AC quant_8x8_sse2, 8 %define PABSW PABSW_SSSE3 %define PSIGNW PSIGNW_SSSE3 -QUANT_DC x264_quant_4x4_dc_ssse3, 2, 8 -QUANT_AC x264_quant_4x4_ssse3, 2 -QUANT_AC x264_quant_8x8_ssse3, 8 +QUANT_DC quant_4x4_dc_ssse3, 2, 8 +QUANT_AC quant_4x4_ssse3, 2 +QUANT_AC quant_8x8_ssse3, 8 INIT_MMX -QUANT_DC x264_quant_2x2_dc_ssse3, 1 +QUANT_DC quant_2x2_dc_ssse3, 1 %define QUANT_END QUANT_END_SSE4 ;Not faster on Conroe, so only used in SSE4 versions %define QUANT_DC_START QUANT_DC_START_SSSE3 INIT_XMM -QUANT_DC x264_quant_4x4_dc_sse4, 2, 8 -QUANT_AC x264_quant_4x4_sse4, 2 -QUANT_AC x264_quant_8x8_sse4, 8 +QUANT_DC quant_4x4_dc_sse4, 2, 8 +QUANT_AC quant_4x4_sse4, 2 +QUANT_AC quant_8x8_sse4, 8 @@ -347,10 +348,10 @@ %endmacro ;----------------------------------------------------------------------------- -; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp ) +; void dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp ) ;----------------------------------------------------------------------------- %macro DEQUANT 4 -cglobal x264_dequant_%2x%2_%1, 0,3 +cglobal dequant_%2x%2_%1, 0,3 .skip_prologue: DEQUANT_START %3+2, %3 @@ -367,11 +368,11 @@ psrld m3, 1 DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4 -cglobal x264_dequant_%2x%2_flat16_%1, 0,3 +cglobal dequant_%2x%2_flat16_%1, 0,3 movifnidn t2d, r2m %if %2 == 8 cmp t2d, 12 - jl x264_dequant_%2x%2_%1.skip_prologue + jl dequant_%2x%2_%1.skip_prologue sub t2d, 12 %endif imul t0d, t2d, 0x2b @@ -418,7 +419,7 @@ DEQUANT sse2, 8, 6, 2 %macro DEQUANT_DC 1 -cglobal x264_dequant_4x4dc_%1, 0,3 +cglobal dequant_4x4dc_%1, 0,3 DEQUANT_START 6, 6 .lshift: @@ -480,10 +481,10 @@ DEQUANT_DC sse2 ;----------------------------------------------------------------------------- -; void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size ) +; void 
denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size ) ;----------------------------------------------------------------------------- %macro DENOISE_DCT 1-2 0 -cglobal x264_denoise_dct_%1, 4,5,%2 +cglobal denoise_dct_%1, 4,5,%2 movzx r4d, word [r0] ; backup DC coefficient pxor m6, m6 .loop: @@ -534,7 +535,7 @@ ;----------------------------------------------------------------------------- -; int x264_decimate_score( int16_t *dct ) +; int decimate_score( int16_t *dct ) ;----------------------------------------------------------------------------- %macro DECIMATE_MASK_SSE2 6 @@ -579,21 +580,21 @@ or %2, %6 %endmacro -cextern x264_decimate_table4 -cextern x264_decimate_table8 +cextern decimate_table4 +cextern decimate_table8 %macro DECIMATE4x4 2 ;A LUT is faster than bsf on AMD processors, and no slower on Intel ;This is not true for score64. -cglobal x264_decimate_score%1_%2, 1,3 +cglobal decimate_score%1_%2, 1,3 %ifdef PIC - lea r10, [x264_decimate_table4] + lea r10, [decimate_table4] lea r11, [decimate_mask_table4] %define table r10 %define mask_table r11 %else - %define table x264_decimate_table4 + %define table decimate_table4 %define mask_table decimate_mask_table4 %endif DECIMATE_MASK edx, eax, r0, [pb_1], %2, ecx @@ -638,12 +639,12 @@ %macro DECIMATE8x8 1 %ifdef ARCH_X86_64 -cglobal x264_decimate_score64_%1, 1,4 +cglobal decimate_score64_%1, 1,4 %ifdef PIC - lea r10, [x264_decimate_table8] + lea r10, [decimate_table8] %define table r10 %else - %define table x264_decimate_table8 + %define table decimate_table8 %endif mova m5, [pb_1] DECIMATE_MASK r1d, eax, r0, m5, %1, null @@ -677,9 +678,9 @@ %else ; ARCH %ifidn %1, mmxext -cglobal x264_decimate_score64_%1, 1,6 +cglobal decimate_score64_%1, 1,6 %else -cglobal x264_decimate_score64_%1, 1,5 +cglobal decimate_score64_%1, 1,5 %endif mova m7, [pb_1] DECIMATE_MASK r3, r2, r0, m7, %1, r5 @@ -705,7 +706,7 @@ je .largerun shrd r3, r4, cl shr r4, cl - add r0b, byte [x264_decimate_table8 + ecx] + add r0b, byte [decimate_table8 + ecx] shrd r3, r4, 1 shr r4, 1 cmp r0, 6 ;score64's threshold is never higher than 6 @@ -746,7 +747,7 @@ DECIMATE8x8 ssse3 ;----------------------------------------------------------------------------- -; int x264_coeff_last( int16_t *dct ) +; int coeff_last( int16_t *dct ) ;----------------------------------------------------------------------------- %macro LAST_MASK_SSE2 2-3 @@ -780,12 +781,12 @@ %macro COEFF_LAST4 1 %ifdef ARCH_X86_64 -cglobal x264_coeff_last4_%1, 1,1 +cglobal coeff_last4_%1, 1,1 LAST rax, [r0], 0x3f shr eax, 4 RET %else -cglobal x264_coeff_last4_%1, 0,3 +cglobal coeff_last4_%1, 0,3 mov edx, r0mp mov eax, [edx+4] xor ecx, ecx @@ -805,7 +806,7 @@ COEFF_LAST4 mmxext_lzcnt %macro COEFF_LAST 1 -cglobal x264_coeff_last15_%1, 1,3 +cglobal coeff_last15_%1, 1,3 pxor m2, m2 LAST_MASK r1d, r0-2, r2d xor r1d, 0xffff @@ -813,7 +814,7 @@ dec eax RET -cglobal x264_coeff_last16_%1, 1,3 +cglobal coeff_last16_%1, 1,3 pxor m2, m2 LAST_MASK r1d, r0, r2d xor r1d, 0xffff @@ -821,7 +822,7 @@ RET %ifndef ARCH_X86_64 -cglobal x264_coeff_last64_%1, 1, 5-mmsize/16 +cglobal coeff_last64_%1, 1, 5-mmsize/16 pxor m2, m2 LAST_MASK r2d, r0+64, r4d LAST_MASK r3d, r0+96, r4d @@ -841,7 +842,7 @@ add eax, 32 RET %else -cglobal x264_coeff_last64_%1, 1,4 +cglobal coeff_last64_%1, 1,4 pxor m2, m2 LAST_MASK_SSE2 r1d, r0 LAST_MASK_SSE2 r2d, r0+32 @@ -872,7 +873,7 @@ COEFF_LAST sse2_lzcnt ;----------------------------------------------------------------------------- -; int x264_coeff_level_run( int16_t *dct, x264_run_level_t 
*runlevel ) +; int coeff_level_run( int16_t *dct, run_level_t *runlevel ) ;----------------------------------------------------------------------------- %macro LAST_MASK4_MMX 2-3 @@ -901,7 +902,7 @@ %endif %macro COEFF_LEVELRUN 2 -cglobal x264_coeff_level_run%2_%1,0,7 +cglobal coeff_level_run%2_%1,0,7 movifnidn t0, r0mp movifnidn t1, r1mp pxor m2, m2 | ||
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/common/x86/sad-a.asm ^ |
@@ -26,14 +26,13 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA -pb_3: times 16 db 3 -pb_shuf8x8c: db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6 -pw_8: times 4 dw 8 -sw_64: dd 64 - SECTION .text +cextern pb_3 +cextern pb_shuf8x8c +cextern pw_8 +cextern sw_64 + ;============================================================================= ; SAD MMX ;============================================================================= @@ -78,10 +77,10 @@ %endmacro ;----------------------------------------------------------------------------- -; int x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int ) +; int pixel_sad_16x16( uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- %macro SAD 2 -cglobal x264_pixel_sad_%1x%2_mmxext, 4,4 +cglobal pixel_sad_%1x%2_mmxext, 4,4 pxor mm0, mm0 %rep %2/2 SAD_INC_2x%1P @@ -113,9 +112,9 @@ %macro SAD_W16 1 ;----------------------------------------------------------------------------- -; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int ) +; int pixel_sad_16x16( uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- -cglobal x264_pixel_sad_16x16_%1, 4,4,8 +cglobal pixel_sad_16x16_%1, 4,4,8 movdqu m0, [r2] movdqu m1, [r2+r3] lea r2, [r2+2*r3] @@ -180,9 +179,9 @@ SAD_END_SSE2 ;----------------------------------------------------------------------------- -; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int ) +; int pixel_sad_16x8( uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- -cglobal x264_pixel_sad_16x8_%1, 4,4 +cglobal pixel_sad_16x8_%1, 4,4 movdqu m0, [r2] movdqu m2, [r2+r3] lea r2, [r2+2*r3] @@ -249,7 +248,7 @@ %endmacro ;Even on Nehalem, no sizes other than 8x16 benefit from this method. 
-cglobal x264_pixel_sad_8x16_sse2, 4,4 +cglobal pixel_sad_8x16_sse2, 4,4 SAD_INC_4x8P_SSE 0 SAD_INC_4x8P_SSE 1 SAD_INC_4x8P_SSE 1 @@ -258,10 +257,10 @@ RET ;----------------------------------------------------------------------------- -; void intra_sad_x3_4x4 ( uint8_t *fenc, uint8_t *fdec, int res[3] ); +; void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] ); ;----------------------------------------------------------------------------- -cglobal x264_intra_sad_x3_4x4_mmxext, 3,3 +cglobal intra_sad_x3_4x4_mmxext, 3,3 pxor mm7, mm7 movd mm0, [r1-FDEC_STRIDE] movd mm1, [r0+FENC_STRIDE*0] @@ -305,7 +304,7 @@ RET ;----------------------------------------------------------------------------- -; void intra_sad_x3_8x8 ( uint8_t *fenc, uint8_t edge[33], int res[3]); +; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[33], int res[3]); ;----------------------------------------------------------------------------- ;m0 = DC @@ -343,7 +342,7 @@ %endmacro INIT_MMX -cglobal x264_intra_sad_x3_8x8_mmxext, 3,3 +cglobal intra_sad_x3_8x8_mmxext, 3,3 movq m7, [r1+7] pxor m0, m0 movq m6, [r1+16] ;V prediction @@ -372,7 +371,7 @@ RET ;----------------------------------------------------------------------------- -; void intra_sad_x3_8x8c ( uint8_t *fenc, uint8_t *fdec, int res[3] ); +; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] ); ;----------------------------------------------------------------------------- %macro INTRA_SAD_HV_ITER 2 @@ -407,7 +406,7 @@ %endmacro %macro INTRA_SAD_8x8C 1 -cglobal x264_intra_sad_x3_8x8c_%1, 3,3 +cglobal intra_sad_x3_8x8c_%1, 3,3 movq m6, [r1 - FDEC_STRIDE] add r1, FDEC_STRIDE*4 %ifidn %1,ssse3 @@ -508,13 +507,13 @@ ;----------------------------------------------------------------------------- -; void intra_sad_x3_16x16 ( uint8_t *fenc, uint8_t *fdec, int res[3] ); +; void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] ); ;----------------------------------------------------------------------------- ;xmm7: DC prediction xmm6: H prediction xmm5: V prediction ;xmm4: DC pred score xmm3: H pred score xmm2: V pred score %macro INTRA_SAD16 1-2 0 -cglobal x264_intra_sad_x3_16x16_%1,3,5,%2 +cglobal intra_sad_x3_16x16_%1,3,5,%2 pxor mm0, mm0 pxor mm1, mm1 psadbw mm0, [r1-FDEC_STRIDE+0] @@ -526,10 +525,14 @@ %endif %assign x 0 %rep 16 - movzx r4d, byte [r1-1+FDEC_STRIDE*x] + movzx r4d, byte [r1-1+FDEC_STRIDE*(x&3)] +%if (x&3)==3 && x!=15 + add r1, FDEC_STRIDE*4 +%endif add r3d, r4d %assign x x+1 %endrep + sub r1, FDEC_STRIDE*12 add r3d, 16 shr r3d, 5 imul r3d, 0x01010101 @@ -813,11 +816,11 @@ %endmacro ;----------------------------------------------------------------------------- -; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, -; uint8_t *pix2, int i_stride, int scores[3] ) +; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, +; uint8_t *pix2, int i_stride, int scores[3] ) ;----------------------------------------------------------------------------- %macro SAD_X 3 -cglobal x264_pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2 +cglobal pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2 %ifdef WIN64 %assign i %1+1 movsxd r %+ i, r %+ i %+ d @@ -1162,11 +1165,11 @@ %endmacro ;----------------------------------------------------------------------------- -; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, -; uint8_t *pix2, int i_stride, int scores[3] ) +; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, +; uint8_t *pix2, int i_stride, int scores[3] ) 
;----------------------------------------------------------------------------- %macro SAD_X_SSE2 4 -cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1,9 +cglobal pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1,9 %ifdef WIN64 %assign i %1+1 movsxd r %+ i, r %+ i %+ d @@ -1179,7 +1182,7 @@ %endmacro %macro SAD_X_SSE2_MISALIGN 4 -cglobal x264_pixel_sad_x%1_%2x%3_%4_misalign, 2+%1,2+%1,9 +cglobal pixel_sad_x%1_%2x%3_%4_misalign, 2+%1,2+%1,9 %ifdef WIN64 %assign i %1+1 movsxd r %+ i, r %+ i %+ d @@ -1285,11 +1288,11 @@ %endmacro %macro SAD16_CACHELINE_FUNC 2 ; cpu, height -cglobal x264_pixel_sad_16x%2_cache64_%1 +cglobal pixel_sad_16x%2_cache64_%1 mov eax, r2m and eax, 0x37 cmp eax, 0x30 - jle x264_pixel_sad_16x%2_sse2 + jle pixel_sad_16x%2_sse2 PROLOGUE 4,6 mov r4d, r2d and r4d, 15 @@ -1320,7 +1323,7 @@ mov eax, r2m and eax, 0x17|%1|(%4>>1) cmp eax, 0x10|%1|(%4>>1) - jle x264_pixel_sad_%1x%2_mmxext + jle pixel_sad_%1x%2_mmxext and eax, 7 shl eax, 3 movd mm6, [sw_64] @@ -1333,7 +1336,7 @@ %endmacro %macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline -cglobal x264_pixel_sad_16x%1_cache%2_mmxext +cglobal pixel_sad_16x%1_cache%2_mmxext SAD_CACHELINE_START_MMX2 16, %1, %1, %2 .loop: movq mm1, [r2] @@ -1359,7 +1362,7 @@ %endmacro %macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline -cglobal x264_pixel_sad_8x%1_cache%2_mmxext +cglobal pixel_sad_8x%1_cache%2_mmxext SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2 .loop: movq mm1, [r2+8] @@ -1395,11 +1398,11 @@ %endmacro %macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name -cglobal x264_pixel_sad_x3_%1x%2_cache%3_%6 +cglobal pixel_sad_x3_%1x%2_cache%3_%6 CHECK_SPLIT r1m, %1, %3 CHECK_SPLIT r2m, %1, %3 CHECK_SPLIT r3m, %1, %3 - jmp x264_pixel_sad_x3_%1x%2_%4 + jmp pixel_sad_x3_%1x%2_%4 .split: %ifdef ARCH_X86_64 PROLOGUE 6,7 @@ -1414,7 +1417,7 @@ mov r3, r4 mov r10, r0 mov r11, r5 - call x264_pixel_sad_%1x%2_cache%3_%5 + call pixel_sad_%1x%2_cache%3_%5 mov [r11], eax %ifdef WIN64 mov r2, [rsp] @@ -1422,7 +1425,7 @@ pop r2 %endif mov r0, r10 - call x264_pixel_sad_%1x%2_cache%3_%5 + call pixel_sad_%1x%2_cache%3_%5 mov [r11+4], eax %ifdef WIN64 mov r2, [rsp+8] @@ -1430,7 +1433,7 @@ pop r2 %endif mov r0, r10 - call x264_pixel_sad_%1x%2_cache%3_%5 + call pixel_sad_%1x%2_cache%3_%5 mov [r11+8], eax %ifdef WIN64 add rsp, 24 @@ -1443,15 +1446,15 @@ push dword [esp+16] push dword 16 push dword [esp+20] - call x264_pixel_sad_%1x%2_cache%3_%5 + call pixel_sad_%1x%2_cache%3_%5 mov ecx, [esp+32] mov [edi], eax mov [esp+8], ecx - call x264_pixel_sad_%1x%2_cache%3_%5 + call pixel_sad_%1x%2_cache%3_%5 mov ecx, [esp+36] mov [edi+4], eax mov [esp+8], ecx - call x264_pixel_sad_%1x%2_cache%3_%5 + call pixel_sad_%1x%2_cache%3_%5 mov [edi+8], eax add esp, 16 pop edi @@ -1460,12 +1463,12 @@ %endmacro %macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name -cglobal x264_pixel_sad_x4_%1x%2_cache%3_%6 +cglobal pixel_sad_x4_%1x%2_cache%3_%6 CHECK_SPLIT r1m, %1, %3 CHECK_SPLIT r2m, %1, %3 CHECK_SPLIT r3m, %1, %3 CHECK_SPLIT r4m, %1, %3 - jmp x264_pixel_sad_x4_%1x%2_%4 + jmp pixel_sad_x4_%1x%2_%4 .split: %ifdef ARCH_X86_64 PROLOGUE 6,7 @@ -1480,7 +1483,7 @@ mov r1, FENC_STRIDE mov r3, r5 mov r10, r0 - call x264_pixel_sad_%1x%2_cache%3_%5 + call pixel_sad_%1x%2_cache%3_%5 mov [r11], eax %ifdef WIN64 mov r2, [rsp] @@ -1488,7 +1491,7 @@ pop r2 %endif mov r0, r10 - call x264_pixel_sad_%1x%2_cache%3_%5 + call pixel_sad_%1x%2_cache%3_%5 mov [r11+4], eax %ifdef WIN64 mov r2, [rsp+8] @@ -1496,7 +1499,7 @@ pop r2 %endif mov r0, r10 - call 
x264_pixel_sad_%1x%2_cache%3_%5 + call pixel_sad_%1x%2_cache%3_%5 mov [r11+8], eax %ifdef WIN64 mov r2, [rsp+16] @@ -1504,7 +1507,7 @@ pop r2 %endif mov r0, r10 - call x264_pixel_sad_%1x%2_cache%3_%5 + call pixel_sad_%1x%2_cache%3_%5 mov [r11+12], eax %ifdef WIN64 add rsp, 24 @@ -1517,19 +1520,19 @@ push dword [esp+16] push dword 16 push dword [esp+20] - call x264_pixel_sad_%1x%2_cache%3_%5 + call pixel_sad_%1x%2_cache%3_%5 mov ecx, [esp+32] mov [edi], eax mov [esp+8], ecx - call x264_pixel_sad_%1x%2_cache%3_%5 + call pixel_sad_%1x%2_cache%3_%5 mov ecx, [esp+36] mov [edi+4], eax mov [esp+8], ecx - call x264_pixel_sad_%1x%2_cache%3_%5 + call pixel_sad_%1x%2_cache%3_%5 mov ecx, [esp+40] mov [edi+8], eax mov [esp+8], ecx - call x264_pixel_sad_%1x%2_cache%3_%5 + call pixel_sad_%1x%2_cache%3_%5 mov [edi+12], eax add esp, 16 pop edi | ||
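In the intra_sad_x3_16x16 hunk above, the unrolled left-edge sum switches from a fixed FDEC_STRIDE*x displacement (up to 15*32 = 480, which needs a 32-bit displacement) to FDEC_STRIDE*(x&3) plus a pointer bump every four rows, presumably so each memory operand encodes with a one-byte displacement; the trailing sub restores r1. The arithmetic is unchanged. A plain-C rendering of what the %rep loop computes, for reference (FDEC_STRIDE is x264's decoded-frame stride, 32 on this code path):

    #include <stdint.h>

    #define FDEC_STRIDE 32

    /* Sum of the 16 left-neighbour pixels feeding the DC prediction score. */
    static int intra16_left_sum( const uint8_t *fdec )
    {
        int sum = 0;
        for( int y = 0; y < 16; y++ )
            sum += fdec[-1 + y * FDEC_STRIDE];
        return sum;
    }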
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/common/x86/util.h ^ |
@@ -26,7 +26,9 @@ #ifdef __GNUC__ +#ifdef __SSE__ #include <xmmintrin.h> +#endif #define x264_median_mv x264_median_mv_mmxext static ALWAYS_INLINE void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16_t *b, int16_t *c ) @@ -107,7 +109,7 @@ } #define x264_predictor_roundclip x264_predictor_roundclip_mmxext -static void ALWAYS_INLINE x264_predictor_roundclip_mmxext( int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max ) +static void ALWAYS_INLINE x264_predictor_roundclip_mmxext( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max ) { uint32_t mv_min = pack16to32_mask( mv_x_min, mv_y_min ); uint32_t mv_max = pack16to32_mask( mv_x_max, mv_y_max ); @@ -121,7 +123,7 @@ "punpckldq %%mm6, %%mm6 \n" "test $1, %0 \n" "jz 1f \n" - "movd -4(%5,%0,4), %%mm0 \n" + "movd -4(%6,%0,4), %%mm0 \n" "paddw %%mm7, %%mm0 \n" "psraw $2, %%mm0 \n" "pmaxsw %%mm5, %%mm0 \n" @@ -130,7 +132,7 @@ "dec %0 \n" "jz 2f \n" "1: \n" - "movq -8(%5,%0,4), %%mm0 \n" + "movq -8(%6,%0,4), %%mm0 \n" "paddw %%mm7, %%mm0 \n" "psraw $2, %%mm0 \n" "pmaxsw %%mm5, %%mm0 \n" @@ -139,15 +141,17 @@ "sub $2, %0 \n" "jnz 1b \n" "2: \n" - :"+r"(i), "+m"(M64( mvc )) - :"g"(mv_min), "g"(mv_max), "m"(pw_2), "r"(mvc) + :"+r"(i), "=m"(M64( dst )) + :"g"(mv_min), "g"(mv_max), "m"(pw_2), "r"(dst), "r"(mvc), "m"(M64( mvc )) ); } +#ifdef __SSE__ #undef M128_ZERO #define M128_ZERO ((__m128){0,0,0,0}) #define x264_union128_t x264_union128_sse_t typedef union { __m128 i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_sse_t; +#endif #endif | ||
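The util.h hunk above changes x264_predictor_roundclip to write its rounded, clipped MV candidates into a separate dst array instead of overwriting mvc in place; the inline asm gains a read-only source operand (%6) and a proper "=m" output for dst. A plain-C reference of the operation, as a sketch rather than code from the patch: each quarter-pel candidate is rounded to full-pel with (mv+2)>>2 (arithmetic shift, matching the paddw pw_2 / psraw 2 pair) and clamped to the motion-search range.

    #include <stdint.h>

    static inline int clip3( int v, int lo, int hi )
    {
        return v < lo ? lo : v > hi ? hi : v;
    }

    /* Out-of-place round+clip of quarter-pel MV candidates (C sketch). */
    static void predictor_roundclip_ref( int16_t (*dst)[2], int16_t (*mvc)[2],
                                         int i_mvc,
                                         int mv_x_min, int mv_x_max,
                                         int mv_y_min, int mv_y_max )
    {
        for( int i = 0; i < i_mvc; i++ )
        {
            dst[i][0] = clip3( ( mvc[i][0] + 2 ) >> 2, mv_x_min, mv_x_max );
            dst[i][1] = clip3( ( mvc[i][1] + 2 ) >> 2, mv_y_min, mv_y_max );
        }
    }

The me.c hunk further down consumes this split: the clipped full-pel copies land in the new mvc_fpel array while the untouched mvc entries are still compared against the predicted MV.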
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/common/x86/x86inc.asm ^ |
@@ -32,6 +32,8 @@ ; as this feature might be useful for others as well. Send patches or ideas ; to x264-devel@videolan.org . +%define program_name x264 + %ifdef ARCH_X86_64 %ifidn __OUTPUT_FORMAT__,win32 %define WIN64 @@ -169,7 +171,7 @@ %endrep %endmacro -DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7 +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9 %ifdef ARCH_X86_64 %define gprsize 8 @@ -436,7 +438,7 @@ ; Symbol prefix for C linkage %macro cglobal 1-2+ - %xdefine %1 mangle(%1) + %xdefine %1 mangle(program_name %+ _ %+ %1) %xdefine %1.skip_prologue %1 %+ .skip_prologue %ifidn __OUTPUT_FORMAT__,elf global %1:function hidden @@ -453,10 +455,22 @@ %endmacro %macro cextern 1 + %xdefine %1 mangle(program_name %+ _ %+ %1) + extern %1 +%endmacro + +;like cextern, but without the prefix +%macro cextern_naked 1 %xdefine %1 mangle(%1) extern %1 %endmacro +%macro const 2+ + %xdefine %1 mangle(program_name %+ _ %+ %1) + global %1 + %1: %2 +%endmacro + ; This is needed for ELF, otherwise the GNU linker assumes the stack is ; executable by default. %ifidn __OUTPUT_FORMAT__,elf | ||
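x86inc.asm now centralizes symbol mangling: cglobal, cextern and the new const macro prepend program_name (x264) automatically, which is why quant-a.asm, sad-a.asm and checkasm-a.asm in this revision drop their hand-written x264_ prefixes, and why a genuinely external symbol such as puts needs the new cextern_naked. A C-preprocessor analogue of the idea, illustrative only (the real mechanism is the nasm %xdefine shown above):

    #include <stdint.h>

    #define PROGRAM_NAME  x264
    #define PASTE(a,b)    a ## _ ## b
    #define MANGLE2(p,n)  PASTE(p,n)
    #define MANGLE(name)  MANGLE2(PROGRAM_NAME, name)

    /* Declares x264_quant_4x4_sse2; sources keep writing the short name. */
    extern int MANGLE(quant_4x4_sse2)( int16_t dct[16], uint16_t mf[16],
                                       uint16_t bias[16] );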
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/configure ^ |
@@ -118,7 +118,7 @@ ffms_input="auto" mp4_output="auto" pthread="auto" -asm="yes" +asm="auto" debug="no" gprof="no" pic="no" @@ -154,33 +154,18 @@ --includedir=*) includedir="$optarg" ;; - --enable-asm) - asm="yes" - ;; --disable-asm) asm="no" ;; - --enable-avs-input) - avs_input="auto" - ;; --disable-avs-input) avs_input="no" ;; - --enable-lavf-input) - lavf_input="auto" - ;; --disable-lavf-input) lavf_input="no" ;; - --enable-ffms-input) - ffms_input="auto" - ;; --disable-ffms-input) ffms_input="no" ;; - --enable-mp4-output) - mp4_output="yes" - ;; --disable-mp4-output) mp4_output="no" ;; @@ -193,9 +178,6 @@ --extra-ldflags=*) LDFLAGS="$LDFLAGS ${opt#--extra-ldflags=}" ;; - --enable-pthread) - pthread="auto" # can't skip detection, since it differs by OS - ;; --disable-pthread) pthread="no" ;; @@ -214,8 +196,6 @@ shared="yes" ;; --enable-visualize) - LDFLAGS="$LDFLAGS -L/usr/X11R6/lib -lX11" - define HAVE_VISUALIZE vis="yes" ;; --host=*) @@ -425,7 +405,7 @@ pic="yes" fi -if [ $asm = yes -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then +if [ $asm = auto -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then if ! as_check "lzcnt eax, eax" ; then VER=`($AS --version || echo no assembler) 2>$DEVNULL | head -n 1` echo "Found $VER" @@ -444,7 +424,7 @@ define HAVE_MMX fi -if [ $asm = yes -a $ARCH = ARM ] ; then +if [ $asm = auto -a $ARCH = ARM ] ; then # set flags so neon is built by default echo $CFLAGS | grep -Eq '(-mcpu|-march|-mfpu|-mfloat-abi)' || CFLAGS="$CFLAGS -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp" @@ -460,7 +440,7 @@ fi [ $asm = no ] && AS="" -[ "x$AS" = x ] && asm="no" +[ "x$AS" = x ] && asm="no" || asm="yes" define ARCH_$ARCH define SYS_$SYS @@ -516,6 +496,13 @@ define HAVE_LOG2F fi +if [ "$vis" = "yes" ] && cc_check "X11/Xlib.h" "-L/usr/X11R6/lib -lX11" "XOpenDisplay( 0 );" ; then + LDFLAGS="-L/usr/X11R6/lib -lX11 $LDFLAGS" + define HAVE_VISUALIZE +else + vis="no" +fi + if [ "$lavf_input" = "auto" ] ; then lavf_input="no" if ${cross_prefix}pkg-config --exists libavformat libavcodec libswscale 2>$DEVNULL; then | ||
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/analyse.c ^ |
@@ -27,7 +27,6 @@ #include <unistd.h> #include "common/common.h" -#include "common/cpu.h" #include "macroblock.h" #include "me.h" #include "ratecontrol.h" @@ -2569,15 +2568,11 @@ x264_mb_analysis_t analysis; int i_cost = COST_MAX; - h->mb.i_qp = x264_ratecontrol_qp( h ); - if( h->param.rc.i_aq_mode ) - { - x264_adaptive_quant( h ); - /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB, - * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */ - if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ) - h->mb.i_qp = h->mb.i_last_qp; - } + h->mb.i_qp = x264_ratecontrol_mb_qp( h ); + /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB, + * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */ + if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ) + h->mb.i_qp = h->mb.i_last_qp; x264_mb_analyse_init( h, &analysis, h->mb.i_qp ); | ||
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/cabac.c ^ |
@@ -539,16 +539,16 @@ // node ctx: 0..3: abslevel1 (with abslevelgt1 == 0). // 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter). /* map node ctx => cabac ctx for level=1 */ -static const int coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 }; +static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 }; /* map node ctx => cabac ctx for level>1 */ -static const int coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 }; +static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 }; static const uint8_t coeff_abs_level_transition[2][8] = { /* update node ctx after coding a level=1 */ { 1, 2, 3, 3, 4, 5, 6, 7 }, /* update node ctx after coding a level>1 */ { 4, 4, 4, 4, 5, 6, 7, 7 } }; -static const int count_cat_m1[5] = {15, 14, 15, 3, 14}; +static const uint8_t count_cat_m1[5] = {15, 14, 15, 3, 14}; #if !RDO_SKIP_BS static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int16_t *l ) @@ -736,13 +736,13 @@ } #endif -#define block_residual_write_cabac_cbf( h, cb, i_ctxBlockCat, i_idx, l, b_intra ) \ -{ \ - int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx, b_intra ); \ +#define block_residual_write_cabac_cbf( h, cb, i_ctxBlockCat, i_idx, l, b_intra )\ +{\ + int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx, b_intra );\ if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )\ {\ x264_cabac_encode_decision( cb, ctxidxinc, 1 );\ - block_residual_write_cabac( h, cb, i_ctxBlockCat, l ); \ + block_residual_write_cabac( h, cb, i_ctxBlockCat, l );\ }\ else\ x264_cabac_encode_decision( cb, ctxidxinc, 0 );\ | ||
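The cabac.c hunk above narrows several context-mapping tables from int to uint8_t; every value fits in a byte, so the tables shrink to a quarter of their size and more of the coder's working set stays in cache. A sketch of how the level-coding tables are consumed (the helper name is invented; the transition values are the ones in the hunk):

    #include <stdint.h>

    /* node_ctx remembers whether a level > 1 has been coded yet in this
     * block; each coded level advances it through the transition table. */
    static int next_node_ctx( const uint8_t transition[2][8],
                              int node_ctx, int level )
    {
        return transition[ level > 1 ][ node_ctx ];
    }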
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/cavlc.c ^ |
@@ -117,7 +117,7 @@ { bs_t *s = &h->out.bs; static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0}; - static const int count_cat[5] = {16, 15, 16, 4, 15}; + static const uint8_t count_cat[5] = {16, 15, 16, 4, 15}; x264_run_level_t runlevel; int i_trailing, i_total_zero, i_suffix_length; int i_total = 0; @@ -172,7 +172,7 @@ } } - if( i_total < count_cat[i_ctxBlockCat] ) + if( (uint8_t)i_total < count_cat[i_ctxBlockCat] ) { if( i_ctxBlockCat == DCT_CHROMA_DC ) bs_write_vlc( s, x264_total_zeros_dc[i_total-1][i_total_zero] ); | ||
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/encoder.c ^ |
@@ -25,7 +25,6 @@ #include <math.h> #include "common/common.h" -#include "common/cpu.h" #include "set.h" #include "analyse.h" @@ -356,9 +355,15 @@ static int x264_validate_parameters( x264_t *h ) { #ifdef HAVE_MMX +#ifdef __SSE__ if( !(x264_cpu_detect() & X264_CPU_SSE) ) { x264_log( h, X264_LOG_ERROR, "your cpu does not support SSE1, but x264 was compiled with asm support\n"); +#else + if( !(x264_cpu_detect() & X264_CPU_MMXEXT) ) + { + x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm support\n"); +#endif x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm support (configure --disable-asm)\n"); return -1; } @@ -1885,24 +1890,27 @@ x264_macroblock_cache_save( h ); /* accumulate mb stats */ - h->stat.frame.i_mb_count[h->mb.i_type]++; int b_intra = IS_INTRA( h->mb.i_type ); - if( !b_intra && !IS_SKIP( h->mb.i_type ) && !IS_DIRECT( h->mb.i_type ) ) + if( h->param.i_log_level >= X264_LOG_INFO || h->param.rc.b_stat_write ) { - if( h->mb.i_partition != D_8x8 ) - h->stat.frame.i_mb_partition[h->mb.i_partition] += 4; - else - for( int i = 0; i < 4; i++ ) - h->stat.frame.i_mb_partition[h->mb.i_sub_partition[i]] ++; - if( h->param.i_frame_reference > 1 ) - for( int i_list = 0; i_list <= (h->sh.i_type == SLICE_TYPE_B); i_list++ ) - for( int i = 0; i < 4; i++ ) - { - int i_ref = h->mb.cache.ref[i_list][ x264_scan8[4*i] ]; - if( i_ref >= 0 ) - h->stat.frame.i_mb_count_ref[i_list][i_ref] ++; - } + h->stat.frame.i_mb_count[h->mb.i_type]++; + if( !b_intra && !IS_SKIP( h->mb.i_type ) && !IS_DIRECT( h->mb.i_type ) ) + { + if( h->mb.i_partition != D_8x8 ) + h->stat.frame.i_mb_partition[h->mb.i_partition] += 4; + else + for( int i = 0; i < 4; i++ ) + h->stat.frame.i_mb_partition[h->mb.i_sub_partition[i]] ++; + if( h->param.i_frame_reference > 1 ) + for( int i_list = 0; i_list <= (h->sh.i_type == SLICE_TYPE_B); i_list++ ) + for( int i = 0; i < 4; i++ ) + { + int i_ref = h->mb.cache.ref[i_list][ x264_scan8[4*i] ]; + if( i_ref >= 0 ) + h->stat.frame.i_mb_count_ref[i_list][i_ref] ++; + } + } } if( h->param.i_log_level >= X264_LOG_INFO ) @@ -2058,6 +2066,10 @@ static int x264_threaded_slices_write( x264_t *h ) { void *ret = NULL; +#ifdef HAVE_MMX + if( h->param.cpu&X264_CPU_SSE_MISALIGN ) + x264_cpu_mask_misalign_sse(); +#endif /* set first/last mb and sync contexts */ for( int i = 0; i < h->param.i_threads; i++ ) { @@ -2095,7 +2107,11 @@ /* Go back and fix up the hpel on the borders between slices. 
*/ for( int i = 1; i < h->param.i_threads; i++ ) + { x264_fdec_filter_row( h->thread[i], h->thread[i]->i_threadslice_start + 1, 0 ); + if( h->sh.b_mbaff ) + x264_fdec_filter_row( h->thread[i], h->thread[i]->i_threadslice_start + 2, 0 ); + } x264_threads_merge_ratecontrol( h ); @@ -2119,6 +2135,12 @@ return 0; } +void x264_encoder_intra_refresh( x264_t *h ) +{ + h = h->thread[h->i_thread_phase]; + h->b_queued_intra_refresh = 1; +} + /**************************************************************************** * x264_encoder_encode: * XXX: i_poc : is the poc of the current given picture @@ -2363,25 +2385,34 @@ h->i_nal_type = i_nal_type; h->i_nal_ref_idc = i_nal_ref_idc; - if( h->param.b_intra_refresh && h->fenc->i_type == X264_TYPE_P ) + if( h->param.b_intra_refresh ) { - int pocdiff = (h->fdec->i_poc - h->fref0[0]->i_poc)/2; - float increment = X264_MAX( ((float)h->sps->i_mb_width-1) / h->param.i_keyint_max, 1 ); - int max_position = (int)(increment * h->param.i_keyint_max); - if( IS_X264_TYPE_I( h->fref0[0]->i_type ) ) - h->fdec->f_pir_position = 0; - else + if( IS_X264_TYPE_I( h->fenc->i_type ) ) + { + h->fdec->i_frames_since_pir = 0; + h->b_queued_intra_refresh = 0; + /* PIR is currently only supported with ref == 1, so any intra frame effectively refreshes + * the whole frame and counts as an intra refresh. */ + h->fdec->f_pir_position = h->sps->i_mb_width; + } + else if( h->fenc->i_type == X264_TYPE_P ) { + int pocdiff = (h->fdec->i_poc - h->fref0[0]->i_poc)/2; + float increment = X264_MAX( ((float)h->sps->i_mb_width-1) / h->param.i_keyint_max, 1 ); h->fdec->f_pir_position = h->fref0[0]->f_pir_position; - if( h->fdec->f_pir_position+0.5 >= max_position ) + h->fdec->i_frames_since_pir = h->fref0[0]->i_frames_since_pir + pocdiff; + if( h->fdec->i_frames_since_pir >= h->param.i_keyint_max || + (h->b_queued_intra_refresh && h->fdec->f_pir_position + 0.5 >= h->sps->i_mb_width) ) { h->fdec->f_pir_position = 0; + h->fdec->i_frames_since_pir = 0; + h->b_queued_intra_refresh = 0; h->fenc->b_keyframe = 1; } + h->fdec->i_pir_start_col = h->fdec->f_pir_position+0.5; + h->fdec->f_pir_position += increment * pocdiff; + h->fdec->i_pir_end_col = h->fdec->f_pir_position+0.5; } - h->fdec->i_pir_start_col = h->fdec->f_pir_position+0.5; - h->fdec->f_pir_position += increment * pocdiff; - h->fdec->i_pir_end_col = h->fdec->f_pir_position+0.5; } if( h->fenc->b_keyframe ) @@ -2789,8 +2820,8 @@ /* Slices used and PSNR */ for( int i = 0; i < 5; i++ ) { - static const int slice_order[] = { SLICE_TYPE_I, SLICE_TYPE_SI, SLICE_TYPE_P, SLICE_TYPE_SP, SLICE_TYPE_B }; - static const char *slice_name[] = { "P", "B", "I", "SP", "SI" }; + static const uint8_t slice_order[] = { SLICE_TYPE_I, SLICE_TYPE_SI, SLICE_TYPE_P, SLICE_TYPE_SP, SLICE_TYPE_B }; + static const char * const slice_name[] = { "P", "B", "I", "SP", "SI" }; int i_slice = slice_order[i]; if( h->stat.i_frame_count[i_slice] > 0 ) | ||
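Alongside the stats gating and the MBAFF hpel fixup, encoder.c gains x264_encoder_intra_refresh: callers can queue a periodic-intra-refresh sweep (b_queued_intra_refresh) on demand, and i_keyint_max now bounds the frames between sweeps via i_frames_since_pir. A hypothetical caller-side use, assuming the matching declaration in x264.h; the callback name and the loss-detection wiring are invented for illustration:

    #include <x264.h>

    /* On receiver-reported packet loss, start an intra-refresh sweep
     * instead of forcing a full IDR (requires param.b_intra_refresh). */
    static void on_receiver_packet_loss( x264_t *encoder )
    {
        x264_encoder_intra_refresh( encoder );  /* applies to a future frame */
    }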
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/lookahead.c ^ |
@@ -35,7 +35,6 @@ * # of bframes + # of threads. */ #include "common/common.h" -#include "common/cpu.h" #include "analyse.h" static void x264_lookahead_shift( x264_synch_frame_list_t *dst, x264_synch_frame_list_t *src, int count ) | ||
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/macroblock.c ^ |
@@ -458,10 +458,10 @@ static void x264_macroblock_encode_skip( x264_t *h ) { - h->mb.i_cbp_luma = 0x00; - h->mb.i_cbp_chroma = 0x00; - memset( h->mb.cache.non_zero_count, 0, sizeof( h->mb.cache.non_zero_count ) ); - /* store cbp */ + for( int i = 0; i < sizeof( h->mb.cache.non_zero_count ); i += 16 ) + M128( &h->mb.cache.non_zero_count[i] ) = M128_ZERO; + h->mb.i_cbp_luma = 0; + h->mb.i_cbp_chroma = 0; h->mb.cbp[h->mb.i_mb_xy] = 0; } | ||
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/me.c ^ |
@@ -48,8 +48,8 @@ /* (x-1)%6 */ static const uint8_t mod6m1[8] = {5,0,1,2,3,4,5,0}; /* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */ -static const int hex2[8][2] = {{-1,-2}, {-2,0}, {-1,2}, {1,2}, {2,0}, {1,-2}, {-1,-2}, {-2,0}}; -static const int square1[9][2] = {{0,0}, {0,-1}, {0,1}, {-1,0}, {1,0}, {-1,-1}, {-1,1}, {1,-1}, {1,1}}; +static const int8_t hex2[8][2] = {{-1,-2}, {-2,0}, {-1,2}, {1,2}, {2,0}, {1,-2}, {-1,-2}, {-2,0}}; +static const int8_t square1[9][2] = {{0,0}, {0,-1}, {0,1}, {-1,0}, {1,0}, {-1,-1}, {-1,1}, {1,-1}, {1,1}}; static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel ); @@ -245,14 +245,15 @@ pmv = pack16to32_mask( bmx, bmy ); if( i_mvc > 0 ) { - x264_predictor_roundclip( mvc, i_mvc, mv_x_min, mv_x_max, mv_y_min, mv_y_max ); + ALIGNED_ARRAY_8( int16_t, mvc_fpel,[16][2] ); + x264_predictor_roundclip( mvc_fpel, mvc, i_mvc, mv_x_min, mv_x_max, mv_y_min, mv_y_max ); bcost <<= 4; for( int i = 1; i <= i_mvc; i++ ) { - if( M32( mvc[i-1] ) && (pmv != M32( mvc[i-1] )) ) + if( M32( mvc_fpel[i-1] ) && (pmv != M32( mvc[i-1] )) ) { - int mx = mvc[i-1][0]; - int my = mvc[i-1][1]; + int mx = mvc_fpel[i-1][0]; + int my = mvc_fpel[i-1][1]; int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[my*stride+mx], stride ) + BITS_MVD( mx, my ); cost = (cost << 4) + i; COPY1_IF_LT( bcost, cost ); @@ -260,8 +261,8 @@ } if( bcost&15 ) { - bmx = mvc[(bcost&15)-1][0]; - bmy = mvc[(bcost&15)-1][1]; + bmx = mvc_fpel[(bcost&15)-1][0]; + bmy = mvc_fpel[(bcost&15)-1][1]; } bcost >>= 4; } @@ -376,7 +377,7 @@ /* Uneven-cross Multi-Hexagon-grid Search * as in JM, except with different early termination */ - static const int x264_pixel_size_shift[7] = { 0, 1, 1, 2, 3, 3, 4 }; + static const uint8_t x264_pixel_size_shift[7] = { 0, 1, 1, 2, 3, 3, 4 }; int ucost1, ucost2; int cross_start = 1; @@ -423,7 +424,7 @@ /* range multipliers based on casual inspection of some statistics of * average distance between current predictor and final mv found by ESA. * these have not been tuned much by actual encoding. */ - static const int range_mul[4][4] = + static const uint8_t range_mul[4][4] = { { 3, 3, 4, 4 }, { 3, 4, 4, 4 }, @@ -467,7 +468,7 @@ : mvd < 20*denom ? 1 : mvd < 40*denom ? 2 : 3; - i_me_range = i_me_range * range_mul[mvd_ctx][sad_ctx] / 4; + i_me_range = i_me_range * range_mul[mvd_ctx][sad_ctx] >> 2; } /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy. 
@@ -483,7 +484,7 @@ int i = 1; do { - static const int hex4[16][2] = { + static const int8_t hex4[16][2] = { { 0,-4}, { 0, 4}, {-2,-3}, { 2,-3}, {-4,-2}, { 4,-2}, {-4,-1}, { 4,-1}, {-4, 0}, { 4, 0}, {-4, 1}, { 4, 1}, @@ -657,7 +658,7 @@ bsad += ycost; } - limit = i_me_range / 2; + limit = i_me_range >> 1; sad_thresh = bsad*sad_thresh>>3; while( nmvsad > limit*2 && sad_thresh > bsad ) { @@ -913,14 +914,14 @@ m->cost_mv = p_cost_mvx[bmx] + p_cost_mvy[bmy]; } -#define BIME_CACHE( dx, dy, list ) \ -{ \ +#define BIME_CACHE( dx, dy, list )\ +{\ x264_me_t *m = m##list;\ - int i = 4 + 3*dx + dy; \ + int i = 4 + 3*dx + dy;\ int mvx = bm##list##x+dx;\ int mvy = bm##list##y+dy;\ stride[list][i] = bw;\ - src[list][i] = h->mc.get_ref( pixy_buf[list][i], &stride[list][i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh, weight_none ); \ + src[list][i] = h->mc.get_ref( pixy_buf[list][i], &stride[list][i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh, weight_none );\ if( rd )\ {\ h->mc.mc_chroma( pixu_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\ @@ -1106,11 +1107,11 @@ { \ uint64_t cost; \ M32( cache_mv ) = pack16to32_mask(mx,my); \ - if( m->i_pixel <= PIXEL_8x8 )\ - {\ - h->mc.mc_chroma( pixu, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\ - h->mc.mc_chroma( pixv, FDEC_STRIDE, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\ - }\ + if( m->i_pixel <= PIXEL_8x8 ) \ + { \ + h->mc.mc_chroma( pixu, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \ + h->mc.mc_chroma( pixv, FDEC_STRIDE, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \ + } \ cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \ COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \ } \ | ||
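Two things happen in the me.c candidate loop above: the round-clipped candidates now go into the separate aligned mvc_fpel array, so the original quarter-pel mvc entries survive for the pmv != M32( mvc[i-1] ) duplicate check, and the candidate index is packed into the low 4 bits of the left-shifted cost so a single compare tracks cost and winner together. A sketch of that packing idiom with invented inputs:

    /* Returns the 1-based index of the cheapest candidate, or 0 if none
     * beats bcost; ties keep the earlier candidate, since a larger index
     * makes the packed value strictly larger. Requires n <= 15. */
    static int best_candidate( int bcost, const int *cand_cost, int n )
    {
        int best = bcost << 4;               /* index 0 = keep current MV */
        for( int i = 1; i <= n; i++ )
        {
            int packed = ( cand_cost[i-1] << 4 ) + i;
            if( packed < best )
                best = packed;
        }
        return best & 15;
    }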
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/ratecontrol.c ^ |
@@ -29,7 +29,6 @@ #include <math.h> #include "common/common.h" -#include "common/cpu.h" #include "ratecontrol.h" #include "me.h" @@ -84,8 +83,7 @@ /* current frame */ ratecontrol_entry_t *rce; int qp; /* qp for current frame */ - int qpm; /* qp for current macroblock */ - float f_qpm; /* qp for current macroblock: precise float for AQ */ + float qpm; /* qp for current macroblock: precise float for AQ */ float qpa_rc; /* average of macroblocks' qp before aq */ float qpa_aq; /* average of macroblocks' qp after aq */ float qp_novbv; /* QP for the current frame if 1-pass VBV was disabled. */ @@ -293,22 +291,6 @@ } } - -/***************************************************************************** -* x264_adaptive_quant: - * adjust macroblock QP based on variance (AC energy) of the MB. - * high variance = higher QP - * low variance = lower QP - * This generally increases SSIM and lowers PSNR. -*****************************************************************************/ -void x264_adaptive_quant( x264_t *h ) -{ - x264_emms(); - /* MB-tree currently doesn't adjust quantizers in unreferenced frames. */ - float qp_offset = h->fdec->b_kept_as_ref ? h->fenc->f_qp_offset[h->mb.i_mb_xy] : h->fenc->f_qp_offset_aq[h->mb.i_mb_xy]; - h->mb.i_qp = x264_clip3( h->rc->f_qpm + qp_offset + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max ); -} - int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame ) { x264_ratecontrol_t *rc = h->rc; @@ -669,7 +651,7 @@ return -1; } - CMP_OPT_FIRST_PASS( "wpredp", X264_MAX( 0, h->param.analyse.i_weighted_pred ) ); + CMP_OPT_FIRST_PASS( "weightp", X264_MAX( 0, h->param.analyse.i_weighted_pred ) ); CMP_OPT_FIRST_PASS( "bframes", h->param.i_bframe ); CMP_OPT_FIRST_PASS( "b_pyramid", h->param.i_bframe_pyramid ); CMP_OPT_FIRST_PASS( "intra_refresh", h->param.b_intra_refresh ); @@ -1180,28 +1162,27 @@ rc->qpa_rc = rc->qpa_aq = 0; - rc->qpm = rc->qp = x264_clip3( (int)(q + 0.5), 0, 51 ); h->fdec->f_qp_avg_rc = h->fdec->f_qp_avg_aq = - rc->f_qpm = q; + rc->qpm = q; if( rce ) rce->new_qp = rc->qp; - accum_p_qp_update( h, rc->f_qpm ); + accum_p_qp_update( h, rc->qpm ); if( h->sh.i_type != SLICE_TYPE_B ) rc->last_non_b_pict_type = h->sh.i_type; } -static double predict_row_size( x264_t *h, int y, int qp ) +static double predict_row_size( x264_t *h, int y, double qp ) { /* average between two predictors: * absolute SATD, and scaled bit cost of the colocated row in the previous frame */ x264_ratecontrol_t *rc = h->rc; double pred_s = predict_size( rc->row_pred[0], qp2qscale( qp ), h->fdec->i_row_satd[y] ); double pred_t = 0; - if( h->sh.i_type == SLICE_TYPE_I || qp >= h->fref0[0]->i_row_qp[y] ) + if( h->sh.i_type == SLICE_TYPE_I || qp >= h->fref0[0]->f_row_qp[y] ) { if( h->sh.i_type == SLICE_TYPE_P && h->fref0[0]->i_type == h->fdec->i_type @@ -1209,7 +1190,7 @@ && (abs(h->fref0[0]->i_row_satd[y] - h->fdec->i_row_satd[y]) < h->fdec->i_row_satd[y]/2)) { pred_t = h->fref0[0]->i_row_bits[y] * h->fdec->i_row_satd[y] / h->fref0[0]->i_row_satd[y] - * qp2qscale( h->fref0[0]->i_row_qp[y] ) / qp2qscale( qp ); + * qp2qscale( h->fref0[0]->f_row_qp[y] ) / qp2qscale( qp ); } if( pred_t == 0 ) pred_t = pred_s; @@ -1232,7 +1213,7 @@ return bits; } -static double predict_row_size_sum( x264_t *h, int y, int qp ) +static double predict_row_size_sum( x264_t *h, int y, double qp ) { double bits = row_bits_so_far(h, y); for( int i = y+1; i < h->i_threadslice_end; i++ ) @@ -1249,33 +1230,34 @@ x264_emms(); h->fdec->i_row_bits[y] += bits; - rc->qpa_rc += rc->f_qpm; + rc->qpa_rc += rc->qpm; rc->qpa_aq += 
h->mb.i_qp; if( h->mb.i_mb_x != h->sps->i_mb_width - 1 || !rc->b_vbv ) return; - h->fdec->i_row_qp[y] = rc->qpm; + h->fdec->f_row_qp[y] = rc->qpm; update_predictor( rc->row_pred[0], qp2qscale( rc->qpm ), h->fdec->i_row_satd[y], h->fdec->i_row_bits[y] ); - if( h->sh.i_type == SLICE_TYPE_P && rc->qpm < h->fref0[0]->i_row_qp[y] ) + if( h->sh.i_type == SLICE_TYPE_P && rc->qpm < h->fref0[0]->f_row_qp[y] ) update_predictor( rc->row_pred[1], qp2qscale( rc->qpm ), h->fdec->i_row_satds[0][0][y], h->fdec->i_row_bits[y] ); /* tweak quality based on difference from predicted size */ if( y < h->i_threadslice_end-1 ) { - int prev_row_qp = h->fdec->i_row_qp[y]; - int i_qp_min = X264_MAX( prev_row_qp - h->param.rc.i_qp_step, h->param.rc.i_qp_min ); - int i_qp_absolute_max = h->param.rc.i_qp_max; + float prev_row_qp = h->fdec->f_row_qp[y]; + float qp_min = X264_MAX( prev_row_qp - h->param.rc.i_qp_step, h->param.rc.i_qp_min ); + float qp_absolute_max = h->param.rc.i_qp_max; if( rc->rate_factor_max_increment ) - i_qp_absolute_max = X264_MIN( i_qp_absolute_max, rc->qp_novbv + rc->rate_factor_max_increment ); - int i_qp_max = X264_MIN( prev_row_qp + h->param.rc.i_qp_step, i_qp_absolute_max ); + qp_absolute_max = X264_MIN( qp_absolute_max, rc->qp_novbv + rc->rate_factor_max_increment ); + float qp_max = X264_MIN( prev_row_qp + h->param.rc.i_qp_step, qp_absolute_max ); + float step_size = 0.5; /* B-frames shouldn't use lower QP than their reference frames. */ if( h->sh.i_type == SLICE_TYPE_B ) { - i_qp_min = X264_MAX( i_qp_min, X264_MAX( h->fref0[0]->i_row_qp[y+1], h->fref1[0]->i_row_qp[y+1] ) ); - rc->qpm = X264_MAX( rc->qpm, i_qp_min ); + qp_min = X264_MAX( qp_min, X264_MAX( h->fref0[0]->f_row_qp[y+1], h->fref1[0]->f_row_qp[y+1] ) ); + rc->qpm = X264_MAX( rc->qpm, qp_min ); } float buffer_left_planned = rc->buffer_fill - rc->frame_size_planned; @@ -1303,45 +1285,53 @@ rc_tol /= 2; if( !rc->b_vbv_min_rate ) - i_qp_min = X264_MAX( i_qp_min, h->sh.i_qp ); + qp_min = X264_MAX( qp_min, rc->qp_novbv ); - while( rc->qpm < i_qp_max + while( rc->qpm < qp_max && ((b1 > rc->frame_size_planned + rc_tol) || (rc->buffer_fill - b1 < buffer_left_planned * 0.5) || (b1 > rc->frame_size_planned && rc->qpm < rc->qp_novbv)) ) { - rc->qpm ++; + rc->qpm += step_size; b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices; } - while( rc->qpm > i_qp_min - && (rc->qpm > h->fdec->i_row_qp[0] || rc->single_frame_vbv) + while( rc->qpm > qp_min + && (rc->qpm > h->fdec->f_row_qp[0] || rc->single_frame_vbv) && ((b1 < rc->frame_size_planned * 0.8 && rc->qpm <= prev_row_qp) || b1 < (rc->buffer_fill - rc->buffer_size + rc->buffer_rate) * 1.1) ) { - rc->qpm --; + rc->qpm -= step_size; b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices; } /* avoid VBV underflow or MinCR violation */ - while( (rc->qpm < i_qp_absolute_max) + while( (rc->qpm < qp_absolute_max) && ((rc->buffer_fill - b1 < rc->buffer_rate * rc->max_frame_error) || (rc->frame_size_maximum - b1 < rc->frame_size_maximum * rc->max_frame_error))) { - rc->qpm ++; + rc->qpm += step_size; b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices; } h->rc->frame_size_estimated = predict_row_size_sum( h, y, rc->qpm ); } - - /* loses the fractional part of the frame-wise qp */ - rc->f_qpm = rc->qpm; } int x264_ratecontrol_qp( x264_t *h ) { - return h->rc->qpm; + x264_emms(); + return x264_clip3( h->rc->qpm + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max ); +} + +int x264_ratecontrol_mb_qp( x264_t *h ) +{ + x264_emms(); + float qp = h->rc->qpm; + if( 
h->param.rc.i_aq_mode ) + /* MB-tree currently doesn't adjust quantizers in unreferenced frames. */ + qp += h->fdec->b_kept_as_ref ? h->fenc->f_qp_offset[h->mb.i_mb_xy] : h->fenc->f_qp_offset_aq[h->mb.i_mb_xy]; + return x264_clip3( qp + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max ); } /* In 2pass, force the same frame types as in the 1st pass */ | ||
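The ratecontrol rewrite above collapses the integer/float qpm pair into a single float, steps the row-level VBV loop in 0.5-QP increments instead of whole QPs, and replaces x264_adaptive_quant with x264_ratecontrol_mb_qp, so the AQ offset is folded in and rounded only at the moment a macroblock needs its integer QP. A simplified sketch of that final step (the helper name is made up; the clipping and rounding follow the patch):

    static int mb_qp_from_float( float qpm, float aq_offset,
                                 int qp_min, int qp_max )
    {
        float qp = qpm + aq_offset;   /* AQ applied here, no longer in analyse.c */
        int iqp = (int)( qp + 0.5f ); /* round once, at the last moment */
        return iqp < qp_min ? qp_min : iqp > qp_max ? qp_max : iqp;
    }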
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/ratecontrol.h ^ |
@@ -30,7 +30,6 @@ void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init ); void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame ); -void x264_adaptive_quant( x264_t * ); int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame ); int x264_reference_build_list_optimal( x264_t *h ); void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next ); @@ -39,6 +38,7 @@ void x264_ratecontrol_set_weights( x264_t *h, x264_frame_t *frm ); void x264_ratecontrol_mb( x264_t *, int bits ); int x264_ratecontrol_qp( x264_t * ); +int x264_ratecontrol_mb_qp( x264_t *h ); int x264_ratecontrol_end( x264_t *, int bits, int *filler ); void x264_ratecontrol_summary( x264_t * ); void x264_ratecontrol_set_estimated_size( x264_t *, int bits ); | ||
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/rdo.c ^ |
@@ -50,6 +50,8 @@ * fractional bits, but only finite precision. */ #undef x264_cabac_encode_decision #undef x264_cabac_encode_decision_noup +#undef x264_cabac_encode_bypass +#undef x264_cabac_encode_terminal #define x264_cabac_encode_decision(c,x,v) x264_cabac_size_decision(c,x,v) #define x264_cabac_encode_decision_noup(c,x,v) x264_cabac_size_decision_noup(c,x,v) #define x264_cabac_encode_terminal(c) ((c)->f8_bits_encoded += 7) @@ -438,10 +440,13 @@ if( i < b_ac ) { - /* We only need to memset an empty 4x4 block. 8x8 can be + /* We only need to zero an empty 4x4 block. 8x8 can be implicitly emptied via zero nnz, as can dc. */ if( i_coefs == 16 && !dc ) - memset( dct, 0, 16 * sizeof(int16_t) ); + { + M128( &dct[0] ) = M128_ZERO; + M128( &dct[8] ) = M128_ZERO; + } return 0; } @@ -608,7 +613,10 @@ if( bnode == &nodes_cur[0] ) { if( i_coefs == 16 && !dc ) - memset( dct, 0, 16 * sizeof(int16_t) ); + { + M128( &dct[0] ) = M128_ZERO; + M128( &dct[8] ) = M128_ZERO; + } return 0; } | ||
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/set.c ^ |
@@ -315,26 +315,22 @@ if( sps->vui.b_aspect_ratio_info_present ) { int i; - static const struct { int w, h; int sar; } sar[] = + static const struct { uint8_t w, h, sar; } sar[] = { { 1, 1, 1 }, { 12, 11, 2 }, { 10, 11, 3 }, { 16, 11, 4 }, { 40, 33, 5 }, { 24, 11, 6 }, { 20, 11, 7 }, { 32, 11, 8 }, { 80, 33, 9 }, { 18, 11, 10}, { 15, 11, 11}, { 64, 33, 12}, - { 160,99, 13}, { 0, 0, -1 } + { 160,99, 13}, { 0, 0, 255 } }; - for( i = 0; sar[i].sar != -1; i++ ) + for( i = 0; sar[i].sar != 255; i++ ) { if( sar[i].w == sps->vui.i_sar_width && sar[i].h == sps->vui.i_sar_height ) break; } - if( sar[i].sar != -1 ) + bs_write( s, 8, sar[i].sar ); + if( sar[i].sar == 255 ) /* aspect_ratio_idc (extended) */ { - bs_write( s, 8, sar[i].sar ); - } - else - { - bs_write( s, 8, 255); /* aspect_ratio_idc (extended) */ bs_write( s, 16, sps->vui.i_sar_width ); bs_write( s, 16, sps->vui.i_sar_height ); } | ||
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/slicetype.c ^ |
@@ -25,7 +25,6 @@ #include <math.h> #include "common/common.h" -#include "common/cpu.h" #include "macroblock.h" #include "me.h" @@ -382,21 +381,23 @@ /* Reverse-order MV prediction. */ M32( mvc[0] ) = 0; - M32( mvc[1] ) = 0; M32( mvc[2] ) = 0; #define MVC(mv) { CP32( mvc[i_mvc], mv ); i_mvc++; } if( i_mb_x < h->sps->i_mb_width - 1 ) - MVC(fenc_mv[1]); + MVC( fenc_mv[1] ); if( i_mb_y < h->sps->i_mb_height - 1 ) { - MVC(fenc_mv[i_mb_stride]); + MVC( fenc_mv[i_mb_stride] ); if( i_mb_x > 0 ) - MVC(fenc_mv[i_mb_stride-1]); + MVC( fenc_mv[i_mb_stride-1] ); if( i_mb_x < h->sps->i_mb_width - 1 ) - MVC(fenc_mv[i_mb_stride+1]); + MVC( fenc_mv[i_mb_stride+1] ); } #undef MVC - x264_median_mv( m[l].mvp, mvc[0], mvc[1], mvc[2] ); + if( i_mvc <= 1 ) + CP32( m[l].mvp, mvc[0] ); + else + x264_median_mv( m[l].mvp, mvc[0], mvc[1], mvc[2] ); x264_me_search( h, &m[l], mvc, i_mvc ); m[l].cost -= 2; // remove mvcost from skip mbs @@ -416,10 +417,6 @@ if( b_bidir && ( M32( m[0].mv ) || M32( m[1].mv ) ) ) TRY_BIDIR( m[0].mv, m[1].mv, 5 ); - /* Store to width-2 bitfield. */ - frames[b]->lowres_inter_types[b-p0][p1-b][i_mb_xy>>2] &= ~(3<<((i_mb_xy&3)*2)); - frames[b]->lowres_inter_types[b-p0][p1-b][i_mb_xy>>2] |= list_used<<((i_mb_xy&3)*2); - lowres_intra_mb: if( !fenc->b_intra_calculated ) { @@ -481,7 +478,10 @@ int i_icost = fenc->i_intra_cost[i_mb_xy]; int b_intra = i_icost < i_bcost; if( b_intra ) + { i_bcost = i_icost; + list_used = 0; + } if( b_frame_score_mb ) fenc->i_intra_mbs[b-p0] += b_intra; } @@ -501,7 +501,7 @@ } } - fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = i_bcost; + fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = i_bcost + (list_used << LOWRES_COST_SHIFT); } #undef TRY_BIDIR @@ -615,7 +615,7 @@ for( h->mb.i_mb_x = h->sps->i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- ) { int i_mb_xy = h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride; - int i_mb_cost = frames[b]->lowres_costs[b-p0][p1-b][i_mb_xy]; + int i_mb_cost = frames[b]->lowres_costs[b-p0][p1-b][i_mb_xy] & LOWRES_COST_MASK; float qp_adj = qp_offset[i_mb_xy]; i_mb_cost = (i_mb_cost * x264_exp2fix8(qp_adj) + 128) >> 8; row_satd[ h->mb.i_mb_y ] += i_mb_cost; @@ -681,7 +681,7 @@ if( propagate_amount > 0 ) { /* Access width-2 bitfield. */ - int lists_used = (frames[b]->lowres_inter_types[b-p0][p1-b][mb_index>>2] >> ((mb_index&3)*2))&3; + int lists_used = frames[b]->lowres_costs[b-p0][p1-b][mb_index] >> LOWRES_COST_SHIFT; /* Follow the MVs to the previous frame(s). */ for( int list = 0; list < 2; list++ ) if( (lists_used >> list)&1 ) @@ -1490,7 +1490,7 @@ for( int x = h->fdec->i_pir_start_col; x <= h->fdec->i_pir_end_col; x++, mb_xy++ ) { int intra_cost = (h->fenc->i_intra_cost[mb_xy] * ip_factor + 128) >> 8; - int inter_cost = h->fenc->lowres_costs[b-p0][p1-b][mb_xy]; + int inter_cost = h->fenc->lowres_costs[b-p0][p1-b][mb_xy] & LOWRES_COST_MASK; int diff = intra_cost - inter_cost; if( h->param.rc.i_aq_mode ) h->fdec->i_row_satd[y] += (diff * frames[b]->i_inv_qscale_factor[mb_xy] + 128) >> 8; | ||
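slicetype.c merges the old width-2 lowres_inter_types bitfield into lowres_costs: the lists-used flags now ride in the top bits of each cost word via LOWRES_COST_SHIFT/LOWRES_COST_MASK, and an intra win clears them (list_used = 0). The shift value itself is defined in a header outside this diff; 14 below is an assumption that leaves two flag bits for the two reference lists:

    #define LOWRES_COST_SHIFT 14                      /* assumed value */
    #define LOWRES_COST_MASK  ((1 << LOWRES_COST_SHIFT) - 1)

    static unsigned pack_lowres( int cost, int lists_used )
    {
        return ( lists_used << LOWRES_COST_SHIFT ) | ( cost & LOWRES_COST_MASK );
    }
    /* Readers split it back out, as the hunks above do:
     *   cost  = v & LOWRES_COST_MASK;
     *   lists = v >> LOWRES_COST_SHIFT;             */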
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/input/avs.c ^ |
@@ -45,7 +45,7 @@ /* maximum size of the sequence of filters to try on non script files */ #define AVS_MAX_SEQUENCE 5 -#define LOAD_AVS_FUNC(name, continue_on_fail) \ +#define LOAD_AVS_FUNC(name, continue_on_fail)\ {\ h->func.name = (void*)GetProcAddress( h->library, #name );\ if( !continue_on_fail && !h->func.name )\ | ||
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/input/y4m.c ^ |
@@ -104,13 +104,21 @@ case 'I': /* Interlace type */ switch( *tokstart++ ) { - case 'p': break; - case '?': case 't': + info->interlaced = 1; + info->tff = 1; + break; case 'b': + info->interlaced = 1; + info->tff = 0; + break; case 'm': - default: info->interlaced = 1; + break; + //case '?': + //case 'p': + default: + break; } break; case 'F': /* Frame rate - 0:0 if unknown */ | ||
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/tools/checkasm-a.asm ^ |
@@ -43,7 +43,7 @@ SECTION .text -cextern puts +cextern_naked puts ; max number of args used by any x264 asm function. ; (max_args % 4) must equal 3 for stack alignment @@ -54,7 +54,7 @@ ;----------------------------------------------------------------------------- ; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... ) ;----------------------------------------------------------------------------- -cglobal x264_checkasm_call, 4,7,16 +cglobal checkasm_call, 4,7,16 sub rsp, max_args*8 %assign stack_offset stack_offset+max_args*8 mov r6, r0 @@ -113,7 +113,7 @@ ;----------------------------------------------------------------------------- ; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... ) ;----------------------------------------------------------------------------- -cglobal x264_checkasm_call, 1,7 +cglobal checkasm_call, 1,7 mov r3, n3 mov r4, n4 mov r5, n5 @@ -147,7 +147,7 @@ ;----------------------------------------------------------------------------- ; int x264_stack_pagealign( int (*func)(), int align ) ;----------------------------------------------------------------------------- -cglobal x264_stack_pagealign, 2,2 +cglobal stack_pagealign, 2,2 push rbp mov rbp, rsp and rsp, ~0xfff | ||
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/tools/checkasm.c ^ |
@@ -265,7 +265,7 @@
     buf3[i] = ~(buf4[i] = -(buf1[i&~0x88]&1));

 #define TEST_PIXEL( name, align ) \
-    ok = 1, used_asm = 0;\
+    ok = 1, used_asm = 0; \
     for( int i = 0; i < 7; i++ ) \
     { \
         int res_c, res_asm; \
@@ -305,7 +305,7 @@
     TEST_PIXEL( sa8d, 1 );

 #define TEST_PIXEL_X( N ) \
-    ok = 1; used_asm = 0;\
+    ok = 1; used_asm = 0; \
     for( int i = 0; i < 7; i++ ) \
     { \
         int res_c[4]={0}, res_asm[4]={0}; \
@@ -350,7 +350,7 @@
         { \
             set_func_name( "%s_%s", "var", pixel_names[i] ); \
             used_asm = 1; \
-            /* abi-check wrapper can't return uint64_t, so separate it from return value check */\
+            /* abi-check wrapper can't return uint64_t, so separate it from return value check */ \
             call_c1( pixel_c.var[i], buf1, 16 ); \
             call_a1( pixel_asm.var[i], buf1, 16 ); \
             uint64_t res_c = pixel_c.var[i]( buf1, 16 ); \
@@ -415,7 +415,7 @@
     if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
     { \
         int res_c[3], res_asm[3]; \
-        set_func_name( #name );\
+        set_func_name( #name ); \
         used_asm = 1; \
         memcpy( buf3, buf2, 1024 ); \
         for( int i = 0; i < 3; i++ ) \
@@ -538,7 +538,7 @@
 #define TEST_DCT( name, t1, t2, size ) \
     if( dct_asm.name != dct_ref.name ) \
     { \
-        set_func_name( #name );\
+        set_func_name( #name ); \
         used_asm = 1; \
         call_c( dct_c.name, t1, buf1, buf2 ); \
         call_a( dct_asm.name, t2, buf1, buf2 ); \
@@ -579,7 +579,7 @@
 #define TEST_IDCT( name, src ) \
     if( dct_asm.name != dct_ref.name ) \
     { \
-        set_func_name( #name );\
+        set_func_name( #name ); \
         used_asm = 1; \
         memcpy( buf3, buf1, 32*32 ); \
         memcpy( buf4, buf1, 32*32 ); \
@@ -644,12 +644,12 @@
     ALIGNED_16( int16_t level1[64] );
     ALIGNED_16( int16_t level2[64] );

-#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
+#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
     if( zigzag_asm.name != zigzag_ref.name ) \
     { \
-        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
+        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
         used_asm = 1; \
-        memcpy(dct, buf1, size*sizeof(int16_t));\
+        memcpy(dct, buf1, size*sizeof(int16_t)); \
         call_c( zigzag_c.name, t1, dct ); \
         call_a( zigzag_asm.name, t2, dct ); \
         if( memcmp( t1, t2, size*sizeof(int16_t) ) ) \
@@ -663,18 +663,18 @@
     if( zigzag_asm.name != zigzag_ref.name ) \
     { \
         int nz_a, nz_c; \
-        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
+        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
         used_asm = 1; \
         memcpy( buf3, buf1, 16*FDEC_STRIDE ); \
         memcpy( buf4, buf1, 16*FDEC_STRIDE ); \
-        nz_c = call_c1( zigzag_c.name, t1, buf2, buf3 );  \
+        nz_c = call_c1( zigzag_c.name, t1, buf2, buf3 ); \
         nz_a = call_a1( zigzag_asm.name, t2, buf2, buf4 ); \
-        if( memcmp( t1, t2, size*sizeof(int16_t) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a )  \
+        if( memcmp( t1, t2, size*sizeof(int16_t) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a ) \
         { \
             ok = 0; \
             fprintf( stderr, #name " [FAILED]\n" ); \
         } \
-        call_c2( zigzag_c.name, t1, buf2, buf3 );  \
+        call_c2( zigzag_c.name, t1, buf2, buf3 ); \
         call_a2( zigzag_asm.name, t2, buf2, buf4 ); \
     }
@@ -683,7 +683,7 @@
     { \
         int nz_a, nz_c; \
         int16_t dc_a, dc_c; \
-        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
+        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
         used_asm = 1; \
         for( int i = 0; i < 2; i++ ) \
         { \
@@ -694,27 +694,27 @@
                 memcpy( buf3 + j*FDEC_STRIDE, (i?buf1:buf2) + j*FENC_STRIDE, 4 ); \
                 memcpy( buf4 + j*FDEC_STRIDE, (i?buf1:buf2) + j*FENC_STRIDE, 4 ); \
             } \
-            nz_c = call_c1( zigzag_c.name, t1, buf2, buf3, &dc_c );  \
+            nz_c = call_c1( zigzag_c.name, t1, buf2, buf3, &dc_c ); \
             nz_a = call_a1( zigzag_asm.name, t2, buf2, buf4, &dc_a ); \
-            if( memcmp( t1+1, t2+1, 15*sizeof(int16_t) ) || memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a || dc_c != dc_a )  \
+            if( memcmp( t1+1, t2+1, 15*sizeof(int16_t) ) || memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a || dc_c != dc_a ) \
             { \
                 ok = 0; \
                 fprintf( stderr, #name " [FAILED]\n" ); \
                 break; \
             } \
         } \
-        call_c2( zigzag_c.name, t1, buf2, buf3, &dc_c );  \
+        call_c2( zigzag_c.name, t1, buf2, buf3, &dc_c ); \
         call_a2( zigzag_asm.name, t2, buf2, buf4, &dc_a ); \
     }

-#define TEST_INTERLEAVE( name, t1, t2, dct, size )  \
+#define TEST_INTERLEAVE( name, t1, t2, dct, size ) \
     if( zigzag_asm.name != zigzag_ref.name ) \
     { \
         for( int j = 0; j < 100; j++ ) \
         { \
-            set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
+            set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
             used_asm = 1; \
-            memcpy(dct, buf1, size*sizeof(int16_t));\
+            memcpy(dct, buf1, size*sizeof(int16_t)); \
            for( int i = 0; i < size; i++ ) \
                 dct[i] = rand()&0x1F ? 0 : dct[i]; \
             memcpy(buf3, buf4, 10*sizeof(uint8_t)); \
@@ -784,7 +784,7 @@
     if( mc_a.mc_luma != mc_ref.mc_luma && !(w&(w-1)) && h<=16 ) \
     { \
         const x264_weight_t *weight = weight_none; \
-        set_func_name( "mc_luma_%dx%d", w, h );\
+        set_func_name( "mc_luma_%dx%d", w, h ); \
         used_asm = 1; \
         memset( buf3, 0xCD, 1024 ); \
         memset( buf4, 0xCD, 1024 ); \
@@ -801,7 +801,7 @@
         uint8_t *ref = dst2; \
         int ref_stride = 32; \
         const x264_weight_t *weight = weight_none; \
-        set_func_name( "get_ref_%dx%d", w, h );\
+        set_func_name( "get_ref_%dx%d", w, h ); \
         used_asm = 1; \
         memset( buf3, 0xCD, 1024 ); \
         memset( buf4, 0xCD, 1024 ); \
@@ -819,13 +819,13 @@
 #define MC_TEST_CHROMA( w, h ) \
     if( mc_a.mc_chroma != mc_ref.mc_chroma ) \
     { \
-        set_func_name( "mc_chroma_%dx%d", w, h );\
+        set_func_name( "mc_chroma_%dx%d", w, h ); \
         used_asm = 1; \
         memset( buf3, 0xCD, 1024 ); \
         memset( buf4, 0xCD, 1024 ); \
         call_c( mc_c.mc_chroma, dst1, 16, src, 64, dx, dy, w, h ); \
         call_a( mc_a.mc_chroma, dst2, 16, src, 64, dx, dy, w, h ); \
-        /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */\
+        /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */ \
         for( int j = 0; j < h; j++ ) \
             for( int i = w; i < 4; i++ ) \
                 dst2[i+j*16] = dst1[i+j*16]; \
@@ -878,7 +878,7 @@
             memcpy( buf4, buf1+320, 320 ); \
             if( mc_a.name[i] != mc_ref.name[i] ) \
             { \
-                set_func_name( "%s_%s", #name, pixel_names[i] );\
+                set_func_name( "%s_%s", #name, pixel_names[i] ); \
                 used_asm = 1; \
                 call_c1( mc_c.name[i], buf3, 16, buf2+1, 16, buf1+18, 16, weight ); \
                 call_a1( mc_a.name[i], buf4, 16, buf2+1, 16, buf1+18, 16, weight ); \
@@ -899,7 +899,7 @@
 #define MC_TEST_WEIGHT( name, weight, aligned ) \
     int align_off = (aligned ? 0 : rand()%16); \
-    ok = 1, used_asm = 0;\
+    ok = 1, used_asm = 0; \
     for( int i = 1; i <= 5; i++ ) \
     { \
         ALIGNED_16( uint8_t buffC[640] ); \
@@ -1115,14 +1115,14 @@
 #define TEST_DEBLOCK( name, align, ... ) \
     for( int i = 0; i < 36; i++ ) \
     { \
-        int off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */\
+        int off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */ \
         for( int j = 0; j < 1024; j++ ) \
-            /* two distributions of random to excersize different failure modes */\
+            /* two distributions of random to excersize different failure modes */ \
             buf3[j] = rand() & (i&1 ? 0xf : 0xff ); \
         memcpy( buf4, buf3, 1024 ); \
         if( db_a.name != db_ref.name ) \
         { \
-            set_func_name( #name );\
+            set_func_name( #name ); \
             used_asm = 1; \
             call_c1( db_c.name, buf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
             call_a1( db_a.name, buf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
@@ -1236,7 +1236,7 @@
                 dct1[i] = dct2[i] = j ? (rand() & 0x1fff) - 0xfff : 0; \
             result_c = call_c1( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
             result_a = call_a1( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
-            if( memcmp( dct1, dct2, 16*2 ) || result_c != result_a )  \
+            if( memcmp( dct1, dct2, 16*2 ) || result_c != result_a ) \
             { \
                 oks[0] = 0; \
                 fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
@@ -1491,11 +1491,11 @@

     ip_c.predict_8x8_filter( buf1+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );

-#define INTRA_TEST( name, dir, w, ... ) \
+#define INTRA_TEST( name, dir, w, ... )\
     if( ip_a.name[dir] != ip_ref.name[dir] )\
-    { \
+    {\
         set_func_name( "intra_%s_%s", #name, intra_##name##_names[dir] );\
-        used_asm = 1; \
+        used_asm = 1;\
         memcpy( buf3, buf1, 32*20 );\
         memcpy( buf4, buf1, 32*20 );\
         call_c( ip_c.name[dir], buf3+48, ##__VA_ARGS__ );\
@@ -1556,32 +1556,66 @@
 }

 #define DECL_CABAC(cpu) \
-static void run_cabac_##cpu( uint8_t *dst )\
+static void run_cabac_decision_##cpu( uint8_t *dst )\
 {\
     x264_cabac_t cb;\
     x264_cabac_context_init( &cb, SLICE_TYPE_P, 26, 0 );\
     x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
     for( int i = 0; i < 0x1000; i++ )\
         x264_cabac_encode_decision_##cpu( &cb, buf1[i]>>1, buf1[i]&1 );\
+}\
+static void run_cabac_bypass_##cpu( uint8_t *dst )\
+{\
+    x264_cabac_t cb;\
+    x264_cabac_context_init( &cb, SLICE_TYPE_P, 26, 0 );\
+    x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
+    for( int i = 0; i < 0x1000; i++ )\
+        x264_cabac_encode_bypass_##cpu( &cb, buf1[i]&1 );\
+}\
+static void run_cabac_terminal_##cpu( uint8_t *dst )\
+{\
+    x264_cabac_t cb;\
+    x264_cabac_context_init( &cb, SLICE_TYPE_P, 26, 0 );\
+    x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
+    for( int i = 0; i < 0x1000; i++ )\
+        x264_cabac_encode_terminal_##cpu( &cb );\
 }
 DECL_CABAC(c)
 #ifdef HAVE_MMX
 DECL_CABAC(asm)
 #else
-#define run_cabac_asm run_cabac_c
+#define run_cabac_decision_asm run_cabac_decision_c
+#define run_cabac_bypass_asm run_cabac_bypass_c
+#define run_cabac_terminal_asm run_cabac_terminal_c
 #endif

 static int check_cabac( int cpu_ref, int cpu_new )
 {
     int ret = 0, ok, used_asm = 1;
-    if( cpu_ref || run_cabac_c == run_cabac_asm)
+    if( cpu_ref || run_cabac_decision_c == run_cabac_decision_asm )
         return 0;
+    set_func_name( "cabac_encode_decision" );
     memcpy( buf4, buf3, 0x1000 );
-    call_c( run_cabac_c, buf3 );
-    call_a( run_cabac_asm, buf4 );
+    call_c( run_cabac_decision_c, buf3 );
+    call_a( run_cabac_decision_asm, buf4 );
+    ok = !memcmp( buf3, buf4, 0x1000 );
+    report( "cabac decision:" );
+
+    set_func_name( "cabac_encode_bypass" );
+    memcpy( buf4, buf3, 0x1000 );
+    call_c( run_cabac_bypass_c, buf3 );
+    call_a( run_cabac_bypass_asm, buf4 );
     ok = !memcmp( buf3, buf4, 0x1000 );
-    report( "cabac :" );
+    report( "cabac bypass:" );
+
+    set_func_name( "cabac_encode_terminal" );
+    memcpy( buf4, buf3, 0x1000 );
+    call_c( run_cabac_terminal_c, buf3 );
+    call_a( run_cabac_terminal_asm, buf4 );
+    ok = !memcmp( buf3, buf4, 0x1000 );
+    report( "cabac terminal:" );
+
     return ret;
 }
| ||
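The checkasm change above splits the former all-in-one CABAC test into three per-primitive tests (decision, bypass, terminal), each generated by the extended DECL_CABAC macro and validated the same way: run the C reference and the asm version into separate buffers and require bit-exact output. A minimal standalone sketch of that verify pattern, using hypothetical encode_c/encode_asm stand-ins rather than the real x264_cabac_encode_* pairs:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-ins for a C reference and its asm replacement;
 * in checkasm these are the run_cabac_*_c / run_cabac_*_asm pairs. */
static void encode_c( uint8_t *dst, const uint8_t *src, int n )
{
    for( int i = 0; i < n; i++ )
        dst[i] = src[i] ^ 0x5A;
}
static void (*encode_asm)( uint8_t *, const uint8_t *, int ) = encode_c;

int main( void )
{
    uint8_t src[0x1000], out_c[0x1000], out_a[0x1000];
    for( int i = 0; i < 0x1000; i++ )
        src[i] = i * 7 & 0xFF;   /* deterministic pseudo-random input */
    encode_c( out_c, src, 0x1000 );
    encode_asm( out_a, src, 0x1000 );
    /* Same pass/fail criterion as check_cabac(): bit-exact output. */
    int ok = !memcmp( out_c, out_a, 0x1000 );
    printf( "cabac-style check: %s\n", ok ? "[OK]" : "[FAILED]" );
    return !ok;
}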
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/x264.c ^ |
@@ -120,7 +120,7 @@
 static const cli_pulldown_t pulldown_values[] =
 {
-    [X264_PULLDOWN_22] = {1, {TB}, 2.0},
+    [X264_PULLDOWN_22] = {1, {TB}, 1.0},
     [X264_PULLDOWN_32] = {4, {TBT, BT, BTB, TB}, 1.25},
     [X264_PULLDOWN_64] = {2, {PIC_STRUCT_DOUBLE, PIC_STRUCT_TRIPLE}, 1.0},
     [X264_PULLDOWN_DOUBLE] = {1, {PIC_STRUCT_DOUBLE}, 2.0},
@@ -1312,7 +1312,7 @@
  * Encode:
  *****************************************************************************/

-static int Encode_frame( x264_t *h, hnd_t hout, x264_picture_t *pic, int64_t *last_pts )
+static int Encode_frame( x264_t *h, hnd_t hout, x264_picture_t *pic, int64_t *last_dts )
 {
     x264_picture_t pic_out;
     x264_nal_t *nal;
@@ -1330,18 +1330,22 @@
     if( i_frame_size )
     {
         i_frame_size = output.write_frame( hout, nal[0].p_payload, i_frame_size, &pic_out );
-        *last_pts = pic_out.i_pts;
+        *last_dts = pic_out.i_dts;
     }

     return i_frame_size;
 }

-static void Print_status( int64_t i_start, int i_frame, int i_frame_total, int64_t i_file, x264_param_t *param, int64_t last_pts )
+static void Print_status( int64_t i_start, int i_frame, int i_frame_total, int64_t i_file, x264_param_t *param, int64_t last_ts )
 {
     char buf[200];
     int64_t i_elapsed = x264_mdate() - i_start;
     double fps = i_elapsed > 0 ? i_frame * 1000000. / i_elapsed : 0;
-    double bitrate = (double) i_file * 8 / ( (double) last_pts * 1000 * param->i_timebase_num / param->i_timebase_den );
+    double bitrate;
+    if( last_ts )
+        bitrate = (double) i_file * 8 / ( (double) last_ts * 1000 * param->i_timebase_num / param->i_timebase_den );
+    else
+        bitrate = (double) i_file * 8 / ( (double) 1000 * param->i_fps_den / param->i_fps_num );
     if( i_frame_total )
     {
         int eta = i_elapsed * (i_frame_total - i_frame) / ((int64_t)i_frame * 1000000);
@@ -1369,7 +1373,9 @@
     int64_t i_file = 0;
     int i_frame_size;
     int i_update_interval;
-    int64_t last_pts = 0;
+    int64_t last_dts = 0;
+    int64_t prev_dts = 0;
+    int64_t first_dts = 0;
 #   define MAX_PTS_WARNING 3 /* arbitrary */
     int pts_warning_cnt = 0;
     int64_t largest_pts = -1;
@@ -1506,12 +1512,17 @@
             pic.i_qpplus1 = 0;
         }

-        i_frame_size = Encode_frame( h, opt->hout, &pic, &last_pts );
+        prev_dts = last_dts;
+        i_frame_size = Encode_frame( h, opt->hout, &pic, &last_dts );
         if( i_frame_size < 0 )
             return -1;
         i_file += i_frame_size;
         if( i_frame_size )
+        {
             i_frame_output++;
+            if( i_frame_output == 1 )
+                first_dts = prev_dts = last_dts;
+        }

         i_frame++;

@@ -1520,19 +1531,24 @@

         /* update status line (up to 1000 times per input file) */
         if( opt->b_progress && i_frame_output % i_update_interval == 0 && i_frame_output )
-            Print_status( i_start, i_frame_output, i_frame_total, i_file, param, last_pts );
+            Print_status( i_start, i_frame_output, i_frame_total, i_file, param, 2 * last_dts - prev_dts - first_dts );
     }
     /* Flush delayed frames */
     while( !b_ctrl_c && x264_encoder_delayed_frames( h ) )
     {
-        i_frame_size = Encode_frame( h, opt->hout, NULL, &last_pts );
+        prev_dts = last_dts;
+        i_frame_size = Encode_frame( h, opt->hout, NULL, &last_dts );
         if( i_frame_size < 0 )
             return -1;
         i_file += i_frame_size;
         if( i_frame_size )
+        {
             i_frame_output++;
+            if( i_frame_output == 1 )
+                first_dts = prev_dts = last_dts;
+        }
         if( opt->b_progress && i_frame_output % i_update_interval == 0 && i_frame_output )
-            Print_status( i_start, i_frame_output, i_frame_total, i_file, param, last_pts );
+            Print_status( i_start, i_frame_output, i_frame_total, i_file, param, 2 * last_dts - prev_dts - first_dts );
     }
     if( pts_warning_cnt >= MAX_PTS_WARNING && param->i_log_level < X264_LOG_DEBUG )
         fprintf( stderr, "x264 [warning]: %d suppressed nonmonotonic pts warnings\n", pts_warning_cnt-MAX_PTS_WARNING );
| ||
[+] | Changed | x264-snapshot-20100517-2245.tar.bz2/x264.h ^ |
@@ -35,7 +35,7 @@

 #include <stdarg.h>

-#define X264_BUILD 94
+#define X264_BUILD 95

 /* x264_t:
  *      opaque handler for encoder */
@@ -639,5 +639,13 @@
  *      return the number of currently delayed (buffered) frames
  *      this should be used at the end of the stream, to know when you have all the encoded frames. */
 int     x264_encoder_delayed_frames( x264_t * );
+/* x264_encoder_intra_refresh:
+ *      If an intra refresh is not in progress, begin one with the next P-frame.
+ *      If an intra refresh is in progress, begin one as soon as the current one finishes.
+ *      Requires that b_intra_refresh be set.
+ *      Useful for interactive streaming where the client can tell the server that packet loss has
+ *      occurred. In this case, keyint can be set to an extremely high value so that intra refreshes
+ *      only occur when calling x264_encoder_intra_refresh. */
+void    x264_encoder_intra_refresh( x264_t * );
 #endif
|
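The new x264_encoder_intra_refresh() call is the whole error-resilience hook: the header comment above says when the refresh wave starts, and the caller only has to invoke it on client feedback. A minimal sketch of how a streaming server loop might use it; client_reported_loss() is a hypothetical stand-in for an out-of-band feedback channel and is not part of the x264 API:

#include <x264.h>

/* Stub for a hypothetical feedback channel; a real server would check
 * RTCP NACKs or a similar client report here. */
static int client_reported_loss( void )
{
    return 0;
}

/* Sketch: encode frames, requesting an intra refresh wave whenever the
 * client reports loss. Assumes the encoder was opened with
 * param.b_intra_refresh = 1 and a very large param.i_keyint_max, per the
 * header comment. */
static int encode_loop( x264_t *h, x264_picture_t *pic_in )
{
    x264_picture_t pic_out;
    x264_nal_t *nal;
    int i_nal;

    for( ;; )
    {
        /* ... fill pic_in with the next frame ... */
        if( client_reported_loss() )
            x264_encoder_intra_refresh( h );   /* takes effect at the next P-frame */

        if( x264_encoder_encode( h, &nal, &i_nal, pic_in, &pic_out ) < 0 )
            return -1;
        /* ... packetize and send nal[0..i_nal-1] ... */
    }
}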