Changed: x264.spec
Changed: x264-snapshot-20100517-2245.tar.bz2/Makefile
@@ -8,6 +8,7 @@
common/frame.c common/dct.c common/cpu.c common/cabac.c \
common/common.c common/mdate.c common/rectangle.c \
common/set.c common/quant.c common/deblock.c common/vlc.c \
+ common/mvpred.c \
encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
encoder/set.c encoder/macroblock.c encoder/cabac.c \
encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
@@ -49,8 +50,8 @@
# MMX/SSE optims
ifneq ($(AS),)
-X86SRC0 = cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm mc-a2.asm \
- pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
+X86SRC0 = const-a.asm cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm \
+ mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
cpu-a.asm dct-32.asm
X86SRC = $(X86SRC0:%=common/x86/%)
Changed: x264-snapshot-20100517-2245.tar.bz2/common/arm/mc-c.c
@@ -112,8 +112,8 @@
x264_mc_copy_w16_neon,
};
-static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
-static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
+static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
static void mc_luma_neon( uint8_t *dst, int i_dst_stride,
uint8_t *src[4], int i_src_stride,
Changed: x264-snapshot-20100517-2245.tar.bz2/common/cabac.c
@@ -664,75 +664,44 @@
}
};
-/* FIXME could avoid this duplication by reversing the order of states
- * with MPS=0, but that would uglify the other tables */
-const uint8_t x264_cabac_range_lps[128][4] =
-{
- { 2, 2, 2, 2 },
- { 6, 7, 8, 9 }, { 6, 7, 9, 10 }, { 6, 8, 9, 11 },
- { 7, 8, 10, 11 }, { 7, 9, 10, 12 }, { 7, 9, 11, 12 },
- { 8, 9, 11, 13 }, { 8, 10, 12, 14 }, { 9, 11, 12, 14 },
- { 9, 11, 13, 15 }, { 10, 12, 14, 16 }, { 10, 12, 15, 17 },
- { 11, 13, 15, 18 }, { 11, 14, 16, 19 }, { 12, 14, 17, 20 },
- { 12, 15, 18, 21 }, { 13, 16, 19, 22 }, { 14, 17, 20, 23 },
- { 14, 18, 21, 24 }, { 15, 19, 22, 25 }, { 16, 20, 23, 27 },
- { 17, 21, 25, 28 }, { 18, 22, 26, 30 }, { 19, 23, 27, 31 },
- { 20, 24, 29, 33 }, { 21, 26, 30, 35 }, { 22, 27, 32, 37 },
- { 23, 28, 33, 39 }, { 24, 30, 35, 41 }, { 26, 31, 37, 43 },
- { 27, 33, 39, 45 }, { 29, 35, 41, 48 }, { 30, 37, 43, 50 },
- { 32, 39, 46, 53 }, { 33, 41, 48, 56 }, { 35, 43, 51, 59 },
- { 37, 45, 54, 62 }, { 39, 48, 56, 65 }, { 41, 50, 59, 69 },
- { 43, 53, 63, 72 }, { 46, 56, 66, 76 }, { 48, 59, 69, 80 },
- { 51, 62, 73, 85 }, { 53, 65, 77, 89 }, { 56, 69, 81, 94 },
- { 59, 72, 86, 99 }, { 62, 76, 90, 104 }, { 66, 80, 95, 110 },
- { 69, 85, 100, 116 }, { 73, 89, 105, 122 }, { 77, 94, 111, 128 },
- { 81, 99, 117, 135 }, { 85, 104, 123, 142 }, { 90, 110, 130, 150 },
- { 95, 116, 137, 158 }, { 100, 122, 144, 166 }, { 105, 128, 152, 175 },
- { 111, 135, 160, 185 }, { 116, 142, 169, 195 }, { 123, 150, 178, 205 },
- { 128, 158, 187, 216 }, { 128, 167, 197, 227 }, { 128, 176, 208, 240 },
-
- { 128, 176, 208, 240 }, { 128, 167, 197, 227 }, { 128, 158, 187, 216 },
- { 123, 150, 178, 205 }, { 116, 142, 169, 195 }, { 111, 135, 160, 185 },
- { 105, 128, 152, 175 }, { 100, 122, 144, 166 }, { 95, 116, 137, 158 },
- { 90, 110, 130, 150 }, { 85, 104, 123, 142 }, { 81, 99, 117, 135 },
- { 77, 94, 111, 128 }, { 73, 89, 105, 122 }, { 69, 85, 100, 116 },
- { 66, 80, 95, 110 }, { 62, 76, 90, 104 }, { 59, 72, 86, 99 },
- { 56, 69, 81, 94 }, { 53, 65, 77, 89 }, { 51, 62, 73, 85 },
- { 48, 59, 69, 80 }, { 46, 56, 66, 76 }, { 43, 53, 63, 72 },
- { 41, 50, 59, 69 }, { 39, 48, 56, 65 }, { 37, 45, 54, 62 },
- { 35, 43, 51, 59 }, { 33, 41, 48, 56 }, { 32, 39, 46, 53 },
- { 30, 37, 43, 50 }, { 29, 35, 41, 48 }, { 27, 33, 39, 45 },
- { 26, 31, 37, 43 }, { 24, 30, 35, 41 }, { 23, 28, 33, 39 },
- { 22, 27, 32, 37 }, { 21, 26, 30, 35 }, { 20, 24, 29, 33 },
- { 19, 23, 27, 31 }, { 18, 22, 26, 30 }, { 17, 21, 25, 28 },
- { 16, 20, 23, 27 }, { 15, 19, 22, 25 }, { 14, 18, 21, 24 },
- { 14, 17, 20, 23 }, { 13, 16, 19, 22 }, { 12, 15, 18, 21 },
- { 12, 14, 17, 20 }, { 11, 14, 16, 19 }, { 11, 13, 15, 18 },
- { 10, 12, 15, 17 }, { 10, 12, 14, 16 }, { 9, 11, 13, 15 },
- { 9, 11, 12, 14 }, { 8, 10, 12, 14 }, { 8, 9, 11, 13 },
- { 7, 9, 11, 12 }, { 7, 9, 10, 12 }, { 7, 8, 10, 11 },
- { 6, 8, 9, 11 }, { 6, 7, 9, 10 }, { 6, 7, 8, 9 },
- { 2, 2, 2, 2 },
+const uint8_t x264_cabac_range_lps[64][4] =
+{
+ { 2, 2, 2, 2}, { 6, 7, 8, 9}, { 6, 7, 9, 10}, { 6, 8, 9, 11},
+ { 7, 8, 10, 11}, { 7, 9, 10, 12}, { 7, 9, 11, 12}, { 8, 9, 11, 13},
+ { 8, 10, 12, 14}, { 9, 11, 12, 14}, { 9, 11, 13, 15}, { 10, 12, 14, 16},
+ { 10, 12, 15, 17}, { 11, 13, 15, 18}, { 11, 14, 16, 19}, { 12, 14, 17, 20},
+ { 12, 15, 18, 21}, { 13, 16, 19, 22}, { 14, 17, 20, 23}, { 14, 18, 21, 24},
+ { 15, 19, 22, 25}, { 16, 20, 23, 27}, { 17, 21, 25, 28}, { 18, 22, 26, 30},
+ { 19, 23, 27, 31}, { 20, 24, 29, 33}, { 21, 26, 30, 35}, { 22, 27, 32, 37},
+ { 23, 28, 33, 39}, { 24, 30, 35, 41}, { 26, 31, 37, 43}, { 27, 33, 39, 45},
+ { 29, 35, 41, 48}, { 30, 37, 43, 50}, { 32, 39, 46, 53}, { 33, 41, 48, 56},
+ { 35, 43, 51, 59}, { 37, 45, 54, 62}, { 39, 48, 56, 65}, { 41, 50, 59, 69},
+ { 43, 53, 63, 72}, { 46, 56, 66, 76}, { 48, 59, 69, 80}, { 51, 62, 73, 85},
+ { 53, 65, 77, 89}, { 56, 69, 81, 94}, { 59, 72, 86, 99}, { 62, 76, 90, 104},
+ { 66, 80, 95, 110}, { 69, 85, 100, 116}, { 73, 89, 105, 122}, { 77, 94, 111, 128},
+ { 81, 99, 117, 135}, { 85, 104, 123, 142}, { 90, 110, 130, 150}, { 95, 116, 137, 158},
+ {100, 122, 144, 166}, {105, 128, 152, 175}, {111, 135, 160, 185}, {116, 142, 169, 195},
+ {123, 150, 178, 205}, {128, 158, 187, 216}, {128, 167, 197, 227}, {128, 176, 208, 240}
};
const uint8_t x264_cabac_transition[128][2] =
{
- { 0, 0}, { 1, 25}, { 1, 25}, { 2, 26}, { 3, 26}, { 4, 26}, { 5, 27}, { 6, 27},
- { 7, 27}, { 8, 28}, { 9, 28}, { 10, 28}, { 11, 29}, { 12, 29}, { 13, 30}, { 14, 30},
- { 15, 30}, { 16, 31}, { 17, 31}, { 18, 32}, { 19, 33}, { 20, 33}, { 21, 33}, { 22, 34},
- { 23, 34}, { 24, 35}, { 25, 36}, { 26, 36}, { 27, 37}, { 28, 37}, { 29, 38}, { 30, 39},
- { 31, 39}, { 32, 40}, { 33, 41}, { 34, 41}, { 35, 42}, { 36, 42}, { 37, 44}, { 38, 44},
- { 39, 45}, { 40, 45}, { 41, 47}, { 42, 47}, { 43, 48}, { 44, 48}, { 45, 50}, { 46, 50},
- { 47, 51}, { 48, 52}, { 49, 52}, { 50, 54}, { 51, 54}, { 52, 55}, { 53, 56}, { 54, 57},
- { 55, 58}, { 56, 59}, { 57, 59}, { 58, 61}, { 59, 61}, { 60, 62}, { 61, 63}, { 62, 64},
- { 63, 65}, { 64, 66}, { 65, 67}, { 66, 68}, { 66, 69}, { 68, 70}, { 68, 71}, { 69, 72},
- { 70, 73}, { 71, 74}, { 72, 75}, { 73, 76}, { 73, 77}, { 75, 78}, { 75, 79}, { 76, 80},
- { 77, 81}, { 77, 82}, { 79, 83}, { 79, 84}, { 80, 85}, { 80, 86}, { 82, 87}, { 82, 88},
- { 83, 89}, { 83, 90}, { 85, 91}, { 85, 92}, { 86, 93}, { 86, 94}, { 87, 95}, { 88, 96},
- { 88, 97}, { 89, 98}, { 90, 99}, { 90,100}, { 91,101}, { 91,102}, { 92,103}, { 93,104},
- { 93,105}, { 94,106}, { 94,107}, { 94,108}, { 95,109}, { 96,110}, { 96,111}, { 97,112},
- { 97,113}, { 97,114}, { 98,115}, { 98,116}, { 99,117}, { 99,118}, { 99,119}, {100,120},
- {100,121}, {100,122}, {101,123}, {101,124}, {101,125}, {102,126}, {102,126}, {127,127},
+ { 0, 0}, { 1, 1}, { 2, 50}, { 51, 3}, { 2, 50}, { 51, 3}, { 4, 52}, { 53, 5},
+ { 6, 52}, { 53, 7}, { 8, 52}, { 53, 9}, { 10, 54}, { 55, 11}, { 12, 54}, { 55, 13},
+ { 14, 54}, { 55, 15}, { 16, 56}, { 57, 17}, { 18, 56}, { 57, 19}, { 20, 56}, { 57, 21},
+ { 22, 58}, { 59, 23}, { 24, 58}, { 59, 25}, { 26, 60}, { 61, 27}, { 28, 60}, { 61, 29},
+ { 30, 60}, { 61, 31}, { 32, 62}, { 63, 33}, { 34, 62}, { 63, 35}, { 36, 64}, { 65, 37},
+ { 38, 66}, { 67, 39}, { 40, 66}, { 67, 41}, { 42, 66}, { 67, 43}, { 44, 68}, { 69, 45},
+ { 46, 68}, { 69, 47}, { 48, 70}, { 71, 49}, { 50, 72}, { 73, 51}, { 52, 72}, { 73, 53},
+ { 54, 74}, { 75, 55}, { 56, 74}, { 75, 57}, { 58, 76}, { 77, 59}, { 60, 78}, { 79, 61},
+ { 62, 78}, { 79, 63}, { 64, 80}, { 81, 65}, { 66, 82}, { 83, 67}, { 68, 82}, { 83, 69},
+ { 70, 84}, { 85, 71}, { 72, 84}, { 85, 73}, { 74, 88}, { 89, 75}, { 76, 88}, { 89, 77},
+ { 78, 90}, { 91, 79}, { 80, 90}, { 91, 81}, { 82, 94}, { 95, 83}, { 84, 94}, { 95, 85},
+ { 86, 96}, { 97, 87}, { 88, 96}, { 97, 89}, { 90, 100}, {101, 91}, { 92, 100}, {101, 93},
+ { 94, 102}, {103, 95}, { 96, 104}, {105, 97}, { 98, 104}, {105, 99}, {100, 108}, {109, 101},
+ {102, 108}, {109, 103}, {104, 110}, {111, 105}, {106, 112}, {113, 107}, {108, 114}, {115, 109},
+ {110, 116}, {117, 111}, {112, 118}, {119, 113}, {114, 118}, {119, 115}, {116, 122}, {123, 117},
+ {118, 122}, {123, 119}, {120, 124}, {125, 121}, {122, 126}, {127, 123}, {124, 127}, {126, 125}
};
const uint8_t x264_cabac_renorm_shift[64]= {
@@ -743,41 +712,40 @@
};
/* -ln2(probability) */
-#define F(a,b) {FIX8(a),FIX8(b)}
-const uint16_t x264_cabac_entropy[128][2] =
+const uint16_t x264_cabac_entropy[128] =
{
- F(0.0273,5.7370), F(0.0288,5.6618), F(0.0303,5.5866), F(0.0320,5.5114),
- F(0.0337,5.4362), F(0.0355,5.3610), F(0.0375,5.2859), F(0.0395,5.2106),
- F(0.0416,5.1354), F(0.0439,5.0602), F(0.0463,4.9851), F(0.0488,4.9099),
- F(0.0515,4.8347), F(0.0543,4.7595), F(0.0572,4.6843), F(0.0604,4.6091),
- F(0.0637,4.5339), F(0.0671,4.4588), F(0.0708,4.3836), F(0.0747,4.3083),
- F(0.0788,4.2332), F(0.0832,4.1580), F(0.0878,4.0828), F(0.0926,4.0076),
- F(0.0977,3.9324), F(0.1032,3.8572), F(0.1089,3.7820), F(0.1149,3.7068),
- F(0.1214,3.6316), F(0.1282,3.5565), F(0.1353,3.4813), F(0.1429,3.4061),
- F(0.1510,3.3309), F(0.1596,3.2557), F(0.1686,3.1805), F(0.1782,3.1053),
- F(0.1884,3.0301), F(0.1992,2.9549), F(0.2107,2.8797), F(0.2229,2.8046),
- F(0.2358,2.7294), F(0.2496,2.6542), F(0.2642,2.5790), F(0.2798,2.5038),
- F(0.2964,2.4286), F(0.3142,2.3534), F(0.3331,2.2782), F(0.3532,2.2030),
- F(0.3748,2.1278), F(0.3979,2.0527), F(0.4226,1.9775), F(0.4491,1.9023),
- F(0.4776,1.8271), F(0.5082,1.7519), F(0.5412,1.6767), F(0.5768,1.6015),
- F(0.6152,1.5263), F(0.6568,1.4511), F(0.7020,1.3759), F(0.7513,1.3008),
- F(0.8050,1.2256), F(0.8638,1.1504), F(0.9285,1.0752), F(1.0000,1.0000),
- F(1.0000,1.0000), F(1.0752,0.9285), F(1.1504,0.8638), F(1.2256,0.8050),
- F(1.3008,0.7513), F(1.3759,0.7020), F(1.4511,0.6568), F(1.5263,0.6152),
- F(1.6015,0.5768), F(1.6767,0.5412), F(1.7519,0.5082), F(1.8271,0.4776),
- F(1.9023,0.4491), F(1.9775,0.4226), F(2.0527,0.3979), F(2.1278,0.3748),
- F(2.2030,0.3532), F(2.2782,0.3331), F(2.3534,0.3142), F(2.4286,0.2964),
- F(2.5038,0.2798), F(2.5790,0.2642), F(2.6542,0.2496), F(2.7294,0.2358),
- F(2.8046,0.2229), F(2.8797,0.2107), F(2.9549,0.1992), F(3.0301,0.1884),
- F(3.1053,0.1782), F(3.1805,0.1686), F(3.2557,0.1596), F(3.3309,0.1510),
- F(3.4061,0.1429), F(3.4813,0.1353), F(3.5565,0.1282), F(3.6316,0.1214),
- F(3.7068,0.1149), F(3.7820,0.1089), F(3.8572,0.1032), F(3.9324,0.0977),
- F(4.0076,0.0926), F(4.0828,0.0878), F(4.1580,0.0832), F(4.2332,0.0788),
- F(4.3083,0.0747), F(4.3836,0.0708), F(4.4588,0.0671), F(4.5339,0.0637),
- F(4.6091,0.0604), F(4.6843,0.0572), F(4.7595,0.0543), F(4.8347,0.0515),
- F(4.9099,0.0488), F(4.9851,0.0463), F(5.0602,0.0439), F(5.1354,0.0416),
- F(5.2106,0.0395), F(5.2859,0.0375), F(5.3610,0.0355), F(5.4362,0.0337),
- F(5.5114,0.0320), F(5.5866,0.0303), F(5.6618,0.0288), F(5.7370,0.0273),
+ FIX8(0.0273), FIX8(5.7370), FIX8(0.0288), FIX8(5.6618),
+ FIX8(0.0303), FIX8(5.5866), FIX8(0.0320), FIX8(5.5114),
+ FIX8(0.0337), FIX8(5.4362), FIX8(0.0355), FIX8(5.3610),
+ FIX8(0.0375), FIX8(5.2859), FIX8(0.0395), FIX8(5.2106),
+ FIX8(0.0416), FIX8(5.1354), FIX8(0.0439), FIX8(5.0602),
+ FIX8(0.0463), FIX8(4.9851), FIX8(0.0488), FIX8(4.9099),
+ FIX8(0.0515), FIX8(4.8347), FIX8(0.0543), FIX8(4.7595),
+ FIX8(0.0572), FIX8(4.6843), FIX8(0.0604), FIX8(4.6091),
+ FIX8(0.0637), FIX8(4.5339), FIX8(0.0671), FIX8(4.4588),
+ FIX8(0.0708), FIX8(4.3836), FIX8(0.0747), FIX8(4.3083),
+ FIX8(0.0788), FIX8(4.2332), FIX8(0.0832), FIX8(4.1580),
+ FIX8(0.0878), FIX8(4.0828), FIX8(0.0926), FIX8(4.0076),
+ FIX8(0.0977), FIX8(3.9324), FIX8(0.1032), FIX8(3.8572),
+ FIX8(0.1089), FIX8(3.7820), FIX8(0.1149), FIX8(3.7068),
+ FIX8(0.1214), FIX8(3.6316), FIX8(0.1282), FIX8(3.5565),
+ FIX8(0.1353), FIX8(3.4813), FIX8(0.1429), FIX8(3.4061),
+ FIX8(0.1510), FIX8(3.3309), FIX8(0.1596), FIX8(3.2557),
+ FIX8(0.1686), FIX8(3.1805), FIX8(0.1782), FIX8(3.1053),
+ FIX8(0.1884), FIX8(3.0301), FIX8(0.1992), FIX8(2.9549),
+ FIX8(0.2107), FIX8(2.8797), FIX8(0.2229), FIX8(2.8046),
+ FIX8(0.2358), FIX8(2.7294), FIX8(0.2496), FIX8(2.6542),
+ FIX8(0.2642), FIX8(2.5790), FIX8(0.2798), FIX8(2.5038),
+ FIX8(0.2964), FIX8(2.4286), FIX8(0.3142), FIX8(2.3534),
+ FIX8(0.3331), FIX8(2.2782), FIX8(0.3532), FIX8(2.2030),
+ FIX8(0.3748), FIX8(2.1278), FIX8(0.3979), FIX8(2.0527),
+ FIX8(0.4226), FIX8(1.9775), FIX8(0.4491), FIX8(1.9023),
+ FIX8(0.4776), FIX8(1.8271), FIX8(0.5082), FIX8(1.7519),
+ FIX8(0.5412), FIX8(1.6767), FIX8(0.5768), FIX8(1.6015),
+ FIX8(0.6152), FIX8(1.5263), FIX8(0.6568), FIX8(1.4511),
+ FIX8(0.7020), FIX8(1.3759), FIX8(0.7513), FIX8(1.3008),
+ FIX8(0.8050), FIX8(1.2256), FIX8(0.8638), FIX8(1.1504),
+ FIX8(0.9285), FIX8(1.0752), FIX8(1.0000), FIX8(1.0000)
};
@@ -794,14 +762,17 @@
cabac_context_init = &x264_cabac_context_init_PB[i_model];
for( int i = 0; i < 460; i++ )
- cb->state[i] = x264_clip3( (((*cabac_context_init)[i][0] * i_qp) >> 4) + (*cabac_context_init)[i][1], 1, 126 );
+ {
+ int state = x264_clip3( (((*cabac_context_init)[i][0] * i_qp) >> 4) + (*cabac_context_init)[i][1], 1, 126 );
+ cb->state[i] = (X264_MIN( state, 127-state ) << 1) | (state >> 6);
+ }
}
void x264_cabac_encode_init( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end )
{
cb->i_low = 0;
cb->i_range = 0x01FE;
- cb->i_queue = -1; // the first bit will be shifted away and not written
+ cb->i_queue = -9; // the first bit will be shifted away and not written
cb->i_bytes_outstanding = 0;
cb->p_start = p_data;
cb->p = p_data;
@@ -810,10 +781,10 @@
static inline void x264_cabac_putbyte( x264_cabac_t *cb )
{
- if( cb->i_queue >= 8 )
+ if( cb->i_queue >= 0 )
{
- int out = cb->i_low >> (cb->i_queue+2);
- cb->i_low &= (4<<cb->i_queue)-1;
+ int out = cb->i_low >> (cb->i_queue+10);
+ cb->i_low &= (0x400<<cb->i_queue)-1;
cb->i_queue -= 8;
if( (out & 0xff) == 0xff )
@@ -855,9 +826,9 @@
void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b )
{
int i_state = cb->state[i_ctx];
- int i_range_lps = x264_cabac_range_lps[i_state][(cb->i_range>>6)-4];
+ int i_range_lps = x264_cabac_range_lps[i_state>>1][(cb->i_range>>6)-4];
cb->i_range -= i_range_lps;
- if( b != (i_state >> 6) )
+ if( b != (i_state & 1) )
{
cb->i_low += cb->i_range;
cb->i_range = i_range_lps;
@@ -866,7 +837,7 @@
x264_cabac_encode_renorm( cb );
}
-void x264_cabac_encode_bypass( x264_cabac_t *cb, int b )
+void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b )
{
cb->i_low <<= 1;
cb->i_low += -b & cb->i_range;
@@ -892,7 +863,7 @@
} while( k > 0 );
}
-void x264_cabac_encode_terminal( x264_cabac_t *cb )
+void x264_cabac_encode_terminal_c( x264_cabac_t *cb )
{
cb->i_range -= 2;
x264_cabac_encode_renorm( cb );
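
Note on the cabac.c hunks above: the mirrored 128-row LPS table is folded to 64 rows and each per-context state is repacked so the MPS value sits in bit 0 and the probability-state index (0..63) in the upper bits; the encoder then indexes the folded table with state >> 1 and tests the MPS with state & 1, and x264_cabac_entropy[state ^ b] (used by the cabac.h hunks below) reads the cost of coding bin b directly, since the XOR leaves the low bit clear exactly when b equals the MPS and so selects the MPS-cost entry of each interleaved pair. The i_queue changes (init -1 to -9, ">= 8" to ">= 0" in putbyte) are the -8 bias mentioned in the cabac.h comment. A rough standalone sketch of the packing; the helper names are illustrative and not part of the patch:

    #include <stdint.h>

    /* Sketch only -- helper names are illustrative, not part of the patch.
     * New per-context layout: bit 0 = MPS value, upper bits = probability-state
     * index (0..63) into the folded 64-entry x264_cabac_range_lps table. */
    static uint8_t cabac_pack_state( int s )   /* s = old-style 0..127 state, MPS in bit 6 */
    {
        int mps    = s >> 6;
        int pstate = mps ? 127 - s : s;        /* fold the mirrored half of the old tables */
        return (pstate << 1) | mps;            /* same as (X264_MIN( s, 127-s ) << 1) | (s >> 6) */
    }
    static int cabac_mps( int packed )    { return packed & 1;  }   /* replaces  state >> 6  */
    static int cabac_pstate( int packed ) { return packed >> 1; }   /* row in range_lps[64]  */
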
Changed: x264-snapshot-20100517-2245.tar.bz2/common/cabac.h
@@ -31,7 +31,7 @@
int i_range;
/* bit stream */
- int i_queue;
+ int i_queue; //stored with an offset of -8 for faster asm
int i_bytes_outstanding;
uint8_t *p_start;
@@ -46,7 +46,7 @@
} x264_cabac_t;
extern const uint8_t x264_cabac_transition[128][2];
-extern const uint16_t x264_cabac_entropy[128][2];
+extern const uint16_t x264_cabac_entropy[128];
/* init the contexts given i_slice_type, the quantif and the model */
void x264_cabac_context_init( x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model );
@@ -55,15 +55,21 @@
void x264_cabac_encode_init ( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end );
void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b );
void x264_cabac_encode_decision_asm( x264_cabac_t *cb, int i_ctx, int b );
-void x264_cabac_encode_bypass( x264_cabac_t *cb, int b );
+void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b );
+void x264_cabac_encode_bypass_asm( x264_cabac_t *cb, int b );
+void x264_cabac_encode_terminal_c( x264_cabac_t *cb );
+void x264_cabac_encode_terminal_asm( x264_cabac_t *cb );
void x264_cabac_encode_ue_bypass( x264_cabac_t *cb, int exp_bits, int val );
-void x264_cabac_encode_terminal( x264_cabac_t *cb );
void x264_cabac_encode_flush( x264_t *h, x264_cabac_t *cb );
#ifdef HAVE_MMX
#define x264_cabac_encode_decision x264_cabac_encode_decision_asm
+#define x264_cabac_encode_bypass x264_cabac_encode_bypass_asm
+#define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm
#else
#define x264_cabac_encode_decision x264_cabac_encode_decision_c
+#define x264_cabac_encode_bypass x264_cabac_encode_bypass_c
+#define x264_cabac_encode_terminal x264_cabac_encode_terminal_c
#endif
#define x264_cabac_encode_decision_noup x264_cabac_encode_decision
@@ -78,25 +84,25 @@
{
int i_state = cb->state[i_ctx];
cb->state[i_ctx] = x264_cabac_transition[i_state][b];
- cb->f8_bits_encoded += x264_cabac_entropy[i_state][b];
+ cb->f8_bits_encoded += x264_cabac_entropy[i_state^b];
}
static ALWAYS_INLINE int x264_cabac_size_decision2( uint8_t *state, long b )
{
int i_state = *state;
*state = x264_cabac_transition[i_state][b];
- return x264_cabac_entropy[i_state][b];
+ return x264_cabac_entropy[i_state^b];
}
static ALWAYS_INLINE void x264_cabac_size_decision_noup( x264_cabac_t *cb, long i_ctx, long b )
{
int i_state = cb->state[i_ctx];
- cb->f8_bits_encoded += x264_cabac_entropy[i_state][b];
+ cb->f8_bits_encoded += x264_cabac_entropy[i_state^b];
}
static ALWAYS_INLINE int x264_cabac_size_decision_noup2( uint8_t *state, long b )
{
- return x264_cabac_entropy[*state][b];
+ return x264_cabac_entropy[*state^b];
}
#endif
Changed: x264-snapshot-20100517-2245.tar.bz2/common/common.c
@@ -22,7 +22,6 @@
*****************************************************************************/
#include "common.h"
-#include "cpu.h"
#include <stdarg.h>
#include <ctype.h>
@@ -1225,11 +1224,11 @@
s += sprintf( s, " bframes=%d", p->i_bframe );
if( p->i_bframe )
{
- s += sprintf( s, " b_pyramid=%d b_adapt=%d b_bias=%d direct=%d wpredb=%d",
+ s += sprintf( s, " b_pyramid=%d b_adapt=%d b_bias=%d direct=%d weightb=%d",
p->i_bframe_pyramid, p->i_bframe_adaptive, p->i_bframe_bias,
p->analyse.i_direct_mv_pred, p->analyse.b_weighted_bipred );
}
- s += sprintf( s, " wpredp=%d", p->analyse.i_weighted_pred > 0 ? p->analyse.i_weighted_pred : 0 );
+ s += sprintf( s, " weightp=%d", p->analyse.i_weighted_pred > 0 ? p->analyse.i_weighted_pred : 0 );
s += sprintf( s, " keyint=%d keyint_min=%d scenecut=%d intra_refresh=%d",
p->i_keyint_max, p->i_keyint_min, p->i_scenecut_threshold, p->b_intra_refresh );
@@ -1238,7 +1237,7 @@
s += sprintf( s, " rc_lookahead=%d", p->rc.i_lookahead );
s += sprintf( s, " rc=%s mbtree=%d", p->rc.i_rc_method == X264_RC_ABR ?
- ( p->rc.b_stat_read ? "2pass" : p->rc.i_vbv_buffer_size == p->rc.i_bitrate ? "cbr" : "abr" )
+ ( p->rc.b_stat_read ? "2pass" : p->rc.i_vbv_max_bitrate == p->rc.i_bitrate ? "cbr" : "abr" )
: p->rc.i_rc_method == X264_RC_CRF ? "crf" : "cqp", p->rc.b_mb_tree );
if( p->rc.i_rc_method == X264_RC_ABR || p->rc.i_rc_method == X264_RC_CRF )
{
@@ -1257,7 +1256,7 @@
s += sprintf( s, " vbv_maxrate=%d vbv_bufsize=%d",
p->rc.i_vbv_max_bitrate, p->rc.i_vbv_buffer_size );
if( p->rc.i_rc_method == X264_RC_CRF )
- s += sprintf( s, " crf-max=%.1f", p->rc.f_rf_constant_max );
+ s += sprintf( s, " crf_max=%.1f", p->rc.f_rf_constant_max );
}
}
else if( p->rc.i_rc_method == X264_RC_CQP )
Changed: x264-snapshot-20100517-2245.tar.bz2/common/common.h
@@ -110,6 +110,7 @@
#include "dct.h"
#include "cabac.h"
#include "quant.h"
+#include "cpu.h"
/****************************************************************************
* General functions
@@ -188,14 +189,14 @@
return amvd0 + (amvd1<<8);
}
-static void ALWAYS_INLINE x264_predictor_roundclip( int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
+static void ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
{
for( int i = 0; i < i_mvc; i++ )
{
int mx = (mvc[i][0] + 2) >> 2;
int my = (mvc[i][1] + 2) >> 2;
- mvc[i][0] = x264_clip3( mx, mv_x_min, mv_x_max );
- mvc[i][1] = x264_clip3( my, mv_y_min, mv_y_max );
+ dst[i][0] = x264_clip3( mx, mv_x_min, mv_x_max );
+ dst[i][1] = x264_clip3( my, mv_y_min, mv_y_max );
}
}
@@ -407,6 +408,8 @@
int i_coded_fields_lookahead; /* Use separate counters for lookahead */
int i_cpb_delay_lookahead;
+ int b_queued_intra_refresh;
+
/* We use only one SPS and one PPS */
x264_sps_t sps_array[1];
x264_sps_t *sps;
@@ -658,7 +661,7 @@
ALIGNED_8( int8_t intra4x4_pred_mode[X264_SCAN8_LUMA_SIZE] );
/* i_non_zero_count if available else 0x80 */
- ALIGNED_4( uint8_t non_zero_count[X264_SCAN8_SIZE] );
+ ALIGNED_16( uint8_t non_zero_count[X264_SCAN8_SIZE] );
/* -1 if unused, -2 if unavailable */
ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
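
Note: x264_predictor_roundclip now writes the rounded, clipped full-pel candidates to a separate dst array instead of overwriting the quarter-pel list in place, so callers can keep the original predictors. A standalone sketch of the caller-visible behaviour; the names and values here are illustrative, not taken from the patch:

    #include <stdint.h>
    #include <stdio.h>

    #define CLIP3( v, lo, hi ) ((v) < (lo) ? (lo) : (v) > (hi) ? (hi) : (v))

    /* Sketch only -- mirrors the new dst/src split, not the x264 code itself. */
    static void roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int n,
                           int x_min, int x_max, int y_min, int y_max )
    {
        for( int i = 0; i < n; i++ )
        {
            dst[i][0] = CLIP3( (mvc[i][0] + 2) >> 2, x_min, x_max );   /* qpel -> fpel, rounded */
            dst[i][1] = CLIP3( (mvc[i][1] + 2) >> 2, y_min, y_max );
        }
    }

    int main( void )
    {
        int16_t mvc[2][2] = { { 17, -6 }, { 130, 3 } };   /* quarter-pel candidates */
        int16_t fpel[2][2];
        roundclip( fpel, mvc, 2, -16, 16, -16, 16 );
        printf( "%d,%d  %d,%d\n", fpel[0][0], fpel[0][1], fpel[1][0], fpel[1][1] );
        return 0;   /* mvc[] keeps its original quarter-pel values */
    }
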
Changed: x264-snapshot-20100517-2245.tar.bz2/common/cpu.c
@@ -87,8 +87,8 @@
#endif
#ifdef HAVE_MMX
-extern int x264_cpu_cpuid_test( void );
-extern uint32_t x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
+int x264_cpu_cpuid_test( void );
+uint32_t x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
uint32_t x264_cpu_detect( void )
{
@@ -324,13 +324,6 @@
#endif
-#ifndef HAVE_MMX
-void x264_emms( void )
-{
-}
-#endif
-
-
int x264_cpu_num_processors( void )
{
#if !defined(HAVE_PTHREAD)
Changed: x264-snapshot-20100517-2245.tar.bz2/common/cpu.h
@@ -23,7 +23,14 @@
uint32_t x264_cpu_detect( void );
int x264_cpu_num_processors( void );
-void x264_emms( void );
+void x264_cpu_emms( void );
+void x264_cpu_sfence( void );
+#ifdef HAVE_MMX
+#define x264_emms() x264_cpu_emms()
+#else
+#define x264_emms()
+#endif
+#define x264_sfence x264_cpu_sfence
void x264_cpu_mask_misalign_sse( void );
/* kluge:
Changed: x264-snapshot-20100517-2245.tar.bz2/common/frame.c
@@ -105,6 +105,7 @@
CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
CHECKED_MALLOC( frame->mb_partition, i_mb_count * sizeof(uint8_t));
CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
+ CHECKED_MALLOC( frame->mv16x16, 2*i_mb_count * sizeof(int16_t) );
CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
if( h->param.i_bframe )
{
@@ -117,7 +118,7 @@
frame->ref[1] = NULL;
}
CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
- CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
+ CHECKED_MALLOC( frame->f_row_qp, i_lines/16 * sizeof(float) );
if( h->param.analyse.i_me_method >= X264_ME_ESA )
{
CHECKED_MALLOC( frame->buffer[3],
@@ -148,10 +149,7 @@
CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
for( int j = 0; j <= h->param.i_bframe+1; j++ )
for( int i = 0; i <= h->param.i_bframe+1; i++ )
- {
CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
- CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
- }
frame->i_intra_cost = frame->lowres_costs[0][0];
memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
}
@@ -199,19 +197,17 @@
x264_free( frame->i_propagate_cost );
for( int j = 0; j <= X264_BFRAME_MAX+1; j++ )
for( int i = 0; i <= X264_BFRAME_MAX+1; i++ )
- {
x264_free( frame->lowres_costs[j][i] );
- x264_free( frame->lowres_inter_types[j][i] );
- }
x264_free( frame->f_qp_offset );
x264_free( frame->f_qp_offset_aq );
x264_free( frame->i_inv_qscale_factor );
x264_free( frame->i_row_bits );
- x264_free( frame->i_row_qp );
+ x264_free( frame->f_row_qp );
x264_free( frame->mb_type );
x264_free( frame->mb_partition );
x264_free( frame->mv[0] );
x264_free( frame->mv[1] );
+ x264_free( frame->mv16x16 );
x264_free( frame->ref[0] );
x264_free( frame->ref[1] );
x264_pthread_mutex_destroy( &frame->mutex );
@@ -225,7 +221,7 @@
int i_csp = src->img.i_csp & X264_CSP_MASK;
if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 )
{
- x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
+ x264_log( h, X264_LOG_ERROR, "Invalid input colorspace\n" );
return -1;
}
@@ -247,6 +243,11 @@
plane += (height-1)*stride;
stride = -stride;
}
+ if( width > abs(stride) )
+ {
+ x264_log( h, X264_LOG_ERROR, "Input picture width is greater than stride\n" );
+ return -1;
+ }
h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height );
}
return 0;
Changed: x264-snapshot-20100517-2245.tar.bz2/common/frame.h
@@ -83,15 +83,21 @@
int8_t *mb_type;
uint8_t *mb_partition;
int16_t (*mv[2])[2];
+ int16_t (*mv16x16)[2];
int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2];
+
+ /* Stored as (lists_used << LOWRES_COST_SHIFT) + (cost).
+ * Doesn't need special addressing for intra cost because
+ * lists_used is guaranteed to be zero in that case. */
uint16_t (*lowres_costs[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
- /* Actually a width-2 bitfield with 4 values per uint8_t. */
- uint8_t (*lowres_inter_types[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
+ #define LOWRES_COST_MASK ((1<<14)-1)
+ #define LOWRES_COST_SHIFT 14
+
int *lowres_mv_costs[2][X264_BFRAME_MAX+1];
int8_t *ref[2];
int i_ref[2];
int ref_poc[2][16];
- int16_t inv_ref_poc[2][32]; // inverse values (list0 only) to avoid divisions in MB encoding
+ int16_t inv_ref_poc[2]; // inverse values of ref0 poc to avoid divisions in temporal MV prediction
/* for adaptive B-frame decision.
* contains the SATD cost of the lowres frame encoded in various modes
@@ -103,7 +109,7 @@
int *i_row_satds[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
int *i_row_satd;
int *i_row_bits;
- int *i_row_qp;
+ float *f_row_qp;
float *f_qp_offset;
float *f_qp_offset_aq;
int b_intra_calculated;
@@ -136,6 +142,7 @@
float f_pir_position;
int i_pir_start_col;
int i_pir_end_col;
+ int i_frames_since_pir;
} x264_frame_t;
/* synchronized frame list */
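
Note: the LOWRES_COST_SHIFT/LOWRES_COST_MASK pair replaces the separate lowres_inter_types array that the frame.c hunk above stops allocating: each 16-bit lowres cost entry now carries the cost in its low 14 bits and the lists_used flags in the top bits, which is what the "& LOWRES_COST_MASK" in the mc.c hunk later in this diff relies on. A minimal pack/unpack sketch; the helper names are illustrative, not part of the patch:

    #include <stdint.h>

    #define LOWRES_COST_MASK  ((1<<14)-1)   /* as defined in frame.h above */
    #define LOWRES_COST_SHIFT 14

    /* Sketch only -- helper names are illustrative, not part of the patch. */
    static uint16_t lowres_cost_pack( unsigned lists_used, unsigned satd_cost )
    {
        return (lists_used << LOWRES_COST_SHIFT) | (satd_cost & LOWRES_COST_MASK);
    }
    static unsigned lowres_cost( uint16_t entry )       { return entry &  LOWRES_COST_MASK;  }
    static unsigned lowres_lists_used( uint16_t entry ) { return entry >> LOWRES_COST_SHIFT; }
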
Changed: x264-snapshot-20100517-2245.tar.bz2/common/macroblock.c
@@ -3,9 +3,9 @@
*****************************************************************************
* Copyright (C) 2003-2008 x264 project
*
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
+ * Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
- * Jason Garrett-Glaser <darkshikari@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -25,469 +25,6 @@
#include "common.h"
#include "encoder/me.h"
-void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] )
-{
- const int i8 = x264_scan8[idx];
- const int i_ref= h->mb.cache.ref[i_list][i8];
- int i_refa = h->mb.cache.ref[i_list][i8 - 1];
- int16_t *mv_a = h->mb.cache.mv[i_list][i8 - 1];
- int i_refb = h->mb.cache.ref[i_list][i8 - 8];
- int16_t *mv_b = h->mb.cache.mv[i_list][i8 - 8];
- int i_refc = h->mb.cache.ref[i_list][i8 - 8 + i_width];
- int16_t *mv_c = h->mb.cache.mv[i_list][i8 - 8 + i_width];
-
- if( (idx&3) >= 2 + (i_width&1) || i_refc == -2 )
- {
- i_refc = h->mb.cache.ref[i_list][i8 - 8 - 1];
- mv_c = h->mb.cache.mv[i_list][i8 - 8 - 1];
- }
-
- if( h->mb.i_partition == D_16x8 )
- {
- if( idx == 0 )
- {
- if( i_refb == i_ref )
- {
- CP32( mvp, mv_b );
- return;
- }
- }
- else
- {
- if( i_refa == i_ref )
- {
- CP32( mvp, mv_a );
- return;
- }
- }
- }
- else if( h->mb.i_partition == D_8x16 )
- {
- if( idx == 0 )
- {
- if( i_refa == i_ref )
- {
- CP32( mvp, mv_a );
- return;
- }
- }
- else
- {
- if( i_refc == i_ref )
- {
- CP32( mvp, mv_c );
- return;
- }
- }
- }
-
- int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref);
-
- if( i_count > 1 )
- {
-median:
- x264_median_mv( mvp, mv_a, mv_b, mv_c );
- }
- else if( i_count == 1 )
- {
- if( i_refa == i_ref )
- CP32( mvp, mv_a );
- else if( i_refb == i_ref )
- CP32( mvp, mv_b );
- else
- CP32( mvp, mv_c );
- }
- else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
- CP32( mvp, mv_a );
- else
- goto median;
-}
-
-void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int16_t mvp[2] )
-{
- int i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1];
- int16_t *mv_a = h->mb.cache.mv[i_list][X264_SCAN8_0 - 1];
- int i_refb = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8];
- int16_t *mv_b = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8];
- int i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 + 4];
- int16_t *mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 + 4];
- if( i_refc == -2 )
- {
- i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 - 1];
- mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 - 1];
- }
-
- int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref);
-
- if( i_count > 1 )
- {
-median:
- x264_median_mv( mvp, mv_a, mv_b, mv_c );
- }
- else if( i_count == 1 )
- {
- if( i_refa == i_ref )
- CP32( mvp, mv_a );
- else if( i_refb == i_ref )
- CP32( mvp, mv_b );
- else
- CP32( mvp, mv_c );
- }
- else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
- CP32( mvp, mv_a );
- else
- goto median;
-}
-
-
-void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] )
-{
- int i_refa = h->mb.cache.ref[0][X264_SCAN8_0 - 1];
- int i_refb = h->mb.cache.ref[0][X264_SCAN8_0 - 8];
- int16_t *mv_a = h->mb.cache.mv[0][X264_SCAN8_0 - 1];
- int16_t *mv_b = h->mb.cache.mv[0][X264_SCAN8_0 - 8];
-
- if( i_refa == -2 || i_refb == -2 ||
- !( i_refa | M32( mv_a ) ) ||
- !( i_refb | M32( mv_b ) ) )
- {
- M32( mv ) = 0;
- }
- else
- x264_mb_predict_mv_16x16( h, 0, 0, mv );
-}
-
-static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
-{
- int i_mb_4x4 = 16 * h->mb.i_mb_stride * h->mb.i_mb_y + 4 * h->mb.i_mb_x;
- int i_mb_8x8 = 4 * h->mb.i_mb_stride * h->mb.i_mb_y + 2 * h->mb.i_mb_x;
- const int type_col = h->fref1[0]->mb_type[h->mb.i_mb_xy];
- const int partition_col = h->fref1[0]->mb_partition[h->mb.i_mb_xy];
-
- x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 );
-
- h->mb.i_partition = partition_col;
-
- if( IS_INTRA( type_col ) )
- {
- x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
- x264_macroblock_cache_mv( h, 0, 0, 4, 4, 0, 0 );
- x264_macroblock_cache_mv( h, 0, 0, 4, 4, 1, 0 );
- return 1;
- }
-
- /* Don't do any checks other than the ones we have to, based
- * on the size of the colocated partitions.
- * Depends on the enum order: D_8x8, D_16x8, D_8x16, D_16x16 */
- int max_i8 = (D_16x16 - partition_col) + 1;
- int step = (partition_col == D_16x8) + 1;
- int width = 4 >> ((D_16x16 - partition_col)&1);
- int height = 4 >> ((D_16x16 - partition_col)>>1);
-
- for( int i8 = 0; i8 < max_i8; i8 += step )
- {
- int x8 = i8&1;
- int y8 = i8>>1;
- int i_part_8x8 = i_mb_8x8 + x8 + y8 * h->mb.i_b8_stride;
- int i_ref1_ref = h->fref1[0]->ref[0][i_part_8x8];
- int i_ref = (map_col_to_list0(i_ref1_ref>>h->sh.b_mbaff) << h->sh.b_mbaff) + (i_ref1_ref&h->sh.b_mbaff);
-
- if( i_ref >= 0 )
- {
- int dist_scale_factor = h->mb.dist_scale_factor[i_ref][0];
- int16_t *mv_col = h->fref1[0]->mv[0][i_mb_4x4 + 3*x8 + 3*y8 * h->mb.i_b4_stride];
- int l0x = ( dist_scale_factor * mv_col[0] + 128 ) >> 8;
- int l0y = ( dist_scale_factor * mv_col[1] + 128 ) >> 8;
- if( h->param.i_threads > 1 && (l0y > h->mb.mv_max_spel[1] || l0y-mv_col[1] > h->mb.mv_max_spel[1]) )
- return 0;
- x264_macroblock_cache_ref( h, 2*x8, 2*y8, width, height, 0, i_ref );
- x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, pack16to32_mask(l0x, l0y) );
- x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, pack16to32_mask(l0x-mv_col[0], l0y-mv_col[1]) );
- }
- else
- {
- /* the collocated ref isn't in the current list0 */
- /* FIXME: we might still be able to use direct_8x8 on some partitions */
- /* FIXME: with B-pyramid + extensive ref list reordering
- * (not currently used), we would also have to check
- * l1mv1 like in spatial mode */
- return 0;
- }
- }
-
- return 1;
-}
-
-static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
-{
- int8_t ref[2];
- ALIGNED_ARRAY_8( int16_t, mv,[2],[2] );
- const int8_t *l1ref0 = &h->fref1[0]->ref[0][h->mb.i_b8_xy];
- const int8_t *l1ref1 = &h->fref1[0]->ref[1][h->mb.i_b8_xy];
- const int16_t (*l1mv[2])[2] = { (const int16_t (*)[2]) &h->fref1[0]->mv[0][h->mb.i_b4_xy],
- (const int16_t (*)[2]) &h->fref1[0]->mv[1][h->mb.i_b4_xy] };
- const int type_col = h->fref1[0]->mb_type[h->mb.i_mb_xy];
- const int partition_col = h->fref1[0]->mb_partition[h->mb.i_mb_xy];
-
- h->mb.i_partition = partition_col;
-
- for( int i_list = 0; i_list < 2; i_list++ )
- {
- int i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1];
- int16_t *mv_a = h->mb.cache.mv[i_list][X264_SCAN8_0 - 1];
- int i_refb = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8];
- int16_t *mv_b = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8];
- int i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 + 4];
- int16_t *mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 + 4];
- if( i_refc == -2 )
- {
- i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 - 1];
- mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 - 1];
- }
-
- int i_ref = X264_MIN3( (unsigned)i_refa, (unsigned)i_refb, (unsigned)i_refc );
- if( i_ref < 0 )
- {
- i_ref = -1;
- M32( mv[i_list] ) = 0;
- }
- else
- {
- /* Same as x264_mb_predict_mv_16x16, but simplified to eliminate cases
- * not relevant to spatial direct. */
- int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref);
-
- if( i_count > 1 )
- x264_median_mv( mv[i_list], mv_a, mv_b, mv_c );
- else
- {
- if( i_refa == i_ref )
- CP32( mv[i_list], mv_a );
- else if( i_refb == i_ref )
- CP32( mv[i_list], mv_b );
- else
- CP32( mv[i_list], mv_c );
- }
- }
-
- x264_macroblock_cache_ref( h, 0, 0, 4, 4, i_list, i_ref );
- x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, i_list, mv[i_list] );
- ref[i_list] = i_ref;
- }
-
- if( (M16( ref ) & 0x8080) == 0x8080 ) /* if( ref[0] < 0 && ref[1] < 0 ) */
- {
- x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
- x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 );
- return 1;
- }
-
- if( h->param.i_threads > 1
- && ( mv[0][1] > h->mb.mv_max_spel[1]
- || mv[1][1] > h->mb.mv_max_spel[1] ) )
- {
-#if 0
- fprintf(stderr, "direct_spatial: (%d,%d) (%d,%d) > %d \n",
- mv[0][0], mv[0][1], mv[1][0], mv[1][1],
- h->mb.mv_max_spel[1]);
-#endif
- return 0;
- }
-
- if( !M64( mv ) || IS_INTRA( type_col ) || (ref[0]&&ref[1]) )
- return 1;
-
- /* Don't do any checks other than the ones we have to, based
- * on the size of the colocated partitions.
- * Depends on the enum order: D_8x8, D_16x8, D_8x16, D_16x16 */
- int max_i8 = (D_16x16 - partition_col) + 1;
- int step = (partition_col == D_16x8) + 1;
- int width = 4 >> ((D_16x16 - partition_col)&1);
- int height = 4 >> ((D_16x16 - partition_col)>>1);
-
- /* col_zero_flag */
- for( int i8 = 0; i8 < max_i8; i8 += step )
- {
- const int x8 = i8&1;
- const int y8 = i8>>1;
- const int o8 = x8 + y8 * h->mb.i_b8_stride;
- const int o4 = 3*(x8 + y8 * h->mb.i_b4_stride);
- int idx;
- if( l1ref0[o8] == 0 )
- idx = 0;
- else if( l1ref0[o8] < 0 && l1ref1[o8] == 0 )
- idx = 1;
- else
- continue;
-
- if( abs( l1mv[idx][o4][0] ) <= 1 && abs( l1mv[idx][o4][1] ) <= 1 )
- {
- if( ref[0] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, 0 );
- if( ref[1] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, 0 );
- }
- }
-
- return 1;
-}
-
-int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed )
-{
- int b_available;
- if( h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_NONE )
- return 0;
- else if( h->sh.b_direct_spatial_mv_pred )
- b_available = x264_mb_predict_mv_direct16x16_spatial( h );
- else
- b_available = x264_mb_predict_mv_direct16x16_temporal( h );
-
- if( b_changed != NULL && b_available )
- {
- int changed;
-
- changed = M32( h->mb.cache.direct_mv[0][0] ) ^ M32( h->mb.cache.mv[0][x264_scan8[0]] );
- changed |= M32( h->mb.cache.direct_mv[1][0] ) ^ M32( h->mb.cache.mv[1][x264_scan8[0]] );
- changed |= h->mb.cache.direct_ref[0][0] ^ h->mb.cache.ref[0][x264_scan8[0]];
- changed |= h->mb.cache.direct_ref[1][0] ^ h->mb.cache.ref[1][x264_scan8[0]];
- if( !changed && h->mb.i_partition != D_16x16 )
- {
- changed |= M32( h->mb.cache.direct_mv[0][3] ) ^ M32( h->mb.cache.mv[0][x264_scan8[12]] );
- changed |= M32( h->mb.cache.direct_mv[1][3] ) ^ M32( h->mb.cache.mv[1][x264_scan8[12]] );
- changed |= h->mb.cache.direct_ref[0][3] ^ h->mb.cache.ref[0][x264_scan8[12]];
- changed |= h->mb.cache.direct_ref[1][3] ^ h->mb.cache.ref[1][x264_scan8[12]];
- }
- if( !changed && h->mb.i_partition == D_8x8 )
- {
- changed |= M32( h->mb.cache.direct_mv[0][1] ) ^ M32( h->mb.cache.mv[0][x264_scan8[4]] );
- changed |= M32( h->mb.cache.direct_mv[1][1] ) ^ M32( h->mb.cache.mv[1][x264_scan8[4]] );
- changed |= M32( h->mb.cache.direct_mv[0][2] ) ^ M32( h->mb.cache.mv[0][x264_scan8[8]] );
- changed |= M32( h->mb.cache.direct_mv[1][2] ) ^ M32( h->mb.cache.mv[1][x264_scan8[8]] );
- changed |= h->mb.cache.direct_ref[0][1] ^ h->mb.cache.ref[0][x264_scan8[4]];
- changed |= h->mb.cache.direct_ref[1][1] ^ h->mb.cache.ref[1][x264_scan8[4]];
- changed |= h->mb.cache.direct_ref[0][2] ^ h->mb.cache.ref[0][x264_scan8[8]];
- changed |= h->mb.cache.direct_ref[1][2] ^ h->mb.cache.ref[1][x264_scan8[8]];
- }
- *b_changed = changed;
- if( !changed )
- return b_available;
- }
-
- /* cache ref & mv */
- if( b_available )
- for( int l = 0; l < 2; l++ )
- {
- CP32( h->mb.cache.direct_mv[l][0], h->mb.cache.mv[l][x264_scan8[ 0]] );
- CP32( h->mb.cache.direct_mv[l][1], h->mb.cache.mv[l][x264_scan8[ 4]] );
- CP32( h->mb.cache.direct_mv[l][2], h->mb.cache.mv[l][x264_scan8[ 8]] );
- CP32( h->mb.cache.direct_mv[l][3], h->mb.cache.mv[l][x264_scan8[12]] );
- h->mb.cache.direct_ref[l][0] = h->mb.cache.ref[l][x264_scan8[ 0]];
- h->mb.cache.direct_ref[l][1] = h->mb.cache.ref[l][x264_scan8[ 4]];
- h->mb.cache.direct_ref[l][2] = h->mb.cache.ref[l][x264_scan8[ 8]];
- h->mb.cache.direct_ref[l][3] = h->mb.cache.ref[l][x264_scan8[12]];
- h->mb.cache.direct_partition = h->mb.i_partition;
- }
-
- return b_available;
-}
-
-/* This just improves encoder performance, it's not part of the spec */
-void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[9][2], int *i_mvc )
-{
- int16_t (*mvr)[2] = h->mb.mvr[i_list][i_ref];
- int i = 0;
-
-#define SET_MVP(mvp)\
- { \
- CP32( mvc[i], mvp ); \
- i++; \
- }
-
- /* b_direct */
- if( h->sh.i_type == SLICE_TYPE_B
- && h->mb.cache.ref[i_list][x264_scan8[12]] == i_ref )
- {
- SET_MVP( h->mb.cache.mv[i_list][x264_scan8[12]] );
- }
-
- if( i_ref == 0 && h->frames.b_have_lowres )
- {
- int16_t (*lowres_mv)[2] = i_list ? h->fenc->lowres_mvs[1][h->fref1[0]->i_frame-h->fenc->i_frame-1]
- : h->fenc->lowres_mvs[0][h->fenc->i_frame-h->fref0[0]->i_frame-1];
- if( lowres_mv[0][0] != 0x7fff )
- {
- M32( mvc[i] ) = (M32( lowres_mv[h->mb.i_mb_xy] )*2)&0xfffeffff;
- i++;
- }
- }
-
- /* spatial predictors */
- if( h->mb.i_neighbour_frame & MB_LEFT )
- {
- SET_MVP( mvr[h->mb.i_mb_left_xy] );
- }
- if( h->mb.i_neighbour_frame & MB_TOP )
- {
- SET_MVP( mvr[h->mb.i_mb_top_xy] );
-
- if( h->mb.i_neighbour_frame & MB_TOPLEFT )
- SET_MVP( mvr[h->mb.i_mb_topleft_xy] );
- if( h->mb.i_neighbour_frame & MB_TOPRIGHT )
- SET_MVP( mvr[h->mb.i_mb_topright_xy] );
- }
-#undef SET_MVP
-
- /* temporal predictors */
- if( h->fref0[0]->i_ref[0] > 0 )
- {
- x264_frame_t *l0 = h->fref0[0];
- x264_frame_t **fref = i_list ? h->fref1 : h->fref0;
- int field = h->mb.i_mb_y&1;
- int curpoc = h->fdec->i_poc + field*h->sh.i_delta_poc_bottom;
- int refpoc = fref[i_ref>>h->sh.b_mbaff]->i_poc;
- if( h->sh.b_mbaff && field^(i_ref&1) )
- refpoc += h->sh.i_delta_poc_bottom;
-
-#define SET_TMVP(dx, dy) { \
- int i_b4 = h->mb.i_b4_xy + dx*4 + dy*4*h->mb.i_b4_stride; \
- int i_b8 = h->mb.i_b8_xy + dx*2 + dy*2*h->mb.i_b8_stride; \
- int ref_col = l0->ref[0][i_b8]; \
- if( ref_col >= 0 ) \
- { \
- int scale = (curpoc - refpoc) * l0->inv_ref_poc[h->mb.b_interlaced&field][ref_col];\
- mvc[i][0] = (l0->mv[0][i_b4][0]*scale + 128) >> 8;\
- mvc[i][1] = (l0->mv[0][i_b4][1]*scale + 128) >> 8;\
- i++; \
- } \
- }
-
- SET_TMVP(0,0);
- if( h->mb.i_mb_x < h->sps->i_mb_width-1 )
- SET_TMVP(1,0);
- if( h->mb.i_mb_y < h->sps->i_mb_height-1 )
- SET_TMVP(0,1);
-#undef SET_TMVP
- }
-
- *i_mvc = i;
-}
-
-/* Set up a lookup table for delta pocs to reduce an IDIV to an IMUL */
-static void setup_inverse_delta_pocs( x264_t *h )
-{
- for( int field = 0; field <= h->sh.b_mbaff; field++ )
- {
- int curpoc = h->fdec->i_poc + field*h->sh.i_delta_poc_bottom;
- for( int i = 0; i < (h->i_ref0<<h->sh.b_mbaff); i++ )
- {
- int refpoc = h->fref0[i>>h->sh.b_mbaff]->i_poc;
- if( h->sh.b_mbaff && field^(i&1) )
- refpoc += h->sh.i_delta_poc_bottom;
- int delta = curpoc - refpoc;
-
- h->fdec->inv_ref_poc[field][i] = (256 + delta/2) / delta;
- }
- }
-}
-
static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int height )
{
int i8 = x264_scan8[0]+x+8*y;
@@ -713,7 +250,7 @@
else if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_BLIND )
i_refs = X264_MIN(16, i_refs + 1); //blind weights add one duplicate frame
- for( int j = 0; j < i_refs; j++ )
+ for( int j = !i; j < i_refs; j++ )
CHECKED_MALLOC( h->mb.mvr[i][j], 2 * i_mb_count * sizeof(int16_t) );
}
@@ -758,12 +295,13 @@
}
return 0;
-fail: return -1;
+fail:
+ return -1;
}
void x264_macroblock_cache_free( x264_t *h )
{
for( int i = 0; i < 2; i++ )
- for( int j = 0; j < 32; j++ )
+ for( int j = !i; j < 32; j++ )
x264_free( h->mb.mvr[i][j] );
for( int i = 0; i < 16; i++ )
x264_free( h->mb.p_weight_buf[i] );
@@ -811,7 +349,8 @@
CHECKED_MALLOC( h->scratch_buffer, scratch_size );
return 0;
-fail: return -1;
+fail:
+ return -1;
}
void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
@@ -827,6 +366,7 @@
{
h->mb.mv[0] = h->fdec->mv[0];
h->mb.mv[1] = h->fdec->mv[1];
+ h->mb.mvr[0][0] = h->fdec->mv16x16;
h->mb.ref[0] = h->fdec->ref[0];
h->mb.ref[1] = h->fdec->ref[1];
h->mb.type = h->fdec->mb_type;
@@ -861,7 +401,17 @@
/* init with not available (for top right idx=7,15) */
memset( h->mb.cache.ref, -2, sizeof( h->mb.cache.ref ) );
- setup_inverse_delta_pocs( h );
+ if( h->i_ref0 > 0 )
+ for( int field = 0; field <= h->sh.b_mbaff; field++ )
+ {
+ int curpoc = h->fdec->i_poc + field*h->sh.i_delta_poc_bottom;
+ int refpoc = h->fref0[0]->i_poc;
+ if( h->sh.b_mbaff && field )
+ refpoc += h->sh.i_delta_poc_bottom;
+ int delta = curpoc - refpoc;
+
+ h->fdec->inv_ref_poc[field] = (256 + delta/2) / delta;
+ }
h->mb.i_neighbour4[6] =
h->mb.i_neighbour4[9] =
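
Note: the last macroblock.c hunk above replaces setup_inverse_delta_pocs() with a single per-field reciprocal for ref0; together with the new frame->mv16x16 array it lets SET_TMVP in the added mvpred.c below scale the colocated 16x16 MV with a multiply and shift instead of a per-macroblock division. A rough worked example of the (256 + delta/2) / delta rounding, for illustration only:

    /* Sketch only: delta = poc distance used to build the reciprocal.
     * delta = 4            ->  inv_ref_poc = (256 + 2) / 4 = 64   (~ 256/4, rounded)
     * curpoc - refpoc = 2  ->  scale = 2 * 64 = 128
     * temporal predictor   ->  (mv * 128 + 128) >> 8  ==  (mv + 1) >> 1  ~  mv * 2/4 */
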
Changed: x264-snapshot-20100517-2245.tar.bz2/common/mc.c
@@ -97,9 +97,9 @@
uint8_t *pix2, int i_stride_pix2, \
uint8_t *pix3, int i_stride_pix3, int weight ) \
{ \
- if( weight == 32 )\
+ if( weight == 32 ) \
pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
- else\
+ else \
pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height, weight ); \
}
PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
@@ -203,8 +203,8 @@
}
}
-static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
-static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
+static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
static void mc_luma( uint8_t *dst, int i_dst_stride,
uint8_t *src[4], int i_src_stride,
@@ -427,7 +427,7 @@
for( int i = 0; i < len; i++ )
{
int propagate_amount = propagate_in[i] + ((intra_costs[i] * inv_qscales[i] + 128)>>8);
- dst[i] = div_64_32((int64_t)propagate_amount * (intra_costs[i] - inter_costs[i]), intra_costs[i]);
+ dst[i] = div_64_32((int64_t)propagate_amount * (intra_costs[i] - (inter_costs[i] & LOWRES_COST_MASK)), intra_costs[i]);
}
}
Added: x264-snapshot-20100517-2245.tar.bz2/common/mvpred.c
@@ -0,0 +1,466 @@
+/*****************************************************************************
+ * mvpred.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003-2008 x264 project
+ *
+ * Authors: Loren Merritt <lorenm@u.washington.edu>
+ * Jason Garrett-Glaser <darkshikari@gmail.com>
+ * Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "common.h"
+
+void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] )
+{
+ const int i8 = x264_scan8[idx];
+ const int i_ref= h->mb.cache.ref[i_list][i8];
+ int i_refa = h->mb.cache.ref[i_list][i8 - 1];
+ int16_t *mv_a = h->mb.cache.mv[i_list][i8 - 1];
+ int i_refb = h->mb.cache.ref[i_list][i8 - 8];
+ int16_t *mv_b = h->mb.cache.mv[i_list][i8 - 8];
+ int i_refc = h->mb.cache.ref[i_list][i8 - 8 + i_width];
+ int16_t *mv_c = h->mb.cache.mv[i_list][i8 - 8 + i_width];
+
+ if( (idx&3) >= 2 + (i_width&1) || i_refc == -2 )
+ {
+ i_refc = h->mb.cache.ref[i_list][i8 - 8 - 1];
+ mv_c = h->mb.cache.mv[i_list][i8 - 8 - 1];
+ }
+
+ if( h->mb.i_partition == D_16x8 )
+ {
+ if( idx == 0 )
+ {
+ if( i_refb == i_ref )
+ {
+ CP32( mvp, mv_b );
+ return;
+ }
+ }
+ else
+ {
+ if( i_refa == i_ref )
+ {
+ CP32( mvp, mv_a );
+ return;
+ }
+ }
+ }
+ else if( h->mb.i_partition == D_8x16 )
+ {
+ if( idx == 0 )
+ {
+ if( i_refa == i_ref )
+ {
+ CP32( mvp, mv_a );
+ return;
+ }
+ }
+ else
+ {
+ if( i_refc == i_ref )
+ {
+ CP32( mvp, mv_c );
+ return;
+ }
+ }
+ }
+
+ int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref);
+
+ if( i_count > 1 )
+ {
+median:
+ x264_median_mv( mvp, mv_a, mv_b, mv_c );
+ }
+ else if( i_count == 1 )
+ {
+ if( i_refa == i_ref )
+ CP32( mvp, mv_a );
+ else if( i_refb == i_ref )
+ CP32( mvp, mv_b );
+ else
+ CP32( mvp, mv_c );
+ }
+ else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
+ CP32( mvp, mv_a );
+ else
+ goto median;
+}
+
+void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int16_t mvp[2] )
+{
+ int i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1];
+ int16_t *mv_a = h->mb.cache.mv[i_list][X264_SCAN8_0 - 1];
+ int i_refb = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8];
+ int16_t *mv_b = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8];
+ int i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 + 4];
+ int16_t *mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 + 4];
+ if( i_refc == -2 )
+ {
+ i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 - 1];
+ mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 - 1];
+ }
+
+ int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref);
+
+ if( i_count > 1 )
+ {
+median:
+ x264_median_mv( mvp, mv_a, mv_b, mv_c );
+ }
+ else if( i_count == 1 )
+ {
+ if( i_refa == i_ref )
+ CP32( mvp, mv_a );
+ else if( i_refb == i_ref )
+ CP32( mvp, mv_b );
+ else
+ CP32( mvp, mv_c );
+ }
+ else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
+ CP32( mvp, mv_a );
+ else
+ goto median;
+}
+
+
+void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] )
+{
+ int i_refa = h->mb.cache.ref[0][X264_SCAN8_0 - 1];
+ int i_refb = h->mb.cache.ref[0][X264_SCAN8_0 - 8];
+ int16_t *mv_a = h->mb.cache.mv[0][X264_SCAN8_0 - 1];
+ int16_t *mv_b = h->mb.cache.mv[0][X264_SCAN8_0 - 8];
+
+ if( i_refa == -2 || i_refb == -2 ||
+ !( i_refa | M32( mv_a ) ) ||
+ !( i_refb | M32( mv_b ) ) )
+ {
+ M32( mv ) = 0;
+ }
+ else
+ x264_mb_predict_mv_16x16( h, 0, 0, mv );
+}
+
+static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
+{
+ int i_mb_4x4 = 16 * h->mb.i_mb_stride * h->mb.i_mb_y + 4 * h->mb.i_mb_x;
+ int i_mb_8x8 = 4 * h->mb.i_mb_stride * h->mb.i_mb_y + 2 * h->mb.i_mb_x;
+ const int type_col = h->fref1[0]->mb_type[h->mb.i_mb_xy];
+ const int partition_col = h->fref1[0]->mb_partition[h->mb.i_mb_xy];
+
+ x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 );
+
+ h->mb.i_partition = partition_col;
+
+ if( IS_INTRA( type_col ) )
+ {
+ x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
+ x264_macroblock_cache_mv( h, 0, 0, 4, 4, 0, 0 );
+ x264_macroblock_cache_mv( h, 0, 0, 4, 4, 1, 0 );
+ return 1;
+ }
+
+ /* Don't do any checks other than the ones we have to, based
+ * on the size of the colocated partitions.
+ * Depends on the enum order: D_8x8, D_16x8, D_8x16, D_16x16 */
+ int max_i8 = (D_16x16 - partition_col) + 1;
+ int step = (partition_col == D_16x8) + 1;
+ int width = 4 >> ((D_16x16 - partition_col)&1);
+ int height = 4 >> ((D_16x16 - partition_col)>>1);
+
+ for( int i8 = 0; i8 < max_i8; i8 += step )
+ {
+ int x8 = i8&1;
+ int y8 = i8>>1;
+ int i_part_8x8 = i_mb_8x8 + x8 + y8 * h->mb.i_b8_stride;
+ int i_ref1_ref = h->fref1[0]->ref[0][i_part_8x8];
+ int i_ref = (map_col_to_list0(i_ref1_ref>>h->sh.b_mbaff) << h->sh.b_mbaff) + (i_ref1_ref&h->sh.b_mbaff);
+
+ if( i_ref >= 0 )
+ {
+ int dist_scale_factor = h->mb.dist_scale_factor[i_ref][0];
+ int16_t *mv_col = h->fref1[0]->mv[0][i_mb_4x4 + 3*x8 + 3*y8 * h->mb.i_b4_stride];
+ int l0x = ( dist_scale_factor * mv_col[0] + 128 ) >> 8;
+ int l0y = ( dist_scale_factor * mv_col[1] + 128 ) >> 8;
+ if( h->param.i_threads > 1 && (l0y > h->mb.mv_max_spel[1] || l0y-mv_col[1] > h->mb.mv_max_spel[1]) )
+ return 0;
+ x264_macroblock_cache_ref( h, 2*x8, 2*y8, width, height, 0, i_ref );
+ x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, pack16to32_mask(l0x, l0y) );
+ x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, pack16to32_mask(l0x-mv_col[0], l0y-mv_col[1]) );
+ }
+ else
+ {
+ /* the collocated ref isn't in the current list0 */
+ /* FIXME: we might still be able to use direct_8x8 on some partitions */
+ /* FIXME: with B-pyramid + extensive ref list reordering
+ * (not currently used), we would also have to check
+ * l1mv1 like in spatial mode */
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
+{
+ int8_t ref[2];
+ ALIGNED_ARRAY_8( int16_t, mv,[2],[2] );
+ const int8_t *l1ref0 = &h->fref1[0]->ref[0][h->mb.i_b8_xy];
+ const int8_t *l1ref1 = &h->fref1[0]->ref[1][h->mb.i_b8_xy];
+ const int16_t (*l1mv[2])[2] = { (const int16_t (*)[2]) &h->fref1[0]->mv[0][h->mb.i_b4_xy],
+ (const int16_t (*)[2]) &h->fref1[0]->mv[1][h->mb.i_b4_xy] };
+ const int type_col = h->fref1[0]->mb_type[h->mb.i_mb_xy];
+ const int partition_col = h->fref1[0]->mb_partition[h->mb.i_mb_xy];
+
+ h->mb.i_partition = partition_col;
+
+ for( int i_list = 0; i_list < 2; i_list++ )
+ {
+ int i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1];
+ int16_t *mv_a = h->mb.cache.mv[i_list][X264_SCAN8_0 - 1];
+ int i_refb = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8];
+ int16_t *mv_b = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8];
+ int i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 + 4];
+ int16_t *mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 + 4];
+ if( i_refc == -2 )
+ {
+ i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 - 1];
+ mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 - 1];
+ }
+
+ int i_ref = X264_MIN3( (unsigned)i_refa, (unsigned)i_refb, (unsigned)i_refc );
+ if( i_ref < 0 )
+ {
+ i_ref = -1;
+ M32( mv[i_list] ) = 0;
+ }
+ else
+ {
+ /* Same as x264_mb_predict_mv_16x16, but simplified to eliminate cases
+ * not relevant to spatial direct. */
+ int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref);
+
+ if( i_count > 1 )
+ x264_median_mv( mv[i_list], mv_a, mv_b, mv_c );
+ else
+ {
+ if( i_refa == i_ref )
+ CP32( mv[i_list], mv_a );
+ else if( i_refb == i_ref )
+ CP32( mv[i_list], mv_b );
+ else
+ CP32( mv[i_list], mv_c );
+ }
+ }
+
+ x264_macroblock_cache_ref( h, 0, 0, 4, 4, i_list, i_ref );
+ x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, i_list, mv[i_list] );
+ ref[i_list] = i_ref;
+ }
+
+ if( (M16( ref ) & 0x8080) == 0x8080 ) /* if( ref[0] < 0 && ref[1] < 0 ) */
+ {
+ x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
+ x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 );
+ return 1;
+ }
+
+ if( h->param.i_threads > 1
+ && ( mv[0][1] > h->mb.mv_max_spel[1]
+ || mv[1][1] > h->mb.mv_max_spel[1] ) )
+ {
+#if 0
+ fprintf(stderr, "direct_spatial: (%d,%d) (%d,%d) > %d \n",
+ mv[0][0], mv[0][1], mv[1][0], mv[1][1],
+ h->mb.mv_max_spel[1]);
+#endif
+ return 0;
+ }
+
+ if( !M64( mv ) || IS_INTRA( type_col ) || (ref[0]&&ref[1]) )
+ return 1;
+
+ /* Don't do any checks other than the ones we have to, based
+ * on the size of the colocated partitions.
+ * Depends on the enum order: D_8x8, D_16x8, D_8x16, D_16x16 */
+ int max_i8 = (D_16x16 - partition_col) + 1;
+ int step = (partition_col == D_16x8) + 1;
+ int width = 4 >> ((D_16x16 - partition_col)&1);
+ int height = 4 >> ((D_16x16 - partition_col)>>1);
+
+ /* col_zero_flag */
+ for( int i8 = 0; i8 < max_i8; i8 += step )
+ {
+ const int x8 = i8&1;
+ const int y8 = i8>>1;
+ const int o8 = x8 + y8 * h->mb.i_b8_stride;
+ const int o4 = 3*(x8 + y8 * h->mb.i_b4_stride);
+ int idx;
+ if( l1ref0[o8] == 0 )
+ idx = 0;
+ else if( l1ref0[o8] < 0 && l1ref1[o8] == 0 )
+ idx = 1;
+ else
+ continue;
+
+ if( abs( l1mv[idx][o4][0] ) <= 1 && abs( l1mv[idx][o4][1] ) <= 1 )
+ {
+ if( ref[0] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, 0 );
+ if( ref[1] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, 0 );
+ }
+ }
+
+ return 1;
+}
+
+int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed )
+{
+ int b_available;
+ if( h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_NONE )
+ return 0;
+ else if( h->sh.b_direct_spatial_mv_pred )
+ b_available = x264_mb_predict_mv_direct16x16_spatial( h );
+ else
+ b_available = x264_mb_predict_mv_direct16x16_temporal( h );
+
+ if( b_changed != NULL && b_available )
+ {
+ int changed;
+
+ changed = M32( h->mb.cache.direct_mv[0][0] ) ^ M32( h->mb.cache.mv[0][x264_scan8[0]] );
+ changed |= M32( h->mb.cache.direct_mv[1][0] ) ^ M32( h->mb.cache.mv[1][x264_scan8[0]] );
+ changed |= h->mb.cache.direct_ref[0][0] ^ h->mb.cache.ref[0][x264_scan8[0]];
+ changed |= h->mb.cache.direct_ref[1][0] ^ h->mb.cache.ref[1][x264_scan8[0]];
+ if( !changed && h->mb.i_partition != D_16x16 )
+ {
+ changed |= M32( h->mb.cache.direct_mv[0][3] ) ^ M32( h->mb.cache.mv[0][x264_scan8[12]] );
+ changed |= M32( h->mb.cache.direct_mv[1][3] ) ^ M32( h->mb.cache.mv[1][x264_scan8[12]] );
+ changed |= h->mb.cache.direct_ref[0][3] ^ h->mb.cache.ref[0][x264_scan8[12]];
+ changed |= h->mb.cache.direct_ref[1][3] ^ h->mb.cache.ref[1][x264_scan8[12]];
+ }
+ if( !changed && h->mb.i_partition == D_8x8 )
+ {
+ changed |= M32( h->mb.cache.direct_mv[0][1] ) ^ M32( h->mb.cache.mv[0][x264_scan8[4]] );
+ changed |= M32( h->mb.cache.direct_mv[1][1] ) ^ M32( h->mb.cache.mv[1][x264_scan8[4]] );
+ changed |= M32( h->mb.cache.direct_mv[0][2] ) ^ M32( h->mb.cache.mv[0][x264_scan8[8]] );
+ changed |= M32( h->mb.cache.direct_mv[1][2] ) ^ M32( h->mb.cache.mv[1][x264_scan8[8]] );
+ changed |= h->mb.cache.direct_ref[0][1] ^ h->mb.cache.ref[0][x264_scan8[4]];
+ changed |= h->mb.cache.direct_ref[1][1] ^ h->mb.cache.ref[1][x264_scan8[4]];
+ changed |= h->mb.cache.direct_ref[0][2] ^ h->mb.cache.ref[0][x264_scan8[8]];
+ changed |= h->mb.cache.direct_ref[1][2] ^ h->mb.cache.ref[1][x264_scan8[8]];
+ }
+ *b_changed = changed;
+ if( !changed )
+ return b_available;
+ }
+
+ /* cache ref & mv */
+ if( b_available )
+ for( int l = 0; l < 2; l++ )
+ {
+ CP32( h->mb.cache.direct_mv[l][0], h->mb.cache.mv[l][x264_scan8[ 0]] );
+ CP32( h->mb.cache.direct_mv[l][1], h->mb.cache.mv[l][x264_scan8[ 4]] );
+ CP32( h->mb.cache.direct_mv[l][2], h->mb.cache.mv[l][x264_scan8[ 8]] );
+ CP32( h->mb.cache.direct_mv[l][3], h->mb.cache.mv[l][x264_scan8[12]] );
+ h->mb.cache.direct_ref[l][0] = h->mb.cache.ref[l][x264_scan8[ 0]];
+ h->mb.cache.direct_ref[l][1] = h->mb.cache.ref[l][x264_scan8[ 4]];
+ h->mb.cache.direct_ref[l][2] = h->mb.cache.ref[l][x264_scan8[ 8]];
+ h->mb.cache.direct_ref[l][3] = h->mb.cache.ref[l][x264_scan8[12]];
+ h->mb.cache.direct_partition = h->mb.i_partition;
+ }
+
+ return b_available;
+}
+
+/* This just improves encoder performance, it's not part of the spec */
+void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[9][2], int *i_mvc )
+{
+ int16_t (*mvr)[2] = h->mb.mvr[i_list][i_ref];
+ int i = 0;
+
+#define SET_MVP(mvp) \
+ { \
+ CP32( mvc[i], mvp ); \
+ i++; \
+ }
+
+ /* b_direct */
+ if( h->sh.i_type == SLICE_TYPE_B
+ && h->mb.cache.ref[i_list][x264_scan8[12]] == i_ref )
+ {
+ SET_MVP( h->mb.cache.mv[i_list][x264_scan8[12]] );
+ }
+
+ if( i_ref == 0 && h->frames.b_have_lowres )
+ {
+ int16_t (*lowres_mv)[2] = i_list ? h->fenc->lowres_mvs[1][h->fref1[0]->i_frame-h->fenc->i_frame-1]
+ : h->fenc->lowres_mvs[0][h->fenc->i_frame-h->fref0[0]->i_frame-1];
+ if( lowres_mv[0][0] != 0x7fff )
+ {
+ M32( mvc[i] ) = (M32( lowres_mv[h->mb.i_mb_xy] )*2)&0xfffeffff;
+ i++;
+ }
+ }
+
+ /* spatial predictors */
+ if( h->mb.i_neighbour_frame & MB_LEFT )
+ {
+ SET_MVP( mvr[h->mb.i_mb_left_xy] );
+ }
+ if( h->mb.i_neighbour_frame & MB_TOP )
+ {
+ SET_MVP( mvr[h->mb.i_mb_top_xy] );
+
+ if( h->mb.i_neighbour_frame & MB_TOPLEFT )
+ SET_MVP( mvr[h->mb.i_mb_topleft_xy] );
+ if( h->mb.i_neighbour_frame & MB_TOPRIGHT )
+ SET_MVP( mvr[h->mb.i_mb_topright_xy] );
+ }
+#undef SET_MVP
+
+ /* temporal predictors */
+ if( h->fref0[0]->i_ref[0] > 0 )
+ {
+ x264_frame_t *l0 = h->fref0[0];
+ x264_frame_t **fref = i_list ? h->fref1 : h->fref0;
+ int field = h->mb.i_mb_y&1;
+ int curpoc = h->fdec->i_poc + field*h->sh.i_delta_poc_bottom;
+ int refpoc = fref[i_ref>>h->sh.b_mbaff]->i_poc;
+ if( h->sh.b_mbaff && field^(i_ref&1) )
+ refpoc += h->sh.i_delta_poc_bottom;
+
+#define SET_TMVP( dx, dy ) \
+ { \
+ int mb_index = h->mb.i_mb_xy + dx + dy*h->mb.i_mb_stride; \
+ int scale = (curpoc - refpoc) * l0->inv_ref_poc[h->mb.b_interlaced&field]; \
+ mvc[i][0] = (l0->mv16x16[mb_index][0]*scale + 128) >> 8; \
+ mvc[i][1] = (l0->mv16x16[mb_index][1]*scale + 128) >> 8; \
+ i++; \
+ }
+
+ SET_TMVP(0,0);
+ if( h->mb.i_mb_x < h->sps->i_mb_width-1 )
+ SET_TMVP(1,0);
+ if( h->mb.i_mb_y < h->sps->i_mb_height-1 )
+ SET_TMVP(0,1);
+#undef SET_TMVP
+ }
+
+ *i_mvc = i;
+}
|
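Note on the SET_TMVP macro above: it scales the co-located frame's 16x16 motion vector by the ratio of POC distances, with inv_ref_poc acting as a precomputed 8.8 fixed-point reciprocal and +128 providing rounding before the final shift. A minimal C sketch of that scaling step (function and parameter names here are illustrative, not part of mvpred.c):

    #include <stdint.h>

    /* mv is a quarter-pel MV component from l0->mv16x16; cur_poc_dist is
     * (curpoc - refpoc); inv_poc_dist stands in for inv_ref_poc, assumed to
     * hold roughly 256 / POC-distance of the co-located reference. */
    static inline int16_t scale_temporal_mv( int mv, int cur_poc_dist, int inv_poc_dist )
    {
        int scale = cur_poc_dist * inv_poc_dist;      /* distance ratio in 8.8 fixed point */
        return (int16_t)( (mv * scale + 128) >> 8 );  /* multiply, round, drop the fraction */
    }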
[-]
[+]
|
Changed |
x264-snapshot-20100517-2245.tar.bz2/common/ppc/dct.c
^
|
@@ -205,7 +205,7 @@
vec_st( dct_tr1v, 16, (signed short *)dct );
vec_st( dct_tr2v, 32, (signed short *)dct );
vec_st( dct_tr3v, 48, (signed short *)dct );
-
+
vec_st( dct_tr4v, 64, (signed short *)dct );
vec_st( dct_tr5v, 80, (signed short *)dct );
vec_st( dct_tr6v, 96, (signed short *)dct );
|
[-]
[+]
|
Changed |
x264-snapshot-20100517-2245.tar.bz2/common/ppc/mc.c
^
|
@@ -37,8 +37,8 @@
uint8_t *dst, int i_dst, int i_height );
-static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
-static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
+static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
@@ -291,8 +291,8 @@
}
-#define DO_PROCESS_W4( a ) \
- dstv_16A = vec_mladd( src##a##v_16A, coeff##a##v, dstv_16A ); \
+#define DO_PROCESS_W4( a ) \
+ dstv_16A = vec_mladd( src##a##v_16A, coeff##a##v, dstv_16A ); \
dstv_16B = vec_mladd( src##a##v_16B, coeff##a##v, dstv_16B )
static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
@@ -369,10 +369,10 @@
}
}
-#define DO_PROCESS_W8( a ) \
- src##a##v_16A = vec_u8_to_u16( src##a##v_8A ); \
- src##a##v_16B = vec_u8_to_u16( src##a##v_8B ); \
- dstv_16A = vec_mladd( src##a##v_16A, coeff##a##v, dstv_16A ); \
+#define DO_PROCESS_W8( a ) \
+ src##a##v_16A = vec_u8_to_u16( src##a##v_8A ); \
+ src##a##v_16B = vec_u8_to_u16( src##a##v_8B ); \
+ dstv_16A = vec_mladd( src##a##v_16A, coeff##a##v, dstv_16A ); \
dstv_16B = vec_mladd( src##a##v_16B, coeff##a##v, dstv_16B )
static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
|
[-]
[+]
|
Changed |
x264-snapshot-20100517-2245.tar.bz2/common/ppc/ppccommon.h
^
|
@@ -113,13 +113,13 @@
vec_u8_t _hv, _lv
#define PREP_LOAD_SRC( src ) \
- vec_u8_t _##src##_ = vec_lvsl(0, src)
+ vec_u8_t _##src##_ = vec_lvsl(0, src)
#define VEC_LOAD_G( p, v, n, t ) \
_hv = vec_ld( 0, p ); \
v = (t) vec_lvsl( 0, p ); \
_lv = vec_ld( n - 1, p ); \
- v = (t) vec_perm( _hv, _lv, (vec_u8_t) v )
+ v = (t) vec_perm( _hv, _lv, (vec_u8_t) v )
#define VEC_LOAD( p, v, n, t, g ) \
_hv = vec_ld( 0, p ); \
@@ -134,7 +134,7 @@
#define VEC_LOAD_PARTIAL( p, v, n, t, g) \
_hv = vec_ld( 0, p); \
v = (t) vec_perm( _hv, _hv, (vec_u8_t) _##g##_ )
-
+
/***********************************************************************
* PREP_STORE##n: declares required vectors to store n bytes to a
@@ -155,7 +155,7 @@
_lv = vec_perm( (vec_u8_t) v, _tmp1v, _##o##r_ ); \
vec_st( _lv, 15, (uint8_t *) p ); \
_hv = vec_perm( _tmp1v, (vec_u8_t) v, _##o##r_ ); \
- vec_st( _hv, 0, (uint8_t *) p )
+ vec_st( _hv, 0, (uint8_t *) p )
#define PREP_STORE8 \
|
[-]
[+]
|
Changed |
x264-snapshot-20100517-2245.tar.bz2/common/ppc/quant.c
^
|
@@ -20,7 +20,7 @@
#include "common/common.h"
#include "ppccommon.h"
-#include "quant.h"
+#include "quant.h"
// quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
#define QUANT_16_U( idx0, idx1 ) \
@@ -55,7 +55,7 @@
nz = vec_or(nz, vec_or(temp1v, temp2v)); \
vec_st(temp2v, (idx1), (int16_t*)dct); \
}
-
+
int x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
{
LOAD_ZERO;
@@ -220,7 +220,7 @@
vec_u16_t biasvB;
vec_s16_t temp1v, temp2v;
-
+
vec_u32_u qbits_u;
qbits_u.s[0]=16;
i_qbitsv = vec_splat(qbits_u.v, 0);
|
[-]
[+]
|
Changed |
x264-snapshot-20100517-2245.tar.bz2/common/predict.c
^
|
@@ -41,7 +41,7 @@
* 16x16 prediction for intra luma block
****************************************************************************/
-#define PREDICT_16x16_DC(v) \
+#define PREDICT_16x16_DC(v)\
for( int i = 0; i < 16; i++ )\
{\
M32( src+ 0 ) = v;\
|
[-]
[+]
|
Changed |
x264-snapshot-20100517-2245.tar.bz2/common/set.c
^
|
@@ -23,7 +23,7 @@
#define SHIFT(x,s) ((s)<=0 ? (x)<<-(s) : ((x)+(1<<((s)-1)))>>(s))
#define DIV(n,d) (((n) + ((d)>>1)) / (d))
-static const int dequant4_scale[6][3] =
+static const uint8_t dequant4_scale[6][3] =
{
{ 10, 13, 16 },
{ 11, 14, 18 },
@@ -32,7 +32,7 @@
{ 16, 20, 25 },
{ 18, 23, 29 }
};
-static const int quant4_scale[6][3] =
+static const uint16_t quant4_scale[6][3] =
{
{ 13107, 8066, 5243 },
{ 11916, 7490, 4660 },
@@ -42,11 +42,11 @@
{ 7282, 4559, 2893 },
};
-static const int quant8_scan[16] =
+static const uint8_t quant8_scan[16] =
{
0,3,4,3, 3,1,5,1, 4,5,2,5, 3,1,5,1
};
-static const int dequant8_scale[6][6] =
+static const uint8_t dequant8_scale[6][6] =
{
{ 20, 18, 32, 19, 25, 24 },
{ 22, 19, 35, 21, 28, 26 },
@@ -55,7 +55,7 @@
{ 32, 28, 51, 30, 40, 38 },
{ 36, 32, 58, 34, 46, 43 },
};
-static const int quant8_scale[6][6] =
+static const uint16_t quant8_scale[6][6] =
{
{ 13107, 11428, 20972, 12222, 16777, 15481 },
{ 11916, 10826, 19174, 11058, 14980, 14290 },
|
[-]
[+]
|
Changed |
x264-snapshot-20100517-2245.tar.bz2/common/x86/cabac-a.asm
^
|
@@ -24,23 +24,21 @@
%include "x86inc.asm"
-SECTION_RODATA
-
SECTION .text
-cextern x264_cabac_range_lps
-cextern x264_cabac_transition
-cextern x264_cabac_renorm_shift
+cextern cabac_range_lps
+cextern cabac_transition
+cextern cabac_renorm_shift
; t3 must be ecx, since it's used for shift.
%ifdef WIN64
- DECLARE_REG_TMP 3,1,2,0,4,5,6,10
+ DECLARE_REG_TMP 3,1,2,0,4,5,6,10,2
%define pointer resq
%elifdef ARCH_X86_64
- DECLARE_REG_TMP 0,1,2,3,4,5,6,10
+ DECLARE_REG_TMP 0,1,2,3,4,5,6,10,6
%define pointer resq
%else
- DECLARE_REG_TMP 0,4,2,1,3,5,6,2
+ DECLARE_REG_TMP 0,4,2,1,3,5,6,2,2
%define pointer resd
%endif
@@ -70,17 +68,19 @@
%endif
%endmacro
-cglobal x264_cabac_encode_decision_asm, 0,7
+cglobal cabac_encode_decision_asm, 0,7
movifnidn t0, r0mp
movifnidn t1d, r1m
mov t5d, [t0+cb.range]
- movzx t6d, byte [t0+cb.state+t1]
+ movzx t4d, byte [t0+cb.state+t1]
mov t3d, t5d
+ mov t6d, t4d
shr t5d, 6
+ shr t4d, 1
movifnidn t2d, r2m
- LOAD_GLOBAL t5d, x264_cabac_range_lps-4, t5, t6*4
- LOAD_GLOBAL t4d, x264_cabac_transition, t2, t6*2
- shr t6d, 6
+ LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*4
+ LOAD_GLOBAL t4d, cabac_transition, t2, t6*2
+ and t6d, 1
sub t3d, t5d
cmp t6d, t2d
mov t6d, [t0+cb.low]
@@ -88,28 +88,74 @@
cmovne t3d, t5d
cmovne t6d, t7d
mov [t0+cb.state+t1], t4b
-;x264_cabac_encode_renorm
+;cabac_encode_renorm
mov t4d, t3d
shr t3d, 3
- LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
+ LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3
shl t4d, t3b
shl t6d, t3b
add t3d, [t0+cb.queue]
mov [t0+cb.range], t4d
- cmp t3d, 8
- jl .update_queue_low
-;x264_cabac_putbyte
+ jge cabac_putbyte
+.update_queue_low:
+ mov [t0+cb.low], t6d
+ mov [t0+cb.queue], t3d
+ RET
+
+cglobal cabac_encode_bypass_asm, 0,3
+ movifnidn t0, r0mp
+ movifnidn t3d, r1m
+ neg t3d
+ mov t8d, [t0+cb.low]
+ and t3d, [t0+cb.range]
+ lea t8d, [t8*2+t3]
+ mov t3d, [t0+cb.queue]
+ inc t3d
+%ifdef UNIX64 ; .putbyte compiles to nothing but a jmp
+ jge cabac_putbyte
+%else
+ jge .putbyte
+%endif
+ mov [t0+cb.low], t8d
+ mov [t0+cb.queue], t3d
+ RET
+.putbyte:
+ PROLOGUE 0,7
+ movifnidn t6d, t8d
+ jmp cabac_putbyte
+
+cglobal cabac_encode_terminal_asm, 0,3
+ movifnidn t0, r0mp
+ sub dword [t0+cb.range], 2
+; shortcut: the renormalization shift in terminal
+; can only be 0 or 1 and is zero over 99% of the time.
+ test dword [t0+cb.range], 0x100
+ je .renorm
+ REP_RET
+.renorm:
+ shl dword [t0+cb.low], 1
+ shl dword [t0+cb.range], 1
+ inc dword [t0+cb.queue]
+ jge .putbyte
+ REP_RET
+.putbyte:
+ PROLOGUE 0,7
+ mov t3d, [t0+cb.queue]
+ mov t6d, [t0+cb.low]
+ jmp cabac_putbyte
+
+cabac_putbyte:
; alive: t0=cb t3=queue t6=low
%ifdef WIN64
DECLARE_REG_TMP 3,4,1,0,2,5,6,10
%endif
mov t1d, -1
- add t3d, 2
+ add t3d, 10
mov t2d, t6d
shl t1d, t3b
shr t2d, t3b ; out
not t1d
- sub t3d, 10
+ sub t3d, 18
and t6d, t1d
mov t5d, [t0+cb.bytes_outstanding]
cmp t2b, 0xff ; FIXME is a 32bit op faster?
@@ -127,8 +173,4 @@
.postpone:
inc t5d
mov [t0+cb.bytes_outstanding], t5d
-.update_queue_low:
- mov [t0+cb.low], t6d
- mov [t0+cb.queue], t3d
- RET
-
+ jmp mangle(x264_cabac_encode_decision_asm.update_queue_low)
|
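Note on the rewritten cabac_encode_decision_asm above: the context byte is treated as a packed value whose low bit is the MPS and whose upper bits index the 64-row cabac_range_lps table (hence the shr t4d, 1 / and t6d, 1 pair), and the range/low updates use cmovne rather than a branch. A minimal C sketch of the same per-bin logic, assuming that packed-state layout (struct and field names are illustrative, not the exact x264_cabac_t):

    #include <stdint.h>

    extern const uint8_t cabac_range_lps[64][4];   /* LPS range per probability state */
    extern const uint8_t cabac_transition[128][2]; /* next packed state per coded bin  */

    typedef struct
    {
        int     i_low;
        int     i_range;
        uint8_t state[1024];
    } cabac_sketch_t;

    static void cabac_encode_decision_sketch( cabac_sketch_t *cb, int i_ctx, int b )
    {
        int state = cb->state[i_ctx];               /* (prob_state << 1) | MPS */
        int lps   = cabac_range_lps[state>>1][(cb->i_range>>6)&3];
        cb->i_range -= lps;
        if( b != (state & 1) )                      /* coded the less probable symbol */
        {
            cb->i_low  += cb->i_range;
            cb->i_range = lps;
        }
        cb->state[i_ctx] = cabac_transition[state][b];
        /* renormalization and byte output follow, as in cabac_putbyte above */
    }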
[-]
[+]
|
Added |
x264-snapshot-20100517-2245.tar.bz2/common/x86/const-a.asm
^
|
@@ -0,0 +1,54 @@
+;*****************************************************************************
+;* const-a.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2010 x264 project
+;*
+;* Author: Loren Merritt <lorenm@u.washington.edu>
+;* Jason Garrett-Glaser <darkshikari@gmail.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*****************************************************************************
+
+%include "x86inc.asm"
+
+SECTION_RODATA
+
+const pb_01, times 8 db 0,1
+const pb_0, times 16 db 0
+const pb_a1, times 16 db 0xa1
+const pb_1, times 16 db 1
+const pb_3, times 16 db 3
+const hsub_mul, times 8 db 1, -1
+const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
+
+const pw_1, times 8 dw 1
+const pw_2, times 8 dw 2
+const pw_4, times 8 dw 4
+const pw_8, times 8 dw 8
+const pw_16, times 8 dw 16
+const pw_32, times 8 dw 32
+const pw_64, times 8 dw 64
+const pw_32_0, times 4 dw 32,
+ times 4 dw 0
+const pw_8000, times 8 dw 0x8000
+const pw_3fff, times 8 dw 0x3fff
+
+const pd_1, times 4 dd 1
+const pd_128, times 4 dd 128
+const pw_00ff, times 8 dw 0x00ff
+const pw_ff00, times 8 dw 0xff00
+
+const pb_reverse, db 7, 6, 5, 4, 3, 2, 1, 0
+const sw_64, dd 64
|
[-]
[+]
|
Changed |
x264-snapshot-20100517-2245.tar.bz2/common/x86/cpu-a.asm
^
|
@@ -29,9 +29,9 @@
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
-; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
+; int cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
;-----------------------------------------------------------------------------
-cglobal x264_cpu_cpuid, 5,7
+cglobal cpu_cpuid, 5,7
push rbx
mov r11, r1
mov r10, r2
@@ -49,10 +49,10 @@
%else
;-----------------------------------------------------------------------------
-; int x264_cpu_cpuid_test( void )
+; int cpu_cpuid_test( void )
; return 0 if unsupported
;-----------------------------------------------------------------------------
-cglobal x264_cpu_cpuid_test
+cglobal cpu_cpuid_test
pushfd
push ebx
push ebp
@@ -75,9 +75,9 @@
ret
;-----------------------------------------------------------------------------
-; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
+; int cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
;-----------------------------------------------------------------------------
-cglobal x264_cpu_cpuid, 0,6
+cglobal cpu_cpuid, 0,6
mov eax, r0m
cpuid
mov esi, r1m
@@ -91,9 +91,9 @@
RET
;-----------------------------------------------------------------------------
-; void x264_stack_align( void (*func)(void*), void *arg );
+; void stack_align( void (*func)(void*), void *arg );
;-----------------------------------------------------------------------------
-cglobal x264_stack_align
+cglobal stack_align
push ebp
mov ebp, esp
sub esp, 8
@@ -110,16 +110,23 @@
%endif
;-----------------------------------------------------------------------------
-; void x264_emms( void )
+; void cpu_emms( void )
;-----------------------------------------------------------------------------
-cglobal x264_emms
+cglobal cpu_emms
emms
ret
;-----------------------------------------------------------------------------
-; void x264_cpu_mask_misalign_sse(void)
+; void cpu_sfence( void )
;-----------------------------------------------------------------------------
-cglobal x264_cpu_mask_misalign_sse
+cglobal cpu_sfence
+ sfence
+ ret
+
+;-----------------------------------------------------------------------------
+; void cpu_mask_misalign_sse( void )
+;-----------------------------------------------------------------------------
+cglobal cpu_mask_misalign_sse
sub rsp, 4
stmxcsr [rsp]
or dword [rsp], 1<<17
|
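The new cpu_sfence entry point takes over the store-fence helper that the mc-a2.asm hunk further below removes (x264_sfence): it issues sfence so that the non-temporal stores used elsewhere (movntq/movntps) become globally visible before the caller continues. A hedged C equivalent using the SSE intrinsic, for illustration only:

    #include <xmmintrin.h>

    /* Same effect as the asm helper: order earlier (non-temporal) stores
     * ahead of anything issued after the fence. */
    static inline void cpu_sfence_sketch( void )
    {
        _mm_sfence();
    }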
[-]
[+]
|
Changed |
x264-snapshot-20100517-2245.tar.bz2/common/x86/dct-32.asm
^
|
@@ -27,13 +27,11 @@
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA
-
-pw_32: times 8 dw 32
-hsub_mul: times 8 db 1, -1
-
SECTION .text
+cextern pw_32
+cextern hsub_mul
+
; in: m0..m7
; out: 0,4,6 in mem, rest in regs
%macro DCT8_1D 9
@@ -188,10 +186,10 @@
%endmacro
;-----------------------------------------------------------------------------
-; void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
+; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
-cglobal x264_sub8x8_dct8_mmx, 3,3
-global x264_sub8x8_dct8_mmx.skip_prologue
+cglobal sub8x8_dct8_mmx, 3,3
+global sub8x8_dct8_mmx.skip_prologue
.skip_prologue:
INIT_MMX
call load_diff_4x8_mmx
@@ -254,10 +252,10 @@
%endmacro
;-----------------------------------------------------------------------------
-; void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
+; void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct8_mmx, 2,2
-global x264_add8x8_idct8_mmx.skip_prologue
+cglobal add8x8_idct8_mmx, 2,2
+global add8x8_idct8_mmx.skip_prologue
.skip_prologue:
INIT_MMX
add word [r1], 32
@@ -344,9 +342,9 @@
INIT_XMM
%macro DCT_SUB8 1
-cglobal x264_sub8x8_dct_%1, 3,3
+cglobal sub8x8_dct_%1, 3,3
add r2, 4*FDEC_STRIDE
-global x264_sub8x8_dct_%1.skip_prologue
+global sub8x8_dct_%1.skip_prologue
.skip_prologue:
%ifnidn %1, sse2
mova m7, [hsub_mul]
@@ -375,11 +373,11 @@
ret
;-----------------------------------------------------------------------------
-; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
+; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
-cglobal x264_sub8x8_dct8_%1, 3,3
+cglobal sub8x8_dct8_%1, 3,3
add r2, 4*FDEC_STRIDE
-global x264_sub8x8_dct8_%1.skip_prologue
+global sub8x8_dct8_%1.skip_prologue
.skip_prologue:
%ifidn %1, sse2
LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2-4*FDEC_STRIDE]
@@ -419,11 +417,11 @@
DCT_SUB8 ssse3
;-----------------------------------------------------------------------------
-; void x264_add8x8_idct_sse2( uint8_t *pix, int16_t dct[4][4][4] )
+; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct_sse2, 2,2
+cglobal add8x8_idct_sse2, 2,2
add r0, 4*FDEC_STRIDE
-global x264_add8x8_idct_sse2.skip_prologue
+global add8x8_idct_sse2.skip_prologue
.skip_prologue:
UNSPILL_SHUFFLE r1, 0,2,1,3, 0,1,2,3
SBUTTERFLY qdq, 0, 1, 4
@@ -456,11 +454,11 @@
ret
;-----------------------------------------------------------------------------
-; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
+; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct8_sse2, 2,2
+cglobal add8x8_idct8_sse2, 2,2
add r0, 4*FDEC_STRIDE
-global x264_add8x8_idct8_sse2.skip_prologue
+global add8x8_idct8_sse2.skip_prologue
.skip_prologue:
UNSPILL r1, 1,2,3,5,6,7
IDCT8_1D 0,1,2,3,4,5,6,7,r1
|
[-]
[+]
|
Changed |
x264-snapshot-20100517-2245.tar.bz2/common/x86/dct-64.asm
^
|
@@ -26,11 +26,10 @@
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA
-pw_32: times 8 dw 32
-hsub_mul: times 8 db 1, -1
-
SECTION .text
+
+cextern pw_32
+cextern hsub_mul
INIT_XMM
%macro DCT8_1D 10
@@ -140,7 +139,7 @@
%endmacro
%macro DCT_SUB8 1
-cglobal x264_sub8x8_dct_%1, 3,3,11
+cglobal sub8x8_dct_%1, 3,3,11
add r2, 4*FDEC_STRIDE
%ifnidn %1, sse2
mova m7, [hsub_mul]
@@ -149,7 +148,7 @@
call .skip_prologue
RET
%endif
-global x264_sub8x8_dct_%1.skip_prologue
+global sub8x8_dct_%1.skip_prologue
.skip_prologue:
SWAP 7, 9
LOAD_DIFF8x4 0, 1, 2, 3, 8, 9, r1, r2-4*FDEC_STRIDE
@@ -165,9 +164,9 @@
ret
;-----------------------------------------------------------------------------
-; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
+; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
-cglobal x264_sub8x8_dct8_%1, 3,3,11
+cglobal sub8x8_dct8_%1, 3,3,11
add r2, 4*FDEC_STRIDE
%ifnidn %1, sse2
mova m7, [hsub_mul]
@@ -176,7 +175,7 @@
call .skip_prologue
RET
%endif
-global x264_sub8x8_dct8_%1.skip_prologue
+global sub8x8_dct8_%1.skip_prologue
.skip_prologue:
SWAP 7, 10
LOAD_DIFF8x4 0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE
@@ -205,16 +204,16 @@
DCT_SUB8 ssse3
;-----------------------------------------------------------------------------
-; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
+; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct8_sse2, 2,2,11
+cglobal add8x8_idct8_sse2, 2,2,11
add r0, 4*FDEC_STRIDE
pxor m7, m7
%ifdef WIN64
call .skip_prologue
RET
%endif
-global x264_add8x8_idct8_sse2.skip_prologue
+global add8x8_idct8_sse2.skip_prologue
.skip_prologue:
SWAP 7, 9
movdqa m0, [r1+0x00]
@@ -237,16 +236,16 @@
ret
;-----------------------------------------------------------------------------
-; void x264_add8x8_idct_sse2( uint8_t *pix, int16_t dct[4][4][4] )
+; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct_sse2, 2,2,11
+cglobal add8x8_idct_sse2, 2,2,11
add r0, 4*FDEC_STRIDE
pxor m7, m7
%ifdef WIN64
call .skip_prologue
RET
%endif
-global x264_add8x8_idct_sse2.skip_prologue
+global add8x8_idct_sse2.skip_prologue
.skip_prologue:
SWAP 7, 9
mova m0, [r1+ 0]
|
[-]
[+]
|
Changed |
x264-snapshot-20100517-2245.tar.bz2/common/x86/dct-a.asm
^
|
@@ -35,12 +35,6 @@
%endmacro
SECTION_RODATA
-pw_32_0: times 4 dw 32
- times 4 dw 0
-pw_32: times 8 dw 32
-pw_8000: times 8 dw 0x8000
-hsub_mul: times 8 db 1, -1
-
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1
@@ -48,11 +42,16 @@
pb_scan4frameb: SHUFFLE_16BIT 0,4,1,2,5,6,3,7
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
-pb_1: times 16 db 1
-pw_1: times 8 dw 1
SECTION .text
+cextern pw_32_0
+cextern pw_32
+cextern pw_8000
+cextern hsub_mul
+cextern pb_1
+cextern pw_1
+
%macro WALSH4_1D 5
SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
@@ -73,9 +72,9 @@
INIT_MMX
;-----------------------------------------------------------------------------
-; void x264_dct4x4dc_mmx( int16_t d[4][4] )
+; void dct4x4dc( int16_t d[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_dct4x4dc_mmx, 1,1
+cglobal dct4x4dc_mmx, 1,1
movq m3, [r0+24]
movq m2, [r0+16]
movq m1, [r0+ 8]
@@ -95,9 +94,9 @@
RET
;-----------------------------------------------------------------------------
-; void x264_idct4x4dc_mmx( int16_t d[4][4] )
+; void idct4x4dc( int16_t d[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_idct4x4dc_mmx, 1,1
+cglobal idct4x4dc_mmx, 1,1
movq m3, [r0+24]
movq m2, [r0+16]
movq m1, [r0+ 8]
@@ -113,9 +112,9 @@
%macro SUB_DCT4 1
;-----------------------------------------------------------------------------
-; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
+; void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
-cglobal x264_sub4x4_dct_%1, 3,3
+cglobal sub4x4_dct_%1, 3,3
%ifidn %1, mmx
.skip_prologue:
LOAD_DIFF m0, m4, m5, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
@@ -140,9 +139,9 @@
SUB_DCT4 ssse3
;-----------------------------------------------------------------------------
-; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
+; void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_add4x4_idct_mmx, 2,2
+cglobal add4x4_idct_mmx, 2,2
pxor m7, m7
.skip_prologue:
movq m1, [r1+ 8]
@@ -160,7 +159,7 @@
RET
INIT_XMM
-cglobal x264_add4x4_idct_sse4, 2,2,6
+cglobal add4x4_idct_sse4, 2,2,6
mova m0, [r1+0x00] ; row1/row0
mova m2, [r1+0x10] ; row3/row2
mova m1, m0 ; row1/row0
@@ -213,7 +212,7 @@
INIT_MMX
;-----------------------------------------------------------------------------
-; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
+; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 6
cglobal %1, 3,3,11
@@ -249,7 +248,7 @@
%endmacro
;-----------------------------------------------------------------------------
-; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
+; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6-7
cglobal %1, 2,2,11
@@ -280,33 +279,33 @@
%endmacro
%ifndef ARCH_X86_64
-SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
-ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
-SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx.skip_prologue, 32, 8, 4, 4
-ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
-
-cextern x264_sub8x8_dct8_mmx.skip_prologue
-cextern x264_add8x8_idct8_mmx.skip_prologue
-SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0
-ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
+SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
+ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
+SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx.skip_prologue, 32, 8, 4, 4
+ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
+
+cextern sub8x8_dct8_mmx.skip_prologue
+cextern add8x8_idct8_mmx.skip_prologue
+SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0
+ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
%endif
INIT_XMM
-cextern x264_sub8x8_dct_sse2.skip_prologue
-cextern x264_sub8x8_dct_ssse3.skip_prologue
-SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2.skip_prologue, 128, 8, 0, 0
-SUB_NxN_DCT x264_sub16x16_dct_ssse3, x264_sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0
-cextern x264_add8x8_idct_sse2.skip_prologue
-ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2.skip_prologue, 2*64, 8, 0, 0
-
-cextern x264_sub8x8_dct8_sse2.skip_prologue
-cextern x264_add8x8_idct8_sse2.skip_prologue
-SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2.skip_prologue, 128, 8, 0, 0
-ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0
+cextern sub8x8_dct_sse2.skip_prologue
+cextern sub8x8_dct_ssse3.skip_prologue
+SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2.skip_prologue, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0
+cextern add8x8_idct_sse2.skip_prologue
+ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2.skip_prologue, 2*64, 8, 0, 0
+
+cextern sub8x8_dct8_sse2.skip_prologue
+cextern add8x8_idct8_sse2.skip_prologue
+SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2.skip_prologue, 128, 8, 0, 0
+ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0
-cextern x264_sub8x8_dct8_ssse3.skip_prologue
-SUB_NxN_DCT x264_sub16x16_dct8_ssse3, x264_sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0
+cextern sub8x8_dct8_ssse3.skip_prologue
+SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0
;-----------------------------------------------------------------------------
@@ -331,7 +330,7 @@
movq [%3+FDEC_STRIDE*3], %1
%endmacro
-cglobal x264_add8x8_idct_dc_mmx, 2,2
+cglobal add8x8_idct_dc_mmx, 2,2
movq mm0, [r1]
pxor mm1, mm1
add r0, FDEC_STRIDE*4
@@ -350,7 +349,7 @@
ADD_DC mm2, mm3, r0
RET
-cglobal x264_add8x8_idct_dc_ssse3, 2,2
+cglobal add8x8_idct_dc_ssse3, 2,2
movq xmm0, [r1]
pxor xmm1, xmm1
add r0, FDEC_STRIDE*4
@@ -388,7 +387,7 @@
movhps [r0+FDEC_STRIDE* 3], xmm5
RET
-cglobal x264_add16x16_idct_dc_mmx, 2,3
+cglobal add16x16_idct_dc_mmx, 2,3
mov r2, 4
.loop:
movq mm0, [r1]
@@ -431,7 +430,7 @@
movdqa [r0+%1+FDEC_STRIDE*3], xmm7
%endmacro
-cglobal x264_add16x16_idct_dc_sse2, 2,2,8
+cglobal add16x16_idct_dc_sse2, 2,2,8
call .loop
add r0, FDEC_STRIDE*4
%ifdef WIN64
@@ -465,7 +464,7 @@
IDCT_DC_STORE 0, xmm2, xmm3
ret
-cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
+cglobal add16x16_idct_dc_ssse3, 2,2,8
call .loop
add r0, FDEC_STRIDE*4
%ifdef WIN64
@@ -531,7 +530,7 @@
%endmacro
INIT_MMX
-cglobal x264_sub8x8_dct_dc_mmxext, 3,3
+cglobal sub8x8_dct_dc_mmxext, 3,3
DCTDC_2ROW_MMX m0, m4, 0
DCTDC_2ROW_MMX m5, m6, 2
paddw m0, m5
@@ -567,7 +566,7 @@
%endif
%endmacro
-cglobal x264_sub8x8_dct_dc_sse2, 3,3,8
+cglobal sub8x8_dct_dc_sse2, 3,3,8
pxor m7, m7
DCTDC_2ROW_SSE2 0, 0, m4
DCTDC_2ROW_SSE2 2, 1, m4
@@ -586,10 +585,10 @@
RET
;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
+; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
%macro SCAN_8x8 1
-cglobal x264_zigzag_scan_8x8_frame_%1, 2,2,8
+cglobal zigzag_scan_8x8_frame_%1, 2,2,8
movdqa xmm0, [r1]
movdqa xmm1, [r1+16]
movdq2q mm0, xmm0
@@ -703,9 +702,9 @@
SCAN_8x8 ssse3
;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[8][8] )
+; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
-cglobal x264_zigzag_scan_8x8_frame_mmxext, 2,2
+cglobal zigzag_scan_8x8_frame_mmxext, 2,2
movq mm0, [r1]
movq mm1, [r1+2*8]
movq mm2, [r1+2*14]
@@ -798,9 +797,9 @@
RET
;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_4x4_frame_mmx( int16_t level[16], int16_t dct[4][4] )
+; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_zigzag_scan_4x4_frame_mmx, 2,2
+cglobal zigzag_scan_4x4_frame_mmx, 2,2
movq mm0, [r1]
movq mm1, [r1+8]
movq mm2, [r1+16]
@@ -828,9 +827,9 @@
RET
;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[4][4] )
+; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_zigzag_scan_4x4_frame_ssse3, 2,2
+cglobal zigzag_scan_4x4_frame_ssse3, 2,2
movdqa xmm1, [r1+16]
movdqa xmm0, [r1]
pshufb xmm1, [pb_scan4frameb]
@@ -845,10 +844,10 @@
RET
;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] )
+; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
-cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
+cglobal zigzag_scan_4x4_field_mmxext, 2,3
pshufw mm0, [r1+4], 0xd2
movq mm1, [r1+16]
movq mm2, [r1+24]
@@ -862,7 +861,7 @@
RET
;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_8x8_field_mmxext( int16_t level[64], int16_t dct[8][8] )
+; void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
; Output order:
@@ -875,7 +874,7 @@
; 45 46 47 51 56 57 52 53
; 54 55 58 59 60 61 62 63
-cglobal x264_zigzag_scan_8x8_field_mmxext, 2,3
+cglobal zigzag_scan_8x8_field_mmxext, 2,3
movq mm0, [r1+2*0] ; 03 02 01 00
movq mm1, [r1+2*4] ; 07 06 05 04
movq mm2, [r1+2*8] ; 11 10 09 08
@@ -954,13 +953,13 @@
RET
;-----------------------------------------------------------------------------
-; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
+; void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
;-----------------------------------------------------------------------------
%macro ZIGZAG_SUB_4x4 2
%ifidn %1, ac
-cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 4,4,8
+cglobal zigzag_sub_4x4%1_%2_ssse3, 4,4,8
%else
-cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 3,3,8
+cglobal zigzag_sub_4x4%1_%2_ssse3, 3,3,8
%endif
movd xmm0, [r1+0*FENC_STRIDE]
movd xmm1, [r1+1*FENC_STRIDE]
@@ -1020,7 +1019,7 @@
ZIGZAG_SUB_4x4 ac, field
;-----------------------------------------------------------------------------
-; void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz )
+; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
;-----------------------------------------------------------------------------
%macro INTERLEAVE 1
@@ -1047,7 +1046,7 @@
%endmacro
INIT_MMX
-cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 3,3
+cglobal zigzag_interleave_8x8_cavlc_mmx, 3,3
INTERLEAVE 0
INTERLEAVE 8
INTERLEAVE 16
@@ -1095,7 +1094,7 @@
%endmacro
INIT_XMM
-cglobal x264_zigzag_interleave_8x8_cavlc_sse2, 3,3,8
+cglobal zigzag_interleave_8x8_cavlc_sse2, 3,3,8
INTERLEAVE_XMM 0
INTERLEAVE_XMM 16
packsswb m2, m3
|
[-]
[+]
|
Changed |
x264-snapshot-20100517-2245.tar.bz2/common/x86/deblock-a.asm
^
|
@@ -22,14 +22,13 @@
%include "x86inc.asm"
-SECTION_RODATA
-pb_00: times 16 db 0x00
-pb_01: times 16 db 0x01
-pb_03: times 16 db 0x03
-pb_a1: times 16 db 0xa1
-
SECTION .text
+cextern pb_0
+cextern pb_1
+cextern pb_3
+cextern pb_a1
+
; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
[base], [base+stride], [base+stride*2], [base3], \
@@ -234,11 +233,11 @@
%macro DEBLOCK_P0_Q0 0
mova m5, m1
pxor m5, m2 ; p0^q0
- pand m5, [pb_01] ; (p0^q0)&1
+ pand m5, [pb_1] ; (p0^q0)&1
pcmpeqb m4, m4
pxor m3, m4
pavgb m3, m0 ; (p1 - q1 + 256)>>1
- pavgb m3, [pb_03] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
+ pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
pxor m4, m1
pavgb m4, m2 ; (q0 - p0 + 256)>>1
pavgb m3, m5
@@ -263,7 +262,7 @@
pavgb %6, m2
pavgb %2, %6 ; avg(p2,avg(p0,q0))
pxor %6, %3
- pand %6, [pb_01] ; (p2^avg(p0,q0))&1
+ pand %6, [pb_1] ; (p2^avg(p0,q0))&1
psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
mova %6, %1
psubusb %6, %5
@@ -275,10 +274,10 @@
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
-; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_XMM
-cglobal x264_deblock_v_luma_sse2, 5,5,10
+cglobal deblock_v_luma_sse2, 5,5,10
movd m8, [r4] ; tc0
lea r4, [r1*3]
dec r2d ; alpha-1
@@ -321,10 +320,10 @@
RET
;-----------------------------------------------------------------------------
-; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
-cglobal x264_deblock_h_luma_sse2, 5,7
+cglobal deblock_h_luma_sse2, 5,7
movsxd r10, r1d
lea r11, [r10+r10*2]
lea r6, [r0-4]
@@ -345,13 +344,13 @@
; vertical filter
; alpha, beta, tc0 are still in r2d, r3d, r4
- ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
+ ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them
lea r0, [pix_tmp+0x30]
mov r1d, 0x10
%ifdef WIN64
mov [rsp+0x20], r4
%endif
- call x264_deblock_v_luma_sse2
+ call deblock_v_luma_sse2
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
add r6, 2
@@ -383,9 +382,9 @@
%macro DEBLOCK_LUMA 3
;-----------------------------------------------------------------------------
-; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_%1, 5,5
+cglobal deblock_%2_luma_%1, 5,5
lea r4, [r1*3]
dec r2 ; alpha-1
neg r4
@@ -436,10 +435,10 @@
RET
;-----------------------------------------------------------------------------
-; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
-cglobal x264_deblock_h_luma_%1, 0,5
+cglobal deblock_h_luma_%1, 0,5
mov r0, r0mp
mov r3, r1m
lea r4, [r3*3]
@@ -462,11 +461,11 @@
PUSH dword r2m
PUSH dword 16
PUSH dword r0
- call x264_deblock_%2_luma_%1
+ call deblock_%2_luma_%1
%ifidn %2, v8
add dword [esp ], 8 ; pix_tmp+0x38
add dword [esp+16], 2 ; tc0+2
- call x264_deblock_%2_luma_%1
+ call deblock_%2_luma_%1
%endif
ADD esp, 20
@@ -517,9 +516,9 @@
mova t3, t2
mova t4, t2
psrlw t2, 1
- pavgb t2, mpb_00
+ pavgb t2, mpb_0
pxor t2, t0
- pand t2, mpb_01
+ pand t2, mpb_1
psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
mova t1, p2
@@ -528,21 +527,21 @@
psubb t2, q1
paddb t3, t3
psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
- pand t2, mpb_01
+ pand t2, mpb_1
psubb t1, t2
pavgb t1, p1
pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
psrlw t3, 2
- pavgb t3, mpb_00
+ pavgb t3, mpb_0
pxor t3, t1
- pand t3, mpb_01
+ pand t3, mpb_1
psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
mova t3, p0
mova t2, p0
pxor t3, q1
pavgb t2, q1
- pand t3, mpb_01
+ pand t3, mpb_1
psubb t2, t3
pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4
@@ -562,9 +561,9 @@
paddb t2, t2
paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
psrlw t2, 2
- pavgb t2, mpb_00
+ pavgb t2, mpb_0
pxor t2, t1
- pand t2, mpb_01
+ pand t2, mpb_1
psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8
pxor t0, p1
@@ -603,8 +602,8 @@
%define mask0 m12
%define mask1p m13
%define mask1q [rsp-24]
- %define mpb_00 m14
- %define mpb_01 m15
+ %define mpb_0 m14
+ %define mpb_1 m15
%else
%define spill(x) [esp+16*x+((stack_offset+4)&15)]
%define p2 [r4+r1]
@@ -614,14 +613,14 @@
%define mask0 spill(2)
%define mask1p spill(3)
%define mask1q spill(4)
- %define mpb_00 [pb_00]
- %define mpb_01 [pb_01]
+ %define mpb_0 [pb_0]
+ %define mpb_1 [pb_1]
%endif
;-----------------------------------------------------------------------------
-; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
+cglobal deblock_%2_luma_intra_%1, 4,6,16
%ifndef ARCH_X86_64
sub esp, 0x60
%endif
@@ -638,12 +637,12 @@
mova q0, [r0]
mova q1, [r0+r1]
%ifdef ARCH_X86_64
- pxor mpb_00, mpb_00
- mova mpb_01, [pb_01]
+ pxor mpb_0, mpb_0
+ mova mpb_1, [pb_1]
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
SWAP 7, 12 ; m12=mask0
- pavgb t5, mpb_00
- pavgb t5, mpb_01 ; alpha/4+1
+ pavgb t5, mpb_0
+ pavgb t5, mpb_1 ; alpha/4+1
movdqa p2, [r4+r1]
movdqa q2, [r0+2*r1]
DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
@@ -658,8 +657,8 @@
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
mova m4, t5
mova mask0, m7
- pavgb m4, [pb_00]
- pavgb m4, [pb_01] ; alpha/4+1
+ pavgb m4, [pb_0]
+ pavgb m4, [pb_1] ; alpha/4+1
DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
pand m6, mask0
DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
@@ -681,9 +680,9 @@
INIT_MMX
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
-; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_luma_intra_%1, 4,7
+cglobal deblock_h_luma_intra_%1, 4,7
movsxd r10, r1d
lea r11, [r10*3]
lea r6, [r0-4]
@@ -699,7 +698,7 @@
lea r0, [pix_tmp+0x40]
mov r1, 0x10
- call x264_deblock_v_luma_intra_%1
+ call deblock_v_luma_intra_%1
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
lea r5, [r6+r11]
@@ -712,7 +711,7 @@
add rsp, 0x88
RET
%else
-cglobal x264_deblock_h_luma_intra_%1, 2,4
+cglobal deblock_h_luma_intra_%1, 2,4
lea r3, [r1*3]
sub r0, 4
lea r2, [r0+r3]
@@ -731,10 +730,10 @@
PUSH dword r2m
PUSH dword 16
PUSH r0
- call x264_deblock_%2_luma_intra_%1
+ call deblock_%2_luma_intra_%1
%ifidn %2, v8
add dword [rsp], 8 ; pix_tmp+8
- call x264_deblock_%2_luma_intra_%1
+ call deblock_%2_luma_intra_%1
%endif
ADD esp, 16
@@ -785,9 +784,9 @@
%define t6 r6
;-----------------------------------------------------------------------------
-; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_v_chroma_mmxext, 5,6
+cglobal deblock_v_chroma_mmxext, 5,6
CHROMA_V_START
movq m0, [t5]
movq m1, [t5+r1]
@@ -799,9 +798,9 @@
RET
;-----------------------------------------------------------------------------
-; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_chroma_mmxext, 5,7
+cglobal deblock_h_chroma_mmxext, 5,7
%ifdef ARCH_X86_64
%define buf0 [rsp-24]
%define buf1 [rsp-16]
@@ -835,7 +834,7 @@
%macro CHROMA_INTRA_P0 3
movq m4, %1
pxor m4, %3
- pand m4, [pb_01] ; m4 = (p0^q1)&1
+ pand m4, [pb_1] ; m4 = (p0^q1)&1
pavgb %1, %3
psubusb %1, m4
pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
@@ -845,9 +844,9 @@
%define t6 r5
;-----------------------------------------------------------------------------
-; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_v_chroma_intra_mmxext, 4,5
+cglobal deblock_v_chroma_intra_mmxext, 4,5
CHROMA_V_START
movq m0, [t5]
movq m1, [t5+r1]
@@ -859,9 +858,9 @@
RET
;-----------------------------------------------------------------------------
-; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_chroma_intra_mmxext, 4,6
+cglobal deblock_h_chroma_intra_mmxext, 4,6
CHROMA_H_START
TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6)
call chroma_intra_body_mmxext
|
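For reference, the DEBLOCK_P0_Q0 macro earlier in this file builds, out of pavgb averages, the standard-filter p0/q0 update from the H.264 spec (the bS < 4 luma case). A scalar sketch of that formula, with simple clip helpers standing in for x264's clip3/clip_uint8:

    #include <stdint.h>

    static inline int clip3( int v, int lo, int hi )
    {
        return v < lo ? lo : v > hi ? hi : v;
    }

    static inline uint8_t clip_uint8( int v )
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* p1 p0 | q0 q1 are the pixels straddling the edge; tc is the clipping
     * threshold derived from the tc0 table. */
    static void deblock_p0q0_scalar( uint8_t *p0, uint8_t *q0, int p1, int q1, int tc )
    {
        int delta = clip3( (((*q0 - *p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
        *p0 = clip_uint8( *p0 + delta );
        *q0 = clip_uint8( *q0 - delta );
    }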
[-]
[+]
|
Changed |
x264-snapshot-20100517-2245.tar.bz2/common/x86/mc-a.asm
^
|
@@ -29,15 +29,16 @@
SECTION_RODATA 32
ch_shuffle: db 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,0,0
-pw_1: times 8 dw 1
-pw_4: times 8 dw 4
-pw_8: times 8 dw 8
-pw_32: times 8 dw 32
-pw_64: times 8 dw 64
-sw_64: dd 64
SECTION .text
+cextern pw_1
+cextern pw_4
+cextern pw_8
+cextern pw_32
+cextern pw_64
+cextern sw_64
+
;=============================================================================
; implicit weighted biprediction
;=============================================================================
@@ -129,10 +130,10 @@
%endmacro
;-----------------------------------------------------------------------------
-; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight )
+; int pixel_avg_weight_w16( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight )
;-----------------------------------------------------------------------------
%macro AVG_WEIGHT 2-3 0
-cglobal x264_pixel_avg_weight_w%2_%1
+cglobal pixel_avg_weight_w%2_%1
BIWEIGHT_START
AVG_START %3
%if %2==8 && mmsize==16
@@ -165,7 +166,7 @@
AVG_WEIGHT mmxext, 8
AVG_WEIGHT mmxext, 16
INIT_XMM
-%define x264_pixel_avg_weight_w4_sse2 x264_pixel_avg_weight_w4_mmxext
+%define pixel_avg_weight_w4_sse2 pixel_avg_weight_w4_mmxext
AVG_WEIGHT sse2, 8, 7
AVG_WEIGHT sse2, 16, 7
%define BIWEIGHT BIWEIGHT_SSSE3
@@ -293,8 +294,9 @@
%endrep
%endmacro
-
-;void x264_mc_weight_wX( uint8_t *dst, int i_dst_stride, uint8_t *src,int i_src_stride, x264_weight_t *weight,int h)
+;-----------------------------------------------------------------------------
+;void mc_weight_wX( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, weight_t *weight, int h )
+;-----------------------------------------------------------------------------
%ifdef ARCH_X86_64
%define NUMREGS 6
@@ -307,7 +309,7 @@
%endif
%macro WEIGHTER 2
- cglobal x264_mc_weight_w%1_%2, NUMREGS, NUMREGS, 7
+ cglobal mc_weight_w%1_%2, NUMREGS, NUMREGS, 7
WEIGHT_START %1
LOAD_HEIGHT
.loop:
@@ -363,9 +365,11 @@
%endrep
%endmacro
-;void x264_mc_offset_wX( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, x264_weight_t *w, int h )
+;-----------------------------------------------------------------------------
+;void mc_offset_wX( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, weight_t *w, int h )
+;-----------------------------------------------------------------------------
%macro OFFSET 3
- cglobal x264_mc_offset%3_w%1_%2, NUMREGS, NUMREGS
+ cglobal mc_offset%3_w%1_%2, NUMREGS, NUMREGS
mova m2, [r4]
LOAD_HEIGHT
.loop:
@@ -402,25 +406,25 @@
;=============================================================================
;-----------------------------------------------------------------------------
-; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride,
-; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, int weight );
+; void pixel_avg_4x4( uint8_t *dst, int dst_stride,
+; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, int weight );
;-----------------------------------------------------------------------------
%macro AVGH 3
-cglobal x264_pixel_avg_%1x%2_%3
+cglobal pixel_avg_%1x%2_%3
mov eax, %2
cmp dword r6m, 32
- jne x264_pixel_avg_weight_w%1_%3
+ jne pixel_avg_weight_w%1_%3
%if mmsize == 16 && %1 == 16
test dword r4m, 15
- jz x264_pixel_avg_w%1_sse2
+ jz pixel_avg_w%1_sse2
%endif
- jmp x264_pixel_avg_w%1_mmxext
+ jmp pixel_avg_w%1_mmxext
%endmacro
;-----------------------------------------------------------------------------
-; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride,
-; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride,
-; int height, int weight );
+; void pixel_avg_w4( uint8_t *dst, int dst_stride,
+; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride,
+; int height, int weight );
;-----------------------------------------------------------------------------
%macro AVG_END 0
@@ -445,17 +449,17 @@
%endmacro
INIT_MMX
-AVG_FUNC x264_pixel_avg_w4_mmxext, movd, movd
+AVG_FUNC pixel_avg_w4_mmxext, movd, movd
AVGH 4, 8, mmxext
AVGH 4, 4, mmxext
AVGH 4, 2, mmxext
-AVG_FUNC x264_pixel_avg_w8_mmxext, movq, movq
+AVG_FUNC pixel_avg_w8_mmxext, movq, movq
AVGH 8, 16, mmxext
AVGH 8, 8, mmxext
AVGH 8, 4, mmxext
-cglobal x264_pixel_avg_w16_mmxext
+cglobal pixel_avg_w16_mmxext
AVG_START
movq mm0, [t2 ]
movq mm1, [t2+8]
@@ -475,7 +479,7 @@
AVGH 16, 8, mmxext
INIT_XMM
-AVG_FUNC x264_pixel_avg_w16_sse2, movdqu, movdqa
+AVG_FUNC pixel_avg_w16_sse2, movdqu, movdqa
AVGH 16, 16, sse2
AVGH 16, 8, sse2
AVGH 8, 16, sse2
@@ -498,12 +502,12 @@
;=============================================================================
;-----------------------------------------------------------------------------
-; void x264_pixel_avg2_w4_mmxext( uint8_t *dst, int dst_stride,
-; uint8_t *src1, int src_stride,
-; uint8_t *src2, int height );
+; void pixel_avg2_w4( uint8_t *dst, int dst_stride,
+; uint8_t *src1, int src_stride,
+; uint8_t *src2, int height );
;-----------------------------------------------------------------------------
%macro AVG2_W8 2
-cglobal x264_pixel_avg2_w%1_mmxext, 6,7
+cglobal pixel_avg2_w%1_mmxext, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
@@ -524,7 +528,7 @@
AVG2_W8 8, movq
%macro AVG2_W16 2
-cglobal x264_pixel_avg2_w%1_mmxext, 6,7
+cglobal pixel_avg2_w%1_mmxext, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
@@ -550,7 +554,7 @@
AVG2_W16 12, movd
AVG2_W16 16, movq
-cglobal x264_pixel_avg2_w20_mmxext, 6,7
+cglobal pixel_avg2_w20_mmxext, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
@@ -578,7 +582,7 @@
jg .height_loop
REP_RET
-cglobal x264_pixel_avg2_w16_sse2, 6,7
+cglobal pixel_avg2_w16_sse2, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
@@ -597,7 +601,7 @@
REP_RET
%macro AVG2_W20 1
-cglobal x264_pixel_avg2_w20_%1, 6,7
+cglobal pixel_avg2_w20_%1, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
@@ -647,16 +651,16 @@
%endmacro
%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
-cglobal x264_pixel_avg2_w%1_cache%2_%3
+cglobal pixel_avg2_w%1_cache%2_%3
mov eax, r2m
and eax, 0x1f|(%2>>1)
cmp eax, (32-%1)|(%2>>1)
- jle x264_pixel_avg2_w%1_%3
+ jle pixel_avg2_w%1_%3
;w12 isn't needed because w16 is just as fast if there's no cacheline split
%if %1 == 12
- jmp x264_pixel_avg2_w16_cache_mmxext
+ jmp pixel_avg2_w16_cache_mmxext
%else
- jmp x264_pixel_avg2_w%1_cache_mmxext
+ jmp pixel_avg2_w%1_cache_mmxext
%endif
%endmacro
@@ -687,7 +691,7 @@
%2 [r0+%1], mm0
%endmacro
-x264_pixel_avg2_w8_cache_mmxext:
+pixel_avg2_w8_cache_mmxext:
AVG_CACHELINE_START
AVG_CACHELINE_LOOP 0, movq
add r2, r3
@@ -696,7 +700,7 @@
jg .height_loop
REP_RET
-x264_pixel_avg2_w16_cache_mmxext:
+pixel_avg2_w16_cache_mmxext:
AVG_CACHELINE_START
AVG_CACHELINE_LOOP 0, movq
AVG_CACHELINE_LOOP 8, movq
@@ -706,7 +710,7 @@
jg .height_loop
REP_RET
-x264_pixel_avg2_w20_cache_mmxext:
+pixel_avg2_w20_cache_mmxext:
AVG_CACHELINE_START
AVG_CACHELINE_LOOP 0, movq
AVG_CACHELINE_LOOP 8, movq
@@ -754,11 +758,11 @@
rep ret
%endmacro
-cglobal x264_pixel_avg2_w16_cache64_ssse3
+cglobal pixel_avg2_w16_cache64_ssse3
mov eax, r2m
and eax, 0x3f
cmp eax, 0x30
- jle x264_pixel_avg2_w16_sse2
+ jle pixel_avg2_w16_sse2
PROLOGUE 6,7
lea r6, [r4+r2]
and r4, ~0xf
@@ -807,10 +811,10 @@
INIT_MMX
;-----------------------------------------------------------------------------
-; void x264_mc_copy_w4_mmx( uint8_t *dst, int i_dst_stride,
-; uint8_t *src, int i_src_stride, int i_height )
+; void mc_copy_w4( uint8_t *dst, int i_dst_stride,
+; uint8_t *src, int i_src_stride, int i_height )
;-----------------------------------------------------------------------------
-cglobal x264_mc_copy_w4_mmx, 4,6
+cglobal mc_copy_w4_mmx, 4,6
cmp dword r4m, 4
lea r5, [r3*3]
lea r4, [r1*3]
@@ -822,7 +826,7 @@
COPY4 movd, movd, r4, r5
RET
-cglobal x264_mc_copy_w8_mmx, 5,7
+cglobal mc_copy_w8_mmx, 5,7
lea r6, [r3*3]
lea r5, [r1*3]
.height_loop:
@@ -833,7 +837,7 @@
jg .height_loop
REP_RET
-cglobal x264_mc_copy_w16_mmx, 5,7
+cglobal mc_copy_w16_mmx, 5,7
lea r6, [r3*3]
lea r5, [r1*3]
.height_loop:
@@ -873,11 +877,11 @@
REP_RET
%endmacro
-COPY_W16_SSE2 x264_mc_copy_w16_sse2, movdqu
+COPY_W16_SSE2 mc_copy_w16_sse2, movdqu
; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
; but with SSE3 the overhead is zero, so there's no reason not to include it.
-COPY_W16_SSE2 x264_mc_copy_w16_sse3, lddqu
-COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa
+COPY_W16_SSE2 mc_copy_w16_sse3, lddqu
+COPY_W16_SSE2 mc_copy_w16_aligned_sse2, movdqa
@@ -887,11 +891,11 @@
; FIXME assumes 64 byte cachelines
;-----------------------------------------------------------------------------
-; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
-; uint8_t *pix_uv, int stride_uv, int mb_x )
+; void prefetch_fenc( uint8_t *pix_y, int stride_y,
+; uint8_t *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
%ifdef ARCH_X86_64
-cglobal x264_prefetch_fenc_mmxext, 5,5
+cglobal prefetch_fenc_mmxext, 5,5
mov eax, r4d
and eax, 3
imul eax, r1d
@@ -910,7 +914,7 @@
RET
%else
-cglobal x264_prefetch_fenc_mmxext
+cglobal prefetch_fenc_mmxext
mov r2, [esp+20]
mov r1, [esp+8]
mov r0, [esp+4]
@@ -935,9 +939,9 @@
%endif ; ARCH_X86_64
;-----------------------------------------------------------------------------
-; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
+; void prefetch_ref( uint8_t *pix, int stride, int parity )
;-----------------------------------------------------------------------------
-cglobal x264_prefetch_ref_mmxext, 3,3
+cglobal prefetch_ref_mmxext, 3,3
dec r2d
and r2d, r1d
lea r0, [r0+r2*8+64]
@@ -982,16 +986,16 @@
%endmacro
;-----------------------------------------------------------------------------
-; void x264_mc_chroma_mmxext( uint8_t *dst, int dst_stride,
-; uint8_t *src, int src_stride,
-; int dx, int dy,
-; int width, int height )
+; void mc_chroma( uint8_t *dst, int dst_stride,
+; uint8_t *src, int src_stride,
+; int dx, int dy,
+; int width, int height )
;-----------------------------------------------------------------------------
%macro MC_CHROMA 1-2 0
-cglobal x264_mc_chroma_%1
+cglobal mc_chroma_%1
%if mmsize == 16
cmp dword r6m, 4
- jle x264_mc_chroma_mmxext
+ jle mc_chroma_mmxext
%endif
PROLOGUE 0,6,%2
MC_CHROMA_START
@@ -1151,7 +1155,7 @@
%macro MC_CHROMA_SSSE3 2
INIT_MMX
-cglobal x264_mc_chroma_ssse3%1, 0,6,%2
+cglobal mc_chroma_ssse3%1, 0,6,%2
MC_CHROMA_START
and r4d, 7
and r5d, 7
|
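AVG_CACHELINE_CHECK above chooses between the plain and the cacheline-split averaging loops with a single and/cmp pair, OR-ing (cacheline>>1) into both the mask and the comparison constant so the same code serves 32- and 64-byte lines. The condition it is built around is essentially whether a width-byte load at this address stays inside one cacheline; a C sketch of that test (helper name is illustrative):

    #include <stdint.h>

    static inline int crosses_cacheline( const uint8_t *src, int width, int cacheline )
    {
        /* the offset within the line must leave at least 'width' bytes before the boundary */
        return ( (uintptr_t)src & (cacheline - 1) ) > (uintptr_t)( cacheline - width );
    }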
[-]
[+]
|
Changed |
x264-snapshot-20100517-2245.tar.bz2/common/x86/mc-a2.asm
^
|
@@ -33,13 +33,14 @@
filt_mul51: times 8 db -5, 1
hpel_shuf: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
-pw_1: times 8 dw 1
-pw_16: times 8 dw 16
-pw_32: times 8 dw 32
-pd_128: times 4 dd 128
-
SECTION .text
+cextern pw_1
+cextern pw_16
+cextern pw_32
+cextern pd_128
+cextern pw_3fff
+
%macro LOAD_ADD 4
movh %4, %3
movh %1, %2
@@ -121,9 +122,9 @@
%macro HPEL_V 1-2 0
;-----------------------------------------------------------------------------
-; void x264_hpel_filter_v_mmxext( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
+; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_v_%1, 5,6,%2
+cglobal hpel_filter_v_%1, 5,6,%2
%ifdef WIN64
movsxd r4, r4d
%endif
@@ -180,9 +181,9 @@
HPEL_V mmxext
;-----------------------------------------------------------------------------
-; void x264_hpel_filter_c_mmxext( uint8_t *dst, int16_t *buf, int width );
+; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_c_mmxext, 3,3
+cglobal hpel_filter_c_mmxext, 3,3
add r0, r2
lea r1, [r1+r2*2]
neg r2
@@ -209,9 +210,9 @@
REP_RET
;-----------------------------------------------------------------------------
-; void x264_hpel_filter_h_mmxext( uint8_t *dst, uint8_t *src, int width );
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_h_mmxext, 3,3
+cglobal hpel_filter_h_mmxext, 3,3
add r0, r2
add r1, r2
neg r2
@@ -256,9 +257,9 @@
%macro HPEL_C 1
;-----------------------------------------------------------------------------
-; void x264_hpel_filter_c_sse2( uint8_t *dst, int16_t *buf, int width );
+; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_c_%1, 3,3,9
+cglobal hpel_filter_c_%1, 3,3,9
add r0, r2
lea r1, [r1+r2*2]
neg r2
@@ -331,9 +332,9 @@
%endmacro
;-----------------------------------------------------------------------------
-; void x264_hpel_filter_h_sse2( uint8_t *dst, uint8_t *src, int width );
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_h_sse2, 3,3,8
+cglobal hpel_filter_h_sse2, 3,3,8
add r0, r2
add r1, r2
neg r2
@@ -380,9 +381,9 @@
%ifndef ARCH_X86_64
;-----------------------------------------------------------------------------
-; void x264_hpel_filter_h_ssse3( uint8_t *dst, uint8_t *src, int width );
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_h_ssse3, 3,3
+cglobal hpel_filter_h_ssse3, 3,3
add r0, r2
add r1, r2
neg r2
@@ -557,10 +558,10 @@
%macro HPEL 1
;-----------------------------------------------------------------------------
-; void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
-; uint8_t *src, int stride, int width, int height)
+; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
+; uint8_t *src, int stride, int width, int height)
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_%1, 7,7,16
+cglobal hpel_filter_%1, 7,7,16
%ifdef WIN64
movsxd r4, r4d
movsxd r5, r5d
@@ -626,20 +627,16 @@
HPEL ssse3
%endif
-cglobal x264_sfence
- sfence
- ret
-
%undef movntq
%undef movntps
%undef sfence
;-----------------------------------------------------------------------------
-; void x264_plane_copy_core_mmxext( uint8_t *dst, int i_dst,
-; uint8_t *src, int i_src, int w, int h)
+; void plane_copy_core( uint8_t *dst, int i_dst,
+; uint8_t *src, int i_src, int w, int h)
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>w
-cglobal x264_plane_copy_core_mmxext, 6,7
+cglobal plane_copy_core_mmxext, 6,7
movsxdifnidn r1, r1d
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
@@ -698,9 +695,9 @@
; memzero SSE will fail for non-mod128.
;-----------------------------------------------------------------------------
-; void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
+; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
-cglobal x264_memcpy_aligned_mmx, 3,3
+cglobal memcpy_aligned_mmx, 3,3
test r2d, 16
jz .copy32
sub r2d, 16
@@ -722,9 +719,9 @@
REP_RET
;-----------------------------------------------------------------------------
-; void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n );
+; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
-cglobal x264_memcpy_aligned_sse2, 3,3
+cglobal memcpy_aligned_sse2, 3,3
test r2d, 16
jz .copy32
sub r2d, 16
@@ -752,10 +749,10 @@
REP_RET
;-----------------------------------------------------------------------------
-; void *x264_memzero_aligned( void *dst, size_t n );
+; void *memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
%macro MEMZERO 1
-cglobal x264_memzero_aligned_%1, 2,2
+cglobal memzero_aligned_%1, 2,2
add r0, r1
neg r1
pxor m0, m0
@@ -778,9 +775,9 @@
;-----------------------------------------------------------------------------
-; void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride )
+; void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
;-----------------------------------------------------------------------------
-cglobal x264_integral_init4h_sse4, 3,4
+cglobal integral_init4h_sse4, 3,4
lea r3, [r0+r2*2]
add r1, r2
neg r2
@@ -799,7 +796,7 @@
jl .loop
REP_RET
-cglobal x264_integral_init8h_sse4, 3,4
+cglobal integral_init8h_sse4, 3,4
lea r3, [r0+r2*2]
add r1, r2
neg r2
@@ -826,9 +823,9 @@
%macro INTEGRAL_INIT_8V 1
;-----------------------------------------------------------------------------
-; void x264_integral_init8v_mmx( uint16_t *sum8, int stride )
+; void integral_init8v( uint16_t *sum8, int stride )
;-----------------------------------------------------------------------------
-cglobal x264_integral_init8v_%1, 3,3
+cglobal integral_init8v_%1, 3,3
shl r1, 1
add r0, r1
lea r2, [r0+r1*8]
@@ -851,10 +848,10 @@
INTEGRAL_INIT_8V sse2
;-----------------------------------------------------------------------------
-; void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride )
+; void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
;-----------------------------------------------------------------------------
INIT_MMX
-cglobal x264_integral_init4v_mmx, 3,5
+cglobal integral_init4v_mmx, 3,5
shl r2, 1
lea r3, [r0+r2*4]
lea r4, [r0+r2*8]
@@ -876,7 +873,7 @@
REP_RET
INIT_XMM
-cglobal x264_integral_init4v_sse2, 3,5
+cglobal integral_init4v_sse2, 3,5
shl r2, 1
add r0, r2
add r1, r2
@@ -901,7 +898,7 @@
jl .loop
REP_RET
-cglobal x264_integral_init4v_ssse3, 3,5
+cglobal integral_init4v_ssse3, 3,5
shl r2, 1
add r0, r2
add r1, r2
@@ -993,7 +990,7 @@
; int src_stride, int dst_stride, int width, int height )
;-----------------------------------------------------------------------------
%macro FRAME_INIT_LOWRES 1-2 0 ; FIXME
-cglobal x264_frame_init_lowres_core_%1, 6,7,%2
+cglobal frame_init_lowres_core_%1, 6,7,%2
%ifdef WIN64
movsxd r5, r5d
%endif
@@ -1114,7 +1111,7 @@
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
; uint16_t *inter_costs, uint16_t *inv_qscales, int len )
;-----------------------------------------------------------------------------
-cglobal x264_mbtree_propagate_cost_sse2, 6,6
+cglobal mbtree_propagate_cost_sse2, 6,6
shl r5d, 1
lea r0, [r0+r5*2]
add r1, r5
@@ -1132,8 +1129,9 @@
pmaddwd xmm0, xmm2
paddd xmm0, xmm4
psrld xmm0, 8 ; intra*invq>>8
- movq xmm1, [r1+r5] ; prop
movq xmm3, [r3+r5] ; inter
+ movq xmm1, [r1+r5] ; prop
+ pand xmm3, [pw_3fff]
punpcklwd xmm1, xmm5
punpcklwd xmm3, xmm5
paddd xmm0, xmm1 ; prop + (intra*invq>>8)
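
The mbtree_propagate_cost hunk above reorders the prop/inter loads and masks the inter costs with pw_3fff before widening them. A rough scalar sketch of the per-element arithmetic, reconstructed from the asm comments in this hunk (not part of the patch); the final (intra - inter) / intra scaling lies after the lines shown and is an assumption based on the surrounding mbtree code, with rounding omitted:

    for( int i = 0; i < len; i++ )
    {
        int intra  = intra_costs[i];
        int inter  = inter_costs[i] & 0x3FFF;                            /* pand xmm3, [pw_3fff] */
        int amount = propagate_in[i] + ((intra * inv_qscales[i]) >> 8);  /* prop + (intra*invq>>8) */
        dst[i]     = intra ? amount * (intra - inter) / intra : 0;       /* assumed continuation of the loop */
    }
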
Changed |
x264-snapshot-20100517-2245.tar.bz2/common/x86/mc-c.c
@@ -44,11 +44,11 @@
DECL_SUF( x264_pixel_avg_4x2, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
#define MC_WEIGHT(w,type) \
- extern void x264_mc_weight_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int );
+ void x264_mc_weight_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int );
#define MC_WEIGHT_OFFSET(w,type) \
- extern void x264_mc_offsetadd_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
- extern void x264_mc_offsetsub_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
+ void x264_mc_offsetadd_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
+ void x264_mc_offsetsub_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
MC_WEIGHT(w,type)
MC_WEIGHT_OFFSET( 4, mmxext )
@@ -68,51 +68,51 @@
#undef MC_OFFSET
#undef MC_WEIGHT
-extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
-extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
-extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
-extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
-extern void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int );
-extern void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int );
-extern void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
-extern void x264_prefetch_ref_mmxext( uint8_t *, int, int );
-extern void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
+void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int );
+void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
+void x264_prefetch_ref_mmxext( uint8_t *, int, int );
+void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
+ uint8_t *dst, int i_dst_stride,
+ int dx, int dy, int i_width, int i_height );
+void x264_mc_chroma_sse2( uint8_t *src, int i_src_stride,
+ uint8_t *dst, int i_dst_stride,
+ int dx, int dy, int i_width, int i_height );
+void x264_mc_chroma_ssse3( uint8_t *src, int i_src_stride,
+ uint8_t *dst, int i_dst_stride,
+ int dx, int dy, int i_width, int i_height );
+void x264_mc_chroma_ssse3_cache64( uint8_t *src, int i_src_stride,
uint8_t *dst, int i_dst_stride,
int dx, int dy, int i_width, int i_height );
-extern void x264_mc_chroma_sse2( uint8_t *src, int i_src_stride,
- uint8_t *dst, int i_dst_stride,
- int dx, int dy, int i_width, int i_height );
-extern void x264_mc_chroma_ssse3( uint8_t *src, int i_src_stride,
- uint8_t *dst, int i_dst_stride,
- int dx, int dy, int i_width, int i_height );
-extern void x264_mc_chroma_ssse3_cache64( uint8_t *src, int i_src_stride,
- uint8_t *dst, int i_dst_stride,
- int dx, int dy, int i_width, int i_height );
-extern void x264_plane_copy_core_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
-extern void x264_plane_copy_c( uint8_t *, int, uint8_t *, int, int w, int h);
-extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
-extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
-extern void x264_memzero_aligned_mmx( void * dst, int n );
-extern void x264_memzero_aligned_sse2( void * dst, int n );
-extern void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride );
-extern void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, int stride );
-extern void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride );
-extern void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride );
-extern void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
-extern void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
-extern void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
-extern void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
- uint16_t *inter_costs, uint16_t *inv_qscales, int len );
-#define LOWRES(cpu) \
-extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
- int src_stride, int dst_stride, int width, int height );
+void x264_plane_copy_core_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
+void x264_plane_copy_c( uint8_t *, int, uint8_t *, int, int w, int h);
+void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
+void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
+void x264_memzero_aligned_mmx( void * dst, int n );
+void x264_memzero_aligned_sse2( void * dst, int n );
+void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride );
+void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, int stride );
+void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride );
+void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride );
+void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
+void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
+void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
+void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, int len );
+#define LOWRES(cpu)\
+void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
+ int src_stride, int dst_stride, int width, int height );
LOWRES(mmxext)
LOWRES(cache32_mmxext)
LOWRES(sse2)
LOWRES(ssse3)
#define PIXEL_AVG_W(width,cpu)\
-extern void x264_pixel_avg2_w##width##_##cpu( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+void x264_pixel_avg2_w##width##_##cpu( uint8_t *, int, uint8_t *, int, uint8_t *, int );
/* This declares some functions that don't exist, but that isn't a problem. */
#define PIXEL_AVG_WALL(cpu)\
PIXEL_AVG_W(4,cpu); PIXEL_AVG_W(8,cpu); PIXEL_AVG_W(12,cpu); PIXEL_AVG_W(16,cpu); PIXEL_AVG_W(20,cpu);
@@ -228,8 +228,8 @@
}
}
-static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
-static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
+static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
#define MC_LUMA(name,instr1,instr2)\
static void mc_luma_##name( uint8_t *dst, int i_dst_stride,\
@@ -309,7 +309,6 @@
void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\
void x264_hpel_filter_c_##cpuc( uint8_t *dst, int16_t *buf, int width );\
void x264_hpel_filter_h_##cpuh( uint8_t *dst, uint8_t *src, int width );\
-void x264_sfence( void );\
static void x264_hpel_filter_##cpu( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,\
int stride, int width, int height, int16_t *buf )\
{\
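
The hpel_ref0/hpel_ref1 tables shrunk to uint8_t above encode, for each quarter-pel position, which two half-pel planes (full, H, V, C) get averaged. A condensed sketch of how the mc_luma wrappers generated by MC_LUMA use them, mirroring (from memory) the C reference in common/mc.c; pixel_avg and mc_copy stand in for the width-specific helpers, so treat the exact calls as illustrative:

    int qpel_idx = ((mvy & 3) << 2) + (mvx & 3);
    int offset   = (mvy >> 2) * i_src_stride + (mvx >> 2);
    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy & 3) == 3) * i_src_stride;
    if( qpel_idx & 5 ) /* quarter-pel position: average two half-pel planes */
    {
        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx & 3) == 3);
        pixel_avg( dst, i_dst_stride, src1, i_src_stride, src2, i_src_stride, i_width, i_height );
    }
    else /* integer- or half-pel position: plain copy from the selected plane */
        mc_copy( dst, i_dst_stride, src1, i_src_stride, i_width, i_height );
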
Changed |
x264-snapshot-20100517-2245.tar.bz2/common/x86/pixel-32.asm
@@ -61,9 +61,9 @@
%endmacro
;-----------------------------------------------------------------------------
-; int x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int )
+; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_sa8d_8x8_internal_mmxext
+cglobal pixel_sa8d_8x8_internal_mmxext
push r0
push r2
sub esp, 0x74
@@ -169,9 +169,9 @@
%endmacro
;-----------------------------------------------------------------------------
-; void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *fenc, int16_t edges[2][8], int *res )
+; void intra_sa8d_x3_8x8_core( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
-cglobal x264_intra_sa8d_x3_8x8_core_mmxext
+cglobal intra_sa8d_x3_8x8_core_mmxext
mov eax, [esp+4]
mov ecx, [esp+8]
sub esp, 0x70
@@ -329,10 +329,10 @@
;-----------------------------------------------------------------------------
-; void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
-; const uint8_t *pix2, int stride2, int sums[2][4] )
+; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
+; const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_ssim_4x4x2_core_mmxext
+cglobal pixel_ssim_4x4x2_core_mmxext
push ebx
push edi
mov ebx, [esp+16]
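
pixel_ssim_4x4x2_core fills sums[2][4] for two horizontally adjacent 4x4 blocks. A sketch of what each slot holds, following the C reference in common/pixel.c (reconstructed from memory, so treat the details as approximate):

    /* for each of the two 4x4 blocks (z = 0, 1): */
    uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0;
    for( int y = 0; y < 4; y++ )
        for( int x = 0; x < 4; x++ )
        {
            int a = pix1[x + y*stride1];
            int b = pix2[x + y*stride2];
            s1  += a;          /* sum of first input block          */
            s2  += b;          /* sum of second input block         */
            ss  += a*a + b*b;  /* sum of squares of both blocks     */
            s12 += a*b;        /* cross term                        */
        }
    sums[z][0] = s1; sums[z][1] = s2; sums[z][2] = ss; sums[z][3] = s12;
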
Changed |
x264-snapshot-20100517-2245.tar.bz2/common/x86/pixel-a.asm
@@ -27,17 +27,14 @@
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA
-pw_1: times 8 dw 1
-pw_00ff: times 8 dw 0xff
-ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
-ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
+SECTION_RODATA 32
mask_ff: times 16 db 0xff
times 16 db 0
+ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
+ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
mask_ac4: dw 0, -1, -1, -1, 0, -1, -1, -1
mask_ac4b: dw 0, -1, 0, -1, -1, -1, -1, -1
mask_ac8: dw 0, -1, -1, -1, -1, -1, -1, -1
-hsub_mul: times 8 db 1, -1
hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
hmul_8p: times 8 db 1
times 4 db 1, -1
@@ -46,6 +43,11 @@
SECTION .text
+cextern pw_1
+cextern pw_00ff
+
+cextern hsub_mul
+
%macro HADDD 2 ; sum junk
%if mmsize == 16
movhlps %2, %1
@@ -213,7 +215,7 @@
%endmacro
;-----------------------------------------------------------------------------
-; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int )
+; int pixel_ssd_16x16( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SSD 3-4 0
%if %1 != %2
@@ -221,7 +223,7 @@
%else
%assign function_align 16
%endif
-cglobal x264_pixel_ssd_%1x%2_%3, 0,0,0
+cglobal pixel_ssd_%1x%2_%3, 0,0,0
mov al, %1*%2/mmsize/2
%if %1 != %2
@@ -365,21 +367,21 @@
%endmacro
;-----------------------------------------------------------------------------
-; int x264_pixel_var_wxh_mmxext( uint8_t *, int )
+; int pixel_var_wxh( uint8_t *, int )
;-----------------------------------------------------------------------------
INIT_MMX
-cglobal x264_pixel_var_16x16_mmxext, 2,3
+cglobal pixel_var_16x16_mmxext, 2,3
VAR_START 0
VAR_2ROW 8, 16
VAR_END
-cglobal x264_pixel_var_8x8_mmxext, 2,3
+cglobal pixel_var_8x8_mmxext, 2,3
VAR_START 0
VAR_2ROW r1, 4
VAR_END
INIT_XMM
-cglobal x264_pixel_var_16x16_sse2, 2,3,8
+cglobal pixel_var_16x16_sse2, 2,3,8
VAR_START 1
mov r2d, 8
.loop:
@@ -392,7 +394,7 @@
jg .loop
VAR_END
-cglobal x264_pixel_var_8x8_sse2, 2,4,8
+cglobal pixel_var_8x8_sse2, 2,4,8
VAR_START 1
mov r2d, 2
lea r3, [r1*3]
@@ -421,11 +423,11 @@
%endmacro
;-----------------------------------------------------------------------------
-; int x264_pixel_var2_8x8_mmxext( uint8_t *, int, uint8_t *, int, int * )
+; int pixel_var2_8x8( uint8_t *, int, uint8_t *, int, int * )
;-----------------------------------------------------------------------------
%ifndef ARCH_X86_64
INIT_MMX
-cglobal x264_pixel_var2_8x8_mmxext, 5,6
+cglobal pixel_var2_8x8_mmxext, 5,6
VAR_START 0
mov r5d, 8
.loop:
@@ -455,7 +457,7 @@
%endif
INIT_XMM
-cglobal x264_pixel_var2_8x8_sse2, 5,6,8
+cglobal pixel_var2_8x8_sse2, 5,6,8
VAR_START 1
mov r5d, 4
.loop:
@@ -479,7 +481,7 @@
VAR2_END
RET
-cglobal x264_pixel_var2_8x8_ssse3, 5,6,8
+cglobal pixel_var2_8x8_ssse3, 5,6,8
pxor m5, m5 ; sum
pxor m6, m6 ; sum squared
mova m7, [hsub_mul]
@@ -692,10 +694,10 @@
; for small blocks on x86_32, modify pixel pointer instead.
;-----------------------------------------------------------------------------
-; int x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
+; int pixel_satd_16x16( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
INIT_MMX
-cglobal x264_pixel_satd_16x4_internal_mmxext
+cglobal pixel_satd_16x4_internal_mmxext
SATD_4x4_MMX m2, 0, 0
SATD_4x4_MMX m1, 4, 0
paddw m0, m2
@@ -706,69 +708,69 @@
paddw m0, m1
ret
-cglobal x264_pixel_satd_8x8_internal_mmxext
+cglobal pixel_satd_8x8_internal_mmxext
SATD_4x4_MMX m2, 0, 0
SATD_4x4_MMX m1, 4, 1
paddw m0, m2
paddw m0, m1
-x264_pixel_satd_8x4_internal_mmxext:
+pixel_satd_8x4_internal_mmxext:
SATD_4x4_MMX m2, 0, 0
SATD_4x4_MMX m1, 4, 0
paddw m0, m2
paddw m0, m1
ret
-cglobal x264_pixel_satd_16x16_mmxext, 4,6
+cglobal pixel_satd_16x16_mmxext, 4,6
SATD_START_MMX
pxor m0, m0
%rep 3
- call x264_pixel_satd_16x4_internal_mmxext
+ call pixel_satd_16x4_internal_mmxext
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%endrep
- call x264_pixel_satd_16x4_internal_mmxext
+ call pixel_satd_16x4_internal_mmxext
HADDUW m0, m1
movd eax, m0
RET
-cglobal x264_pixel_satd_16x8_mmxext, 4,6
+cglobal pixel_satd_16x8_mmxext, 4,6
SATD_START_MMX
pxor m0, m0
- call x264_pixel_satd_16x4_internal_mmxext
+ call pixel_satd_16x4_internal_mmxext
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
- call x264_pixel_satd_16x4_internal_mmxext
+ call pixel_satd_16x4_internal_mmxext
SATD_END_MMX
-cglobal x264_pixel_satd_8x16_mmxext, 4,6
+cglobal pixel_satd_8x16_mmxext, 4,6
SATD_START_MMX
pxor m0, m0
- call x264_pixel_satd_8x8_internal_mmxext
+ call pixel_satd_8x8_internal_mmxext
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
- call x264_pixel_satd_8x8_internal_mmxext
+ call pixel_satd_8x8_internal_mmxext
SATD_END_MMX
-cglobal x264_pixel_satd_8x8_mmxext, 4,6
+cglobal pixel_satd_8x8_mmxext, 4,6
SATD_START_MMX
pxor m0, m0
- call x264_pixel_satd_8x8_internal_mmxext
+ call pixel_satd_8x8_internal_mmxext
SATD_END_MMX
-cglobal x264_pixel_satd_8x4_mmxext, 4,6
+cglobal pixel_satd_8x4_mmxext, 4,6
SATD_START_MMX
pxor m0, m0
- call x264_pixel_satd_8x4_internal_mmxext
+ call pixel_satd_8x4_internal_mmxext
SATD_END_MMX
-cglobal x264_pixel_satd_4x8_mmxext, 4,6
+cglobal pixel_satd_4x8_mmxext, 4,6
SATD_START_MMX
SATD_4x4_MMX m0, 0, 1
SATD_4x4_MMX m1, 0, 0
paddw m0, m1
SATD_END_MMX
-cglobal x264_pixel_satd_4x4_mmxext, 4,6
+cglobal pixel_satd_4x4_mmxext, 4,6
SATD_START_MMX
SATD_4x4_MMX m0, 0, 0
SATD_END_MMX
@@ -808,12 +810,12 @@
%endmacro
;-----------------------------------------------------------------------------
-; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
+; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SATDS_SSE2 1
INIT_XMM
%ifnidn %1, sse2
-cglobal x264_pixel_satd_4x4_%1, 4, 6, 6
+cglobal pixel_satd_4x4_%1, 4, 6, 6
SATD_START_MMX
mova m4, [hmul_4p]
LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
@@ -829,7 +831,7 @@
RET
%endif
-cglobal x264_pixel_satd_4x8_%1, 4, 6, 8
+cglobal pixel_satd_4x8_%1, 4, 6, 8
SATD_START_MMX
%ifnidn %1, sse2
mova m7, [hmul_4p]
@@ -869,16 +871,16 @@
movd eax, m6
RET
-cglobal x264_pixel_satd_8x8_internal_%1
+cglobal pixel_satd_8x8_internal_%1
LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6
-x264_pixel_satd_8x4_internal_%1:
+pixel_satd_8x4_internal_%1:
LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6
ret
%ifdef UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
-cglobal x264_pixel_satd_16x4_internal_%1
+cglobal pixel_satd_16x4_internal_%1
LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
lea r2, [r2+4*r3]
lea r0, [r0+4*r1]
@@ -886,67 +888,67 @@
SATD_8x4_SSE ssse3, 4, 8, 5, 9, 6, 3, 10
ret
-cglobal x264_pixel_satd_16x8_%1, 4,6,12
+cglobal pixel_satd_16x8_%1, 4,6,12
SATD_START_SSE2 %1, m10, m7
%ifidn %1, sse2
mova m7, [pw_00ff]
%endif
- jmp x264_pixel_satd_16x8_internal_%1
+ jmp pixel_satd_16x8_internal_%1
-cglobal x264_pixel_satd_16x16_%1, 4,6,12
+cglobal pixel_satd_16x16_%1, 4,6,12
SATD_START_SSE2 %1, m10, m7
%ifidn %1, sse2
mova m7, [pw_00ff]
%endif
- call x264_pixel_satd_16x4_internal_%1
- call x264_pixel_satd_16x4_internal_%1
-x264_pixel_satd_16x8_internal_%1:
- call x264_pixel_satd_16x4_internal_%1
- call x264_pixel_satd_16x4_internal_%1
+ call pixel_satd_16x4_internal_%1
+ call pixel_satd_16x4_internal_%1
+pixel_satd_16x8_internal_%1:
+ call pixel_satd_16x4_internal_%1
+ call pixel_satd_16x4_internal_%1
SATD_END_SSE2 %1, m10
%else
-cglobal x264_pixel_satd_16x8_%1, 4,6,8
+cglobal pixel_satd_16x8_%1, 4,6,8
SATD_START_SSE2 %1, m6, m7
BACKUP_POINTERS
- call x264_pixel_satd_8x8_internal_%1
+ call pixel_satd_8x8_internal_%1
RESTORE_AND_INC_POINTERS
- call x264_pixel_satd_8x8_internal_%1
+ call pixel_satd_8x8_internal_%1
SATD_END_SSE2 %1, m6
-cglobal x264_pixel_satd_16x16_%1, 4,6,8
+cglobal pixel_satd_16x16_%1, 4,6,8
SATD_START_SSE2 %1, m6, m7
BACKUP_POINTERS
- call x264_pixel_satd_8x8_internal_%1
- call x264_pixel_satd_8x8_internal_%1
+ call pixel_satd_8x8_internal_%1
+ call pixel_satd_8x8_internal_%1
RESTORE_AND_INC_POINTERS
- call x264_pixel_satd_8x8_internal_%1
- call x264_pixel_satd_8x8_internal_%1
+ call pixel_satd_8x8_internal_%1
+ call pixel_satd_8x8_internal_%1
SATD_END_SSE2 %1, m6
%endif
-cglobal x264_pixel_satd_8x16_%1, 4,6,8
+cglobal pixel_satd_8x16_%1, 4,6,8
SATD_START_SSE2 %1, m6, m7
- call x264_pixel_satd_8x8_internal_%1
- call x264_pixel_satd_8x8_internal_%1
+ call pixel_satd_8x8_internal_%1
+ call pixel_satd_8x8_internal_%1
SATD_END_SSE2 %1, m6
-cglobal x264_pixel_satd_8x8_%1, 4,6,8
+cglobal pixel_satd_8x8_%1, 4,6,8
SATD_START_SSE2 %1, m6, m7
- call x264_pixel_satd_8x8_internal_%1
+ call pixel_satd_8x8_internal_%1
SATD_END_SSE2 %1, m6
-cglobal x264_pixel_satd_8x4_%1, 4,6,8
+cglobal pixel_satd_8x4_%1, 4,6,8
SATD_START_SSE2 %1, m6, m7
- call x264_pixel_satd_8x4_internal_%1
+ call pixel_satd_8x4_internal_%1
SATD_END_SSE2 %1, m6
%endmacro ; SATDS_SSE2
%macro SA8D 1
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
-; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
+; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_sa8d_8x8_internal_%1
+cglobal pixel_sa8d_8x8_internal_%1
lea r10, [r0+4*r1]
lea r11, [r2+4*r3]
LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
@@ -970,41 +972,41 @@
paddw m0, m1
paddw m0, m2
paddw m0, m8
- SAVE_MM_PERMUTATION x264_pixel_sa8d_8x8_internal_%1
+ SAVE_MM_PERMUTATION pixel_sa8d_8x8_internal_%1
ret
-cglobal x264_pixel_sa8d_8x8_%1, 4,6,12
+cglobal pixel_sa8d_8x8_%1, 4,6,12
lea r4, [3*r1]
lea r5, [3*r3]
%ifnidn %1, sse2
mova m7, [hmul_8p]
%endif
- call x264_pixel_sa8d_8x8_internal_%1
+ call pixel_sa8d_8x8_internal_%1
HADDW m0, m1
movd eax, m0
add eax, 1
shr eax, 1
RET
-cglobal x264_pixel_sa8d_16x16_%1, 4,6,12
+cglobal pixel_sa8d_16x16_%1, 4,6,12
lea r4, [3*r1]
lea r5, [3*r3]
%ifnidn %1, sse2
mova m7, [hmul_8p]
%endif
- call x264_pixel_sa8d_8x8_internal_%1 ; pix[0]
+ call pixel_sa8d_8x8_internal_%1 ; pix[0]
add r2, 8
add r0, 8
mova m10, m0
- call x264_pixel_sa8d_8x8_internal_%1 ; pix[8]
+ call pixel_sa8d_8x8_internal_%1 ; pix[8]
lea r2, [r2+8*r3]
lea r0, [r0+8*r1]
paddusw m10, m0
- call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride+8]
+ call pixel_sa8d_8x8_internal_%1 ; pix[8*stride+8]
sub r2, 8
sub r0, 8
paddusw m10, m0
- call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride]
+ call pixel_sa8d_8x8_internal_%1 ; pix[8*stride]
paddusw m0, m10
HADDUW m0, m1
movd eax, m0
@@ -1014,7 +1016,7 @@
%else ; ARCH_X86_32
%ifnidn %1, mmxext
-cglobal x264_pixel_sa8d_8x8_internal_%1
+cglobal pixel_sa8d_8x8_internal_%1
%define spill0 [esp+4]
%define spill1 [esp+20]
%define spill2 [esp+36]
@@ -1064,13 +1066,13 @@
ret
%endif ; ifndef mmxext
-cglobal x264_pixel_sa8d_8x8_%1, 4,7
+cglobal pixel_sa8d_8x8_%1, 4,7
mov r6, esp
and esp, ~15
sub esp, 48
lea r4, [3*r1]
lea r5, [3*r3]
- call x264_pixel_sa8d_8x8_internal_%1
+ call pixel_sa8d_8x8_internal_%1
HADDW m0, m1
movd eax, m0
add eax, 1
@@ -1078,26 +1080,26 @@
mov esp, r6
RET
-cglobal x264_pixel_sa8d_16x16_%1, 4,7
+cglobal pixel_sa8d_16x16_%1, 4,7
mov r6, esp
and esp, ~15
sub esp, 64
lea r4, [3*r1]
lea r5, [3*r3]
- call x264_pixel_sa8d_8x8_internal_%1
+ call pixel_sa8d_8x8_internal_%1
%ifidn %1, mmxext
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%endif
mova [esp+48], m0
- call x264_pixel_sa8d_8x8_internal_%1
+ call pixel_sa8d_8x8_internal_%1
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8
add r2, 8
paddusw m0, [esp+48]
mova [esp+48], m0
- call x264_pixel_sa8d_8x8_internal_%1
+ call pixel_sa8d_8x8_internal_%1
%ifidn %1, mmxext
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
@@ -1106,7 +1108,7 @@
paddusw m0, [esp+48]
%endif
mova [esp+64-mmsize], m0
- call x264_pixel_sa8d_8x8_internal_%1
+ call pixel_sa8d_8x8_internal_%1
paddusw m0, [esp+64-mmsize]
%if mmsize == 16
HADDUW m0, m1
@@ -1140,9 +1142,9 @@
%ifdef ARCH_X86_64
INIT_XMM
;-----------------------------------------------------------------------------
-; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
+; void intra_sa8d_x3_8x8_core( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
-cglobal x264_intra_sa8d_x3_8x8_core_%1, 3,3,16
+cglobal intra_sa8d_x3_8x8_core_%1, 3,3,16
; 8x8 hadamard
pxor m8, m8
movq m0, [r0+0*FENC_STRIDE]
@@ -1247,7 +1249,7 @@
; in: r0 = fenc
; out: m0..m3 = hadamard coefs
INIT_MMX
-cglobal x264_hadamard_load
+cglobal hadamard_load
; not really a global, but otherwise cycles get attributed to the wrong function in profiling
pxor m7, m7
movd m0, [r0+0*FENC_STRIDE]
@@ -1259,7 +1261,7 @@
punpcklbw m2, m7
punpcklbw m3, m7
HADAMARD4_2D 0, 1, 2, 3, 4
- SAVE_MM_PERMUTATION x264_hadamard_load
+ SAVE_MM_PERMUTATION hadamard_load
ret
%macro SCALAR_SUMSUB 4
@@ -1377,9 +1379,9 @@
%macro INTRA_SATDS_MMX 1
INIT_MMX
;-----------------------------------------------------------------------------
-; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
+; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
-cglobal x264_intra_satd_x3_4x4_%1, 2,6
+cglobal intra_satd_x3_4x4_%1, 2,6
%ifdef ARCH_X86_64
; stack is 16 byte aligned because abi says so
%define top_1d rsp-8 ; size 8
@@ -1393,7 +1395,7 @@
%define t0 r2
%endif
- call x264_hadamard_load
+ call hadamard_load
SCALAR_HADAMARD_LEFT 0, r0, r3, r4, r5
mov t0d, r0d
SCALAR_HADAMARD_TOP 0, r0, r3, r4, r5
@@ -1430,9 +1432,9 @@
%endif
;-----------------------------------------------------------------------------
-; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
+; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
-cglobal x264_intra_satd_x3_16x16_%1, 0,7
+cglobal intra_satd_x3_16x16_%1, 0,7
%ifdef ARCH_X86_64
%assign stack_pad 88
%else
@@ -1466,7 +1468,7 @@
.loop_y:
xor r4d, r4d
.loop_x:
- call x264_hadamard_load
+ call hadamard_load
SUM3x4 %1
SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4]
@@ -1507,9 +1509,9 @@
RET
;-----------------------------------------------------------------------------
-; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
+; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
-cglobal x264_intra_satd_x3_8x8c_%1, 0,6
+cglobal intra_satd_x3_8x8c_%1, 0,6
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
SUB rsp, 72
%define sums rsp+48 ; size 24
@@ -1555,7 +1557,7 @@
.loop_y:
xor r4d, r4d
.loop_x:
- call x264_hadamard_load
+ call hadamard_load
SUM3x4 %1
SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4]
@@ -1609,7 +1611,7 @@
; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
; out: [tmp]=hadamard4, m0=satd
-cglobal x264_hadamard_ac_4x4_mmxext
+cglobal hadamard_ac_4x4_mmxext
movh m0, [r0]
movh m1, [r0+r1]
movh m2, [r0+r1*2]
@@ -1631,10 +1633,10 @@
paddw m0, m1
paddw m2, m3
paddw m0, m2
- SAVE_MM_PERMUTATION x264_hadamard_ac_4x4_mmxext
+ SAVE_MM_PERMUTATION hadamard_ac_4x4_mmxext
ret
-cglobal x264_hadamard_ac_2x2max_mmxext
+cglobal hadamard_ac_2x2max_mmxext
mova m0, [r3+0x00]
mova m1, [r3+0x20]
mova m2, [r3+0x40]
@@ -1646,30 +1648,30 @@
HADAMARD 0, max, 1, 3, 4, 5
paddw m7, m0
paddw m7, m1
- SAVE_MM_PERMUTATION x264_hadamard_ac_2x2max_mmxext
+ SAVE_MM_PERMUTATION hadamard_ac_2x2max_mmxext
ret
-cglobal x264_hadamard_ac_8x8_mmxext
+cglobal hadamard_ac_8x8_mmxext
mova m6, [mask_ac4]
pxor m7, m7
- call x264_hadamard_ac_4x4_mmxext
+ call hadamard_ac_4x4_mmxext
add r0, 4
add r3, 32
mova m5, m0
- call x264_hadamard_ac_4x4_mmxext
+ call hadamard_ac_4x4_mmxext
lea r0, [r0+4*r1]
add r3, 64
paddw m5, m0
- call x264_hadamard_ac_4x4_mmxext
+ call hadamard_ac_4x4_mmxext
sub r0, 4
sub r3, 32
paddw m5, m0
- call x264_hadamard_ac_4x4_mmxext
+ call hadamard_ac_4x4_mmxext
paddw m5, m0
sub r3, 40
mova [rsp+gprsize+8], m5 ; save satd
%rep 3
- call x264_hadamard_ac_2x2max_mmxext
+ call hadamard_ac_2x2max_mmxext
%endrep
mova m0, [r3+0x00]
mova m1, [r3+0x20]
@@ -1686,33 +1688,33 @@
paddw m6, m7
mova [rsp+gprsize], m6 ; save sa8d
SWAP m0, m6
- SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_mmxext
+ SAVE_MM_PERMUTATION hadamard_ac_8x8_mmxext
ret
%macro HADAMARD_AC_WXH_MMX 2
-cglobal x264_pixel_hadamard_ac_%1x%2_mmxext, 2,4
+cglobal pixel_hadamard_ac_%1x%2_mmxext, 2,4
%assign pad 16-gprsize-(stack_offset&15)
%define ysub r1
sub rsp, 16+128+pad
lea r2, [r1*3]
lea r3, [rsp+16]
- call x264_hadamard_ac_8x8_mmxext
+ call hadamard_ac_8x8_mmxext
%if %2==16
%define ysub r2
lea r0, [r0+r1*4]
sub rsp, 16
- call x264_hadamard_ac_8x8_mmxext
+ call hadamard_ac_8x8_mmxext
%endif
%if %1==16
neg ysub
sub rsp, 16
lea r0, [r0+ysub*4+8]
neg ysub
- call x264_hadamard_ac_8x8_mmxext
+ call hadamard_ac_8x8_mmxext
%if %2==16
lea r0, [r0+r1*4]
sub rsp, 16
- call x264_hadamard_ac_8x8_mmxext
+ call hadamard_ac_8x8_mmxext
%endif
%endif
mova m1, [rsp+0x08]
@@ -1779,7 +1781,7 @@
INIT_XMM
; in: r0=pix, r1=stride, r2=stride*3
; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
-cglobal x264_hadamard_ac_8x8_%1
+cglobal hadamard_ac_8x8_%1
%ifdef ARCH_X86_64
%define spill0 m8
%define spill1 m9
@@ -1883,7 +1885,7 @@
paddw m2, m4
paddw m0, m2
mova [rsp+gprsize+16], m0 ; save sa8d
- SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_%1
+ SAVE_MM_PERMUTATION hadamard_ac_8x8_%1
ret
HADAMARD_AC_WXH_SSE2 16, 16, %1
@@ -1892,30 +1894,30 @@
HADAMARD_AC_WXH_SSE2 8, 8, %1
%endmacro ; HADAMARD_AC_SSE2
-; struct { int satd, int sa8d; } x264_pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
+; struct { int satd, int sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
%macro HADAMARD_AC_WXH_SSE2 3
-cglobal x264_pixel_hadamard_ac_%1x%2_%3, 2,3,11
+cglobal pixel_hadamard_ac_%1x%2_%3, 2,3,11
%assign pad 16-gprsize-(stack_offset&15)
%define ysub r1
sub rsp, 48+pad
lea r2, [r1*3]
- call x264_hadamard_ac_8x8_%3
+ call hadamard_ac_8x8_%3
%if %2==16
%define ysub r2
lea r0, [r0+r1*4]
sub rsp, 32
- call x264_hadamard_ac_8x8_%3
+ call hadamard_ac_8x8_%3
%endif
%if %1==16
neg ysub
sub rsp, 32
lea r0, [r0+ysub*4+8]
neg ysub
- call x264_hadamard_ac_8x8_%3
+ call hadamard_ac_8x8_%3
%if %2==16
lea r0, [r0+r1*4]
sub rsp, 32
- call x264_hadamard_ac_8x8_%3
+ call hadamard_ac_8x8_%3
%endif
%endif
mova m1, [rsp+0x20]
@@ -1947,7 +1949,7 @@
; instantiate satds
%ifndef ARCH_X86_64
-cextern x264_pixel_sa8d_8x8_internal_mmxext
+cextern pixel_sa8d_8x8_internal_mmxext
SA8D mmxext
%endif
@@ -1999,8 +2001,8 @@
;=============================================================================
;-----------------------------------------------------------------------------
-; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
-; const uint8_t *pix2, int stride2, int sums[2][4] )
+; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
+; const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
%macro SSIM_ITER 1
@@ -2033,7 +2035,7 @@
paddd m3, m6
%endmacro
-cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4,8
+cglobal pixel_ssim_4x4x2_core_sse2, 4,4,8
pxor m0, m0
SSIM_ITER 0
SSIM_ITER 1
@@ -2069,9 +2071,9 @@
RET
;-----------------------------------------------------------------------------
-; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width )
+; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_ssim_end4_sse2, 3,3,7
+cglobal pixel_ssim_end4_sse2, 3,3,7
movdqa m0, [r0+ 0]
movdqa m1, [r0+16]
movdqa m2, [r0+32]
@@ -2175,10 +2177,10 @@
%define ABS1 ABS1_MMX
;-----------------------------------------------------------------------------
-; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
-; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
+; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
+; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_ads4_mmxext, 4,7
+cglobal pixel_ads4_mmxext, 4,7
movq mm6, [r0]
movq mm4, [r0+8]
pshufw mm7, mm6, 0
@@ -2215,7 +2217,7 @@
movd [t0], mm1
ADS_END 1
-cglobal x264_pixel_ads2_mmxext, 4,7
+cglobal pixel_ads2_mmxext, 4,7
movq mm6, [r0]
pshufw mm5, r6m, 0
pshufw mm7, mm6, 0
@@ -2236,7 +2238,7 @@
movd [t0], mm4
ADS_END 1
-cglobal x264_pixel_ads1_mmxext, 4,7
+cglobal pixel_ads1_mmxext, 4,7
pshufw mm7, [r0], 0
pshufw mm6, r6m, 0
ADS_START 2
@@ -2258,7 +2260,7 @@
ADS_END 2
%macro ADS_SSE2 1
-cglobal x264_pixel_ads4_%1, 4,7,12
+cglobal pixel_ads4_%1, 4,7,12
movdqa xmm4, [r0]
pshuflw xmm7, xmm4, 0
pshuflw xmm6, xmm4, 0xAA
@@ -2327,7 +2329,7 @@
%endif ; ARCH
ADS_END 2
-cglobal x264_pixel_ads2_%1, 4,7,8
+cglobal pixel_ads2_%1, 4,7,8
movq xmm6, [r0]
movd xmm5, r6m
pshuflw xmm7, xmm6, 0
@@ -2353,7 +2355,7 @@
movq [t0], xmm1
ADS_END 2
-cglobal x264_pixel_ads1_%1, 4,7,8
+cglobal pixel_ads1_%1, 4,7,8
movd xmm7, [r0]
movd xmm6, r6m
pshuflw xmm7, xmm7, 0
@@ -2385,7 +2387,7 @@
%define ABS1 ABS1_SSSE3
ADS_SSE2 ssse3
-; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
+; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {
; int nmv=0, i, j;
; *(uint32_t*)(masks+width) = 0;
@@ -2399,7 +2401,7 @@
; }
; return nmv;
; }
-cglobal x264_pixel_ads_mvs, 0,7,0
+cglobal pixel_ads_mvs, 0,7,0
ads_mvs:
%ifdef ARCH_X86_64
; mvs = r4
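
ssim_c1 (.01^2*255^2*64 = 416) and ssim_c2 (.03^2*255^2*64*63 = 235963) moved into the local RODATA above are the SSIM stabilizing constants pre-scaled for 64-sample sums: pixel_ssim_end4 combines four neighbouring 4x4 sums into one 8x8 window per term. A sketch of the per-window math, following (from memory) the C reference in common/pixel.c, with exact float evaluation order treated as an assumption:

    static float ssim_end1( int s1, int s2, int ss, int s12 )
    {
        int vars  = ss*64  - s1*s1 - s2*s2;   /* 64 * combined variance   */
        int covar = s12*64 - s1*s2;           /* 64 * covariance          */
        return (float)(2*s1*s2 + ssim_c1) * (float)(2*covar + ssim_c2)
             / ((float)(s1*s1 + s2*s2 + ssim_c1) * (float)(vars + ssim_c2));
    }
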
Changed |
x264-snapshot-20100517-2245.tar.bz2/common/x86/predict-a.asm
@@ -25,6 +25,24 @@
%include "x86inc.asm"
%include "x86util.asm"
+SECTION_RODATA
+
+pw_76543210:
+pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7
+pb_00s_ff: times 8 db 0
+pb_0s_ff: times 7 db 0
+ db 0xff
+
+SECTION .text
+
+cextern pb_1
+cextern pb_3
+cextern pw_2
+cextern pw_4
+cextern pw_8
+cextern pw_ff00
+cextern pb_reverse
+
%macro STORE8x8 2
add r0, 4*FDEC_STRIDE
movq [r0 + -4*FDEC_STRIDE], %1
@@ -74,24 +92,6 @@
movdqa [r0 + 3*FDEC_STRIDE], %1
%endmacro
-SECTION_RODATA
-
-ALIGN 16
-pb_1: times 16 db 1
-pb_3: times 16 db 3
-pw_2: times 4 dw 2
-pw_4: times 4 dw 4
-pw_8: times 8 dw 8
-pw_76543210:
-pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7
-pb_00s_ff: times 8 db 0
-pb_0s_ff: times 7 db 0
- db 0xff
-pw_ff00: times 8 dw 0xff00
-pb_reverse: db 7, 6, 5, 4, 3, 2, 1, 0
-
-SECTION .text
-
; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED8x8_LOWPASS0 6
@@ -126,7 +126,7 @@
%endmacro
;-----------------------------------------------------------------------------
-; void predict_4x4_ddl_mmxext( uint8_t *src )
+; void predict_4x4_ddl( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_ddl_mmxext, 1,1
movq mm1, [r0-FDEC_STRIDE]
@@ -149,7 +149,7 @@
RET
;-----------------------------------------------------------------------------
-; void predict_4x4_ddr_mmxext( uint8_t *src )
+; void predict_4x4_ddr( uint8_t *src )
;-----------------------------------------------------------------------------
%macro PREDICT_4x4 1
cglobal predict_4x4_ddr_%1, 1,1
@@ -233,7 +233,7 @@
PREDICT_4x4 ssse3
;-----------------------------------------------------------------------------
-; void predict_4x4_hu_mmxext( uint8_t *src )
+; void predict_4x4_hu( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_hu_mmxext, 1,1
movq mm0, [r0+0*FDEC_STRIDE-8]
@@ -264,7 +264,7 @@
RET
;-----------------------------------------------------------------------------
-; void predict_4x4_vl_mmxext( uint8_t *src )
+; void predict_4x4_vl( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_vl_mmxext, 1,1
movq mm1, [r0-FDEC_STRIDE]
@@ -426,7 +426,7 @@
PREDICT_FILTER ssse3
;-----------------------------------------------------------------------------
-; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_v( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_v_mmxext, 2,2
movq mm0, [r1+16]
@@ -434,7 +434,7 @@
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] )
+; void predict_8x8_h( uint8_t *src, uint8_t edge[33] )
;-----------------------------------------------------------------------------
INIT_MMX
@@ -459,7 +459,7 @@
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge );
+; void predict_8x8_dc( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
cglobal predict_8x8_dc_mmxext, 2,2
pxor mm0, mm0
@@ -475,7 +475,7 @@
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge );
+; void predict_8x8_dc_top( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
%macro PRED8x8_DC 2
cglobal %1, 2,2
@@ -497,7 +497,7 @@
; functions if we know sse2 is available.
;-----------------------------------------------------------------------------
-; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddl_mmxext, 2,2
movq mm5, [r1+16]
@@ -506,10 +506,10 @@
movq mm4, [r1+25]
movq mm1, mm5
psllq mm1, 8
+ add r0, FDEC_STRIDE*4
PRED8x8_LOWPASS mm0, mm1, mm2, mm5, mm7
PRED8x8_LOWPASS mm1, mm3, mm4, [r1+24], mm6
-
-%assign Y 7
+%assign Y 3
%rep 6
movq [r0+Y*FDEC_STRIDE], mm1
movq mm2, mm0
@@ -528,17 +528,17 @@
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddr_mmxext, 2,2
movq mm1, [r1+7]
movq mm2, [r1+9]
movq mm3, [r1+15]
movq mm4, [r1+17]
+ add r0, FDEC_STRIDE*4
PRED8x8_LOWPASS mm0, mm1, mm2, [r1+8], mm7
PRED8x8_LOWPASS mm1, mm3, mm4, [r1+16], mm6
-
-%assign Y 7
+%assign Y 3
%rep 6
movq [r0+Y*FDEC_STRIDE], mm0
movq mm2, mm1
@@ -557,7 +557,7 @@
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_hu_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
%define PALIGNR PALIGNR_MMX
cglobal predict_8x8_hu_mmxext, 2,2
@@ -602,7 +602,7 @@
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_vr_core( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
; fills only some pixels:
@@ -622,9 +622,10 @@
movq mm1, [r1+14]
movq mm4, mm3
pavgb mm3, mm2
+ add r0, FDEC_STRIDE*4
PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7
-%assign Y 0
+%assign Y -4
%rep 3
movq [r0+ Y *FDEC_STRIDE], mm3
movq [r0+(Y+1)*FDEC_STRIDE], mm0
@@ -638,7 +639,7 @@
RET
;-----------------------------------------------------------------------------
-; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
+; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
cglobal predict_8x8c_p_core_mmxext, 1,2
LOAD_PLANE_ARGS
@@ -666,7 +667,7 @@
REP_RET
;-----------------------------------------------------------------------------
-; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
+; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
cglobal predict_16x16_p_core_mmxext, 1,2
LOAD_PLANE_ARGS
@@ -710,16 +711,17 @@
%endif ; !ARCH_X86_64
;-----------------------------------------------------------------------------
-; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge )
+; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddl_sse2, 2,2
movdqa xmm3, [r1+16]
movdqu xmm2, [r1+17]
movdqa xmm1, xmm3
pslldq xmm1, 1
+ add r0, FDEC_STRIDE*4
PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
-%assign Y 0
+%assign Y -4
%rep 8
psrldq xmm0, 1
movq [r0+Y*FDEC_STRIDE], xmm0
@@ -728,18 +730,19 @@
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge )
+; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddr_sse2, 2,2
movdqu xmm3, [r1+8]
movdqu xmm1, [r1+7]
movdqa xmm2, xmm3
psrldq xmm2, 1
+ add r0, FDEC_STRIDE*4
PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
movdqa xmm1, xmm0
psrldq xmm1, 1
-%assign Y 7
+%assign Y 3
%rep 3
movq [r0+Y*FDEC_STRIDE], xmm0
movq [r0+(Y-1)*FDEC_STRIDE], xmm1
@@ -747,13 +750,13 @@
psrldq xmm1, 2
%assign Y (Y-2)
%endrep
- movq [r0+1*FDEC_STRIDE], xmm0
- movq [r0+0*FDEC_STRIDE], xmm1
+ movq [r0-3*FDEC_STRIDE], xmm0
+ movq [r0-4*FDEC_STRIDE], xmm1
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge )
+; void predict_8x8_vl( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_vl_sse2, 2,2
movdqa xmm4, [r1+16]
@@ -763,11 +766,12 @@
psrldq xmm2, 1
pslldq xmm1, 1
pavgb xmm3, xmm2
+ add r0, FDEC_STRIDE*4
PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm4, xmm5
; xmm0: (t0 + 2*t1 + t2 + 2) >> 2
; xmm3: (t0 + t1 + 1) >> 1
-%assign Y 0
+%assign Y -4
%rep 3
psrldq xmm0, 1
movq [r0+ Y *FDEC_STRIDE], xmm3
@@ -782,7 +786,7 @@
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_vr_sse2( uint8_t *src, uint8_t *edge )
+; void predict_8x8_vr( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_vr_sse2, 2,2,7
movdqu xmm0, [r1+8]
@@ -817,7 +821,7 @@
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_hd_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
%define PALIGNR PALIGNR_MMX
cglobal predict_8x8_hd_mmxext, 2,2
@@ -864,7 +868,7 @@
RET
;-----------------------------------------------------------------------------
-; void predict_8x8_hd_ssse3( uint8_t *src, uint8_t *edge )
+; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_HD 1
cglobal predict_8x8_hd_%1, 2,2
@@ -903,7 +907,7 @@
%define PALIGNR PALIGNR_MMX
;-----------------------------------------------------------------------------
-; void predict_8x8_hu_sse2( uint8_t *src, uint8_t *edge )
+; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_HU 1
cglobal predict_8x8_hu_%1, 2,2
@@ -965,7 +969,7 @@
PREDICT_8x8_HU ssse3
;-----------------------------------------------------------------------------
-; void predict_8x8c_v_mmx( uint8_t *src )
+; void predict_8x8c_v( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_8x8c_v_mmx, 1,1
movq mm0, [r0 - FDEC_STRIDE]
@@ -973,7 +977,7 @@
RET
;-----------------------------------------------------------------------------
-; void predict_8x8c_h_mmxext( uint8_t *src )
+; void predict_8x8c_h( uint8_t *src )
;-----------------------------------------------------------------------------
%macro PRED_8x8C_H 1
@@ -997,7 +1001,7 @@
PRED_8x8C_H ssse3
;-----------------------------------------------------------------------------
-; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
+; void predict_8x8c_dc_core( uint8_t *src, int s2, int s3 )
;-----------------------------------------------------------------------------
cglobal predict_8x8c_dc_core_mmxext, 1,1
movq mm0, [r0 - FDEC_STRIDE]
@@ -1052,7 +1056,7 @@
RET
;-----------------------------------------------------------------------------
-; void predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c )
+; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
cglobal predict_8x8c_p_core_sse2, 1,1
@@ -1094,7 +1098,7 @@
RET
;-----------------------------------------------------------------------------
-; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c )
+; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
cglobal predict_16x16_p_core_sse2, 1,2,8
movd xmm0, r1m
@@ -1138,7 +1142,7 @@
REP_RET
;-----------------------------------------------------------------------------
-; void predict_16x16_v_mmx( uint8_t *src )
+; void predict_16x16_v( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_16x16_v_mmx, 1,2
movq mm0, [r0 - FDEC_STRIDE]
@@ -1147,7 +1151,7 @@
REP_RET
;-----------------------------------------------------------------------------
-; void predict_16x16_v_sse2( uint8_t *src )
+; void predict_16x16_v( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_16x16_v_sse2, 1,1
movdqa xmm0, [r0 - FDEC_STRIDE]
@@ -1155,7 +1159,7 @@
RET
;-----------------------------------------------------------------------------
-; void predict_16x16_h_mmxext( uint8_t *src )
+; void predict_16x16_h( uint8_t *src )
;-----------------------------------------------------------------------------
%macro PRED_16x16_H 1
@@ -1188,7 +1192,7 @@
PRED_16x16_H ssse3
;-----------------------------------------------------------------------------
-; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
+; void predict_16x16_dc_core( uint8_t *src, int i_dc_left )
;-----------------------------------------------------------------------------
%macro PRED16x16_DC 2
@@ -1225,7 +1229,7 @@
REP_RET
;-----------------------------------------------------------------------------
-; void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left )
+; void predict_16x16_dc_core( uint8_t *src, int i_dc_left )
;-----------------------------------------------------------------------------
%macro PRED16x16_DC_SSE2 2
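
Several of the ddl/ddr/vl/vr hunks above add an "add r0, FDEC_STRIDE*4" and rebase the row counter Y around zero, presumably so every r0+Y*FDEC_STRIDE store fits in a one-byte displacement. The filter all of these predictors share is the one documented on the PRED8x8_LOWPASS macro; in scalar form (taken directly from the macro's own comment):

    /* out[n] = ( t[n-1] + 2*t[n] + t[n+1] + 2 ) >> 2 */
    static inline int lowpass( int left, int center, int right )
    {
        return (left + 2*center + right + 2) >> 2;
    }
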
Changed |
x264-snapshot-20100517-2245.tar.bz2/common/x86/predict-c.c
@@ -25,55 +25,55 @@
#include "predict.h"
#include "pixel.h"
-extern void predict_16x16_v_mmx( uint8_t *src );
-extern void predict_16x16_h_mmxext( uint8_t *src );
-extern void predict_16x16_h_ssse3( uint8_t *src );
-extern void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left );
-extern void predict_16x16_dc_left_core_mmxext( uint8_t *src, int i_dc_left );
-extern void predict_16x16_dc_top_mmxext( uint8_t *src );
-extern void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );
-extern void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
-extern void predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c );
-extern void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 );
-extern void predict_8x8c_dc_top_mmxext( uint8_t *src );
-extern void predict_8x8c_v_mmx( uint8_t *src );
-extern void predict_8x8c_h_mmxext( uint8_t *src );
-extern void predict_8x8c_h_ssse3( uint8_t *src );
-extern void predict_8x8_v_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hd_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hu_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_dc_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_ddl_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_ddr_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_vr_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hu_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hd_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hd_ssse3( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hu_ssse3( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_filter_mmxext ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
-extern void predict_8x8_filter_ssse3 ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
-extern void predict_4x4_ddl_mmxext( uint8_t *src );
-extern void predict_4x4_ddr_mmxext( uint8_t *src );
-extern void predict_4x4_vl_mmxext( uint8_t *src );
-extern void predict_4x4_vr_mmxext( uint8_t *src );
-extern void predict_4x4_vr_ssse3( uint8_t *src );
-extern void predict_4x4_hd_mmxext( uint8_t *src );
-extern void predict_4x4_hd_ssse3( uint8_t *src );
-extern void predict_4x4_dc_mmxext( uint8_t *src );
-extern void predict_4x4_ddr_ssse3( uint8_t *src );
-extern void predict_4x4_hu_mmxext( uint8_t *src );
-extern void predict_16x16_dc_top_sse2( uint8_t *src );
-extern void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left );
-extern void predict_16x16_dc_left_core_sse2( uint8_t *src, int i_dc_left );
-extern void predict_16x16_v_sse2( uint8_t *src );
-extern void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );
+ void x264_predict_16x16_v_mmx( uint8_t *src );
+ void x264_predict_16x16_h_mmxext( uint8_t *src );
+ void x264_predict_16x16_h_ssse3( uint8_t *src );
+ void x264_predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left );
+ void x264_predict_16x16_dc_left_core_mmxext( uint8_t *src, int i_dc_left );
+ void x264_predict_16x16_dc_top_mmxext( uint8_t *src );
+ void x264_predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );
+ void x264_predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
+ void x264_predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c );
+ void x264_predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 );
+ void x264_predict_8x8c_dc_top_mmxext( uint8_t *src );
+ void x264_predict_8x8c_v_mmx( uint8_t *src );
+ void x264_predict_8x8c_h_mmxext( uint8_t *src );
+ void x264_predict_8x8c_h_ssse3( uint8_t *src );
+ void x264_predict_8x8_v_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_hd_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_hu_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_dc_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_ddl_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_ddr_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_ddl_sse2( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_ddr_sse2( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_vr_sse2( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_hu_sse2( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_hd_sse2( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_hd_ssse3( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_hu_ssse3( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_filter_mmxext( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
+ void x264_predict_8x8_filter_ssse3( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
+ void x264_predict_4x4_ddl_mmxext( uint8_t *src );
+ void x264_predict_4x4_ddr_mmxext( uint8_t *src );
+ void x264_predict_4x4_vl_mmxext( uint8_t *src );
+ void x264_predict_4x4_vr_mmxext( uint8_t *src );
+ void x264_predict_4x4_vr_ssse3( uint8_t *src );
+ void x264_predict_4x4_hd_mmxext( uint8_t *src );
+ void x264_predict_4x4_hd_ssse3( uint8_t *src );
+ void x264_predict_4x4_dc_mmxext( uint8_t *src );
+ void x264_predict_4x4_ddr_ssse3( uint8_t *src );
+ void x264_predict_4x4_hu_mmxext( uint8_t *src );
+ void x264_predict_16x16_dc_top_sse2( uint8_t *src );
+ void x264_predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left );
+ void x264_predict_16x16_dc_left_core_sse2( uint8_t *src, int i_dc_left );
+ void x264_predict_16x16_v_sse2( uint8_t *src );
+ void x264_predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );
ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
@@ -84,7 +84,7 @@
V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );\
#define PREDICT_16x16_P(name)\
-static void predict_16x16_p_##name( uint8_t *src )\
+static void x264_predict_16x16_p_##name( uint8_t *src )\
{\
int a, b, c;\
int H = 0;\
@@ -102,7 +102,7 @@
b = ( 5 * H + 32 ) >> 6;\
c = ( 5 * V + 32 ) >> 6;\
i00 = a - b * 7 - c * 7 + 16;\
- predict_16x16_p_core_##name( src, i00, b, c );\
+ x264_predict_16x16_p_core_##name( src, i00, b, c );\
}
#ifndef ARCH_X86_64
@@ -111,7 +111,7 @@
PREDICT_16x16_P( sse2 )
#ifdef __GNUC__
-static void predict_16x16_p_ssse3( uint8_t *src )
+static void x264_predict_16x16_p_ssse3( uint8_t *src )
{
int a, b, c, i00;
int H, V;
@@ -143,12 +143,12 @@
b = ( 5 * H + 32 ) >> 6;
c = ( 5 * V + 32 ) >> 6;
i00 = a - b * 7 - c * 7 + 16;
- predict_16x16_p_core_sse2( src, i00, b, c );
+ x264_predict_16x16_p_core_sse2( src, i00, b, c );
}
#endif
#define PREDICT_8x8_P(name)\
-static void predict_8x8c_p_##name( uint8_t *src )\
+static void x264_predict_8x8c_p_##name( uint8_t *src )\
{\
int a, b, c;\
int H = 0;\
@@ -162,7 +162,7 @@
b = ( 17 * H + 16 ) >> 5;\
c = ( 17 * V + 16 ) >> 5;\
i00 = a -3*b -3*c + 16;\
- predict_8x8c_p_core_##name( src, i00, b, c );\
+ x264_predict_8x8c_p_core_##name( src, i00, b, c );\
}
#ifndef ARCH_X86_64
@@ -171,7 +171,7 @@
PREDICT_8x8_P( sse2 )
#ifdef __GNUC__
-static void predict_8x8c_p_ssse3( uint8_t *src )
+static void x264_predict_8x8c_p_ssse3( uint8_t *src )
{
int a, b, c, i00;
int H, V;
@@ -196,12 +196,12 @@
b = ( 17 * H + 16 ) >> 5;
c = ( 17 * V + 16 ) >> 5;
i00 = a -3*b -3*c + 16;
- predict_8x8c_p_core_sse2( src, i00, b, c );
+ x264_predict_8x8c_p_core_sse2( src, i00, b, c );
}
#endif
#define PREDICT_16x16_DC(name)\
-static void predict_16x16_dc_##name( uint8_t *src )\
+static void x264_predict_16x16_dc_##name( uint8_t *src )\
{\
uint32_t dc=16;\
int i;\
@@ -210,14 +210,14 @@
dc += src[-1 + i * FDEC_STRIDE];\
dc += src[-1 + (i+1) * FDEC_STRIDE];\
}\
- predict_16x16_dc_core_##name( src, dc );\
+ x264_predict_16x16_dc_core_##name( src, dc );\
}
PREDICT_16x16_DC( mmxext )
PREDICT_16x16_DC( sse2 )
#define PREDICT_16x16_DC_LEFT(name)\
-static void predict_16x16_dc_left_##name( uint8_t *src )\
+static void x264_predict_16x16_dc_left_##name( uint8_t *src )\
{\
uint32_t dc=8;\
int i;\
@@ -226,13 +226,13 @@
dc += src[-1 + i * FDEC_STRIDE];\
dc += src[-1 + (i+1) * FDEC_STRIDE];\
}\
- predict_16x16_dc_left_core_##name( src, dc>>4 );\
+ x264_predict_16x16_dc_left_core_##name( src, dc>>4 );\
}
PREDICT_16x16_DC_LEFT( mmxext )
PREDICT_16x16_DC_LEFT( sse2 )
-static void predict_8x8c_dc_mmxext( uint8_t *src )
+static void x264_predict_8x8c_dc_mmxext( uint8_t *src )
{
int s2 = 4
+ src[-1 + 0*FDEC_STRIDE]
@@ -246,11 +246,11 @@
+ src[-1 + 6*FDEC_STRIDE]
+ src[-1 + 7*FDEC_STRIDE];
- predict_8x8c_dc_core_mmxext( src, s2, s3 );
+ x264_predict_8x8c_dc_core_mmxext( src, s2, s3 );
}
#ifdef ARCH_X86_64
-static void predict_8x8c_dc_left( uint8_t *src )
+static void x264_predict_8x8c_dc_left( uint8_t *src )
{
int y;
uint32_t s0 = 0, s1 = 0;
@@ -304,9 +304,9 @@
#define SRC(x,y) src[(x)+(y)*FDEC_STRIDE]
#ifndef ARCH_X86_64
-static void predict_8x8_vr_mmxext( uint8_t *src, uint8_t edge[33] )
+static void x264_predict_8x8_vr_mmxext( uint8_t *src, uint8_t edge[33] )
{
- predict_8x8_vr_core_mmxext( src, edge );
+ x264_predict_8x8_vr_core_mmxext( src, edge );
{
PREDICT_8x8_LOAD_TOPLEFT
PREDICT_8x8_LOAD_LEFT
@@ -326,7 +326,7 @@
t=e; e+=f; f-=t;\
t=g; g+=h; h-=t;
-#define INTRA_SA8D_X3(cpu) \
+#define INTRA_SA8D_X3(cpu)\
void x264_intra_sa8d_x3_8x8_##cpu( uint8_t *fenc, uint8_t edge[33], int res[3] )\
{\
PREDICT_8x8_LOAD_TOP\
@@ -372,30 +372,30 @@
{
if( !(cpu&X264_CPU_MMX) )
return;
- pf[I_PRED_16x16_V] = predict_16x16_v_mmx;
+ pf[I_PRED_16x16_V] = x264_predict_16x16_v_mmx;
if( !(cpu&X264_CPU_MMXEXT) )
return;
- pf[I_PRED_16x16_DC] = predict_16x16_dc_mmxext;
- pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_mmxext;
- pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_mmxext;
+ pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_mmxext;
+ pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_mmxext;
+ pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_mmxext;
#ifndef ARCH_X86_64
- pf[I_PRED_16x16_P] = predict_16x16_p_mmxext;
+ pf[I_PRED_16x16_P] = x264_predict_16x16_p_mmxext;
#endif
- pf[I_PRED_16x16_H] = predict_16x16_h_mmxext;
+ pf[I_PRED_16x16_H] = x264_predict_16x16_h_mmxext;
if( !(cpu&X264_CPU_SSE2) )
return;
- pf[I_PRED_16x16_DC] = predict_16x16_dc_sse2;
- pf[I_PRED_16x16_V] = predict_16x16_v_sse2;
+ pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_sse2;
+ pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse2;
if( cpu&X264_CPU_SSE2_IS_SLOW )
return;
- pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_sse2;
- pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_sse2;
- pf[I_PRED_16x16_P] = predict_16x16_p_sse2;
+ pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_sse2;
+ pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_sse2;
+ pf[I_PRED_16x16_P] = x264_predict_16x16_p_sse2;
if( !(cpu&X264_CPU_SSSE3) )
return;
- pf[I_PRED_16x16_H] = predict_16x16_h_ssse3;
+ pf[I_PRED_16x16_H] = x264_predict_16x16_h_ssse3;
#ifdef __GNUC__
- pf[I_PRED_16x16_P] = predict_16x16_p_ssse3;
+ pf[I_PRED_16x16_P] = x264_predict_16x16_p_ssse3;
#endif
}
@@ -404,25 +404,25 @@
if( !(cpu&X264_CPU_MMX) )
return;
#ifdef ARCH_X86_64
- pf[I_PRED_CHROMA_DC_LEFT] = predict_8x8c_dc_left;
+ pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left;
#endif
- pf[I_PRED_CHROMA_V] = predict_8x8c_v_mmx;
+ pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_mmx;
if( !(cpu&X264_CPU_MMXEXT) )
return;
- pf[I_PRED_CHROMA_DC_TOP] = predict_8x8c_dc_top_mmxext;
- pf[I_PRED_CHROMA_H] = predict_8x8c_h_mmxext;
+ pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_mmxext;
+ pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_mmxext;
#ifndef ARCH_X86_64
- pf[I_PRED_CHROMA_P] = predict_8x8c_p_mmxext;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_mmxext;
#endif
- pf[I_PRED_CHROMA_DC] = predict_8x8c_dc_mmxext;
+ pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_mmxext;
if( !(cpu&X264_CPU_SSE2) )
return;
- pf[I_PRED_CHROMA_P] = predict_8x8c_p_sse2;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_sse2;
if( !(cpu&X264_CPU_SSSE3) )
return;
- pf[I_PRED_CHROMA_H] = predict_8x8c_h_ssse3;
+ pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_ssse3;
#ifdef __GNUC__
- pf[I_PRED_CHROMA_P] = predict_8x8c_p_ssse3;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_ssse3;
#endif
}
@@ -430,48 +430,48 @@
{
if( !(cpu&X264_CPU_MMXEXT) )
return;
- pf[I_PRED_8x8_V] = predict_8x8_v_mmxext;
- pf[I_PRED_8x8_H] = predict_8x8_h_mmxext;
- pf[I_PRED_8x8_DC] = predict_8x8_dc_mmxext;
- pf[I_PRED_8x8_DC_TOP] = predict_8x8_dc_top_mmxext;
- pf[I_PRED_8x8_DC_LEFT]= predict_8x8_dc_left_mmxext;
- pf[I_PRED_8x8_HD] = predict_8x8_hd_mmxext;
- *predict_8x8_filter = predict_8x8_filter_mmxext;
+ pf[I_PRED_8x8_V] = x264_predict_8x8_v_mmxext;
+ pf[I_PRED_8x8_H] = x264_predict_8x8_h_mmxext;
+ pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_mmxext;
+ pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_mmxext;
+ pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_mmxext;
+ pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_mmxext;
+ *predict_8x8_filter = x264_predict_8x8_filter_mmxext;
#ifdef ARCH_X86
- pf[I_PRED_8x8_DDL] = predict_8x8_ddl_mmxext;
- pf[I_PRED_8x8_DDR] = predict_8x8_ddr_mmxext;
- pf[I_PRED_8x8_VR] = predict_8x8_vr_mmxext;
- pf[I_PRED_8x8_HU] = predict_8x8_hu_mmxext;
+ pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_mmxext;
+ pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_mmxext;
+ pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_mmxext;
+ pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_mmxext;
#endif
if( !(cpu&X264_CPU_SSE2) )
return;
- pf[I_PRED_8x8_DDL] = predict_8x8_ddl_sse2;
- pf[I_PRED_8x8_VL] = predict_8x8_vl_sse2;
- pf[I_PRED_8x8_VR] = predict_8x8_vr_sse2;
- pf[I_PRED_8x8_DDR] = predict_8x8_ddr_sse2;
- pf[I_PRED_8x8_HD] = predict_8x8_hd_sse2;
- pf[I_PRED_8x8_HU] = predict_8x8_hu_sse2;
+ pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_sse2;
+ pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_sse2;
+ pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_sse2;
+ pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_sse2;
+ pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_sse2;
+ pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_sse2;
if( !(cpu&X264_CPU_SSSE3) )
return;
- pf[I_PRED_8x8_HD] = predict_8x8_hd_ssse3;
- pf[I_PRED_8x8_HU] = predict_8x8_hu_ssse3;
- *predict_8x8_filter = predict_8x8_filter_ssse3;
+ pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_ssse3;
+ pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3;
+ *predict_8x8_filter = x264_predict_8x8_filter_ssse3;
}
void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
{
if( !(cpu&X264_CPU_MMXEXT) )
return;
- pf[I_PRED_4x4_VR] = predict_4x4_vr_mmxext;
- pf[I_PRED_4x4_DDL] = predict_4x4_ddl_mmxext;
- pf[I_PRED_4x4_VL] = predict_4x4_vl_mmxext;
- pf[I_PRED_4x4_DC] = predict_4x4_dc_mmxext;
- pf[I_PRED_4x4_DDR] = predict_4x4_ddr_mmxext;
- pf[I_PRED_4x4_HD] = predict_4x4_hd_mmxext;
- pf[I_PRED_4x4_HU] = predict_4x4_hu_mmxext;
+ pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_mmxext;
+ pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_mmxext;
+ pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_mmxext;
+ pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_mmxext;
+ pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_mmxext;
+ pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_mmxext;
+ pf[I_PRED_4x4_HU] = x264_predict_4x4_hu_mmxext;
if( !(cpu&X264_CPU_SSSE3) )
return;
- pf[I_PRED_4x4_DDR] = predict_4x4_ddr_ssse3;
- pf[I_PRED_4x4_VR] = predict_4x4_vr_ssse3;
- pf[I_PRED_4x4_HD] = predict_4x4_hd_ssse3;
+ pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3;
+ pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3;
+ pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_ssse3;
}
Changed | x264-snapshot-20100517-2245.tar.bz2/common/x86/quant-a.asm
@@ -26,10 +26,6 @@
%include "x86util.asm"
SECTION_RODATA
-pb_1: times 16 db 1
-pw_1: times 8 dw 1
-pd_1: times 4 dd 1
-pb_01: times 8 db 0, 1
%macro DQM4 3
dw %1, %2, %1, %2, %2, %3, %2, %3
@@ -71,6 +67,11 @@
SECTION .text
+cextern pb_1
+cextern pw_1
+cextern pd_1
+cextern pb_01
+
%macro QUANT_DC_START_MMX 0
movd m6, r1m ; mf
movd m7, r2m ; bias
@@ -183,7 +184,7 @@
%endmacro
;-----------------------------------------------------------------------------
-; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias )
+; void quant_4x4_dc( int16_t dct[16], int mf, int bias )
;-----------------------------------------------------------------------------
%macro QUANT_DC 2-3 0
cglobal %1, 1,1,%3
@@ -202,7 +203,7 @@
%endmacro
;-----------------------------------------------------------------------------
-; int x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
+; int quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
;-----------------------------------------------------------------------------
%macro QUANT_AC 2
cglobal %1, 3,3
@@ -220,33 +221,33 @@
%define PABSW PABSW_MMX
%define PSIGNW PSIGNW_MMX
%define QUANT_DC_START QUANT_DC_START_MMX
-QUANT_DC x264_quant_2x2_dc_mmxext, 1
+QUANT_DC quant_2x2_dc_mmxext, 1
%ifndef ARCH_X86_64 ; not needed because sse2 is faster
-QUANT_DC x264_quant_4x4_dc_mmxext, 4
-QUANT_AC x264_quant_4x4_mmx, 4
-QUANT_AC x264_quant_8x8_mmx, 16
+QUANT_DC quant_4x4_dc_mmxext, 4
+QUANT_AC quant_4x4_mmx, 4
+QUANT_AC quant_8x8_mmx, 16
%endif
INIT_XMM
-QUANT_DC x264_quant_4x4_dc_sse2, 2, 8
-QUANT_AC x264_quant_4x4_sse2, 2
-QUANT_AC x264_quant_8x8_sse2, 8
+QUANT_DC quant_4x4_dc_sse2, 2, 8
+QUANT_AC quant_4x4_sse2, 2
+QUANT_AC quant_8x8_sse2, 8
%define PABSW PABSW_SSSE3
%define PSIGNW PSIGNW_SSSE3
-QUANT_DC x264_quant_4x4_dc_ssse3, 2, 8
-QUANT_AC x264_quant_4x4_ssse3, 2
-QUANT_AC x264_quant_8x8_ssse3, 8
+QUANT_DC quant_4x4_dc_ssse3, 2, 8
+QUANT_AC quant_4x4_ssse3, 2
+QUANT_AC quant_8x8_ssse3, 8
INIT_MMX
-QUANT_DC x264_quant_2x2_dc_ssse3, 1
+QUANT_DC quant_2x2_dc_ssse3, 1
%define QUANT_END QUANT_END_SSE4
;Not faster on Conroe, so only used in SSE4 versions
%define QUANT_DC_START QUANT_DC_START_SSSE3
INIT_XMM
-QUANT_DC x264_quant_4x4_dc_sse4, 2, 8
-QUANT_AC x264_quant_4x4_sse4, 2
-QUANT_AC x264_quant_8x8_sse4, 8
+QUANT_DC quant_4x4_dc_sse4, 2, 8
+QUANT_AC quant_4x4_sse4, 2
+QUANT_AC quant_8x8_sse4, 8
@@ -347,10 +348,10 @@
%endmacro
;-----------------------------------------------------------------------------
-; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+; void dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
%macro DEQUANT 4
-cglobal x264_dequant_%2x%2_%1, 0,3
+cglobal dequant_%2x%2_%1, 0,3
.skip_prologue:
DEQUANT_START %3+2, %3
@@ -367,11 +368,11 @@
psrld m3, 1
DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
-cglobal x264_dequant_%2x%2_flat16_%1, 0,3
+cglobal dequant_%2x%2_flat16_%1, 0,3
movifnidn t2d, r2m
%if %2 == 8
cmp t2d, 12
- jl x264_dequant_%2x%2_%1.skip_prologue
+ jl dequant_%2x%2_%1.skip_prologue
sub t2d, 12
%endif
imul t0d, t2d, 0x2b
@@ -418,7 +419,7 @@
DEQUANT sse2, 8, 6, 2
%macro DEQUANT_DC 1
-cglobal x264_dequant_4x4dc_%1, 0,3
+cglobal dequant_4x4dc_%1, 0,3
DEQUANT_START 6, 6
.lshift:
@@ -480,10 +481,10 @@
DEQUANT_DC sse2
;-----------------------------------------------------------------------------
-; void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
+; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
;-----------------------------------------------------------------------------
%macro DENOISE_DCT 1-2 0
-cglobal x264_denoise_dct_%1, 4,5,%2
+cglobal denoise_dct_%1, 4,5,%2
movzx r4d, word [r0] ; backup DC coefficient
pxor m6, m6
.loop:
@@ -534,7 +535,7 @@
;-----------------------------------------------------------------------------
-; int x264_decimate_score( int16_t *dct )
+; int decimate_score( int16_t *dct )
;-----------------------------------------------------------------------------
%macro DECIMATE_MASK_SSE2 6
@@ -579,21 +580,21 @@
or %2, %6
%endmacro
-cextern x264_decimate_table4
-cextern x264_decimate_table8
+cextern decimate_table4
+cextern decimate_table8
%macro DECIMATE4x4 2
;A LUT is faster than bsf on AMD processors, and no slower on Intel
;This is not true for score64.
-cglobal x264_decimate_score%1_%2, 1,3
+cglobal decimate_score%1_%2, 1,3
%ifdef PIC
- lea r10, [x264_decimate_table4]
+ lea r10, [decimate_table4]
lea r11, [decimate_mask_table4]
%define table r10
%define mask_table r11
%else
- %define table x264_decimate_table4
+ %define table decimate_table4
%define mask_table decimate_mask_table4
%endif
DECIMATE_MASK edx, eax, r0, [pb_1], %2, ecx
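The LUT note above refers to how the decimate score is accumulated: each trailing +/-1 coefficient adds a table cost indexed by the zero run in front of it, and any larger level aborts with a high score. A rough C sketch of that scoring loop (modelled on the plain-C path; names and details here are illustrative, not part of this patch):

    static int decimate_score_sketch( int16_t *dct, int i_max, const uint8_t *ds_table )
    {
        int score = 0;
        int idx = i_max - 1;
        while( idx >= 0 && dct[idx] == 0 )        /* skip trailing zeros */
            idx--;
        while( idx >= 0 )
        {
            if( (unsigned)(dct[idx--] + 1) > 2 )  /* any |level| > 1: don't decimate */
                return 9;
            int run = 0;
            while( idx >= 0 && dct[idx] == 0 )    /* count zeros preceding this +/-1 */
            {
                idx--;
                run++;
            }
            score += ds_table[run];               /* LUT lookup replaces per-run math */
        }
        return score;
    }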
@@ -638,12 +639,12 @@
%macro DECIMATE8x8 1
%ifdef ARCH_X86_64
-cglobal x264_decimate_score64_%1, 1,4
+cglobal decimate_score64_%1, 1,4
%ifdef PIC
- lea r10, [x264_decimate_table8]
+ lea r10, [decimate_table8]
%define table r10
%else
- %define table x264_decimate_table8
+ %define table decimate_table8
%endif
mova m5, [pb_1]
DECIMATE_MASK r1d, eax, r0, m5, %1, null
@@ -677,9 +678,9 @@
%else ; ARCH
%ifidn %1, mmxext
-cglobal x264_decimate_score64_%1, 1,6
+cglobal decimate_score64_%1, 1,6
%else
-cglobal x264_decimate_score64_%1, 1,5
+cglobal decimate_score64_%1, 1,5
%endif
mova m7, [pb_1]
DECIMATE_MASK r3, r2, r0, m7, %1, r5
@@ -705,7 +706,7 @@
je .largerun
shrd r3, r4, cl
shr r4, cl
- add r0b, byte [x264_decimate_table8 + ecx]
+ add r0b, byte [decimate_table8 + ecx]
shrd r3, r4, 1
shr r4, 1
cmp r0, 6 ;score64's threshold is never higher than 6
@@ -746,7 +747,7 @@
DECIMATE8x8 ssse3
;-----------------------------------------------------------------------------
-; int x264_coeff_last( int16_t *dct )
+; int coeff_last( int16_t *dct )
;-----------------------------------------------------------------------------
%macro LAST_MASK_SSE2 2-3
@@ -780,12 +781,12 @@
%macro COEFF_LAST4 1
%ifdef ARCH_X86_64
-cglobal x264_coeff_last4_%1, 1,1
+cglobal coeff_last4_%1, 1,1
LAST rax, [r0], 0x3f
shr eax, 4
RET
%else
-cglobal x264_coeff_last4_%1, 0,3
+cglobal coeff_last4_%1, 0,3
mov edx, r0mp
mov eax, [edx+4]
xor ecx, ecx
@@ -805,7 +806,7 @@
COEFF_LAST4 mmxext_lzcnt
%macro COEFF_LAST 1
-cglobal x264_coeff_last15_%1, 1,3
+cglobal coeff_last15_%1, 1,3
pxor m2, m2
LAST_MASK r1d, r0-2, r2d
xor r1d, 0xffff
@@ -813,7 +814,7 @@
dec eax
RET
-cglobal x264_coeff_last16_%1, 1,3
+cglobal coeff_last16_%1, 1,3
pxor m2, m2
LAST_MASK r1d, r0, r2d
xor r1d, 0xffff
@@ -821,7 +822,7 @@
RET
%ifndef ARCH_X86_64
-cglobal x264_coeff_last64_%1, 1, 5-mmsize/16
+cglobal coeff_last64_%1, 1, 5-mmsize/16
pxor m2, m2
LAST_MASK r2d, r0+64, r4d
LAST_MASK r3d, r0+96, r4d
@@ -841,7 +842,7 @@
add eax, 32
RET
%else
-cglobal x264_coeff_last64_%1, 1,4
+cglobal coeff_last64_%1, 1,4
pxor m2, m2
LAST_MASK_SSE2 r1d, r0
LAST_MASK_SSE2 r2d, r0+32
@@ -872,7 +873,7 @@
COEFF_LAST sse2_lzcnt
;-----------------------------------------------------------------------------
-; int x264_coeff_level_run( int16_t *dct, x264_run_level_t *runlevel )
+; int coeff_level_run( int16_t *dct, run_level_t *runlevel )
;-----------------------------------------------------------------------------
%macro LAST_MASK4_MMX 2-3
@@ -901,7 +902,7 @@
%endif
%macro COEFF_LEVELRUN 2
-cglobal x264_coeff_level_run%2_%1,0,7
+cglobal coeff_level_run%2_%1,0,7
movifnidn t0, r0mp
movifnidn t1, r1mp
pxor m2, m2
Changed | x264-snapshot-20100517-2245.tar.bz2/common/x86/sad-a.asm
@@ -26,14 +26,13 @@
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA
-pb_3: times 16 db 3
-pb_shuf8x8c: db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
-pw_8: times 4 dw 8
-sw_64: dd 64
-
SECTION .text
+cextern pb_3
+cextern pb_shuf8x8c
+cextern pw_8
+cextern sw_64
+
;=============================================================================
; SAD MMX
;=============================================================================
@@ -78,10 +77,10 @@
%endmacro
;-----------------------------------------------------------------------------
-; int x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
+; int pixel_sad_16x16( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SAD 2
-cglobal x264_pixel_sad_%1x%2_mmxext, 4,4
+cglobal pixel_sad_%1x%2_mmxext, 4,4
pxor mm0, mm0
%rep %2/2
SAD_INC_2x%1P
@@ -113,9 +112,9 @@
%macro SAD_W16 1
;-----------------------------------------------------------------------------
-; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
+; int pixel_sad_16x16( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_sad_16x16_%1, 4,4,8
+cglobal pixel_sad_16x16_%1, 4,4,8
movdqu m0, [r2]
movdqu m1, [r2+r3]
lea r2, [r2+2*r3]
@@ -180,9 +179,9 @@
SAD_END_SSE2
;-----------------------------------------------------------------------------
-; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
+; int pixel_sad_16x8( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_sad_16x8_%1, 4,4
+cglobal pixel_sad_16x8_%1, 4,4
movdqu m0, [r2]
movdqu m2, [r2+r3]
lea r2, [r2+2*r3]
@@ -249,7 +248,7 @@
%endmacro
;Even on Nehalem, no sizes other than 8x16 benefit from this method.
-cglobal x264_pixel_sad_8x16_sse2, 4,4
+cglobal pixel_sad_8x16_sse2, 4,4
SAD_INC_4x8P_SSE 0
SAD_INC_4x8P_SSE 1
SAD_INC_4x8P_SSE 1
@@ -258,10 +257,10 @@
RET
;-----------------------------------------------------------------------------
-; void intra_sad_x3_4x4 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
+; void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
-cglobal x264_intra_sad_x3_4x4_mmxext, 3,3
+cglobal intra_sad_x3_4x4_mmxext, 3,3
pxor mm7, mm7
movd mm0, [r1-FDEC_STRIDE]
movd mm1, [r0+FENC_STRIDE*0]
@@ -305,7 +304,7 @@
RET
;-----------------------------------------------------------------------------
-; void intra_sad_x3_8x8 ( uint8_t *fenc, uint8_t edge[33], int res[3]);
+; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[33], int res[3]);
;-----------------------------------------------------------------------------
;m0 = DC
@@ -343,7 +342,7 @@
%endmacro
INIT_MMX
-cglobal x264_intra_sad_x3_8x8_mmxext, 3,3
+cglobal intra_sad_x3_8x8_mmxext, 3,3
movq m7, [r1+7]
pxor m0, m0
movq m6, [r1+16] ;V prediction
@@ -372,7 +371,7 @@
RET
;-----------------------------------------------------------------------------
-; void intra_sad_x3_8x8c ( uint8_t *fenc, uint8_t *fdec, int res[3] );
+; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
%macro INTRA_SAD_HV_ITER 2
@@ -407,7 +406,7 @@
%endmacro
%macro INTRA_SAD_8x8C 1
-cglobal x264_intra_sad_x3_8x8c_%1, 3,3
+cglobal intra_sad_x3_8x8c_%1, 3,3
movq m6, [r1 - FDEC_STRIDE]
add r1, FDEC_STRIDE*4
%ifidn %1,ssse3
@@ -508,13 +507,13 @@
;-----------------------------------------------------------------------------
-; void intra_sad_x3_16x16 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
+; void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
;xmm7: DC prediction xmm6: H prediction xmm5: V prediction
;xmm4: DC pred score xmm3: H pred score xmm2: V pred score
%macro INTRA_SAD16 1-2 0
-cglobal x264_intra_sad_x3_16x16_%1,3,5,%2
+cglobal intra_sad_x3_16x16_%1,3,5,%2
pxor mm0, mm0
pxor mm1, mm1
psadbw mm0, [r1-FDEC_STRIDE+0]
@@ -526,10 +525,14 @@
%endif
%assign x 0
%rep 16
- movzx r4d, byte [r1-1+FDEC_STRIDE*x]
+ movzx r4d, byte [r1-1+FDEC_STRIDE*(x&3)]
+%if (x&3)==3 && x!=15
+ add r1, FDEC_STRIDE*4
+%endif
add r3d, r4d
%assign x x+1
%endrep
+ sub r1, FDEC_STRIDE*12
add r3d, 16
shr r3d, 5
imul r3d, 0x01010101
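The loop fix above only changes how the left-neighbour address is formed: stepping r1 by FDEC_STRIDE*4 every four rows keeps the effective displacement small. The value being built is still the plain 16x16 DC predictor; as a scalar sketch (assuming the usual FDEC_STRIDE reconstruction layout):

    static uint32_t intra16_dc_splat_sketch( uint8_t *fdec )
    {
        int dc = 16;                           /* rounding term */
        for( int i = 0; i < 16; i++ )
            dc += fdec[i - FDEC_STRIDE]        /* top neighbours (the psadbw above) */
                + fdec[-1 + i*FDEC_STRIDE];    /* left neighbours (the movzx loop) */
        dc >>= 5;
        return dc * 0x01010101U;               /* splat the byte across a 32-bit word */
    }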
@@ -813,11 +816,11 @@
%endmacro
;-----------------------------------------------------------------------------
-; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
-; uint8_t *pix2, int i_stride, int scores[3] )
+; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
+; uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X 3
-cglobal x264_pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2
+cglobal pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2
%ifdef WIN64
%assign i %1+1
movsxd r %+ i, r %+ i %+ d
@@ -1162,11 +1165,11 @@
%endmacro
;-----------------------------------------------------------------------------
-; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
-; uint8_t *pix2, int i_stride, int scores[3] )
+; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
+; uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X_SSE2 4
-cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1,9
+cglobal pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1,9
%ifdef WIN64
%assign i %1+1
movsxd r %+ i, r %+ i %+ d
@@ -1179,7 +1182,7 @@
%endmacro
%macro SAD_X_SSE2_MISALIGN 4
-cglobal x264_pixel_sad_x%1_%2x%3_%4_misalign, 2+%1,2+%1,9
+cglobal pixel_sad_x%1_%2x%3_%4_misalign, 2+%1,2+%1,9
%ifdef WIN64
%assign i %1+1
movsxd r %+ i, r %+ i %+ d
@@ -1285,11 +1288,11 @@
%endmacro
%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
-cglobal x264_pixel_sad_16x%2_cache64_%1
+cglobal pixel_sad_16x%2_cache64_%1
mov eax, r2m
and eax, 0x37
cmp eax, 0x30
- jle x264_pixel_sad_16x%2_sse2
+ jle pixel_sad_16x%2_sse2
PROLOGUE 4,6
mov r4d, r2d
and r4d, 15
@@ -1320,7 +1323,7 @@
mov eax, r2m
and eax, 0x17|%1|(%4>>1)
cmp eax, 0x10|%1|(%4>>1)
- jle x264_pixel_sad_%1x%2_mmxext
+ jle pixel_sad_%1x%2_mmxext
and eax, 7
shl eax, 3
movd mm6, [sw_64]
@@ -1333,7 +1336,7 @@
%endmacro
%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
-cglobal x264_pixel_sad_16x%1_cache%2_mmxext
+cglobal pixel_sad_16x%1_cache%2_mmxext
SAD_CACHELINE_START_MMX2 16, %1, %1, %2
.loop:
movq mm1, [r2]
@@ -1359,7 +1362,7 @@
%endmacro
%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
-cglobal x264_pixel_sad_8x%1_cache%2_mmxext
+cglobal pixel_sad_8x%1_cache%2_mmxext
SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
.loop:
movq mm1, [r2+8]
@@ -1395,11 +1398,11 @@
%endmacro
%macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
-cglobal x264_pixel_sad_x3_%1x%2_cache%3_%6
+cglobal pixel_sad_x3_%1x%2_cache%3_%6
CHECK_SPLIT r1m, %1, %3
CHECK_SPLIT r2m, %1, %3
CHECK_SPLIT r3m, %1, %3
- jmp x264_pixel_sad_x3_%1x%2_%4
+ jmp pixel_sad_x3_%1x%2_%4
.split:
%ifdef ARCH_X86_64
PROLOGUE 6,7
@@ -1414,7 +1417,7 @@
mov r3, r4
mov r10, r0
mov r11, r5
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov [r11], eax
%ifdef WIN64
mov r2, [rsp]
@@ -1422,7 +1425,7 @@
pop r2
%endif
mov r0, r10
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov [r11+4], eax
%ifdef WIN64
mov r2, [rsp+8]
@@ -1430,7 +1433,7 @@
pop r2
%endif
mov r0, r10
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov [r11+8], eax
%ifdef WIN64
add rsp, 24
@@ -1443,15 +1446,15 @@
push dword [esp+16]
push dword 16
push dword [esp+20]
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov ecx, [esp+32]
mov [edi], eax
mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov ecx, [esp+36]
mov [edi+4], eax
mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov [edi+8], eax
add esp, 16
pop edi
@@ -1460,12 +1463,12 @@
%endmacro
%macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
-cglobal x264_pixel_sad_x4_%1x%2_cache%3_%6
+cglobal pixel_sad_x4_%1x%2_cache%3_%6
CHECK_SPLIT r1m, %1, %3
CHECK_SPLIT r2m, %1, %3
CHECK_SPLIT r3m, %1, %3
CHECK_SPLIT r4m, %1, %3
- jmp x264_pixel_sad_x4_%1x%2_%4
+ jmp pixel_sad_x4_%1x%2_%4
.split:
%ifdef ARCH_X86_64
PROLOGUE 6,7
@@ -1480,7 +1483,7 @@
mov r1, FENC_STRIDE
mov r3, r5
mov r10, r0
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov [r11], eax
%ifdef WIN64
mov r2, [rsp]
@@ -1488,7 +1491,7 @@
pop r2
%endif
mov r0, r10
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov [r11+4], eax
%ifdef WIN64
mov r2, [rsp+8]
@@ -1496,7 +1499,7 @@
pop r2
%endif
mov r0, r10
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov [r11+8], eax
%ifdef WIN64
mov r2, [rsp+16]
@@ -1504,7 +1507,7 @@
pop r2
%endif
mov r0, r10
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov [r11+12], eax
%ifdef WIN64
add rsp, 24
@@ -1517,19 +1520,19 @@
push dword [esp+16]
push dword 16
push dword [esp+20]
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov ecx, [esp+32]
mov [edi], eax
mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov ecx, [esp+36]
mov [edi+4], eax
mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov ecx, [esp+40]
mov [edi+8], eax
mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov [edi+12], eax
add esp, 16
pop edi
Changed | x264-snapshot-20100517-2245.tar.bz2/common/x86/util.h
@@ -26,7 +26,9 @@
#ifdef __GNUC__
+#ifdef __SSE__
#include <xmmintrin.h>
+#endif
#define x264_median_mv x264_median_mv_mmxext
static ALWAYS_INLINE void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
@@ -107,7 +109,7 @@
}
#define x264_predictor_roundclip x264_predictor_roundclip_mmxext
-static void ALWAYS_INLINE x264_predictor_roundclip_mmxext( int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
+static void ALWAYS_INLINE x264_predictor_roundclip_mmxext( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
{
uint32_t mv_min = pack16to32_mask( mv_x_min, mv_y_min );
uint32_t mv_max = pack16to32_mask( mv_x_max, mv_y_max );
@@ -121,7 +123,7 @@
"punpckldq %%mm6, %%mm6 \n"
"test $1, %0 \n"
"jz 1f \n"
- "movd -4(%5,%0,4), %%mm0 \n"
+ "movd -4(%6,%0,4), %%mm0 \n"
"paddw %%mm7, %%mm0 \n"
"psraw $2, %%mm0 \n"
"pmaxsw %%mm5, %%mm0 \n"
@@ -130,7 +132,7 @@
"dec %0 \n"
"jz 2f \n"
"1: \n"
- "movq -8(%5,%0,4), %%mm0 \n"
+ "movq -8(%6,%0,4), %%mm0 \n"
"paddw %%mm7, %%mm0 \n"
"psraw $2, %%mm0 \n"
"pmaxsw %%mm5, %%mm0 \n"
@@ -139,15 +141,17 @@
"sub $2, %0 \n"
"jnz 1b \n"
"2: \n"
- :"+r"(i), "+m"(M64( mvc ))
- :"g"(mv_min), "g"(mv_max), "m"(pw_2), "r"(mvc)
+ :"+r"(i), "=m"(M64( dst ))
+ :"g"(mv_min), "g"(mv_max), "m"(pw_2), "r"(dst), "r"(mvc), "m"(M64( mvc ))
);
}
+#ifdef __SSE__
#undef M128_ZERO
#define M128_ZERO ((__m128){0,0,0,0})
#define x264_union128_t x264_union128_sse_t
typedef union { __m128 i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_sse_t;
+#endif
#endif
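The prototype change above makes the MMX round-and-clip write into a separate destination instead of overwriting the candidate list in place. Its per-element behaviour, read off the inline asm (add 2, arithmetic shift right by 2, clamp to the search range), is roughly the following sketch (assumed equivalent, not taken from the patch):

    static void predictor_roundclip_sketch( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc,
                                            int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
    {
        for( int i = 0; i < i_mvc; i++ )
        {
            dst[i][0] = x264_clip3( (mvc[i][0] + 2) >> 2, mv_x_min, mv_x_max );  /* qpel -> fpel */
            dst[i][1] = x264_clip3( (mvc[i][1] + 2) >> 2, mv_y_min, mv_y_max );
        }
    }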
Changed | x264-snapshot-20100517-2245.tar.bz2/common/x86/x86inc.asm
@@ -32,6 +32,8 @@
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org .
+%define program_name x264
+
%ifdef ARCH_X86_64
%ifidn __OUTPUT_FORMAT__,win32
%define WIN64
@@ -169,7 +171,7 @@
%endrep
%endmacro
-DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9
%ifdef ARCH_X86_64
%define gprsize 8
@@ -436,7 +438,7 @@
; Symbol prefix for C linkage
%macro cglobal 1-2+
- %xdefine %1 mangle(%1)
+ %xdefine %1 mangle(program_name %+ _ %+ %1)
%xdefine %1.skip_prologue %1 %+ .skip_prologue
%ifidn __OUTPUT_FORMAT__,elf
global %1:function hidden
@@ -453,10 +455,22 @@
%endmacro
%macro cextern 1
+ %xdefine %1 mangle(program_name %+ _ %+ %1)
+ extern %1
+%endmacro
+
+;like cextern, but without the prefix
+%macro cextern_naked 1
%xdefine %1 mangle(%1)
extern %1
%endmacro
+%macro const 2+
+ %xdefine %1 mangle(program_name %+ _ %+ %1)
+ global %1
+ %1: %2
+%endmacro
+
; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
Changed | x264-snapshot-20100517-2245.tar.bz2/configure
@@ -118,7 +118,7 @@
ffms_input="auto"
mp4_output="auto"
pthread="auto"
-asm="yes"
+asm="auto"
debug="no"
gprof="no"
pic="no"
@@ -154,33 +154,18 @@
--includedir=*)
includedir="$optarg"
;;
- --enable-asm)
- asm="yes"
- ;;
--disable-asm)
asm="no"
;;
- --enable-avs-input)
- avs_input="auto"
- ;;
--disable-avs-input)
avs_input="no"
;;
- --enable-lavf-input)
- lavf_input="auto"
- ;;
--disable-lavf-input)
lavf_input="no"
;;
- --enable-ffms-input)
- ffms_input="auto"
- ;;
--disable-ffms-input)
ffms_input="no"
;;
- --enable-mp4-output)
- mp4_output="yes"
- ;;
--disable-mp4-output)
mp4_output="no"
;;
@@ -193,9 +178,6 @@
--extra-ldflags=*)
LDFLAGS="$LDFLAGS ${opt#--extra-ldflags=}"
;;
- --enable-pthread)
- pthread="auto" # can't skip detection, since it differs by OS
- ;;
--disable-pthread)
pthread="no"
;;
@@ -214,8 +196,6 @@
shared="yes"
;;
--enable-visualize)
- LDFLAGS="$LDFLAGS -L/usr/X11R6/lib -lX11"
- define HAVE_VISUALIZE
vis="yes"
;;
--host=*)
@@ -425,7 +405,7 @@
pic="yes"
fi
-if [ $asm = yes -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
+if [ $asm = auto -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
if ! as_check "lzcnt eax, eax" ; then
VER=`($AS --version || echo no assembler) 2>$DEVNULL | head -n 1`
echo "Found $VER"
@@ -444,7 +424,7 @@
define HAVE_MMX
fi
-if [ $asm = yes -a $ARCH = ARM ] ; then
+if [ $asm = auto -a $ARCH = ARM ] ; then
# set flags so neon is built by default
echo $CFLAGS | grep -Eq '(-mcpu|-march|-mfpu|-mfloat-abi)' || CFLAGS="$CFLAGS -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp"
@@ -460,7 +440,7 @@
fi
[ $asm = no ] && AS=""
-[ "x$AS" = x ] && asm="no"
+[ "x$AS" = x ] && asm="no" || asm="yes"
define ARCH_$ARCH
define SYS_$SYS
@@ -516,6 +496,13 @@
define HAVE_LOG2F
fi
+if [ "$vis" = "yes" ] && cc_check "X11/Xlib.h" "-L/usr/X11R6/lib -lX11" "XOpenDisplay( 0 );" ; then
+ LDFLAGS="-L/usr/X11R6/lib -lX11 $LDFLAGS"
+ define HAVE_VISUALIZE
+else
+ vis="no"
+fi
+
if [ "$lavf_input" = "auto" ] ; then
lavf_input="no"
if ${cross_prefix}pkg-config --exists libavformat libavcodec libswscale 2>$DEVNULL; then
Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/analyse.c
@@ -27,7 +27,6 @@
#include <unistd.h>
#include "common/common.h"
-#include "common/cpu.h"
#include "macroblock.h"
#include "me.h"
#include "ratecontrol.h"
@@ -2569,15 +2568,11 @@
x264_mb_analysis_t analysis;
int i_cost = COST_MAX;
- h->mb.i_qp = x264_ratecontrol_qp( h );
- if( h->param.rc.i_aq_mode )
- {
- x264_adaptive_quant( h );
- /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
- * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
- if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
- h->mb.i_qp = h->mb.i_last_qp;
- }
+ h->mb.i_qp = x264_ratecontrol_mb_qp( h );
+ /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
+ * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
+ if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
+ h->mb.i_qp = h->mb.i_last_qp;
x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/cabac.c
@@ -539,16 +539,16 @@
// node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
// 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
/* map node ctx => cabac ctx for level=1 */
-static const int coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
+static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
/* map node ctx => cabac ctx for level>1 */
-static const int coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
+static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
static const uint8_t coeff_abs_level_transition[2][8] = {
/* update node ctx after coding a level=1 */
{ 1, 2, 3, 3, 4, 5, 6, 7 },
/* update node ctx after coding a level>1 */
{ 4, 4, 4, 4, 5, 6, 7, 7 }
};
-static const int count_cat_m1[5] = {15, 14, 15, 3, 14};
+static const uint8_t count_cat_m1[5] = {15, 14, 15, 3, 14};
#if !RDO_SKIP_BS
static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int16_t *l )
@@ -736,13 +736,13 @@
}
#endif
-#define block_residual_write_cabac_cbf( h, cb, i_ctxBlockCat, i_idx, l, b_intra ) \
-{ \
- int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx, b_intra ); \
+#define block_residual_write_cabac_cbf( h, cb, i_ctxBlockCat, i_idx, l, b_intra )\
+{\
+ int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx, b_intra );\
if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )\
{\
x264_cabac_encode_decision( cb, ctxidxinc, 1 );\
- block_residual_write_cabac( h, cb, i_ctxBlockCat, l ); \
+ block_residual_write_cabac( h, cb, i_ctxBlockCat, l );\
}\
else\
x264_cabac_encode_decision( cb, ctxidxinc, 0 );\
Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/cavlc.c
@@ -117,7 +117,7 @@
{
bs_t *s = &h->out.bs;
static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0};
- static const int count_cat[5] = {16, 15, 16, 4, 15};
+ static const uint8_t count_cat[5] = {16, 15, 16, 4, 15};
x264_run_level_t runlevel;
int i_trailing, i_total_zero, i_suffix_length;
int i_total = 0;
@@ -172,7 +172,7 @@
}
}
- if( i_total < count_cat[i_ctxBlockCat] )
+ if( (uint8_t)i_total < count_cat[i_ctxBlockCat] )
{
if( i_ctxBlockCat == DCT_CHROMA_DC )
bs_write_vlc( s, x264_total_zeros_dc[i_total-1][i_total_zero] );
Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/encoder.c
@@ -25,7 +25,6 @@
#include <math.h>
#include "common/common.h"
-#include "common/cpu.h"
#include "set.h"
#include "analyse.h"
@@ -356,9 +355,15 @@
static int x264_validate_parameters( x264_t *h )
{
#ifdef HAVE_MMX
+#ifdef __SSE__
if( !(x264_cpu_detect() & X264_CPU_SSE) )
{
x264_log( h, X264_LOG_ERROR, "your cpu does not support SSE1, but x264 was compiled with asm support\n");
+#else
+ if( !(x264_cpu_detect() & X264_CPU_MMXEXT) )
+ {
+ x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm support\n");
+#endif
x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm support (configure --disable-asm)\n");
return -1;
}
@@ -1885,24 +1890,27 @@
x264_macroblock_cache_save( h );
/* accumulate mb stats */
- h->stat.frame.i_mb_count[h->mb.i_type]++;
int b_intra = IS_INTRA( h->mb.i_type );
- if( !b_intra && !IS_SKIP( h->mb.i_type ) && !IS_DIRECT( h->mb.i_type ) )
+ if( h->param.i_log_level >= X264_LOG_INFO || h->param.rc.b_stat_write )
{
- if( h->mb.i_partition != D_8x8 )
- h->stat.frame.i_mb_partition[h->mb.i_partition] += 4;
- else
- for( int i = 0; i < 4; i++ )
- h->stat.frame.i_mb_partition[h->mb.i_sub_partition[i]] ++;
- if( h->param.i_frame_reference > 1 )
- for( int i_list = 0; i_list <= (h->sh.i_type == SLICE_TYPE_B); i_list++ )
- for( int i = 0; i < 4; i++ )
- {
- int i_ref = h->mb.cache.ref[i_list][ x264_scan8[4*i] ];
- if( i_ref >= 0 )
- h->stat.frame.i_mb_count_ref[i_list][i_ref] ++;
- }
+ h->stat.frame.i_mb_count[h->mb.i_type]++;
+ if( !b_intra && !IS_SKIP( h->mb.i_type ) && !IS_DIRECT( h->mb.i_type ) )
+ {
+ if( h->mb.i_partition != D_8x8 )
+ h->stat.frame.i_mb_partition[h->mb.i_partition] += 4;
+ else
+ for( int i = 0; i < 4; i++ )
+ h->stat.frame.i_mb_partition[h->mb.i_sub_partition[i]] ++;
+ if( h->param.i_frame_reference > 1 )
+ for( int i_list = 0; i_list <= (h->sh.i_type == SLICE_TYPE_B); i_list++ )
+ for( int i = 0; i < 4; i++ )
+ {
+ int i_ref = h->mb.cache.ref[i_list][ x264_scan8[4*i] ];
+ if( i_ref >= 0 )
+ h->stat.frame.i_mb_count_ref[i_list][i_ref] ++;
+ }
+ }
}
if( h->param.i_log_level >= X264_LOG_INFO )
@@ -2058,6 +2066,10 @@
static int x264_threaded_slices_write( x264_t *h )
{
void *ret = NULL;
+#ifdef HAVE_MMX
+ if( h->param.cpu&X264_CPU_SSE_MISALIGN )
+ x264_cpu_mask_misalign_sse();
+#endif
/* set first/last mb and sync contexts */
for( int i = 0; i < h->param.i_threads; i++ )
{
@@ -2095,7 +2107,11 @@
/* Go back and fix up the hpel on the borders between slices. */
for( int i = 1; i < h->param.i_threads; i++ )
+ {
x264_fdec_filter_row( h->thread[i], h->thread[i]->i_threadslice_start + 1, 0 );
+ if( h->sh.b_mbaff )
+ x264_fdec_filter_row( h->thread[i], h->thread[i]->i_threadslice_start + 2, 0 );
+ }
x264_threads_merge_ratecontrol( h );
@@ -2119,6 +2135,12 @@
return 0;
}
+void x264_encoder_intra_refresh( x264_t *h )
+{
+ h = h->thread[h->i_thread_phase];
+ h->b_queued_intra_refresh = 1;
+}
+
/****************************************************************************
* x264_encoder_encode:
* XXX: i_poc : is the poc of the current given picture
@@ -2363,25 +2385,34 @@
h->i_nal_type = i_nal_type;
h->i_nal_ref_idc = i_nal_ref_idc;
- if( h->param.b_intra_refresh && h->fenc->i_type == X264_TYPE_P )
+ if( h->param.b_intra_refresh )
{
- int pocdiff = (h->fdec->i_poc - h->fref0[0]->i_poc)/2;
- float increment = X264_MAX( ((float)h->sps->i_mb_width-1) / h->param.i_keyint_max, 1 );
- int max_position = (int)(increment * h->param.i_keyint_max);
- if( IS_X264_TYPE_I( h->fref0[0]->i_type ) )
- h->fdec->f_pir_position = 0;
- else
+ if( IS_X264_TYPE_I( h->fenc->i_type ) )
+ {
+ h->fdec->i_frames_since_pir = 0;
+ h->b_queued_intra_refresh = 0;
+ /* PIR is currently only supported with ref == 1, so any intra frame effectively refreshes
+ * the whole frame and counts as an intra refresh. */
+ h->fdec->f_pir_position = h->sps->i_mb_width;
+ }
+ else if( h->fenc->i_type == X264_TYPE_P )
{
+ int pocdiff = (h->fdec->i_poc - h->fref0[0]->i_poc)/2;
+ float increment = X264_MAX( ((float)h->sps->i_mb_width-1) / h->param.i_keyint_max, 1 );
h->fdec->f_pir_position = h->fref0[0]->f_pir_position;
- if( h->fdec->f_pir_position+0.5 >= max_position )
+ h->fdec->i_frames_since_pir = h->fref0[0]->i_frames_since_pir + pocdiff;
+ if( h->fdec->i_frames_since_pir >= h->param.i_keyint_max ||
+ (h->b_queued_intra_refresh && h->fdec->f_pir_position + 0.5 >= h->sps->i_mb_width) )
{
h->fdec->f_pir_position = 0;
+ h->fdec->i_frames_since_pir = 0;
+ h->b_queued_intra_refresh = 0;
h->fenc->b_keyframe = 1;
}
+ h->fdec->i_pir_start_col = h->fdec->f_pir_position+0.5;
+ h->fdec->f_pir_position += increment * pocdiff;
+ h->fdec->i_pir_end_col = h->fdec->f_pir_position+0.5;
}
- h->fdec->i_pir_start_col = h->fdec->f_pir_position+0.5;
- h->fdec->f_pir_position += increment * pocdiff;
- h->fdec->i_pir_end_col = h->fdec->f_pir_position+0.5;
}
if( h->fenc->b_keyframe )
@@ -2789,8 +2820,8 @@
/* Slices used and PSNR */
for( int i = 0; i < 5; i++ )
{
- static const int slice_order[] = { SLICE_TYPE_I, SLICE_TYPE_SI, SLICE_TYPE_P, SLICE_TYPE_SP, SLICE_TYPE_B };
- static const char *slice_name[] = { "P", "B", "I", "SP", "SI" };
+ static const uint8_t slice_order[] = { SLICE_TYPE_I, SLICE_TYPE_SI, SLICE_TYPE_P, SLICE_TYPE_SP, SLICE_TYPE_B };
+ static const char * const slice_name[] = { "P", "B", "I", "SP", "SI" };
int i_slice = slice_order[i];
if( h->stat.i_frame_count[i_slice] > 0 )
Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/lookahead.c
@@ -35,7 +35,6 @@
* # of bframes + # of threads.
*/
#include "common/common.h"
-#include "common/cpu.h"
#include "analyse.h"
static void x264_lookahead_shift( x264_synch_frame_list_t *dst, x264_synch_frame_list_t *src, int count )
Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/macroblock.c
@@ -458,10 +458,10 @@
static void x264_macroblock_encode_skip( x264_t *h )
{
- h->mb.i_cbp_luma = 0x00;
- h->mb.i_cbp_chroma = 0x00;
- memset( h->mb.cache.non_zero_count, 0, sizeof( h->mb.cache.non_zero_count ) );
- /* store cbp */
+ for( int i = 0; i < sizeof( h->mb.cache.non_zero_count ); i += 16 )
+ M128( &h->mb.cache.non_zero_count[i] ) = M128_ZERO;
+ h->mb.i_cbp_luma = 0;
+ h->mb.i_cbp_chroma = 0;
h->mb.cbp[h->mb.i_mb_xy] = 0;
}
Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/me.c
@@ -48,8 +48,8 @@
/* (x-1)%6 */
static const uint8_t mod6m1[8] = {5,0,1,2,3,4,5,0};
/* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
-static const int hex2[8][2] = {{-1,-2}, {-2,0}, {-1,2}, {1,2}, {2,0}, {1,-2}, {-1,-2}, {-2,0}};
-static const int square1[9][2] = {{0,0}, {0,-1}, {0,1}, {-1,0}, {1,0}, {-1,-1}, {-1,1}, {1,-1}, {1,1}};
+static const int8_t hex2[8][2] = {{-1,-2}, {-2,0}, {-1,2}, {1,2}, {2,0}, {1,-2}, {-1,-2}, {-2,0}};
+static const int8_t square1[9][2] = {{0,0}, {0,-1}, {0,1}, {-1,0}, {1,0}, {-1,-1}, {-1,1}, {1,-1}, {1,1}};
static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel );
@@ -245,14 +245,15 @@
pmv = pack16to32_mask( bmx, bmy );
if( i_mvc > 0 )
{
- x264_predictor_roundclip( mvc, i_mvc, mv_x_min, mv_x_max, mv_y_min, mv_y_max );
+ ALIGNED_ARRAY_8( int16_t, mvc_fpel,[16][2] );
+ x264_predictor_roundclip( mvc_fpel, mvc, i_mvc, mv_x_min, mv_x_max, mv_y_min, mv_y_max );
bcost <<= 4;
for( int i = 1; i <= i_mvc; i++ )
{
- if( M32( mvc[i-1] ) && (pmv != M32( mvc[i-1] )) )
+ if( M32( mvc_fpel[i-1] ) && (pmv != M32( mvc[i-1] )) )
{
- int mx = mvc[i-1][0];
- int my = mvc[i-1][1];
+ int mx = mvc_fpel[i-1][0];
+ int my = mvc_fpel[i-1][1];
int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[my*stride+mx], stride ) + BITS_MVD( mx, my );
cost = (cost << 4) + i;
COPY1_IF_LT( bcost, cost );
@@ -260,8 +261,8 @@
}
if( bcost&15 )
{
- bmx = mvc[(bcost&15)-1][0];
- bmy = mvc[(bcost&15)-1][1];
+ bmx = mvc_fpel[(bcost&15)-1][0];
+ bmy = mvc_fpel[(bcost&15)-1][1];
}
bcost >>= 4;
}
@@ -376,7 +377,7 @@
/* Uneven-cross Multi-Hexagon-grid Search
* as in JM, except with different early termination */
- static const int x264_pixel_size_shift[7] = { 0, 1, 1, 2, 3, 3, 4 };
+ static const uint8_t x264_pixel_size_shift[7] = { 0, 1, 1, 2, 3, 3, 4 };
int ucost1, ucost2;
int cross_start = 1;
@@ -423,7 +424,7 @@
/* range multipliers based on casual inspection of some statistics of
* average distance between current predictor and final mv found by ESA.
* these have not been tuned much by actual encoding. */
- static const int range_mul[4][4] =
+ static const uint8_t range_mul[4][4] =
{
{ 3, 3, 4, 4 },
{ 3, 4, 4, 4 },
@@ -467,7 +468,7 @@
: mvd < 20*denom ? 1
: mvd < 40*denom ? 2 : 3;
- i_me_range = i_me_range * range_mul[mvd_ctx][sad_ctx] / 4;
+ i_me_range = i_me_range * range_mul[mvd_ctx][sad_ctx] >> 2;
}
/* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy.
@@ -483,7 +484,7 @@
int i = 1;
do
{
- static const int hex4[16][2] = {
+ static const int8_t hex4[16][2] = {
{ 0,-4}, { 0, 4}, {-2,-3}, { 2,-3},
{-4,-2}, { 4,-2}, {-4,-1}, { 4,-1},
{-4, 0}, { 4, 0}, {-4, 1}, { 4, 1},
@@ -657,7 +658,7 @@
bsad += ycost;
}
- limit = i_me_range / 2;
+ limit = i_me_range >> 1;
sad_thresh = bsad*sad_thresh>>3;
while( nmvsad > limit*2 && sad_thresh > bsad )
{
@@ -913,14 +914,14 @@
m->cost_mv = p_cost_mvx[bmx] + p_cost_mvy[bmy];
}
-#define BIME_CACHE( dx, dy, list ) \
-{ \
+#define BIME_CACHE( dx, dy, list )\
+{\
x264_me_t *m = m##list;\
- int i = 4 + 3*dx + dy; \
+ int i = 4 + 3*dx + dy;\
int mvx = bm##list##x+dx;\
int mvy = bm##list##y+dy;\
stride[list][i] = bw;\
- src[list][i] = h->mc.get_ref( pixy_buf[list][i], &stride[list][i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh, weight_none ); \
+ src[list][i] = h->mc.get_ref( pixy_buf[list][i], &stride[list][i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh, weight_none );\
if( rd )\
{\
h->mc.mc_chroma( pixu_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\
@@ -1106,11 +1107,11 @@
{ \
uint64_t cost; \
M32( cache_mv ) = pack16to32_mask(mx,my); \
- if( m->i_pixel <= PIXEL_8x8 )\
- {\
- h->mc.mc_chroma( pixu, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\
- h->mc.mc_chroma( pixv, FDEC_STRIDE, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\
- }\
+ if( m->i_pixel <= PIXEL_8x8 ) \
+ { \
+ h->mc.mc_chroma( pixu, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
+ h->mc.mc_chroma( pixv, FDEC_STRIDE, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
+ } \
cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \
COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
} \
Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/ratecontrol.c
@@ -29,7 +29,6 @@
#include <math.h>
#include "common/common.h"
-#include "common/cpu.h"
#include "ratecontrol.h"
#include "me.h"
@@ -84,8 +83,7 @@
/* current frame */
ratecontrol_entry_t *rce;
int qp; /* qp for current frame */
- int qpm; /* qp for current macroblock */
- float f_qpm; /* qp for current macroblock: precise float for AQ */
+ float qpm; /* qp for current macroblock: precise float for AQ */
float qpa_rc; /* average of macroblocks' qp before aq */
float qpa_aq; /* average of macroblocks' qp after aq */
float qp_novbv; /* QP for the current frame if 1-pass VBV was disabled. */
@@ -293,22 +291,6 @@
}
}
-
-/*****************************************************************************
-* x264_adaptive_quant:
- * adjust macroblock QP based on variance (AC energy) of the MB.
- * high variance = higher QP
- * low variance = lower QP
- * This generally increases SSIM and lowers PSNR.
-*****************************************************************************/
-void x264_adaptive_quant( x264_t *h )
-{
- x264_emms();
- /* MB-tree currently doesn't adjust quantizers in unreferenced frames. */
- float qp_offset = h->fdec->b_kept_as_ref ? h->fenc->f_qp_offset[h->mb.i_mb_xy] : h->fenc->f_qp_offset_aq[h->mb.i_mb_xy];
- h->mb.i_qp = x264_clip3( h->rc->f_qpm + qp_offset + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
-}
-
int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
{
x264_ratecontrol_t *rc = h->rc;
@@ -669,7 +651,7 @@
return -1;
}
- CMP_OPT_FIRST_PASS( "wpredp", X264_MAX( 0, h->param.analyse.i_weighted_pred ) );
+ CMP_OPT_FIRST_PASS( "weightp", X264_MAX( 0, h->param.analyse.i_weighted_pred ) );
CMP_OPT_FIRST_PASS( "bframes", h->param.i_bframe );
CMP_OPT_FIRST_PASS( "b_pyramid", h->param.i_bframe_pyramid );
CMP_OPT_FIRST_PASS( "intra_refresh", h->param.b_intra_refresh );
@@ -1180,28 +1162,27 @@
rc->qpa_rc =
rc->qpa_aq = 0;
- rc->qpm =
rc->qp = x264_clip3( (int)(q + 0.5), 0, 51 );
h->fdec->f_qp_avg_rc =
h->fdec->f_qp_avg_aq =
- rc->f_qpm = q;
+ rc->qpm = q;
if( rce )
rce->new_qp = rc->qp;
- accum_p_qp_update( h, rc->f_qpm );
+ accum_p_qp_update( h, rc->qpm );
if( h->sh.i_type != SLICE_TYPE_B )
rc->last_non_b_pict_type = h->sh.i_type;
}
-static double predict_row_size( x264_t *h, int y, int qp )
+static double predict_row_size( x264_t *h, int y, double qp )
{
/* average between two predictors:
* absolute SATD, and scaled bit cost of the colocated row in the previous frame */
x264_ratecontrol_t *rc = h->rc;
double pred_s = predict_size( rc->row_pred[0], qp2qscale( qp ), h->fdec->i_row_satd[y] );
double pred_t = 0;
- if( h->sh.i_type == SLICE_TYPE_I || qp >= h->fref0[0]->i_row_qp[y] )
+ if( h->sh.i_type == SLICE_TYPE_I || qp >= h->fref0[0]->f_row_qp[y] )
{
if( h->sh.i_type == SLICE_TYPE_P
&& h->fref0[0]->i_type == h->fdec->i_type
@@ -1209,7 +1190,7 @@
&& (abs(h->fref0[0]->i_row_satd[y] - h->fdec->i_row_satd[y]) < h->fdec->i_row_satd[y]/2))
{
pred_t = h->fref0[0]->i_row_bits[y] * h->fdec->i_row_satd[y] / h->fref0[0]->i_row_satd[y]
- * qp2qscale( h->fref0[0]->i_row_qp[y] ) / qp2qscale( qp );
+ * qp2qscale( h->fref0[0]->f_row_qp[y] ) / qp2qscale( qp );
}
if( pred_t == 0 )
pred_t = pred_s;
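The two predictors averaged here are the rate model driven by the current row's SATD (pred_s) and a rescaling of the bits the co-located row cost in the reference frame (pred_t). Restated informally (the final averaging is assumed from the comment above):

    /* pred_t ~ ref_row_bits * (cur_row_satd / ref_row_satd) * qscale(ref_row_qp) / qscale(cur_qp)
     * predict_row_size ~ (pred_s + pred_t) / 2 */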
@@ -1232,7 +1213,7 @@
return bits;
}
-static double predict_row_size_sum( x264_t *h, int y, int qp )
+static double predict_row_size_sum( x264_t *h, int y, double qp )
{
double bits = row_bits_so_far(h, y);
for( int i = y+1; i < h->i_threadslice_end; i++ )
@@ -1249,33 +1230,34 @@
x264_emms();
h->fdec->i_row_bits[y] += bits;
- rc->qpa_rc += rc->f_qpm;
+ rc->qpa_rc += rc->qpm;
rc->qpa_aq += h->mb.i_qp;
if( h->mb.i_mb_x != h->sps->i_mb_width - 1 || !rc->b_vbv )
return;
- h->fdec->i_row_qp[y] = rc->qpm;
+ h->fdec->f_row_qp[y] = rc->qpm;
update_predictor( rc->row_pred[0], qp2qscale( rc->qpm ), h->fdec->i_row_satd[y], h->fdec->i_row_bits[y] );
- if( h->sh.i_type == SLICE_TYPE_P && rc->qpm < h->fref0[0]->i_row_qp[y] )
+ if( h->sh.i_type == SLICE_TYPE_P && rc->qpm < h->fref0[0]->f_row_qp[y] )
update_predictor( rc->row_pred[1], qp2qscale( rc->qpm ), h->fdec->i_row_satds[0][0][y], h->fdec->i_row_bits[y] );
/* tweak quality based on difference from predicted size */
if( y < h->i_threadslice_end-1 )
{
- int prev_row_qp = h->fdec->i_row_qp[y];
- int i_qp_min = X264_MAX( prev_row_qp - h->param.rc.i_qp_step, h->param.rc.i_qp_min );
- int i_qp_absolute_max = h->param.rc.i_qp_max;
+ float prev_row_qp = h->fdec->f_row_qp[y];
+ float qp_min = X264_MAX( prev_row_qp - h->param.rc.i_qp_step, h->param.rc.i_qp_min );
+ float qp_absolute_max = h->param.rc.i_qp_max;
if( rc->rate_factor_max_increment )
- i_qp_absolute_max = X264_MIN( i_qp_absolute_max, rc->qp_novbv + rc->rate_factor_max_increment );
- int i_qp_max = X264_MIN( prev_row_qp + h->param.rc.i_qp_step, i_qp_absolute_max );
+ qp_absolute_max = X264_MIN( qp_absolute_max, rc->qp_novbv + rc->rate_factor_max_increment );
+ float qp_max = X264_MIN( prev_row_qp + h->param.rc.i_qp_step, qp_absolute_max );
+ float step_size = 0.5;
/* B-frames shouldn't use lower QP than their reference frames. */
if( h->sh.i_type == SLICE_TYPE_B )
{
- i_qp_min = X264_MAX( i_qp_min, X264_MAX( h->fref0[0]->i_row_qp[y+1], h->fref1[0]->i_row_qp[y+1] ) );
- rc->qpm = X264_MAX( rc->qpm, i_qp_min );
+ qp_min = X264_MAX( qp_min, X264_MAX( h->fref0[0]->f_row_qp[y+1], h->fref1[0]->f_row_qp[y+1] ) );
+ rc->qpm = X264_MAX( rc->qpm, qp_min );
}
float buffer_left_planned = rc->buffer_fill - rc->frame_size_planned;
@@ -1303,45 +1285,53 @@
rc_tol /= 2;
if( !rc->b_vbv_min_rate )
- i_qp_min = X264_MAX( i_qp_min, h->sh.i_qp );
+ qp_min = X264_MAX( qp_min, rc->qp_novbv );
- while( rc->qpm < i_qp_max
+ while( rc->qpm < qp_max
&& ((b1 > rc->frame_size_planned + rc_tol) ||
(rc->buffer_fill - b1 < buffer_left_planned * 0.5) ||
(b1 > rc->frame_size_planned && rc->qpm < rc->qp_novbv)) )
{
- rc->qpm ++;
+ rc->qpm += step_size;
b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
}
- while( rc->qpm > i_qp_min
- && (rc->qpm > h->fdec->i_row_qp[0] || rc->single_frame_vbv)
+ while( rc->qpm > qp_min
+ && (rc->qpm > h->fdec->f_row_qp[0] || rc->single_frame_vbv)
&& ((b1 < rc->frame_size_planned * 0.8 && rc->qpm <= prev_row_qp)
|| b1 < (rc->buffer_fill - rc->buffer_size + rc->buffer_rate) * 1.1) )
{
- rc->qpm --;
+ rc->qpm -= step_size;
b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
}
/* avoid VBV underflow or MinCR violation */
- while( (rc->qpm < i_qp_absolute_max)
+ while( (rc->qpm < qp_absolute_max)
&& ((rc->buffer_fill - b1 < rc->buffer_rate * rc->max_frame_error) ||
(rc->frame_size_maximum - b1 < rc->frame_size_maximum * rc->max_frame_error)))
{
- rc->qpm ++;
+ rc->qpm += step_size;
b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
}
h->rc->frame_size_estimated = predict_row_size_sum( h, y, rc->qpm );
}
-
- /* loses the fractional part of the frame-wise qp */
- rc->f_qpm = rc->qpm;
}
int x264_ratecontrol_qp( x264_t *h )
{
- return h->rc->qpm;
+ x264_emms();
+ return x264_clip3( h->rc->qpm + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
+}
+
+int x264_ratecontrol_mb_qp( x264_t *h )
+{
+ x264_emms();
+ float qp = h->rc->qpm;
+ if( h->param.rc.i_aq_mode )
+ /* MB-tree currently doesn't adjust quantizers in unreferenced frames. */
+ qp += h->fdec->b_kept_as_ref ? h->fenc->f_qp_offset[h->mb.i_mb_xy] : h->fenc->f_qp_offset_aq[h->mb.i_mb_xy];
+ return x264_clip3( qp + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
}
/* In 2pass, force the same frame types as in the 1st pass */
Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/ratecontrol.h
@@ -30,7 +30,6 @@
void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init );
void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame );
-void x264_adaptive_quant( x264_t * );
int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame );
int x264_reference_build_list_optimal( x264_t *h );
void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next );
@@ -39,6 +38,7 @@
void x264_ratecontrol_set_weights( x264_t *h, x264_frame_t *frm );
void x264_ratecontrol_mb( x264_t *, int bits );
int x264_ratecontrol_qp( x264_t * );
+int x264_ratecontrol_mb_qp( x264_t *h );
int x264_ratecontrol_end( x264_t *, int bits, int *filler );
void x264_ratecontrol_summary( x264_t * );
void x264_ratecontrol_set_estimated_size( x264_t *, int bits );
Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/rdo.c
@@ -50,6 +50,8 @@
* fractional bits, but only finite precision. */
#undef x264_cabac_encode_decision
#undef x264_cabac_encode_decision_noup
+#undef x264_cabac_encode_bypass
+#undef x264_cabac_encode_terminal
#define x264_cabac_encode_decision(c,x,v) x264_cabac_size_decision(c,x,v)
#define x264_cabac_encode_decision_noup(c,x,v) x264_cabac_size_decision_noup(c,x,v)
#define x264_cabac_encode_terminal(c) ((c)->f8_bits_encoded += 7)
@@ -438,10 +440,13 @@
if( i < b_ac )
{
- /* We only need to memset an empty 4x4 block. 8x8 can be
+ /* We only need to zero an empty 4x4 block. 8x8 can be
implicitly emptied via zero nnz, as can dc. */
if( i_coefs == 16 && !dc )
- memset( dct, 0, 16 * sizeof(int16_t) );
+ {
+ M128( &dct[0] ) = M128_ZERO;
+ M128( &dct[8] ) = M128_ZERO;
+ }
return 0;
}
@@ -608,7 +613,10 @@
if( bnode == &nodes_cur[0] )
{
if( i_coefs == 16 && !dc )
- memset( dct, 0, 16 * sizeof(int16_t) );
+ {
+ M128( &dct[0] ) = M128_ZERO;
+ M128( &dct[8] ) = M128_ZERO;
+ }
return 0;
}
Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/set.c
@@ -315,26 +315,22 @@
if( sps->vui.b_aspect_ratio_info_present )
{
int i;
- static const struct { int w, h; int sar; } sar[] =
+ static const struct { uint8_t w, h, sar; } sar[] =
{
{ 1, 1, 1 }, { 12, 11, 2 }, { 10, 11, 3 }, { 16, 11, 4 },
{ 40, 33, 5 }, { 24, 11, 6 }, { 20, 11, 7 }, { 32, 11, 8 },
{ 80, 33, 9 }, { 18, 11, 10}, { 15, 11, 11}, { 64, 33, 12},
- { 160,99, 13}, { 0, 0, -1 }
+ { 160,99, 13}, { 0, 0, 255 }
};
- for( i = 0; sar[i].sar != -1; i++ )
+ for( i = 0; sar[i].sar != 255; i++ )
{
if( sar[i].w == sps->vui.i_sar_width &&
sar[i].h == sps->vui.i_sar_height )
break;
}
- if( sar[i].sar != -1 )
+ bs_write( s, 8, sar[i].sar );
+ if( sar[i].sar == 255 ) /* aspect_ratio_idc (extended) */
{
- bs_write( s, 8, sar[i].sar );
- }
- else
- {
- bs_write( s, 8, 255); /* aspect_ratio_idc (extended) */
bs_write( s, 16, sps->vui.i_sar_width );
bs_write( s, 16, sps->vui.i_sar_height );
}
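As a concrete illustration of the rewritten branch (example values only): a SAR of 1:1 matches the table and writes just aspect_ratio_idc = 1, whereas a ratio not in the table, say 7:5, reaches the 255 sentinel, so the stream carries aspect_ratio_idc = 255 (Extended_SAR) followed by sar_width = 7 and sar_height = 5 as two 16-bit fields.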
Changed | x264-snapshot-20100517-2245.tar.bz2/encoder/slicetype.c
@@ -25,7 +25,6 @@
#include <math.h>
#include "common/common.h"
-#include "common/cpu.h"
#include "macroblock.h"
#include "me.h"
@@ -382,21 +381,23 @@
/* Reverse-order MV prediction. */
M32( mvc[0] ) = 0;
- M32( mvc[1] ) = 0;
M32( mvc[2] ) = 0;
#define MVC(mv) { CP32( mvc[i_mvc], mv ); i_mvc++; }
if( i_mb_x < h->sps->i_mb_width - 1 )
- MVC(fenc_mv[1]);
+ MVC( fenc_mv[1] );
if( i_mb_y < h->sps->i_mb_height - 1 )
{
- MVC(fenc_mv[i_mb_stride]);
+ MVC( fenc_mv[i_mb_stride] );
if( i_mb_x > 0 )
- MVC(fenc_mv[i_mb_stride-1]);
+ MVC( fenc_mv[i_mb_stride-1] );
if( i_mb_x < h->sps->i_mb_width - 1 )
- MVC(fenc_mv[i_mb_stride+1]);
+ MVC( fenc_mv[i_mb_stride+1] );
}
#undef MVC
- x264_median_mv( m[l].mvp, mvc[0], mvc[1], mvc[2] );
+ if( i_mvc <= 1 )
+ CP32( m[l].mvp, mvc[0] );
+ else
+ x264_median_mv( m[l].mvp, mvc[0], mvc[1], mvc[2] );
x264_me_search( h, &m[l], mvc, i_mvc );
m[l].cost -= 2; // remove mvcost from skip mbs
@@ -416,10 +417,6 @@
if( b_bidir && ( M32( m[0].mv ) || M32( m[1].mv ) ) )
TRY_BIDIR( m[0].mv, m[1].mv, 5 );
- /* Store to width-2 bitfield. */
- frames[b]->lowres_inter_types[b-p0][p1-b][i_mb_xy>>2] &= ~(3<<((i_mb_xy&3)*2));
- frames[b]->lowres_inter_types[b-p0][p1-b][i_mb_xy>>2] |= list_used<<((i_mb_xy&3)*2);
-
lowres_intra_mb:
if( !fenc->b_intra_calculated )
{
@@ -481,7 +478,10 @@
int i_icost = fenc->i_intra_cost[i_mb_xy];
int b_intra = i_icost < i_bcost;
if( b_intra )
+ {
i_bcost = i_icost;
+ list_used = 0;
+ }
if( b_frame_score_mb )
fenc->i_intra_mbs[b-p0] += b_intra;
}
@@ -501,7 +501,7 @@
}
}
- fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = i_bcost;
+ fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = i_bcost + (list_used << LOWRES_COST_SHIFT);
}
#undef TRY_BIDIR
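With the separate width-2 bitfield gone, the list-usage flags now ride in the top bits of the per-MB lowres cost. A sketch of the packing used above (LOWRES_COST_SHIFT and LOWRES_COST_MASK are assumed to come from the common headers, e.g. a shift of 14 with a matching low-bit mask):

    static inline uint16_t pack_lowres_cost( int cost, int lists_used )
    {
        return cost + (lists_used << LOWRES_COST_SHIFT);
    }
    static inline int lowres_cost( uint16_t packed )  { return packed & LOWRES_COST_MASK; }
    static inline int lowres_lists( uint16_t packed ) { return packed >> LOWRES_COST_SHIFT; }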
@@ -615,7 +615,7 @@
for( h->mb.i_mb_x = h->sps->i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
{
int i_mb_xy = h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride;
- int i_mb_cost = frames[b]->lowres_costs[b-p0][p1-b][i_mb_xy];
+ int i_mb_cost = frames[b]->lowres_costs[b-p0][p1-b][i_mb_xy] & LOWRES_COST_MASK;
float qp_adj = qp_offset[i_mb_xy];
i_mb_cost = (i_mb_cost * x264_exp2fix8(qp_adj) + 128) >> 8;
row_satd[ h->mb.i_mb_y ] += i_mb_cost;
@@ -681,7 +681,7 @@
if( propagate_amount > 0 )
{
/* Access width-2 bitfield. */
- int lists_used = (frames[b]->lowres_inter_types[b-p0][p1-b][mb_index>>2] >> ((mb_index&3)*2))&3;
+ int lists_used = frames[b]->lowres_costs[b-p0][p1-b][mb_index] >> LOWRES_COST_SHIFT;
/* Follow the MVs to the previous frame(s). */
for( int list = 0; list < 2; list++ )
if( (lists_used >> list)&1 )
@@ -1490,7 +1490,7 @@
for( int x = h->fdec->i_pir_start_col; x <= h->fdec->i_pir_end_col; x++, mb_xy++ )
{
int intra_cost = (h->fenc->i_intra_cost[mb_xy] * ip_factor + 128) >> 8;
- int inter_cost = h->fenc->lowres_costs[b-p0][p1-b][mb_xy];
+ int inter_cost = h->fenc->lowres_costs[b-p0][p1-b][mb_xy] & LOWRES_COST_MASK;
int diff = intra_cost - inter_cost;
if( h->param.rc.i_aq_mode )
h->fdec->i_row_satd[y] += (diff * frames[b]->i_inv_qscale_factor[mb_xy] + 128) >> 8;
|
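The slicetype.c hunks above drop the separate width-2 lowres_inter_types bitfield and instead pack the list-usage flags into the high bits of each lowres_costs entry, recovered later with LOWRES_COST_MASK and LOWRES_COST_SHIFT. A minimal sketch of that packing, assuming illustrative values for the two constants (the real definitions live in x264's headers and may differ):

#include <stdint.h>
#include <assert.h>
#include <stdio.h>

/* Assumed split: low bits hold the SATD cost, top two bits hold the list mask. */
#define LOWRES_COST_SHIFT 14
#define LOWRES_COST_MASK  ((1 << LOWRES_COST_SHIFT) - 1)

static uint16_t pack_lowres( int cost, int lists_used )
{
    assert( cost <= LOWRES_COST_MASK && lists_used <= 3 );
    return cost + (lists_used << LOWRES_COST_SHIFT);
}

int main( void )
{
    uint16_t packed = pack_lowres( 1234, 2 );        /* cost 1234, L1 only */
    int cost       = packed & LOWRES_COST_MASK;      /* -> 1234 */
    int lists_used = packed >> LOWRES_COST_SHIFT;    /* -> 2 */
    printf( "cost=%d lists_used=%d\n", cost, lists_used );
    return 0;
}

Storing both fields in one array entry is what lets the propagation pass above read the list mask with a single shift instead of the old per-macroblock bitfield arithmetic.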
[-]
[+]
|
Changed |
x264-snapshot-20100517-2245.tar.bz2/input/avs.c
^
|
@@ -45,7 +45,7 @@
/* maximum size of the sequence of filters to try on non script files */
#define AVS_MAX_SEQUENCE 5
-#define LOAD_AVS_FUNC(name, continue_on_fail) \
+#define LOAD_AVS_FUNC(name, continue_on_fail)\
{\
h->func.name = (void*)GetProcAddress( h->library, #name );\
if( !continue_on_fail && !h->func.name )\
|
[-]
[+]
|
Changed |
x264-snapshot-20100517-2245.tar.bz2/input/y4m.c
^
|
@@ -104,13 +104,21 @@
case 'I': /* Interlace type */
switch( *tokstart++ )
{
- case 'p': break;
- case '?':
case 't':
+ info->interlaced = 1;
+ info->tff = 1;
+ break;
case 'b':
+ info->interlaced = 1;
+ info->tff = 0;
+ break;
case 'm':
- default:
info->interlaced = 1;
+ break;
+ //case '?':
+ //case 'p':
+ default:
+ break;
}
break;
case 'F': /* Frame rate - 0:0 if unknown */
|
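With the y4m.c change above, the 'I' header tag now distinguishes top-field-first ('t'), bottom-field-first ('b') and mixed/per-frame ('m') streams instead of treating everything that is not 'p' as generically interlaced, while '?' and 'p' leave the stream marked progressive. A minimal sketch of the same tag handling, assuming an illustrative info struct in place of the demuxer's own:

#include <stdio.h>

/* Assumed subset of the demuxer's video_info_t; field names are illustrative. */
typedef struct { int interlaced, tff; } y4m_info_t;

/* Handle the character following the 'I' tag of a YUV4MPEG2 stream header. */
static void parse_interlace_tag( char c, y4m_info_t *info )
{
    switch( c )
    {
        case 't': info->interlaced = 1; info->tff = 1; break; /* top field first */
        case 'b': info->interlaced = 1; info->tff = 0; break; /* bottom field first */
        case 'm': info->interlaced = 1;                break; /* mixed, signalled per frame */
        default:                                       break; /* 'p' or '?': progressive */
    }
}

int main( void )
{
    y4m_info_t info = { 0, 0 };
    parse_interlace_tag( 't', &info );
    printf( "interlaced=%d tff=%d\n", info.interlaced, info.tff );
    return 0;
}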
[-]
[+]
|
Changed |
x264-snapshot-20100517-2245.tar.bz2/tools/checkasm-a.asm
^
|
@@ -43,7 +43,7 @@
SECTION .text
-cextern puts
+cextern_naked puts
; max number of args used by any x264 asm function.
; (max_args % 4) must equal 3 for stack alignment
@@ -54,7 +54,7 @@
;-----------------------------------------------------------------------------
; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
;-----------------------------------------------------------------------------
-cglobal x264_checkasm_call, 4,7,16
+cglobal checkasm_call, 4,7,16
sub rsp, max_args*8
%assign stack_offset stack_offset+max_args*8
mov r6, r0
@@ -113,7 +113,7 @@
;-----------------------------------------------------------------------------
; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
;-----------------------------------------------------------------------------
-cglobal x264_checkasm_call, 1,7
+cglobal checkasm_call, 1,7
mov r3, n3
mov r4, n4
mov r5, n5
@@ -147,7 +147,7 @@
;-----------------------------------------------------------------------------
; int x264_stack_pagealign( int (*func)(), int align )
;-----------------------------------------------------------------------------
-cglobal x264_stack_pagealign, 2,2
+cglobal stack_pagealign, 2,2
push rbp
mov rbp, rsp
and rsp, ~0xfff
|
[-]
[+]
|
Changed |
x264-snapshot-20100517-2245.tar.bz2/tools/checkasm.c
^
|
@@ -265,7 +265,7 @@
buf3[i] = ~(buf4[i] = -(buf1[i&~0x88]&1));
#define TEST_PIXEL( name, align ) \
- ok = 1, used_asm = 0;\
+ ok = 1, used_asm = 0; \
for( int i = 0; i < 7; i++ ) \
{ \
int res_c, res_asm; \
@@ -305,7 +305,7 @@
TEST_PIXEL( sa8d, 1 );
#define TEST_PIXEL_X( N ) \
- ok = 1; used_asm = 0;\
+ ok = 1; used_asm = 0; \
for( int i = 0; i < 7; i++ ) \
{ \
int res_c[4]={0}, res_asm[4]={0}; \
@@ -350,7 +350,7 @@
{ \
set_func_name( "%s_%s", "var", pixel_names[i] ); \
used_asm = 1; \
- /* abi-check wrapper can't return uint64_t, so separate it from return value check */\
+ /* abi-check wrapper can't return uint64_t, so separate it from return value check */ \
call_c1( pixel_c.var[i], buf1, 16 ); \
call_a1( pixel_asm.var[i], buf1, 16 ); \
uint64_t res_c = pixel_c.var[i]( buf1, 16 ); \
@@ -415,7 +415,7 @@
if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
{ \
int res_c[3], res_asm[3]; \
- set_func_name( #name );\
+ set_func_name( #name ); \
used_asm = 1; \
memcpy( buf3, buf2, 1024 ); \
for( int i = 0; i < 3; i++ ) \
@@ -538,7 +538,7 @@
#define TEST_DCT( name, t1, t2, size ) \
if( dct_asm.name != dct_ref.name ) \
{ \
- set_func_name( #name );\
+ set_func_name( #name ); \
used_asm = 1; \
call_c( dct_c.name, t1, buf1, buf2 ); \
call_a( dct_asm.name, t2, buf1, buf2 ); \
@@ -579,7 +579,7 @@
#define TEST_IDCT( name, src ) \
if( dct_asm.name != dct_ref.name ) \
{ \
- set_func_name( #name );\
+ set_func_name( #name ); \
used_asm = 1; \
memcpy( buf3, buf1, 32*32 ); \
memcpy( buf4, buf1, 32*32 ); \
@@ -644,12 +644,12 @@
ALIGNED_16( int16_t level1[64] );
ALIGNED_16( int16_t level2[64] );
-#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
+#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
if( zigzag_asm.name != zigzag_ref.name ) \
{ \
- set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
+ set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
used_asm = 1; \
- memcpy(dct, buf1, size*sizeof(int16_t));\
+ memcpy(dct, buf1, size*sizeof(int16_t)); \
call_c( zigzag_c.name, t1, dct ); \
call_a( zigzag_asm.name, t2, dct ); \
if( memcmp( t1, t2, size*sizeof(int16_t) ) ) \
@@ -663,18 +663,18 @@
if( zigzag_asm.name != zigzag_ref.name ) \
{ \
int nz_a, nz_c; \
- set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
+ set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
used_asm = 1; \
memcpy( buf3, buf1, 16*FDEC_STRIDE ); \
memcpy( buf4, buf1, 16*FDEC_STRIDE ); \
- nz_c = call_c1( zigzag_c.name, t1, buf2, buf3 ); \
+ nz_c = call_c1( zigzag_c.name, t1, buf2, buf3 ); \
nz_a = call_a1( zigzag_asm.name, t2, buf2, buf4 ); \
- if( memcmp( t1, t2, size*sizeof(int16_t) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a ) \
+ if( memcmp( t1, t2, size*sizeof(int16_t) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a ) \
{ \
ok = 0; \
fprintf( stderr, #name " [FAILED]\n" ); \
} \
- call_c2( zigzag_c.name, t1, buf2, buf3 ); \
+ call_c2( zigzag_c.name, t1, buf2, buf3 ); \
call_a2( zigzag_asm.name, t2, buf2, buf4 ); \
}
@@ -683,7 +683,7 @@
{ \
int nz_a, nz_c; \
int16_t dc_a, dc_c; \
- set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
+ set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
used_asm = 1; \
for( int i = 0; i < 2; i++ ) \
{ \
@@ -694,27 +694,27 @@
memcpy( buf3 + j*FDEC_STRIDE, (i?buf1:buf2) + j*FENC_STRIDE, 4 ); \
memcpy( buf4 + j*FDEC_STRIDE, (i?buf1:buf2) + j*FENC_STRIDE, 4 ); \
} \
- nz_c = call_c1( zigzag_c.name, t1, buf2, buf3, &dc_c ); \
+ nz_c = call_c1( zigzag_c.name, t1, buf2, buf3, &dc_c ); \
nz_a = call_a1( zigzag_asm.name, t2, buf2, buf4, &dc_a ); \
- if( memcmp( t1+1, t2+1, 15*sizeof(int16_t) ) || memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a || dc_c != dc_a ) \
+ if( memcmp( t1+1, t2+1, 15*sizeof(int16_t) ) || memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a || dc_c != dc_a ) \
{ \
ok = 0; \
fprintf( stderr, #name " [FAILED]\n" ); \
break; \
} \
} \
- call_c2( zigzag_c.name, t1, buf2, buf3, &dc_c ); \
+ call_c2( zigzag_c.name, t1, buf2, buf3, &dc_c ); \
call_a2( zigzag_asm.name, t2, buf2, buf4, &dc_a ); \
}
-#define TEST_INTERLEAVE( name, t1, t2, dct, size ) \
+#define TEST_INTERLEAVE( name, t1, t2, dct, size ) \
if( zigzag_asm.name != zigzag_ref.name ) \
{ \
for( int j = 0; j < 100; j++ ) \
{ \
- set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
+ set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
used_asm = 1; \
- memcpy(dct, buf1, size*sizeof(int16_t));\
+ memcpy(dct, buf1, size*sizeof(int16_t)); \
for( int i = 0; i < size; i++ ) \
dct[i] = rand()&0x1F ? 0 : dct[i]; \
memcpy(buf3, buf4, 10*sizeof(uint8_t)); \
@@ -784,7 +784,7 @@
if( mc_a.mc_luma != mc_ref.mc_luma && !(w&(w-1)) && h<=16 ) \
{ \
const x264_weight_t *weight = weight_none; \
- set_func_name( "mc_luma_%dx%d", w, h );\
+ set_func_name( "mc_luma_%dx%d", w, h ); \
used_asm = 1; \
memset( buf3, 0xCD, 1024 ); \
memset( buf4, 0xCD, 1024 ); \
@@ -801,7 +801,7 @@
uint8_t *ref = dst2; \
int ref_stride = 32; \
const x264_weight_t *weight = weight_none; \
- set_func_name( "get_ref_%dx%d", w, h );\
+ set_func_name( "get_ref_%dx%d", w, h ); \
used_asm = 1; \
memset( buf3, 0xCD, 1024 ); \
memset( buf4, 0xCD, 1024 ); \
@@ -819,13 +819,13 @@
#define MC_TEST_CHROMA( w, h ) \
if( mc_a.mc_chroma != mc_ref.mc_chroma ) \
{ \
- set_func_name( "mc_chroma_%dx%d", w, h );\
+ set_func_name( "mc_chroma_%dx%d", w, h ); \
used_asm = 1; \
memset( buf3, 0xCD, 1024 ); \
memset( buf4, 0xCD, 1024 ); \
call_c( mc_c.mc_chroma, dst1, 16, src, 64, dx, dy, w, h ); \
call_a( mc_a.mc_chroma, dst2, 16, src, 64, dx, dy, w, h ); \
- /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */\
+ /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */ \
for( int j = 0; j < h; j++ ) \
for( int i = w; i < 4; i++ ) \
dst2[i+j*16] = dst1[i+j*16]; \
@@ -878,7 +878,7 @@
memcpy( buf4, buf1+320, 320 ); \
if( mc_a.name[i] != mc_ref.name[i] ) \
{ \
- set_func_name( "%s_%s", #name, pixel_names[i] );\
+ set_func_name( "%s_%s", #name, pixel_names[i] ); \
used_asm = 1; \
call_c1( mc_c.name[i], buf3, 16, buf2+1, 16, buf1+18, 16, weight ); \
call_a1( mc_a.name[i], buf4, 16, buf2+1, 16, buf1+18, 16, weight ); \
@@ -899,7 +899,7 @@
#define MC_TEST_WEIGHT( name, weight, aligned ) \
int align_off = (aligned ? 0 : rand()%16); \
- ok = 1, used_asm = 0;\
+ ok = 1, used_asm = 0; \
for( int i = 1; i <= 5; i++ ) \
{ \
ALIGNED_16( uint8_t buffC[640] ); \
@@ -1115,14 +1115,14 @@
#define TEST_DEBLOCK( name, align, ... ) \
for( int i = 0; i < 36; i++ ) \
{ \
- int off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */\
+ int off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */ \
for( int j = 0; j < 1024; j++ ) \
- /* two distributions of random to excersize different failure modes */\
+ /* two distributions of random to excersize different failure modes */ \
buf3[j] = rand() & (i&1 ? 0xf : 0xff ); \
memcpy( buf4, buf3, 1024 ); \
if( db_a.name != db_ref.name ) \
{ \
- set_func_name( #name );\
+ set_func_name( #name ); \
used_asm = 1; \
call_c1( db_c.name, buf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
call_a1( db_a.name, buf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
@@ -1236,7 +1236,7 @@
dct1[i] = dct2[i] = j ? (rand() & 0x1fff) - 0xfff : 0; \
result_c = call_c1( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
result_a = call_a1( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
- if( memcmp( dct1, dct2, 16*2 ) || result_c != result_a ) \
+ if( memcmp( dct1, dct2, 16*2 ) || result_c != result_a ) \
{ \
oks[0] = 0; \
fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
@@ -1491,11 +1491,11 @@
ip_c.predict_8x8_filter( buf1+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
-#define INTRA_TEST( name, dir, w, ... ) \
+#define INTRA_TEST( name, dir, w, ... )\
if( ip_a.name[dir] != ip_ref.name[dir] )\
- { \
+ {\
set_func_name( "intra_%s_%s", #name, intra_##name##_names[dir] );\
- used_asm = 1; \
+ used_asm = 1;\
memcpy( buf3, buf1, 32*20 );\
memcpy( buf4, buf1, 32*20 );\
call_c( ip_c.name[dir], buf3+48, ##__VA_ARGS__ );\
@@ -1556,32 +1556,66 @@
}
#define DECL_CABAC(cpu) \
-static void run_cabac_##cpu( uint8_t *dst )\
+static void run_cabac_decision_##cpu( uint8_t *dst )\
{\
x264_cabac_t cb;\
x264_cabac_context_init( &cb, SLICE_TYPE_P, 26, 0 );\
x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
for( int i = 0; i < 0x1000; i++ )\
x264_cabac_encode_decision_##cpu( &cb, buf1[i]>>1, buf1[i]&1 );\
+}\
+static void run_cabac_bypass_##cpu( uint8_t *dst )\
+{\
+ x264_cabac_t cb;\
+ x264_cabac_context_init( &cb, SLICE_TYPE_P, 26, 0 );\
+ x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
+ for( int i = 0; i < 0x1000; i++ )\
+ x264_cabac_encode_bypass_##cpu( &cb, buf1[i]&1 );\
+}\
+static void run_cabac_terminal_##cpu( uint8_t *dst )\
+{\
+ x264_cabac_t cb;\
+ x264_cabac_context_init( &cb, SLICE_TYPE_P, 26, 0 );\
+ x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
+ for( int i = 0; i < 0x1000; i++ )\
+ x264_cabac_encode_terminal_##cpu( &cb );\
}
DECL_CABAC(c)
#ifdef HAVE_MMX
DECL_CABAC(asm)
#else
-#define run_cabac_asm run_cabac_c
+#define run_cabac_decision_asm run_cabac_decision_c
+#define run_cabac_bypass_asm run_cabac_bypass_c
+#define run_cabac_terminal_asm run_cabac_terminal_c
#endif
static int check_cabac( int cpu_ref, int cpu_new )
{
int ret = 0, ok, used_asm = 1;
- if( cpu_ref || run_cabac_c == run_cabac_asm)
+ if( cpu_ref || run_cabac_decision_c == run_cabac_decision_asm )
return 0;
+
set_func_name( "cabac_encode_decision" );
memcpy( buf4, buf3, 0x1000 );
- call_c( run_cabac_c, buf3 );
- call_a( run_cabac_asm, buf4 );
+ call_c( run_cabac_decision_c, buf3 );
+ call_a( run_cabac_decision_asm, buf4 );
+ ok = !memcmp( buf3, buf4, 0x1000 );
+ report( "cabac decision:" );
+
+ set_func_name( "cabac_encode_bypass" );
+ memcpy( buf4, buf3, 0x1000 );
+ call_c( run_cabac_bypass_c, buf3 );
+ call_a( run_cabac_bypass_asm, buf4 );
ok = !memcmp( buf3, buf4, 0x1000 );
- report( "cabac :" );
+ report( "cabac bypass:" );
+
+ set_func_name( "cabac_encode_terminal" );
+ memcpy( buf4, buf3, 0x1000 );
+ call_c( run_cabac_terminal_c, buf3 );
+ call_a( run_cabac_terminal_asm, buf4 );
+ ok = !memcmp( buf3, buf4, 0x1000 );
+ report( "cabac terminal:" );
+
return ret;
}
|
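Each of the new decision/bypass/terminal tests above follows the same shape: run the C and assembly encoders over identical input, then byte-compare the two output buffers. A minimal sketch of that compare-and-report step, with a dummy runner standing in for the generated run_cabac_* helpers:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

typedef void (*cabac_run_t)( uint8_t *dst );

/* Run both implementations into separate buffers and compare the results. */
static int compare_impls( const char *name, cabac_run_t run_c, cabac_run_t run_asm )
{
    uint8_t buf_c[0x1000], buf_asm[0x1000];
    memset( buf_c, 0, sizeof(buf_c) );
    memset( buf_asm, 0, sizeof(buf_asm) );
    run_c( buf_c );
    run_asm( buf_asm );
    int ok = !memcmp( buf_c, buf_asm, sizeof(buf_c) );
    fprintf( stderr, "cabac %s: %s\n", name, ok ? "[OK]" : "[FAILED]" );
    return ok;
}

/* Dummy runner used only to make the sketch self-contained. */
static void dummy_run( uint8_t *dst ) { memset( dst, 0xAA, 0x1000 ); }

int main( void )
{
    return !compare_impls( "decision", dummy_run, dummy_run );
}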
[-]
[+]
|
Changed |
x264-snapshot-20100517-2245.tar.bz2/x264.c
^
|
@@ -120,7 +120,7 @@
static const cli_pulldown_t pulldown_values[] =
{
- [X264_PULLDOWN_22] = {1, {TB}, 2.0},
+ [X264_PULLDOWN_22] = {1, {TB}, 1.0},
[X264_PULLDOWN_32] = {4, {TBT, BT, BTB, TB}, 1.25},
[X264_PULLDOWN_64] = {2, {PIC_STRUCT_DOUBLE, PIC_STRUCT_TRIPLE}, 1.0},
[X264_PULLDOWN_DOUBLE] = {1, {PIC_STRUCT_DOUBLE}, 2.0},
@@ -1312,7 +1312,7 @@
* Encode:
*****************************************************************************/
-static int Encode_frame( x264_t *h, hnd_t hout, x264_picture_t *pic, int64_t *last_pts )
+static int Encode_frame( x264_t *h, hnd_t hout, x264_picture_t *pic, int64_t *last_dts )
{
x264_picture_t pic_out;
x264_nal_t *nal;
@@ -1330,18 +1330,22 @@
if( i_frame_size )
{
i_frame_size = output.write_frame( hout, nal[0].p_payload, i_frame_size, &pic_out );
- *last_pts = pic_out.i_pts;
+ *last_dts = pic_out.i_dts;
}
return i_frame_size;
}
-static void Print_status( int64_t i_start, int i_frame, int i_frame_total, int64_t i_file, x264_param_t *param, int64_t last_pts )
+static void Print_status( int64_t i_start, int i_frame, int i_frame_total, int64_t i_file, x264_param_t *param, int64_t last_ts )
{
char buf[200];
int64_t i_elapsed = x264_mdate() - i_start;
double fps = i_elapsed > 0 ? i_frame * 1000000. / i_elapsed : 0;
- double bitrate = (double) i_file * 8 / ( (double) last_pts * 1000 * param->i_timebase_num / param->i_timebase_den );
+ double bitrate;
+ if( last_ts )
+ bitrate = (double) i_file * 8 / ( (double) last_ts * 1000 * param->i_timebase_num / param->i_timebase_den );
+ else
+ bitrate = (double) i_file * 8 / ( (double) 1000 * param->i_fps_den / param->i_fps_num );
if( i_frame_total )
{
int eta = i_elapsed * (i_frame_total - i_frame) / ((int64_t)i_frame * 1000000);
@@ -1369,7 +1373,9 @@
int64_t i_file = 0;
int i_frame_size;
int i_update_interval;
- int64_t last_pts = 0;
+ int64_t last_dts = 0;
+ int64_t prev_dts = 0;
+ int64_t first_dts = 0;
# define MAX_PTS_WARNING 3 /* arbitrary */
int pts_warning_cnt = 0;
int64_t largest_pts = -1;
@@ -1506,12 +1512,17 @@
pic.i_qpplus1 = 0;
}
- i_frame_size = Encode_frame( h, opt->hout, &pic, &last_pts );
+ prev_dts = last_dts;
+ i_frame_size = Encode_frame( h, opt->hout, &pic, &last_dts );
if( i_frame_size < 0 )
return -1;
i_file += i_frame_size;
if( i_frame_size )
+ {
i_frame_output++;
+ if( i_frame_output == 1 )
+ first_dts = prev_dts = last_dts;
+ }
i_frame++;
@@ -1520,19 +1531,24 @@
/* update status line (up to 1000 times per input file) */
if( opt->b_progress && i_frame_output % i_update_interval == 0 && i_frame_output )
- Print_status( i_start, i_frame_output, i_frame_total, i_file, param, last_pts );
+ Print_status( i_start, i_frame_output, i_frame_total, i_file, param, 2 * last_dts - prev_dts - first_dts );
}
/* Flush delayed frames */
while( !b_ctrl_c && x264_encoder_delayed_frames( h ) )
{
- i_frame_size = Encode_frame( h, opt->hout, NULL, &last_pts );
+ prev_dts = last_dts;
+ i_frame_size = Encode_frame( h, opt->hout, NULL, &last_dts );
if( i_frame_size < 0 )
return -1;
i_file += i_frame_size;
if( i_frame_size )
+ {
i_frame_output++;
+ if( i_frame_output == 1 )
+ first_dts = prev_dts = last_dts;
+ }
if( opt->b_progress && i_frame_output % i_update_interval == 0 && i_frame_output )
- Print_status( i_start, i_frame_output, i_frame_total, i_file, param, last_pts );
+ Print_status( i_start, i_frame_output, i_frame_total, i_file, param, 2 * last_dts - prev_dts - first_dts );
}
if( pts_warning_cnt >= MAX_PTS_WARNING && param->i_log_level < X264_LOG_DEBUG )
fprintf( stderr, "x264 [warning]: %d suppressed nonmonotonic pts warnings\n", pts_warning_cnt-MAX_PTS_WARNING );
|
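The x264.c hunks above switch the progress line from PTS to DTS and hand Print_status the value 2*last_dts - prev_dts - first_dts: the DTS span of the frames output so far plus one extra frame duration inferred from the last two DTS values, which keeps the bitrate estimate sensible when the stream does not start at timestamp zero. A minimal sketch of that estimate, assuming an illustrative helper rather than the actual x264.c code:

#include <stdint.h>
#include <stdio.h>

/* Estimate kbit/s from bytes written and the DTS span covered so far.
 * duration_ts = (last_dts - first_dts) + (last_dts - prev_dts): the span of
 * already-output frames plus one frame duration guessed from the last step. */
static double estimate_bitrate( int64_t i_file, int64_t first_dts, int64_t prev_dts,
                                int64_t last_dts, int timebase_num, int timebase_den )
{
    int64_t duration_ts = 2 * last_dts - prev_dts - first_dts;
    if( !duration_ts )
        return 0.0;   /* nothing output yet; the real code falls back to the fps */
    double seconds = (double)duration_ts * timebase_num / timebase_den;
    return (double)i_file * 8 / ( seconds * 1000 );
}

int main( void )
{
    /* 100 frames at 25 fps in a 1/1000 timebase: DTS step of 40 ms, 4 s total. */
    double kbps = estimate_bitrate( 500000, 0, 3920, 3960, 1, 1000 );
    printf( "%.1f kbit/s\n", kbps );   /* 4,000,000 bits over 4 s -> 1000.0 */
    return 0;
}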
[-]
[+]
|
Changed |
x264-snapshot-20100517-2245.tar.bz2/x264.h
^
|
@@ -35,7 +35,7 @@
#include <stdarg.h>
-#define X264_BUILD 94
+#define X264_BUILD 95
/* x264_t:
* opaque handler for encoder */
@@ -639,5 +639,13 @@
* return the number of currently delayed (buffered) frames
* this should be used at the end of the stream, to know when you have all the encoded frames. */
int x264_encoder_delayed_frames( x264_t * );
+/* x264_encoder_intra_refresh:
+ * If an intra refresh is not in progress, begin one with the next P-frame.
+ * If an intra refresh is in progress, begin one as soon as the current one finishes.
+ * Requires that b_intra_refresh be set.
+ * Useful for interactive streaming where the client can tell the server that packet loss has
+ * occurred. In this case, keyint can be set to an extremely high value so that intra refreshes
+ * only occur when calling x264_encoder_intra_refresh. */
+void x264_encoder_intra_refresh( x264_t * );
#endif
|
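The new x264_encoder_intra_refresh() declaration above targets interactive streaming: enable b_intra_refresh, set keyint very high, and trigger a refresh wave only when the receiver reports loss. A hedged usage sketch, assuming a placeholder feedback hook; parameter setup, error handling and the surrounding encode loop are omitted:

#include <stdint.h>
#include <x264.h>

/* Placeholder for the application's feedback channel (e.g. an RTCP NACK/PLI
 * handler); a real server would wire this to its network stack. */
static int client_reported_packet_loss( void )
{
    return 0;
}

/* One step of a hypothetical encode loop using on-demand intra refresh.
 * Assumes the encoder was opened with param.b_intra_refresh = 1 and a very
 * large param.i_keyint_max, so refreshes happen only when requested here. */
static int encode_step( x264_t *h, x264_picture_t *pic_in )
{
    x264_picture_t pic_out;
    x264_nal_t *nal;
    int i_nal;

    if( client_reported_packet_loss() )
        x264_encoder_intra_refresh( h );   /* start (or queue) a refresh wave */

    return x264_encoder_encode( h, &nal, &i_nal, pic_in, &pic_out );
}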