File openjpeg-svn470-t1-flags-mmx.patch of Package openjpeg
x
1
diff -urN -x '*.orig' -x '*.rej' -x '*~' -x '.*' OpenJPEG.orig/libopenjpeg/t1.c OpenJPEG.patched/libopenjpeg/t1.c
2
--- OpenJPEG.orig/libopenjpeg/t1.c 2007-11-13 13:52:05.000000000 -0600
3
+++ OpenJPEG.patched/libopenjpeg/t1.c 2007-11-14 01:09:40.000000000 -0600
4
5
#include "opj_includes.h"
6
#include "t1_luts.h"
7
8
+/* Don't use MMX on amd64 */
9
+/* Note that merely including mmintrin.h, even if we don't use it, changes the code gcc */
10
+/* outputs on amd64, and it is measurably slower. A bug in gcc? */
11
+#ifdef __amd64__
12
+#undef __MMX__
13
+#endif
14
+
15
+#ifdef __MMX__
16
+#include <mmintrin.h>
17
+#endif
18
+
19
/** @defgroup T1 T1 - Implementation of the tier-1 coding */
20
/*@{*/
21
22
23
static char t1_getspb(int f);
24
static short t1_getnmsedec_sig(int x, int bitpos);
25
static short t1_getnmsedec_ref(int x, int bitpos);
26
-#ifdef __amd64__
27
+#if defined(__amd64__) || defined(__MMX__)
28
static INLINE void t1_updateflags(flag_t *flagsp, int s, int stride);
29
#else
30
static void t1_updateflags(flag_t *flagsp, int s, int stride);
31
32
}
33
34
#else
35
+#ifdef __MMX__
36
+
37
+static void t1_updateflags(flag_t *flagsp, int s, int stride) { /* MMX variant: ORs constant flag masks into one __m64 word in each of three rows (offsets -stride, 0, +stride), each word starting one element before flagsp. Does not call _mm_empty() itself. */
38
+ static const __v4hi mod[] = { /* six masks of four 16-bit lanes, in three pairs; the second entry of each pair additionally sets a T1_SGN_* bit; the fourth lane is always 0 */
39
+ {T1_SIG_SE, T1_SIG_E, T1_SIG_NE, 0}, /* mod[0]: applied to the row at -stride when s == 0 */
40
+ {T1_SIG_SE, T1_SIG_E|T1_SGN_E, T1_SIG_NE, 0}, /* mod[1]: applied to the row at -stride when s == 1 */
41
+ {T1_SIG_S, T1_SIG, T1_SIG_N, 0}, /* mod[2]: applied to the row containing flagsp when s == 0 */
42
+ {T1_SIG_S|T1_SGN_S, T1_SIG, T1_SIG_N|T1_SGN_N, 0}, /* mod[3]: applied to the row containing flagsp when s == 1 */
43
+ {T1_SIG_SW, T1_SIG_W, T1_SIG_NW, 0}, /* mod[4]: applied to the row at +stride when s == 0 */
44
+ {T1_SIG_SW, T1_SIG_W|T1_SGN_W, T1_SIG_NW, 0} /* mod[5]: applied to the row at +stride when s == 1 */
45
+ };
46
+
47
+ __m64 tmp1 = *(__m64*)((void*)&flagsp[-1 - stride]); /* load one 64-bit word per row, each starting at element -1 relative to flagsp */
48
+ __m64 tmp2 = *(__m64*)((void*)&flagsp[-1 ]);
49
+ __m64 tmp3 = *(__m64*)((void*)&flagsp[-1 + stride]);
50
+
51
+ tmp1 = _mm_or_si64(tmp1, mod[s]); /* s picks the entry within each pair. NOTE(review): s is presumably only ever 0 or 1; anything else would index outside the six-entry table -- confirm against callers */
52
+ tmp2 = _mm_or_si64(tmp2, mod[s+2]);
53
+ tmp3 = _mm_or_si64(tmp3, mod[s+4]);
54
+
55
+ *(__m64*)((void*)&flagsp[-1 - stride]) = tmp1; /* store the words back; the fourth lane was OR'd with 0 so its value is unchanged, but it is still read and rewritten (assuming flag_t is 16 bits wide, that is the element at +2 in each row -- TODO confirm) */
56
+ *(__m64*)((void*)&flagsp[-1 ]) = tmp2;
57
+ *(__m64*)((void*)&flagsp[-1 + stride]) = tmp3;
58
+}
59
+
60
+#else
61
62
static void t1_updateflags(flag_t *flagsp, int s, int stride) {
63
static const flag_t mod[] = {
64
65
}
66
67
#endif
68
+#endif
69
70
static void t1_enc_sigpass_step(
71
opj_t1_t *t1,
72
73
| ((int64)(T1_SIG | T1_VISIT | T1_SIG_OTH)<<48);
74
agg = !tmp;
75
#else
76
+ int* flagsp = (int*)&t1->flags[(k+1) + (i+1)*(t1->h+2)];
77
+ agg = flagsp[1];
78
if (cblksty & J2K_CCP_CBLKSTY_VSC) {
79
- agg = !(t1->flags[(k+1) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
80
- || t1->flags[(k+2) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
81
- || t1->flags[(k+3) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
82
- || (t1->flags[(k+4) + (i+1)*(t1->h+2)]
83
- & (~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) & (T1_SIG | T1_VISIT | T1_SIG_OTH));
84
- } else {
85
- agg = !(t1->flags[(k+1) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
86
- || t1->flags[(k+2) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
87
- || t1->flags[(k+3) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
88
- || t1->flags[(k+4) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH));
89
+ agg &= ~((T1_SIG_S|T1_SIG_SE|T1_SIG_SW|T1_SGN_S)<<16);
90
}
91
+ agg |= flagsp[0];
92
+ agg &= (T1_SIG|T1_VISIT|T1_SIG_OTH)|(T1_SIG|T1_VISIT|T1_SIG_OTH)<<16;
93
+ agg = !agg;
94
#endif
95
} else {
96
agg = 0;
97
98
memset(t1->data,0,datasize * sizeof(int));
99
100
flagssize=(h+2) * (w+2);
101
-#ifdef __amd64__
102
+#if defined(__amd64__) || defined(__MMX__)
103
/* 64 bit SIMD/SWAR in t1_updateflags requires one short of headroom
104
because three shorts = 48 bits. */
105
++flagssize;
106
107
int correction = 3;
108
type = ((bpno < (cblk->numbps - 4)) && (passtype < 2) && (cblksty & J2K_CCP_CBLKSTY_LAZY)) ? T1_TYPE_RAW : T1_TYPE_MQ;
109
110
+#if !defined(__amd64__) && defined(__MMX__)
111
+ _mm_empty();
112
+#endif
113
switch (passtype) {
114
case 0:
115
t1_enc_sigpass(t1, bpno, orient, &nmsedec, type, cblksty);
116
117
mqc_segmark_enc(mqc);
118
break;
119
}
120
+#if !defined(__amd64__) && defined(__MMX__)
121
+ _mm_empty();
122
+#endif
123
124
/* fixed_quality */
125
cumwmsedec += t1_getwmsedec(nmsedec, compno, level, orient, bpno, qmfbid, stepsize, numcomps);
126
127
mqc_setstate(mqc, T1_CTXNO_AGG, 0, 3);
128
mqc_setstate(mqc, T1_CTXNO_ZC, 0, 4);
129
130
+#if !defined(__amd64__) && defined(__MMX__)
131
+ _mm_empty();
132
+#endif
133
for (segno = 0; segno < cblk->numsegs; ++segno) {
134
opj_tcd_seg_t *seg = &cblk->segs[segno];
135
136
137
}
138
}
139
}
140
+#if !defined(__amd64__) && defined(__MMX__)
141
+ _mm_empty();
142
+#endif
143
}
144
145
/* ----------------------------------------------------------------------- */
146