Logoj0ke.net Open Build Service > Projects > multimedia:SL11 > openjpeg > openjpeg-svn470-t1-flags-mmx.patch
Sign Up | Log In

File openjpeg-svn470-t1-flags-mmx.patch of Package openjpeg

x
 
1
diff -urN -x '*.orig' -x '*.rej' -x '*~' -x '.*' OpenJPEG.orig/libopenjpeg/t1.c OpenJPEG.patched/libopenjpeg/t1.c
2
--- OpenJPEG.orig/libopenjpeg/t1.c  2007-11-13 13:52:05.000000000 -0600
3
+++ OpenJPEG.patched/libopenjpeg/t1.c   2007-11-14 01:09:40.000000000 -0600
4
@@ -33,6 +33,17 @@
5
 #include "opj_includes.h"
6
 #include "t1_luts.h"
7
 
8
+/* Don't use MMX on amd64 */
9
+/* Note that merely including mmintrin.h, even if nothing from it is used, changes the */
10
+/* code gcc generates on amd64, and the result is measurably slower. Possibly a gcc bug. */
11
+#ifdef __amd64__
12
+#undef __MMX__
13
+#endif
14
+
15
+#ifdef __MMX__
16
+#include <mmintrin.h>
17
+#endif
18
+
19
 /** @defgroup T1 T1 - Implementation of the tier-1 coding */
20
 /*@{*/
21
 
22
@@ -45,7 +56,7 @@
23
 static char t1_getspb(int f);
24
 static short t1_getnmsedec_sig(int x, int bitpos);
25
 static short t1_getnmsedec_ref(int x, int bitpos);
26
-#ifdef __amd64__
27
+#if defined(__amd64__) || defined(__MMX__)
28
 static INLINE void t1_updateflags(flag_t *flagsp, int s, int stride);
29
 #else
30
 static void t1_updateflags(flag_t *flagsp, int s, int stride);
31
@@ -293,6 +304,32 @@
32
 }
33
 
34
 #else
35
+#ifdef __MMX__
36
+
37
+static void t1_updateflags(flag_t *flagsp, int s, int stride) {
38
+   static const __v4hi mod[] = {
39
+       {T1_SIG_SE,         T1_SIG_E,          T1_SIG_NE,         0},
40
+       {T1_SIG_SE,         T1_SIG_E|T1_SGN_E, T1_SIG_NE,         0},
41
+       {T1_SIG_S,          T1_SIG,            T1_SIG_N,          0},
42
+       {T1_SIG_S|T1_SGN_S, T1_SIG,            T1_SIG_N|T1_SGN_N, 0},
43
+       {T1_SIG_SW,         T1_SIG_W,          T1_SIG_NW,         0},
44
+       {T1_SIG_SW,         T1_SIG_W|T1_SGN_W, T1_SIG_NW,         0}
45
+   };
46
+
47
+   __m64 tmp1 = *(__m64*)((void*)&flagsp[-1 - stride]);
48
+   __m64 tmp2 = *(__m64*)((void*)&flagsp[-1         ]);
49
+   __m64 tmp3 = *(__m64*)((void*)&flagsp[-1 + stride]);
50
+
51
+   tmp1 = _mm_or_si64(tmp1, mod[s]);
52
+   tmp2 = _mm_or_si64(tmp2, mod[s+2]);
53
+   tmp3 = _mm_or_si64(tmp3, mod[s+4]);
54
+
55
+   *(__m64*)((void*)&flagsp[-1 - stride]) = tmp1;
56
+   *(__m64*)((void*)&flagsp[-1         ]) = tmp2;
57
+   *(__m64*)((void*)&flagsp[-1 + stride]) = tmp3;
58
+}
59
+
60
+#else
61
 
62
 static void t1_updateflags(flag_t *flagsp, int s, int stride) {
63
    static const flag_t mod[] = {
64
@@ -316,6 +353,7 @@
65
 }
66
 
67
 #endif
68
+#endif
69
 
70
 static void t1_enc_sigpass_step(
71
        opj_t1_t *t1,
72
@@ -720,18 +758,14 @@
73
                    | ((int64)(T1_SIG | T1_VISIT | T1_SIG_OTH)<<48);
74
                agg = !tmp;
75
 #else
76
+               int* flagsp = (int*)&t1->flags[(k+1) + (i+1)*(t1->h+2)];
77
+               agg = flagsp[1];
78
                if (cblksty & J2K_CCP_CBLKSTY_VSC) {
79
-                   agg = !(t1->flags[(k+1) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
80
-                       ||  t1->flags[(k+2) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
81
-                       ||  t1->flags[(k+3) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
82
-                       || (t1->flags[(k+4) + (i+1)*(t1->h+2)] 
83
-                      & (~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) & (T1_SIG | T1_VISIT | T1_SIG_OTH));
84
-               } else {
85
-                   agg = !(t1->flags[(k+1) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
86
-                        || t1->flags[(k+2) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
87
-                        || t1->flags[(k+3) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
88
-                        || t1->flags[(k+4) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH));
89
+                   agg &= ~((T1_SIG_S|T1_SIG_SE|T1_SIG_SW|T1_SGN_S)<<16);
90
                }
91
+               agg |= flagsp[0];
92
+               agg &= (T1_SIG|T1_VISIT|T1_SIG_OTH)|(T1_SIG|T1_VISIT|T1_SIG_OTH)<<16;
93
+               agg = !agg;
94
 #endif
95
            } else {
96
                agg = 0;
97
@@ -820,7 +854,7 @@
98
    memset(t1->data,0,datasize * sizeof(int));
99
 
100
    flagssize=(h+2) * (w+2);
101
-#ifdef __amd64__
102
+#if defined(__amd64__) || defined(__MMX__)
103
    /* 64 bit SIMD/SWAR in t1_updateflags requires one short of headroom
104
       because three shorts = 48 bits. */
105
    ++flagssize;
106
@@ -886,6 +920,9 @@
107
        int correction = 3;
108
        type = ((bpno < (cblk->numbps - 4)) && (passtype < 2) && (cblksty & J2K_CCP_CBLKSTY_LAZY)) ? T1_TYPE_RAW : T1_TYPE_MQ;
109
        
110
+#if !defined(__amd64__) && defined(__MMX__)
111
+   _mm_empty();
112
+#endif
113
        switch (passtype) {
114
            case 0:
115
                t1_enc_sigpass(t1, bpno, orient, &nmsedec, type, cblksty);
116
@@ -900,6 +937,9 @@
117
                    mqc_segmark_enc(mqc);
118
                break;
119
        }
120
+#if !defined(__amd64__) && defined(__MMX__)
121
+   _mm_empty();
122
+#endif
123
        
124
        /* fixed_quality */
125
        cumwmsedec += t1_getwmsedec(nmsedec, compno, level, orient, bpno, qmfbid, stepsize, numcomps);
126
@@ -1004,6 +1044,9 @@
127
    mqc_setstate(mqc, T1_CTXNO_AGG, 0, 3);
128
    mqc_setstate(mqc, T1_CTXNO_ZC, 0, 4);
129
    
130
+#if !defined(__amd64__) && defined(__MMX__)
131
+   _mm_empty();
132
+#endif
133
    for (segno = 0; segno < cblk->numsegs; ++segno) {
134
        opj_tcd_seg_t *seg = &cblk->segs[segno];
135
        
136
@@ -1044,6 +1087,9 @@
137
            }
138
        }
139
    }
140
+#if !defined(__amd64__) && defined(__MMX__)
141
+   _mm_empty();
142
+#endif
143
 }
144
 
145
 /* ----------------------------------------------------------------------- */
146