diff -urN -x '*.orig' -x '*.rej' -x '*~' -x '.*' OpenJPEG.orig/libopenjpeg/t1.c OpenJPEG.patched/libopenjpeg/t1.c --- OpenJPEG.orig/libopenjpeg/t1.c 2007-11-13 13:52:05.000000000 -0600 +++ OpenJPEG.patched/libopenjpeg/t1.c 2007-11-14 01:09:40.000000000 -0600 @@ -33,6 +33,17 @@ #include "opj_includes.h" #include "t1_luts.h" +/* Don't use MMX on amd64 */ +/* Note that merely including mmintrin.h, even if we don't use it, changes the code gcc */ +/* outputs on amd64, and it is measurably slower. A bug in gcc? */ +#ifdef __amd64__ +#undef __MMX__ +#endif + +#ifdef __MMX__ +#include <mmintrin.h> +#endif + /** @defgroup T1 T1 - Implementation of the tier-1 coding */ /*@{*/ @@ -45,7 +56,7 @@ static char t1_getspb(int f); static short t1_getnmsedec_sig(int x, int bitpos); static short t1_getnmsedec_ref(int x, int bitpos); -#ifdef __amd64__ +#if defined(__amd64__) || defined(__MMX__) static INLINE void t1_updateflags(flag_t *flagsp, int s, int stride); #else static void t1_updateflags(flag_t *flagsp, int s, int stride); @@ -293,6 +304,32 @@ } #else +#ifdef __MMX__ + +static void t1_updateflags(flag_t *flagsp, int s, int stride) { + static const __v4hi mod[] = { + {T1_SIG_SE, T1_SIG_E, T1_SIG_NE, 0}, + {T1_SIG_SE, T1_SIG_E|T1_SGN_E, T1_SIG_NE, 0}, + {T1_SIG_S, T1_SIG, T1_SIG_N, 0}, + {T1_SIG_S|T1_SGN_S, T1_SIG, T1_SIG_N|T1_SGN_N, 0}, + {T1_SIG_SW, T1_SIG_W, T1_SIG_NW, 0}, + {T1_SIG_SW, T1_SIG_W|T1_SGN_W, T1_SIG_NW, 0} + }; + + __m64 tmp1 = *(__m64*)((void*)&flagsp[-1 - stride]); + __m64 tmp2 = *(__m64*)((void*)&flagsp[-1 ]); + __m64 tmp3 = *(__m64*)((void*)&flagsp[-1 + stride]); + + tmp1 = _mm_or_si64(tmp1, mod[s]); + tmp2 = _mm_or_si64(tmp2, mod[s+2]); + tmp3 = _mm_or_si64(tmp3, mod[s+4]); + + *(__m64*)((void*)&flagsp[-1 - stride]) = tmp1; + *(__m64*)((void*)&flagsp[-1 ]) = tmp2; + *(__m64*)((void*)&flagsp[-1 + stride]) = tmp3; +} + +#else static void t1_updateflags(flag_t *flagsp, int s, int stride) { static const flag_t mod[] = { @@ -316,6 +353,7 @@ } #endif +#endif static void 
t1_enc_sigpass_step( opj_t1_t *t1, @@ -720,18 +758,14 @@ | ((int64)(T1_SIG | T1_VISIT | T1_SIG_OTH)<<48); agg = !tmp; #else + int* flagsp = (int*)&t1->flags[(k+1) + (i+1)*(t1->h+2)]; + agg = flagsp[1]; if (cblksty & J2K_CCP_CBLKSTY_VSC) { - agg = !(t1->flags[(k+1) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH) - || t1->flags[(k+2) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH) - || t1->flags[(k+3) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH) - || (t1->flags[(k+4) + (i+1)*(t1->h+2)] - & (~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) & (T1_SIG | T1_VISIT | T1_SIG_OTH)); - } else { - agg = !(t1->flags[(k+1) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH) - || t1->flags[(k+2) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH) - || t1->flags[(k+3) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH) - || t1->flags[(k+4) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)); + agg &= ~((T1_SIG_S|T1_SIG_SE|T1_SIG_SW|T1_SGN_S)<<16); } + agg |= flagsp[0]; + agg &= (T1_SIG|T1_VISIT|T1_SIG_OTH)|(T1_SIG|T1_VISIT|T1_SIG_OTH)<<16; + agg = !agg; #endif } else { agg = 0; @@ -820,7 +854,7 @@ memset(t1->data,0,datasize * sizeof(int)); flagssize=(h+2) * (w+2); -#ifdef __amd64__ +#if defined(__amd64__) || defined(__MMX__) /* 64 bit SIMD/SWAR in t1_updateflags requires one short of headroom because three shorts = 48 bits. */ ++flagssize; @@ -886,6 +920,9 @@ int correction = 3; type = ((bpno < (cblk->numbps - 4)) && (passtype < 2) && (cblksty & J2K_CCP_CBLKSTY_LAZY)) ? 
T1_TYPE_RAW : T1_TYPE_MQ; +#if !defined(__amd64__) && defined(__MMX__) + _mm_empty(); +#endif switch (passtype) { case 0: t1_enc_sigpass(t1, bpno, orient, &nmsedec, type, cblksty); @@ -900,6 +937,9 @@ mqc_segmark_enc(mqc); break; } +#if !defined(__amd64__) && defined(__MMX__) + _mm_empty(); +#endif /* fixed_quality */ cumwmsedec += t1_getwmsedec(nmsedec, compno, level, orient, bpno, qmfbid, stepsize, numcomps); @@ -1004,6 +1044,9 @@ mqc_setstate(mqc, T1_CTXNO_AGG, 0, 3); mqc_setstate(mqc, T1_CTXNO_ZC, 0, 4); +#if !defined(__amd64__) && defined(__MMX__) + _mm_empty(); +#endif for (segno = 0; segno < cblk->numsegs; ++segno) { opj_tcd_seg_t *seg = &cblk->segs[segno]; @@ -1044,6 +1087,9 @@ } } } +#if !defined(__amd64__) && defined(__MMX__) + _mm_empty(); +#endif } /* ----------------------------------------------------------------------- */