Add SSE2/SSE4.1 version of od_bin_fdct4x4.

Multimedia / Daala - Andreas Gal [gmail.com] - 2 September 2014 17:18 EDT

Currently requires configuring with -msse4.1 in your CFLAGS to build the SSE4.1 version.

###

diff --git a/Makefile.am b/Makefile.am
index b357e87..e2d6fdb 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -41,6 +41,17 @@ noinst_HEADERS = \
src/x86/x86enc.h \
src/x86/x86int.h

+src_dct_SOURCES = src/dct.c
+if ENABLE_X86ASM
+src_dct_SOURCES += src/x86/cpu.c
+if ENABLE_SSE2_INTRINSICS
+src_dct_SOURCES += src/x86/sse2dct.c
+endif
+if ENABLE_SSE41_INTRINSICS
+src_dct_SOURCES += src/x86/sse41dct.c
+endif
+endif
+
src_libdaalabase_la_CFLAGS = $(OGG_CFLAGS)
src_libdaalabase_la_LIBADD = $(OGG_LIBS) $(LIBM)
if DUMP_IMAGES
@@ -52,7 +63,6 @@ src_libdaalabase_la_LDFLAGS +=
src_libdaalabase_la_SOURCES = \
src/adapt.c \
src/block_size.c \
- src/dct.c \
src/entcode.c \
src/entdec.c \
src/filter.c \
@@ -75,10 +85,10 @@ src_libdaalabase_la_SOURCES = \
src/tf.c \
src/zigzag4.c \
src/zigzag8.c \
- src/zigzag16.c
+ src/zigzag16.c \
+ $(src_dct_SOURCES)
if ENABLE_X86ASM
src_libdaalabase_la_SOURCES += \
- src/x86/cpu.c \
src/x86/sse2mc.c \
src/x86/x86state.c
endif
@@ -222,7 +232,7 @@ tools_y4m2png_LDADD = $(THEORA_LIBS) $(OGG_LIBS) $(PNG_LIBS)
tools_dump_psnrhvs_SOURCES = \
tools/vidinput.c \
tools/y4m_input.c \
- src/dct.c \
+ $(src_dct_SOURCES) \
tools/dump_psnrhvs.c
tools_dump_psnrhvs_CFLAGS = $(THEORA_CFLAGS) $(OGG_CFLAGS) $(PNG_CFLAGS)
tools_dump_psnrhvs_LDADD = $(THEORA_LIBS) $(OGG_LIBS) $(PNG_LIBS) $(LIBM)
@@ -258,7 +268,7 @@ tools_block_size_analysis_SOURCES = \
src/block_size_enc.c \
tools/vidinput.c \
tools/y4m_input.c \
- src/dct.c \
+ $(src_dct_SOURCES) \
src/internal.c \
src/generic_encoder.c \
src/generic_code.c \
@@ -320,7 +330,7 @@ tools_intra_stats_SOURCES = \
src/entenc.c \
src/entcode.c \
src/filter.c \
- src/dct.c \
+ $(src_dct_SOURCES) \
src/intra.c \
src/switch_table.c \
src/tf.c \
@@ -354,7 +364,7 @@ tools_intra_pred_SOURCES = \
src/entenc.c \
src/entcode.c \
src/filter.c \
- src/dct.c \
+ $(src_dct_SOURCES) \
src/intra.c \
src/intradata.c \
src/internal.c \
@@ -387,7 +397,7 @@ tools_intra_trace_SOURCES = \
src/entenc.c \
src/entcode.c \
src/filter.c \
- src/dct.c \
+ $(src_dct_SOURCES) \
src/intra.c \
src/switch_table.c \
src/tf.c \
@@ -416,7 +426,7 @@ tools_trans_SOURCES = \
tools/svd.c \
tools/cholesky.c \
src/filter.c \
- src/dct.c \
+ $(src_dct_SOURCES) \
src/intradata.c
tools_trans_CFLAGS = $(THEORA_CFLAGS) $(OGG_CFLAGS) $(PNG_CFLAGS) $(OPENMP_CFLAGS)
tools_trans_LDADD = $(THEORA_LIBS) $(OGG_LIBS) $(PNG_LIBS) $(LIBM)
@@ -430,7 +440,7 @@ tools_trans_gain_SOURCES = \
tools/od_filter.c \
tools/trans_tools.c \
src/filter.c \
- src/dct.c \
+ $(src_dct_SOURCES) \
src/intradata.c
tools_trans_gain_CFLAGS = $(THEORA_CFLAGS) $(OGG_CFLAGS) $(PNG_CFLAGS) $(OPENMP_CFLAGS)
tools_trans_gain_LDADD = $(THEORA_LIBS) $(OGG_LIBS) $(PNG_LIBS) $(LIBM)
@@ -455,7 +465,7 @@ tools_trans2d_SOURCES = \
tools/svd.c \
tools/cholesky.c \
src/filter.c \
- src/dct.c \
+ $(src_dct_SOURCES) \
src/intradata.c
tools_trans2d_CFLAGS = $(THEORA_CFLAGS) $(OGG_CFLAGS) $(PNG_CFLAGS) $(OPENMP_CFLAGS)
tools_trans2d_LDADD = $(THEORA_LIBS) $(OGG_LIBS) $(PNG_LIBS) $(LIBM)
@@ -469,7 +479,7 @@ tools_init_intra_xform_SOURCES = \
tools/vidinput.c \
tools/y4m_input.c \
src/filter.c \
- src/dct.c \
+ $(src_dct_SOURCES) \
src/intra.c \
src/tf.c \
src/internal.c \
@@ -545,7 +555,7 @@ TESTS = \
src/tests/test_filters \
src/tests/check_tests

-src_tests_dcttest_SOURCES = src/dct.c src/filter.c src/internal.c
+src_tests_dcttest_SOURCES = $(src_dct_SOURCES) src/filter.c src/internal.c
src_tests_dcttest_CFLAGS = $(OGG_CFLAGS) \
-DOD_DCT_CHECK_OVERFLOW -DOD_DCT_TEST -DOD_ENABLE_ASSERTIONS
src_tests_dcttest_LDADD = $(LIBM)
diff --git a/configure.ac b/configure.ac
index 6c5e8ad..976bd0d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -170,10 +170,36 @@ case $host_cpu in
;;
esac

-AM_CONDITIONAL([ENABLE_X86ASM], [test "$enable_asm" = "yes" -a "$cpu_x86" = "true"])
AS_IF([test "$enable_asm" = "yes" -a "$cpu_x86" = "true"], [
AC_DEFINE([OD_X86ASM], [1], [Enable asm optimisations])
+ AC_TRY_COMPILE([
+#include <emmintrin.h>
+ ], [
+ return _mm_cvtsi128_si32(_mm_setzero_si128());
+ ], [
+ enable_sse2_intrinsics=yes
+ AC_DEFINE([OD_SSE2_INTRINSICS], [1],
+ [Enable SSE2 intrinsics optimisations])
+ AC_TRY_COMPILE([
+#include <smmintrin.h>
+ ], [
+ return _mm_cvtsi128_si32(_mm_mullo_epi32(_mm_setzero_si128(),
+ _mm_setzero_si128()));
+ ], [
+ enable_sse41_intrinsics=yes
+ AC_DEFINE([OD_SSE41_INTRINSICS], [1],
+ [Enable SSE4.1 intrinsics optimisations])
+ ], [enable_sse41_intrinsics=no]
+ )
+ ], [enable_sse2_intrinsics=no]
+ )
])
+AM_CONDITIONAL([ENABLE_X86ASM],
+ [test "$enable_asm" = "yes" -a "$cpu_x86" = "true"])
+AM_CONDITIONAL([ENABLE_SSE2_INTRINSICS],
+ [test "$enable_asm" = "yes" -a "$cpu_x86" = "true" -a "$enable_sse2_intrinsics" = "yes"])
+AM_CONDITIONAL([ENABLE_SSE41_INTRINSICS],
+ [test "$enable_asm" = "yes" -a "$cpu_x86" = "true" -a "$enable_sse41_intrinsics" = "yes"])

AC_ARG_ENABLE([encoder-check],
AS_HELP_STRING([--enable-encoder-check], [Compare reconstructed frames]),,
diff --git a/src/x86/sse2dct.c b/src/x86/sse2dct.c
new file mode 100644
index 0000000..012500e
--- /dev/null
+++ b/src/x86/sse2dct.c
@@ -0,0 +1,147 @@
+/*Daala video codec
+Copyright (c) 2002-2013 Daala project contributors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.*/
+
+#if defined(HAVE_CONFIG_H)
+# include "config.h"
+#endif
+
+#include <emmintrin.h>
+#include "../dct.h"
+#include "x86int.h"
+
+/*Per-lane 32-bit right shift with a bias term added first.
+  The top b bits of each lane (taken as an unsigned value via a logical shift
+   by 32 - b) are added to the lane before the arithmetic shift by b.
+  For b == 1 the bias is just the sign bit, so negative odd values round toward
+   zero instead of toward minus infinity.
+  a: Four packed signed 32-bit values.
+  b: Shift amount in bits.
+  Return: The four shifted values.*/
+OD_SIMD_INLINE __m128i od_unbiased_rshift_epi32(__m128i a, int b) {
+  __m128i bias;
+  __m128i biased;
+  bias = _mm_srli_epi32(a, 32 - b);
+  biased = _mm_add_epi32(bias, a);
+  return _mm_srai_epi32(biased, b);
+}
+
+/*Runs OD_DCT_OVERFLOW_CHECK on each of the four 32-bit lanes of val.
+  The check is only compiled in when both OD_DCT_TEST and
+   OD_DCT_CHECK_OVERFLOW are defined; otherwise the function does nothing.
+  val: Four packed 32-bit values about to be multiplied.
+  scale, offset, idx: Passed through unchanged to OD_DCT_OVERFLOW_CHECK.*/
+OD_SIMD_INLINE void od_overflow_check_epi32(__m128i val, ogg_int32_t scale,
+ ogg_int32_t offset, int idx) {
+#if defined(OD_DCT_TEST) && defined(OD_DCT_CHECK_OVERFLOW)
+  ogg_int32_t mem[4];
+  int n;
+  /*mem is a plain stack array with no 16-byte alignment guarantee, so the
+     unaligned store must be used; the aligned form may fault here.*/
+  _mm_storeu_si128((__m128i *)mem, val);
+  for (n = 0; n < 4; n++) {
+    OD_DCT_OVERFLOW_CHECK(mem[n], scale, offset, idx);
+  }
+#endif
+  /*Silence unused-parameter warnings when the check is compiled out.*/
+  (void)val;
+  (void)scale;
+  (void)offset;
+  (void)idx;
+}
+
+/*This is overridden by the SSE4.1 version.*/
+#if !defined(OD_MULLO_EPI32)
+/*Multiplies each 32-bit lane of a by the scalar b1, keeping the low 32 bits
+   of every product.
+  _mm_mul_epu32 only multiplies lanes 0 and 2, so the odd lanes are handled by
+   a second multiply on operands shifted down by one lane (4 bytes).
+  The low halves of the four 64-bit products are then gathered and interleaved
+   back into lane order.
+  a: Four packed 32-bit values.
+  b1: Scalar multiplier applied to every lane.
+  Return: The four 32-bit products.*/
+OD_SIMD_INLINE __m128i od_mullo_epi32_sse2(__m128i a, int b1) {
+  __m128i factor;
+  __m128i even;
+  __m128i odd;
+  factor = _mm_set1_epi32(b1);
+  /*Products of lanes 0 and 2.*/
+  even = _mm_mul_epu32(a, factor);
+  /*Products of lanes 1 and 3.*/
+  odd = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(factor, 4));
+  /*Move the low 32 bits of each product into lanes 0 and 1.*/
+  even = _mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0));
+  odd = _mm_shuffle_epi32(odd, _MM_SHUFFLE(0, 0, 2, 0));
+  return _mm_unpacklo_epi32(even, odd);
+}
+
+# define OD_MULLO_EPI32 od_mullo_epi32_sse2
+#endif
+
+/*Fixed-point multiply used by the DCT lifting steps: computes
+   (val*scale + offset) >> shift in every 32-bit lane, using an arithmetic
+   shift.
+  The multiply goes through OD_MULLO_EPI32, so it uses whichever
+   implementation that macro currently names.
+  val: Four packed 32-bit values.
+  scale: Integer multiplier.
+  offset: Rounding term added before the shift.
+  shift: Number of bits to shift right.*/
+OD_SIMD_INLINE __m128i od_dct_mul_epi32(__m128i val, ogg_int32_t scale,
+ ogg_int32_t offset, ogg_int32_t shift) {
+  __m128i product;
+  __m128i rounded;
+  product = OD_MULLO_EPI32(val, scale);
+  rounded = _mm_add_epi32(product, _mm_set1_epi32(offset));
+  return _mm_srai_epi32(rounded, shift);
+}
+
+/*Transposes, in place, the 4x4 matrix of 32-bit values whose rows are
+   *t0, *t1, *t2 and *t3.
+  The rows are first interleaved pairwise at 32-bit granularity, then the
+   resulting 64-bit halves are recombined to form the columns.*/
+OD_SIMD_INLINE void od_transpose4(__m128i *t0, __m128i *t1,
+ __m128i *t2, __m128i *t3) {
+  __m128i lo01;
+  __m128i lo23;
+  __m128i hi01;
+  __m128i hi23;
+  lo01 = _mm_unpacklo_epi32(*t0, *t1);
+  hi01 = _mm_unpackhi_epi32(*t0, *t1);
+  lo23 = _mm_unpacklo_epi32(*t2, *t3);
+  hi23 = _mm_unpackhi_epi32(*t2, *t3);
+  *t0 = _mm_unpacklo_epi64(lo01, lo23);
+  *t1 = _mm_unpackhi_epi64(lo01, lo23);
+  *t2 = _mm_unpacklo_epi64(hi01, hi23);
+  *t3 = _mm_unpackhi_epi64(hi01, hi23);
+}
+
+/*Loads four rows of coefficients, xstride elements apart, into *t0..*t3.
+  Uses aligned loads, so every row start (x + k*xstride) must be 16-byte
+   aligned.
+  Each load reads 16 bytes; presumably od_coeff is 32 bits wide so that is
+   four coefficients per row -- confirm against the od_coeff definition.*/
+OD_SIMD_INLINE void od_load4(const od_coeff *x, int xstride,
+ __m128i *t0, __m128i *t1, __m128i *t2, __m128i *t3) {
+  const od_coeff *row0 = x;
+  const od_coeff *row1 = x + 1*xstride;
+  const od_coeff *row2 = x + 2*xstride;
+  const od_coeff *row3 = x + 3*xstride;
+  *t0 = _mm_load_si128((const __m128i *)row0);
+  *t1 = _mm_load_si128((const __m128i *)row1);
+  *t2 = _mm_load_si128((const __m128i *)row2);
+  *t3 = _mm_load_si128((const __m128i *)row3);
+}
+
+/*Stores t0..t3 as four rows of coefficients, xstride elements apart,
+   starting at x.
+  Uses aligned stores, so every row start (x + k*xstride) must be 16-byte
+   aligned.*/
+OD_SIMD_INLINE void od_store4(od_coeff *x, int xstride,
+ __m128i t0, __m128i t1, __m128i t2, __m128i t3) {
+  od_coeff *row0 = x;
+  od_coeff *row1 = x + 1*xstride;
+  od_coeff *row2 = x + 2*xstride;
+  od_coeff *row3 = x + 3*xstride;
+  _mm_store_si128((__m128i *)row0, t0);
+  _mm_store_si128((__m128i *)row1, t1);
+  _mm_store_si128((__m128i *)row2, t2);
+  _mm_store_si128((__m128i *)row3, t3);
+}
+
+/*One pass of the 4-point forward DCT over four vectors, followed by a 4x4
+   transpose of the result.
+  All arithmetic is lane-wise, so each of the four 32-bit lanes carries an
+   independent 4-point transform whose inputs are the corresponding lanes of
+   *x0..*x3.
+  Because the outputs are transposed before being written back, calling this
+   twice in a row transforms along both dimensions of a 4x4 block.
+  x0..x3: On entry, the four input vectors; on exit, the transposed
+           outputs.*/
+OD_SIMD_INLINE void od_fdct4_kernel(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3) {
+  /*9 adds, 2 shifts, 3 "muls".*/
+  __m128i t0 = *x0;
+  /*Note the swapped order: t2 takes the second input and t1 the third.*/
+  __m128i t2 = *x1;
+  __m128i t1 = *x2;
+  __m128i t3 = *x3;
+  __m128i t2h;
+  /*+1/-1 butterflies:*/
+  t3 = _mm_sub_epi32(t0, t3);
+  t2 = _mm_add_epi32(t2, t1);
+  t2h = od_unbiased_rshift_epi32(t2, 1);
+  t1 = _mm_sub_epi32(t2h, t1);
+  t0 = _mm_sub_epi32(t0, od_unbiased_rshift_epi32(t3, 1));
+  /*+ Embedded 2-point type-II DCT.*/
+  t0 = _mm_add_epi32(t0, t2h);
+  t2 = _mm_sub_epi32(t0, t2);
+  /*+ Embedded 2-point type-IV DST.*/
+  /*23013/32768 ~= 4*sin(\frac{\pi}{8}) - 2*tan(\frac{\pi}{8}) ~=
+     0.70230660471416898931046248770220*/
+  od_overflow_check_epi32(t1, 23013, 16384, 0);
+  t3 = _mm_sub_epi32(t3, od_dct_mul_epi32(t1, 23013, 16384, 15));
+  /*21407/32768~=\sqrt{1/2}*cos(\frac{\pi}{8}))
+     ~=0.65328148243818826392832158671359*/
+  od_overflow_check_epi32(t3, 21407, 16384, 1);
+  t1 = _mm_add_epi32(t1, od_dct_mul_epi32(t3, 21407, 16384, 15));
+  /*18293/16384 ~= 4*sin(\frac{\pi}{8}) - tan(\frac{\pi}{8}) ~=
+     1.1165201670872640381121512119119*/
+  /*The value being multiplied here is t1, so that is the one whose range must
+     be checked (as in the two steps above, the check mirrors the multiply).*/
+  od_overflow_check_epi32(t1, 18293, 8192, 2);
+  t3 = _mm_sub_epi32(t3, od_dct_mul_epi32(t1, 18293, 8192, 14));
+  od_transpose4(&t0, &t1, &t2, &t3);
+  *x0 = t0;
+  *x1 = t1;
+  *x2 = t2;
+  *x3 = t3;
+}
+
+/*4x4 forward DCT on a block of coefficients.
+  Four rows are loaded from x, run through od_fdct4_kernel twice (each call
+   ends with a transpose, so the second call works along the other
+   dimension), and the result is written to y.
+  Both x and y are accessed with aligned 16-byte loads/stores, one per row.
+  y: Output block.
+  ystride: Distance between output rows, in od_coeff elements.
+  x: Input block.
+  xstride: Distance between input rows, in od_coeff elements.*/
+void od_bin_fdct4x4_sse2(od_coeff *y, int ystride,
+ const od_coeff *x, int xstride) {
+  __m128i r0;
+  __m128i r1;
+  __m128i r2;
+  __m128i r3;
+  int pass;
+  od_load4(x, xstride, &r0, &r1, &r2, &r3);
+  for (pass = 0; pass < 2; pass++) {
+    od_fdct4_kernel(&r0, &r1, &r2, &r3);
+  }
+  od_store4(y, ystride, r0, r1, r2, r3);
+}
diff --git a/src/x86/sse41dct.c b/src/x86/sse41dct.c
new file mode 100644
index 0000000..84471b8
--- /dev/null
+++ b/src/x86/sse41dct.c
@@ -0,0 +1,40 @@
+/*Daala video codec
+Copyright (c) 2002-2013 Daala project contributors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.*/
+
+#if defined(HAVE_CONFIG_H)
+# include "config.h"
+#endif
+
+#include <smmintrin.h>
+#include "x86int.h"
+
+/*Multiplies each 32-bit lane of a by the scalar b, keeping the low 32 bits
+   of every product, using the single SSE4.1 _mm_mullo_epi32 instruction.
+  a: Four packed 32-bit values.
+  b: Scalar multiplier applied to every lane.*/
+OD_SIMD_INLINE __m128i od_mullo_epi32_sse41(__m128i a, int b) {
+  __m128i factor;
+  factor = _mm_set1_epi32(b);
+  return _mm_mullo_epi32(a, factor);
+}
+
+#define OD_MULLO_EPI32 od_mullo_epi32_sse41
+
+#define od_bin_fdct4x4_sse2 od_bin_fdct4x4_sse41
+
+#include "sse2dct.c"
diff --git a/src/x86/x86int.h b/src/x86/x86int.h
index f269e27..27a7c50 100644
--- a/src/x86/x86int.h
+++ b/src/x86/x86int.h
@@ -26,6 +26,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.*/
# define _x86_x86int_H (1)
# include "../state.h"

+# if OD_GNUC_PREREQ(3, 0, 0)
+# define OD_SIMD_INLINE static __inline __attribute__((always_inline))
+# else
+# define OD_SIMD_INLINE static
+# endif
+
void od_state_opt_vtbl_init_x86(od_state *_state);

void od_mc_predict1fmv8_sse2(unsigned char *_dst,const unsigned char *_src,
@@ -35,5 +41,9 @@ void od_mc_blend_full8_sse2(unsigned char *_dst,int _dystride,
const unsigned char *_src[4],int _log_xblk_sz,int _log_yblk_sz);
void od_mc_blend_full_split8_sse2(unsigned char *_dst,int _dystride,
const unsigned char *_src[4],int _c,int _s,int _log_xblk_sz,int _log_yblk_sz);
+void od_bin_fdct4x4_sse2(od_coeff *y, int ystride,
+ const od_coeff *x, int xstride);
+void od_bin_fdct4x4_sse41(od_coeff *y, int ystride,
+ const od_coeff *x, int xstride);

#endif
diff --git a/tools/init_intra_maps.c b/tools/init_intra_maps.c
index 0e16b72..c2c3dcb 100644
--- a/tools/init_intra_maps.c
+++ b/tools/init_intra_maps.c
@@ -31,8 +31,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.*/
#include
#include
#include "intra_fit_tools.h"
-#include "../src/dct.c"
-#include "../src/internal.c"
+#include "../src/dct.h"
+#include "../src/internal.h"

/*For validation purposes only.
Copied from libvpx.*/

a6ed7ae Add SSE2/SSE4.1 version of od_bin_fdct4x4.
Makefile.am | 36 +++++++-----
configure.ac | 28 ++++++++-
src/x86/sse2dct.c | 147 +++++++++++++++++++++++++++++++++++++++++++++++
src/x86/sse41dct.c | 40 +++++++++++++
src/x86/x86int.h | 10 ++++
tools/init_intra_maps.c | 4 +-
6 files changed, 249 insertions(+), 16 deletions(-)

Upstream: git.xiph.org


  • Share