Merge pull request #4099 from juj/sse3

kripken · kripken · commit 7759b2fe435c · 2016-02-14T10:26:58.000-08:00
Sse3
diff --git a/emcc.py b/emcc.py
@@ -685,6 +685,11 @@ def validate_arg_level(level_string, max_level, err_msg):
         newargs.append('-D__SSE__=1')
         newargs.append('-D__SSE2__=1')
         newargs[i] = ''
+      elif newargs[i] == '-msse3':
+        newargs.append('-D__SSE__=1')
+        newargs.append('-D__SSE2__=1')
+        newargs.append('-D__SSE3__=1')
+        newargs[i] = ''
 
     if should_exit:
       sys.exit(0)
diff --git a/site/source/docs/porting/simd.rst b/site/source/docs/porting/simd.rst
@@ -17,7 +17,7 @@ There are three different ways to generate code to benefit from SIMD instruction
 
 - Emscripten supports the GCC/Clang compiler specific `SIMD Vector Extensions <https://gcc.gnu.org/onlinedocs/gcc/Vector-Extensions.html>`_. These constructs do not require any changes to the command line build flags, but any code that utilizes the vector built-ins will always unconditionally emit SIMD.js vector instructions.
 
-- A third option is to use the x86 SSE intrinsics. Emscripten has full support for compiling code that utilizes the SSE1 and SSE2 intrinsic function calls. To enable SSE1 intrinsics support, pass the compiler flag -msse, and add in a #include <xmmintrin.h>. To build SSE2 intrinsics code, pass the compiler flag -msse2, and use #include <emmintrin.h>.
+- A third option is to use the x86 SSE intrinsics. Emscripten has full support for compiling code that utilizes the SSE1, SSE2 and SSE3 intrinsic function calls. To enable SSE1 intrinsics support, pass the compiler flag -msse, and add in a #include <xmmintrin.h>. To build SSE2 intrinsics code, pass the compiler flag -msse2, and use #include <emmintrin.h>. For SSE3, pass -msse3 and #include <pmmintrin.h>.
 
 These three methods are not mutually exclusive, but may freely be combined.
 
@@ -30,9 +30,9 @@ When porting native SIMD code, it should be noted that because of portability co
 
  - The SIMD types supported by SIMD.js are Float32x4, Int32x4, Uint32x4, Int16x8, Uint16x8, Int8x16 and Uint8x16. In particular, Float64x2 and Int64x2 are currently not supported, however Float64x2 is emulated in software in the current polyfill. 256-bit or wider SIMD types (AVX) are not supported either.
 
- - Even though the full set of SSE1 and SSE2 intrinsics are supported, because of the platform-abstract nature of SIMD.js, some of these intrinsics will compile down to scalarized instructions to emulate. To verify which instructions are accelerated and which are not, examine the code in the platform headers `xmmintrin.h <https://github.com/kripken/emscripten/blob/incoming/system/include/emscripten/xmmintrin.h>`_ and `emmintrin.h <https://github.com/kripken/emscripten/blob/incoming/system/include/emscripten/xmmintrin.h>`_.
+ - Even though the full set of SSE1, SSE2 and SSE3 intrinsics is supported, because of the platform-abstract nature of SIMD.js, some of these intrinsics will compile down to scalarized instructions that emulate them. To verify which instructions are accelerated and which are not, examine the code in the platform headers `xmmintrin.h <https://github.com/kripken/emscripten/blob/incoming/system/include/emscripten/xmmintrin.h>`_, `emmintrin.h <https://github.com/kripken/emscripten/blob/incoming/system/include/emscripten/emmintrin.h>`_ and `pmmintrin.h <https://github.com/kripken/emscripten/blob/incoming/system/include/emscripten/pmmintrin.h>`_.
 
- - Currently the Intel x86 SIMD support is limited to SSE1 and SSE2 instruction sets. The Intel x86 SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2 and FMA instruction sets or newer are not supported. Also, the old Intel x86 MMX instruction set is not supported.
+ - Currently the Intel x86 SIMD support is limited to SSE1, SSE2 and SSE3 instruction sets. The Intel x86 SSSE3, SSE4.1, SSE4.2, AVX, AVX2 and FMA instruction sets or newer are not supported. Also, the old Intel x86 MMX instruction set is not supported.
 
  - SIMD.js does not have control over managing floating point rounding modes or handling denormals.
 
diff --git a/system/include/emscripten/pmmintrin.h b/system/include/emscripten/pmmintrin.h
@@ -0,0 +1,156 @@
+/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __PMMINTRIN_H
+#define __PMMINTRIN_H
+
+#include <emmintrin.h>
+
+#ifndef __SSE3__
+#error "SSE3 instruction set not enabled"
+#endif
+
+/* Default attributes for every function in this file: always inlined, no debug info. Only the non-Emscripten variant adds __target__("sse3"). The macro is #undef'd again at the end of this header. */
+#ifdef __EMSCRIPTEN__
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+#else
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse3")))
+#endif
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_lddqu_si128(__m128i const *__p) /* Loads one __m128i from __p. */
+{
+#ifdef __EMSCRIPTEN__
+  return _mm_loadu_si128(__p); /* Emscripten: forwarded to the SSE2 load _mm_loadu_si128. */
+#else
+  return (__m128i)__builtin_ia32_lddqu((char const *)__p); /* Other targets: the compiler's lddqu builtin. */
+#endif
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_addsub_ps(__m128 __a, __m128 __b) /* Result lanes: { a0 - b0, a1 + b1, a2 - b2, a3 + b3 }. */
+{
+#ifdef __EMSCRIPTEN__
+  return _mm_add_ps(__a, _mm_mul_ps(__b, _mm_set_ps(1.f, -1.f, 1.f, -1.f))); /* _mm_set_ps lists lanes from high to low, so lanes 0 and 2 of __b are negated before the add. */
+#else
+  return __builtin_ia32_addsubps(__a, __b);
+#endif
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_hadd_ps(__m128 __a, __m128 __b) /* Result lanes: { a0 + a1, a2 + a3, b0 + b1, b2 + b3 }. */
+{
+#ifdef __EMSCRIPTEN__
+  return _mm_add_ps(_mm_shuffle_ps(__a, __b, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_ps(__a, __b, _MM_SHUFFLE(3, 1, 3, 1))); /* { a0, a2, b0, b2 } + { a1, a3, b1, b3 } */
+#else
+  return __builtin_ia32_haddps(__a, __b);
+#endif
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_hsub_ps(__m128 __a, __m128 __b) /* Result lanes: { a0 - a1, a2 - a3, b0 - b1, b2 - b3 }. */
+{
+#ifdef __EMSCRIPTEN__
+  return _mm_sub_ps(_mm_shuffle_ps(__a, __b, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_ps(__a, __b, _MM_SHUFFLE(3, 1, 3, 1))); /* { a0, a2, b0, b2 } - { a1, a3, b1, b3 } */
+#else
+  return __builtin_ia32_hsubps(__a, __b);
+#endif
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_movehdup_ps(__m128 __a) /* Result lanes: { a1, a1, a3, a3 }. */
+{
+  return __builtin_shufflevector(__a, __a, 1, 1, 3, 3); /* Same code path for Emscripten and other targets. */
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_moveldup_ps(__m128 __a) /* Result lanes: { a0, a0, a2, a2 }. */
+{
+  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2); /* Same code path for Emscripten and other targets. */
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_addsub_pd(__m128d __a, __m128d __b) /* Result lanes: { a0 - b0, a1 + b1 }. */
+{
+#ifdef __EMSCRIPTEN__
+  return _mm_add_pd(__a, _mm_mul_pd(__b, _mm_set_pd(1.0, -1.0))); /* _mm_set_pd lists lanes from high to low, so lane 0 of __b is negated before the add. */
+#else
+  return __builtin_ia32_addsubpd(__a, __b);
+#endif
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_hadd_pd(__m128d __a, __m128d __b) /* Result lanes: { a0 + a1, b0 + b1 }. */
+{
+#ifdef __EMSCRIPTEN__
+  return _mm_add_pd(_mm_shuffle_pd(__a, __b, _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(__a, __b, _MM_SHUFFLE2(1, 1))); /* { a0, b0 } + { a1, b1 } */
+#else
+  return __builtin_ia32_haddpd(__a, __b);
+#endif
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_hsub_pd(__m128d __a, __m128d __b) /* Result lanes: { a0 - a1, b0 - b1 }. */
+{
+#ifdef __EMSCRIPTEN__
+  return _mm_sub_pd(_mm_shuffle_pd(__a, __b, _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(__a, __b, _MM_SHUFFLE2(1, 1))); /* { a0, b0 } - { a1, b1 } */
+#else
+  return __builtin_ia32_hsubpd(__a, __b);
+#endif
+}
+
+#define        _mm_loaddup_pd(dp)        _mm_load1_pd(dp) /* Alias: forwards unchanged to the SSE2 intrinsic _mm_load1_pd. */
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_movedup_pd(__m128d __a) /* Result lanes: { a0, a0 }. */
+{
+  return __builtin_shufflevector(__a, __a, 0, 0); /* Same code path for Emscripten and other targets. */
+}
+
+#define _MM_DENORMALS_ZERO_ON   (0x0040) /* Mode values accepted by _MM_SET_DENORMALS_ZERO_MODE and produced by _MM_GET_DENORMALS_ZERO_MODE. */
+#define _MM_DENORMALS_ZERO_OFF  (0x0000)
+/* Mask of the single bit (0x0040) that distinguishes the two modes. NOTE(review): presumably the MXCSR DAZ flag on x86 - confirm. */
+#define _MM_DENORMALS_ZERO_MASK (0x0040)
+/* Both accessors go through _mm_getcsr()/_mm_setcsr(); the setter clears the masked bit and ORs in (x). NOTE(review): these are not guarded by __EMSCRIPTEN__ - confirm the Emscripten xmmintrin.h provides both functions. */
+#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
+#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
+
+#ifndef __EMSCRIPTEN__
+/* _mm_monitor and _mm_mwait forward their arguments to the ia32 monitor/mwait builtins; both are compiled out of Emscripten builds. */
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_monitor((void *)__p, __extensions, __hints); /* The cast drops const because the builtin is called with a void pointer. */
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mwait(unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_mwait(__extensions, __hints);
+}
+
+#endif /* !__EMSCRIPTEN__ */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __PMMINTRIN_H */
diff --git a/system/include/emscripten/x86intrin.h b/system/include/emscripten/x86intrin.h
@@ -0,0 +1,20 @@
+#ifndef __X86INTRIN_H
+#define __X86INTRIN_H
+
+// x86intrin.h is the standard include-all for all supported intrinsics.
+// Each intrinsics header is pulled in only when its __SSE__ / __SSE2__ / __SSE3__ macro is set to a non-zero value.
+#if __SSE__
+#include <xmmintrin.h>
+#else
+#warning x86intrin.h included without SIMD.js support enabled.
+#endif
+
+#if __SSE2__
+#include <emmintrin.h>
+#endif
+
+#if __SSE3__
+#include <pmmintrin.h>
+#endif
+
+#endif // __X86INTRIN_H
diff --git a/tests/test_core.py b/tests/test_core.py
@@ -5840,6 +5840,21 @@ def test_sse2_full(self):
       self.emcc_args = orig_args + mode + ['-I' + path_from_root('tests'), '-msse2'] + args
       self.do_run(open(path_from_root('tests', 'test_sse2_full.cpp'), 'r').read(), native_result)
 
+  # Tests the full SSE3 API: tests/test_sse3_full.cpp is built natively with clang, and its output is the reference the emcc builds must reproduce.
+  @SIMD
+  def test_sse3_full(self):
+    args = []
+    if '-O0' in self.emcc_args: args += ['-D_DEBUG=1'] # tests/test_sse3_full.cpp skips its optimizer-sensitive case unless _DEBUG is defined
+    Popen([CLANG, path_from_root('tests', 'test_sse3_full.cpp'), '-o', 'test_sse3_full', '-D_CRT_SECURE_NO_WARNINGS=1', '-msse3'] + args + get_clang_native_args(), env=get_clang_native_env(), stdout=PIPE).communicate() # native reference build
+    native_result, err = Popen('./test_sse3_full', stdout=PIPE).communicate()
+    native_result = native_result.replace('\r\n', '\n') # Windows line endings fix
+    # The native program's stdout is the expected output for both emcc configurations below.
+    Settings.PRECISE_F32 = 1 # SIMD currently requires Math.fround
+    orig_args = self.emcc_args
+    for mode in [[], ['-s', 'SIMD=1']]: # first without, then with, -s SIMD=1
+      self.emcc_args = orig_args + mode + ['-I' + path_from_root('tests'), '-msse3'] + args
+      self.do_run(open(path_from_root('tests', 'test_sse3_full.cpp'), 'r').read(), native_result)
+
   @SIMD
   def test_simd(self):
     test_path = path_from_root('tests', 'core', 'test_simd')
diff --git a/tests/test_sse3_full.cpp b/tests/test_sse3_full.cpp
@@ -0,0 +1,40 @@
+// This file uses SSE3 by calling different functions with different interesting inputs and prints the results.
+// Use a diff tool to compare the results between platforms.
+
+#include <pmmintrin.h>
+#define ENABLE_SSE2
+#include "test_sse_full.h"
+
+#ifndef _DEBUG
+// The following tests break when optimizer is applied, so disable them for now. Baby steps.
+// See https://github.com/kripken/emscripten/issues/3789
+#define BREAKS_UNDER_OPTIMIZATION
+#endif
+
+float *interesting_floats = get_interesting_floats(); // NOTE(review): the get_interesting_*() helpers and interesting_*_ arrays are not defined in this file - presumably they come from test_sse_full.h; confirm.
+int numInterestingFloats = sizeof(interesting_floats_)/sizeof(interesting_floats_[0]);
+uint32_t *interesting_ints = get_interesting_ints();
+int numInterestingInts = sizeof(interesting_ints_)/sizeof(interesting_ints_[0]);
+double *interesting_doubles = get_interesting_doubles();
+int numInterestingDoubles = sizeof(interesting_doubles_)/sizeof(interesting_doubles_[0]);
+
+int main() // Runs each SSE3 intrinsic under test through a Ret_* helper macro; per the file header, the results are printed for comparison between platforms.
+{
+	assert(numInterestingFloats % 4 == 0); // every input table length must be a multiple of 4
+	assert(numInterestingInts % 4 == 0);
+	assert(numInterestingDoubles % 4 == 0);	
+
+	Ret_M128d_M128d(__m128d, _mm_addsub_pd);
+	Ret_M128_M128(__m128, _mm_addsub_ps);
+	Ret_M128d_M128d(__m128d, _mm_hadd_pd);
+	Ret_M128_M128(__m128, _mm_hadd_ps);
+	Ret_M128d_M128d(__m128d, _mm_hsub_pd);
+	Ret_M128_M128(__m128, _mm_hsub_ps);
+#ifndef BREAKS_UNDER_OPTIMIZATION
+	Ret_IntPtr(__m128i, _mm_lddqu_si128, __m128i*, 4, 1); // only compiled when _DEBUG is defined, since BREAKS_UNDER_OPTIMIZATION is set otherwise
+#endif
+	Ret_DoublePtr(__m128d, _mm_loaddup_pd, 1, 1);
+	Ret_M128d(__m128d, _mm_movedup_pd);
+	Ret_M128(__m128, _mm_movehdup_ps);
+	Ret_M128(__m128, _mm_moveldup_ps);
+}