Merge pull request emscripten-core#4075 from juj/simd_intrinsics_fixes

Simd intrinsics fixes
sandejulio · Feb 3, 2016 · 7bedc32 · 7bedc32
2 parents ab98ff7 + 0cf6d39
commit 7bedc32
Show file tree

Hide file tree

Showing 14 changed files with 810 additions and 19 deletions.
diff --git a/emscripten.py b/emscripten.py
@@ -483,15 +483,24 @@ def make_emulated_param(i):
                          'select', 'swizzle', 'shuffle',
                          'load', 'store', 'load1', 'store1', 'load2', 'store2', 'load3', 'store3']
     simdintboolfuncs = ['and', 'xor', 'or', 'not']
+    if metadata['simdUint8x16']:
+      simdinttypes += ['Uint8x16']
+      simdintfloatfuncs += ['fromUint8x16Bits']
     if metadata['simdInt8x16']:
-      simdinttypes += ['Int8x16', 'Uint8x16']
-      simdintfloatfuncs += ['fromInt8x16Bits', 'fromUint8x16Bits']
+      simdinttypes += ['Int8x16']
+      simdintfloatfuncs += ['fromInt8x16Bits']
+    if metadata['simdUint16x8']:
+      simdinttypes += ['Uint16x8']
+      simdintfloatfuncs += ['fromUint16x8Bits']
     if metadata['simdInt16x8']:
-      simdinttypes += ['Int16x8', 'Uint16x8']
-      simdintfloatfuncs += ['fromInt16x8Bits', 'fromUint16x8Bits']
+      simdinttypes += ['Int16x8']
+      simdintfloatfuncs += ['fromInt16x8Bits']
+    if metadata['simdUint32x4']:
+      simdinttypes += ['Uint32x4']
+      simdintfloatfuncs += ['fromUint32x4', 'fromUint32x4Bits']
     if metadata['simdInt32x4']:
-      simdinttypes += ['Int32x4', 'Uint32x4']
-      simdintfloatfuncs += ['fromInt32x4', 'fromInt32x4Bits', 'fromUint32x4Bits']
+      simdinttypes += ['Int32x4']
+      simdintfloatfuncs += ['fromInt32x4', 'fromInt32x4Bits']
     if metadata['simdFloat32x4']:
       simdfloattypes += ['Float32x4']
       simdintfloatfuncs += ['fromFloat32x4', 'fromFloat32x4Bits']
@@ -508,8 +517,8 @@ def make_emulated_param(i):
       simdbooltypes += ['Bool64x2']
 
     simdfloatfuncs = simdfuncs + simdintfloatfuncs + ['div', 'min', 'max', 'minNum', 'maxNum', 'sqrt',
-                                  'abs', 'reciprocalApproximation', 'reciprocalSqrtApproximation'];
-    simdintfuncs = simdfuncs + simdintfloatfuncs + simdintboolfuncs + ['shiftLeftByScalar', 'shiftRightByScalar'];
+                                  'abs', 'reciprocalApproximation', 'reciprocalSqrtApproximation']
+    simdintfuncs = simdfuncs + simdintfloatfuncs + simdintboolfuncs + ['shiftLeftByScalar', 'shiftRightByScalar', 'addSaturate', 'subSaturate']
     simdboolfuncs = simdfuncs + simdintboolfuncs + ['anyTrue', 'allTrue']
     simdtypes = simdfloattypes + simdinttypes + simdbooltypes
 
@@ -735,7 +744,10 @@ def string_contains_any(s, str_list):
           if sub in s:
             return True
         return False
-      nonexisting_simd_symbols = ['Int8x16_fromInt8x16', 'Int16x8_fromInt16x8', 'Int32x4_fromInt32x4', 'Float32x4_fromFloat32x4', 'Float64x2_fromFloat64x2']
+      nonexisting_simd_symbols = ['Int8x16_fromInt8x16', 'Uint8x16_fromUint8x16', 'Int16x8_fromInt16x8', 'Uint16x8_fromUint16x8', 'Int32x4_fromInt32x4', 'Uint32x4_fromUint32x4', 'Float32x4_fromFloat32x4', 'Float64x2_fromFloat64x2']
+      nonexisting_simd_symbols += ['Int32x4_addSaturate', 'Int32x4_subSaturate', 'Uint32x4_addSaturate', 'Uint32x4_subSaturate']
+      nonexisting_simd_symbols += [(x + '_' + y) for x in ['Int8x16', 'Uint8x16', 'Int16x8', 'Uint16x8', 'Float64x2'] for y in ['load2', 'load3', 'store2', 'store3']]
+      nonexisting_simd_symbols += [(x + '_' + y) for x in ['Int8x16', 'Uint8x16', 'Int16x8', 'Uint16x8'] for y in ['load1', 'store1']]  # single-lane load/store is not defined for 8- and 16-bit lane types
 
       asm_global_funcs += ''.join(['  var SIMD_' + ty + '=global' + access_quote('SIMD') + access_quote(ty) + ';\n' for ty in simdtypes])
 

diff --git a/src/ecmascript_simd.js b/src/ecmascript_simd.js
@@ -878,6 +878,11 @@ if (typeof simdPhase2 !== 'undefined') {
           "load", "store"],
   }
 
+  // XXX Emscripten: Need these functions for intrinsics, see https://github.com/tc39/ecmascript_simd/issues/316.
+  float64x2.fns.push("load1");
+  float64x2.fns.push("store1");
+  // XXX Emscripten
+
   var bool64x2 = {
     name: "Bool64x2",
     fn: SIMD.Bool64x2,

diff --git a/system/include/emscripten/vector.h b/system/include/emscripten/vector.h
@@ -67,8 +67,8 @@ float64x2 emscripten_float64x2_select(bool64x2 __a, float64x2 __b, float64x2 __c
 // n.b. No emscripten_float64x2_subSaturate, only defined on 8-bit and 16-bit integer SIMD types.
 // n.b. No emscripten_float64x2_shiftLeftByScalar, only defined on integer SIMD types.
 // n.b. No emscripten_float64x2_shiftRightByScalar, only defined on integer SIMD types.
-inline float emscripten_float64x2_extractLane(float64x2 __a, int __lane) __attribute__((__nothrow__, __const__)) { return __a[__lane]; }
-inline float64x2 emscripten_float64x2_replaceLane(float64x2 __a, int __lane, float __s) __attribute__((__nothrow__, __const__)) { __a[__lane] = __s; return __a; }
+inline double emscripten_float64x2_extractLane(float64x2 __a, int __lane) __attribute__((__nothrow__, __const__)) { return __a[__lane]; }
+inline float64x2 emscripten_float64x2_replaceLane(float64x2 __a, int __lane, double __s) __attribute__((__nothrow__, __const__)) { __a[__lane] = __s; return __a; }
 void emscripten_float64x2_store(const void *__p, float64x2 __a) __attribute__((__nothrow__));
 void emscripten_float64x2_store1(const void *__p, float64x2 __a) __attribute__((__nothrow__));
 float64x2 emscripten_float64x2_load(const void *__p) __attribute__((__nothrow__, __pure__));
@@ -450,20 +450,20 @@ uint8x16 emscripten_uint8x16_swizzle(uint8x16 __a, int __lane0, int __lane1, int
 uint8x16 emscripten_uint8x16_shuffle(uint8x16 __a, uint8x16 __b, int __lane0, int __lane1, int __lane2, int __lane3, int __lane4, int __lane5, int __lane6, int __lane7, int __lane8, int __lane9, int __lane10, int __lane11, int __lane12, int __lane13, int __lane14, int __lane15) __attribute__((__nothrow__, __const__));
 
 // Bool64x2
-bool emscripten_bool64x2_anyTrue(bool64x2 __a, bool64x2 __b) __attribute__((__nothrow__, __const__));
-bool emscripten_bool64x2_allTrue(bool64x2 __a, bool64x2 __b) __attribute__((__nothrow__, __const__));
+int emscripten_bool64x2_anyTrue(bool64x2 __a) __attribute__((__nothrow__, __const__));
+int emscripten_bool64x2_allTrue(bool64x2 __a) __attribute__((__nothrow__, __const__));
 
 // Bool32x4
-bool emscripten_bool32x4_anyTrue(bool32x4 __a, bool32x4 __b) __attribute__((__nothrow__, __const__));
-bool emscripten_bool32x4_allTrue(bool32x4 __a, bool32x4 __b) __attribute__((__nothrow__, __const__));
+int emscripten_bool32x4_anyTrue(bool32x4 __a) __attribute__((__nothrow__, __const__));
+int emscripten_bool32x4_allTrue(bool32x4 __a) __attribute__((__nothrow__, __const__));
 
 // Bool16x8
-bool emscripten_bool16x8_anyTrue(bool16x8 __a, bool16x8 __b) __attribute__((__nothrow__, __const__));
-bool emscripten_bool16x8_allTrue(bool16x8 __a, bool16x8 __b) __attribute__((__nothrow__, __const__));
+int emscripten_bool16x8_anyTrue(bool16x8 __a) __attribute__((__nothrow__, __const__));
+int emscripten_bool16x8_allTrue(bool16x8 __a) __attribute__((__nothrow__, __const__));
 
 // Bool8x16
-bool emscripten_bool8x16_anyTrue(bool8x16 __a, bool8x16 __b) __attribute__((__nothrow__, __const__));
-bool emscripten_bool8x16_allTrue(bool8x16 __a, bool8x16 __b) __attribute__((__nothrow__, __const__));
+int emscripten_bool8x16_anyTrue(bool8x16 __a) __attribute__((__nothrow__, __const__));
+int emscripten_bool8x16_allTrue(bool8x16 __a) __attribute__((__nothrow__, __const__));
 
 #ifdef __cplusplus
 }

diff --git a/tests/core/test_simd_float32x4.c b/tests/core/test_simd_float32x4.c
@@ -0,0 +1,98 @@
+#include <emscripten/vector.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+
+void dump(const char *name, float32x4 vec)
+{
+    printf("%s: %f %f %f %f\n", name, emscripten_float32x4_extractLane(vec, 0), emscripten_float32x4_extractLane(vec, 1), emscripten_float32x4_extractLane(vec, 2), emscripten_float32x4_extractLane(vec, 3));
+}
+#define DUMP(V) dump(#V, (V))
+
+void dumpBytes(const char *name, const void *bytes, int n)
+{
+    printf("%s:", name);
+    for(int i = 0; i < n; ++i)
+        printf(" %02X", ((uint8_t*)bytes)[i]);
+    printf("\n");
+}
+#define DUMPBYTES(name, bytes) dumpBytes(name, bytes, sizeof(bytes))
+
+int main()
+{
+    float32x4 v = emscripten_float32x4_set(-1.f, 0.f, 1.f, 3.5f);
+    DUMP(v);
+    float32x4 w = emscripten_float32x4_splat(2.f);
+    DUMP(w);
+    DUMP(emscripten_float32x4_add(v, w));
+    DUMP(emscripten_float32x4_sub(v, w));
+    DUMP(emscripten_float32x4_mul(v, w));
+    DUMP(emscripten_float32x4_div(v, w));
+    DUMP(emscripten_float32x4_max(v, w));
+    DUMP(emscripten_float32x4_min(v, w));
+    DUMP(emscripten_float32x4_maxNum(v, w));
+    DUMP(emscripten_float32x4_minNum(v, w));
+    DUMP(emscripten_float32x4_neg(v));
+    DUMP(emscripten_float32x4_sqrt(v));
+    DUMP(emscripten_float32x4_reciprocalApproximation(v));
+    DUMP(emscripten_float32x4_reciprocalSqrtApproximation(v));
+    DUMP(emscripten_float32x4_abs(v));
+    DUMP(emscripten_float32x4_and(v, w));
+    DUMP(emscripten_float32x4_xor(v, w));
+    DUMP(emscripten_float32x4_or(v, w));
+    DUMP(emscripten_float32x4_not(v));
+    DUMP(emscripten_float32x4_lessThan(v, w));
+    DUMP(emscripten_float32x4_lessThanOrEqual(v, w));
+    DUMP(emscripten_float32x4_greaterThan(v, w));
+    DUMP(emscripten_float32x4_greaterThanOrEqual(v, w));
+    DUMP(emscripten_float32x4_equal(v, w));
+    DUMP(emscripten_float32x4_notEqual(v, w));
+    bool32x4 b = emscripten_int32x4_set(0, -1, 0, -1);
+    DUMP(emscripten_float32x4_select(b, v, w));
+    DUMP(emscripten_float32x4_replaceLane(v, 0, 9.f));
+    DUMP(emscripten_float32x4_replaceLane(v, 1, -3.f));
+    DUMP(emscripten_float32x4_replaceLane(v, 2, 0.f));
+    DUMP(emscripten_float32x4_replaceLane(v, 3, -0.f));
+    uint8_t bytes[16];
+    memset(bytes, 0xFF, sizeof(bytes));
+    emscripten_float32x4_store(bytes, v);
+    DUMPBYTES("emscripten_float32x4_store", bytes);
+    memset(bytes, 0xFF, sizeof(bytes));
+    emscripten_float32x4_store1(bytes, v);
+    DUMPBYTES("emscripten_float32x4_store1", bytes);
+    memset(bytes, 0xFF, sizeof(bytes));
+    emscripten_float32x4_store2(bytes, v);
+    DUMPBYTES("emscripten_float32x4_store2", bytes);
+    memset(bytes, 0xFF, sizeof(bytes));
+    emscripten_float32x4_store3(bytes, v);
+    DUMPBYTES("emscripten_float32x4_store3", bytes);
+
+    emscripten_float32x4_store(bytes, v);
+    DUMP(emscripten_float32x4_load(bytes));
+    DUMP(emscripten_float32x4_load1(bytes));
+    DUMP(emscripten_float32x4_load2(bytes));
+    DUMP(emscripten_float32x4_load3(bytes));
+    // TODO: emscripten_float32x4_fromFloat64x2Bits
+    // TODO: emscripten_float32x4_fromInt32x4Bits
+    // TODO: emscripten_float32x4_fromUint32x4Bits
+    // TODO: emscripten_float32x4_fromInt16x8Bits
+    // TODO: emscripten_float32x4_fromUint16x8Bits
+    // TODO: emscripten_float32x4_fromInt8x16Bits
+    // TODO: emscripten_float32x4_fromUint8x16Bits
+    // TODO: emscripten_float32x4_fromInt32x4
+    // TODO: emscripten_float32x4_fromUint32x4
+    DUMP(emscripten_float32x4_swizzle(v, 0, 1, 2, 3));
+    DUMP(emscripten_float32x4_swizzle(v, 3, 2, 1, 0));
+    DUMP(emscripten_float32x4_swizzle(v, 0, 0, 0, 0));
+    DUMP(emscripten_float32x4_swizzle(v, 0, 3, 0, 3));
+    DUMP(emscripten_float32x4_swizzle(v, 3, 3, 3, 3));
+    float32x4 z = emscripten_float32x4_set(-5.f, 20.f, 14.f, 9.f);
+    DUMP(z);
+    DUMP(emscripten_float32x4_shuffle(v, z, 0, 0, 0, 0));
+    DUMP(emscripten_float32x4_shuffle(v, z, 4, 4, 4, 4));
+    DUMP(emscripten_float32x4_shuffle(v, z, 7, 7, 7, 7));
+    DUMP(emscripten_float32x4_shuffle(v, z, 0, 2, 4, 6));
+    DUMP(emscripten_float32x4_shuffle(v, z, 7, 0, 3, 5));
+
+    printf("Done!\n");
+}
diff --git a/tests/core/test_simd_float32x4.out b/tests/core/test_simd_float32x4.out
@@ -0,0 +1,50 @@
+v: -1.000000 0.000000 1.000000 3.500000
+w: 2.000000 2.000000 2.000000 2.000000
+emscripten_float32x4_add(v, w): 1.000000 2.000000 3.000000 5.500000
+emscripten_float32x4_sub(v, w): -3.000000 -2.000000 -1.000000 1.500000
+emscripten_float32x4_mul(v, w): -2.000000 0.000000 2.000000 7.000000
+emscripten_float32x4_div(v, w): -0.500000 0.000000 0.500000 1.750000
+emscripten_float32x4_max(v, w): 2.000000 2.000000 2.000000 3.500000
+emscripten_float32x4_min(v, w): -1.000000 0.000000 1.000000 2.000000
+emscripten_float32x4_maxNum(v, w): 2.000000 2.000000 2.000000 3.500000
+emscripten_float32x4_minNum(v, w): -1.000000 0.000000 1.000000 2.000000
+emscripten_float32x4_neg(v): 1.000000 -0.000000 -1.000000 -3.500000
+emscripten_float32x4_sqrt(v): nan 0.000000 1.000000 1.870829
+emscripten_float32x4_reciprocalApproximation(v): -1.000000 inf 1.000000 0.285714
+emscripten_float32x4_reciprocalSqrtApproximation(v): nan inf 1.000000 0.534522
+emscripten_float32x4_abs(v): 1.000000 0.000000 1.000000 3.500000
+emscripten_float32x4_and(v, w): 0.000000 0.000000 0.000000 2.000000
+emscripten_float32x4_xor(v, w): -inf 2.000000 inf 0.000000
+emscripten_float32x4_or(v, w): -inf 2.000000 inf 3.500000
+emscripten_float32x4_not(v): 4.000000 nan -4.000000 -1.250000
+emscripten_float32x4_lessThan(v, w): nan nan nan 0.000000
+emscripten_float32x4_lessThanOrEqual(v, w): nan nan nan 0.000000
+emscripten_float32x4_greaterThan(v, w): 0.000000 0.000000 0.000000 nan
+emscripten_float32x4_greaterThanOrEqual(v, w): 0.000000 0.000000 0.000000 nan
+emscripten_float32x4_equal(v, w): 0.000000 0.000000 0.000000 0.000000
+emscripten_float32x4_notEqual(v, w): nan nan nan nan
+emscripten_float32x4_select(b, v, w): 2.000000 0.000000 2.000000 3.500000
+emscripten_float32x4_replaceLane(v, 0, 9.f): 9.000000 0.000000 1.000000 3.500000
+emscripten_float32x4_replaceLane(v, 1, -3.f): -1.000000 -3.000000 1.000000 3.500000
+emscripten_float32x4_replaceLane(v, 2, 0.f): -1.000000 0.000000 0.000000 3.500000
+emscripten_float32x4_replaceLane(v, 3, -0.f): -1.000000 0.000000 1.000000 -0.000000
+emscripten_float32x4_store: 00 00 80 BF 00 00 00 00 00 00 80 3F 00 00 60 40
+emscripten_float32x4_store1: 00 00 80 BF FF FF FF FF FF FF FF FF FF FF FF FF
+emscripten_float32x4_store2: 00 00 80 BF 00 00 00 00 FF FF FF FF FF FF FF FF
+emscripten_float32x4_store3: 00 00 80 BF 00 00 00 00 00 00 80 3F FF FF FF FF
+emscripten_float32x4_load(bytes): -1.000000 0.000000 1.000000 3.500000
+emscripten_float32x4_load1(bytes): -1.000000 0.000000 0.000000 0.000000
+emscripten_float32x4_load2(bytes): -1.000000 0.000000 0.000000 0.000000
+emscripten_float32x4_load3(bytes): -1.000000 0.000000 1.000000 0.000000
+emscripten_float32x4_swizzle(v, 0, 1, 2, 3): -1.000000 0.000000 1.000000 3.500000
+emscripten_float32x4_swizzle(v, 3, 2, 1, 0): 3.500000 1.000000 0.000000 -1.000000
+emscripten_float32x4_swizzle(v, 0, 0, 0, 0): -1.000000 -1.000000 -1.000000 -1.000000
+emscripten_float32x4_swizzle(v, 0, 3, 0, 3): -1.000000 3.500000 -1.000000 3.500000
+emscripten_float32x4_swizzle(v, 3, 3, 3, 3): 3.500000 3.500000 3.500000 3.500000
+z: -5.000000 20.000000 14.000000 9.000000
+emscripten_float32x4_shuffle(v, z, 0, 0, 0, 0): -1.000000 -1.000000 -1.000000 -1.000000
+emscripten_float32x4_shuffle(v, z, 4, 4, 4, 4): -5.000000 -5.000000 -5.000000 -5.000000
+emscripten_float32x4_shuffle(v, z, 7, 7, 7, 7): 9.000000 9.000000 9.000000 9.000000
+emscripten_float32x4_shuffle(v, z, 0, 2, 4, 6): -1.000000 1.000000 -5.000000 14.000000
+emscripten_float32x4_shuffle(v, z, 7, 0, 3, 5): 9.000000 -1.000000 3.500000 20.000000
+Done!
diff --git a/tests/core/test_simd_float64x2.c b/tests/core/test_simd_float64x2.c
@@ -0,0 +1,87 @@
+#include <emscripten/vector.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+
+void dump(const char *name, float64x2 vec)
+{
+    printf("%s: %f %f\n", name, emscripten_float64x2_extractLane(vec, 0), emscripten_float64x2_extractLane(vec, 1));
+}
+#define DUMP(V) dump(#V, (V))
+
+void dumpBytes(const char *name, const void *bytes, int n)
+{
+    printf("%s:", name);
+    for(int i = 0; i < n; ++i)
+        printf(" %02X", ((uint8_t*)bytes)[i]);
+    printf("\n");
+}
+#define DUMPBYTES(name, bytes) dumpBytes(name, bytes, sizeof(bytes))
+
+int main()
+{
+    float64x2 v = emscripten_float64x2_set(-1.5f, 2.5f);
+    DUMP(v);
+    float64x2 w = emscripten_float64x2_splat(1.5f);
+    DUMP(w);
+    DUMP(emscripten_float64x2_add(v, w));
+    DUMP(emscripten_float64x2_sub(v, w));
+    DUMP(emscripten_float64x2_mul(v, w));
+    DUMP(emscripten_float64x2_div(v, w));
+    DUMP(emscripten_float64x2_max(v, w));
+    DUMP(emscripten_float64x2_min(v, w));
+    DUMP(emscripten_float64x2_maxNum(v, w));
+    DUMP(emscripten_float64x2_minNum(v, w));
+    DUMP(emscripten_float64x2_neg(v));
+    DUMP(emscripten_float64x2_sqrt(v));
+    DUMP(emscripten_float64x2_reciprocalApproximation(v));
+    DUMP(emscripten_float64x2_reciprocalSqrtApproximation(v));
+    DUMP(emscripten_float64x2_abs(v));
+    DUMP(emscripten_float64x2_and(v, w));
+    DUMP(emscripten_float64x2_xor(v, w));
+    DUMP(emscripten_float64x2_or(v, w));
+    DUMP(emscripten_float64x2_not(v));
+    DUMP(emscripten_float64x2_lessThan(v, w));
+    DUMP(emscripten_float64x2_lessThanOrEqual(v, w));
+    DUMP(emscripten_float64x2_greaterThan(v, w));
+    DUMP(emscripten_float64x2_greaterThanOrEqual(v, w));
+    DUMP(emscripten_float64x2_equal(v, w));
+    DUMP(emscripten_float64x2_notEqual(v, w));
+    //bool64x2 b = emscripten_int64x2_set(0, -1); // TODO: Can't yet use this form, no int64x2.
+    //DUMP(emscripten_float64x2_select(b, v, w));
+    DUMP(emscripten_float64x2_replaceLane(v, 0, 9.f));
+    DUMP(emscripten_float64x2_replaceLane(v, 1, -3.f));
+    uint8_t bytes[16];
+    memset(bytes, 0xFF, sizeof(bytes));
+    emscripten_float64x2_store(bytes, v);
+    DUMPBYTES("emscripten_float64x2_store", bytes);
+    memset(bytes, 0xFF, sizeof(bytes));
+    emscripten_float64x2_store1(bytes, v);
+    DUMPBYTES("emscripten_float64x2_store1", bytes);
+
+    emscripten_float64x2_store(bytes, v);
+    DUMP(emscripten_float64x2_load(bytes));
+    DUMP(emscripten_float64x2_load1(bytes));
+    // TODO: emscripten_float64x2_fromFloat32x4Bits
+    // TODO: emscripten_float64x2_fromInt32x4Bits
+    // TODO: emscripten_float64x2_fromUint32x4Bits
+    // TODO: emscripten_float64x2_fromInt16x8Bits
+    // TODO: emscripten_float64x2_fromUint16x8Bits
+    // TODO: emscripten_float64x2_fromInt8x16Bits
+    // TODO: emscripten_float64x2_fromUint8x16Bits
+    // TODO: emscripten_float64x2_fromInt64x2
+    // TODO: emscripten_float64x2_fromUint64x2
+    DUMP(emscripten_float64x2_swizzle(v, 0, 1));
+    DUMP(emscripten_float64x2_swizzle(v, 1, 0));
+    DUMP(emscripten_float64x2_swizzle(v, 0, 0));
+    DUMP(emscripten_float64x2_swizzle(v, 1, 1));
+    float64x2 z = emscripten_float64x2_set(-5.5f, 20.5f);
+    DUMP(z);
+    DUMP(emscripten_float64x2_shuffle(v, z, 0, 0));
+    DUMP(emscripten_float64x2_shuffle(v, z, 2, 2));
+    DUMP(emscripten_float64x2_shuffle(v, z, 3, 3));
+    DUMP(emscripten_float64x2_shuffle(v, z, 0, 2));
+    DUMP(emscripten_float64x2_shuffle(v, z, 3, 1));
+
+    printf("Done!\n");
+}