Skip to content

Commit

Permalink
wip : WASM 128-bit SIMD support
Browse files Browse the repository at this point in the history
  • Loading branch information
ggerganov committed Oct 22, 2022
1 parent e905c6f commit db460b7
Show file tree
Hide file tree
Showing 5 changed files with 189 additions and 13 deletions.
12 changes: 6 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -123,15 +123,15 @@ else()
if (MSVC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2 /D_CRT_SECURE_NO_WARNINGS=1")
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c")
if (EMSCRIPTEN)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread -msimd128")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c")
endif()
endif()
endif()

if (EMSCRIPTEN)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread -msimd128")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
endif()

# whisper - this is the main library of the project

set(TARGET whisper)
Expand Down
5 changes: 3 additions & 2 deletions bindings/javascript/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,14 @@ if (WHISPER_WASM_SINGLE_FILE)
)
endif()

#-s TOTAL_MEMORY=536870912 \
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
--bind \
-s MODULARIZE=1 \
-s ASSERTIONS=1 \
-s USE_PTHREADS=1 \
-s PTHREAD_POOL_SIZE=8 \
-s TOTAL_MEMORY=536870912 \
-s PTHREAD_POOL_SIZE=9 \
-s ALLOW_MEMORY_GROWTH=1 \
-s FORCE_FILESYSTEM=1 \
-s EXPORT_NAME=\"'whisper_factory'\" \
${EXTRA_FLAGS} \
Expand Down
3 changes: 2 additions & 1 deletion bindings/javascript/emscripten.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <emscripten/bind.h>

#include <vector>
#include <thread>

std::vector<struct whisper_context *> g_contexts(4, nullptr);

Expand Down Expand Up @@ -47,7 +48,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
params.print_special_tokens = false;
params.translate = false;
params.language = "en";
params.n_threads = 4;
params.n_threads = std::min(8, (int) std::thread::hardware_concurrency());
params.offset_ms = 0;

std::vector<float> pcmf32;
Expand Down
2 changes: 1 addition & 1 deletion bindings/javascript/whisper.js

Large diffs are not rendered by default.

180 changes: 177 additions & 3 deletions ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,11 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {

#else

#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#else
#include <immintrin.h>
#endif

// FP16 <-> FP32
// ref: https://github.com/Maratyszcza/FP16
Expand Down Expand Up @@ -288,7 +292,7 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
sumf += x[i]*y[i];
}
#elif defined(__AVX2__)
// AVX 256-bit (unroll 4)
// AVX 256-bit
const int n32 = (n & ~31);

__m256 sum0 = _mm256_setzero_ps();
Expand Down Expand Up @@ -330,6 +334,45 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
for (int i = n32; i < n; ++i) {
sumf += x[i]*y[i];
}
#elif defined(__wasm_simd128__)
// WASM 128-bit
const int n16 = (n & ~15);

v128_t sum0 = wasm_f32x4_splat(0);
v128_t sum1 = wasm_f32x4_splat(0);
v128_t sum2 = wasm_f32x4_splat(0);
v128_t sum3 = wasm_f32x4_splat(0);

v128_t x0, x1, x2, x3;
v128_t y0, y1, y2, y3;

for (int i = 0; i < n16; i += 16) {
x0 = wasm_v128_load(x + i + 0);
x1 = wasm_v128_load(x + i + 4);
x2 = wasm_v128_load(x + i + 8);
x3 = wasm_v128_load(x + i + 12);

y0 = wasm_v128_load(y + i + 0);
y1 = wasm_v128_load(y + i + 4);
y2 = wasm_v128_load(y + i + 8);
y3 = wasm_v128_load(y + i + 12);

sum0 = wasm_f32x4_add(sum0, wasm_f32x4_mul(x0, y0));
sum1 = wasm_f32x4_add(sum1, wasm_f32x4_mul(x1, y1));
sum2 = wasm_f32x4_add(sum2, wasm_f32x4_mul(x2, y2));
sum3 = wasm_f32x4_add(sum3, wasm_f32x4_mul(x3, y3));
}

sum0 = wasm_f32x4_add(sum0, sum1);
sum2 = wasm_f32x4_add(sum2, sum3);
sum0 = wasm_f32x4_add(sum0, sum2);

sumf = wasm_f32x4_extract_lane(sum0, 0) + wasm_f32x4_extract_lane(sum0, 1) + wasm_f32x4_extract_lane(sum0, 2) + wasm_f32x4_extract_lane(sum0, 3);

// leftovers
for (int i = n16; i < n; ++i) {
sumf += x[i]*y[i];
}
#else
// scalar
for (int i = 0; i < n; ++i) {
Expand Down Expand Up @@ -446,7 +489,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
}
#elif defined(__AVX2__)
// AVX 256-bit (unroll 4)
// AVX 256-bit
const int n32 = (n & ~31);

__m256 sum0 = _mm256_setzero_ps();
Expand Down Expand Up @@ -489,6 +532,54 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
//GGML_ASSERT(false);
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
}
#elif defined(__wasm_simd128__)
// WASM 128-bit
const int n16 = (n & ~15);

v128_t sum0 = wasm_f32x4_splat(0.0f);
v128_t sum1 = wasm_f32x4_splat(0.0f);
v128_t sum2 = wasm_f32x4_splat(0.0f);
v128_t sum3 = wasm_f32x4_splat(0.0f);

v128_t x0, x1, x2, x3;
v128_t y0, y1, y2, y3;

float tx[16];
float ty[16];

for (int i = 0; i < n16; i += 16) {
for (int k = 0; k < 16; ++k) {
tx[k] = ggml_fp16_to_fp32(x[i + k]);
ty[k] = ggml_fp16_to_fp32(y[i + k]);
}

x0 = wasm_v128_load(tx + 0);
x1 = wasm_v128_load(tx + 4);
x2 = wasm_v128_load(tx + 8);
x3 = wasm_v128_load(tx + 12);

y0 = wasm_v128_load(ty + 0);
y1 = wasm_v128_load(ty + 4);
y2 = wasm_v128_load(ty + 8);
y3 = wasm_v128_load(ty + 12);

sum0 = wasm_f32x4_add(sum0, wasm_f32x4_mul(x0, y0));
sum1 = wasm_f32x4_add(sum1, wasm_f32x4_mul(x1, y1));
sum2 = wasm_f32x4_add(sum2, wasm_f32x4_mul(x2, y2));
sum3 = wasm_f32x4_add(sum3, wasm_f32x4_mul(x3, y3));
}

sum0 = wasm_f32x4_add(sum0, sum1);
sum2 = wasm_f32x4_add(sum2, sum3);
sum0 = wasm_f32x4_add(sum0, sum2);

sumf = wasm_f32x4_extract_lane(sum0, 0) + wasm_f32x4_extract_lane(sum0, 1) + wasm_f32x4_extract_lane(sum0, 2) + wasm_f32x4_extract_lane(sum0, 3);

// leftovers
for (int i = n16; i < n; ++i) {
//GGML_ASSERT(false);
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
}
#else
for (int i = 0; i < n; ++i) {
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
Expand Down Expand Up @@ -535,7 +626,7 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
y[i] += x[i]*v;
}
#elif defined(__AVX2__)
// AVX 256-bit (unroll 4)
// AVX 256-bit
const int n32 = (n & ~31);

const __m256 v4 = _mm256_set1_ps(v);
Expand Down Expand Up @@ -569,6 +660,41 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
for (int i = n32; i < n; ++i) {
y[i] += x[i]*v;
}
#elif defined(__wasm_simd128__)
// WASM SIMD 128-bit
const int n16 = (n & ~15);

const v128_t v4 = wasm_f32x4_splat(v);

v128_t x0, x1, x2, x3;
v128_t y0, y1, y2, y3;

for (int i = 0; i < n16; i += 16) {
x0 = wasm_v128_load(x + i + 0);
x1 = wasm_v128_load(x + i + 4);
x2 = wasm_v128_load(x + i + 8);
x3 = wasm_v128_load(x + i + 12);

y0 = wasm_v128_load(y + i + 0);
y1 = wasm_v128_load(y + i + 4);
y2 = wasm_v128_load(y + i + 8);
y3 = wasm_v128_load(y + i + 12);

y0 = wasm_f32x4_add(y0, wasm_f32x4_mul(x0, v4));
y1 = wasm_f32x4_add(y1, wasm_f32x4_mul(x1, v4));
y2 = wasm_f32x4_add(y2, wasm_f32x4_mul(x2, v4));
y3 = wasm_f32x4_add(y3, wasm_f32x4_mul(x3, v4));

wasm_v128_store(y + i + 0, y0);
wasm_v128_store(y + i + 4, y1);
wasm_v128_store(y + i + 8, y2);
wasm_v128_store(y + i + 12, y3);
}

// leftovers
for (int i = n16; i < n; ++i) {
y[i] += x[i]*v;
}
#else
// scalar
for (int i = 0; i < n; ++i) {
Expand Down Expand Up @@ -696,6 +822,54 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
GGML_ASSERT(false);
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
}
#elif defined(__wasm_simd128__)
// WASM SIMD 128-bit
const int n16 = (n & ~15);

const v128_t v4 = wasm_f32x4_splat(v);

v128_t x0, x1, x2, x3;
v128_t y0, y1, y2, y3;

float tx[16];
float ty[16];

for (int i = 0; i < n16; i += 16) {
for (int k = 0; k < 16; ++k) {
tx[k] = ggml_fp16_to_fp32(x[i + k]);
ty[k] = ggml_fp16_to_fp32(y[i + k]);
}

x0 = wasm_v128_load(tx + 0);
x1 = wasm_v128_load(tx + 4);
x2 = wasm_v128_load(tx + 8);
x3 = wasm_v128_load(tx + 12);

y0 = wasm_v128_load(ty + 0);
y1 = wasm_v128_load(ty + 4);
y2 = wasm_v128_load(ty + 8);
y3 = wasm_v128_load(ty + 12);

y0 = wasm_f32x4_add(y0, wasm_f32x4_mul(x0, v4));
y1 = wasm_f32x4_add(y1, wasm_f32x4_mul(x1, v4));
y2 = wasm_f32x4_add(y2, wasm_f32x4_mul(x2, v4));
y3 = wasm_f32x4_add(y3, wasm_f32x4_mul(x3, v4));

wasm_v128_store(ty + 0, y0);
wasm_v128_store(ty + 4, y1);
wasm_v128_store(ty + 8, y2);
wasm_v128_store(ty + 12, y3);

for (int k = 0; k < 16; ++k) {
y[i + k] = ggml_fp32_to_fp16(ty[k]);
}
}

// leftovers
for (int i = n16; i < n; ++i) {
GGML_ASSERT(false);
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
}
#else
for (int i = 0; i < n; ++i) {
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
Expand Down

0 comments on commit db460b7

Please sign in to comment.