Skip to content

Commit

Permalink
x86: lossless audio: SSE4 madd 32bits
Browse files Browse the repository at this point in the history
The only user so far is wmalossless with 24-bit samples. The few samples tested show an
order of 8, so more unrolling or an AVX2 version would not make sense.

Timings: 68 -> 49 cycles

Reviewed-by: Paul B Mahol <[email protected]>
Signed-off-by: Michael Niedermayer <[email protected]>
  • Loading branch information
cgisquet authored and michaelni committed May 7, 2016
1 parent e811ebc commit 9630b3f
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 0 deletions.
33 changes: 33 additions & 0 deletions libavcodec/x86/lossless_audiodsp.asm
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,39 @@ SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT

INIT_XMM sse4
;-----------------------------------------------------------------------------
; int32_t ff_scalarproduct_and_madd_int32(int16_t *v1, const int32_t *v2,
;                                         const int16_t *v3,
;                                         int order, int mul)
;
; Returns the sum over i in [0, order) of (int32)v1[i] * v2[i], computed from
; the values of v1 *before* they are updated, with 32-bit wrap-around.
; As a side effect, v1[i] is replaced by v1[i] + v3[i] * mul (16-bit wrap).
;
; Requirements visible from the code below:
;   - v1 is accessed with aligned loads/stores (mova): must be mmsize-aligned.
;   - v2 and v3 are read with unaligned loads (movu).
;   - Each iteration consumes 8 elements and the loop body always runs at
;     least once, so order must be a positive multiple of 8.
;
; Register roles:
;   m7 = mul replicated into every 16-bit lane
;   m6 = four 32-bit partial sums of the scalar product
;   orderq = negative byte offset into v1/v3, counting up to zero
;-----------------------------------------------------------------------------
cglobal scalarproduct_and_madd_int32, 4,4,8, v1, v2, v3, order, mul
    ; order is a C int: on x86-64 the upper half of its register is not
    ; guaranteed to be meaningful, and it is used as a 64-bit index below.
    ; (Expands to nothing when orderq and orderd are the same register.)
    movsxdifnidn orderq, orderd
    shl     orderq, 1                       ; element count -> bytes of int16
    movd    m7, mulm
    SPLATW  m7, m7                          ; replicate the low word of mul (x86util)
    pxor    m6, m6                          ; scalar-product accumulator = 0

    ; Point every array at its end and walk a negative offset up to zero,
    ; so a single add both advances the index and sets the loop flags.
    add     v1q, orderq
    lea     v2q, [v2q + 2*orderq]           ; v2 holds int32: twice the byte stride
    add     v3q, orderq
    neg     orderq
.loop:
    mova    m3, [v1q + orderq]              ; m3 = 8 x int16 from v1
    movu    m0, [v2q + 2*orderq]            ; m0 = v2[0..3] (int32)
    pmovsxwd m4, m3                         ; m4 = sign-extended v1[0..3]
    movu    m1, [v2q + 2*orderq + mmsize]   ; m1 = v2[4..7] (int32)
    movhlps m5, m3                          ; bring v1[4..7] into the low half
    movu    m2, [v3q + orderq]              ; m2 = 8 x int16 from v3
    pmovsxwd m5, m5                         ; m5 = sign-extended v1[4..7]
    pmullw  m2, m7                          ; m2 = low 16 bits of v3 * mul
    pmulld  m0, m4                          ; low 32 bits of v1 * v2, lanes 0..3
    pmulld  m1, m5                          ; low 32 bits of v1 * v2, lanes 4..7
    paddw   m2, m3                          ; m2 = v1 + v3 * mul
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2              ; write the updated v1 back
    add     orderq, 16                      ; next 8 int16 elements
    jl .loop                                ; continue while the offset is negative

    HADDD   m6, m0                          ; fold the partial sums into the low dword
                                            ; (x86util; m0 is scratch)
    movd    eax, m6                         ; return value
    RET

%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
Expand Down
7 changes: 7 additions & 0 deletions libavcodec/x86/lossless_audiodsp_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);

/* SSE4 implementation, written in assembly (libavcodec/x86/lossless_audiodsp.asm).
 * Returns the scalar product of v1 and v2 over `order` elements and, as a side
 * effect, updates v1 in place to v1[i] + v3[i] * mul (16-bit arithmetic).
 * The assembly uses aligned accesses on v1 and handles 8 elements per loop
 * iteration, so v1 must be 16-byte aligned and order a positive multiple of 8. */
int32_t ff_scalarproduct_and_madd_int32_sse4(int16_t *v1, const int32_t *v2,
const int16_t *v3,
int order, int mul);

av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c)
{
#if HAVE_YASM
Expand All @@ -45,5 +49,8 @@ av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c)
if (EXTERNAL_SSSE3(cpu_flags) &&
!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;

if (EXTERNAL_SSE4(cpu_flags))
c->scalarproduct_and_madd_int32 = ff_scalarproduct_and_madd_int32_sse4;
#endif
}

0 comments on commit 9630b3f

Please sign in to comment.