Skip to content

Commit

Permalink
Slightliy better performance due to caching changes, added neon detec…
Browse files Browse the repository at this point in the history
…tion code for android platforms
  • Loading branch information
Martin Fleisz committed Jun 28, 2011
1 parent b958b1f commit 1f201c0
Showing 1 changed file with 36 additions and 63 deletions.
99 changes: 36 additions & 63 deletions libfreerdp-rfx/neon/rfx_neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,59 +24,20 @@

#include "rfx_neon.h"

#if defined(ANDROID_DISABLED)
#if defined(ANDROID)
#include <cpu-features.h>
#include <android/log.h>
#endif

#define CACHE_LINE_BYTES 64

static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
prefetch_buffer(char * buffer, int num_bytes)
prefetch_data(void * buffer1)
{
asm(" mov r3, %0 \t\n"
" add r4, r3, %1 \t\n"
"1: \t\n"
" pld [r3] \t\n"
" add r3, r3, #64 \t\n"
" cmp r3, r4 \t\n"
" bne 1b \t\n"
asm(" pld [%0, #64] \t\n"
: // no output
: "r" (buffer), "r" (num_bytes)
: "r3", "r4" );
: "r" (buffer1)
);
}

static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
prefetch_line(char * buffer)
{
asm(" mov r3, %0 \t\n"
" pld [r3, #0] \t\n"
: // no output
: "r" (buffer)
: "r3" );
}


static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
prefetch_buffers(char * buffer1, char * buffer2, char * buffer3, int num_bytes)
{
asm(" mov r3, %0 \t\n"
" mov r4, %1 \t\n"
" mov r5, %2 \t\n"
" mov r6, #0 \t\n"
"cache_loop2: \t\n"
" pld [r3, r6] \t\n"
" pld [r3, r6] \t\n"
" pld [r3, r6] \t\n"
" add r6, r6, #64 \t\n"
" cmp r6, %3 \t\n"
" bne cache_loop2 \t\n"
: // no output
: "r" (buffer1), "r" (buffer2), "r" (buffer3), "r" (num_bytes)
: "r3", "r4", "r5", "r6" );
}


void rfx_decode_YCbCr_to_RGB_NEON(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer)
{
int16x8_t zero = vdupq_n_s16(0);
Expand All @@ -87,11 +48,13 @@ void rfx_decode_YCbCr_to_RGB_NEON(sint16 * y_r_buffer, sint16 * cb_g_buffer, sin
int16x8_t* cb_g_buf = (int16x8_t*)cb_g_buffer;
int16x8_t* cr_b_buf = (int16x8_t*)cr_b_buffer;

prefetch_buffers((char*)y_r_buf, (char*)cb_g_buf, (char*)cr_b_buf, sizeof(sint16) * 4096);

int i;
for (i = 0; i < 4096 / 8; i++)
{
prefetch_data(&y_r_buf[i]);
prefetch_data(&cr_b_buf[i]);
prefetch_data(&cb_g_buf[i]);

int16x8_t y = vld1q_s16((sint16*)&y_r_buf[i]);
y = vaddq_s16(y, y_add);

Expand Down Expand Up @@ -130,7 +93,7 @@ void rfx_decode_YCbCr_to_RGB_NEON(sint16 * y_r_buffer, sint16 * cb_g_buffer, sin
}

static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_quantization_decode_block_NEON(sint16 * buffer, int buffer_size, uint32 factor)
rfx_quantization_decode_block_NEON(sint16 * buffer, const int buffer_size, const uint32 factor)
{
if (factor <= 6)
return;
Expand All @@ -140,6 +103,7 @@ rfx_quantization_decode_block_NEON(sint16 * buffer, int buffer_size, uint32 fact

do
{
prefetch_data(buf);
int16x8_t val = vld1q_s16((sint16*)buf);
val = vshlq_s16(val, quantFactors);
vst1q_s16((sint16*)buf, val);
Expand All @@ -151,8 +115,6 @@ rfx_quantization_decode_block_NEON(sint16 * buffer, int buffer_size, uint32 fact
void
rfx_quantization_decode_NEON(sint16 * buffer, const uint32 * quantization_values)
{
prefetch_buffer((char *) buffer, 4096 * sizeof(sint16));

rfx_quantization_decode_block_NEON(buffer, 1024, quantization_values[8]); /* HL1 */
rfx_quantization_decode_block_NEON(buffer + 1024, 1024, quantization_values[7]); /* LH1 */
rfx_quantization_decode_block_NEON(buffer + 2048, 1024, quantization_values[9]); /* HH1 */
Expand Down Expand Up @@ -182,9 +144,11 @@ rfx_dwt_2d_decode_block_horiz_NEON(sint16 * l, sint16 * h, sint16 * dst, int sub
{
// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
int16x8_t l_n = vld1q_s16(l_ptr);
prefetch_data(l_ptr);

int16x8_t h_n = vld1q_s16(h_ptr);
int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
prefetch_data(h_ptr);

if (n == 0)
{
Expand Down Expand Up @@ -212,11 +176,13 @@ rfx_dwt_2d_decode_block_horiz_NEON(sint16 * l, sint16 * h, sint16 * dst, int sub
// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);

int16x8_t h_n = vld1q_s16(h_ptr);
prefetch_data(h_ptr);

h_n = vshlq_n_s16(h_n, 1);

int16x8x2_t dst_n;
dst_n.val[0] = vld1q_s16(l_ptr);
prefetch_data(l_ptr);
int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
if (n == subband_width - 8)
{
Expand All @@ -230,6 +196,7 @@ rfx_dwt_2d_decode_block_horiz_NEON(sint16 * l, sint16 * h, sint16 * dst, int sub
dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);

vst2q_s16(dst_ptr, dst_n);
prefetch_data(dst_ptr);

l_ptr+=8;
h_ptr+=8;
Expand All @@ -253,6 +220,8 @@ rfx_dwt_2d_decode_block_vert_NEON(sint16 * l, sint16 * h, sint16 * dst, int subb
{
for (x = 0; x < total_width; x+=8)
{
prefetch_data(l_ptr);
prefetch_data(h_ptr);
// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);

int16x8_t l_n = vld1q_s16(l_ptr);
Expand All @@ -268,6 +237,7 @@ rfx_dwt_2d_decode_block_vert_NEON(sint16 * l, sint16 * h, sint16 * dst, int subb
}
tmp_n = vshrq_n_s16(tmp_n, 1);

prefetch_data(dst_ptr);
int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
vst1q_s16(dst_ptr, dst_n);

Expand All @@ -287,9 +257,12 @@ rfx_dwt_2d_decode_block_vert_NEON(sint16 * l, sint16 * h, sint16 * dst, int subb
for (x = 0; x < total_width; x+=8)
{
// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);

int16x8_t h_n = vld1q_s16(h_ptr);
int16x8_t dst_n_m = vld1q_s16(dst_ptr - total_width);

prefetch_data(h_ptr);
prefetch_data(dst_ptr - total_width);

h_n = vshlq_n_s16(h_n, 1);

int16x8_t tmp_n = dst_n_m;
Expand All @@ -304,6 +277,7 @@ rfx_dwt_2d_decode_block_vert_NEON(sint16 * l, sint16 * h, sint16 * dst, int subb

int16x8_t dst_n = vaddq_s16(tmp_n, h_n);
vst1q_s16(dst_ptr, dst_n);
prefetch_data(dst_ptr);

h_ptr+=8;
dst_ptr+=8;
Expand All @@ -318,8 +292,6 @@ rfx_dwt_2d_decode_block_NEON(sint16 * buffer, sint16 * idwt, int subband_width)
sint16 * hl, * lh, * hh, * ll;
sint16 * l_dst, * h_dst;

prefetch_buffer((char *) idwt, subband_width * 4 * sizeof(sint16));

/* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt. */
/* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
/* The lower part L uses LL(3) and HL(0). */
Expand All @@ -342,35 +314,36 @@ rfx_dwt_2d_decode_block_NEON(sint16 * buffer, sint16 * idwt, int subband_width)
}

void
rfx_dwt_2d_decode_NEON(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32)
rfx_dwt_2d_decode_NEON(sint16 * buffer, sint16 * dwt_buffer)
{
prefetch_buffer((char *) buffer, 4096 * sizeof(sint16));

rfx_dwt_2d_decode_block_NEON(buffer + 3840, dwt_buffer_8, 8);
rfx_dwt_2d_decode_block_NEON(buffer + 3072, dwt_buffer_16, 16);
rfx_dwt_2d_decode_block_NEON(buffer, dwt_buffer_32, 32);
rfx_dwt_2d_decode_block_NEON(buffer + 3840, dwt_buffer, 8);
rfx_dwt_2d_decode_block_NEON(buffer + 3072, dwt_buffer, 16);
rfx_dwt_2d_decode_block_NEON(buffer, dwt_buffer, 32);
}



int isNeonSupported()
{
#if defined(ANDROID_DISABLED)
#if defined(ANDROID)
if (android_getCpuFamily() != ANDROID_CPU_FAMILY_ARM)
{
_android_log_print(ANDROID_LOG_INFO, "freerdp", "NEON optimization disabled - No ARM CPU found");
DEBUG_RFX("NEON optimization disabled - No ARM CPU found");
return 0;
}

features = android_getCpuFeatures();
uint64_t features = android_getCpuFeatures();
if ((features & ANDROID_CPU_ARM_FEATURE_ARMv7))
{
if (features & ANDROID_CPU_ARM_FEATURE_NEON)
{
DEBUG_RFX("NEON optimization enabled!");
return 1;
_android_log_print(ANDROID_LOG_INFO, "freerdp", "NEON optimization disabled - CPU not NEON capable");
}
DEBUG_RFX("NEON optimization disabled - CPU not NEON capable");
}
else
_android_log_print(ANDROID_LOG_INFO, "freerdp", "NEON optimization disabled - No ARMv7 CPU found");
DEBUG_RFX("NEON optimization disabled - No ARMv7 CPU found");

return 0;
#else
Expand Down

0 comments on commit 1f201c0

Please sign in to comment.