From 1a072a3a013976a178e0068be021e23b9a0ed59f Mon Sep 17 00:00:00 2001 From: David Conrad Date: Thu, 20 Aug 2009 20:44:09 -0700 Subject: [PATCH] Fix unaligned accesses in bitstream writer Fixes x264 on CPUs with no unaligned access support (e.g. SPARC). Improves performance marginally on CPUs with penalties for unaligned stores (e.g. some x86). --- common/bs.h | 26 +++++++++----------------- encoder/cavlc.c | 4 ++++ encoder/encoder.c | 2 ++ 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/common/bs.h b/common/bs.h index 68b9f5e35..0c009921a 100644 --- a/common/bs.h +++ b/common/bs.h @@ -73,21 +73,22 @@ extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE]; static inline void bs_init( bs_t *s, void *p_data, int i_data ) { - int offset = ((intptr_t)p_data & (WORD_SIZE-1)); + int offset = ((intptr_t)p_data & 3); s->p = s->p_start = (uint8_t*)p_data - offset; s->p_end = (uint8_t*)p_data + i_data; - s->i_left = offset ? 8*offset : (WORD_SIZE*8); - s->cur_bits = endian_fix( *(intptr_t*)s->p ); + s->i_left = (WORD_SIZE - offset)*8; + s->cur_bits = endian_fix32(*(uint32_t *)(s->p)); + s->cur_bits >>= (4-offset)*8; } static inline int bs_pos( bs_t *s ) { return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left ); } -/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32/64-bit aligned. */ +/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */ static inline void bs_flush( bs_t *s ) { - *(intptr_t*)s->p = endian_fix( s->cur_bits << s->i_left ); + *(uint32_t*)s->p = endian_fix32( s->cur_bits << (s->i_left&31) ); s->p += WORD_SIZE - s->i_left / 8; s->i_left = WORD_SIZE*8; } @@ -151,21 +152,12 @@ static inline void bs_write1( bs_t *s, uint32_t i_bit ) static inline void bs_align_0( bs_t *s ) { - if( s->i_left&7 ) - { - s->cur_bits <<= s->i_left&7; - s->i_left &= ~7; - } + bs_write( s, s->i_left&7, 0 ); bs_flush( s ); } static inline void bs_align_1( bs_t *s ) { - if( s->i_left&7 ) - { - s->cur_bits <<= s->i_left&7; - s->cur_bits |= (1 << (s->i_left&7)) - 1; - s->i_left &= ~7; - } + bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 ); bs_flush( s ); } @@ -245,7 +237,7 @@ static inline void bs_write_te( bs_t *s, int x, int val ) static inline void bs_rbsp_trailing( bs_t *s ) { bs_write1( s, 1 ); - bs_flush( s ); + bs_write( s, s->i_left&7, 0 ); } static inline int bs_size_ue( unsigned int val ) diff --git a/encoder/cavlc.c b/encoder/cavlc.c index 89bf07d73..0d88bfc6c 100644 --- a/encoder/cavlc.c +++ b/encoder/cavlc.c @@ -298,6 +298,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s ) #if !RDO_SKIP_BS if( i_mb_type == I_PCM ) { + uint8_t *p_start = s->p_start; bs_write_ue( s, i_mb_i_offset + 25 ); i_mb_pos_tex = bs_pos( s ); h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start; @@ -313,6 +314,9 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s ) memcpy( s->p + i*8, h->mb.pic.p_fenc[2] + i*FENC_STRIDE, 8 ); s->p += 64; + bs_init( s, s->p, s->p_end - s->p ); + s->p_start = p_start; + /* if PCM is chosen, we need to store reconstructed frame data */ h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE, 16 ); h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 ); diff --git a/encoder/encoder.c b/encoder/encoder.c index 3c95e8606..caefa669f 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -981,6 +981,7 @@ int x264_encoder_headers( x264_t *h, x264_nal_t **pp_nal, int *pi_nal ) x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST ); x264_pps_write( &h->out.bs, h->pps ); x264_nal_end( h ); + bs_flush( &h->out.bs ); } /* now set output*/ *pi_nal = h->out.i_nal; @@ -1374,6 +1375,7 @@ static int x264_slice_write( x264_t *h ) bs_write_ue( &h->out.bs, i_skip ); /* last skip run */ /* rbsp_slice_trailing_bits */ bs_rbsp_trailing( &h->out.bs ); + bs_flush( &h->out.bs ); } x264_nal_end( h );