From 5c34a0f01aa5d0ff6c3027cba1a5248699cf7e39 Mon Sep 17 00:00:00 2001
From: Loic Dachary
Date: Wed, 19 Mar 2014 07:38:41 +0100
Subject: [PATCH] erasure-code: remove copy of gf-complete / jerasure

Signed-off-by: Loic Dachary
---
 .../gf-complete/include/gf_complete.h         |  192 --
 .../jerasure/gf-complete/include/gf_general.h |   61 -
 .../jerasure/gf-complete/include/gf_int.h     |  206 --
 .../jerasure/gf-complete/include/gf_method.h  |   20 -
 .../jerasure/gf-complete/include/gf_rand.h    |   22 -
 .../jerasure/gf-complete/src/gf.c             | 1039 ------
 .../jerasure/gf-complete/src/gf_general.c     |  538 ----
 .../jerasure/gf-complete/src/gf_method.c      |  185 --
 .../jerasure/gf-complete/src/gf_rand.c        |   80 -
 .../jerasure/gf-complete/src/gf_w128.c        | 1769 -----------
 .../jerasure/gf-complete/src/gf_w16.c         | 2489 ---------------
 .../jerasure/gf-complete/src/gf_w32.c         | 2741 -----------------
 .../jerasure/gf-complete/src/gf_w4.c          | 2081 -------------
 .../jerasure/gf-complete/src/gf_w64.c         | 2244 --------------
 .../jerasure/gf-complete/src/gf_w8.c          | 2456 ---------------
 .../jerasure/gf-complete/src/gf_wgen.c        | 1019 ------
 .../jerasure/jerasure/include/cauchy.h        |   45 -
 .../jerasure/jerasure/include/galois.h        |   99 -
 .../jerasure/jerasure/include/jerasure.h      |  294 --
 .../jerasure/jerasure/include/liberation.h    |   47 -
 .../jerasure/jerasure/include/reed_sol.h      |   50 -
 .../jerasure/jerasure/src/cauchy.c            |  405 ---
 .../jerasure/jerasure/src/galois.c            |  353 ---
 .../jerasure/jerasure/src/jerasure.c          | 1387 ---------
 .../jerasure/jerasure/src/liberation.c        |  262 --
 .../jerasure/jerasure/src/reed_sol.c          |  301 --
 26 files changed, 20385 deletions(-)
 delete mode 100644 src/erasure-code/jerasure/gf-complete/include/gf_complete.h
 delete mode 100644 src/erasure-code/jerasure/gf-complete/include/gf_general.h
 delete mode 100644 src/erasure-code/jerasure/gf-complete/include/gf_int.h
 delete mode 100644 src/erasure-code/jerasure/gf-complete/include/gf_method.h
 delete mode 100644 src/erasure-code/jerasure/gf-complete/include/gf_rand.h
 delete mode 100644 src/erasure-code/jerasure/gf-complete/src/gf.c
 delete mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_general.c
 delete mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_method.c
 delete mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_rand.c
 delete mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_w128.c
 delete mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_w16.c
 delete mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_w32.c
 delete mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_w4.c
 delete mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_w64.c
 delete mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_w8.c
 delete mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_wgen.c
 delete mode 100644 src/erasure-code/jerasure/jerasure/include/cauchy.h
 delete mode 100644 src/erasure-code/jerasure/jerasure/include/galois.h
 delete mode 100644 src/erasure-code/jerasure/jerasure/include/jerasure.h
 delete mode 100644 src/erasure-code/jerasure/jerasure/include/liberation.h
 delete mode 100644 src/erasure-code/jerasure/jerasure/include/reed_sol.h
 delete mode 100644 src/erasure-code/jerasure/jerasure/src/cauchy.c
 delete mode 100644 src/erasure-code/jerasure/jerasure/src/galois.c
 delete mode 100644 src/erasure-code/jerasure/jerasure/src/jerasure.c
 delete mode 100644 src/erasure-code/jerasure/jerasure/src/liberation.c
 delete mode 100644 src/erasure-code/jerasure/jerasure/src/reed_sol.c
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_complete.h
b/src/erasure-code/jerasure/gf-complete/include/gf_complete.h deleted file mode 100644 index 57b439e27ee56..0000000000000 --- a/src/erasure-code/jerasure/gf-complete/include/gf_complete.h +++ /dev/null @@ -1,192 +0,0 @@ -/* - * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic - * James S. Plank, Ethan L. Miller, Kevin M. Greenan, - * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. - * - * gf_complete.h - * - * The main include file for gf_complete. - */ - -#ifndef _GF_COMPLETE_H_ -#define _GF_COMPLETE_H_ -#include - -#ifdef INTEL_SSE4 - #include -#endif - -#ifdef INTEL_SSSE3 - #include -#endif - -#ifdef INTEL_SSE2 - #include -#endif - -#ifdef INTEL_SSE4_PCLMUL - #include -#endif - - -/* These are the different ways to perform multiplication. - Not all are implemented for all values of w. - See the paper for an explanation of how they work. */ - -typedef enum {GF_MULT_DEFAULT, - GF_MULT_SHIFT, - GF_MULT_CARRY_FREE, - GF_MULT_GROUP, - GF_MULT_BYTWO_p, - GF_MULT_BYTWO_b, - GF_MULT_TABLE, - GF_MULT_LOG_TABLE, - GF_MULT_LOG_ZERO, - GF_MULT_LOG_ZERO_EXT, - GF_MULT_SPLIT_TABLE, - GF_MULT_COMPOSITE } gf_mult_type_t; - -/* These are the different ways to optimize region - operations. They are bits because you can compose them. - Certain optimizations only apply to certain gf_mult_type_t's. - Again, please see documentation for how to use these */ - -#define GF_REGION_DEFAULT (0x0) -#define GF_REGION_DOUBLE_TABLE (0x1) -#define GF_REGION_QUAD_TABLE (0x2) -#define GF_REGION_LAZY (0x4) -#define GF_REGION_SSE (0x8) -#define GF_REGION_NOSSE (0x10) -#define GF_REGION_ALTMAP (0x20) -#define GF_REGION_CAUCHY (0x40) - -typedef uint32_t gf_region_type_t; - -/* These are different ways to implement division. - Once again, it's best to use "DEFAULT". However, - there are times when you may want to experiment - with the others. */ - -typedef enum { GF_DIVIDE_DEFAULT, - GF_DIVIDE_MATRIX, - GF_DIVIDE_EUCLID } gf_division_type_t; - -/* We support w=4,8,16,32,64 and 128 with their own data types and - operations for multiplication, division, etc. We also support - a "gen" type so that you can do general gf arithmetic for any - value of w from 1 to 32. You can perform a "region" operation - on these if you use "CAUCHY" as the mapping. 
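   As a usage sketch (illustrative only, not part of the original header; it
   assumes the default field is acceptable and that initialization succeeds,
   using gf_init_easy(), gf_t and gf_free() declared further below):

       gf_t gf;
       if (gf_init_easy(&gf, 8)) {
         gf_val_32_t p = gf.multiply.w32(&gf, 3, 7);
         gf_val_32_t q = gf.divide.w32(&gf, p, 3);   (q is 7 again: divide inverts multiply)
         gf_free(&gf, 0);
       }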
- */ - -typedef uint32_t gf_val_32_t; -typedef uint64_t gf_val_64_t; -typedef uint64_t *gf_val_128_t; - -extern int _gf_errno; -extern void gf_error(); - -typedef struct gf *GFP; - -typedef union gf_func_a_b { - gf_val_32_t (*w32) (GFP gf, gf_val_32_t a, gf_val_32_t b); - gf_val_64_t (*w64) (GFP gf, gf_val_64_t a, gf_val_64_t b); - void (*w128)(GFP gf, gf_val_128_t a, gf_val_128_t b, gf_val_128_t c); -} gf_func_a_b; - -typedef union { - gf_val_32_t (*w32) (GFP gf, gf_val_32_t a); - gf_val_64_t (*w64) (GFP gf, gf_val_64_t a); - void (*w128)(GFP gf, gf_val_128_t a, gf_val_128_t b); -} gf_func_a; - -typedef union { - void (*w32) (GFP gf, void *src, void *dest, gf_val_32_t val, int bytes, int add); - void (*w64) (GFP gf, void *src, void *dest, gf_val_64_t val, int bytes, int add); - void (*w128)(GFP gf, void *src, void *dest, gf_val_128_t val, int bytes, int add); -} gf_region; - -typedef union { - gf_val_32_t (*w32) (GFP gf, void *start, int bytes, int index); - gf_val_64_t (*w64) (GFP gf, void *start, int bytes, int index); - void (*w128)(GFP gf, void *start, int bytes, int index, gf_val_128_t rv); -} gf_extract; - -typedef struct gf { - gf_func_a_b multiply; - gf_func_a_b divide; - gf_func_a inverse; - gf_region multiply_region; - gf_extract extract_word; - void *scratch; -} gf_t; - -/* Initializes the GF to defaults. Pass it a pointer to a gf_t. - Returns 0 on failure, 1 on success. */ - -extern int gf_init_easy(GFP gf, int w); - -/* Initializes the GF changing the defaults. - Returns 0 on failure, 1 on success. - Pass it a pointer to a gf_t. - For mult_type and divide_type, use one of gf_mult_type_t gf_divide_type_t . - For region_type, OR together the GF_REGION_xxx's defined above. - Use 0 as prim_poly for defaults. Otherwise, the leading 1 is optional. - Use NULL for scratch_memory to have init_hard allocate memory. Otherwise, - use gf_scratch_size() to determine how big scratch_memory has to be. - */ - -extern int gf_init_hard(GFP gf, - int w, - int mult_type, - int region_type, - int divide_type, - uint64_t prim_poly, - int arg1, - int arg2, - GFP base_gf, - void *scratch_memory); - -/* Determines the size for scratch_memory. - Returns 0 on failure and non-zero on success. */ - -extern int gf_scratch_size(int w, - int mult_type, - int region_type, - int divide_type, - int arg1, - int arg2); - -/* This reports the gf_scratch_size of a gf_t that has already been created */ - -extern int gf_size(GFP gf); - -/* Frees scratch memory if gf_init_easy/gf_init_hard called malloc. - If recursive = 1, then it calls itself recursively on base_gf. */ - -extern int gf_free(GFP gf, int recursive); - -/* This is support for inline single multiplications and divisions. - I know it's yucky, but if you've got to be fast, you've got to be fast. - We support inlining for w=4, w=8 and w=16. - - To use inline multiplication and division with w=4 or 8, you should use the - default gf_t, or one with a single table. Otherwise, gf_w4/8_get_mult_table() - will return NULL. 
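   For instance, the w=8 fast path looks like this (a sketch; the values of a
   and b are arbitrary and chosen only for illustration):

       gf_t gf;
       uint8_t *mt, a = 3, b = 7, c;
       gf_init_easy(&gf, 8);
       mt = gf_w8_get_mult_table(&gf);
       if (mt != NULL) c = GF_W8_INLINE_MULTDIV(mt, a, b);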
Similarly, with w=16, the gf_t must be LOG */ - -uint8_t *gf_w4_get_mult_table(GFP gf); -uint8_t *gf_w4_get_div_table(GFP gf); - -#define GF_W4_INLINE_MULTDIV(table, a, b) (table[((a)<<4)|(b)]) - -uint8_t *gf_w8_get_mult_table(GFP gf); -uint8_t *gf_w8_get_div_table(GFP gf); - -#define GF_W8_INLINE_MULTDIV(table, a, b) (table[(((uint32_t) (a))<<8)|(b)]) - -uint16_t *gf_w16_get_log_table(GFP gf); -uint16_t *gf_w16_get_mult_alog_table(GFP gf); -uint16_t *gf_w16_get_div_alog_table(GFP gf); - -#define GF_W16_INLINE_MULT(log, alog, a, b) ((a) == 0 || (b) == 0) ? 0 : (alog[(uint32_t)log[a]+(uint32_t)log[b]]) -#define GF_W16_INLINE_DIV(log, alog, a, b) ((a) == 0 || (b) == 0) ? 0 : (alog[(int)log[a]-(int)log[b]]) -#endif diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_general.h b/src/erasure-code/jerasure/gf-complete/include/gf_general.h deleted file mode 100644 index 9a5de529dc008..0000000000000 --- a/src/erasure-code/jerasure/gf-complete/include/gf_general.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic - * James S. Plank, Ethan L. Miller, Kevin M. Greenan, - * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. - * - * gf_general.h - * - * This file has helper routines for doing basic GF operations with any - * legal value of w. The problem is that w <= 32, w=64 and w=128 all have - * different data types, which is a pain. The procedures in this file try - * to alleviate that pain. They are used in gf_unit and gf_time. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "gf_complete.h" - -typedef union { - uint32_t w32; - uint64_t w64; - uint64_t w128[2]; -} gf_general_t; - -void gf_general_set_zero(gf_general_t *v, int w); -void gf_general_set_one(gf_general_t *v, int w); -void gf_general_set_two(gf_general_t *v, int w); - -int gf_general_is_zero(gf_general_t *v, int w); -int gf_general_is_one(gf_general_t *v, int w); -int gf_general_are_equal(gf_general_t *v1, gf_general_t *v2, int w); - -void gf_general_val_to_s(gf_general_t *v, int w, char *s, int hex); -int gf_general_s_to_val(gf_general_t *v, int w, char *s, int hex); - -void gf_general_set_random(gf_general_t *v, int w, int zero_ok); - -void gf_general_add(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c); -void gf_general_multiply(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c); -void gf_general_divide(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c); -void gf_general_inverse(gf_t *gf, gf_general_t *a, gf_general_t *b); - -void gf_general_do_region_multiply(gf_t *gf, gf_general_t *a, - void *ra, void *rb, - int bytes, int xor); - -void gf_general_do_region_check(gf_t *gf, gf_general_t *a, - void *orig_a, void *orig_target, void *final_target, - int bytes, int xor); - - -/* Which is M, D or I for multiply, divide or inverse. */ - -void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size); -int gf_general_do_single_timing_test(gf_t *gf, void *ra, void *rb, int size, char which); diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_int.h b/src/erasure-code/jerasure/gf-complete/include/gf_int.h deleted file mode 100644 index 2ce3d9817358c..0000000000000 --- a/src/erasure-code/jerasure/gf-complete/include/gf_int.h +++ /dev/null @@ -1,206 +0,0 @@ -/* - * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic - * James S. Plank, Ethan L. Miller, Kevin M. Greenan, - * Benjamin A. Arnold, John A. Burnum, Adam W. 
Disney, Allen C. McBride. - * - * gf_int.h - * - * Internal code for Galois field routines. This is not meant for - * users to include, but for the internal GF files to use. - */ - -#pragma once - -#include "gf_complete.h" - -#include - -extern void timer_start (double *t); -extern double timer_split (const double *t); -extern void galois_fill_random (void *buf, int len, unsigned int seed); - -#define GF_SSE2 0x01 -#define GF_SSSE3 0x02 -#define GF_SSE4 0x04 -#define GF_SSE4_PCLMUL 0x08 - -typedef struct { - int mult_type; - int region_type; - int divide_type; - int w; - uint64_t prim_poly; - int free_me; - int arg1; - int arg2; - gf_t *base_gf; - void *private; - uint32_t sse; -} gf_internal_t; - -extern int gf_w4_init (gf_t *gf); -extern int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); - -extern int gf_w8_init (gf_t *gf); -extern int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); - -extern int gf_w16_init (gf_t *gf); -extern int gf_w16_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); - -extern int gf_w32_init (gf_t *gf); -extern int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); - -extern int gf_w64_init (gf_t *gf); -extern int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); - -extern int gf_w128_init (gf_t *gf); -extern int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); - -extern int gf_wgen_init (gf_t *gf); -extern int gf_wgen_scratch_size(int w, int mult_type, int region_type, int divide_type, int arg1, int arg2); - -void gf_wgen_cauchy_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor); -gf_val_32_t gf_wgen_extract_word(gf_t *gf, void *start, int bytes, int index); - -extern void gf_alignment_error(char *s, int a); - -extern uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp); - -/* This returns the correct default for prim_poly when base is used as the base - field for COMPOSITE. It returns 0 if we don't have a default prim_poly. */ - -extern uint64_t gf_composite_get_default_poly(gf_t *base); - -/* This structure lets you define a region multiply. It helps because you can handle - unaligned portions of the data with the procedures below, which really cleans - up the code. */ - -typedef struct { - gf_t *gf; - void *src; - void *dest; - int bytes; - uint64_t val; - int xor; - int align; /* The number of bytes to which to align. */ - void *s_start; /* The start and the top of the aligned region. */ - void *d_start; - void *s_top; - void *d_top; -} gf_region_data; - -/* This lets you set up one of these in one call. It also sets the start/top pointers. 
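   A region implementation typically brackets its fast loop with these calls,
   along these lines (a sketch: 16 stands in for the SIMD alignment, and gf,
   src, dest, bytes, val and xor come from the caller):

       gf_region_data rd;
       gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
       gf_do_initial_region_alignment(&rd);
       ... aligned loop from rd.s_start/rd.d_start up to rd.s_top/rd.d_top ...
       gf_do_final_region_alignment(&rd);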
*/ - -void gf_set_region_data(gf_region_data *rd, - gf_t *gf, - void *src, - void *dest, - int bytes, - uint64_t val, - int xor, - int align); - -/* This performs gf->multiply.32() on all of the unaligned bytes in the beginning of the region */ - -extern void gf_do_initial_region_alignment(gf_region_data *rd); - -/* This performs gf->multiply.32() on all of the unaligned bytes in the end of the region */ - -extern void gf_do_final_region_alignment(gf_region_data *rd); - -extern void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base); - -extern void gf_multby_zero(void *dest, int bytes, int xor); -extern void gf_multby_one(void *src, void *dest, int bytes, int xor); - -typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */ - GF_E_MDEFREG, /* Reg != Default && Mult == Default */ - GF_E_MDEFARG, /* Args != Default && Mult == Default */ - GF_E_DIVCOMP, /* Mult == Composite && Div != Default */ - GF_E_CAUCOMP, /* Mult == Composite && Reg == CAUCHY */ - GF_E_DOUQUAD, /* Reg == DOUBLE && Reg == QUAD */ - GF_E_SSE__NO, /* Reg == SSE && Reg == NOSSE */ - GF_E_CAUCHYB, /* Reg == CAUCHY && Other Reg */ - GF_E_CAUGT32, /* Reg == CAUCHY && w > 32*/ - GF_E_ARG1SET, /* Arg1 != 0 && Mult \notin COMPOSITE/SPLIT/GROUP */ - GF_E_ARG2SET, /* Arg2 != 0 && Mult \notin SPLIT/GROUP */ - GF_E_MATRIXW, /* Div == MATRIX && w > 32 */ - GF_E_BAD___W, /* Illegal w */ - GF_E_DOUBLET, /* Reg == DOUBLE && Mult != TABLE */ - GF_E_DOUBLEW, /* Reg == DOUBLE && w \notin {4,8} */ - GF_E_DOUBLEJ, /* Reg == DOUBLE && other Reg */ - GF_E_DOUBLEL, /* Reg == DOUBLE & LAZY but w = 4 */ - GF_E_QUAD__T, /* Reg == QUAD && Mult != TABLE */ - GF_E_QUAD__W, /* Reg == QUAD && w != 4 */ - GF_E_QUAD__J, /* Reg == QUAD && other Reg */ - GF_E_LAZY__X, /* Reg == LAZY && not DOUBLE or QUAD*/ - GF_E_ALTSHIF, /* Mult == Shift && Reg == ALTMAP */ - GF_E_SSESHIF, /* Mult == Shift && Reg == SSE|NOSSE */ - GF_E_ALT_CFM, /* Mult == CARRY_FREE && Reg == ALTMAP */ - GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SSE|NOSSE */ - GF_E_PCLMULX, /* Mult == Carry_Free && No PCLMUL */ - GF_E_ALT_BY2, /* Mult == Bytwo_x && Reg == ALTMAP */ - GF_E_BY2_SSE, /* Mult == Bytwo_x && Reg == SSE && No SSE2 */ - GF_E_LOGBADW, /* Mult == LOGx, w too big*/ - GF_E_LOG___J, /* Mult == LOGx, && Reg == SSE|ALTMAP|NOSSE */ - GF_E_ZERBADW, /* Mult == LOG_ZERO, w \notin {8,16} */ - GF_E_ZEXBADW, /* Mult == LOG_ZERO_EXT, w != 8 */ - GF_E_LOGPOLY, /* Mult == LOG & poly not primitive */ - GF_E_GR_ARGX, /* Mult == GROUP, Bad arg1/2 */ - GF_E_GR_W_48, /* Mult == GROUP, w \in { 4, 8 } */ - GF_E_GR_W_16, /* Mult == GROUP, w == 16, arg1 != 4 || arg2 != 4 */ - GF_E_GR_128A, /* Mult == GROUP, w == 128, bad args */ - GF_E_GR_A_27, /* Mult == GROUP, either arg > 27 */ - GF_E_GR_AR_W, /* Mult == GROUP, either arg > w */ - GF_E_GR____J, /* Mult == GROUP, Reg == SSE|ALTMAP|NOSSE */ - GF_E_TABLE_W, /* Mult == TABLE, w too big */ - GF_E_TAB_SSE, /* Mult == TABLE, SSE|NOSSE only apply to w == 4 */ - GF_E_TABSSE3, /* Mult == TABLE, Need SSSE3 for SSE */ - GF_E_TAB_ALT, /* Mult == TABLE, Reg == ALTMAP */ - GF_E_SP128AR, /* Mult == SPLIT, w=128, Bad arg1/arg2 */ - GF_E_SP128AL, /* Mult == SPLIT, w=128, SSE requires ALTMAP */ - GF_E_SP128AS, /* Mult == SPLIT, w=128, ALTMAP requires SSE */ - GF_E_SP128_A, /* Mult == SPLIT, w=128, SSE only with 4/128 */ - GF_E_SP128_S, /* Mult == SPLIT, w=128, ALTMAP only with 4/128 */ - GF_E_SPLIT_W, /* Mult == SPLIT, Bad w (8, 16, 32, 64, 128) */ - GF_E_SP_16AR, /* Mult == SPLIT, w=16, Bad arg1/arg2 */ - GF_E_SP_16_A, /* Mult == SPLIT, 
w=16, ALTMAP only with 4/16 */
-              GF_E_SP_16_S, /* Mult == SPLIT, w=16, SSE only with 4/16 */
-              GF_E_SP_32AR, /* Mult == SPLIT, w=32, Bad arg1/arg2 */
-              GF_E_SP_32AS, /* Mult == SPLIT, w=32, ALTMAP requires SSE */
-              GF_E_SP_32_A, /* Mult == SPLIT, w=32, ALTMAP only with 4/32 */
-              GF_E_SP_32_S, /* Mult == SPLIT, w=32, SSE only with 4/32 */
-              GF_E_SP_64AR, /* Mult == SPLIT, w=64, Bad arg1/arg2 */
-              GF_E_SP_64AS, /* Mult == SPLIT, w=64, ALTMAP requires SSE */
-              GF_E_SP_64_A, /* Mult == SPLIT, w=64, ALTMAP only with 4/64 */
-              GF_E_SP_64_S, /* Mult == SPLIT, w=64, SSE only with 4/64 */
-              GF_E_SP_8_AR, /* Mult == SPLIT, w=8, Bad arg1/arg2 */
-              GF_E_SP_8__A, /* Mult == SPLIT, w=8, no ALTMAP */
-              GF_E_SP_SSE3, /* Mult == SPLIT, Need SSSE3 for SSE */
-              GF_E_COMP_A2, /* Mult == COMP, arg1 must be = 2 */
-              GF_E_COMP_SS, /* Mult == COMP, SSE|NOSSE */
-              GF_E_COMP__W, /* Mult == COMP, Bad w. */
-              GF_E_UNKFLAG, /* Unknown flag in create_from.... */
-              GF_E_UNKNOWN, /* Unknown mult_type. */
-              GF_E_UNK_REG, /* Unknown region_type. */
-              GF_E_UNK_DIV, /* Unknown divide_type. */
-              GF_E_CFM___W, /* Mult == CFM, Bad w. */
-              GF_E_CFM4POL, /* Mult == CFM & Prim Poly has high bits set. */
-              GF_E_CFM8POL, /* Mult == CFM & Prim Poly has high bits set. */
-              GF_E_CF16POL, /* Mult == CFM & Prim Poly has high bits set. */
-              GF_E_CF32POL, /* Mult == CFM & Prim Poly has high bits set. */
-              GF_E_CF64POL, /* Mult == CFM & Prim Poly has high bits set. */
-              GF_E_FEWARGS, /* Too few args in argc/argv. */
-              GF_E_BADPOLY, /* Bad primitive polynomial -- too many bits set. */
-              GF_E_COMP_PP, /* Bad primitive polynomial -- bigger than sub-field. */
-              GF_E_COMPXPP, /* Can't derive a default pp for composite field. */
-              GF_E_BASE__W, /* Composite -- Base field is the wrong size. */
-              GF_E_TWOMULT, /* In create_from... two -m's. */
-              GF_E_TWO_DIV, /* In create_from... two -d's. */
-              GF_E_POLYSPC, /* Bad number after -p. */
-              GF_E_SPLITAR, /* Ran out of arguments in SPLIT */
-              GF_E_SPLITNU, /* Arguments not integers in SPLIT. */
-              GF_E_GROUPAR, /* Ran out of arguments in GROUP */
-              GF_E_GROUPNU, /* Arguments not integers in GROUP. */
-              GF_E_DEFAULT } gf_error_type_t;
-
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_method.h b/src/erasure-code/jerasure/gf-complete/include/gf_method.h
deleted file mode 100644
index 880b349676dc9..0000000000000
--- a/src/erasure-code/jerasure/gf-complete/include/gf_method.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
- * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
- * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
- *
- * gf_method.h
- *
- * Parses argv to figure out the flags and arguments. Creates the gf.
- */
-
-#pragma once
-
-#include "gf_complete.h"
-
-/* Parses argv starting at "starting".
-
-   Returns 0 on failure.
-   On success, it returns one past the last argument it read in argv. */
-
-extern int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting);
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_rand.h b/src/erasure-code/jerasure/gf-complete/include/gf_rand.h
deleted file mode 100644
index 24294adc704fe..0000000000000
--- a/src/erasure-code/jerasure/gf-complete/include/gf_rand.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
- * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
- * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
- * - * gf_rand.h - * - * Random number generation, using the "Mother of All" random number generator. */ - -#pragma once -#include -#include -#include - -/* These are all pretty self-explanatory */ -uint32_t MOA_Random_32(); -uint64_t MOA_Random_64(); -void MOA_Random_128(uint64_t *x); -uint32_t MOA_Random_W(int w, int zero_ok); -void MOA_Fill_Random_Region (void *reg, int size); /* reg should be aligned to 4 bytes, but - size can be anything. */ -void MOA_Seed(uint32_t seed); diff --git a/src/erasure-code/jerasure/gf-complete/src/gf.c b/src/erasure-code/jerasure/gf-complete/src/gf.c deleted file mode 100644 index 0e475b93408ef..0000000000000 --- a/src/erasure-code/jerasure/gf-complete/src/gf.c +++ /dev/null @@ -1,1039 +0,0 @@ -/* - * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic - * James S. Plank, Ethan L. Miller, Kevin M. Greenan, - * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. - * - * gf.c - * - * Generic routines for Galois fields - */ - -#include "gf_int.h" -#include -#include - -int _gf_errno = GF_E_DEFAULT; - -void gf_error() -{ - char *s; - - switch(_gf_errno) { - case GF_E_DEFAULT: s = "No Error."; break; - case GF_E_TWOMULT: s = "Cannot specify two -m's."; break; - case GF_E_TWO_DIV: s = "Cannot specify two -d's."; break; - case GF_E_POLYSPC: s = "-p needs to be followed by a number in hex (0x optional)."; break; - case GF_E_GROUPAR: s = "Ran out of arguments in -m GROUP."; break; - case GF_E_GROUPNU: s = "In -m GROUP g_s g_r -- g_s and g_r need to be numbers."; break; - case GF_E_SPLITAR: s = "Ran out of arguments in -m SPLIT."; break; - case GF_E_SPLITNU: s = "In -m SPLIT w_a w_b -- w_a and w_b need to be numbers."; break; - case GF_E_FEWARGS: s = "Not enough arguments (Perhaps end with '-'?)"; break; - case GF_E_CFM___W: s = "-m CARRY_FREE, w must be 4, 8, 16, 32, 64 or 128."; break; - case GF_E_COMPXPP: s = "-m COMPOSITE, No poly specified, and we don't have a default for the given sub-field."; break; - case GF_E_BASE__W: s = "-m COMPOSITE and the base field is not for w/2."; break; - case GF_E_CFM4POL: s = "-m CARRY_FREE, w=4. (Prim-poly & 0xc) must equal 0."; break; - case GF_E_CFM8POL: s = "-m CARRY_FREE, w=8. (Prim-poly & 0x80) must equal 0."; break; - case GF_E_CF16POL: s = "-m CARRY_FREE, w=16. (Prim-poly & 0xe000) must equal 0."; break; - case GF_E_CF32POL: s = "-m CARRY_FREE, w=32. (Prim-poly & 0xfe000000) must equal 0."; break; - case GF_E_CF64POL: s = "-m CARRY_FREE, w=64. 
(Prim-poly & 0xfffe000000000000ULL) must equal 0."; break; - case GF_E_MDEFDIV: s = "If multiplication method == default, can't change division."; break; - case GF_E_MDEFREG: s = "If multiplication method == default, can't change region."; break; - case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break; - case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break; - case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break; - case GF_E_SSE__NO: s = "Cannot specify -r SSE and -r NOSSE."; break; - case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break; - case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break; - case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break; - case GF_E_ARG1SET: s = "Only use arg1 with SPLIT, GROUP or COMPOSITE."; break; - case GF_E_ARG2SET: s = "Only use arg2 with SPLIT or GROUP."; break; - case GF_E_MATRIXW: s = "Cannot specify -d MATRIX with w > 32."; break; - case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break; - case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break; - case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break; - case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SSE|NOSSE."; break; - case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break; - case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break; - case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break; - case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SSE|NOSSE."; break; - case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break; - case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break; - case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break; - case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break; - case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SSE|NOSSE."; break; - case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break; - case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SSE|NOSSE."; break; - case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break; - case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break; - case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SSE, but SSE2 is not supported."; break; - case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break; - case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SSE|NOSSE."; break; - case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break; - case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break; - case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break; - case GF_E_GR_ARGX: s = "With -m GROUP, arg1 and arg2 must be >= 0."; break; - case GF_E_GR_W_48: s = "With -m GROUP, w cannot be 4 or 8."; break; - case GF_E_GR_W_16: s = "With -m GROUP, w == 16, arg1 and arg2 must be 4."; break; - case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break; - case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break; - case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break; - case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SSE|NOSSE."; break; - case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; break; - case GF_E_TAB_SSE: s = "With -m TABLE, SSE|NOSSE only applies to 
w=4."; break; - case GF_E_TABSSE3: s = "With -m TABLE, -r SSE, you need SSSE3 supported."; break; - case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break; - case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break; - case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SSE requires -r ALTMAP."; break; - case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break; - case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r SSE|NOSSE only with arg1/arg2 = 4/128."; break; - case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break; - case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break; - case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break; - case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break; - case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SSE|NOSSE only with arg1/arg2 = 4/16."; break; - case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break; - case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break; - case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break; - case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SSE|NOSSE only with arg1/arg2 = 4/32."; break; - case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break; - case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break; - case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break; - case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SSE|NOSSE only with arg1/arg2 = 4/64."; break; - case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break; - case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break; - case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SSE."; break; - case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break; - case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SSE and -r NOSSE do not apply."; break; - case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break; - case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break; - case GF_E_UNKNOWN: s = "Unknown multiplication type."; break; - case GF_E_UNK_REG: s = "Unknown region type."; break; - case GF_E_UNK_DIV: s = "Unknown division type."; break; - default: s = "Undefined error."; - } - - fprintf(stderr, "%s\n", s); -} - -uint64_t gf_composite_get_default_poly(gf_t *base) -{ - gf_internal_t *h; - int rv; - - h = (gf_internal_t *) base->scratch; - if (h->w == 4) { - if (h->mult_type == GF_MULT_COMPOSITE) return 0; - if (h->prim_poly == 0x13) return 2; - return 0; - } - if (h->w == 8) { - if (h->mult_type == GF_MULT_COMPOSITE) return 0; - if (h->prim_poly == 0x11d) return 3; - return 0; - } - if (h->w == 16) { - if (h->mult_type == GF_MULT_COMPOSITE) { - rv = gf_composite_get_default_poly(h->base_gf); - if (rv != h->prim_poly) return 0; - if (rv == 3) return 0x105; - return 0; - } else { - if (h->prim_poly == 0x1100b) return 2; - if (h->prim_poly == 0x1002d) return 7; - return 0; - } - } - if (h->w == 32) { - if (h->mult_type == GF_MULT_COMPOSITE) { - rv = gf_composite_get_default_poly(h->base_gf); - if (rv != h->prim_poly) return 0; - if (rv == 2) return 0x10005; - if (rv == 7) return 0x10008; - if (rv == 0x105) return 0x10002; - return 0; - } else { - if (h->prim_poly == 0x400007) return 2; - if (h->prim_poly == 0xc5) return 3; - return 0; - } - } - if (h->w == 64) { - if (h->mult_type 
== GF_MULT_COMPOSITE) { - rv = gf_composite_get_default_poly(h->base_gf); - if (rv != h->prim_poly) return 0; - if (rv == 3) return 0x100000009ULL; - if (rv == 2) return 0x100000004ULL; - if (rv == 0x10005) return 0x100000003ULL; - if (rv == 0x10002) return 0x100000005ULL; - if (rv == 0x10008) return 0x100000006ULL; /* JSP: (0x0x100000003 works too, - but I want to differentiate cases). */ - return 0; - } else { - if (h->prim_poly == 0x1bULL) return 2; - return 0; - } - } - return 0; -} - -int gf_error_check(int w, int mult_type, int region_type, int divide_type, - int arg1, int arg2, uint64_t poly, gf_t *base) -{ - int sse3 = 0; - int sse2 = 0; - int pclmul = 0; - int rdouble, rquad, rlazy, rsse, rnosse, raltmap, rcauchy, tmp; - gf_internal_t *sub; - - rdouble = (region_type & GF_REGION_DOUBLE_TABLE); - rquad = (region_type & GF_REGION_QUAD_TABLE); - rlazy = (region_type & GF_REGION_LAZY); - rsse = (region_type & GF_REGION_SSE); - rnosse = (region_type & GF_REGION_NOSSE); - raltmap = (region_type & GF_REGION_ALTMAP); - rcauchy = (region_type & GF_REGION_CAUCHY); - - if (divide_type != GF_DIVIDE_DEFAULT && - divide_type != GF_DIVIDE_MATRIX && - divide_type != GF_DIVIDE_EUCLID) { - _gf_errno = GF_E_UNK_DIV; - return 0; - } - - tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY | - GF_REGION_SSE | GF_REGION_NOSSE | GF_REGION_ALTMAP | GF_REGION_CAUCHY ); - if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; } - -#ifdef INTEL_SSE2 - sse2 = 1; -#endif - -#ifdef INTEL_SSSE3 - sse3 = 1; -#endif - -#ifdef INTEL_SSE4_PCLMUL - pclmul = 1; -#endif - - - if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; } - - if (mult_type != GF_MULT_COMPOSITE && w < 64) { - if ((poly >> (w+1)) != 0) { _gf_errno = GF_E_BADPOLY; return 0; } - } - - if (mult_type == GF_MULT_DEFAULT) { - if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_MDEFDIV; return 0; } - if (region_type != GF_REGION_DEFAULT) { _gf_errno = GF_E_MDEFREG; return 0; } - if (arg1 != 0 || arg2 != 0) { _gf_errno = GF_E_MDEFARG; return 0; } - return 1; - } - - if (rsse && rnosse) { _gf_errno = GF_E_SSE__NO; return 0; } - if (rcauchy && w > 32) { _gf_errno = GF_E_CAUGT32; return 0; } - if (rcauchy && region_type != GF_REGION_CAUCHY) { _gf_errno = GF_E_CAUCHYB; return 0; } - if (rcauchy && mult_type == GF_MULT_COMPOSITE) { _gf_errno = GF_E_CAUCOMP; return 0; } - - if (arg1 != 0 && mult_type != GF_MULT_COMPOSITE && - mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) { - _gf_errno = GF_E_ARG1SET; - return 0; - } - - if (arg2 != 0 && mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) { - _gf_errno = GF_E_ARG2SET; - return 0; - } - - if (divide_type == GF_DIVIDE_MATRIX && w > 32) { _gf_errno = GF_E_MATRIXW; return 0; } - - if (rdouble) { - if (rquad) { _gf_errno = GF_E_DOUQUAD; return 0; } - if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; } - if (w != 4 && w != 8) { _gf_errno = GF_E_DOUBLEW; return 0; } - if (rsse || rnosse || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; } - if (rlazy && w == 4) { _gf_errno = GF_E_DOUBLEL; return 0; } - return 1; - } - - if (rquad) { - if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; } - if (w != 4) { _gf_errno = GF_E_QUAD__W; return 0; } - if (rsse || rnosse || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; } - return 1; - } - - if (rlazy) { _gf_errno = GF_E_LAZY__X; return 0; } - - if (mult_type == GF_MULT_SHIFT) { - if (raltmap) { _gf_errno = GF_E_ALTSHIF; return 0; } - if (rsse || 
rnosse) { _gf_errno = GF_E_SSESHIF; return 0; } - return 1; - } - - if (mult_type == GF_MULT_CARRY_FREE) { - if (w != 4 && w != 8 && w != 16 && - w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; } - if (w == 4 && (poly & 0xc)) { _gf_errno = GF_E_CFM4POL; return 0; } - if (w == 8 && (poly & 0x80)) { _gf_errno = GF_E_CFM8POL; return 0; } - if (w == 16 && (poly & 0xe000)) { _gf_errno = GF_E_CF16POL; return 0; } - if (w == 32 && (poly & 0xfe000000)) { _gf_errno = GF_E_CF32POL; return 0; } - if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; } - if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; } - if (rsse || rnosse) { _gf_errno = GF_E_SSE_CFM; return 0; } - if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; } - return 1; - } - - if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) { - if (raltmap) { _gf_errno = GF_E_ALT_BY2; return 0; } - if (rsse && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; } - return 1; - } - - if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO - || mult_type == GF_MULT_LOG_ZERO_EXT ) { - if (w > 27) { _gf_errno = GF_E_LOGBADW; return 0; } - if (raltmap || rsse || rnosse) { _gf_errno = GF_E_LOG___J; return 0; } - - if (mult_type == GF_MULT_LOG_TABLE) return 1; - - if (w != 8 && w != 16) { _gf_errno = GF_E_ZERBADW; return 0; } - - if (mult_type == GF_MULT_LOG_ZERO) return 1; - - if (w != 8) { _gf_errno = GF_E_ZEXBADW; return 0; } - return 1; - } - - if (mult_type == GF_MULT_GROUP) { - if (arg1 <= 0 || arg2 <= 0) { _gf_errno = GF_E_GR_ARGX; return 0; } - if (w == 4 || w == 8) { _gf_errno = GF_E_GR_W_48; return 0; } - if (w == 16 && (arg1 != 4 || arg2 != 4)) { _gf_errno = GF_E_GR_W_16; return 0; } - if (w == 128 && (arg1 != 4 || - (arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; } - if (arg1 > 27 || arg2 > 27) { _gf_errno = GF_E_GR_A_27; return 0; } - if (arg1 > w || arg2 > w) { _gf_errno = GF_E_GR_AR_W; return 0; } - if (raltmap || rsse || rnosse) { _gf_errno = GF_E_GR____J; return 0; } - return 1; - } - - if (mult_type == GF_MULT_TABLE) { - if (w != 16 && w >= 15) { _gf_errno = GF_E_TABLE_W; return 0; } - if (w != 4 && (rsse || rnosse)) { _gf_errno = GF_E_TAB_SSE; return 0; } - if (rsse && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; } - if (raltmap) { _gf_errno = GF_E_TAB_ALT; return 0; } - return 1; - } - - if (mult_type == GF_MULT_SPLIT_TABLE) { - if (arg1 > arg2) { - tmp = arg1; - arg1 = arg2; - arg2 = tmp; - } - if (w == 8) { - if (arg1 != 4 || arg2 != 8) { _gf_errno = GF_E_SP_8_AR; return 0; } - if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } - if (raltmap) { _gf_errno = GF_E_SP_8__A; return 0; } - } else if (w == 16) { - if (arg1 == 4 && arg2 == 16) { - if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } - } else if (arg1 == 8 && (arg2 == 16 || arg2 == 8)) { - if (rsse || rnosse) { _gf_errno = GF_E_SP_16_S; return 0; } - if (raltmap) { _gf_errno = GF_E_SP_16_A; return 0; } - } else { _gf_errno = GF_E_SP_16AR; return 0; } - } else if (w == 32) { - if ((arg1 == 8 && arg2 == 8) || - (arg1 == 8 && arg2 == 32) || - (arg1 == 16 && arg2 == 32)) { - if (rsse || rnosse) { _gf_errno = GF_E_SP_32_S; return 0; } - if (raltmap) { _gf_errno = GF_E_SP_32_A; return 0; } - } else if ((arg1 == 4 && arg2 == 32) || - (arg1 == 4 && arg2 == 32)) { - if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } - if (raltmap && arg1 != 4) { _gf_errno = GF_E_SP_32_A; return 0; } - if (raltmap && !sse3) { _gf_errno = GF_E_SP_32AS; return 0; } - if (raltmap && 
rnosse) { _gf_errno = GF_E_SP_32AS; return 0; } - } else { _gf_errno = GF_E_SP_32AR; return 0; } - } else if (w == 64) { - if ((arg1 == 8 && arg2 == 8) || - (arg1 == 8 && arg2 == 64) || - (arg1 == 16 && arg2 == 64)) { - if (rsse || rnosse) { _gf_errno = GF_E_SP_64_S; return 0; } - if (raltmap) { _gf_errno = GF_E_SP_64_A; return 0; } - } else if (arg1 == 4 && arg2 == 64) { - if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } - if (raltmap && !sse3) { _gf_errno = GF_E_SP_64AS; return 0; } - if (raltmap && rnosse) { _gf_errno = GF_E_SP_64AS; return 0; } - } else { _gf_errno = GF_E_SP_64AR; return 0; } - } else if (w == 128) { - if (arg1 == 8 && arg2 == 128) { - if (rsse || rnosse) { _gf_errno = GF_E_SP128_S; return 0; } - if (raltmap) { _gf_errno = GF_E_SP128_A; return 0; } - } else if (arg1 == 4 && arg2 == 128) { - if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } - if (raltmap && !sse3) { _gf_errno = GF_E_SP128AS; return 0; } - if (raltmap && rnosse) { _gf_errno = GF_E_SP128AS; return 0; } - } else { _gf_errno = GF_E_SP128AR; return 0; } - } else { _gf_errno = GF_E_SPLIT_W; return 0; } - return 1; - } - - if (mult_type == GF_MULT_COMPOSITE) { - if (w != 8 && w != 16 && w != 32 - && w != 64 && w != 128) { _gf_errno = GF_E_COMP__W; return 0; } - if ((poly >> (w/2)) != 0) { _gf_errno = GF_E_COMP_PP; return 0; } - if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_DIVCOMP; return 0; } - if (arg1 != 2) { _gf_errno = GF_E_COMP_A2; return 0; } - if (rsse || rnosse) { _gf_errno = GF_E_COMP_SS; return 0; } - if (base != NULL) { - sub = (gf_internal_t *) base->scratch; - if (sub->w != w/2) { _gf_errno = GF_E_BASE__W; return 0; } - if (poly == 0) { - if (gf_composite_get_default_poly(base) == 0) { _gf_errno = GF_E_COMPXPP; return 0; } - } - } - return 1; - } - - _gf_errno = GF_E_UNKNOWN; - return 0; -} - -int gf_scratch_size(int w, - int mult_type, - int region_type, - int divide_type, - int arg1, - int arg2) -{ - if (gf_error_check(w, mult_type, region_type, divide_type, arg1, arg2, 0, NULL) == 0) return 0; - - switch(w) { - case 4: return gf_w4_scratch_size(mult_type, region_type, divide_type, arg1, arg2); - case 8: return gf_w8_scratch_size(mult_type, region_type, divide_type, arg1, arg2); - case 16: return gf_w16_scratch_size(mult_type, region_type, divide_type, arg1, arg2); - case 32: return gf_w32_scratch_size(mult_type, region_type, divide_type, arg1, arg2); - case 64: return gf_w64_scratch_size(mult_type, region_type, divide_type, arg1, arg2); - case 128: return gf_w128_scratch_size(mult_type, region_type, divide_type, arg1, arg2); - default: return gf_wgen_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2); - } -} - -extern int gf_size(gf_t *gf) -{ - gf_internal_t *h; - int s; - - s = sizeof(gf_t); - h = (gf_internal_t *) gf->scratch; - s += gf_scratch_size(h->w, h->mult_type, h->region_type, h->divide_type, h->arg1, h->arg2); - if (h->mult_type == GF_MULT_COMPOSITE) s += gf_size(h->base_gf); - return s; -} - - -int gf_init_easy(gf_t *gf, int w) -{ - return gf_init_hard(gf, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, - 0, 0, 0, NULL, NULL); -} - -/* Allen: What's going on here is this function is putting info into the - scratch mem of gf, and then calling the relevant REAL init - func for the word size. Probably done this way to consolidate - those aspects of initialization that don't rely on word size, - and then take care of word-size-specific stuff. 
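   A call from user code looks like this (a sketch: it picks SPLIT 4/16 for
   w=16, passes prim_poly = 0 to get the default polynomial, and lets
   gf_init_hard() allocate its own scratch space):

       gf_t gf;
       if (!gf_init_hard(&gf, 16, GF_MULT_SPLIT_TABLE, GF_REGION_DEFAULT,
                         GF_DIVIDE_DEFAULT, 0, 4, 16, NULL, NULL)) {
         gf_error();
       }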
*/ - -int gf_init_hard(gf_t *gf, int w, int mult_type, - int region_type, - int divide_type, - uint64_t prim_poly, - int arg1, int arg2, - gf_t *base_gf, - void *scratch_memory) -{ - int sz; - gf_internal_t *h; - - if (gf_error_check(w, mult_type, region_type, divide_type, - arg1, arg2, prim_poly, base_gf) == 0) return 0; - - sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2); - if (sz <= 0) return 0; /* This shouldn't happen, as all errors should get caught - in gf_error_check() */ - - if (scratch_memory == NULL) { - h = (gf_internal_t *) malloc(sz); - h->free_me = 1; - } else { - h = scratch_memory; - h->free_me = 0; - } - gf->scratch = (void *) h; - h->mult_type = mult_type; - h->region_type = region_type; - h->divide_type = divide_type; - h->w = w; - h->prim_poly = prim_poly; - h->arg1 = arg1; - h->arg2 = arg2; - h->base_gf = base_gf; - h->private = (void *) gf->scratch; - h->private = (char*)h->private + (sizeof(gf_internal_t)); - h->sse = 0x00; -#ifdef INTEL_SSE2 - h->sse |= GF_SSE2; -#endif -#ifdef INTEL_SSSE3 - h->sse |= GF_SSSE3; -#endif -#ifdef INTEL_SSE4 - h->sse |= GF_SSE4; -#endif -#ifdef INTEL_SSE4_PCLMUL - h->sse |= GF_SSE4_PCLMUL; -#endif - gf->extract_word.w32 = NULL; - - switch(w) { - case 4: return gf_w4_init(gf); - case 8: return gf_w8_init(gf); - case 16: return gf_w16_init(gf); - case 32: return gf_w32_init(gf); - case 64: return gf_w64_init(gf); - case 128: return gf_w128_init(gf); - default: return gf_wgen_init(gf); - } -} - -int gf_free(gf_t *gf, int recursive) -{ - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - if (recursive && h->base_gf != NULL) { - gf_free(h->base_gf, 1); - free(h->base_gf); - } - if (h->free_me) free(h); - return 0; /* Making compiler happy */ -} - -void gf_alignment_error(char *s, int a) -{ - fprintf(stderr, "Alignment error in %s:\n", s); - fprintf(stderr, " The source and destination buffers must be aligned to each other,\n"); - fprintf(stderr, " and they must be aligned to a %d-byte address.\n", a); - exit(1); -} - -static -void gf_invert_binary_matrix(uint32_t *mat, uint32_t *inv, int rows) { - int cols, i, j; - uint32_t tmp; - - cols = rows; - - for (i = 0; i < rows; i++) inv[i] = (1 << i); - - /* First -- convert into upper triangular */ - - for (i = 0; i < cols; i++) { - - /* Swap rows if we ave a zero i,i element. If we can't swap, then the - matrix was not invertible */ - - if ((mat[i] & (1 << i)) == 0) { - for (j = i+1; j < rows && (mat[j] & (1 << i)) == 0; j++) ; - if (j == rows) { - fprintf(stderr, "galois_invert_matrix: Matrix not invertible!!\n"); - exit(1); - } - tmp = mat[i]; mat[i] = mat[j]; mat[j] = tmp; - tmp = inv[i]; inv[i] = inv[j]; inv[j] = tmp; - } - - /* Now for each j>i, add A_ji*Ai to Aj */ - for (j = i+1; j != rows; j++) { - if ((mat[j] & (1 << i)) != 0) { - mat[j] ^= mat[i]; - inv[j] ^= inv[i]; - } - } - } - - /* Now the matrix is upper triangular. Start at the top and multiply down */ - - for (i = rows-1; i >= 0; i--) { - for (j = 0; j < i; j++) { - if (mat[j] & (1 << i)) { - /* mat[j] ^= mat[i]; */ - inv[j] ^= inv[i]; - } - } - } -} - -uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp) -{ - uint32_t mat[32], inv[32], mask; - int i; - - mask = (w == 32) ? 
0xffffffff : (1 << w) - 1; - for (i = 0; i < w; i++) { - mat[i] = y; - - if (y & (1 << (w-1))) { - y = y << 1; - y = ((y ^ pp) & mask); - } else { - y = y << 1; - } - } - - gf_invert_binary_matrix(mat, inv, w); - return inv[0]; -} - -void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base) -{ - uint64_t a, prod; - int xor; - uint64_t *s64, *d64, *top; - - s64 = rd->s_start; - d64 = rd->d_start; - top = rd->d_top; - xor = rd->xor; - - if (xor) { - while (d64 != top) { - a = *s64; - prod = base[a >> 48]; - a <<= 16; - prod <<= 16; - prod ^= base[a >> 48]; - a <<= 16; - prod <<= 16; - prod ^= base[a >> 48]; - a <<= 16; - prod <<= 16; - prod ^= base[a >> 48]; - prod ^= *d64; - *d64 = prod; - s64++; - d64++; - } - } else { - while (d64 != top) { - a = *s64; - prod = base[a >> 48]; - a <<= 16; - prod <<= 16; - prod ^= base[a >> 48]; - a <<= 16; - prod <<= 16; - prod ^= base[a >> 48]; - a <<= 16; - prod <<= 16; - prod ^= base[a >> 48]; - *d64 = prod; - s64++; - d64++; - } - } -} - -static void gf_slow_multiply_region(gf_region_data *rd, void *src, void *dest, void *s_top) -{ - uint8_t *s8, *d8; - uint16_t *s16, *d16; - uint32_t *s32, *d32; - uint64_t *s64, *d64; - gf_internal_t *h; - int wb; - uint32_t p, a; - - h = rd->gf->scratch; - wb = (h->w)/8; - if (wb == 0) wb = 1; - - while (src < s_top) { - switch (h->w) { - case 8: - s8 = (uint8_t *) src; - d8 = (uint8_t *) dest; - *d8 = (rd->xor) ? (*d8 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s8)) : - rd->gf->multiply.w32(rd->gf, rd->val, *s8); - break; - case 4: - s8 = (uint8_t *) src; - d8 = (uint8_t *) dest; - a = *s8; - p = rd->gf->multiply.w32(rd->gf, rd->val, a&0xf); - p |= (rd->gf->multiply.w32(rd->gf, rd->val, a >> 4) << 4); - if (rd->xor) p ^= *d8; - *d8 = p; - break; - case 16: - s16 = (uint16_t *) src; - d16 = (uint16_t *) dest; - *d16 = (rd->xor) ? (*d16 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s16)) : - rd->gf->multiply.w32(rd->gf, rd->val, *s16); - break; - case 32: - s32 = (uint32_t *) src; - d32 = (uint32_t *) dest; - *d32 = (rd->xor) ? (*d32 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s32)) : - rd->gf->multiply.w32(rd->gf, rd->val, *s32); - break; - case 64: - s64 = (uint64_t *) src; - d64 = (uint64_t *) dest; - *d64 = (rd->xor) ? (*d64 ^ rd->gf->multiply.w64(rd->gf, rd->val, *s64)) : - rd->gf->multiply.w64(rd->gf, rd->val, *s64); - break; - default: - fprintf(stderr, "Error: gf_slow_multiply_region: w=%d not implemented.\n", h->w); - exit(1); - } - src = (char*)src + wb; - dest = (char*)dest + wb; - } -} - -/* JSP - The purpose of this procedure is to error check alignment, - and to set up the region operation so that it can best leverage - large words. - - It stores its information in rd. - - Assuming you're not doing Cauchy coding, (see below for that), - then w will be 4, 8, 16, 32 or 64. It can't be 128 (probably - should change that). - - src and dest must then be aligned on ceil(w/8)-byte boundaries. - Moreover, bytes must be a multiple of ceil(w/8). If the variable - align is equal to ceil(w/8), then we will set s_start = src, - d_start = dest, s_top to (src+bytes) and d_top to (dest+bytes). - And we return -- the implementation will go ahead and do the - multiplication on individual words (e.g. using discrete logs). - - If align is greater than ceil(w/8), then the implementation needs - to work on groups of "align" bytes. For example, suppose you are - implementing BYTWO, without SSE. Then you will be doing the region - multiplication in units of 8 bytes, so align = 8. 
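   (Concretely, as a sketch of that case: if src were 0x1006 with align = 8,
   the two bytes at 0x1006 and 0x1007 would be peeled off and handled as
   single words, and the aligned loop would start at s_start = 0x1008.)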
Or, suppose you - are doing a Quad table in GF(2^4). You will be doing the region - multiplication in units of 2 bytes, so align = 2. Or, suppose you - are doing split multiplication with SSE operations in GF(2^8). - Then align = 16. Worse yet, suppose you are doing split - multiplication with SSE operations in GF(2^16), with or without - ALTMAP. Then, you will be doing the multiplication on 256 bits at - a time. So align = 32. - - When align does not equal ceil(w/8), we split the region - multiplication into three parts. We are going to make s_start be - the first address greater than or equal to src that is a multiple - of align. s_top is going to be the largest address >= src+bytes - such that (s_top - s_start) is a multiple of align. We do the - same with d_start and d_top. When we say that "src and dest must - be aligned with respect to each other, we mean that s_start-src - must equal d_start-dest. - - Now, the region multiplication is done in three parts -- the part - between src and s_start must be done using single words. - Similarly, the part between s_top and src+bytes must also be done - using single words. The part between s_start and s_top will be - done in chunks of "align" bytes. - - One final thing -- if align > 16, then s_start and d_start will be - aligned on a 16 byte boundary. Perhaps we should have two - variables: align and chunksize. Then we'd have s_start & d_start - aligned to "align", and have s_top-s_start be a multiple of - chunksize. That may be less confusing, but it would be a big - change. - - Finally, if align = -1, then we are doing Cauchy multiplication, - using only XOR's. In this case, we're not going to care about - alignment because we are just doing XOR's. Instead, the only - thing we care about is that bytes must be a multiple of w. - - This is not to say that alignment doesn't matter in performance - with XOR's. See that discussion in gf_multby_one(). - - After you call gf_set_region_data(), the procedure - gf_do_initial_region_alignment() calls gf->multiply.w32() on - everything between src and s_start. The procedure - gf_do_final_region_alignment() calls gf->multiply.w32() on - everything between s_top and src+bytes. - */ - -void gf_set_region_data(gf_region_data *rd, - gf_t *gf, - void *src, - void *dest, - int bytes, - uint64_t val, - int xor, - int align) -{ - gf_internal_t *h = NULL; - int wb; - uint32_t a; - unsigned long uls, uld; - - if (gf == NULL) { /* JSP - Can be NULL if you're just doing XOR's */ - wb = 1; - } else { - h = gf->scratch; - wb = (h->w)/8; - if (wb == 0) wb = 1; - } - - rd->gf = gf; - rd->src = src; - rd->dest = dest; - rd->bytes = bytes; - rd->val = val; - rd->xor = xor; - rd->align = align; - - uls = (unsigned long) src; - uld = (unsigned long) dest; - - a = (align <= 16) ? align : 16; - - if (align == -1) { /* JSP: This is cauchy. Error check bytes, then set up the pointers - so that there are no alignment regions. */ - if (h != NULL && bytes % h->w != 0) { - fprintf(stderr, "Error in region multiply operation.\n"); - fprintf(stderr, "The size must be a multiple of %d bytes.\n", h->w); - exit(1); - } - - rd->s_start = src; - rd->d_start = dest; - rd->s_top = (char*)src + bytes; - rd->d_top = (char*)src + bytes; - return; - } - - if (uls % a != uld % a) { - fprintf(stderr, "Error in region multiply operation.\n"); - fprintf(stderr, "The source & destination pointers must be aligned with respect\n"); - fprintf(stderr, "to each other along a %d byte boundary.\n", a); - fprintf(stderr, "Src = 0x%lx. 
Dest = 0x%lx\n", (unsigned long) src, - (unsigned long) dest); - exit(1); - } - - if (uls % wb != 0) { - fprintf(stderr, "Error in region multiply operation.\n"); - fprintf(stderr, "The pointers must be aligned along a %d byte boundary.\n", wb); - fprintf(stderr, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src, - (unsigned long) dest); - exit(1); - } - - if (bytes % wb != 0) { - fprintf(stderr, "Error in region multiply operation.\n"); - fprintf(stderr, "The size must be a multiple of %d bytes.\n", wb); - exit(1); - } - - uls %= a; - if (uls != 0) uls = (a-uls); - rd->s_start = (char*)rd->src + uls; - rd->d_start = (char*)rd->dest + uls; - bytes -= uls; - bytes -= (bytes % align); - rd->s_top = (char*)rd->s_start + bytes; - rd->d_top = (char*)rd->d_start + bytes; - -} - -void gf_do_initial_region_alignment(gf_region_data *rd) -{ - gf_slow_multiply_region(rd, rd->src, rd->dest, rd->s_start); -} - -void gf_do_final_region_alignment(gf_region_data *rd) -{ - gf_slow_multiply_region(rd, rd->s_top, rd->d_top, (char*)rd->src+rd->bytes); -} - -void gf_multby_zero(void *dest, int bytes, int xor) -{ - if (xor) return; - bzero(dest, bytes); - return; -} - -/* JSP - gf_multby_one tries to do this in the most efficient way - possible. If xor = 0, then simply call memcpy() since that - should be optimized by the system. Otherwise, try to do the xor - in the following order: - - If src and dest are aligned with respect to each other on 16-byte - boundaries and you have SSE instructions, then use aligned SSE - instructions. - - If they aren't but you still have SSE instructions, use unaligned - SSE instructions. - - If there are no SSE instructions, but they are aligned with - respect to each other on 8-byte boundaries, then do them with - uint64_t's. - - Otherwise, call gf_unaligned_xor(), which does the following: - align a destination pointer along an 8-byte boundary, and then - memcpy 32 bytes at a time from the src pointer to an array of - doubles. I'm not sure if that's the best -- probably needs - testing, but this seems like it could be a black hole. 
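   From the caller's side the whole cascade is hidden; a region xor is just
   (sketch):

       gf_multby_one(src, dest, bytes, 1);

   i.e. dest ^= src over the whole region, with the fastest available path
   chosen below.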
- */ - -static void gf_unaligned_xor(void *src, void *dest, int bytes); - -void gf_multby_one(void *src, void *dest, int bytes, int xor) -{ -#ifdef INTEL_SSE2 - __m128i ms, md; -#endif - unsigned long uls, uld; - uint8_t *s8, *d8; - uint64_t *s64, *d64, *dtop64; - gf_region_data rd; - - if (!xor) { - memcpy(dest, src, bytes); - return; - } - uls = (unsigned long) src; - uld = (unsigned long) dest; - -#ifdef INTEL_SSE2 - int abytes; - s8 = (uint8_t *) src; - d8 = (uint8_t *) dest; - if (uls % 16 == uld % 16) { - gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16); - while (s8 != rd.s_start) { - *d8 ^= *s8; - d8++; - s8++; - } - while (s8 < (uint8_t *) rd.s_top) { - ms = _mm_load_si128 ((__m128i *)(s8)); - md = _mm_load_si128 ((__m128i *)(d8)); - md = _mm_xor_si128(md, ms); - _mm_store_si128((__m128i *)(d8), md); - s8 += 16; - d8 += 16; - } - while (s8 != (uint8_t *) src + bytes) { - *d8 ^= *s8; - d8++; - s8++; - } - return; - } - - abytes = (bytes & 0xfffffff0); - - while (d8 < (uint8_t *) dest + abytes) { - ms = _mm_loadu_si128 ((__m128i *)(s8)); - md = _mm_loadu_si128 ((__m128i *)(d8)); - md = _mm_xor_si128(md, ms); - _mm_storeu_si128((__m128i *)(d8), md); - s8 += 16; - d8 += 16; - } - while (d8 != (uint8_t *) dest+bytes) { - *d8 ^= *s8; - d8++; - s8++; - } - return; -#endif - - if (uls % 8 != uld % 8) { - gf_unaligned_xor(src, dest, bytes); - return; - } - - gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 8); - s8 = (uint8_t *) src; - d8 = (uint8_t *) dest; - while (d8 != rd.d_start) { - *d8 ^= *s8; - d8++; - s8++; - } - dtop64 = (uint64_t *) rd.d_top; - - d64 = (uint64_t *) rd.d_start; - s64 = (uint64_t *) rd.s_start; - - while (d64 < dtop64) { - *d64 ^= *s64; - d64++; - s64++; - } - - s8 = (uint8_t *) rd.s_top; - d8 = (uint8_t *) rd.d_top; - - while (d8 != (uint8_t *) dest+bytes) { - *d8 ^= *s8; - d8++; - s8++; - } - return; -} - -#define UNALIGNED_BUFSIZE (8) - -static void gf_unaligned_xor(void *src, void *dest, int bytes) -{ - uint64_t scopy[UNALIGNED_BUFSIZE], *d64; - int i; - gf_region_data rd; - uint8_t *s8, *d8; - - /* JSP - call gf_set_region_data(), but use dest in both places. This is - because I only want to set up dest. If I used src, gf_set_region_data() - would fail because src and dest are not aligned to each other wrt - 8-byte pointers. I know this will actually align d_start to 16 bytes. - If I change gf_set_region_data() to split alignment & chunksize, then - I could do this correctly. */ - - gf_set_region_data(&rd, NULL, dest, dest, bytes, 1, 1, 8*UNALIGNED_BUFSIZE); - s8 = (uint8_t *) src; - d8 = (uint8_t *) dest; - - while (d8 < (uint8_t *) rd.d_start) { - *d8 ^= *s8; - d8++; - s8++; - } - - d64 = (uint64_t *) d8; - while (d64 < (uint64_t *) rd.d_top) { - memcpy(scopy, s8, 8*UNALIGNED_BUFSIZE); - s8 += 8*UNALIGNED_BUFSIZE; - for (i = 0; i < UNALIGNED_BUFSIZE; i++) { - *d64 ^= scopy[i]; - d64++; - } - } - - d8 = (uint8_t *) d64; - while (d8 < (uint8_t *) ((char*)dest+bytes)) { - *d8 ^= *s8; - d8++; - s8++; - } -} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_general.c b/src/erasure-code/jerasure/gf-complete/src/gf_general.c deleted file mode 100644 index c410598153513..0000000000000 --- a/src/erasure-code/jerasure/gf-complete/src/gf_general.c +++ /dev/null @@ -1,538 +0,0 @@ -/* - * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic - * James S. Plank, Ethan L. Miller, Kevin M. Greenan, - * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. 
- * - * gf_general.c - * - * This file has helper routines for doing basic GF operations with any - * legal value of w. The problem is that w <= 32, w=64 and w=128 all have - * different data types, which is a pain. The procedures in this file try - * to alleviate that pain. They are used in gf_unit and gf_time. - */ - -#include -#include -#include -#include -#include -#include - -#include "gf_complete.h" -#include "gf_int.h" -#include "gf_method.h" -#include "gf_rand.h" -#include "gf_general.h" - -void gf_general_set_zero(gf_general_t *v, int w) -{ - if (w <= 32) { - v->w32 = 0; - } else if (w <= 64) { - v->w64 = 0; - } else { - v->w128[0] = 0; - v->w128[1] = 0; - } -} - -void gf_general_set_one(gf_general_t *v, int w) -{ - if (w <= 32) { - v->w32 = 1; - } else if (w <= 64) { - v->w64 = 1; - } else { - v->w128[0] = 0; - v->w128[1] = 1; - } -} - -void gf_general_set_two(gf_general_t *v, int w) -{ - if (w <= 32) { - v->w32 = 2; - } else if (w <= 64) { - v->w64 = 2; - } else { - v->w128[0] = 0; - v->w128[1] = 2; - } -} - -int gf_general_is_zero(gf_general_t *v, int w) -{ - if (w <= 32) { - return (v->w32 == 0); - } else if (w <= 64) { - return (v->w64 == 0); - } else { - return (v->w128[0] == 0 && v->w128[1] == 0); - } -} - -int gf_general_is_one(gf_general_t *v, int w) -{ - if (w <= 32) { - return (v->w32 == 1); - } else if (w <= 64) { - return (v->w64 == 1); - } else { - return (v->w128[0] == 0 && v->w128[1] == 1); - } -} - -void gf_general_set_random(gf_general_t *v, int w, int zero_ok) -{ - if (w <= 32) { - v->w32 = MOA_Random_W(w, zero_ok); - } else if (w <= 64) { - while (1) { - v->w64 = MOA_Random_64(); - if (v->w64 != 0 || zero_ok) return; - } - } else { - while (1) { - MOA_Random_128(v->w128); - if (v->w128[0] != 0 || v->w128[1] != 0 || zero_ok) return; - } - } -} - -void gf_general_val_to_s(gf_general_t *v, int w, char *s, int hex) -{ - if (w <= 32) { - if (hex) { - sprintf(s, "%x", v->w32); - } else { - sprintf(s, "%u", v->w32); - } - } else if (w <= 64) { - if (hex) { - sprintf(s, "%llx", (long long unsigned int) v->w64); - } else { - sprintf(s, "%lld", (long long unsigned int) v->w64); - } - } else { - if (v->w128[0] == 0) { - sprintf(s, "%llx", (long long unsigned int) v->w128[1]); - } else { - sprintf(s, "%llx%016llx", (long long unsigned int) v->w128[0], - (long long unsigned int) v->w128[1]); - } - } -} - -int gf_general_s_to_val(gf_general_t *v, int w, char *s, int hex) -{ - int l; - int save; - - if (w <= 32) { - if (hex) { - if (sscanf(s, "%x", &(v->w32)) == 0) return 0; - } else { - if (sscanf(s, "%u", &(v->w32)) == 0) return 0; - } - if (w == 32) return 1; - if (w == 31) { - if (v->w32 & (1 << 31)) return 0; - return 1; - } - if (v->w32 & ~((1 << w)-1)) return 0; - return 1; - } else if (w <= 64) { - if (hex) return (sscanf(s, "%llx", (long long unsigned int *) (&(v->w64))) == 1); - return (sscanf(s, "%lld", (long long int *) (&(v->w64))) == 1); - } else { - if (!hex) return 0; - l = strlen(s); - if (l <= 16) { - v->w128[0] = 0; - return (sscanf(s, "%llx", (long long unsigned int *) (&(v->w128[1]))) == 1); - } else { - if (l > 32) return 0; - save = s[l-16]; - s[l-16] = '\0'; - if (sscanf(s, "%llx", (long long unsigned int *) (&(v->w128[0]))) == 0) { - s[l-16] = save; - return 0; - } - return (sscanf(s+(l-16), "%llx", (long long unsigned int *) (&(v->w128[1]))) == 1); - } - } -} - -void gf_general_add(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c) -{ - gf_internal_t *h; - int w; - - h = (gf_internal_t *) gf->scratch; - w = h->w; - - if (w <= 32) { - c->w32 
= a->w32 ^ b->w32; - } else if (w <= 64) { - c->w64 = a->w64 ^ b->w64; - } else { - c->w128[0] = a->w128[0] ^ b->w128[0]; - c->w128[1] = a->w128[1] ^ b->w128[1]; - } -} - -void gf_general_multiply(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c) -{ - gf_internal_t *h; - int w; - - h = (gf_internal_t *) gf->scratch; - w = h->w; - - if (w <= 32) { - c->w32 = gf->multiply.w32(gf, a->w32, b->w32); - } else if (w <= 64) { - c->w64 = gf->multiply.w64(gf, a->w64, b->w64); - } else { - gf->multiply.w128(gf, a->w128, b->w128, c->w128); - } -} - -void gf_general_divide(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c) -{ - gf_internal_t *h; - int w; - - h = (gf_internal_t *) gf->scratch; - w = h->w; - - if (w <= 32) { - c->w32 = gf->divide.w32(gf, a->w32, b->w32); - } else if (w <= 64) { - c->w64 = gf->divide.w64(gf, a->w64, b->w64); - } else { - gf->divide.w128(gf, a->w128, b->w128, c->w128); - } -} - -void gf_general_inverse(gf_t *gf, gf_general_t *a, gf_general_t *b) -{ - gf_internal_t *h; - int w; - - h = (gf_internal_t *) gf->scratch; - w = h->w; - - if (w <= 32) { - b->w32 = gf->inverse.w32(gf, a->w32); - } else if (w <= 64) { - b->w64 = gf->inverse.w64(gf, a->w64); - } else { - gf->inverse.w128(gf, a->w128, b->w128); - } -} - -int gf_general_are_equal(gf_general_t *v1, gf_general_t *v2, int w) -{ - if (w <= 32) { - return (v1->w32 == v2->w32); - } else if (w <= 64) { - return (v1->w64 == v2->w64); - } else { - return (v1->w128[0] == v2->w128[0] && - v1->w128[0] == v2->w128[0]); - } -} - -void gf_general_do_region_multiply(gf_t *gf, gf_general_t *a, void *ra, void *rb, int bytes, int xor) -{ - gf_internal_t *h; - int w; - - h = (gf_internal_t *) gf->scratch; - w = h->w; - - if (w <= 32) { - gf->multiply_region.w32(gf, ra, rb, a->w32, bytes, xor); - } else if (w <= 64) { - gf->multiply_region.w64(gf, ra, rb, a->w64, bytes, xor); - } else { - gf->multiply_region.w128(gf, ra, rb, a->w128, bytes, xor); - } -} - -void gf_general_do_region_check(gf_t *gf, gf_general_t *a, void *orig_a, void *orig_target, void *final_target, int bytes, int xor) -{ - gf_internal_t *h; - int w, words, i; - gf_general_t oa, ot, ft, sb; - char sa[50], soa[50], sot[50], sft[50], ssb[50]; - - h = (gf_internal_t *) gf->scratch; - w = h->w; - - words = (bytes * 8) / w; - for (i = 0; i < words; i++) { - if (w <= 32) { - oa.w32 = gf->extract_word.w32(gf, orig_a, bytes, i); - ot.w32 = gf->extract_word.w32(gf, orig_target, bytes, i); - ft.w32 = gf->extract_word.w32(gf, final_target, bytes, i); - sb.w32 = gf->multiply.w32(gf, a->w32, oa.w32); - if (xor) sb.w32 ^= ot.w32; - } else if (w <= 64) { - oa.w64 = gf->extract_word.w64(gf, orig_a, bytes, i); - ot.w64 = gf->extract_word.w64(gf, orig_target, bytes, i); - ft.w64 = gf->extract_word.w64(gf, final_target, bytes, i); - sb.w64 = gf->multiply.w64(gf, a->w64, oa.w64); - if (xor) sb.w64 ^= ot.w64; - } else { - gf->extract_word.w128(gf, orig_a, bytes, i, oa.w128); - gf->extract_word.w128(gf, orig_target, bytes, i, ot.w128); - gf->extract_word.w128(gf, final_target, bytes, i, ft.w128); - gf->multiply.w128(gf, a->w128, oa.w128, sb.w128); - if (xor) { - sb.w128[0] ^= ot.w128[0]; - sb.w128[1] ^= ot.w128[1]; - } - } - - if (!gf_general_are_equal(&ft, &sb, w)) { - - fprintf(stderr,"Problem with region multiply (all values in hex):\n"); - fprintf(stderr," Target address base: 0x%lx. Word 0x%x of 0x%x. 
Xor: %d\n", - (unsigned long) final_target, i, words, xor); - gf_general_val_to_s(a, w, sa, 1); - gf_general_val_to_s(&oa, w, soa, 1); - gf_general_val_to_s(&ot, w, sot, 1); - gf_general_val_to_s(&ft, w, sft, 1); - gf_general_val_to_s(&sb, w, ssb, 1); - fprintf(stderr," Value: %s\n", sa); - fprintf(stderr," Original source word: %s\n", soa); - if (xor) fprintf(stderr," XOR with target word: %s\n", sot); - fprintf(stderr," Product word: %s\n", sft); - fprintf(stderr," It should be: %s\n", ssb); - exit(0); - } - } -} - -void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size) -{ - void *top; - gf_general_t g; - uint8_t *r8, *r8a; - uint16_t *r16; - uint32_t *r32; - uint64_t *r64; - int i; - - top = (char*)rb+size; - - /* If w is 8, 16, 32, 64 or 128, fill the regions with random bytes. - However, don't allow for zeros in rb, because that will screw up - division. - - When w is 4, you fill the regions with random 4-bit words in each byte. - - Otherwise, treat every four bytes as an uint32_t - and fill it with a random value mod (1 << w). - */ - - if (w == 8 || w == 16 || w == 32 || w == 64 || w == 128) { - MOA_Fill_Random_Region (ra, size); - while (rb < top) { - gf_general_set_random(&g, w, 0); - switch (w) { - case 8: - r8 = (uint8_t *) rb; - *r8 = g.w32; - break; - case 16: - r16 = (uint16_t *) rb; - *r16 = g.w32; - break; - case 32: - r32 = (uint32_t *) rb; - *r32 = g.w32; - break; - case 64: - r64 = (uint64_t *) rb; - *r64 = g.w64; - break; - case 128: - r64 = (uint64_t *) rb; - r64[0] = g.w128[0]; - r64[1] = g.w128[1]; - break; - } - rb = (char*)rb + (w/8); - } - } else if (w == 4) { - r8a = (uint8_t *) ra; - r8 = (uint8_t *) rb; - while (r8 < (uint8_t *) top) { - gf_general_set_random(&g, w, 1); - *r8a = g.w32; - gf_general_set_random(&g, w, 0); - *r8 = g.w32; - r8a++; - r8++; - } - } else { - r32 = (uint32_t *) ra; - for (i = 0; i < size/4; i++) r32[i] = MOA_Random_W(w, 1); - r32 = (uint32_t *) rb; - for (i = 0; i < size/4; i++) r32[i] = MOA_Random_W(w, 0); - } -} - -/* This sucks, but in order to time, you really need to avoid putting ifs in - the inner loops. So, I'm doing a separate timing test for each w: - (4 & 8), 16, 32, 64, 128 and everything else. Fortunately, the "everything else" - tests can be equivalent to w=32. - - I'm also putting the results back into ra, because otherwise, the optimizer might - figure out that we're not really doing anything in the inner loops and it - will chuck that. 
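A note on units for the routine below: its return value is the number of words processed, which is the number of field operations performed. A hedged sketch of how a driver such as gf_time might convert that into a rate (ops_per_second_sketch is hypothetical; ra and rb are assumed to have been filled by gf_general_set_up_single_timing_test above):

#include <sys/time.h>
#include "gf_complete.h"
#include "gf_general.h"

/* Wall-clock one timing pass and report operations per second. */
static double ops_per_second_sketch(gf_t *gf, void *ra, void *rb,
                                    int size, char test)
{
  struct timeval t0, t1;
  double secs;
  int ops;

  gettimeofday(&t0, NULL);
  ops = gf_general_do_single_timing_test(gf, ra, rb, size, test);
  gettimeofday(&t1, NULL);
  secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_usec - t0.tv_usec) / 1.0e6;
  return ops / secs;
}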
*/ - -int gf_general_do_single_timing_test(gf_t *gf, void *ra, void *rb, int size, char test) -{ - gf_internal_t *h; - void *top; - uint8_t *r8a, *r8b, *top8; - uint16_t *r16a, *r16b, *top16; - uint32_t *r32a, *r32b, *top32; - uint64_t *r64a, *r64b, *top64, *r64c; - int w, rv; - - h = (gf_internal_t *) gf->scratch; - w = h->w; - top = (char*)ra + size; - - if (w == 8 || w == 4) { - r8a = (uint8_t *) ra; - r8b = (uint8_t *) rb; - top8 = (uint8_t *) top; - if (test == 'M') { - while (r8a < top8) { - *r8a = gf->multiply.w32(gf, *r8a, *r8b); - r8a++; - r8b++; - } - } else if (test == 'D') { - while (r8a < top8) { - *r8a = gf->divide.w32(gf, *r8a, *r8b); - r8a++; - r8b++; - } - } else if (test == 'I') { - while (r8a < top8) { - *r8a = gf->inverse.w32(gf, *r8a); - r8a++; - } - } - return (top8 - (uint8_t *) ra); - } - - if (w == 16) { - r16a = (uint16_t *) ra; - r16b = (uint16_t *) rb; - top16 = (uint16_t *) top; - if (test == 'M') { - while (r16a < top16) { - *r16a = gf->multiply.w32(gf, *r16a, *r16b); - r16a++; - r16b++; - } - } else if (test == 'D') { - while (r16a < top16) { - *r16a = gf->divide.w32(gf, *r16a, *r16b); - r16a++; - r16b++; - } - } else if (test == 'I') { - while (r16a < top16) { - *r16a = gf->inverse.w32(gf, *r16a); - r16a++; - } - } - return (top16 - (uint16_t *) ra); - } - if (w <= 32) { - r32a = (uint32_t *) ra; - r32b = (uint32_t *) rb; - top32 = (uint32_t *) ra + (size/4); /* This is for the "everything elses" */ - - if (test == 'M') { - while (r32a < top32) { - *r32a = gf->multiply.w32(gf, *r32a, *r32b); - r32a++; - r32b++; - } - } else if (test == 'D') { - while (r32a < top32) { - *r32a = gf->divide.w32(gf, *r32a, *r32b); - r32a++; - r32b++; - } - } else if (test == 'I') { - while (r32a < top32) { - *r32a = gf->inverse.w32(gf, *r32a); - r32a++; - } - } - return (top32 - (uint32_t *) ra); - } - if (w == 64) { - r64a = (uint64_t *) ra; - r64b = (uint64_t *) rb; - top64 = (uint64_t *) top; - if (test == 'M') { - while (r64a < top64) { - *r64a = gf->multiply.w64(gf, *r64a, *r64b); - r64a++; - r64b++; - } - } else if (test == 'D') { - while (r64a < top64) { - *r64a = gf->divide.w64(gf, *r64a, *r64b); - r64a++; - r64b++; - } - } else if (test == 'I') { - while (r64a < top64) { - *r64a = gf->inverse.w64(gf, *r64a); - r64a++; - } - } - return (top64 - (uint64_t *) ra); - } - if (w == 128) { - r64a = (uint64_t *) ra; - r64c = r64a; - r64a += 2; - r64b = (uint64_t *) rb; - top64 = (uint64_t *) top; - rv = (top64 - r64a)/2; - if (test == 'M') { - while (r64a < top64) { - gf->multiply.w128(gf, r64a, r64b, r64c); - r64a += 2; - r64b += 2; - } - } else if (test == 'D') { - while (r64a < top64) { - gf->divide.w128(gf, r64a, r64b, r64c); - r64a += 2; - r64b += 2; - } - } else if (test == 'I') { - while (r64a < top64) { - gf->inverse.w128(gf, r64a, r64c); - r64a += 2; - } - } - return rv; - } - return 0; -} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_method.c b/src/erasure-code/jerasure/gf-complete/src/gf_method.c deleted file mode 100644 index a7bcacff9767c..0000000000000 --- a/src/erasure-code/jerasure/gf-complete/src/gf_method.c +++ /dev/null @@ -1,185 +0,0 @@ -/* - * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic - * James S. Plank, Ethan L. Miller, Kevin M. Greenan, - * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. - * - * gf_method.c - * - * Parses argv to figure out the mult_type and arguments. Returns the gf. 
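A usage sketch may help here (the main() below is hypothetical, not part of the library): the parser consumes flags until it reaches a lone "-", then calls gf_init_hard() and returns the argv index one past the "-", or 0 on failure with _gf_errno set.

#include <stdio.h>
#include "gf_complete.h"
#include "gf_int.h"      /* _gf_errno */
#include "gf_method.h"

int main(void)
{
  gf_t gf;
  char *argv[] = { "-m", "SPLIT", "16", "4", "-" };

  /* GF(2^16) with 4-bit split tables; 0 means the spec was rejected. */
  if (create_gf_from_argv(&gf, 16, 5, argv, 0) == 0) {
    fprintf(stderr, "bad method spec: _gf_errno = %d\n", _gf_errno);
    return 1;
  }
  printf("3 * 7 = %u in GF(2^16)\n", gf.multiply.w32(&gf, 3, 7));
  gf_free(&gf, 0);
  return 0;
}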
- */ - -#include -#include -#include -#include -#include - -#include "gf_complete.h" -#include "gf_int.h" -#include "gf_method.h" - -int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting) -{ - int mult_type, divide_type, region_type; - int arg1, arg2; - uint64_t prim_poly; - gf_t *base; - - mult_type = GF_MULT_DEFAULT; - region_type = GF_REGION_DEFAULT; - divide_type = GF_DIVIDE_DEFAULT; - prim_poly = 0; - base = NULL; - arg1 = 0; - arg2 = 0; - while (1) { - if (argc > starting) { - if (strcmp(argv[starting], "-m") == 0) { - starting++; - if (mult_type != GF_MULT_DEFAULT) { - if (base != NULL) gf_free(base, 1); - _gf_errno = GF_E_TWOMULT; - return 0; - } - if (strcmp(argv[starting], "SHIFT") == 0) { - mult_type = GF_MULT_SHIFT; - starting++; - } else if (strcmp(argv[starting], "CARRY_FREE") == 0) { - mult_type = GF_MULT_CARRY_FREE; - starting++; - } else if (strcmp(argv[starting], "GROUP") == 0) { - mult_type = GF_MULT_GROUP; - if (argc < starting + 3) { - _gf_errno = GF_E_GROUPAR; - return 0; - } - if (sscanf(argv[starting+1], "%d", &arg1) == 0 || - sscanf(argv[starting+2], "%d", &arg2) == 0) { - _gf_errno = GF_E_GROUPNU; - return 0; - } - starting += 3; - } else if (strcmp(argv[starting], "BYTWO_p") == 0) { - mult_type = GF_MULT_BYTWO_p; - starting++; - } else if (strcmp(argv[starting], "BYTWO_b") == 0) { - mult_type = GF_MULT_BYTWO_b; - starting++; - } else if (strcmp(argv[starting], "TABLE") == 0) { - mult_type = GF_MULT_TABLE; - starting++; - } else if (strcmp(argv[starting], "LOG") == 0) { - mult_type = GF_MULT_LOG_TABLE; - starting++; - } else if (strcmp(argv[starting], "LOG_ZERO") == 0) { - mult_type = GF_MULT_LOG_ZERO; - starting++; - } else if (strcmp(argv[starting], "LOG_ZERO_EXT") == 0) { - mult_type = GF_MULT_LOG_ZERO_EXT; - starting++; - } else if (strcmp(argv[starting], "SPLIT") == 0) { - mult_type = GF_MULT_SPLIT_TABLE; - if (argc < starting + 3) { - _gf_errno = GF_E_SPLITAR; - return 0; - } - if (sscanf(argv[starting+1], "%d", &arg1) == 0 || - sscanf(argv[starting+2], "%d", &arg2) == 0) { - _gf_errno = GF_E_SPLITNU; - return 0; - } - starting += 3; - } else if (strcmp(argv[starting], "COMPOSITE") == 0) { - mult_type = GF_MULT_COMPOSITE; - if (argc < starting + 2) { _gf_errno = GF_E_FEWARGS; return 0; } - if (sscanf(argv[starting+1], "%d", &arg1) == 0) { - _gf_errno = GF_E_COMP_A2; - return 0; - } - starting += 2; - base = (gf_t *) malloc(sizeof(gf_t)); - starting = create_gf_from_argv(base, w/arg1, argc, argv, starting); - if (starting == 0) { - free(base); - return 0; - } - } else { - if (base != NULL) gf_free(base, 1); - _gf_errno = GF_E_UNKNOWN; - return 0; - } - } else if (strcmp(argv[starting], "-r") == 0) { - starting++; - if (strcmp(argv[starting], "DOUBLE") == 0) { - region_type |= GF_REGION_DOUBLE_TABLE; - starting++; - } else if (strcmp(argv[starting], "QUAD") == 0) { - region_type |= GF_REGION_QUAD_TABLE; - starting++; - } else if (strcmp(argv[starting], "LAZY") == 0) { - region_type |= GF_REGION_LAZY; - starting++; - } else if (strcmp(argv[starting], "SSE") == 0) { - region_type |= GF_REGION_SSE; - starting++; - } else if (strcmp(argv[starting], "NOSSE") == 0) { - region_type |= GF_REGION_NOSSE; - starting++; - } else if (strcmp(argv[starting], "CAUCHY") == 0) { - region_type |= GF_REGION_CAUCHY; - starting++; - } else if (strcmp(argv[starting], "ALTMAP") == 0) { - region_type |= GF_REGION_ALTMAP; - starting++; - } else { - if (base != NULL) gf_free(base, 1); - _gf_errno = GF_E_UNK_REG; - return 0; - } - } else if (strcmp(argv[starting], 
"-p") == 0) { - starting++; - if (sscanf(argv[starting], "%llx", (long long unsigned int *)(&prim_poly)) == 0) { - if (base != NULL) gf_free(base, 1); - _gf_errno = GF_E_POLYSPC; - return 0; - } - starting++; - } else if (strcmp(argv[starting], "-d") == 0) { - starting++; - if (divide_type != GF_DIVIDE_DEFAULT) { - if (base != NULL) gf_free(base, 1); - _gf_errno = GF_E_TWO_DIV; - return 0; - } else if (strcmp(argv[starting], "EUCLID") == 0) { - divide_type = GF_DIVIDE_EUCLID; - starting++; - } else if (strcmp(argv[starting], "MATRIX") == 0) { - divide_type = GF_DIVIDE_MATRIX; - starting++; - } else { - _gf_errno = GF_E_UNK_DIV; - return 0; - } - } else if (strcmp(argv[starting], "-") == 0) { - /* - printf("Scratch size: %d\n", gf_scratch_size(w, - mult_type, region_type, divide_type, arg1, arg2)); - */ - if (gf_init_hard(gf, w, mult_type, region_type, divide_type, - prim_poly, arg1, arg2, base, NULL) == 0) { - if (base != NULL) gf_free(base, 1); - return 0; - } else - return starting + 1; - } else { - if (base != NULL) gf_free(base, 1); - _gf_errno = GF_E_UNKFLAG; - return 0; - } - } else { - if (base != NULL) gf_free(base, 1); - _gf_errno = GF_E_FEWARGS; - return 0; - } - } -} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_rand.c b/src/erasure-code/jerasure/gf-complete/src/gf_rand.c deleted file mode 100644 index a9aa7ad3605c8..0000000000000 --- a/src/erasure-code/jerasure/gf-complete/src/gf_rand.c +++ /dev/null @@ -1,80 +0,0 @@ -/* - * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic - * James S. Plank, Ethan L. Miller, Kevin M. Greenan, - * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. - * - * gf_rand.c -- Random number generator. - */ - -#include -#include -#include -#include "gf_rand.h" - -/* Lifted the "Mother of All" random number generator from http://www.agner.org/random/ */ - -static uint32_t MOA_X[5]; - -uint32_t MOA_Random_32() { - uint64_t sum; - sum = (uint64_t)2111111111UL * (uint64_t)MOA_X[3] + - (uint64_t)1492 * (uint64_t)(MOA_X[2]) + - (uint64_t)1776 * (uint64_t)(MOA_X[1]) + - (uint64_t)5115 * (uint64_t)(MOA_X[0]) + - (uint64_t)MOA_X[4]; - MOA_X[3] = MOA_X[2]; MOA_X[2] = MOA_X[1]; MOA_X[1] = MOA_X[0]; - MOA_X[4] = (uint32_t)(sum >> 32); - MOA_X[0] = (uint32_t)sum; - return MOA_X[0]; -} - -uint64_t MOA_Random_64() { - uint64_t sum; - - sum = MOA_Random_32(); - sum <<= 32; - sum |= MOA_Random_32(); - return sum; -} - -void MOA_Random_128(uint64_t *x) { - x[0] = MOA_Random_64(); - x[1] = MOA_Random_64(); - return; -} - -uint32_t MOA_Random_W(int w, int zero_ok) -{ - uint32_t b; - - do { - b = MOA_Random_32(); - if (w == 31) b &= 0x7fffffff; - if (w < 31) b %= (1 << w); - } while (!zero_ok && b == 0); - return b; -} - -void MOA_Seed(uint32_t seed) { - int i; - uint32_t s = seed; - for (i = 0; i < 5; i++) { - s = s * 29943829 - 1; - MOA_X[i] = s; - } - for (i=0; i<19; i++) MOA_Random_32(); -} - - -void MOA_Fill_Random_Region (void *reg, int size) -{ - uint32_t *r32; - uint8_t *r8; - int i; - - r32 = (uint32_t *) reg; - r8 = (uint8_t *) reg; - for (i = 0; i < size/4; i++) r32[i] = MOA_Random_32(); - for (i *= 4; i < size; i++) r8[i] = MOA_Random_W(8, 1); -} - diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w128.c b/src/erasure-code/jerasure/gf-complete/src/gf_w128.c deleted file mode 100644 index d4336ae5ee6ef..0000000000000 --- a/src/erasure-code/jerasure/gf-complete/src/gf_w128.c +++ /dev/null @@ -1,1769 +0,0 @@ -/* - * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic - * 
James S. Plank, Ethan L. Miller, Kevin M. Greenan, - * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. - * - * gf_w128.c - * - * Routines for 128-bit Galois fields - */ - -#include "gf_int.h" -#include -#include - -#define GF_FIELD_WIDTH (128) - -#define two_x(a) {\ - a[0] <<= 1; \ - if (a[1] & 1ULL << 63) a[0] ^= 1; \ - a[1] <<= 1; } - -#define a_get_b(a, i, b, j) {\ - a[i] = b[j]; \ - a[i + 1] = b[j + 1];} - -#define set_zero(a, i) {\ - a[i] = 0; \ - a[i + 1] = 0;} - -struct gf_w128_split_4_128_data { - uint64_t last_value[2]; - uint64_t tables[2][32][16]; -}; - -struct gf_w128_split_8_128_data { - uint64_t last_value[2]; - uint64_t tables[2][16][256]; -}; - -typedef struct gf_group_tables_s { - gf_val_128_t m_table; - gf_val_128_t r_table; -} gf_group_tables_t; - -#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? " " : " ", blah[15-ii]); printf("\n"); } - -static -void -gf_w128_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, -int xor) -{ - int i; - gf_val_128_t s128; - gf_val_128_t d128; - uint64_t c128[2]; - gf_region_data rd; - - /* We only do this to check on alignment. */ - gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); - - if (val[0] == 0) { - if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } - } - - set_zero(c128, 0); - - s128 = (gf_val_128_t) src; - d128 = (gf_val_128_t) dest; - - if (xor) { - for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) { - gf->multiply.w128(gf, &s128[i], val, c128); - d128[i] ^= c128[0]; - d128[i+1] ^= c128[1]; - } - } else { - for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) { - gf->multiply.w128(gf, &s128[i], val, &d128[i]); - } - } -} - -static -void -gf_w128_clm_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, -int xor) -{ - gf_internal_t * h = gf->scratch; - if ((h->sse & GF_SSE4_PCLMUL) == 0) - return; -#if defined(INTEL_SSE4_PCLMUL) - int i; - gf_val_128_t s128; - gf_val_128_t d128; - gf_region_data rd; - __m128i a,b; - __m128i result0,result1; - __m128i prim_poly; - __m128i c,d,e,f; - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly); - /* We only do this to check on alignment. 
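The two_x() macro above is the workhorse of every table-building loop in this file; paired with the caller-side XOR of prim_poly it implements doubling modulo x^128 + prim_poly. A standalone restatement (double_mod_sketch is hypothetical):

#include <stdint.h>

/* a is big-endian: a[0] holds the high 64 bits, a[1] the low 64.
   Shift left one bit, carrying a[1]'s top bit into a[0]; if a bit
   fell off the top, reduce by folding the primitive polynomial in. */
static void double_mod_sketch(uint64_t a[2], uint64_t prim_poly)
{
  int overflow = (int) (a[0] >> 63);
  a[0] = (a[0] << 1) | (a[1] >> 63);
  a[1] <<= 1;
  if (overflow) a[1] ^= prim_poly;
}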
*/ - gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); - - if (val[0] == 0) { - if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } - } - - s128 = (gf_val_128_t) src; - d128 = (gf_val_128_t) dest; - - if (xor) { - for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) { - a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0); - b = _mm_insert_epi64 (a, val[1], 0); - a = _mm_insert_epi64 (a, s128[i], 1); - b = _mm_insert_epi64 (b, val[0], 1); - - c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/ - f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/ - e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/ - d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/ - - /* now reusing a and b as temporary variables*/ - result0 = _mm_setzero_si128(); - result1 = result0; - - result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0)); - a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1)); - result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a)); - - a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0)); - result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a)); - result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1)); - /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce. */ - - a = _mm_srli_si128 (result0, 8); - b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); - result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8)); - result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8)); - - a = _mm_insert_epi64 (result0, 0, 1); - b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); - result1 = _mm_xor_si128 (result1, b); - d128[i] ^= (uint64_t)_mm_extract_epi64(result1,1); - d128[i+1] ^= (uint64_t)_mm_extract_epi64(result1,0); - } - } else { - for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) { - a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0); - b = _mm_insert_epi64 (a, val[1], 0); - a = _mm_insert_epi64 (a, s128[i], 1); - b = _mm_insert_epi64 (b, val[0], 1); - - c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/ - f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/ - e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/ - d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/ - - /* now reusing a and b as temporary variables*/ - result0 = _mm_setzero_si128(); - result1 = result0; - - result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0)); - a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1)); - result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a)); - - a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0)); - result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a)); - result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1)); - /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/ - - a = _mm_srli_si128 (result0, 8); - b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); - result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8)); - result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8)); - - a = _mm_insert_epi64 (result0, 0, 1); - b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); - result1 = _mm_xor_si128 (result1, b); - d128[i] = (uint64_t)_mm_extract_epi64(result1,1); - d128[i+1] = (uint64_t)_mm_extract_epi64(result1,0); - } - } -#endif -} - -/* - * Some w128 notes: - * --Big Endian - * --return values allocated beforehand - */ - -#define GF_W128_IS_ZERO(val) (val[0] == 0 && 
val[1] == 0) - -void -gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) -{ - /* ordered highest bit to lowest l[0] l[1] r[0] r[1] */ - uint64_t pl[2], pr[2], ppl[2], ppr[2], i, a[2], bl[2], br[2], one, lbit; - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - - if (GF_W128_IS_ZERO(a128) || GF_W128_IS_ZERO(b128)) { - set_zero(c128, 0); - return; - } - - a_get_b(a, 0, a128, 0); - a_get_b(br, 0, b128, 0); - set_zero(bl, 0); - - one = 1; - lbit = (one << 63); - - set_zero(pl, 0); - set_zero(pr, 0); - - /* Allen: a*b for right half of a */ - for (i = 0; i < GF_FIELD_WIDTH/2; i++) { - if (a[1] & (one << i)) { - pl[1] ^= bl[1]; - pr[0] ^= br[0]; - pr[1] ^= br[1]; - } - bl[1] <<= 1; - if (br[0] & lbit) bl[1] ^= 1; - br[0] <<= 1; - if (br[1] & lbit) br[0] ^= 1; - br[1] <<= 1; - } - - /* Allen: a*b for left half of a */ - for (i = 0; i < GF_FIELD_WIDTH/2; i++) { - if (a[0] & (one << i)) { - pl[0] ^= bl[0]; - pl[1] ^= bl[1]; - pr[0] ^= br[0]; - } - bl[0] <<= 1; - if (bl[1] & lbit) bl[0] ^= 1; - bl[1] <<= 1; - if (br[0] & lbit) bl[1] ^= 1; - br[0] <<= 1; - } - - /* Allen: do first half of reduction (based on left quarter of initial product) */ - one = lbit >> 1; - ppl[0] = one; /* Allen: introduce leading one of primitive polynomial */ - ppl[1] = h->prim_poly >> 2; - ppr[0] = h->prim_poly << (GF_FIELD_WIDTH/2-2); - ppr[1] = 0; - while (one != 0) { - if (pl[0] & one) { - pl[0] ^= ppl[0]; - pl[1] ^= ppl[1]; - pr[0] ^= ppr[0]; - pr[1] ^= ppr[1]; - } - one >>= 1; - ppr[1] >>= 1; - if (ppr[0] & 1) ppr[1] ^= lbit; - ppr[0] >>= 1; - if (ppl[1] & 1) ppr[0] ^= lbit; - ppl[1] >>= 1; - if (ppl[0] & 1) ppl[1] ^= lbit; - ppl[0] >>= 1; - } - - /* Allen: final half of reduction */ - one = lbit; - while (one != 0) { - if (pl[1] & one) { - pl[1] ^= ppl[1]; - pr[0] ^= ppr[0]; - pr[1] ^= ppr[1]; - } - one >>= 1; - ppr[1] >>= 1; - if (ppr[0] & 1) ppr[1] ^= lbit; - ppr[0] >>= 1; - if (ppl[1] & 1) ppr[0] ^= lbit; - ppl[1] >>= 1; - } - - /* Allen: if we really want to optimize this we can just be using c128 instead of pr all along */ - c128[0] = pr[0]; - c128[1] = pr[1]; - - return; -} - -void -gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) -{ - gf_internal_t * h = gf->scratch; - if ((h->sse & GF_SSE4_PCLMUL) == 0) - return; -#if defined(INTEL_SSE4_PCLMUL) - - __m128i a,b; - __m128i result0,result1; - __m128i prim_poly; - __m128i c,d,e,f; - - a = _mm_insert_epi64 (_mm_setzero_si128(), a128[1], 0); - b = _mm_insert_epi64 (a, b128[1], 0); - a = _mm_insert_epi64 (a, a128[0], 1); - b = _mm_insert_epi64 (b, b128[0], 1); - - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly); - - /* we need to test algorithm 2 later*/ - c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/ - f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/ - e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/ - d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/ - - /* now reusing a and b as temporary variables*/ - result0 = _mm_setzero_si128(); - result1 = result0; - - result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0)); - a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1)); - result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a)); - - a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0)); - result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a)); - result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1)); - /* now we have constructed our 'result' with result0 being the carry 
bits, and we have to reduce.*/ - - a = _mm_srli_si128 (result0, 8); - b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); - result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8)); - result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8)); - - a = _mm_insert_epi64 (result0, 0, 1); - b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); - result1 = _mm_xor_si128 (result1, b); - - c128[0] = (uint64_t)_mm_extract_epi64(result1,1); - c128[1] = (uint64_t)_mm_extract_epi64(result1,0); -#endif -return; -} - -void -gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) -{ - uint64_t amask[2], pmask, pp, prod[2]; /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/ - uint64_t topbit; /* this is used as a boolean value */ - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - prod[0] = 0; - prod[1] = 0; - pmask = 0x8000000000000000ULL; - amask[0] = 0x8000000000000000ULL; - amask[1] = 0; - - while (amask[1] != 0 || amask[0] != 0) { - topbit = (prod[0] & pmask); - prod[0] <<= 1; - if (prod[1] & pmask) prod[0] ^= 1; - prod[1] <<= 1; - if (topbit) prod[1] ^= pp; - if ((a128[0] & amask[0]) || (a128[1] & amask[1])) { - prod[0] ^= b128[0]; - prod[1] ^= b128[1]; - } - amask[1] >>= 1; - if (amask[0] & 1) amask[1] ^= pmask; - amask[0] >>= 1; - } - c128[0] = prod [0]; - c128[1] = prod [1]; - return; -} - -void -gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) -{ - gf_internal_t * h = gf->scratch; - if ((h->sse & GF_SSE4) == 0) - return; -#if defined(INTEL_SSE4) - int i; - __m128i a, b, pp, prod, amask, u_middle_one; - /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/ - uint32_t topbit, middlebit, pmask; /* this is used as a boolean value */ - - - h = (gf_internal_t *) gf->scratch; - pp = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly); - prod = _mm_setzero_si128(); - a = _mm_insert_epi64(prod, a128[1], 0x0); - a = _mm_insert_epi64(a, a128[0], 0x1); - b = _mm_insert_epi64(prod, b128[1], 0x0); - b = _mm_insert_epi64(b, b128[0], 0x1); - pmask = 0x80000000; - amask = _mm_insert_epi32(prod, 0x80000000, 0x3); - u_middle_one = _mm_insert_epi32(prod, 1, 0x2); - - for (i = 0; i < 64; i++) { - topbit = (_mm_extract_epi32(prod, 0x3) & pmask); - middlebit = (_mm_extract_epi32(prod, 0x1) & pmask); - prod = _mm_slli_epi64(prod, 1); /* this instruction loses the middle bit */ - if (middlebit) { - prod = _mm_xor_si128(prod, u_middle_one); - } - if (topbit) { - prod = _mm_xor_si128(prod, pp); - } - if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 1))) { - prod = _mm_xor_si128(prod, b); - } - amask = _mm_srli_epi64(amask, 1); /*so does this one, but we can just replace after loop*/ - } - amask = _mm_insert_epi32(amask, 1 << 31, 0x1); - for (i = 64; i < 128; i++) { - topbit = (_mm_extract_epi32(prod, 0x3) & pmask); - middlebit = (_mm_extract_epi32(prod, 0x1) & pmask); - prod = _mm_slli_epi64(prod, 1); - if (middlebit) prod = _mm_xor_si128(prod, u_middle_one); - if (topbit) prod = _mm_xor_si128(prod, pp); - if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 0))) { - prod = _mm_xor_si128(prod, b); - } - amask = _mm_srli_epi64(amask, 1); - } - c128[0] = (uint64_t)_mm_extract_epi64(prod, 1); - c128[1] = (uint64_t)_mm_extract_epi64(prod, 0); -#endif - return; -} - - -/* Ben: This slow function implements sse instrutions for bytwo_b because why not */ -void -gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, 
gf_val_128_t b128, gf_val_128_t c128) -{ - gf_internal_t * h = gf->scratch; - if ((h->sse & GF_SSE4) == 0) - return; -#if defined(INTEL_SSE4) - __m128i a, b, lmask, hmask, pp, c, middle_one; - uint64_t topbit, middlebit; - - - c = _mm_setzero_si128(); - lmask = _mm_insert_epi64(c, 1ULL << 63, 0); - hmask = _mm_insert_epi64(c, 1ULL << 63, 1); - b = _mm_insert_epi64(c, a128[0], 1); - b = _mm_insert_epi64(b, a128[1], 0); - a = _mm_insert_epi64(c, b128[0], 1); - a = _mm_insert_epi64(a, b128[1], 0); - pp = _mm_insert_epi64(c, h->prim_poly, 0); - middle_one = _mm_insert_epi64(c, 1, 0x1); - - while (1) { - if (_mm_extract_epi32(a, 0x0) & 1) { - c = _mm_xor_si128(c, b); - } - middlebit = (_mm_extract_epi32(a, 0x2) & 1); - a = _mm_srli_epi64(a, 1); - if (middlebit) a = _mm_xor_si128(a, lmask); - if ((_mm_extract_epi64(a, 0x1) == 0ULL) && (_mm_extract_epi64(a, 0x0) == 0ULL)){ - c128[0] = _mm_extract_epi64(c, 0x1); - c128[1] = _mm_extract_epi64(c, 0x0); - return; - } - topbit = (_mm_extract_epi64(_mm_and_si128(b, hmask), 1)); - middlebit = (_mm_extract_epi64(_mm_and_si128(b, lmask), 0)); - b = _mm_slli_epi64(b, 1); - if (middlebit) b = _mm_xor_si128(b, middle_one); - if (topbit) b = _mm_xor_si128(b, pp); - } -#endif -} - -void -gf_w128_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) -{ - uint64_t bmask, pp; - gf_internal_t *h; - uint64_t a[2], b[2], c[2]; - - h = (gf_internal_t *) gf->scratch; - - bmask = (1ULL << 63); - set_zero(c, 0); - b[0] = a128[0]; - b[1] = a128[1]; - a[0] = b128[0]; - a[1] = b128[1]; - - while (1) { - if (a[1] & 1) { - c[0] ^= b[0]; - c[1] ^= b[1]; - } - a[1] >>= 1; - if (a[0] & 1) a[1] ^= bmask; - a[0] >>= 1; - if (a[1] == 0 && a[0] == 0) { - c128[0] = c[0]; - c128[1] = c[1]; - return; - } - pp = (b[0] & bmask); - b[0] <<= 1; - if (b[1] & bmask) b[0] ^= 1; - b[1] <<= 1; - if (pp) b[1] ^= h->prim_poly; - } -} - -static -void -gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) -{ - int i, j, k; - uint64_t pp; - gf_internal_t *h; - uint64_t *s64, *d64, *top; - gf_region_data rd; - uint64_t v[2], s; - struct gf_w128_split_4_128_data *ld; - - /* We only do this to check on alignment. */ - gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); - - if (val[0] == 0) { - if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } - } - - h = (gf_internal_t *) gf->scratch; - ld = (struct gf_w128_split_4_128_data *) h->private; - - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - top = (uint64_t *) rd.d_top; - - if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) { - v[0] = val[0]; - v[1] = val[1]; - for (i = 0; i < 32; i++) { - ld->tables[0][i][0] = 0; - ld->tables[1][i][0] = 0; - for (j = 1; j < 16; j <<= 1) { - for (k = 0; k < j; k++) { - ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]); - ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]); - } - pp = (v[0] & (1ULL << 63)); - v[0] <<= 1; - if (v[1] & (1ULL << 63)) v[0] ^= 1; - v[1] <<= 1; - if (pp) v[1] ^= h->prim_poly; - } - } - } - ld->last_value[0] = val[0]; - ld->last_value[1] = val[1]; - -/* - for (i = 0; i < 32; i++) { - for (j = 0; j < 16; j++) { - printf("%2d %2d %016llx %016llx\n", i, j, ld->tables[0][i][j], ld->tables[1][i][j]); - } - printf("\n"); - } - */ - i = 0; - while (d64 < top) { - v[0] = (xor) ? d64[0] : 0; - v[1] = (xor) ? 
d64[1] : 0; - s = s64[1]; - i = 0; - while (s != 0) { - v[0] ^= ld->tables[0][i][s&0xf]; - v[1] ^= ld->tables[1][i][s&0xf]; - s >>= 4; - i++; - } - s = s64[0]; - i = 16; - while (s != 0) { - v[0] ^= ld->tables[0][i][s&0xf]; - v[1] ^= ld->tables[1][i][s&0xf]; - s >>= 4; - i++; - } - d64[0] = v[0]; - d64[1] = v[1]; - s64 += 2; - d64 += 2; - } -} - -static -void -gf_w128_split_4_128_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) -{ - gf_internal_t * h = gf->scratch; - if ((h->sse & GF_SSSE3) == 0) - return; - -#ifdef INTEL_SSSE3 - int i, j, k; - uint64_t pp, v[2], s, *s64, *d64, *top; - __m128i p, tables[32][16]; - struct gf_w128_split_4_128_data *ld; - gf_region_data rd; - - if (val[0] == 0) { - if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } - } - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - /* We only do this to check on alignment. */ - gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 16); - - /* Doing this instead of gf_do_initial_region_alignment() because that doesn't hold 128-bit vals */ - - gf_w128_multiply_region_from_single(gf, src, dest, val, ((char*)rd.s_start-(char*)src), xor); - - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - top = (uint64_t *) rd.d_top; - - ld = (struct gf_w128_split_4_128_data *) h->private; - - if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) { - v[0] = val[0]; - v[1] = val[1]; - for (i = 0; i < 32; i++) { - ld->tables[0][i][0] = 0; - ld->tables[1][i][0] = 0; - for (j = 1; j < 16; j <<= 1) { - for (k = 0; k < j; k++) { - ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]); - ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]); - } - pp = (v[0] & (1ULL << 63)); - v[0] <<= 1; - if (v[1] & (1ULL << 63)) v[0] ^= 1; - v[1] <<= 1; - if (pp) v[1] ^= h->prim_poly; - } - } - } - - ld->last_value[0] = val[0]; - ld->last_value[1] = val[1]; - - for (i = 0; i < 32; i++) { - for (j = 0; j < 16; j++) { - v[0] = ld->tables[0][i][j]; - v[1] = ld->tables[1][i][j]; - tables[i][j] = _mm_loadu_si128((__m128i *) v); - -/* - printf("%2d %2d: ", i, j); - MM_PRINT8("", tables[i][j]); */ - } - } - - while (d64 != top) { - - if (xor) { - p = _mm_load_si128 ((__m128i *) d64); - } else { - p = _mm_setzero_si128(); - } - s = *s64; - s64++; - for (i = 0; i < 16; i++) { - j = (s&0xf); - s >>= 4; - p = _mm_xor_si128(p, tables[16+i][j]); - } - s = *s64; - s64++; - for (i = 0; i < 16; i++) { - j = (s&0xf); - s >>= 4; - p = _mm_xor_si128(p, tables[i][j]); - } - _mm_store_si128((__m128i *) d64, p); - d64 += 2; - } - - /* Doing this instead of gf_do_final_region_alignment() because that doesn't hold 128-bit vals */ - - gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((char*)src+bytes)-(char*)rd.s_top, xor); -#endif -} - -static -void -gf_w128_split_4_128_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) -{ - gf_internal_t * h = gf->scratch; - if ((h->sse & GF_SSSE3) == 0) - return; - -#ifdef INTEL_SSSE3 - int i, j, k; - uint64_t pp, v[2], *s64, *d64, *top; - __m128i si, tables[32][16], p[16], v0, mask1; - struct gf_w128_split_4_128_data *ld; - uint8_t btable[16]; - gf_region_data rd; - - if (val[0] == 0) { - if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } - } - - pp = h->prim_poly; - - /* We only do this to check on alignment. 
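All of the split routines build their nibble tables the same lazy way: four doublings of val, with XOR-linearity filling in the remaining entries. A compact restatement for a single nibble position (build_nibble_table_sketch is hypothetical; v is consumed):

#include <stdint.h>

/* tbl[0]/tbl[1] hold the high/low halves of v*0 .. v*15.  Only four
   doublings are needed: for j a power of two and k < j,
   (k^j)*v = k*v ^ j*v, since k and j share no bits. */
static void build_nibble_table_sketch(uint64_t v[2], uint64_t tbl[2][16],
                                      uint64_t prim_poly)
{
  tbl[0][0] = 0;
  tbl[1][0] = 0;
  for (int j = 1; j < 16; j <<= 1) {       /* j*v for j = 1, 2, 4, 8 */
    for (int k = 0; k < j; k++) {
      tbl[0][k ^ j] = v[0] ^ tbl[0][k];
      tbl[1][k ^ j] = v[1] ^ tbl[1][k];
    }
    int overflow = (int) (v[0] >> 63);     /* now double v */
    v[0] = (v[0] << 1) | (v[1] >> 63);
    v[1] <<= 1;
    if (overflow) v[1] ^= prim_poly;
  }
}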
*/ - gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 256); - - /* Doing this instead of gf_do_initial_region_alignment() because that doesn't hold 128-bit vals */ - - gf_w128_multiply_region_from_single(gf, src, dest, val, ((char*)rd.s_start-(char*)src), xor); - - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - top = (uint64_t *) rd.d_top; - - ld = (struct gf_w128_split_4_128_data *) h->private; - - if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) { - v[0] = val[0]; - v[1] = val[1]; - for (i = 0; i < 32; i++) { - ld->tables[0][i][0] = 0; - ld->tables[1][i][0] = 0; - for (j = 1; j < 16; j <<= 1) { - for (k = 0; k < j; k++) { - ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]); - ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]); - } - pp = (v[0] & (1ULL << 63)); - v[0] <<= 1; - if (v[1] & (1ULL << 63)) v[0] ^= 1; - v[1] <<= 1; - if (pp) v[1] ^= h->prim_poly; - } - } - } - - ld->last_value[0] = val[0]; - ld->last_value[1] = val[1]; - - for (i = 0; i < 32; i++) { - for (j = 0; j < 16; j++) { - for (k = 0; k < 16; k++) { - btable[k] = (uint8_t) ld->tables[1-(j/8)][i][k]; - ld->tables[1-(j/8)][i][k] >>= 8; - } - tables[i][j] = _mm_loadu_si128((__m128i *) btable); -/* - printf("%2d %2d: ", i, j); - MM_PRINT8("", tables[i][j]); - */ - } - } - - - mask1 = _mm_set1_epi8(0xf); - - while (d64 != top) { - - if (xor) { - for (i = 0; i < 16; i++) p[i] = _mm_load_si128 ((__m128i *) (d64+i*2)); - } else { - for (i = 0; i < 16; i++) p[i] = _mm_setzero_si128(); - } - i = 0; - for (k = 0; k < 16; k++) { - v0 = _mm_load_si128((__m128i *) s64); - s64 += 2; - - si = _mm_and_si128(v0, mask1); - - for (j = 0; j < 16; j++) { - p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); - } - i++; - v0 = _mm_srli_epi32(v0, 4); - si = _mm_and_si128(v0, mask1); - for (j = 0; j < 16; j++) { - p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); - } - i++; - } - for (i = 0; i < 16; i++) { - _mm_store_si128((__m128i *) d64, p[i]); - d64 += 2; - } - } - /* Doing this instead of gf_do_final_region_alignment() because that doesn't hold 128-bit vals */ - - gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((char*)src+bytes)-(char*)rd.s_top, xor); -#endif -} - -static -void -gf_w128_split_8_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) -{ - int i, j, k; - uint64_t pp; - gf_internal_t *h; - uint64_t *s64, *d64, *top; - gf_region_data rd; - uint64_t v[2], s; - struct gf_w128_split_8_128_data *ld; - - /* Check on alignment. Ignore it otherwise. 
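The ALTMAP inner loop above leans on a single instruction: _mm_shuffle_epi8, which performs sixteen parallel 4-bit table lookups per call. Isolated for clarity (table_lookup_sketch is hypothetical; assumes SSSE3):

#ifdef INTEL_SSSE3
#include <tmmintrin.h>

/* Each byte of src selects a byte of table through its low nibble.
   The mask clears the high bits, which _mm_shuffle_epi8 would
   otherwise interpret as a zero-the-lane flag. */
static __m128i table_lookup_sketch(__m128i table, __m128i src)
{
  __m128i mask = _mm_set1_epi8(0x0f);
  return _mm_shuffle_epi8(table, _mm_and_si128(src, mask));
}
#endif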
*/ - gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); - - if (val[0] == 0) { - if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } - } - - h = (gf_internal_t *) gf->scratch; - ld = (struct gf_w128_split_8_128_data *) h->private; - - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - top = (uint64_t *) rd.d_top; - - if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) { - v[0] = val[0]; - v[1] = val[1]; - for (i = 0; i < 16; i++) { - ld->tables[0][i][0] = 0; - ld->tables[1][i][0] = 0; - for (j = 1; j < (1 << 8); j <<= 1) { - for (k = 0; k < j; k++) { - ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]); - ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]); - } - pp = (v[0] & (1ULL << 63)); - v[0] <<= 1; - if (v[1] & (1ULL << 63)) v[0] ^= 1; - v[1] <<= 1; - if (pp) v[1] ^= h->prim_poly; - } - } - } - ld->last_value[0] = val[0]; - ld->last_value[1] = val[1]; - - while (d64 < top) { - v[0] = (xor) ? d64[0] : 0; - v[1] = (xor) ? d64[1] : 0; - s = s64[1]; - i = 0; - while (s != 0) { - v[0] ^= ld->tables[0][i][s&0xff]; - v[1] ^= ld->tables[1][i][s&0xff]; - s >>= 8; - i++; - } - s = s64[0]; - i = 8; - while (s != 0) { - v[0] ^= ld->tables[0][i][s&0xff]; - v[1] ^= ld->tables[1][i][s&0xff]; - s >>= 8; - i++; - } - d64[0] = v[0]; - d64[1] = v[1]; - s64 += 2; - d64 += 2; - } -} - -void -gf_w128_bytwo_b_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) -{ - uint64_t bmask, pp; - gf_internal_t *h; - uint64_t a[2], c[2], b[2], *s64, *d64, *top; - gf_region_data rd; - - /* We only do this to check on alignment. */ - gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); - - if (val[0] == 0) { - if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } - } - - h = (gf_internal_t *) gf->scratch; - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - top = (uint64_t *) rd.d_top; - bmask = (1ULL << 63); - - while (d64 < top) { - set_zero(c, 0); - b[0] = s64[0]; - b[1] = s64[1]; - a[0] = val[0]; - a[1] = val[1]; - - while (a[0] != 0) { - if (a[1] & 1) { - c[0] ^= b[0]; - c[1] ^= b[1]; - } - a[1] >>= 1; - if (a[0] & 1) a[1] ^= bmask; - a[0] >>= 1; - pp = (b[0] & bmask); - b[0] <<= 1; - if (b[1] & bmask) b[0] ^= 1; - b[1] <<= 1; - if (pp) b[1] ^= h->prim_poly; - } - while (1) { - if (a[1] & 1) { - c[0] ^= b[0]; - c[1] ^= b[1]; - } - a[1] >>= 1; - if (a[1] == 0) break; - pp = (b[0] & bmask); - b[0] <<= 1; - if (b[1] & bmask) b[0] ^= 1; - b[1] <<= 1; - if (pp) b[1] ^= h->prim_poly; - } - if (xor) { - d64[0] ^= c[0]; - d64[1] ^= c[1]; - } else { - d64[0] = c[0]; - d64[1] = c[1]; - } - s64 += 2; - d64 += 2; - } -} - -static -void gf_w128_group_m_init(gf_t *gf, gf_val_128_t b128) -{ - int i, j; - int g_m; - uint64_t prim_poly, lbit; - gf_internal_t *scratch; - gf_group_tables_t *gt; - uint64_t a128[2]; - scratch = (gf_internal_t *) gf->scratch; - gt = scratch->private; - g_m = scratch->arg1; - prim_poly = scratch->prim_poly; - - - set_zero(gt->m_table, 0); - a_get_b(gt->m_table, 2, b128, 0); - lbit = 1; - lbit <<= 63; - - for (i = 2; i < (1 << g_m); i <<= 1) { - a_get_b(a128, 0, gt->m_table, 2 * (i >> 1)); - two_x(a128); - a_get_b(gt->m_table, 2 * i, a128, 0); - if (gt->m_table[2 * (i >> 1)] & lbit) gt->m_table[(2 * i) + 1] ^= prim_poly; - for (j = 0; j < i; j++) { - gt->m_table[(2 * i) + (2 * j)] = gt->m_table[(2 * i)] ^ gt->m_table[(2 * j)]; - gt->m_table[(2 * i) + (2 * j) 
+ 1] = gt->m_table[(2 * i) + 1] ^ gt->m_table[(2 * j) + 1]; - } - } - return; -} - -void -gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) -{ - int i; - /* index_r, index_m, total_m (if g_r > g_m) */ - int i_r, i_m, t_m; - int mask_m, mask_r; - int g_m, g_r; - uint64_t p_i[2], a[2]; - gf_internal_t *scratch; - gf_group_tables_t *gt; - - scratch = (gf_internal_t *) gf->scratch; - gt = scratch->private; - g_m = scratch->arg1; - g_r = scratch->arg2; - - mask_m = (1 << g_m) - 1; - mask_r = (1 << g_r) - 1; - - if (b128[0] != gt->m_table[2] || b128[1] != gt->m_table[3]) { - gf_w128_group_m_init(gf, b128); - } - - p_i[0] = 0; - p_i[1] = 0; - a[0] = a128[0]; - a[1] = a128[1]; - - t_m = 0; - i_r = 0; - - /* Top 64 bits */ - for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { - i_m = (a[0] >> (i * g_m)) & mask_m; - i_r ^= (p_i[0] >> (64 - g_m)) & mask_r; - p_i[0] <<= g_m; - p_i[0] ^= (p_i[1] >> (64-g_m)); - p_i[1] <<= g_m; - p_i[0] ^= gt->m_table[2 * i_m]; - p_i[1] ^= gt->m_table[(2 * i_m) + 1]; - t_m += g_m; - if (t_m == g_r) { - p_i[1] ^= gt->r_table[i_r]; - t_m = 0; - i_r = 0; - } else { - i_r <<= g_m; - } - } - - for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { - i_m = (a[1] >> (i * g_m)) & mask_m; - i_r ^= (p_i[0] >> (64 - g_m)) & mask_r; - p_i[0] <<= g_m; - p_i[0] ^= (p_i[1] >> (64-g_m)); - p_i[1] <<= g_m; - p_i[0] ^= gt->m_table[2 * i_m]; - p_i[1] ^= gt->m_table[(2 * i_m) + 1]; - t_m += g_m; - if (t_m == g_r) { - p_i[1] ^= gt->r_table[i_r]; - t_m = 0; - i_r = 0; - } else { - i_r <<= g_m; - } - } - c128[0] = p_i[0]; - c128[1] = p_i[1]; -} - -static -void -gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) -{ - int i; - int i_r, i_m, t_m; - int mask_m, mask_r; - int g_m, g_r; - uint64_t p_i[2], a[2]; - gf_internal_t *scratch; - gf_group_tables_t *gt; - gf_region_data rd; - uint64_t *a128, *c128, *top; - - /* We only do this to check on alignment. 
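Each pass of the loops above advances the product by one g_m-bit digit of a. The step in isolation (group_step_sketch is hypothetical; digit_times_b is the m_table pair for the current digit, and i_r collects overflow bits until g_r of them are ready to be folded in through r_table):

#include <stdint.h>

/* p[0]/p[1] are the high/low halves of the running product. */
static void group_step_sketch(uint64_t p[2], int *i_r, int g_m,
                              int mask_r, const uint64_t digit_times_b[2])
{
  *i_r ^= (int) ((p[0] >> (64 - g_m)) & mask_r);  /* bits shifted out */
  p[0] = (p[0] << g_m) | (p[1] >> (64 - g_m));
  p[1] <<= g_m;
  p[0] ^= digit_times_b[0];                       /* fold in digit * b */
  p[1] ^= digit_times_b[1];
}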
*/ - gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); - - if (val[0] == 0) { - if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } - } - - scratch = (gf_internal_t *) gf->scratch; - gt = scratch->private; - g_m = scratch->arg1; - g_r = scratch->arg2; - - mask_m = (1 << g_m) - 1; - mask_r = (1 << g_r) - 1; - - if (val[0] != gt->m_table[2] || val[1] != gt->m_table[3]) { - gf_w128_group_m_init(gf, val); - } - - a128 = (uint64_t *) src; - c128 = (uint64_t *) dest; - top = (uint64_t *) rd.d_top; - - while (c128 < top) { - p_i[0] = 0; - p_i[1] = 0; - a[0] = a128[0]; - a[1] = a128[1]; - - t_m = 0; - i_r = 0; - - /* Top 64 bits */ - for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { - i_m = (a[0] >> (i * g_m)) & mask_m; - i_r ^= (p_i[0] >> (64 - g_m)) & mask_r; - p_i[0] <<= g_m; - p_i[0] ^= (p_i[1] >> (64-g_m)); - p_i[1] <<= g_m; - - p_i[0] ^= gt->m_table[2 * i_m]; - p_i[1] ^= gt->m_table[(2 * i_m) + 1]; - t_m += g_m; - if (t_m == g_r) { - p_i[1] ^= gt->r_table[i_r]; - t_m = 0; - i_r = 0; - } else { - i_r <<= g_m; - } - } - for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { - i_m = (a[1] >> (i * g_m)) & mask_m; - i_r ^= (p_i[0] >> (64 - g_m)) & mask_r; - p_i[0] <<= g_m; - p_i[0] ^= (p_i[1] >> (64-g_m)); - p_i[1] <<= g_m; - p_i[0] ^= gt->m_table[2 * i_m]; - p_i[1] ^= gt->m_table[(2 * i_m) + 1]; - t_m += g_m; - if (t_m == g_r) { - p_i[1] ^= gt->r_table[i_r]; - t_m = 0; - i_r = 0; - } else { - i_r <<= g_m; - } - } - - if (xor) { - c128[0] ^= p_i[0]; - c128[1] ^= p_i[1]; - } else { - c128[0] = p_i[0]; - c128[1] = p_i[1]; - } - a128 += 2; - c128 += 2; - } -} - -/* a^-1 -> b */ - void -gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128) -{ - uint64_t e_i[2], e_im1[2], e_ip1[2]; - uint64_t d_i, d_im1, d_ip1; - uint64_t y_i[2], y_im1[2], y_ip1[2]; - uint64_t c_i[2]; - uint64_t *b; - uint64_t one = 1; - - /* This needs to return some sort of error (in b128?) */ - if (a128[0] == 0 && a128[1] == 0) return; - - b = (uint64_t *) b128; - - e_im1[0] = 0; - e_im1[1] = ((gf_internal_t *) (gf->scratch))->prim_poly; - e_i[0] = a128[0]; - e_i[1] = a128[1]; - d_im1 = 128; - - //Allen: I think d_i starts at 63 here, and checks each bit of a, starting at MSB, looking for the first nonzero bit - //so d_i should be 0 if this half of a is all 0s, otherwise it should be the position from right of the first-from-left zero bit of this half of a. - //BUT if d_i is 0 at end we won't know yet if the rightmost bit of this half is 1 or not - - for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[0]) == 0 && d_i > 0; d_i--) ; - - //Allen: this is testing just the first half of the stop condition above, so if it holds we know we did not find a nonzero bit yet - - if (!((one << d_i) & e_i[0])) { - - //Allen: this is doing the same thing on the other half of a. In other words, we're still searching for a nonzero bit of a. - // but not bothering to test if d_i hits zero, which is fine because we've already tested for a=0. - - for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[1]) == 0; d_i--) ; - - } else { - - //Allen: if a 1 was found in more-significant half of a, make d_i the ACTUAL index of the first nonzero bit in the entire a. 
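Taken together, this initial scan and the adjustment just below compute the degree of a128, the index of its most significant set bit. A compact equivalent (degree128_sketch is hypothetical; the caller has already excluded zero):

#include <stdint.h>

/* e[0] is the high half, e[1] the low half. */
static int degree128_sketch(const uint64_t e[2])
{
  uint64_t one = 1;
  int d;

  for (d = 63; d > 0 && ((one << d) & e[0]) == 0; d--) ;
  if ((one << d) & e[0]) return d + 64;   /* found in the high half */
  for (d = 63; d > 0 && ((one << d) & e[1]) == 0; d--) ;
  return d;
}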
- - d_i += 64; - } - y_i[0] = 0; - y_i[1] = 1; - y_im1[0] = 0; - y_im1[1] = 0; - - while (!(e_i[0] == 0 && e_i[1] == 1)) { - - e_ip1[0] = e_im1[0]; - e_ip1[1] = e_im1[1]; - d_ip1 = d_im1; - c_i[0] = 0; - c_i[1] = 0; - - while (d_ip1 >= d_i) { - if ((d_ip1 - d_i) >= 64) { - c_i[0] ^= (one << ((d_ip1 - d_i) - 64)); - e_ip1[0] ^= (e_i[1] << ((d_ip1 - d_i) - 64)); - } else { - c_i[1] ^= (one << (d_ip1 - d_i)); - e_ip1[0] ^= (e_i[0] << (d_ip1 - d_i)); - if (d_ip1 - d_i > 0) e_ip1[0] ^= (e_i[1] >> (64 - (d_ip1 - d_i))); - e_ip1[1] ^= (e_i[1] << (d_ip1 - d_i)); - } - d_ip1--; - if (e_ip1[0] == 0 && e_ip1[1] == 0) { b[0] = 0; b[1] = 0; return; } - while (d_ip1 >= 64 && (e_ip1[0] & (one << (d_ip1 - 64))) == 0) d_ip1--; - while (d_ip1 < 64 && (e_ip1[1] & (one << d_ip1)) == 0) d_ip1--; - } - gf->multiply.w128(gf, c_i, y_i, y_ip1); - y_ip1[0] ^= y_im1[0]; - y_ip1[1] ^= y_im1[1]; - - y_im1[0] = y_i[0]; - y_im1[1] = y_i[1]; - - y_i[0] = y_ip1[0]; - y_i[1] = y_ip1[1]; - - e_im1[0] = e_i[0]; - e_im1[1] = e_i[1]; - d_im1 = d_i; - e_i[0] = e_ip1[0]; - e_i[1] = e_ip1[1]; - d_i = d_ip1; - } - - b[0] = y_i[0]; - b[1] = y_i[1]; - return; -} - - void -gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) -{ - uint64_t d[2]; - gf->inverse.w128(gf, b128, d); - gf->multiply.w128(gf, a128, d, c128); - return; -} - - void -gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128) -{ - uint64_t one128[2]; - one128[0] = 0; - one128[1] = 1; - gf->divide.w128(gf, one128, a128, b128); - return; -} - - -static - void -gf_w128_composite_inverse(gf_t *gf, gf_val_128_t a, gf_val_128_t inv) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint64_t a0 = a[1]; - uint64_t a1 = a[0]; - uint64_t c0, c1, d, tmp; - uint64_t a0inv, a1inv; - - if (a0 == 0) { - a1inv = base_gf->inverse.w64(base_gf, a1); - c0 = base_gf->multiply.w64(base_gf, a1inv, h->prim_poly); - c1 = a1inv; - } else if (a1 == 0) { - c0 = base_gf->inverse.w64(base_gf, a0); - c1 = 0; - } else { - a1inv = base_gf->inverse.w64(base_gf, a1); - a0inv = base_gf->inverse.w64(base_gf, a0); - - d = base_gf->multiply.w64(base_gf, a1, a0inv); - - tmp = (base_gf->multiply.w64(base_gf, a1, a0inv) ^ base_gf->multiply.w64(base_gf, a0, a1inv) ^ h->prim_poly); - tmp = base_gf->inverse.w64(base_gf, tmp); - - d = base_gf->multiply.w64(base_gf, d, tmp); - - c0 = base_gf->multiply.w64(base_gf, (d^1), a0inv); - c1 = base_gf->multiply.w64(base_gf, d, a1inv); - } - inv[0] = c1; - inv[1] = c0; -} - -static - void -gf_w128_composite_multiply(gf_t *gf, gf_val_128_t a, gf_val_128_t b, gf_val_128_t rv) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint64_t b0 = b[1]; - uint64_t b1 = b[0]; - uint64_t a0 = a[1]; - uint64_t a1 = a[0]; - uint64_t a1b1; - - a1b1 = base_gf->multiply.w64(base_gf, a1, b1); - - rv[1] = (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1); - rv[0] = base_gf->multiply.w64(base_gf, a1, b0) ^ - base_gf->multiply.w64(base_gf, a0, b1) ^ - base_gf->multiply.w64(base_gf, a1b1, h->prim_poly); -} - -static - void -gf_w128_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint64_t b0 = val[1]; - uint64_t b1 = val[0]; - uint64_t *s64, *d64; - uint64_t *top; - uint64_t a0, a1, a1b1; - gf_region_data rd; - - if (val[0] == 0 && val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, 
bytes, 0, xor, 8); - - s64 = rd.s_start; - d64 = rd.d_start; - top = rd.d_top; - - if (xor) { - while (d64 < top) { - a1 = s64[0]; - a0 = s64[1]; - a1b1 = base_gf->multiply.w64(base_gf, a1, b1); - - d64[1] ^= (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1); - d64[0] ^= (base_gf->multiply.w64(base_gf, a1, b0) ^ - base_gf->multiply.w64(base_gf, a0, b1) ^ - base_gf->multiply.w64(base_gf, a1b1, h->prim_poly)); - s64 += 2; - d64 += 2; - } - } else { - while (d64 < top) { - a1 = s64[0]; - a0 = s64[1]; - a1b1 = base_gf->multiply.w64(base_gf, a1, b1); - - d64[1] = (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1); - d64[0] = (base_gf->multiply.w64(base_gf, a1, b0) ^ - base_gf->multiply.w64(base_gf, a0, b1) ^ - base_gf->multiply.w64(base_gf, a1b1, h->prim_poly)); - s64 += 2; - d64 += 2; - } - } -} - -static -void -gf_w128_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int - xor) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; - gf_val_64_t val0 = val[1]; - gf_val_64_t val1 = val[0]; - uint8_t *slow, *shigh; - uint8_t *dlow, *dhigh, *top; - int sub_reg_size; - gf_region_data rd; - - gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 64); - gf_w128_multiply_region_from_single(gf, src, dest, val, ((char*)rd.s_start-(char*)src), xor); - - slow = (uint8_t *) rd.s_start; - dlow = (uint8_t *) rd.d_start; - top = (uint8_t*) rd.d_top; - sub_reg_size = (top - dlow)/2; - shigh = slow + sub_reg_size; - dhigh = dlow + sub_reg_size; - - base_gf->multiply_region.w64(base_gf, slow, dlow, val0, sub_reg_size, xor); - base_gf->multiply_region.w64(base_gf, shigh, dlow, val1, sub_reg_size, 1); - base_gf->multiply_region.w64(base_gf, slow, dhigh, val1, sub_reg_size, xor); - base_gf->multiply_region.w64(base_gf, shigh, dhigh, val0, sub_reg_size, 1); - base_gf->multiply_region.w64(base_gf, shigh, dhigh, base_gf->multiply.w64(base_gf, h->prim_poly, val1 - ), sub_reg_size, 1); - - gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((char*)src+bytes)-(char*)rd.s_top, xor); -} - - - static -int gf_w128_composite_init(gf_t *gf) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - - if (h->region_type & GF_REGION_ALTMAP) { - gf->multiply_region.w128 = gf_w128_composite_multiply_region_alt; - } else { - gf->multiply_region.w128 = gf_w128_composite_multiply_region; - } - - gf->multiply.w128 = gf_w128_composite_multiply; - gf->divide.w128 = gf_w128_divide_from_inverse; - gf->inverse.w128 = gf_w128_composite_inverse; - - return 1; -} - -static -int gf_w128_cfm_init(gf_t *gf) -{ - gf_internal_t * h = gf->scratch; - if (h->sse & GF_SSE4_PCLMUL) { - return 0; - } else { - gf->inverse.w128 = gf_w128_euclid; - gf->multiply.w128 = gf_w128_clm_multiply; - gf->multiply_region.w128 = gf_w128_clm_multiply_region_from_single; - return 1; - } -} - -static -int gf_w128_shift_init(gf_t *gf) -{ - gf->multiply.w128 = gf_w128_shift_multiply; - gf->inverse.w128 = gf_w128_euclid; - gf->multiply_region.w128 = gf_w128_multiply_region_from_single; - return 1; -} - - static -int gf_w128_bytwo_init(gf_t *gf) -{ - gf_internal_t *h; - h = (gf_internal_t *) gf->scratch; - - if (h->mult_type == GF_MULT_BYTWO_p) { - gf->multiply.w128 = gf_w128_bytwo_p_multiply; - /*gf->multiply.w128 = gf_w128_sse_bytwo_p_multiply;*/ - /* John: the sse function is slower.*/ - } else { - gf->multiply.w128 = gf_w128_bytwo_b_multiply; - /*gf->multiply.w128 = gf_w128_sse_bytwo_b_multiply; -Ben: This sse function is also slower. 
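For readers new to BYTWO_b, the same loop is easier to follow at w=8. This restatement is hypothetical and assumes the common w=8 polynomial 0x11d; it peels bits off a from the low end while repeatedly doubling b modulo the polynomial:

#include <stdint.h>

static uint8_t bytwo_b_w8_sketch(uint8_t a, uint8_t b)
{
  uint8_t c = 0;
  uint16_t bb = b;                /* one spare bit for the overflow test */

  while (a) {
    if (a & 1) c ^= (uint8_t) bb;
    a >>= 1;
    bb <<= 1;
    if (bb & 0x100) bb ^= 0x11d;  /* reduce: x^8 = x^4 + x^3 + x^2 + 1 */
  }
  return c;
}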
*/ - } - gf->inverse.w128 = gf_w128_euclid; - gf->multiply_region.w128 = gf_w128_bytwo_b_multiply_region; - return 1; -} - -/* - * Because the prim poly is only 8 bits and we are limiting g_r to 16, I do not need the high 64 - * bits in all of these numbers. - */ - static -void gf_w128_group_r_init(gf_t *gf) -{ - int i, j; - int g_r; - uint64_t pp; - gf_internal_t *scratch; - gf_group_tables_t *gt; - scratch = (gf_internal_t *) gf->scratch; - gt = scratch->private; - g_r = scratch->arg2; - pp = scratch->prim_poly; - - gt->r_table[0] = 0; - for (i = 1; i < (1 << g_r); i++) { - gt->r_table[i] = 0; - for (j = 0; j < g_r; j++) { - if (i & (1 << j)) { - gt->r_table[i] ^= (pp << j); - } - } - } - return; -} - - static -int gf_w128_split_init(gf_t *gf) -{ - struct gf_w128_split_4_128_data *sd4; - struct gf_w128_split_8_128_data *sd8; - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - - gf->multiply.w128 = gf_w128_bytwo_p_multiply; - if((h->sse & GF_SSE4_PCLMUL) && !(h->region_type & GF_REGION_NOSSE)){ - gf->multiply.w128 = gf_w128_clm_multiply; - } - - gf->inverse.w128 = gf_w128_euclid; - - if ((h->arg1 != 4 && h->arg2 != 4) || h->mult_type == GF_MULT_DEFAULT) { - sd8 = (struct gf_w128_split_8_128_data *) h->private; - sd8->last_value[0] = 0; - sd8->last_value[1] = 0; - gf->multiply_region.w128 = gf_w128_split_8_128_multiply_region; - } else { - sd4 = (struct gf_w128_split_4_128_data *) h->private; - sd4->last_value[0] = 0; - sd4->last_value[1] = 0; - if((h->region_type & GF_REGION_ALTMAP)) - { - #ifdef INTEL_SSE4 - if(!(h->region_type & GF_REGION_NOSSE)) - gf->multiply_region.w128 = gf_w128_split_4_128_sse_altmap_multiply_region; - else - return 0; - #else - return 0; - #endif - } - else { - if(h->sse & GF_SSE4) { - if(!(h->region_type & GF_REGION_NOSSE)) - gf->multiply_region.w128 = gf_w128_split_4_128_sse_multiply_region; - else - gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region; - } else { - gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region; - } - } - } - return 1; -} - - -static -int gf_w128_group_init(gf_t *gf) -{ - gf_internal_t *scratch; - gf_group_tables_t *gt; - int g_r, size_r; - - scratch = (gf_internal_t *) gf->scratch; - gt = scratch->private; - g_r = scratch->arg2; - size_r = (1 << g_r); - - gt->r_table = scratch->private + (2 * sizeof(uint64_t *)); - gt->m_table = gt->r_table + size_r; - gt->m_table[2] = 0; - gt->m_table[3] = 0; - - gf->multiply.w128 = gf_w128_group_multiply; - gf->inverse.w128 = gf_w128_euclid; - gf->multiply_region.w128 = gf_w128_group_multiply_region; - - gf_w128_group_r_init(gf); - - return 1; -} - -void gf_w128_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv) -{ - gf_val_128_t s; - - s = (gf_val_128_t) start; - s += (index * 2); - memcpy(rv, s, 16); -} - -static void gf_w128_split_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv) -{ - int i, blocks; - uint64_t *r64, tmp; - uint8_t *r8; - gf_region_data rd; - - gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 256); - r64 = (uint64_t *) start; - if ((r64 + index*2 < (uint64_t *) rd.d_start) || - (r64 + index*2 >= (uint64_t *) rd.d_top)) { - memcpy(rv, r64+(index*2), 16); - return; - } - - index -= (((uint64_t *) rd.d_start) - r64)/2; - r64 = (uint64_t *) rd.d_start; - - blocks = index/16; - r64 += (blocks*32); - index %= 16; - r8 = (uint8_t *) r64; - r8 += index; - rv[0] = 0; - rv[1] = 0; - - for (i = 0; i < 8; i++) { - tmp = *r8; - rv[1] |= (tmp << (i*8)); - r8 += 16; - } - - for (i = 0; i < 8; i++) { - tmp = *r8; - 
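/* accumulate the next byte of the second 64-bit half; in this altmap layout the consecutive bytes of a word sit 16 bytes apart, just as in the rv[1] loop above */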
rv[0] |= (tmp << (i*8)); - r8 += 16; - } - return; -} - - static -void gf_w128_composite_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv) -{ - int sub_size; - gf_internal_t *h; - uint8_t *r8, *top; - uint64_t *r64; - gf_region_data rd; - - h = (gf_internal_t *) gf->scratch; - gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 64); - r64 = (uint64_t *) start; - if ((r64 + index*2 < (uint64_t *) rd.d_start) || - (r64 + index*2 >= (uint64_t *) rd.d_top)) { - memcpy(rv, r64+(index*2), 16); - return; - } - index -= (((uint64_t *) rd.d_start) - r64)/2; - r8 = (uint8_t *) rd.d_start; - top = (uint8_t *) rd.d_top; - sub_size = (top-r8)/2; - - rv[1] = h->base_gf->extract_word.w64(h->base_gf, r8, sub_size, index); - rv[0] = h->base_gf->extract_word.w64(h->base_gf, r8+sub_size, sub_size, index); - - return; -} - -int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) -{ - int size_m, size_r; - if (divide_type==GF_DIVIDE_MATRIX) return 0; - - switch(mult_type) - { - case GF_MULT_CARRY_FREE: - return sizeof(gf_internal_t); - break; - case GF_MULT_SHIFT: - return sizeof(gf_internal_t); - break; - case GF_MULT_BYTWO_p: - case GF_MULT_BYTWO_b: - return sizeof(gf_internal_t); - break; - case GF_MULT_DEFAULT: - case GF_MULT_SPLIT_TABLE: - if ((arg1 == 4 && arg2 == 128) || (arg1 == 128 && arg2 == 4)) { - return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_4_128_data) + 64; - } else if ((arg1 == 8 && arg2 == 128) || (arg1 == 128 && arg2 == 8) || mult_type == GF_MULT_DEFAULT) { - return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_8_128_data) + 64; - } - return 0; - break; - case GF_MULT_GROUP: - /* JSP We've already error checked the arguments. */ - size_m = (1 << arg1) * 2 * sizeof(uint64_t); - size_r = (1 << arg2) * 2 * sizeof(uint64_t); - /* - * two pointers prepend the table data for structure - * because the tables are of dynamic size - */ - return sizeof(gf_internal_t) + size_m + size_r + 4 * sizeof(uint64_t *); - break; - case GF_MULT_COMPOSITE: - if (arg1 == 2) { - return sizeof(gf_internal_t) + 4; - } else { - return 0; - } - break; - - default: - return 0; - } -} - -int gf_w128_init(gf_t *gf) -{ - gf_internal_t *h; - int no_default_flag = 0; - - h = (gf_internal_t *) gf->scratch; - - /* Allen: set default primitive polynomial / irreducible polynomial if needed */ - - if (h->prim_poly == 0) { - if (h->mult_type == GF_MULT_COMPOSITE) { - h->prim_poly = gf_composite_get_default_poly(h->base_gf); - if (h->prim_poly == 0) return 0; /* This shouldn't happen */ - } else { - h->prim_poly = 0x87; /* Omitting the leftmost 1 as in w=32 */ - } - if (no_default_flag == 1) { - fprintf(stderr,"Code contains no default irreducible polynomial for given base field\n"); - return 0; - } - } - - gf->multiply.w128 = NULL; - gf->divide.w128 = NULL; - gf->inverse.w128 = NULL; - gf->multiply_region.w128 = NULL; - switch(h->mult_type) { - case GF_MULT_BYTWO_p: - case GF_MULT_BYTWO_b: if (gf_w128_bytwo_init(gf) == 0) return 0; break; - case GF_MULT_CARRY_FREE: if (gf_w128_cfm_init(gf) == 0) return 0; break; - case GF_MULT_SHIFT: if (gf_w128_shift_init(gf) == 0) return 0; break; - case GF_MULT_GROUP: if (gf_w128_group_init(gf) == 0) return 0; break; - case GF_MULT_DEFAULT: - case GF_MULT_SPLIT_TABLE: if (gf_w128_split_init(gf) == 0) return 0; break; - case GF_MULT_COMPOSITE: if (gf_w128_composite_init(gf) == 0) return 0; break; - default: return 0; - } - - /* Ben: Used to be h->region_type == GF_REGION_ALTMAP, but failed since there - are 
multiple flags in h->region_type */ - if (h->mult_type == GF_MULT_SPLIT_TABLE && (h->region_type & GF_REGION_ALTMAP)) { - gf->extract_word.w128 = gf_w128_split_extract_word; - } else if (h->mult_type == GF_MULT_COMPOSITE && h->region_type == GF_REGION_ALTMAP) { - gf->extract_word.w128 = gf_w128_composite_extract_word; - } else { - gf->extract_word.w128 = gf_w128_extract_word; - } - - if (h->divide_type == GF_DIVIDE_EUCLID) { - gf->divide.w128 = gf_w128_divide_from_inverse; - } - - if (gf->inverse.w128 != NULL && gf->divide.w128 == NULL) { - gf->divide.w128 = gf_w128_divide_from_inverse; - } - if (gf->inverse.w128 == NULL && gf->divide.w128 != NULL) { - gf->inverse.w128 = gf_w128_inverse_from_divide; - } - return 1; -} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w16.c b/src/erasure-code/jerasure/gf-complete/src/gf_w16.c deleted file mode 100644 index f1fb6501269c4..0000000000000 --- a/src/erasure-code/jerasure/gf-complete/src/gf_w16.c +++ /dev/null @@ -1,2489 +0,0 @@ -/* - * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic - * James S. Plank, Ethan L. Miller, Kevin M. Greenan, - * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. - * - * gf_w16.c - * - * Routines for 16-bit Galois fields - */ - -#include "gf_int.h" -#include <stdio.h> -#include <stdlib.h> - -#define GF_FIELD_WIDTH (16) -#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH) -#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1 - -#define GF_BASE_FIELD_WIDTH (8) -#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) - -struct gf_w16_logtable_data { - uint16_t log_tbl[GF_FIELD_SIZE]; - uint16_t antilog_tbl[GF_FIELD_SIZE * 2]; - uint16_t inv_tbl[GF_FIELD_SIZE]; - uint16_t *d_antilog; -}; - -struct gf_w16_zero_logtable_data { - int log_tbl[GF_FIELD_SIZE]; - uint16_t _antilog_tbl[GF_FIELD_SIZE * 4]; - uint16_t *antilog_tbl; - uint16_t inv_tbl[GF_FIELD_SIZE]; -}; - -struct gf_w16_lazytable_data { - uint16_t log_tbl[GF_FIELD_SIZE]; - uint16_t antilog_tbl[GF_FIELD_SIZE * 2]; - uint16_t inv_tbl[GF_FIELD_SIZE]; - uint16_t *d_antilog; - uint16_t lazytable[GF_FIELD_SIZE]; -}; - -struct gf_w16_bytwo_data { - uint64_t prim_poly; - uint64_t mask1; - uint64_t mask2; -}; - -struct gf_w16_split_8_8_data { - uint16_t tables[3][256][256]; -}; - -struct gf_w16_group_4_4_data { - uint16_t reduce[16]; - uint16_t shift[16]; -}; - -struct gf_w16_composite_data { - uint8_t *mult_table; -}; - -#define AB2(ip, am1 ,am2, b, t1, t2) {\ - t1 = (b << 1) & am1;\ - t2 = b & am2; \ - t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \ - b = (t1 ^ (t2 & ip));} - -#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\ - t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \ - t2 = _mm_and_si128(va, m2); \ - t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \ - va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); } - -#define MM_PRINT(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 2) printf(" %02x %02x", blah[15-ii], blah[14-ii]); printf("\n"); } - -#define GF_FIRST_BIT (1 << 15) -#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? 
(((p) << 1) ^ h->prim_poly) : (p) << 1) - -static -inline -gf_val_32_t gf_w16_inverse_from_divide (gf_t *gf, gf_val_32_t a) -{ - return gf->divide.w32(gf, 1, a); -} - -static -inline -gf_val_32_t gf_w16_divide_from_inverse (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - b = gf->inverse.w32(gf, b); - return gf->multiply.w32(gf, a, b); -} - -static -void -gf_w16_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - gf_region_data rd; - uint16_t *s16; - uint16_t *d16; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); - gf_do_initial_region_alignment(&rd); - - s16 = (uint16_t *) rd.s_start; - d16 = (uint16_t *) rd.d_start; - - if (xor) { - while (d16 < ((uint16_t *) rd.d_top)) { - *d16 ^= gf->multiply.w32(gf, val, *s16); - d16++; - s16++; - } - } else { - while (d16 < ((uint16_t *) rd.d_top)) { - *d16 = gf->multiply.w32(gf, val, *s16); - d16++; - s16++; - } - } - gf_do_final_region_alignment(&rd); -} - -#if defined(INTEL_SSE4_PCLMUL) -static -void -gf_w16_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - gf_region_data rd; - uint16_t *s16; - uint16_t *d16; - __m128i a, b; - __m128i result; - __m128i prim_poly; - __m128i w; - gf_internal_t * h = gf->scratch; - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); - gf_do_initial_region_alignment(&rd); - - a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); - - s16 = (uint16_t *) rd.s_start; - d16 = (uint16_t *) rd.d_start; - - if (xor) { - while (d16 < ((uint16_t *) rd.d_top)) { - - /* see gf_w16_clm_multiply() to see explanation of method */ - - b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); - result = _mm_clmulepi64_si128 (a, b, 0); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - - *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); - d16++; - s16++; - } - } else { - while (d16 < ((uint16_t *) rd.d_top)) { - - /* see gf_w16_clm_multiply() to see explanation of method */ - - b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); - result = _mm_clmulepi64_si128 (a, b, 0); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - - *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); - d16++; - s16++; - } - } - gf_do_final_region_alignment(&rd); -} -#endif - -#if defined(INTEL_SSE4_PCLMUL) -static -void -gf_w16_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - gf_region_data rd; - uint16_t *s16; - uint16_t *d16; - - __m128i a, b; - __m128i result; - __m128i prim_poly; - __m128i w; - gf_internal_t * h = gf->scratch; - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); - - gf_set_region_data(&rd, gf, 
src, dest, bytes, val, xor, 2); - gf_do_initial_region_alignment(&rd); - - s16 = (uint16_t *) rd.s_start; - d16 = (uint16_t *) rd.d_start; - - if (xor) { - while (d16 < ((uint16_t *) rd.d_top)) { - - /* see gf_w16_clm_multiply() to see explanation of method */ - - b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); - result = _mm_clmulepi64_si128 (a, b, 0); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - - *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); - d16++; - s16++; - } - } else { - while (d16 < ((uint16_t *) rd.d_top)) { - - /* see gf_w16_clm_multiply() to see explanation of method */ - - b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); - result = _mm_clmulepi64_si128 (a, b, 0); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - - *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); - d16++; - s16++; - } - } - gf_do_final_region_alignment(&rd); -} -#endif - -#if defined(INTEL_SSE4_PCLMUL) -static -void -gf_w16_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - gf_region_data rd; - uint16_t *s16; - uint16_t *d16; - - __m128i a, b; - __m128i result; - __m128i prim_poly; - __m128i w; - gf_internal_t * h = gf->scratch; - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); - gf_do_initial_region_alignment(&rd); - - a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); - - s16 = (uint16_t *) rd.s_start; - d16 = (uint16_t *) rd.d_start; - - if (xor) { - while (d16 < ((uint16_t *) rd.d_top)) { - - /* see gf_w16_clm_multiply() to see explanation of method */ - - b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); - result = _mm_clmulepi64_si128 (a, b, 0); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - - *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); - d16++; - s16++; - } - } else { - while (d16 < ((uint16_t *) rd.d_top)) { - - /* see gf_w16_clm_multiply() to see explanation of method */ - - b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); - result = _mm_clmulepi64_si128 (a, b, 0); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = 
_mm_xor_si128 (result, w); - - *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); - d16++; - s16++; - } - } - gf_do_final_region_alignment(&rd); -} -#endif - -static -inline -gf_val_32_t gf_w16_euclid (gf_t *gf, gf_val_32_t b) -{ - gf_val_32_t e_i, e_im1, e_ip1; - gf_val_32_t d_i, d_im1, d_ip1; - gf_val_32_t y_i, y_im1, y_ip1; - gf_val_32_t c_i; - - if (b == 0) return -1; - e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; - e_i = b; - d_im1 = 16; - for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ; - y_i = 1; - y_im1 = 0; - - while (e_i != 1) { - - e_ip1 = e_im1; - d_ip1 = d_im1; - c_i = 0; - - while (d_ip1 >= d_i) { - c_i ^= (1 << (d_ip1 - d_i)); - e_ip1 ^= (e_i << (d_ip1 - d_i)); - if (e_ip1 == 0) return 0; - while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; - } - - y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i); - y_im1 = y_i; - y_i = y_ip1; - - e_im1 = e_i; - d_im1 = d_i; - e_i = e_ip1; - d_i = d_ip1; - } - - return y_i; -} - -static -gf_val_32_t gf_w16_extract_word(gf_t *gf, void *start, int bytes, int index) -{ - uint16_t *r16, rv; - - r16 = (uint16_t *) start; - rv = r16[index]; - return rv; -} - -static -gf_val_32_t gf_w16_composite_extract_word(gf_t *gf, void *start, int bytes, int index) -{ - int sub_size; - gf_internal_t *h; - uint8_t *r8, *top; - uint16_t a, b, *r16; - gf_region_data rd; - - h = (gf_internal_t *) gf->scratch; - gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32); - r16 = (uint16_t *) start; - if (r16 + index < (uint16_t *) rd.d_start) return r16[index]; - if (r16 + index >= (uint16_t *) rd.d_top) return r16[index]; - index -= (((uint16_t *) rd.d_start) - r16); - r8 = (uint8_t *) rd.d_start; - top = (uint8_t *) rd.d_top; - sub_size = (top-r8)/2; - - a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index); - b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index); - return (a | (b << 8)); -} - -static -gf_val_32_t gf_w16_split_extract_word(gf_t *gf, void *start, int bytes, int index) -{ - uint16_t *r16, rv; - uint8_t *r8; - gf_region_data rd; - - gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32); - r16 = (uint16_t *) start; - if (r16 + index < (uint16_t *) rd.d_start) return r16[index]; - if (r16 + index >= (uint16_t *) rd.d_top) return r16[index]; - index -= (((uint16_t *) rd.d_start) - r16); - r8 = (uint8_t *) rd.d_start; - r8 += ((index & 0xfffffff0)*2); - r8 += (index & 0xf); - rv = (*r8 << 8); - r8 += 16; - rv |= *r8; - return rv; -} - -static -inline -gf_val_32_t gf_w16_matrix (gf_t *gf, gf_val_32_t b) -{ - return gf_bitmatrix_inverse(b, 16, ((gf_internal_t *) (gf->scratch))->prim_poly); -} - -/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. I only - include it for completeness. It does have the feature that it requires no - extra memory. - */ - -static -inline -gf_val_32_t -gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) -{ - gf_val_32_t rv = 0; - -#if defined(INTEL_SSE4_PCLMUL) - - __m128i a, b; - __m128i result; - __m128i prim_poly; - __m128i w; - gf_internal_t * h = gf->scratch; - - a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); - b = _mm_insert_epi32 (a, b16, 0); - - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); - - /* Do the initial multiply */ - - result = _mm_clmulepi64_si128 (a, b, 0); - - /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only - have to do the reduction at most twice, because (w-2)/z == 2. 
Where - z is equal to the number of zeros after the leading 1 - - _mm_clmulepi64_si128 is the carryless multiply operation. Here - _mm_srli_si128 shifts the result to the right by 2 bytes. This allows - us to multiply the prim_poly by the leading bits of the result. We - then xor the result of that operation back with the result.*/ - - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - - /* Extracts 32 bit value from result. */ - - rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); - - -#endif - return rv; -} - -static -inline -gf_val_32_t -gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) -{ - gf_val_32_t rv = 0; - -#if defined(INTEL_SSE4_PCLMUL) - - __m128i a, b; - __m128i result; - __m128i prim_poly; - __m128i w; - gf_internal_t * h = gf->scratch; - - a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); - b = _mm_insert_epi32 (a, b16, 0); - - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); - - /* Do the initial multiply */ - - result = _mm_clmulepi64_si128 (a, b, 0); - - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - - /* Extracts 32 bit value from result. */ - - rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); - - -#endif - return rv; -} - -static -inline -gf_val_32_t -gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) -{ - gf_val_32_t rv = 0; - -#if defined(INTEL_SSE4_PCLMUL) - - __m128i a, b; - __m128i result; - __m128i prim_poly; - __m128i w; - gf_internal_t * h = gf->scratch; - - a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); - b = _mm_insert_epi32 (a, b16, 0); - - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); - - /* Do the initial multiply */ - - result = _mm_clmulepi64_si128 (a, b, 0); - - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); - result = _mm_xor_si128 (result, w); - - /* Extracts 32 bit value from result. 
*/ - - rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); - - -#endif - return rv; -} - - -static -inline - gf_val_32_t -gf_w16_shift_multiply (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) -{ - gf_val_32_t product, i, pp, a, b; - gf_internal_t *h; - - a = a16; - b = b16; - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - product = 0; - - for (i = 0; i < GF_FIELD_WIDTH; i++) { - if (a & (1 << i)) product ^= (b << i); - } - for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) { - if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); - } - return product; -} - -static -int gf_w16_shift_init(gf_t *gf) -{ - gf->multiply.w32 = gf_w16_shift_multiply; - return 1; -} - -static -int gf_w16_cfm_init(gf_t *gf) -{ -#if defined(INTEL_SSE4_PCLMUL) - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - - /*Ben: Determining how many reductions to do */ - - if ((0xfe00 & h->prim_poly) == 0) { - gf->multiply.w32 = gf_w16_clm_multiply_2; - gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_2; - } else if((0xf000 & h->prim_poly) == 0) { - gf->multiply.w32 = gf_w16_clm_multiply_3; - gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_3; - } else if ((0xe000 & h->prim_poly) == 0) { - gf->multiply.w32 = gf_w16_clm_multiply_4; - gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_4; - } else { - return 0; - } - return 1; -#endif - - return 0; -} - -/* KMG: GF_MULT_LOGTABLE: */ - -static -void -gf_w16_log_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - uint16_t *s16, *d16; - int lv; - struct gf_w16_logtable_data *ltd; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); - gf_do_initial_region_alignment(&rd); - - ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; - s16 = (uint16_t *) rd.s_start; - d16 = (uint16_t *) rd.d_start; - - lv = ltd->log_tbl[val]; - - if (xor) { - while (d16 < (uint16_t *) rd.d_top) { - *d16 ^= (*s16 == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[*s16]]); - d16++; - s16++; - } - } else { - while (d16 < (uint16_t *) rd.d_top) { - *d16 = (*s16 == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[*s16]]); - d16++; - s16++; - } - } - gf_do_final_region_alignment(&rd); -} - -static -inline -gf_val_32_t -gf_w16_log_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - struct gf_w16_logtable_data *ltd; - - ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; - return (a == 0 || b == 0) ? 
0 : ltd->antilog_tbl[(int) ltd->log_tbl[a] + (int) ltd->log_tbl[b]]; -} - -static -inline -gf_val_32_t -gf_w16_log_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - int log_sum = 0; - struct gf_w16_logtable_data *ltd; - - if (a == 0 || b == 0) return 0; - ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; - - log_sum = (int) ltd->log_tbl[a] - (int) ltd->log_tbl[b]; - return (ltd->d_antilog[log_sum]); -} - -static -gf_val_32_t -gf_w16_log_inverse(gf_t *gf, gf_val_32_t a) -{ - struct gf_w16_logtable_data *ltd; - - ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; - return (ltd->inv_tbl[a]); -} - -static -int gf_w16_log_init(gf_t *gf) -{ - gf_internal_t *h; - struct gf_w16_logtable_data *ltd; - int i, b; - int check = 0; - - h = (gf_internal_t *) gf->scratch; - ltd = h->private; - - for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++) - ltd->log_tbl[i] = 0; - ltd->d_antilog = ltd->antilog_tbl + GF_MULT_GROUP_SIZE; - - b = 1; - for (i = 0; i < GF_MULT_GROUP_SIZE; i++) { - if (ltd->log_tbl[b] != 0) check = 1; - ltd->log_tbl[b] = i; - ltd->antilog_tbl[i] = b; - ltd->antilog_tbl[i+GF_MULT_GROUP_SIZE] = b; - b <<= 1; - if (b & GF_FIELD_SIZE) { - b = b ^ h->prim_poly; - } - } - - /* If you can't construct the log table, there's a problem. This code is used for - some other implementations (e.g. in SPLIT), so if the log table doesn't work in - that instance, use CARRY_FREE / SHIFT instead. */ - - if (check) { - if (h->mult_type != GF_MULT_LOG_TABLE) { - -#if defined(INTEL_SSE4_PCLMUL) - return gf_w16_cfm_init(gf); -#endif - return gf_w16_shift_init(gf); - } else { - _gf_errno = GF_E_LOGPOLY; - return 0; - } - } - - ltd->inv_tbl[0] = 0; /* Not really, but we need to fill it with something */ - ltd->inv_tbl[1] = 1; - for (i = 2; i < GF_FIELD_SIZE; i++) { - ltd->inv_tbl[i] = ltd->antilog_tbl[GF_MULT_GROUP_SIZE-ltd->log_tbl[i]]; - } - - gf->inverse.w32 = gf_w16_log_inverse; - gf->divide.w32 = gf_w16_log_divide; - gf->multiply.w32 = gf_w16_log_multiply; - gf->multiply_region.w32 = gf_w16_log_multiply_region; - - return 1; -} - -/* JSP: GF_MULT_SPLIT_TABLE: Using 8 multiplication tables to leverage SSE instructions. -*/ - - -/* Ben: Does alternate mapping multiplication using a split table in the - lazy method without sse instructions*/ - -static -void -gf_w16_split_4_16_lazy_nosse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - uint64_t i, j, c, prod; - uint8_t *s8, *d8, *top; - uint16_t table[4][16]; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); - gf_do_initial_region_alignment(&rd); - - /*Ben: Constructs lazy multiplication table*/ - - for (j = 0; j < 16; j++) { - for (i = 0; i < 4; i++) { - c = (j << (i*4)); - table[i][j] = gf->multiply.w32(gf, c, val); - } - } - - /*Ben: s8 is the start of source, d8 is the start of dest, top is end of dest region. */ - - s8 = (uint8_t *) rd.s_start; - d8 = (uint8_t *) rd.d_start; - top = (uint8_t *) rd.d_top; - - - while (d8 < top) { - - /*Ben: Multiplies across 16 two byte quantities using alternate mapping - high bits are on the left, low bits are on the right. */ - - for (j=0;j<16;j++) { - - /*Ben: If the xor flag is set, the product should include what is in dest */ - prod = (xor) ? 
((uint16_t)(*d8)<<8) ^ *(d8+16) : 0; - - /*Ben: xors all 4 table lookups into the product variable*/ - - prod ^= ((table[0][*(s8+16)&0xf]) ^ - (table[1][(*(s8+16)&0xf0)>>4]) ^ - (table[2][*(s8)&0xf]) ^ - (table[3][(*(s8)&0xf0)>>4])); - - /*Ben: Stores product in the destination and moves on*/ - - *d8 = (uint8_t)(prod >> 8); - *(d8+16) = (uint8_t)(prod & 0x00ff); - s8++; - d8++; - } - s8+=16; - d8+=16; - } - gf_do_final_region_alignment(&rd); -} - -static - void -gf_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - uint64_t i, j, a, c, prod; - uint16_t *s16, *d16, *top; - uint16_t table[4][16]; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); - gf_do_initial_region_alignment(&rd); - - for (j = 0; j < 16; j++) { - for (i = 0; i < 4; i++) { - c = (j << (i*4)); - table[i][j] = gf->multiply.w32(gf, c, val); - } - } - - s16 = (uint16_t *) rd.s_start; - d16 = (uint16_t *) rd.d_start; - top = (uint16_t *) rd.d_top; - - while (d16 < top) { - a = *s16; - prod = (xor) ? *d16 : 0; - for (i = 0; i < 4; i++) { - prod ^= table[i][a&0xf]; - a >>= 4; - } - *d16 = prod; - s16++; - d16++; - } -} - -static -void -gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - uint64_t j, k, v, a, prod, *s64, *d64, *top64; - gf_internal_t *h; - uint64_t htable[256], ltable[256]; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); - gf_do_initial_region_alignment(&rd); - - h = (gf_internal_t *) gf->scratch; - - v = val; - ltable[0] = 0; - for (j = 1; j < 256; j <<= 1) { - for (k = 0; k < j; k++) ltable[k^j] = (v ^ ltable[k]); - v = GF_MULTBY_TWO(v); - } - htable[0] = 0; - for (j = 1; j < 256; j <<= 1) { - for (k = 0; k < j; k++) htable[k^j] = (v ^ htable[k]); - v = GF_MULTBY_TWO(v); - } - - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - top64 = (uint64_t *) rd.d_top; - -/* Does Unrolling Matter? -- Doesn't seem to. - while (d64 != top64) { - a = *s64; - - prod = htable[a >> 56]; - a <<= 8; - prod ^= ltable[a >> 56]; - a <<= 8; - prod <<= 16; - - prod ^= htable[a >> 56]; - a <<= 8; - prod ^= ltable[a >> 56]; - a <<= 8; - prod <<= 16; - - prod ^= htable[a >> 56]; - a <<= 8; - prod ^= ltable[a >> 56]; - a <<= 8; - prod <<= 16; - - prod ^= htable[a >> 56]; - a <<= 8; - prod ^= ltable[a >> 56]; - prod ^= ((xor) ? *d64 : 0); - *d64 = prod; - s64++; - d64++; - } -*/ - - while (d64 != top64) { - a = *s64; - - prod = 0; - for (j = 0; j < 4; j++) { - prod <<= 16; - prod ^= htable[a >> 56]; - a <<= 8; - prod ^= ltable[a >> 56]; - a <<= 8; - } - - //JSP: We can move the conditional outside the while loop, but we need to fully test it to understand which is better. - - prod ^= ((xor) ? 
*d64 : 0); - *d64 = prod; - s64++; - d64++; - } - gf_do_final_region_alignment(&rd); -} - -static void -gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - uint64_t c; - gf_internal_t *h; - struct gf_w16_lazytable_data *ltd; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); - gf_do_initial_region_alignment(&rd); - - h = (gf_internal_t *) gf->scratch; - ltd = (struct gf_w16_lazytable_data *) h->private; - - ltd->lazytable[0] = 0; - - /* - a = val; - c = 1; - pp = h->prim_poly; - - do { - ltd->lazytable[c] = a; - c <<= 1; - if (c & (1 << GF_FIELD_WIDTH)) c ^= pp; - a <<= 1; - if (a & (1 << GF_FIELD_WIDTH)) a ^= pp; - } while (c != 1); - */ - - for (c = 1; c < GF_FIELD_SIZE; c++) { - ltd->lazytable[c] = gf_w16_shift_multiply(gf, c, val); - } - - gf_two_byte_region_table_multiply(&rd, ltd->lazytable); - gf_do_final_region_alignment(&rd); -} - -static -void -gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ -#ifdef INTEL_SSSE3 - uint64_t i, j, *s64, *d64, *top64;; - uint64_t c, prod; - uint8_t low[4][16]; - uint8_t high[4][16]; - gf_region_data rd; - - __m128i mask, ta, tb, ti, tpl, tph, tlow[4], thigh[4], tta, ttb, lmask; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); - gf_do_initial_region_alignment(&rd); - - for (j = 0; j < 16; j++) { - for (i = 0; i < 4; i++) { - c = (j << (i*4)); - prod = gf->multiply.w32(gf, c, val); - low[i][j] = (prod & 0xff); - high[i][j] = (prod >> 8); - } - } - - for (i = 0; i < 4; i++) { - tlow[i] = _mm_loadu_si128((__m128i *)low[i]); - thigh[i] = _mm_loadu_si128((__m128i *)high[i]); - } - - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - top64 = (uint64_t *) rd.d_top; - - mask = _mm_set1_epi8 (0x0f); - lmask = _mm_set1_epi16 (0xff); - - if (xor) { - while (d64 != top64) { - - ta = _mm_load_si128((__m128i *) s64); - tb = _mm_load_si128((__m128i *) (s64+2)); - - tta = _mm_srli_epi16(ta, 8); - ttb = _mm_srli_epi16(tb, 8); - tpl = _mm_and_si128(tb, lmask); - tph = _mm_and_si128(ta, lmask); - - tb = _mm_packus_epi16(tpl, tph); - ta = _mm_packus_epi16(ttb, tta); - - ti = _mm_and_si128 (mask, tb); - tph = _mm_shuffle_epi8 (thigh[0], ti); - tpl = _mm_shuffle_epi8 (tlow[0], ti); - - tb = _mm_srli_epi16(tb, 4); - ti = _mm_and_si128 (mask, tb); - tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl); - tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph); - - ti = _mm_and_si128 (mask, ta); - tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl); - tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph); - - ta = _mm_srli_epi16(ta, 4); - ti = _mm_and_si128 (mask, ta); - tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl); - tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph); - - ta = _mm_unpackhi_epi8(tpl, tph); - tb = _mm_unpacklo_epi8(tpl, tph); - - tta = _mm_load_si128((__m128i *) d64); - ta = _mm_xor_si128(ta, tta); - ttb = _mm_load_si128((__m128i *) (d64+2)); - tb = _mm_xor_si128(tb, ttb); - _mm_store_si128 ((__m128i *)d64, ta); - _mm_store_si128 ((__m128i *)(d64+2), tb); - - d64 += 4; - s64 += 4; - - } - } else { - while (d64 != top64) { - - ta = _mm_load_si128((__m128i *) s64); - tb = 
_mm_load_si128((__m128i *) (s64+2)); - - tta = _mm_srli_epi16(ta, 8); - ttb = _mm_srli_epi16(tb, 8); - tpl = _mm_and_si128(tb, lmask); - tph = _mm_and_si128(ta, lmask); - - tb = _mm_packus_epi16(tpl, tph); - ta = _mm_packus_epi16(ttb, tta); - - ti = _mm_and_si128 (mask, tb); - tph = _mm_shuffle_epi8 (thigh[0], ti); - tpl = _mm_shuffle_epi8 (tlow[0], ti); - - tb = _mm_srli_epi16(tb, 4); - ti = _mm_and_si128 (mask, tb); - tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl); - tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph); - - ti = _mm_and_si128 (mask, ta); - tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl); - tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph); - - ta = _mm_srli_epi16(ta, 4); - ti = _mm_and_si128 (mask, ta); - tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl); - tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph); - - ta = _mm_unpackhi_epi8(tpl, tph); - tb = _mm_unpacklo_epi8(tpl, tph); - - _mm_store_si128 ((__m128i *)d64, ta); - _mm_store_si128 ((__m128i *)(d64+2), tb); - - d64 += 4; - s64 += 4; - } - } - - gf_do_final_region_alignment(&rd); -#endif -} - -static -void -gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ -#ifdef INTEL_SSSE3 - uint64_t i, j, *s64, *d64, *top64;; - uint64_t c, prod; - uint8_t low[4][16]; - uint8_t high[4][16]; - gf_region_data rd; - __m128i mask, ta, tb, ti, tpl, tph, tlow[4], thigh[4]; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); - gf_do_initial_region_alignment(&rd); - - for (j = 0; j < 16; j++) { - for (i = 0; i < 4; i++) { - c = (j << (i*4)); - prod = gf->multiply.w32(gf, c, val); - low[i][j] = (prod & 0xff); - high[i][j] = (prod >> 8); - } - } - - for (i = 0; i < 4; i++) { - tlow[i] = _mm_loadu_si128((__m128i *)low[i]); - thigh[i] = _mm_loadu_si128((__m128i *)high[i]); - } - - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - top64 = (uint64_t *) rd.d_top; - - mask = _mm_set1_epi8 (0x0f); - - if (xor) { - while (d64 != top64) { - - ta = _mm_load_si128((__m128i *) s64); - tb = _mm_load_si128((__m128i *) (s64+2)); - - ti = _mm_and_si128 (mask, tb); - tph = _mm_shuffle_epi8 (thigh[0], ti); - tpl = _mm_shuffle_epi8 (tlow[0], ti); - - tb = _mm_srli_epi16(tb, 4); - ti = _mm_and_si128 (mask, tb); - tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl); - tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph); - - ti = _mm_and_si128 (mask, ta); - tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl); - tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph); - - ta = _mm_srli_epi16(ta, 4); - ti = _mm_and_si128 (mask, ta); - tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl); - tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph); - - ta = _mm_load_si128((__m128i *) d64); - tph = _mm_xor_si128(tph, ta); - _mm_store_si128 ((__m128i *)d64, tph); - tb = _mm_load_si128((__m128i *) (d64+2)); - tpl = _mm_xor_si128(tpl, tb); - _mm_store_si128 ((__m128i *)(d64+2), tpl); - - d64 += 4; - s64 += 4; - } - } else { - while (d64 != top64) { - - ta = _mm_load_si128((__m128i *) s64); - tb = _mm_load_si128((__m128i *) (s64+2)); - - ti = _mm_and_si128 (mask, tb); - tph = _mm_shuffle_epi8 (thigh[0], ti); - tpl = _mm_shuffle_epi8 (tlow[0], ti); - - tb = _mm_srli_epi16(tb, 4); - ti = _mm_and_si128 (mask, tb); - tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl); - 
tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph); - - ti = _mm_and_si128 (mask, ta); - tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl); - tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph); - - ta = _mm_srli_epi16(ta, 4); - ti = _mm_and_si128 (mask, ta); - tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl); - tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph); - - _mm_store_si128 ((__m128i *)d64, tph); - _mm_store_si128 ((__m128i *)(d64+2), tpl); - - d64 += 4; - s64 += 4; - - } - } - gf_do_final_region_alignment(&rd); - -#endif -} - -uint32_t -gf_w16_split_8_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - uint32_t alow, blow; - struct gf_w16_split_8_8_data *d8; - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - d8 = (struct gf_w16_split_8_8_data *) h->private; - - alow = a & 0xff; - blow = b & 0xff; - a >>= 8; - b >>= 8; - - return d8->tables[0][alow][blow] ^ - d8->tables[1][alow][b] ^ - d8->tables[1][a][blow] ^ - d8->tables[2][a][b]; -} - -static -int gf_w16_split_init(gf_t *gf) -{ - gf_internal_t *h; - struct gf_w16_split_8_8_data *d8; - int i, j, exp, issse3; - uint32_t p, basep; - - h = (gf_internal_t *) gf->scratch; - -issse3 = 0; -#ifdef INTEL_SSSE3 - issse3 = 1; -#endif - - if (h->arg1 == 8 && h->arg2 == 8) { - d8 = (struct gf_w16_split_8_8_data *) h->private; - basep = 1; - for (exp = 0; exp < 3; exp++) { - for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0; - for (i = 0; i < 256; i++) d8->tables[exp][i][0] = 0; - d8->tables[exp][1][1] = basep; - for (i = 2; i < 256; i++) { - if (i&1) { - p = d8->tables[exp][i^1][1]; - d8->tables[exp][i][1] = p ^ basep; - } else { - p = d8->tables[exp][i>>1][1]; - d8->tables[exp][i][1] = GF_MULTBY_TWO(p); - } - } - for (i = 1; i < 256; i++) { - p = d8->tables[exp][i][1]; - for (j = 1; j < 256; j++) { - if (j&1) { - d8->tables[exp][i][j] = d8->tables[exp][i][j^1] ^ p; - } else { - d8->tables[exp][i][j] = GF_MULTBY_TWO(d8->tables[exp][i][j>>1]); - } - } - } - for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep); - } - gf->multiply.w32 = gf_w16_split_8_8_multiply; - gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; - return 1; - - } - - /* We'll be using LOG for multiplication, unless the pp isn't primitive. - In that case, we'll be using SHIFT. 
*/ - - gf_w16_log_init(gf); - - /* Defaults */ - - if (issse3) { - gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_multiply_region; - } else { - gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; - } - - - if ((h->arg1 == 8 && h->arg2 == 16) || (h->arg2 == 8 && h->arg1 == 16)) { - gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; - - } else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) { - if (issse3) { - if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSSE) - gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region; - else if(h->region_type & GF_REGION_NOSSE) - gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region; - else if(h->region_type & GF_REGION_ALTMAP) - gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_altmap_multiply_region; - } else { - if(h->region_type & GF_REGION_SSE) - return 0; - else if(h->region_type & GF_REGION_ALTMAP) - gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region; - else - gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region; - } - } - - return 1; -} - -static -int gf_w16_table_init(gf_t *gf) -{ - gf_w16_log_init(gf); - - gf->multiply_region.w32 = gf_w16_table_lazy_multiply_region; - return 1; -} - -static -void -gf_w16_log_zero_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - uint16_t lv; - int i; - uint16_t *s16, *d16, *top16; - struct gf_w16_zero_logtable_data *ltd; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); - gf_do_initial_region_alignment(&rd); - - ltd = (struct gf_w16_zero_logtable_data*) ((gf_internal_t *) gf->scratch)->private; - s16 = (uint16_t *) rd.s_start; - d16 = (uint16_t *) rd.d_start; - top16 = (uint16_t *) rd.d_top; - bytes = top16 - d16; - - lv = ltd->log_tbl[val]; - - if (xor) { - for (i = 0; i < bytes; i++) { - d16[i] ^= (ltd->antilog_tbl[lv + ltd->log_tbl[s16[i]]]); - } - } else { - for (i = 0; i < bytes; i++) { - d16[i] = (ltd->antilog_tbl[lv + ltd->log_tbl[s16[i]]]); - } - } - - /* This isn't necessary. 
*/ - - gf_do_final_region_alignment(&rd); -} - -/* Here -- double-check Kevin */ - -static -inline -gf_val_32_t -gf_w16_log_zero_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - struct gf_w16_zero_logtable_data *ltd; - - ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; - return ltd->antilog_tbl[ltd->log_tbl[a] + ltd->log_tbl[b]]; -} - -static -inline -gf_val_32_t -gf_w16_log_zero_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - int log_sum = 0; - struct gf_w16_zero_logtable_data *ltd; - - if (a == 0 || b == 0) return 0; - ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; - - log_sum = ltd->log_tbl[a] - ltd->log_tbl[b] + (GF_MULT_GROUP_SIZE); - return (ltd->antilog_tbl[log_sum]); -} - -static -gf_val_32_t -gf_w16_log_zero_inverse (gf_t *gf, gf_val_32_t a) -{ - struct gf_w16_zero_logtable_data *ltd; - - ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; - return (ltd->inv_tbl[a]); -} - -static -inline -gf_val_32_t -gf_w16_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - uint32_t prod, pp, pmask, amask; - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - - prod = 0; - pmask = 0x8000; - amask = 0x8000; - - while (amask != 0) { - if (prod & pmask) { - prod = ((prod << 1) ^ pp); - } else { - prod <<= 1; - } - if (a & amask) prod ^= b; - amask >>= 1; - } - return prod; -} - -static -inline -gf_val_32_t -gf_w16_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - uint32_t prod, pp, bmask; - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - prod = 0; - bmask = 0x8000; - - while (1) { - if (a & 1) prod ^= b; - a >>= 1; - if (a == 0) return prod; - if (b & bmask) { - b = ((b << 1) ^ pp); - } else { - b <<= 1; - } - } -} - -static -void -gf_w16_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - uint64_t *s64, *d64, t1, t2, ta, prod, amask; - gf_region_data rd; - struct gf_w16_bytwo_data *btd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); - gf_do_initial_region_alignment(&rd); - - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - - if (xor) { - while (s64 < (uint64_t *) rd.s_top) { - prod = 0; - amask = 0x8000; - ta = *s64; - while (amask != 0) { - AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); - if (val & amask) prod ^= ta; - amask >>= 1; - } - *d64 ^= prod; - d64++; - s64++; - } - } else { - while (s64 < (uint64_t *) rd.s_top) { - prod = 0; - amask = 0x8000; - ta = *s64; - while (amask != 0) { - AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); - if (val & amask) prod ^= ta; - amask >>= 1; - } - *d64 = prod; - d64++; - s64++; - } - } - gf_do_final_region_alignment(&rd); -} - -#define BYTWO_P_ONESTEP {\ - SSE_AB2(pp, m1 ,m2, prod, t1, t2); \ - t1 = _mm_and_si128(v, one); \ - t1 = _mm_sub_epi16(t1, one); \ - t1 = _mm_and_si128(t1, ta); \ - prod = _mm_xor_si128(prod, t1); \ - v = _mm_srli_epi64(v, 1); } - -#ifdef INTEL_SSE2 -static -void -gf_w16_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - int i; - uint8_t *s8, *d8; - uint32_t vrev; - __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; - struct gf_w16_bytwo_data *btd; - 
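/* val's bits are complemented and bit-reversed into vrev below, so the vector loop can consume them LSB-first: inside BYTWO_P_ONESTEP, (v & 1) - 1 yields an all-ones lane mask exactly when the corresponding bit of val is set */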
gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); - gf_do_initial_region_alignment(&rd); - - vrev = 0; - for (i = 0; i < 16; i++) { - vrev <<= 1; - if (!(val & (1 << i))) vrev |= 1; - } - - s8 = (uint8_t *) rd.s_start; - d8 = (uint8_t *) rd.d_start; - - pp = _mm_set1_epi16(btd->prim_poly&0xffff); - m1 = _mm_set1_epi16((btd->mask1)&0xffff); - m2 = _mm_set1_epi16((btd->mask2)&0xffff); - one = _mm_set1_epi16(1); - - while (d8 < (uint8_t *) rd.d_top) { - prod = _mm_setzero_si128(); - v = _mm_set1_epi16(vrev); - ta = _mm_load_si128((__m128i *) s8); - tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8); - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp)); - d8 += 16; - s8 += 16; - } - gf_do_final_region_alignment(&rd); -} -#endif - -#ifdef INTEL_SSE2 -static -void -gf_w16_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w16_bytwo_data *btd) -{ - uint8_t *d8, *s8; - __m128i pp, m1, m2, t1, t2, va; - - s8 = (uint8_t *) rd->s_start; - d8 = (uint8_t *) rd->d_start; - - pp = _mm_set1_epi16(btd->prim_poly&0xffff); - m1 = _mm_set1_epi16((btd->mask1)&0xffff); - m2 = _mm_set1_epi16((btd->mask2)&0xffff); - - while (d8 < (uint8_t *) rd->d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, m2, va, t1, t2); - _mm_store_si128((__m128i *)d8, va); - d8 += 16; - s8 += 16; - } -} -#endif - -#ifdef INTEL_SSE2 -static -void -gf_w16_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w16_bytwo_data *btd) -{ - uint8_t *d8, *s8; - __m128i pp, m1, m2, t1, t2, va, vb; - - s8 = (uint8_t *) rd->s_start; - d8 = (uint8_t *) rd->d_start; - - pp = _mm_set1_epi16(btd->prim_poly&0xffff); - m1 = _mm_set1_epi16((btd->mask1)&0xffff); - m2 = _mm_set1_epi16((btd->mask2)&0xffff); - - while (d8 < (uint8_t *) rd->d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, m2, va, t1, t2); - vb = _mm_load_si128 ((__m128i *)(d8)); - vb = _mm_xor_si128(vb, va); - _mm_store_si128((__m128i *)d8, vb); - d8 += 16; - s8 += 16; - } -} -#endif - - -#ifdef INTEL_SSE2 -static -void -gf_w16_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - int itb; - uint8_t *d8, *s8; - __m128i pp, m1, m2, t1, t2, va, vb; - struct gf_w16_bytwo_data *btd; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); - gf_do_initial_region_alignment(&rd); - - btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; - - if (val == 2) { - if (xor) { - gf_w16_bytwo_b_sse_region_2_xor(&rd, btd); - } else { - gf_w16_bytwo_b_sse_region_2_noxor(&rd, btd); - } - gf_do_final_region_alignment(&rd); - return; - } - - s8 = (uint8_t *) rd.s_start; - d8 = (uint8_t *) rd.d_start; - - pp = _mm_set1_epi16(btd->prim_poly&0xffff); - m1 = _mm_set1_epi16((btd->mask1)&0xffff); - m2 = _mm_set1_epi16((btd->mask2)&0xffff); - - while (d8 < (uint8_t *) rd.d_top) { - va = 
_mm_load_si128 ((__m128i *)(s8)); - vb = (!xor) ? _mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8)); - itb = val; - while (1) { - if (itb & 1) vb = _mm_xor_si128(vb, va); - itb >>= 1; - if (itb == 0) break; - SSE_AB2(pp, m1, m2, va, t1, t2); - } - _mm_store_si128((__m128i *)d8, vb); - d8 += 16; - s8 += 16; - } - - gf_do_final_region_alignment(&rd); -} -#endif - -static -void -gf_w16_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - uint64_t *s64, *d64, t1, t2, ta, tb, prod; - struct gf_w16_bytwo_data *btd; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); - gf_do_initial_region_alignment(&rd); - - btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - - switch (val) { - case 2: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= ta; - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta; - d64++; - s64++; - } - } - break; - case 3: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 4: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= ta; - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta; - d64++; - s64++; - } - } - break; - case 5: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta ^ prod; - d64++; - s64++; - } - } - default: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - prod = *d64 ; - ta = *s64; - tb = val; - while (1) { - if (tb & 1) prod ^= ta; - tb >>= 1; - if (tb == 0) break; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - } - *d64 = prod; - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - prod = 0 ; - ta = *s64; - tb = val; - while (1) { - if (tb & 1) prod ^= ta; - tb >>= 1; - if (tb == 0) break; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - } - *d64 = prod; - d64++; - s64++; - } - } - break; - } - gf_do_final_region_alignment(&rd); -} - -static -int gf_w16_bytwo_init(gf_t *gf) -{ - gf_internal_t *h; - uint64_t ip, m1, m2; - struct gf_w16_bytwo_data *btd; - - h = (gf_internal_t *) gf->scratch; - btd = (struct gf_w16_bytwo_data *) (h->private); - ip = h->prim_poly & 0xffff; - m1 = 0xfffe; - m2 = 0x8000; - 
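/* replicate the 16-bit polynomial and masks into all four 16-bit lanes of a 64-bit word, so the scalar AB2 macro can multiply four packed words by two in a single step (the SSE paths rebuild the lanes with _mm_set1_epi16) */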
btd->prim_poly = 0; - btd->mask1 = 0; - btd->mask2 = 0; - - while (ip != 0) { - btd->prim_poly |= ip; - btd->mask1 |= m1; - btd->mask2 |= m2; - ip <<= GF_FIELD_WIDTH; - m1 <<= GF_FIELD_WIDTH; - m2 <<= GF_FIELD_WIDTH; - } - - if (h->mult_type == GF_MULT_BYTWO_p) { - gf->multiply.w32 = gf_w16_bytwo_p_multiply; - #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSSE) - gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region; - else - gf->multiply_region.w32 = gf_w16_bytwo_p_sse_multiply_region; - #else - gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region; - if(h->region_type & GF_REGION_SSE) - return 0; - #endif - } else { - gf->multiply.w32 = gf_w16_bytwo_b_multiply; - #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSSE) - gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region; - else - gf->multiply_region.w32 = gf_w16_bytwo_b_sse_multiply_region; - #else - gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region; - if(h->region_type & GF_REGION_SSE) - return 0; - #endif - } - - return 1; -} - -static -int gf_w16_log_zero_init(gf_t *gf) -{ - gf_internal_t *h; - struct gf_w16_zero_logtable_data *ltd; - int i, b; - - h = (gf_internal_t *) gf->scratch; - ltd = h->private; - - ltd->log_tbl[0] = (-GF_MULT_GROUP_SIZE) + 1; - - bzero(&(ltd->_antilog_tbl[0]), sizeof(ltd->_antilog_tbl)); - - ltd->antilog_tbl = &(ltd->_antilog_tbl[GF_FIELD_SIZE * 2]); - - b = 1; - for (i = 0; i < GF_MULT_GROUP_SIZE; i++) { - ltd->log_tbl[b] = (uint16_t)i; - ltd->antilog_tbl[i] = (uint16_t)b; - ltd->antilog_tbl[i+GF_MULT_GROUP_SIZE] = (uint16_t)b; - b <<= 1; - if (b & GF_FIELD_SIZE) { - b = b ^ h->prim_poly; - } - } - ltd->inv_tbl[0] = 0; /* Not really, but we need to fill it with something */ - ltd->inv_tbl[1] = 1; - for (i = 2; i < GF_FIELD_SIZE; i++) { - ltd->inv_tbl[i] = ltd->antilog_tbl[GF_MULT_GROUP_SIZE-ltd->log_tbl[i]]; - } - - gf->inverse.w32 = gf_w16_log_zero_inverse; - gf->divide.w32 = gf_w16_log_zero_divide; - gf->multiply.w32 = gf_w16_log_zero_multiply; - gf->multiply_region.w32 = gf_w16_log_zero_multiply_region; - return 1; -} - -static -gf_val_32_t -gf_w16_composite_multiply_recursive(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint8_t b0 = b & 0x00ff; - uint8_t b1 = (b & 0xff00) >> 8; - uint8_t a0 = a & 0x00ff; - uint8_t a1 = (a & 0xff00) >> 8; - uint8_t a1b1; - uint16_t rv; - - a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - - rv = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8)); - return rv; -} - -static -gf_val_32_t -gf_w16_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - uint8_t b0 = b & 0x00ff; - uint8_t b1 = (b & 0xff00) >> 8; - uint8_t a0 = a & 0x00ff; - uint8_t a1 = (a & 0xff00) >> 8; - uint8_t a1b1, *mt; - uint16_t rv; - struct gf_w16_composite_data *cd; - - cd = (struct gf_w16_composite_data *) h->private; - mt = cd->mult_table; - - a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1); - - rv = ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8)); - return rv; -} - -/* - * Composite field division trick (explained in 2007 tech report) - * - * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1 - * - * let c = b^-1 - * - * c*b = 
(s*b1c1+b1c0+b0c1)x+(b1c1+b0c0) - * - * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1 - * - * let d = b1c1 and d+1 = b0c0 - * - * solve s*b1c1+b1c0+b0c1 = 0 - * - * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1 - * - * c0 = (d+1)b0^-1 - * c1 = d*b1^-1 - * - * a / b = a * c - */ - -static -gf_val_32_t -gf_w16_composite_inverse(gf_t *gf, gf_val_32_t a) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint8_t a0 = a & 0x00ff; - uint8_t a1 = (a & 0xff00) >> 8; - uint8_t c0, c1, d, tmp; - uint16_t c; - uint8_t a0inv, a1inv; - - if (a0 == 0) { - a1inv = base_gf->inverse.w32(base_gf, a1); - c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly); - c1 = a1inv; - } else if (a1 == 0) { - c0 = base_gf->inverse.w32(base_gf, a0); - c1 = 0; - } else { - a1inv = base_gf->inverse.w32(base_gf, a1); - a0inv = base_gf->inverse.w32(base_gf, a0); - - d = base_gf->multiply.w32(base_gf, a1, a0inv); - - tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly); - tmp = base_gf->inverse.w32(base_gf, tmp); - - d = base_gf->multiply.w32(base_gf, d, tmp); - - c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv); - c1 = base_gf->multiply.w32(base_gf, d, a1inv); - } - - c = c0 | (c1 << 8); - - return c; -} - -static -void -gf_w16_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint8_t b0 = val & 0x00ff; - uint8_t b1 = (val & 0xff00) >> 8; - uint16_t *s16, *d16, *top; - uint8_t a0, a1, a1b1, *mt; - gf_region_data rd; - struct gf_w16_composite_data *cd; - - cd = (struct gf_w16_composite_data *) h->private; - mt = cd->mult_table; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); - - s16 = rd.s_start; - d16 = rd.d_start; - top = rd.d_top; - - if (mt == NULL) { - if (xor) { - while (d16 < top) { - a0 = (*s16) & 0x00ff; - a1 = ((*s16) & 0xff00) >> 8; - a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - - (*d16) ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w32(base_gf, a1, b0) ^ - base_gf->multiply.w32(base_gf, a0, b1) ^ - base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8)); - s16++; - d16++; - } - } else { - while (d16 < top) { - a0 = (*s16) & 0x00ff; - a1 = ((*s16) & 0xff00) >> 8; - a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - - (*d16) = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w32(base_gf, a1, b0) ^ - base_gf->multiply.w32(base_gf, a0, b1) ^ - base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8)); - s16++; - d16++; - } - } - } else { - if (xor) { - while (d16 < top) { - a0 = (*s16) & 0x00ff; - a1 = ((*s16) & 0xff00) >> 8; - a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1); - - (*d16) ^= ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | - ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ - GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ - GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8)); - s16++; - d16++; - } - } else { - while (d16 < top) { - a0 = (*s16) & 0x00ff; - a1 = ((*s16) & 0xff00) >> 8; - a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1); - - (*d16) = ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | - ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ - GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ - GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8)); - s16++; - d16++; - } - } - } -} - -static -void -gf_w16_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - 
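-  /* Sketch of the ALTMAP layout this routine assumes: after alignment the
-     region is split in half, the first half holding the low bytes (a0) and
-     the second half the high bytes (a1) of each 16-bit word.  The five
-     base-field region multiplies below then compute, per word,
-         low  = a0*b0 ^ a1*b1
-         high = a1*b0 ^ a0*b1 ^ s*a1*b1
-     i.e. (a1 x + a0)(b1 x + b0) mod (x^2 + s x + 1) over GF(2^8), with
-     s = h->prim_poly, matching gf_w16_composite_multiply_recursive. */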
-  gf_internal_t *h = (gf_internal_t *) gf->scratch;
-  gf_t *base_gf = h->base_gf;
-  uint8_t val0 = val & 0x00ff;
-  uint8_t val1 = (val & 0xff00) >> 8;
-  gf_region_data rd;
-  int sub_reg_size;
-  uint8_t *slow, *shigh;
-  uint8_t *dlow, *dhigh, *top;
-
-  /* JSP: I want the two pointers aligned wrt each other on 16 byte
-     boundaries.  So I'm going to make sure that the area on
-     which the two operate is a multiple of 32.  Of course, that
-     junks up the mapping, but so be it -- that's why we have extract_word.... */
-
-  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
-  gf_do_initial_region_alignment(&rd);
-
-  slow = (uint8_t *) rd.s_start;
-  dlow = (uint8_t *) rd.d_start;
-  top = (uint8_t *) rd.d_top;
-  sub_reg_size = (top - dlow)/2;
-  shigh = slow + sub_reg_size;
-  dhigh = dlow + sub_reg_size;
-
-  base_gf->multiply_region.w32(base_gf, slow, dlow, val0, sub_reg_size, xor);
-  base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1);
-  base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor);
-  base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1);
-  base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1);
-
-  gf_do_final_region_alignment(&rd);
-}
-
-static
-int gf_w16_composite_init(gf_t *gf)
-{
-  gf_internal_t *h = (gf_internal_t *) gf->scratch;
-  struct gf_w16_composite_data *cd;
-
-  if (h->base_gf == NULL) return 0;
-
-  cd = (struct gf_w16_composite_data *) h->private;
-  cd->mult_table = gf_w8_get_mult_table(h->base_gf);
-
-  if (h->region_type & GF_REGION_ALTMAP) {
-    gf->multiply_region.w32 = gf_w16_composite_multiply_region_alt;
-  } else {
-    gf->multiply_region.w32 = gf_w16_composite_multiply_region;
-  }
-
-  if (cd->mult_table == NULL) {
-    gf->multiply.w32 = gf_w16_composite_multiply_recursive;
-  } else {
-    gf->multiply.w32 = gf_w16_composite_multiply_inline;
-  }
-  gf->divide.w32 = NULL;
-  gf->inverse.w32 = gf_w16_composite_inverse;
-
-  return 1;
-}
-
-static
-void
-gf_w16_group_4_set_shift_tables(uint16_t *shift, uint16_t val, gf_internal_t *h)
-{
-  int i, j;
-
-  shift[0] = 0;
-  for (i = 0; i < 16; i += 2) {
-    j = (shift[i>>1] << 1);
-    if (j & (1 << 16)) j ^= h->prim_poly;
-    shift[i] = j;
-    shift[i^1] = j^val;
-  }
-}
-
-static
-inline
-gf_val_32_t
-gf_w16_group_4_4_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
-{
-  uint16_t p, l, ind, r, a16;
-
-  struct gf_w16_group_4_4_data *d44;
-  gf_internal_t *h = (gf_internal_t *) gf->scratch;
-
-  d44 = (struct gf_w16_group_4_4_data *) h->private;
-  gf_w16_group_4_set_shift_tables(d44->shift, b, h);
-
-  a16 = a;
-  ind = a16 >> 12;
-  a16 <<= 4;
-  p = d44->shift[ind];
-  r = p & 0xfff;
-  l = p >> 12;
-  ind = a16 >> 12;
-  a16 <<= 4;
-  p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4));
-  r = p & 0xfff;
-  l = p >> 12;
-  ind = a16 >> 12;
-  a16 <<= 4;
-  p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4));
-  r = p & 0xfff;
-  l = p >> 12;
-  ind = a16 >> 12;
-  p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4));
-  return p;
-}
-
-static
-void gf_w16_group_4_4_region_multiply(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
-{
-  uint16_t p, l, ind, r, a16, p16;
-  struct gf_w16_group_4_4_data *d44;
-  gf_region_data rd;
-  uint16_t *s16, *d16, *top;
-
-  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
-  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
-
-  gf_internal_t *h = (gf_internal_t *) gf->scratch;
-  d44 = (struct gf_w16_group_4_4_data *) h->private;
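-  /* GROUP 4,4 sketch: shift[] caches the sixteen products n*val
-     (n = 0..15) in GF(2)[x]; the loop below then consumes the source
-     word one nibble at a time, shifting the partial product left four
-     bits and folding whatever spills past bit 15 back in via reduce[],
-     which holds the reduction of l*x^16 modulo prim_poly. */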
gf_w16_group_4_set_shift_tables(d44->shift, val, h); - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); - gf_do_initial_region_alignment(&rd); - - s16 = (uint16_t *) rd.s_start; - d16 = (uint16_t *) rd.d_start; - top = (uint16_t *) rd.d_top; - - while (d16 < top) { - p = 0; - a16 = *s16; - p16 = (xor) ? *d16 : 0; - ind = a16 >> 12; - a16 <<= 4; - p = d44->shift[ind]; - r = p & 0xfff; - l = p >> 12; - ind = a16 >> 12; - a16 <<= 4; - p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4)); - r = p & 0xfff; - l = p >> 12; - ind = a16 >> 12; - a16 <<= 4; - p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4)); - r = p & 0xfff; - l = p >> 12; - ind = a16 >> 12; - p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4)); - p ^= p16; - *d16 = p; - d16++; - s16++; - } - gf_do_final_region_alignment(&rd); -} - -static -int gf_w16_group_init(gf_t *gf) -{ - int i, j, p; - struct gf_w16_group_4_4_data *d44; - gf_internal_t *h = (gf_internal_t *) gf->scratch; - - d44 = (struct gf_w16_group_4_4_data *) h->private; - d44->reduce[0] = 0; - for (i = 0; i < 16; i++) { - p = 0; - for (j = 0; j < 4; j++) { - if (i & (1 << j)) p ^= (h->prim_poly << j); - } - d44->reduce[p>>16] = (p&0xffff); - } - - gf->multiply.w32 = gf_w16_group_4_4_multiply; - gf->divide.w32 = NULL; - gf->inverse.w32 = NULL; - gf->multiply_region.w32 = gf_w16_group_4_4_region_multiply; - - return 1; -} - -int gf_w16_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) -{ - switch(mult_type) - { - case GF_MULT_TABLE: - return sizeof(gf_internal_t) + sizeof(struct gf_w16_lazytable_data) + 64; - break; - case GF_MULT_BYTWO_p: - case GF_MULT_BYTWO_b: - return sizeof(gf_internal_t) + sizeof(struct gf_w16_bytwo_data); - break; - case GF_MULT_LOG_ZERO: - return sizeof(gf_internal_t) + sizeof(struct gf_w16_zero_logtable_data) + 64; - break; - case GF_MULT_LOG_TABLE: - return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64; - break; - case GF_MULT_DEFAULT: - case GF_MULT_SPLIT_TABLE: - if (arg1 == 8 && arg2 == 8) { - return sizeof(gf_internal_t) + sizeof(struct gf_w16_split_8_8_data) + 64; - } else if ((arg1 == 8 && arg2 == 16) || (arg2 == 8 && arg1 == 16)) { - return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64; - } else if (mult_type == GF_MULT_DEFAULT || - (arg1 == 4 && arg2 == 16) || (arg2 == 4 && arg1 == 16)) { - return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64; - } - return 0; - break; - case GF_MULT_GROUP: - return sizeof(gf_internal_t) + sizeof(struct gf_w16_group_4_4_data) + 64; - break; - case GF_MULT_CARRY_FREE: - return sizeof(gf_internal_t); - break; - case GF_MULT_SHIFT: - return sizeof(gf_internal_t); - break; - case GF_MULT_COMPOSITE: - return sizeof(gf_internal_t) + sizeof(struct gf_w16_composite_data) + 64; - break; - - default: - return 0; - } - return 0; -} - -int gf_w16_init(gf_t *gf) -{ - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - - /* Allen: set default primitive polynomial / irreducible polynomial if needed */ - - if (h->prim_poly == 0) { - if (h->mult_type == GF_MULT_COMPOSITE) { - h->prim_poly = gf_composite_get_default_poly(h->base_gf); - if (h->prim_poly == 0) return 0; - } else { - - /* Allen: use the following primitive polynomial to make - carryless multiply work more efficiently for GF(2^16). 
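-     (0x1002d is x^16 + x^5 + x^3 + x^2 + 1; the ten zero coefficients
-     after its leading term mean a PCLMUL product needs at most two
-     reduction folds, where 0x1100b = x^16 + x^12 + x^3 + x + 1 below
-     would need more.)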
- - h->prim_poly = 0x1002d; - - The following is the traditional primitive polynomial for GF(2^16) */ - - h->prim_poly = 0x1100b; - } - } - - if (h->mult_type != GF_MULT_COMPOSITE) h->prim_poly |= (1 << 16); - - gf->multiply.w32 = NULL; - gf->divide.w32 = NULL; - gf->inverse.w32 = NULL; - gf->multiply_region.w32 = NULL; - - switch(h->mult_type) { - case GF_MULT_LOG_ZERO: if (gf_w16_log_zero_init(gf) == 0) return 0; break; - case GF_MULT_LOG_TABLE: if (gf_w16_log_init(gf) == 0) return 0; break; - case GF_MULT_DEFAULT: - case GF_MULT_SPLIT_TABLE: if (gf_w16_split_init(gf) == 0) return 0; break; - case GF_MULT_TABLE: if (gf_w16_table_init(gf) == 0) return 0; break; - case GF_MULT_CARRY_FREE: if (gf_w16_cfm_init(gf) == 0) return 0; break; - case GF_MULT_SHIFT: if (gf_w16_shift_init(gf) == 0) return 0; break; - case GF_MULT_COMPOSITE: if (gf_w16_composite_init(gf) == 0) return 0; break; - case GF_MULT_BYTWO_p: - case GF_MULT_BYTWO_b: if (gf_w16_bytwo_init(gf) == 0) return 0; break; - case GF_MULT_GROUP: if (gf_w16_group_init(gf) == 0) return 0; break; - default: return 0; - } - if (h->divide_type == GF_DIVIDE_EUCLID) { - gf->divide.w32 = gf_w16_divide_from_inverse; - gf->inverse.w32 = gf_w16_euclid; - } else if (h->divide_type == GF_DIVIDE_MATRIX) { - gf->divide.w32 = gf_w16_divide_from_inverse; - gf->inverse.w32 = gf_w16_matrix; - } - - if (gf->divide.w32 == NULL) { - gf->divide.w32 = gf_w16_divide_from_inverse; - if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w16_euclid; - } - - if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w16_inverse_from_divide; - - if (h->region_type & GF_REGION_ALTMAP) { - if (h->mult_type == GF_MULT_COMPOSITE) { - gf->extract_word.w32 = gf_w16_composite_extract_word; - } else { - gf->extract_word.w32 = gf_w16_split_extract_word; - } - } else if (h->region_type == GF_REGION_CAUCHY) { - gf->multiply_region.w32 = gf_wgen_cauchy_region; - gf->extract_word.w32 = gf_wgen_extract_word; - } else { - gf->extract_word.w32 = gf_w16_extract_word; - } - if (gf->multiply_region.w32 == NULL) { - gf->multiply_region.w32 = gf_w16_multiply_region_from_single; - } - return 1; -} - -/* Inline setup functions */ - -uint16_t *gf_w16_get_log_table(gf_t *gf) -{ - struct gf_w16_logtable_data *ltd; - - if (gf->multiply.w32 == gf_w16_log_multiply) { - ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; - return (uint16_t *) ltd->log_tbl; - } - return NULL; -} - -uint16_t *gf_w16_get_mult_alog_table(gf_t *gf) -{ - gf_internal_t *h; - struct gf_w16_logtable_data *ltd; - - h = (gf_internal_t *) gf->scratch; - if (gf->multiply.w32 == gf_w16_log_multiply) { - ltd = (struct gf_w16_logtable_data *) h->private; - return (uint16_t *) ltd->antilog_tbl; - } - return NULL; -} - -uint16_t *gf_w16_get_div_alog_table(gf_t *gf) -{ - gf_internal_t *h; - struct gf_w16_logtable_data *ltd; - - h = (gf_internal_t *) gf->scratch; - if (gf->multiply.w32 == gf_w16_log_multiply) { - ltd = (struct gf_w16_logtable_data *) h->private; - return (uint16_t *) ltd->d_antilog; - } - return NULL; -} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w32.c b/src/erasure-code/jerasure/gf-complete/src/gf_w32.c deleted file mode 100644 index 1503c72dce902..0000000000000 --- a/src/erasure-code/jerasure/gf-complete/src/gf_w32.c +++ /dev/null @@ -1,2741 +0,0 @@ -/* - * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic - * James S. Plank, Ethan L. Miller, Kevin M. Greenan, - * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. 
- *
- * gf_w32.c
- *
- * Routines for 32-bit Galois fields
- */
-
-
-#include "gf_int.h"
-#include <stdio.h>
-#include <stdlib.h>
-
-#define GF_FIELD_WIDTH (32)
-#define GF_FIRST_BIT (1 << 31)
-
-#define GF_BASE_FIELD_WIDTH (16)
-#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
-#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1
-#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1)
-
-struct gf_split_2_32_lazy_data {
-  uint32_t tables[16][4];
-  uint32_t last_value;
-};
-
-struct gf_w32_split_8_8_data {
-  uint32_t tables[7][256][256];
-  uint32_t region_tables[4][256];
-  uint32_t last_value;
-};
-
-struct gf_w32_group_data {
-  uint32_t *reduce;
-  uint32_t *shift;
-  int tshift;
-  uint64_t rmask;
-  uint32_t *memory;
-};
-
-struct gf_split_16_32_lazy_data {
-  uint32_t tables[2][(1<<16)];
-  uint32_t last_value;
-};
-
-struct gf_split_8_32_lazy_data {
-  uint32_t tables[4][256];
-  uint32_t last_value;
-};
-
-struct gf_split_4_32_lazy_data {
-  uint32_t tables[8][16];
-  uint32_t last_value;
-};
-
-struct gf_w32_bytwo_data {
-  uint64_t prim_poly;
-  uint64_t mask1;
-  uint64_t mask2;
-};
-
-struct gf_w32_composite_data {
-  uint16_t *log;
-  uint16_t *alog;
-};
-
-#define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); }
-
-#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? "   " : " ", blah[15-ii]); printf("\n"); }
-
-#define AB2(ip, am1 ,am2, b, t1, t2) {\
-  t1 = (b << 1) & am1;\
-  t2 = b & am2; \
-  t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \
-  b = (t1 ^ (t2 & ip));}
-
-#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\
-  t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
-  t2 = _mm_and_si128(va, m2); \
-  t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
-  va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }
-
-static
-inline
-uint32_t gf_w32_inverse_from_divide (gf_t *gf, uint32_t a)
-{
-  return gf->divide.w32(gf, 1, a);
-}
-
-static
-inline
-uint32_t gf_w32_divide_from_inverse (gf_t *gf, uint32_t a, uint32_t b)
-{
-  b = gf->inverse.w32(gf, b);
-  return gf->multiply.w32(gf, a, b);
-}
-
-static
-void
-gf_w32_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
-{
-  int i;
-  uint32_t *s32;
-  uint32_t *d32;
-
-  s32 = (uint32_t *) src;
-  d32 = (uint32_t *) dest;
-
-  if (xor) {
-    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
-      d32[i] ^= gf->multiply.w32(gf, val, s32[i]);
-    }
-  } else {
-    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
-      d32[i] = gf->multiply.w32(gf, val, s32[i]);
-    }
-  }
-}
-
-#if defined(INTEL_SSE4_PCLMUL)
-
-static
-void
-gf_w32_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
-{
-
-  int i;
-  uint32_t *s32;
-  uint32_t *d32;
-
-  __m128i a, b;
-  __m128i result;
-  __m128i prim_poly;
-  __m128i w;
-  gf_internal_t * h = gf->scratch;
-
-  prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
-
-  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
-  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
-
-  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
-  s32 = (uint32_t *) src;
-  d32 = (uint32_t *) dest;
-
-  if (xor) {
-    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
-      b = _mm_insert_epi32 (a, s32[i], 0);
-      result =
_mm_clmulepi64_si128 (a, b, 0); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); - } - } else { - for (i = 0; i < bytes/sizeof(uint32_t); i++) { - b = _mm_insert_epi32 (a, s32[i], 0); - result = _mm_clmulepi64_si128 (a, b, 0); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); - } - } -} -#endif - -#if defined(INTEL_SSE4_PCLMUL) - -static -void -gf_w32_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) -{ - - int i; - uint32_t *s32; - uint32_t *d32; - - __m128i a, b; - __m128i result; - __m128i prim_poly; - __m128i w; - gf_internal_t * h = gf->scratch; - - prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); - - s32 = (uint32_t *) src; - d32 = (uint32_t *) dest; - - if (xor) { - for (i = 0; i < bytes/sizeof(uint32_t); i++) { - b = _mm_insert_epi32 (a, s32[i], 0); - result = _mm_clmulepi64_si128 (a, b, 0); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); - } - } else { - for (i = 0; i < bytes/sizeof(uint32_t); i++) { - b = _mm_insert_epi32 (a, s32[i], 0); - result = _mm_clmulepi64_si128 (a, b, 0); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); - } - } -} -#endif - -#if defined(INTEL_SSE4_PCLMUL) -static -void -gf_w32_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) -{ - int i; - uint32_t *s32; - uint32_t *d32; - - __m128i a, b; - __m128i result; - __m128i prim_poly; - __m128i w; - gf_internal_t * h = gf->scratch; - - prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); - - s32 = (uint32_t *) src; - d32 = (uint32_t *) dest; - - if (xor) { - for (i = 0; i < bytes/sizeof(uint32_t); i++) { - b = _mm_insert_epi32 (a, s32[i], 0); - result = _mm_clmulepi64_si128 (a, b, 0); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 
(result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); - } - } else { - for (i = 0; i < bytes/sizeof(uint32_t); i++) { - b = _mm_insert_epi32 (a, s32[i], 0); - result = _mm_clmulepi64_si128 (a, b, 0); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); - } - } -} -#endif - -static -inline -uint32_t gf_w32_euclid (gf_t *gf, uint32_t b) -{ - uint32_t e_i, e_im1, e_ip1; - uint32_t d_i, d_im1, d_ip1; - uint32_t y_i, y_im1, y_ip1; - uint32_t c_i; - - if (b == 0) return -1; - e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; - e_i = b; - d_im1 = 32; - for (d_i = d_im1-1; ((1 << d_i) & e_i) == 0; d_i--) ; - y_i = 1; - y_im1 = 0; - - while (e_i != 1) { - - e_ip1 = e_im1; - d_ip1 = d_im1; - c_i = 0; - - while (d_ip1 >= d_i) { - c_i ^= (1 << (d_ip1 - d_i)); - e_ip1 ^= (e_i << (d_ip1 - d_i)); - d_ip1--; - if (e_ip1 == 0) return 0; - while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; - } - - y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i); - y_im1 = y_i; - y_i = y_ip1; - - e_im1 = e_i; - d_im1 = d_i; - e_i = e_ip1; - d_i = d_ip1; - } - - return y_i; -} - -static -gf_val_32_t gf_w32_extract_word(gf_t *gf, void *start, int bytes, int index) -{ - uint32_t *r32, rv; - - r32 = (uint32_t *) start; - rv = r32[index]; - return rv; -} - -static -gf_val_32_t gf_w32_composite_extract_word(gf_t *gf, void *start, int bytes, int index) -{ - int sub_size; - gf_internal_t *h; - uint8_t *r8, *top; - uint32_t a, b, *r32; - gf_region_data rd; - - h = (gf_internal_t *) gf->scratch; - gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32); - r32 = (uint32_t *) start; - if (r32 + index < (uint32_t *) rd.d_start) return r32[index]; - if (r32 + index >= (uint32_t *) rd.d_top) return r32[index]; - index -= (((uint32_t *) rd.d_start) - r32); - r8 = (uint8_t *) rd.d_start; - top = (uint8_t *) rd.d_top; - sub_size = (top-r8)/2; - - a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index); - b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index); - return (a | (b << 16)); -} - -static -gf_val_32_t gf_w32_split_extract_word(gf_t *gf, void *start, int bytes, int index) -{ - int i; - uint32_t *r32, rv; - uint8_t *r8; - gf_region_data rd; - - gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 64); - r32 = (uint32_t *) start; - if (r32 + index < (uint32_t *) rd.d_start) return r32[index]; - if (r32 + index >= (uint32_t *) rd.d_top) return r32[index]; - index -= (((uint32_t *) rd.d_start) - r32); - r8 = (uint8_t *) rd.d_start; - r8 += ((index & 0xfffffff0)*4); - r8 += (index & 0xf); - r8 += 48; - rv =0; - for (i = 0; i < 4; i++) { - rv <<= 8; - rv |= *r8; - r8 -= 16; - } - return rv; -} - - -static -inline -uint32_t gf_w32_matrix (gf_t *gf, uint32_t b) -{ - return gf_bitmatrix_inverse(b, 32, ((gf_internal_t *) (gf->scratch))->prim_poly); -} - -/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. I only - include it for completeness. It does have the feature that it requires no - extra memory. 
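-   As a sketch of what follows: gf_w32_shift_multiply (further below)
-   first forms the 64-bit carry-free product, product ^= (b << i) for
-   every set bit i of a, and then reduces bits 62..32 back down with
-   the primitive polynomial, one bit at a time from the top.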
-*/ - - - - -static -inline -gf_val_32_t -gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) -{ - gf_val_32_t rv = 0; - -#if defined(INTEL_SSE4_PCLMUL) - - __m128i a, b; - __m128i result; - __m128i prim_poly; - __m128i w; - gf_internal_t * h = gf->scratch; - - - a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0); - b = _mm_insert_epi32 (a, b32, 0); - - prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); - - /* Do the initial multiply */ - - result = _mm_clmulepi64_si128 (a, b, 0); - - /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only - have to do the reduction at most twice, because (w-2)/z == 2. Where - z is equal to the number of zeros after the leading 1 - - _mm_clmulepi64_si128 is the carryless multiply operation. Here - _mm_srli_si128 shifts the result to the right by 4 bytes. This allows - us to multiply the prim_poly by the leading bits of the result. We - then xor the result of that operation back with the result.*/ - - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - - /* Extracts 32 bit value from result. */ - rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); -#endif - return rv; -} -static -inline -gf_val_32_t -gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) -{ - gf_val_32_t rv = 0; - -#if defined(INTEL_SSE4_PCLMUL) - - __m128i a, b; - __m128i result; - __m128i prim_poly; - __m128i w; - gf_internal_t * h = gf->scratch; - - - a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0); - b = _mm_insert_epi32 (a, b32, 0); - - prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); - - /* Do the initial multiply */ - - result = _mm_clmulepi64_si128 (a, b, 0); - - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - - /* Extracts 32 bit value from result. */ - - rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); -#endif - return rv; -} - -static -inline -gf_val_32_t -gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) -{ - gf_val_32_t rv = 0; - -#if defined(INTEL_SSE4_PCLMUL) - - __m128i a, b; - __m128i result; - __m128i prim_poly; - __m128i w; - gf_internal_t * h = gf->scratch; - - - a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0); - b = _mm_insert_epi32 (a, b32, 0); - - prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); - - /* Do the initial multiply */ - - result = _mm_clmulepi64_si128 (a, b, 0); - - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); - result = _mm_xor_si128 (result, w); - - /* Extracts 32 bit value from result. 
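-     Four reduction folds are used in this variant because
-     gf_w32_cfm_init only selects it when the top seven bits of
-     prim_poly are clear, i.e. at least seven zeros follow the leading
-     1 of the full polynomial; by the (w-2)/z argument above, that
-     needs at most four folds.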
*/ - - rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); -#endif - return rv; -} - - -static -inline -uint32_t -gf_w32_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32) -{ - uint64_t product, i, pp, a, b, one; - gf_internal_t *h; - - a = a32; - b = b32; - h = (gf_internal_t *) gf->scratch; - one = 1; - pp = h->prim_poly | (one << 32); - - product = 0; - - for (i = 0; i < GF_FIELD_WIDTH; i++) { - if (a & (one << i)) product ^= (b << i); - } - for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) { - if (product & (one << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); - } - return product; -} - - static -int gf_w32_cfm_init(gf_t *gf) -{ - gf->inverse.w32 = gf_w32_euclid; - gf->multiply_region.w32 = gf_w32_multiply_region_from_single; - - /*Ben: We also check to see if the prim poly will work for pclmul */ - /*Ben: Check to see how many reduction steps it will take*/ - -#if defined(INTEL_SSE4_PCLMUL) - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - - if ((0xfffe0000 & h->prim_poly) == 0){ - gf->multiply.w32 = gf_w32_clm_multiply_2; - gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_2; - }else if ((0xffc00000 & h->prim_poly) == 0){ - gf->multiply.w32 = gf_w32_clm_multiply_3; - gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_3; - }else if ((0xfe000000 & h->prim_poly) == 0){ - gf->multiply.w32 = gf_w32_clm_multiply_4; - gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_4; - } else { - return 0; - } - return 1; - #endif - - return 0; -} - - static -int gf_w32_shift_init(gf_t *gf) -{ - gf->inverse.w32 = gf_w32_euclid; - gf->multiply_region.w32 = gf_w32_multiply_region_from_single; - gf->multiply.w32 = gf_w32_shift_multiply; - return 1; -} - -static - void -gf_w32_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h) -{ - int i; - uint32_t j; - - shift[0] = 0; - - for (i = 1; i < (1 << h->arg1); i <<= 1) { - for (j = 0; j < i; j++) shift[i|j] = shift[j]^val; - if (val & GF_FIRST_BIT) { - val <<= 1; - val ^= h->prim_poly; - } else { - val <<= 1; - } - } -} - - static -void gf_w32_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - int leftover, rs; - uint32_t p, l, ind, a32; - int bits_left; - int g_s; - gf_region_data rd; - uint32_t *s32, *d32, *top; - struct gf_w32_group_data *gd; - gf_internal_t *h = (gf_internal_t *) gf->scratch; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gd = (struct gf_w32_group_data *) h->private; - g_s = h->arg1; - gf_w32_group_set_shift_tables(gd->shift, val, h); - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); - gf_do_initial_region_alignment(&rd); - - s32 = (uint32_t *) rd.s_start; - d32 = (uint32_t *) rd.d_start; - top = (uint32_t *) rd.d_top; - - leftover = 32 % g_s; - if (leftover == 0) leftover = g_s; - - while (d32 < top) { - rs = 32 - leftover; - a32 = *s32; - ind = a32 >> rs; - a32 <<= leftover; - p = gd->shift[ind]; - - bits_left = rs; - rs = 32 - g_s; - - while (bits_left > 0) { - bits_left -= g_s; - ind = a32 >> rs; - a32 <<= g_s; - l = p >> rs; - p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s)); - } - if (xor) p ^= *d32; - *d32 = p; - d32++; - s32++; - } - gf_do_final_region_alignment(&rd); -} - - static -void gf_w32_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - uint32_t *s32, *d32, *top; - int i; - int leftover; - uint64_t p, l, r; - uint32_t a32, ind; - int g_s, g_r; - 
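-  /* General GROUP multiply sketch: arg1 (g_s) is how many source bits
-     are consumed per shift-table lookup and arg2 (g_r) how many product
-     bits are folded back per reduce-table lookup; the product is
-     accumulated unreduced in a 64-bit word, and only the bits above
-     bit 31 are reduced, g_r at a time, in the final loop. */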
struct gf_w32_group_data *gd; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_internal_t *h = (gf_internal_t *) gf->scratch; - g_s = h->arg1; - g_r = h->arg2; - gd = (struct gf_w32_group_data *) h->private; - gf_w32_group_set_shift_tables(gd->shift, val, h); - - leftover = GF_FIELD_WIDTH % g_s; - if (leftover == 0) leftover = g_s; - - gd = (struct gf_w32_group_data *) h->private; - gf_w32_group_set_shift_tables(gd->shift, val, h); - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); - gf_do_initial_region_alignment(&rd); - - s32 = (uint32_t *) rd.s_start; - d32 = (uint32_t *) rd.d_start; - top = (uint32_t *) rd.d_top; - - while (d32 < top) { - a32 = *s32; - ind = a32 >> (GF_FIELD_WIDTH - leftover); - p = gd->shift[ind]; - p <<= g_s; - a32 <<= leftover; - - i = (GF_FIELD_WIDTH - leftover); - while (i > g_s) { - ind = a32 >> (GF_FIELD_WIDTH-g_s); - p ^= gd->shift[ind]; - a32 <<= g_s; - p <<= g_s; - i -= g_s; - } - - ind = a32 >> (GF_FIELD_WIDTH-g_s); - p ^= gd->shift[ind]; - - for (i = gd->tshift ; i >= 0; i -= g_r) { - l = p & (gd->rmask << i); - r = gd->reduce[l >> (i+32)]; - r <<= (i); - p ^= r; - } - - if (xor) p ^= *d32; - *d32 = p; - d32++; - s32++; - } - gf_do_final_region_alignment(&rd); -} - -static -inline -gf_val_32_t -gf_w32_group_s_equals_r_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - int leftover, rs; - uint32_t p, l, ind, a32; - int bits_left; - int g_s; - - struct gf_w32_group_data *gd; - gf_internal_t *h = (gf_internal_t *) gf->scratch; - g_s = h->arg1; - - gd = (struct gf_w32_group_data *) h->private; - gf_w32_group_set_shift_tables(gd->shift, b, h); - - leftover = 32 % g_s; - if (leftover == 0) leftover = g_s; - - rs = 32 - leftover; - a32 = a; - ind = a32 >> rs; - a32 <<= leftover; - p = gd->shift[ind]; - - bits_left = rs; - rs = 32 - g_s; - - while (bits_left > 0) { - bits_left -= g_s; - ind = a32 >> rs; - a32 <<= g_s; - l = p >> rs; - p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s)); - } - return p; -} - -static -inline -gf_val_32_t -gf_w32_group_4_4_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - uint32_t p, l, ind, a32; - - struct gf_w32_group_data *d44; - gf_internal_t *h = (gf_internal_t *) gf->scratch; - - d44 = (struct gf_w32_group_data *) h->private; - gf_w32_group_set_shift_tables(d44->shift, b, h); - - p = 0; - a32 = a; - ind = a32 >> 28; - a32 <<= 4; - p = d44->shift[ind]; - ind = a32 >> 28; - a32 <<= 4; - l = p >> 28; - p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4)); - ind = a32 >> 28; - a32 <<= 4; - l = p >> 28; - p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4)); - ind = a32 >> 28; - a32 <<= 4; - l = p >> 28; - p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4)); - ind = a32 >> 28; - a32 <<= 4; - l = p >> 28; - p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4)); - ind = a32 >> 28; - a32 <<= 4; - l = p >> 28; - p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4)); - ind = a32 >> 28; - a32 <<= 4; - l = p >> 28; - p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4)); - ind = a32 >> 28; - l = p >> 28; - p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4)); - return p; -} - -static -inline -gf_val_32_t -gf_w32_group_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - int i; - int leftover; - uint64_t p, l, r; - uint32_t a32, ind; - int g_s, g_r; - struct gf_w32_group_data *gd; - - gf_internal_t *h = (gf_internal_t *) gf->scratch; - g_s = h->arg1; - g_r = h->arg2; - gd = (struct gf_w32_group_data *) h->private; - 
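-  /* Note that a single GROUP multiply rebuilds the whole shift table
-     for its multiplier here; the table cost only amortizes well in the
-     region routines above, where the multiplier is fixed across the
-     whole buffer. */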
gf_w32_group_set_shift_tables(gd->shift, b, h); - - leftover = GF_FIELD_WIDTH % g_s; - if (leftover == 0) leftover = g_s; - - a32 = a; - ind = a32 >> (GF_FIELD_WIDTH - leftover); - p = gd->shift[ind]; - p <<= g_s; - a32 <<= leftover; - - i = (GF_FIELD_WIDTH - leftover); - while (i > g_s) { - ind = a32 >> (GF_FIELD_WIDTH-g_s); - p ^= gd->shift[ind]; - a32 <<= g_s; - p <<= g_s; - i -= g_s; - } - - ind = a32 >> (GF_FIELD_WIDTH-g_s); - p ^= gd->shift[ind]; - - for (i = gd->tshift ; i >= 0; i -= g_r) { - l = p & (gd->rmask << i); - r = gd->reduce[l >> (i+32)]; - r <<= (i); - p ^= r; - } - return p; -} - -static -inline -gf_val_32_t -gf_w32_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - uint32_t prod, pp, bmask; - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - prod = 0; - bmask = 0x80000000; - - while (1) { - if (a & 1) prod ^= b; - a >>= 1; - if (a == 0) return prod; - if (b & bmask) { - b = ((b << 1) ^ pp); - } else { - b <<= 1; - } - } -} - -static -inline -gf_val_32_t -gf_w32_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - uint32_t prod, pp, pmask, amask; - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - - prod = 0; - pmask = 0x80000000; - amask = 0x80000000; - - while (amask != 0) { - if (prod & pmask) { - prod = ((prod << 1) ^ pp); - } else { - prod <<= 1; - } - if (a & amask) prod ^= b; - amask >>= 1; - } - return prod; -} - -static -void -gf_w32_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - uint64_t *s64, *d64, t1, t2, ta, prod, amask; - gf_region_data rd; - struct gf_w32_bytwo_data *btd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); - gf_do_initial_region_alignment(&rd); - - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - - if (xor) { - while (s64 < (uint64_t *) rd.s_top) { - prod = 0; - amask = 0x80000000; - ta = *s64; - while (amask != 0) { - AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); - if (val & amask) prod ^= ta; - amask >>= 1; - } - *d64 ^= prod; - d64++; - s64++; - } - } else { - while (s64 < (uint64_t *) rd.s_top) { - prod = 0; - amask = 0x80000000; - ta = *s64; - while (amask != 0) { - AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); - if (val & amask) prod ^= ta; - amask >>= 1; - } - *d64 = prod; - d64++; - s64++; - } - } - gf_do_final_region_alignment(&rd); -} - -#define BYTWO_P_ONESTEP {\ - SSE_AB2(pp, m1 ,m2, prod, t1, t2); \ - t1 = _mm_and_si128(v, one); \ - t1 = _mm_sub_epi32(t1, one); \ - t1 = _mm_and_si128(t1, ta); \ - prod = _mm_xor_si128(prod, t1); \ - v = _mm_srli_epi64(v, 1); } - -#ifdef INTEL_SSE2 -static -void -gf_w32_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - int i; - uint8_t *s8, *d8; - uint32_t vrev; - __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; - struct gf_w32_bytwo_data *btd; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); - gf_do_initial_region_alignment(&rd); - - vrev = 0; - for (i = 0; i < 32; i++) { - vrev <<= 1; - if 
(!(val & (1 << i))) vrev |= 1;
-  }
-
-  s8 = (uint8_t *) rd.s_start;
-  d8 = (uint8_t *) rd.d_start;
-
-  pp = _mm_set1_epi32(btd->prim_poly&0xffffffff);
-  m1 = _mm_set1_epi32((btd->mask1)&0xffffffff);
-  m2 = _mm_set1_epi32((btd->mask2)&0xffffffff);
-  one = _mm_set1_epi32(1);
-
-  while (d8 < (uint8_t *) rd.d_top) {
-    prod = _mm_setzero_si128();
-    v = _mm_set1_epi32(vrev);
-    ta = _mm_load_si128((__m128i *) s8);
-    tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8);
-    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
-    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
-    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
-    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
-    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
-    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
-    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
-    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
-    _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp));
-    d8 += 16;
-    s8 += 16;
-  }
-  gf_do_final_region_alignment(&rd);
-}
-#endif
-
-static
-void
-gf_w32_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
-{
-  uint64_t *s64, *d64, t1, t2, ta, tb, prod;
-  struct gf_w32_bytwo_data *btd;
-  gf_region_data rd;
-
-  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
-  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
-
-  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
-  gf_do_initial_region_alignment(&rd);
-
-  btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
-  s64 = (uint64_t *) rd.s_start;
-  d64 = (uint64_t *) rd.d_start;
-
-  switch (val) {
-  case 2:
-    if (xor) {
-      while (d64 < (uint64_t *) rd.d_top) {
-        ta = *s64;
-        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
-        *d64 ^= ta;
-        d64++;
-        s64++;
-      }
-    } else {
-      while (d64 < (uint64_t *) rd.d_top) {
-        ta = *s64;
-        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
-        *d64 = ta;
-        d64++;
-        s64++;
-      }
-    }
-    break;
-  case 3:
-    if (xor) {
-      while (d64 < (uint64_t *) rd.d_top) {
-        ta = *s64;
-        prod = ta;
-        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
-        *d64 ^= (ta ^ prod);
-        d64++;
-        s64++;
-      }
-    } else {
-      while (d64 < (uint64_t *) rd.d_top) {
-        ta = *s64;
-        prod = ta;
-        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
-        *d64 = (ta ^ prod);
-        d64++;
-        s64++;
-      }
-    }
-    break;
-  case 4:
-    if (xor) {
-      while (d64 < (uint64_t *) rd.d_top) {
-        ta = *s64;
-        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
-        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
-        *d64 ^= ta;
-        d64++;
-        s64++;
-      }
-    } else {
-      while (d64 < (uint64_t *) rd.d_top) {
-        ta = *s64;
-        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
-        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
-        *d64 = ta;
-        d64++;
-        s64++;
-      }
-    }
-    break;
-  case 5:
-    if (xor) {
-      while (d64 < (uint64_t *) rd.d_top) {
-        ta = *s64;
-        prod = ta;
-        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
-        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
-        *d64 ^= (ta ^ prod);
-        d64++;
-        s64++;
-      }
-    } else {
-      while (d64 < (uint64_t *) rd.d_top) {
-        ta = *s64;
-        prod = ta;
-        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
-        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
-        *d64 = ta ^ prod;
-        d64++;
-        s64++;
-      }
-    }
-    break;
-  default:
-    if (xor) {
-      while (d64 < (uint64_t *) rd.d_top) {
-        prod = *d64
; - ta = *s64; - tb = val; - while (1) { - if (tb & 1) prod ^= ta; - tb >>= 1; - if (tb == 0) break; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - } - *d64 = prod; - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - prod = 0 ; - ta = *s64; - tb = val; - while (1) { - if (tb & 1) prod ^= ta; - tb >>= 1; - if (tb == 0) break; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - } - *d64 = prod; - d64++; - s64++; - } - } - break; - } - gf_do_final_region_alignment(&rd); -} - -#ifdef INTEL_SSE2 -static -void -gf_w32_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w32_bytwo_data *btd) -{ - uint8_t *d8, *s8; - __m128i pp, m1, m2, t1, t2, va; - - s8 = (uint8_t *) rd->s_start; - d8 = (uint8_t *) rd->d_start; - - pp = _mm_set1_epi32(btd->prim_poly&0xffffffff); - m1 = _mm_set1_epi32((btd->mask1)&0xffffffff); - m2 = _mm_set1_epi32((btd->mask2)&0xffffffff); - - while (d8 < (uint8_t *) rd->d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, m2, va, t1, t2); - _mm_store_si128((__m128i *)d8, va); - d8 += 16; - s8 += 16; - } -} -#endif - -#ifdef INTEL_SSE2 -static -void -gf_w32_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w32_bytwo_data *btd) -{ - uint8_t *d8, *s8; - __m128i pp, m1, m2, t1, t2, va, vb; - - s8 = (uint8_t *) rd->s_start; - d8 = (uint8_t *) rd->d_start; - - pp = _mm_set1_epi32(btd->prim_poly&0xffffffff); - m1 = _mm_set1_epi32((btd->mask1)&0xffffffff); - m2 = _mm_set1_epi32((btd->mask2)&0xffffffff); - - while (d8 < (uint8_t *) rd->d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, m2, va, t1, t2); - vb = _mm_load_si128 ((__m128i *)(d8)); - vb = _mm_xor_si128(vb, va); - _mm_store_si128((__m128i *)d8, vb); - d8 += 16; - s8 += 16; - } -} -#endif - - -#ifdef INTEL_SSE2 -static -void -gf_w32_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - uint32_t itb; - uint8_t *d8, *s8; - __m128i pp, m1, m2, t1, t2, va, vb; - struct gf_w32_bytwo_data *btd; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); - gf_do_initial_region_alignment(&rd); - - btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; - - if (val == 2) { - if (xor) { - gf_w32_bytwo_b_sse_region_2_xor(&rd, btd); - } else { - gf_w32_bytwo_b_sse_region_2_noxor(&rd, btd); - } - gf_do_final_region_alignment(&rd); - return; - } - - s8 = (uint8_t *) rd.s_start; - d8 = (uint8_t *) rd.d_start; - - pp = _mm_set1_epi32(btd->prim_poly&0xffffffff); - m1 = _mm_set1_epi32((btd->mask1)&0xffffffff); - m2 = _mm_set1_epi32((btd->mask2)&0xffffffff); - - while (d8 < (uint8_t *) rd.d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - vb = (!xor) ? 
_mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8)); - itb = val; - while (1) { - if (itb & 1) vb = _mm_xor_si128(vb, va); - itb >>= 1; - if (itb == 0) break; - SSE_AB2(pp, m1, m2, va, t1, t2); - } - _mm_store_si128((__m128i *)d8, vb); - d8 += 16; - s8 += 16; - } - - gf_do_final_region_alignment(&rd); -} -#endif - -static -int gf_w32_bytwo_init(gf_t *gf) -{ - gf_internal_t *h; - uint64_t ip, m1, m2; - struct gf_w32_bytwo_data *btd; - - h = (gf_internal_t *) gf->scratch; - btd = (struct gf_w32_bytwo_data *) (h->private); - ip = h->prim_poly & 0xffffffff; - m1 = 0xfffffffe; - m2 = 0x80000000; - btd->prim_poly = 0; - btd->mask1 = 0; - btd->mask2 = 0; - - while (ip != 0) { - btd->prim_poly |= ip; - btd->mask1 |= m1; - btd->mask2 |= m2; - ip <<= GF_FIELD_WIDTH; - m1 <<= GF_FIELD_WIDTH; - m2 <<= GF_FIELD_WIDTH; - } - - if (h->mult_type == GF_MULT_BYTWO_p) { - gf->multiply.w32 = gf_w32_bytwo_p_multiply; - #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSSE) - gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region; - else - gf->multiply_region.w32 = gf_w32_bytwo_p_sse_multiply_region; - #else - gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region; - if(h->region_type & GF_REGION_SSE) - return 0; - #endif - } else { - gf->multiply.w32 = gf_w32_bytwo_b_multiply; - #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSSE) - gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region; - else - gf->multiply_region.w32 = gf_w32_bytwo_b_sse_multiply_region; - #else - gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region; - if(h->region_type & GF_REGION_SSE) - return 0; - #endif - } - - gf->inverse.w32 = gf_w32_euclid; - return 1; -} - -static -inline -uint32_t -gf_w32_split_8_8_multiply (gf_t *gf, uint32_t a32, uint32_t b32) -{ - uint32_t product, i, j, mask, tb; - gf_internal_t *h; - struct gf_w32_split_8_8_data *d8; - - h = (gf_internal_t *) gf->scratch; - d8 = (struct gf_w32_split_8_8_data *) h->private; - product = 0; - mask = 0xff; - - for (i = 0; i < 4; i++) { - tb = b32; - for (j = 0; j < 4; j++) { - product ^= d8->tables[i+j][a32&mask][tb&mask]; - tb >>= 8; - } - a32 >>= 8; - } - return product; -} - -static -inline -void -gf_w32_split_8_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) -{ - gf_internal_t *h; - uint32_t *s32, *d32, *top, p, a, v; - struct gf_split_8_32_lazy_data *d8; - struct gf_w32_split_8_8_data *d88; - uint32_t *t[4]; - int i, j, k, change; - uint32_t pp; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - h = (gf_internal_t *) gf->scratch; - if (h->arg1 == 32 || h->arg2 == 32 || h->mult_type == GF_MULT_DEFAULT) { - d8 = (struct gf_split_8_32_lazy_data *) h->private; - for (i = 0; i < 4; i++) t[i] = d8->tables[i]; - change = (val != d8->last_value); - if (change) d8->last_value = val; - } else { - d88 = (struct gf_w32_split_8_8_data *) h->private; - for (i = 0; i < 4; i++) t[i] = d88->region_tables[i]; - change = (val != d88->last_value); - if (change) d88->last_value = val; - } - pp = h->prim_poly; - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); - gf_do_initial_region_alignment(&rd); - - s32 = (uint32_t *) rd.s_start; - d32 = (uint32_t *) rd.d_start; - top = (uint32_t *) rd.d_top; - - if (change) { - v = val; - for (i = 0; i < 4; i++) { - t[i][0] = 0; - for (j = 1; j < 256; j <<= 1) { - for (k = 0; k < j; k++) { - t[i][k^j] = (v ^ t[i][k]); - } - v = (v & GF_FIRST_BIT) ? 
((v << 1) ^ pp) : (v << 1); - } - } - } - - while (d32 < top) { - p = (xor) ? *d32 : 0; - a = *s32; - i = 0; - while (a != 0) { - v = (a & 0xff); - p ^= t[i][v]; - a >>= 8; - i++; - } - *d32 = p; - d32++; - s32++; - } - gf_do_final_region_alignment(&rd); -} - -static -inline -void -gf_w32_split_16_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) -{ - gf_internal_t *h; - uint32_t *s32, *d32, *top, p, a, v; - struct gf_split_16_32_lazy_data *d16; - uint32_t *t[2]; - int i, j, k, change; - uint32_t pp; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - h = (gf_internal_t *) gf->scratch; - d16 = (struct gf_split_16_32_lazy_data *) h->private; - for (i = 0; i < 2; i++) t[i] = d16->tables[i]; - change = (val != d16->last_value); - if (change) d16->last_value = val; - - pp = h->prim_poly; - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); - gf_do_initial_region_alignment(&rd); - - s32 = (uint32_t *) rd.s_start; - d32 = (uint32_t *) rd.d_start; - top = (uint32_t *) rd.d_top; - - if (change) { - v = val; - for (i = 0; i < 2; i++) { - t[i][0] = 0; - for (j = 1; j < (1 << 16); j <<= 1) { - for (k = 0; k < j; k++) { - t[i][k^j] = (v ^ t[i][k]); - } - v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); - } - } - } - - while (d32 < top) { - p = (xor) ? *d32 : 0; - a = *s32; - i = 0; - while (a != 0) { - v = (a & 0xffff); - p ^= t[i][v]; - a >>= 16; - i++; - } - *d32 = p; - d32++; - s32++; - } - gf_do_final_region_alignment(&rd); -} - -static -void -gf_w32_split_2_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) -{ - gf_internal_t *h; - struct gf_split_2_32_lazy_data *ld; - int i; - uint32_t pp, v, v2, s, *s32, *d32, *top; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); - gf_do_initial_region_alignment(&rd); - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - ld = (struct gf_split_2_32_lazy_data *) h->private; - - if (ld->last_value != val) { - v = val; - for (i = 0; i < 16; i++) { - v2 = (v << 1); - if (v & GF_FIRST_BIT) v2 ^= pp; - ld->tables[i][0] = 0; - ld->tables[i][1] = v; - ld->tables[i][2] = v2; - ld->tables[i][3] = (v2 ^ v); - v = (v2 << 1); - if (v2 & GF_FIRST_BIT) v ^= pp; - } - } - ld->last_value = val; - - s32 = (uint32_t *) rd.s_start; - d32 = (uint32_t *) rd.d_start; - top = (uint32_t *) rd.d_top; - - while (d32 != top) { - v = (xor) ? 
*d32 : 0; - s = *s32; - i = 0; - while (s != 0) { - v ^= ld->tables[i][s&3]; - s >>= 2; - i++; - } - *d32 = v; - d32++; - s32++; - } - gf_do_final_region_alignment(&rd); -} - -#ifdef INTEL_SSSE3 -static -void -gf_w32_split_2_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) -{ - gf_internal_t *h; - int i, tindex; - uint32_t pp, v, v2, *s32, *d32, *top; - __m128i vi, si, pi, shuffler, tables[16], adder, xi, mask1, mask2; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); - gf_do_initial_region_alignment(&rd); - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - s32 = (uint32_t *) rd.s_start; - d32 = (uint32_t *) rd.d_start; - top = (uint32_t *) rd.d_top; - - v = val; - for (i = 0; i < 16; i++) { - v2 = (v << 1); - if (v & GF_FIRST_BIT) v2 ^= pp; - tables[i] = _mm_set_epi32(v2 ^ v, v2, v, 0); - v = (v2 << 1); - if (v2 & GF_FIRST_BIT) v ^= pp; - } - - shuffler = _mm_set_epi8(0xc, 0xc, 0xc, 0xc, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); - adder = _mm_set_epi8(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); - mask1 = _mm_set1_epi8(0x3); - mask2 = _mm_set1_epi8(0xc); - - while (d32 != top) { - pi = (xor) ? _mm_load_si128 ((__m128i *) d32) : _mm_setzero_si128(); - vi = _mm_load_si128((__m128i *) s32); - - tindex = 0; - for (i = 0; i < 4; i++) { - si = _mm_shuffle_epi8(vi, shuffler); - - xi = _mm_and_si128(si, mask1); - xi = _mm_slli_epi16(xi, 2); - xi = _mm_xor_si128(xi, adder); - pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi)); - tindex++; - - xi = _mm_and_si128(si, mask2); - xi = _mm_xor_si128(xi, adder); - pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi)); - si = _mm_srli_epi16(si, 2); - tindex++; - - xi = _mm_and_si128(si, mask2); - xi = _mm_xor_si128(xi, adder); - pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi)); - si = _mm_srli_epi16(si, 2); - tindex++; - - xi = _mm_and_si128(si, mask2); - xi = _mm_xor_si128(xi, adder); - pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi)); - si = _mm_srli_epi16(si, 2); - tindex++; - - vi = _mm_srli_epi32(vi, 8); - } - _mm_store_si128((__m128i *) d32, pi); - d32 += 4; - s32 += 4; - } - - gf_do_final_region_alignment(&rd); - -} -#endif - -static -void -gf_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) -{ - gf_internal_t *h; - struct gf_split_4_32_lazy_data *ld; - int i, j, k; - uint32_t pp, v, s, *s32, *d32, *top; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - ld = (struct gf_split_4_32_lazy_data *) h->private; - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); - gf_do_initial_region_alignment(&rd); - - if (ld->last_value != val) { - v = val; - for (i = 0; i < 8; i++) { - ld->tables[i][0] = 0; - for (j = 1; j < 16; j <<= 1) { - for (k = 0; k < j; k++) { - ld->tables[i][k^j] = (v ^ ld->tables[i][k]); - } - v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); - } - } - } - ld->last_value = val; - - s32 = (uint32_t *) rd.s_start; - d32 = (uint32_t *) rd.d_start; - top = (uint32_t *) rd.d_top; - - while (d32 != top) { - v = (xor) ? 
*d32 : 0; - s = *s32; - i = 0; - while (s != 0) { - v ^= ld->tables[i][s&0xf]; - s >>= 4; - i++; - } - *d32 = v; - d32++; - s32++; - } - gf_do_final_region_alignment(&rd); -} - -static -void -gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) -{ -#ifdef INTEL_SSSE3 - gf_internal_t *h; - int i, j, k; - uint32_t pp, v, *s32, *d32, *top; - __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3; - struct gf_split_4_32_lazy_data *ld; - uint8_t btable[16]; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64); - gf_do_initial_region_alignment(&rd); - - s32 = (uint32_t *) rd.s_start; - d32 = (uint32_t *) rd.d_start; - top = (uint32_t *) rd.d_top; - - ld = (struct gf_split_4_32_lazy_data *) h->private; - - v = val; - for (i = 0; i < 8; i++) { - ld->tables[i][0] = 0; - for (j = 1; j < 16; j <<= 1) { - for (k = 0; k < j; k++) { - ld->tables[i][k^j] = (v ^ ld->tables[i][k]); - } - v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); - } - for (j = 0; j < 4; j++) { - for (k = 0; k < 16; k++) { - btable[k] = (uint8_t) ld->tables[i][k]; - ld->tables[i][k] >>= 8; - } - tables[i][j] = _mm_loadu_si128((__m128i *) btable); - } - } - - mask1 = _mm_set1_epi8(0xf); - - if (xor) { - while (d32 != top) { - p0 = _mm_load_si128 ((__m128i *) d32); - p1 = _mm_load_si128 ((__m128i *) (d32+4)); - p2 = _mm_load_si128 ((__m128i *) (d32+8)); - p3 = _mm_load_si128 ((__m128i *) (d32+12)); - - v0 = _mm_load_si128((__m128i *) s32); s32 += 4; - v1 = _mm_load_si128((__m128i *) s32); s32 += 4; - v2 = _mm_load_si128((__m128i *) s32); s32 += 4; - v3 = _mm_load_si128((__m128i *) s32); s32 += 4; - - si = _mm_and_si128(v0, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si)); - - v0 = _mm_srli_epi32(v0, 4); - si = _mm_and_si128(v0, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); - - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si)); - - v1 = _mm_srli_epi32(v1, 4); - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si)); - - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si)); - - v2 = _mm_srli_epi32(v2, 4); - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si)); - p2 = 
_mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si)); - - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[6][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[6][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[6][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[6][3], si)); - - v3 = _mm_srli_epi32(v3, 4); - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si)); - - _mm_store_si128((__m128i *) d32, p0); - _mm_store_si128((__m128i *) (d32+4), p1); - _mm_store_si128((__m128i *) (d32+8), p2); - _mm_store_si128((__m128i *) (d32+12), p3); - d32 += 16; - } - } else { - while (d32 != top) { - - v0 = _mm_load_si128((__m128i *) s32); s32 += 4; - v1 = _mm_load_si128((__m128i *) s32); s32 += 4; - v2 = _mm_load_si128((__m128i *) s32); s32 += 4; - v3 = _mm_load_si128((__m128i *) s32); s32 += 4; - - si = _mm_and_si128(v0, mask1); - p0 = _mm_shuffle_epi8(tables[0][0], si); - p1 = _mm_shuffle_epi8(tables[0][1], si); - p2 = _mm_shuffle_epi8(tables[0][2], si); - p3 = _mm_shuffle_epi8(tables[0][3], si); - - v0 = _mm_srli_epi32(v0, 4); - si = _mm_and_si128(v0, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); - - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si)); - - v1 = _mm_srli_epi32(v1, 4); - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si)); - - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si)); - - v2 = _mm_srli_epi32(v2, 4); - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si)); - - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[6][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[6][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[6][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[6][3], si)); - - v3 = _mm_srli_epi32(v3, 4); - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si)); - - _mm_store_si128((__m128i *) d32, p0); - _mm_store_si128((__m128i *) (d32+4), p1); - 
_mm_store_si128((__m128i *) (d32+8), p2); - _mm_store_si128((__m128i *) (d32+12), p3); - d32 += 16; - } - } - - gf_do_final_region_alignment(&rd); - -#endif -} - - -static -void -gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) -{ -#ifdef INTEL_SSSE3 - gf_internal_t *h; - int i, j, k; - uint32_t pp, v, *s32, *d32, *top, tmp_table[16]; - __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3, mask8; - __m128i tv1, tv2, tv3, tv0; - uint8_t btable[16]; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64); - gf_do_initial_region_alignment(&rd); - - s32 = (uint32_t *) rd.s_start; - d32 = (uint32_t *) rd.d_start; - top = (uint32_t *) rd.d_top; - - v = val; - for (i = 0; i < 8; i++) { - tmp_table[0] = 0; - for (j = 1; j < 16; j <<= 1) { - for (k = 0; k < j; k++) { - tmp_table[k^j] = (v ^ tmp_table[k]); - } - v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); - } - for (j = 0; j < 4; j++) { - for (k = 0; k < 16; k++) { - btable[k] = (uint8_t) tmp_table[k]; - tmp_table[k] >>= 8; - } - tables[i][j] = _mm_loadu_si128((__m128i *) btable); - } - } - - mask1 = _mm_set1_epi8(0xf); - mask8 = _mm_set1_epi16(0xff); - - if (xor) { - while (d32 != top) { - v0 = _mm_load_si128((__m128i *) s32); s32 += 4; - v1 = _mm_load_si128((__m128i *) s32); s32 += 4; - v2 = _mm_load_si128((__m128i *) s32); s32 += 4; - v3 = _mm_load_si128((__m128i *) s32); s32 += 4; - - p0 = _mm_srli_epi16(v0, 8); - p1 = _mm_srli_epi16(v1, 8); - p2 = _mm_srli_epi16(v2, 8); - p3 = _mm_srli_epi16(v3, 8); - - tv0 = _mm_and_si128(v0, mask8); - tv1 = _mm_and_si128(v1, mask8); - tv2 = _mm_and_si128(v2, mask8); - tv3 = _mm_and_si128(v3, mask8); - - v0 = _mm_packus_epi16(p1, p0); - v1 = _mm_packus_epi16(tv1, tv0); - v2 = _mm_packus_epi16(p3, p2); - v3 = _mm_packus_epi16(tv3, tv2); - - p0 = _mm_srli_epi16(v0, 8); - p1 = _mm_srli_epi16(v1, 8); - p2 = _mm_srli_epi16(v2, 8); - p3 = _mm_srli_epi16(v3, 8); - - tv0 = _mm_and_si128(v0, mask8); - tv1 = _mm_and_si128(v1, mask8); - tv2 = _mm_and_si128(v2, mask8); - tv3 = _mm_and_si128(v3, mask8); - - v0 = _mm_packus_epi16(p2, p0); - v1 = _mm_packus_epi16(p3, p1); - v2 = _mm_packus_epi16(tv2, tv0); - v3 = _mm_packus_epi16(tv3, tv1); - - si = _mm_and_si128(v0, mask1); - p0 = _mm_shuffle_epi8(tables[6][0], si); - p1 = _mm_shuffle_epi8(tables[6][1], si); - p2 = _mm_shuffle_epi8(tables[6][2], si); - p3 = _mm_shuffle_epi8(tables[6][3], si); - - v0 = _mm_srli_epi32(v0, 4); - si = _mm_and_si128(v0, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si)); - - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si)); - - v1 = _mm_srli_epi32(v1, 4); - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], 
si)); - - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si)); - - v2 = _mm_srli_epi32(v2, 4); - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si)); - - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si)); - - v3 = _mm_srli_epi32(v3, 4); - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); - - tv0 = _mm_unpackhi_epi8(p1, p3); - tv1 = _mm_unpackhi_epi8(p0, p2); - tv2 = _mm_unpacklo_epi8(p1, p3); - tv3 = _mm_unpacklo_epi8(p0, p2); - - p0 = _mm_unpackhi_epi8(tv1, tv0); - p1 = _mm_unpacklo_epi8(tv1, tv0); - p2 = _mm_unpackhi_epi8(tv3, tv2); - p3 = _mm_unpacklo_epi8(tv3, tv2); - - v0 = _mm_load_si128 ((__m128i *) d32); - v1 = _mm_load_si128 ((__m128i *) (d32+4)); - v2 = _mm_load_si128 ((__m128i *) (d32+8)); - v3 = _mm_load_si128 ((__m128i *) (d32+12)); - - p0 = _mm_xor_si128(p0, v0); - p1 = _mm_xor_si128(p1, v1); - p2 = _mm_xor_si128(p2, v2); - p3 = _mm_xor_si128(p3, v3); - - _mm_store_si128((__m128i *) d32, p0); - _mm_store_si128((__m128i *) (d32+4), p1); - _mm_store_si128((__m128i *) (d32+8), p2); - _mm_store_si128((__m128i *) (d32+12), p3); - d32 += 16; - } - } else { - while (d32 != top) { - v0 = _mm_load_si128((__m128i *) s32); s32 += 4; - v1 = _mm_load_si128((__m128i *) s32); s32 += 4; - v2 = _mm_load_si128((__m128i *) s32); s32 += 4; - v3 = _mm_load_si128((__m128i *) s32); s32 += 4; - - p0 = _mm_srli_epi16(v0, 8); - p1 = _mm_srli_epi16(v1, 8); - p2 = _mm_srli_epi16(v2, 8); - p3 = _mm_srli_epi16(v3, 8); - - tv0 = _mm_and_si128(v0, mask8); - tv1 = _mm_and_si128(v1, mask8); - tv2 = _mm_and_si128(v2, mask8); - tv3 = _mm_and_si128(v3, mask8); - - v0 = _mm_packus_epi16(p1, p0); - v1 = _mm_packus_epi16(tv1, tv0); - v2 = _mm_packus_epi16(p3, p2); - v3 = _mm_packus_epi16(tv3, tv2); - - p0 = _mm_srli_epi16(v0, 8); - p1 = _mm_srli_epi16(v1, 8); - p2 = _mm_srli_epi16(v2, 8); - p3 = _mm_srli_epi16(v3, 8); - - tv0 = _mm_and_si128(v0, mask8); - tv1 = _mm_and_si128(v1, mask8); - tv2 = _mm_and_si128(v2, mask8); - tv3 = _mm_and_si128(v3, mask8); - - v0 = _mm_packus_epi16(p2, p0); - v1 = _mm_packus_epi16(p3, p1); - v2 = _mm_packus_epi16(tv2, tv0); - v3 = _mm_packus_epi16(tv3, tv1); - - p0 = v0; - p1 = v1; - p2 = v2; - p3 = v3; - - si = _mm_and_si128(v0, mask1); - p0 = _mm_shuffle_epi8(tables[6][0], si); - p1 = _mm_shuffle_epi8(tables[6][1], si); - p2 = _mm_shuffle_epi8(tables[6][2], si); - p3 = _mm_shuffle_epi8(tables[6][3], si); - - v0 = _mm_srli_epi32(v0, 4); - si = _mm_and_si128(v0, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si)); - p3 = _mm_xor_si128(p3, 
_mm_shuffle_epi8(tables[7][3], si)); - - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si)); - - v1 = _mm_srli_epi32(v1, 4); - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si)); - - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si)); - - v2 = _mm_srli_epi32(v2, 4); - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si)); - - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si)); - - v3 = _mm_srli_epi32(v3, 4); - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); - - tv0 = _mm_unpackhi_epi8(p1, p3); - tv1 = _mm_unpackhi_epi8(p0, p2); - tv2 = _mm_unpacklo_epi8(p1, p3); - tv3 = _mm_unpacklo_epi8(p0, p2); - - p0 = _mm_unpackhi_epi8(tv1, tv0); - p1 = _mm_unpacklo_epi8(tv1, tv0); - p2 = _mm_unpackhi_epi8(tv3, tv2); - p3 = _mm_unpacklo_epi8(tv3, tv2); - - _mm_store_si128((__m128i *) d32, p0); - _mm_store_si128((__m128i *) (d32+4), p1); - _mm_store_si128((__m128i *) (d32+8), p2); - _mm_store_si128((__m128i *) (d32+12), p3); - d32 += 16; - } - } - gf_do_final_region_alignment(&rd); - -#endif -} - -static -int gf_w32_split_init(gf_t *gf) -{ - gf_internal_t *h; - struct gf_split_2_32_lazy_data *ld2; - struct gf_split_4_32_lazy_data *ld4; - struct gf_w32_split_8_8_data *d8; - struct gf_split_8_32_lazy_data *d32; - struct gf_split_16_32_lazy_data *d16; - uint32_t p, basep; - int i, j, exp, ispclmul, issse3; - - ispclmul = 0; -#if defined(INTEL_SSE4_PCLMUL) - ispclmul = 1; -#endif - - issse3 = 0; -#ifdef INTEL_SSSE3 - issse3 = 1; -#endif - - h = (gf_internal_t *) gf->scratch; - - /* Defaults */ - - gf->inverse.w32 = gf_w32_euclid; - - /* JSP: First handle single multiplication: - If args == 8, then we're doing split 8 8. - Otherwise, if PCLMUL, we use that. - Otherwise, we use bytwo_p. 
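   (A note on the three PCLMUL paths chosen below: the _2/_3/_4 suffixes
   appear to count carryless reduction passes. Each mask tests how many
   high-order bits of prim_poly are clear -- 0xfffe0000 covers bits
   17-31, 0xffc00000 bits 22-31, 0xfe000000 bits 25-31 -- and the fewer
   spare zero bits the polynomial leaves above its top term, the more
   reduction steps the 64-bit carryless product needs before it fits
   back into 32 bits.)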
- */ - - if (h->arg1 == 8 && h->arg2 == 8) { - gf->multiply.w32 = gf_w32_split_8_8_multiply; - } else if (ispclmul) { - if ((0xfffe0000 & h->prim_poly) == 0){ - gf->multiply.w32 = gf_w32_clm_multiply_2; - } else if ((0xffc00000 & h->prim_poly) == 0){ - gf->multiply.w32 = gf_w32_clm_multiply_3; - } else if ((0xfe000000 & h->prim_poly) == 0){ - gf->multiply.w32 = gf_w32_clm_multiply_4; - } - } else { - gf->multiply.w32 = gf_w32_bytwo_p_multiply; - } - - /* Easy cases: 16/32 and 2/32 */ - - if ((h->arg1 == 16 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 16)) { - d16 = (struct gf_split_16_32_lazy_data *) h->private; - d16->last_value = 0; - gf->multiply_region.w32 = gf_w32_split_16_32_lazy_multiply_region; - return 1; - } - - if ((h->arg1 == 2 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 2)) { - ld2 = (struct gf_split_2_32_lazy_data *) h->private; - ld2->last_value = 0; - #ifdef INTEL_SSSE3 - if (!(h->region_type & GF_REGION_NOSSE)) - gf->multiply_region.w32 = gf_w32_split_2_32_lazy_sse_multiply_region; - else - gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region; - #else - gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region; - if(h->region_type & GF_REGION_SSE) return 0; - #endif - return 1; - } - - /* 4/32 or Default + SSE - There is no ALTMAP/NOSSE. */ - - if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) || - (issse3 && h->mult_type == GF_REGION_DEFAULT)) { - ld4 = (struct gf_split_4_32_lazy_data *) h->private; - ld4->last_value = 0; - if ((h->region_type & GF_REGION_NOSSE) || !issse3) { - gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region; - } else if (h->region_type & GF_REGION_ALTMAP) { - gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region; - } else { - gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_multiply_region; - } - return 1; - } - - /* 8/32 or Default + no SSE */ - - if ((h->arg1 == 8 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 8) || - h->mult_type == GF_MULT_DEFAULT) { - d32 = (struct gf_split_8_32_lazy_data *) h->private; - d32->last_value = 0; - gf->multiply_region.w32 = gf_w32_split_8_32_lazy_multiply_region; - return 1; - } - - /* Finally, if args == 8, then we have to set up the tables here. */ - - if (h->arg1 == 8 && h->arg2 == 8) { - d8 = (struct gf_w32_split_8_8_data *) h->private; - d8->last_value = 0; - gf->multiply.w32 = gf_w32_split_8_8_multiply; - gf->multiply_region.w32 = gf_w32_split_8_32_lazy_multiply_region; - basep = 1; - for (exp = 0; exp < 7; exp++) { - for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0; - for (i = 0; i < 256; i++) d8->tables[exp][i][0] = 0; - d8->tables[exp][1][1] = basep; - for (i = 2; i < 256; i++) { - if (i&1) { - p = d8->tables[exp][i^1][1]; - d8->tables[exp][i][1] = p ^ basep; - } else { - p = d8->tables[exp][i>>1][1]; - d8->tables[exp][i][1] = GF_MULTBY_TWO(p); - } - } - for (i = 1; i < 256; i++) { - p = d8->tables[exp][i][1]; - for (j = 1; j < 256; j++) { - if (j&1) { - d8->tables[exp][i][j] = d8->tables[exp][i][j^1] ^ p; - } else { - d8->tables[exp][i][j] = GF_MULTBY_TWO(d8->tables[exp][i][j>>1]); - } - } - } - for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep); - } - return 1; - } - - /* If we get here, then the arguments were bad. 
*/ - - return 0; -} - -static -int gf_w32_group_init(gf_t *gf) -{ - uint32_t i, j, p, index; - struct gf_w32_group_data *gd; - gf_internal_t *h = (gf_internal_t *) gf->scratch; - int g_r, g_s; - - g_s = h->arg1; - g_r = h->arg2; - - gd = (struct gf_w32_group_data *) h->private; - gd->shift = (uint32_t *) (&(gd->memory)); - gd->reduce = gd->shift + (1 << g_s); - - gd->rmask = (1 << g_r) - 1; - gd->rmask <<= 32; - - gd->tshift = 32 % g_s; - if (gd->tshift == 0) gd->tshift = g_s; - gd->tshift = (32 - gd->tshift); - gd->tshift = ((gd->tshift-1)/g_r) * g_r; - - gd->reduce[0] = 0; - for (i = 0; i < (1 << g_r); i++) { - p = 0; - index = 0; - for (j = 0; j < g_r; j++) { - if (i & (1 << j)) { - p ^= (h->prim_poly << j); - index ^= (1 << j); - index ^= (h->prim_poly >> (32-j)); - } - } - gd->reduce[index] = p; - } - - if (g_s == g_r) { - gf->multiply.w32 = gf_w32_group_s_equals_r_multiply; - gf->multiply_region.w32 = gf_w32_group_s_equals_r_multiply_region; - } else { - gf->multiply.w32 = gf_w32_group_multiply; - gf->multiply_region.w32 = gf_w32_group_multiply_region; - } - gf->divide.w32 = NULL; - gf->inverse.w32 = gf_w32_euclid; - - return 1; -} - - -static -uint32_t -gf_w32_composite_multiply_recursive(gf_t *gf, uint32_t a, uint32_t b) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint32_t b0 = b & 0x0000ffff; - uint32_t b1 = (b & 0xffff0000) >> 16; - uint32_t a0 = a & 0x0000ffff; - uint32_t a1 = (a & 0xffff0000) >> 16; - uint32_t a1b1; - uint32_t rv; - a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - - rv = ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16) | (base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1); - return rv; -} - -/* JSP: This could be made faster. Someday, when I'm bored. 
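   For reference, the identity both composite multiplies implement:
   representing GF(2^32) as GF(2^16)[x] modulo p(x) = x^2 + sx + 1 with
   s = h->prim_poly, we have x^2 = sx + 1 in the quotient ring, so

     (a1 x + a0)(b1 x + b0) = (a1 b1 s + a1 b0 + a0 b1) x + (a1 b1 + a0 b0)

   with all additions being XORs. The coefficient of x is the high 16
   bits of the product and the constant term the low 16 bits, which is
   exactly what the recursive routine above and the inline routine below
   compute.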
*/ - -static -uint32_t -gf_w32_composite_multiply_inline(gf_t *gf, uint32_t a, uint32_t b) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - uint32_t b0 = b & 0x0000ffff; - uint32_t b1 = b >> 16; - uint32_t a0 = a & 0x0000ffff; - uint32_t a1 = a >> 16; - uint32_t a1b1, prod; - uint16_t *log, *alog; - struct gf_w32_composite_data *cd; - - cd = (struct gf_w32_composite_data *) h->private; - log = cd->log; - alog = cd->alog; - - a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1); - prod = GF_W16_INLINE_MULT(log, alog, a1, b0); - prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1); - prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly); - prod <<= 16; - prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0); - prod ^= a1b1; - return prod; -} - -/* - * Composite field division trick (explained in 2007 tech report) - * - * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1 - * - * let c = b^-1 - * - * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0) - * - * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1 - * - * let d = b1c1 and d+1 = b0c0 - * - * solve s*b1c1+b1c0+b0c1 = 0 - * - * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1 - * - * c0 = (d+1)b0^-1 - * c1 = d*b1^-1 - * - * a / b = a * c - */ - -static -uint32_t -gf_w32_composite_inverse(gf_t *gf, uint32_t a) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint16_t a0 = a & 0x0000ffff; - uint16_t a1 = (a & 0xffff0000) >> 16; - uint16_t c0, c1, d, tmp; - uint32_t c; - uint16_t a0inv, a1inv; - - if (a0 == 0) { - a1inv = base_gf->inverse.w32(base_gf, a1); - c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly); - c1 = a1inv; - } else if (a1 == 0) { - c0 = base_gf->inverse.w32(base_gf, a0); - c1 = 0; - } else { - a1inv = base_gf->inverse.w32(base_gf, a1); - a0inv = base_gf->inverse.w32(base_gf, a0); - - d = base_gf->multiply.w32(base_gf, a1, a0inv); - - tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly); - tmp = base_gf->inverse.w32(base_gf, tmp); - - d = base_gf->multiply.w32(base_gf, d, tmp); - - c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv); - c1 = base_gf->multiply.w32(base_gf, d, a1inv); - } - - c = c0 | (c1 << 16); - - return c; -} - -static -void -gf_w32_composite_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint32_t b0 = val & 0x0000ffff; - uint32_t b1 = (val & 0xffff0000) >> 16; - uint32_t *s32, *d32, *top; - uint16_t a0, a1, a1b1, *log, *alog; - uint32_t prod; - gf_region_data rd; - struct gf_w32_composite_data *cd; - - cd = (struct gf_w32_composite_data *) h->private; - log = cd->log; - alog = cd->alog; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); - - s32 = rd.s_start; - d32 = rd.d_start; - top = rd.d_top; - - if (log == NULL) { - if (xor) { - while (d32 < top) { - a0 = *s32 & 0x0000ffff; - a1 = (*s32 & 0xffff0000) >> 16; - a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - - *d32 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16)); - s32++; - d32++; - } - } else { - while (d32 < top) { - a0 = *s32 & 0x0000ffff; - a1 = (*s32 & 0xffff0000) >> 16; - a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - - *d32 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w32(base_gf, a1, b0) ^ 
base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16)); - s32++; - d32++; - } - } - } else { - if (xor) { - while (d32 < top) { - a0 = *s32 & 0x0000ffff; - a1 = (*s32 & 0xffff0000) >> 16; - a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1); - - prod = GF_W16_INLINE_MULT(log, alog, a1, b0); - prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1); - prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly); - prod <<= 16; - prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0); - prod ^= a1b1; - *d32 ^= prod; - s32++; - d32++; - } - } else { - while (d32 < top) { - a0 = *s32 & 0x0000ffff; - a1 = (*s32 & 0xffff0000) >> 16; - a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1); - - prod = GF_W16_INLINE_MULT(log, alog, a1, b0); - prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1); - prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly); - prod <<= 16; - prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0); - prod ^= a1b1; - - *d32 = prod; - s32++; - d32++; - } - } - } -} - -static -void -gf_w32_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint16_t val0 = val & 0x0000ffff; - uint16_t val1 = (val & 0xffff0000) >> 16; - gf_region_data rd; - int sub_reg_size; - uint8_t *slow, *shigh; - uint8_t *dlow, *dhigh, *top; - - /* JSP: I want the two pointers aligned wrt each other on 16 byte - boundaries. So I'm going to make sure that the area on - which the two operate is a multiple of 32. Of course, that - junks up the mapping, but so be it -- that's why we have extract_word.... */ - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); - gf_do_initial_region_alignment(&rd); - - slow = (uint8_t *) rd.s_start; - dlow = (uint8_t *) rd.d_start; - top = (uint8_t *) rd.d_top; - sub_reg_size = (top - dlow)/2; - shigh = slow + sub_reg_size; - dhigh = dlow + sub_reg_size; - - base_gf->multiply_region.w32(base_gf, slow, dlow, val0, sub_reg_size, xor); - base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1); - base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor); - base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1); - base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); - - gf_do_final_region_alignment(&rd); -} - -static -int gf_w32_composite_init(gf_t *gf) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - struct gf_w32_composite_data *cd; - - if (h->base_gf == NULL) return 0; - - cd = (struct gf_w32_composite_data *) h->private; - cd->log = gf_w16_get_log_table(h->base_gf); - cd->alog = gf_w16_get_mult_alog_table(h->base_gf); - - if (h->region_type & GF_REGION_ALTMAP) { - gf->multiply_region.w32 = gf_w32_composite_multiply_region_alt; - } else { - gf->multiply_region.w32 = gf_w32_composite_multiply_region; - } - - if (cd->log == NULL) { - gf->multiply.w32 = gf_w32_composite_multiply_recursive; - } else { - gf->multiply.w32 = gf_w32_composite_multiply_inline; - } - gf->divide.w32 = NULL; - gf->inverse.w32 = gf_w32_composite_inverse; - - return 1; -} - - - -int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) -{ - int issse3 = 0; - -#ifdef INTEL_SSSE3 - issse3 = 1; -#endif - - switch(mult_type) - { - case GF_MULT_BYTWO_p: - case GF_MULT_BYTWO_b: - return sizeof(gf_internal_t) + sizeof(struct gf_w32_bytwo_data) + 64; - break; - case GF_MULT_GROUP: - return sizeof(gf_internal_t) + 
sizeof(struct gf_w32_group_data) + - sizeof(uint32_t) * (1 << arg1) + - sizeof(uint32_t) * (1 << arg2) + 64; - break; - case GF_MULT_DEFAULT: - - case GF_MULT_SPLIT_TABLE: - if (arg1 == 8 && arg2 == 8){ - return sizeof(gf_internal_t) + sizeof(struct gf_w32_split_8_8_data) + 64; - } - if ((arg1 == 16 && arg2 == 32) || (arg2 == 16 && arg1 == 32)) { - return sizeof(gf_internal_t) + sizeof(struct gf_split_16_32_lazy_data) + 64; - } - if ((arg1 == 2 && arg2 == 32) || (arg2 == 2 && arg1 == 32)) { - return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64; - } - if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) || - (mult_type == GF_MULT_DEFAULT && !issse3)) { - return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64; - } - if ((arg1 == 4 && arg2 == 32) || - (arg2 == 4 && arg1 == 32) || - mult_type == GF_MULT_DEFAULT) { - return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64; - } - return 0; - case GF_MULT_CARRY_FREE: - return sizeof(gf_internal_t); - break; - case GF_MULT_SHIFT: - return sizeof(gf_internal_t); - break; - case GF_MULT_COMPOSITE: - return sizeof(gf_internal_t) + sizeof(struct gf_w32_composite_data) + 64; - break; - - default: - return 0; - } - return 0; -} - -int gf_w32_init(gf_t *gf) -{ - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - - /* Allen: set default primitive polynomial / irreducible polynomial if needed */ - - if (h->prim_poly == 0) { - if (h->mult_type == GF_MULT_COMPOSITE) { - h->prim_poly = gf_composite_get_default_poly(h->base_gf); - if (h->prim_poly == 0) return 0; /* This shouldn't happen */ - } else { - - /* Allen: use the following primitive polynomial to make carryless multiply work more efficiently for GF(2^32).*/ - - /* h->prim_poly = 0xc5; */ - - /* Allen: The following is the traditional primitive polynomial for GF(2^32) */ - - h->prim_poly = 0x400007; - } - } - - /* No leading one */ - - if(h->mult_type != GF_MULT_COMPOSITE) h->prim_poly &= 0xffffffff; - - gf->multiply.w32 = NULL; - gf->divide.w32 = NULL; - gf->inverse.w32 = NULL; - gf->multiply_region.w32 = NULL; - - switch(h->mult_type) { - case GF_MULT_CARRY_FREE: if (gf_w32_cfm_init(gf) == 0) return 0; break; - case GF_MULT_SHIFT: if (gf_w32_shift_init(gf) == 0) return 0; break; - case GF_MULT_COMPOSITE: if (gf_w32_composite_init(gf) == 0) return 0; break; - case GF_MULT_DEFAULT: - case GF_MULT_SPLIT_TABLE: if (gf_w32_split_init(gf) == 0) return 0; break; - case GF_MULT_GROUP: if (gf_w32_group_init(gf) == 0) return 0; break; - case GF_MULT_BYTWO_p: - case GF_MULT_BYTWO_b: if (gf_w32_bytwo_init(gf) == 0) return 0; break; - default: return 0; - } - if (h->divide_type == GF_DIVIDE_EUCLID) { - gf->divide.w32 = gf_w32_divide_from_inverse; - gf->inverse.w32 = gf_w32_euclid; - } else if (h->divide_type == GF_DIVIDE_MATRIX) { - gf->divide.w32 = gf_w32_divide_from_inverse; - gf->inverse.w32 = gf_w32_matrix; - } - - if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) { - gf->divide.w32 = gf_w32_divide_from_inverse; - } - if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) { - gf->inverse.w32 = gf_w32_inverse_from_divide; - } - if (h->region_type == GF_REGION_CAUCHY) { - gf->extract_word.w32 = gf_wgen_extract_word; - gf->multiply_region.w32 = gf_wgen_cauchy_region; - } else if (h->region_type & GF_REGION_ALTMAP) { - if (h->mult_type == GF_MULT_COMPOSITE) { - gf->extract_word.w32 = gf_w32_composite_extract_word; - } else { - gf->extract_word.w32 = gf_w32_split_extract_word; - } - } else { - gf->extract_word.w32 = 
gf_w32_extract_word; - } - return 1; -} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w4.c b/src/erasure-code/jerasure/gf-complete/src/gf_w4.c deleted file mode 100644 index 65cbf23a25d32..0000000000000 --- a/src/erasure-code/jerasure/gf-complete/src/gf_w4.c +++ /dev/null @@ -1,2081 +0,0 @@ -/* - * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic - * James S. Plank, Ethan L. Miller, Kevin M. Greenan, - * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. - * - * gf_w4.c - * - * Routines for 4-bit Galois fields - */ - -#include "gf_int.h" -#include -#include - -#define GF_FIELD_WIDTH 4 -#define GF_DOUBLE_WIDTH (GF_FIELD_WIDTH*2) -#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH) -#define GF_MULT_GROUP_SIZE (GF_FIELD_SIZE-1) - -/* ------------------------------------------------------------ - JSP: Each implementation has its own data, which is allocated - at one time as part of the handle. For that reason, it - shouldn't be hierarchical -- i.e. one should be able to - allocate it with one call to malloc. */ - -struct gf_logtable_data { - uint8_t log_tbl[GF_FIELD_SIZE]; - uint8_t antilog_tbl[GF_FIELD_SIZE * 2]; - uint8_t *antilog_tbl_div; -}; - -struct gf_single_table_data { - uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; -}; - -struct gf_double_table_data { - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE]; -}; -struct gf_quad_table_data { - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint16_t mult[GF_FIELD_SIZE][(1<<16)]; -}; - -struct gf_quad_table_lazy_data { - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint16_t mult[(1 << 16)]; -}; - -struct gf_bytwo_data { - uint64_t prim_poly; - uint64_t mask1; - uint64_t mask2; -}; - -#define AB2(ip, am1 ,am2, b, t1, t2) {\ - t1 = (b << 1) & am1;\ - t2 = b & am2; \ - t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \ - b = (t1 ^ (t2 & ip));} - -#define SSE_AB2(pp, m1, va, t1, t2) {\ - t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \ - t2 = _mm_and_si128(va, _mm_set1_epi8(0x88)); \ - t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \ - va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); } - -/* ------------------------------------------------------------ - JSP: These are basic and work from multiple implementations. 
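   gf_w4_euclid below is the extended Euclidean algorithm on polynomials
   over GF(2): e_i and e_im1 hold the remainder sequence, d_i and d_im1
   their degrees, and y_i the running Bezout coefficient, so the value
   returned is b^-1 modulo the primitive polynomial. The _from_divide
   and _from_inverse helpers simply derive one of multiply, divide and
   inverse from whichever of the others an implementation provides.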
- */ - -static -inline -gf_val_32_t gf_w4_inverse_from_divide (gf_t *gf, gf_val_32_t a) -{ - return gf->divide.w32(gf, 1, a); -} - -static -inline -gf_val_32_t gf_w4_divide_from_inverse (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - b = gf->inverse.w32(gf, b); - return gf->multiply.w32(gf, a, b); -} - -static -inline -gf_val_32_t gf_w4_euclid (gf_t *gf, gf_val_32_t b) -{ - gf_val_32_t e_i, e_im1, e_ip1; - gf_val_32_t d_i, d_im1, d_ip1; - gf_val_32_t y_i, y_im1, y_ip1; - gf_val_32_t c_i; - - if (b == 0) return -1; - e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; - e_i = b; - d_im1 = 4; - for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ; - y_i = 1; - y_im1 = 0; - - while (e_i != 1) { - e_ip1 = e_im1; - d_ip1 = d_im1; - c_i = 0; - - while (d_ip1 >= d_i) { - c_i ^= (1 << (d_ip1 - d_i)); - e_ip1 ^= (e_i << (d_ip1 - d_i)); - if (e_ip1 == 0) return 0; - while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; - } - - y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i); - y_im1 = y_i; - y_i = y_ip1; - - e_im1 = e_i; - d_im1 = d_i; - e_i = e_ip1; - d_i = d_ip1; - } - - return y_i; -} - -static -gf_val_32_t gf_w4_extract_word(gf_t *gf, void *start, int bytes, int index) -{ - uint8_t *r8, v; - - r8 = (uint8_t *) start; - v = r8[index/2]; - if (index%2) { - return v >> 4; - } else { - return v&0xf; - } -} - - -static -inline -gf_val_32_t gf_w4_matrix (gf_t *gf, gf_val_32_t b) -{ - return gf_bitmatrix_inverse(b, 4, ((gf_internal_t *) (gf->scratch))->prim_poly); -} - - -static -inline -gf_val_32_t -gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - uint8_t product, i, pp; - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - product = 0; - - for (i = 0; i < GF_FIELD_WIDTH; i++) { - if (a & (1 << i)) product ^= (b << i); - } - for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) { - if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); - } - return product; -} - -/* Ben: This function works, but it is 33% slower than the normal shift mult */ - -static -inline -gf_val_32_t -gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4) -{ - gf_val_32_t rv = 0; - -#if defined(INTEL_SSE4_PCLMUL) - - __m128i a, b; - __m128i result; - __m128i prim_poly; - __m128i w; - gf_internal_t * h = gf->scratch; - - a = _mm_insert_epi32 (_mm_setzero_si128(), a4, 0); - b = _mm_insert_epi32 (a, b4, 0); - - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1fULL)); - - /* Do the initial multiply */ - - result = _mm_clmulepi64_si128 (a, b, 0); - - /* Ben/JSP: Do prim_poly reduction once. We are guaranteed that we will only - have to do the reduction only once, because (w-2)/z == 1. Where - z is equal to the number of zeros after the leading 1. - - _mm_clmulepi64_si128 is the carryless multiply operation. Here - _mm_srli_epi64 shifts the result to the right by 4 bits. This allows - us to multiply the prim_poly by the leading bits of the result. We - then xor the result of that operation back with the result. */ - - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_epi64 (result, 4), 0); - result = _mm_xor_si128 (result, w); - - /* Extracts 32 bit value from result. 
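   A worked example of the single-pass reduction, using the w = 4 field
   with polynomial 0x13 (x^4 + x + 1): the carryless product of 0xe and
   0xd is 0x46, a 7-bit value. Its overflow nibble is 0x46 >> 4 = 0x4;
   multiplying that carrylessly by 0x13 gives 0x4c, and
   0x46 ^ 0x4c = 0x0a, which already fits in 4 bits, so a second pass is
   unnecessary. (0xe * 0xd = 0xa agrees with this field's log tables.)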
*/ - - rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); -#endif - return rv; -} - -static -void -gf_w4_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int - xor) -{ - gf_region_data rd; - uint8_t *s8; - uint8_t *d8; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); - gf_do_initial_region_alignment(&rd); - - s8 = (uint8_t *) rd.s_start; - d8 = (uint8_t *) rd.d_start; - - if (xor) { - while (d8 < ((uint8_t *) rd.d_top)) { - *d8 ^= (gf->multiply.w32(gf, val, (*s8 & 0xf)) | - ((gf->multiply.w32(gf, val, (*s8 >> 4))) << 4)); - d8++; - s8++; - } - } else { - while (d8 < ((uint8_t *) rd.d_top)) { - *d8 = (gf->multiply.w32(gf, val, (*s8 & 0xf)) | - ((gf->multiply.w32(gf, val, (*s8 >> 4))) << 4)); - d8++; - s8++; - } - } - gf_do_final_region_alignment(&rd); -} - -/* ------------------------------------------------------------ - IMPLEMENTATION: LOG_TABLE: - - JSP: This is a basic log-antilog implementation. - I'm not going to spend any time optimizing it because the - other techniques are faster for both single and region - operations. - */ - -static -inline -gf_val_32_t -gf_w4_log_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - struct gf_logtable_data *ltd; - - ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private; - return (a == 0 || b == 0) ? 0 : ltd->antilog_tbl[(unsigned)(ltd->log_tbl[a] + ltd->log_tbl[b])]; -} - -static -inline -gf_val_32_t -gf_w4_log_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - int log_sum = 0; - struct gf_logtable_data *ltd; - - if (a == 0 || b == 0) return 0; - ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private; - - log_sum = ltd->log_tbl[a] - ltd->log_tbl[b]; - return (ltd->antilog_tbl_div[log_sum]); -} - -static -void -gf_w4_log_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - int i; - uint8_t lv, b, c; - uint8_t *s8, *d8; - - struct gf_logtable_data *ltd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private; - s8 = (uint8_t *) src; - d8 = (uint8_t *) dest; - - lv = ltd->log_tbl[val]; - - for (i = 0; i < bytes; i++) { - c = (xor) ? d8[i] : 0; - b = (s8[i] >> GF_FIELD_WIDTH); - c ^= (b == 0) ? 0 : (ltd->antilog_tbl[lv + ltd->log_tbl[b]] << GF_FIELD_WIDTH); - b = (s8[i] & 0xf); - c ^= (b == 0) ? 
0 : ltd->antilog_tbl[lv + ltd->log_tbl[b]]; - d8[i] = c; - } -} - -static -int gf_w4_log_init(gf_t *gf) -{ - gf_internal_t *h; - struct gf_logtable_data *ltd; - int i, b; - - h = (gf_internal_t *) gf->scratch; - ltd = h->private; - - for (i = 0; i < GF_FIELD_SIZE; i++) - ltd->log_tbl[i]=0; - - ltd->antilog_tbl_div = ltd->antilog_tbl + (GF_FIELD_SIZE-1); - b = 1; - i = 0; - do { - if (ltd->log_tbl[b] != 0 && i != 0) { - fprintf(stderr, "Cannot construct log table: Polynomial is not primitive.\n\n"); - return 0; - } - ltd->log_tbl[b] = i; - ltd->antilog_tbl[i] = b; - ltd->antilog_tbl[i+GF_FIELD_SIZE-1] = b; - b <<= 1; - i++; - if (b & GF_FIELD_SIZE) b = b ^ h->prim_poly; - } while (b != 1); - - if (i != GF_FIELD_SIZE - 1) { - _gf_errno = GF_E_LOGPOLY; - return 0; - } - - gf->inverse.w32 = gf_w4_inverse_from_divide; - gf->divide.w32 = gf_w4_log_divide; - gf->multiply.w32 = gf_w4_log_multiply; - gf->multiply_region.w32 = gf_w4_log_multiply_region; - return 1; -} - -/* ------------------------------------------------------------ - IMPLEMENTATION: SINGLE TABLE: JSP. - */ - -static -inline -gf_val_32_t -gf_w4_single_table_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - struct gf_single_table_data *std; - - std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private; - return std->mult[a][b]; -} - -static -inline -gf_val_32_t -gf_w4_single_table_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - struct gf_single_table_data *std; - - std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private; - return std->div[a][b]; -} - -static -void -gf_w4_single_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - int i; - uint8_t b, c; - uint8_t *s8, *d8; - - struct gf_single_table_data *std; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private; - s8 = (uint8_t *) src; - d8 = (uint8_t *) dest; - - for (i = 0; i < bytes; i++) { - c = (xor) ? 
d8[i] : 0; - b = (s8[i] >> GF_FIELD_WIDTH); - c ^= (std->mult[val][b] << GF_FIELD_WIDTH); - b = (s8[i] & 0xf); - c ^= (std->mult[val][b]); - d8[i] = c; - } -} - -#define MM_PRINT(s, r) { uint8_t blah[16]; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (i = 0; i < 16; i++) printf(" %02x", blah[i]); printf("\n"); } - -#ifdef INTEL_SSSE3 -static -void -gf_w4_single_table_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - gf_region_data rd; - uint8_t *base, *sptr, *dptr, *top; - __m128i tl, loset, r, va, th; - - struct gf_single_table_data *std; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); - - std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private; - base = (uint8_t *) std->mult; - base += (val << GF_FIELD_WIDTH); - - gf_do_initial_region_alignment(&rd); - - tl = _mm_loadu_si128((__m128i *)base); - th = _mm_slli_epi64(tl, 4); - loset = _mm_set1_epi8 (0x0f); - - sptr = rd.s_start; - dptr = rd.d_start; - top = rd.s_top; - - while (sptr < (uint8_t *) top) { - va = _mm_load_si128 ((__m128i *)(sptr)); - r = _mm_and_si128 (loset, va); - r = _mm_shuffle_epi8 (tl, r); - va = _mm_srli_epi64 (va, 4); - va = _mm_and_si128 (loset, va); - va = _mm_shuffle_epi8 (th, va); - r = _mm_xor_si128 (r, va); - va = (xor) ? _mm_load_si128 ((__m128i *)(dptr)) : _mm_setzero_si128(); - r = _mm_xor_si128 (r, va); - _mm_store_si128 ((__m128i *)(dptr), r); - dptr += 16; - sptr += 16; - } - gf_do_final_region_alignment(&rd); - -} -#endif - -static -int gf_w4_single_table_init(gf_t *gf) -{ - gf_internal_t *h; - struct gf_single_table_data *std; - int a, b, prod; - - - h = (gf_internal_t *) gf->scratch; - std = (struct gf_single_table_data *)h->private; - - bzero(std->mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); - bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); - - for (a = 1; a < GF_FIELD_SIZE; a++) { - for (b = 1; b < GF_FIELD_SIZE; b++) { - prod = gf_w4_shift_multiply(gf, a, b); - std->mult[a][b] = prod; - std->div[prod][b] = a; - } - } - - gf->inverse.w32 = NULL; - gf->divide.w32 = gf_w4_single_table_divide; - gf->multiply.w32 = gf_w4_single_table_multiply; - #ifdef INTEL_SSSE3 - if(h->region_type & (GF_REGION_NOSSE | GF_REGION_CAUCHY)) - gf->multiply_region.w32 = gf_w4_single_table_multiply_region; - else - gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region; - #else - gf->multiply_region.w32 = gf_w4_single_table_multiply_region; - if (h->region_type & GF_REGION_SSE) return 0; - #endif - - return 1; -} - -/* ------------------------------------------------------------ - IMPLEMENTATION: DOUBLE TABLE: JSP. 
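   The trick here is that the multiplication table for each value is
   indexed by a full byte -- two 4-bit symbols at once -- and each entry
   packs both 4-bit products into one byte: gf_w4_double_table_init
   below fills mult[a][(b << 4) | c] with (mult[a][b] << 4) | mult[a][c].
   Region multiplication then costs a single table lookup per source
   byte.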
- */ - -static -inline -gf_val_32_t -gf_w4_double_table_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - struct gf_double_table_data *std; - - std = (struct gf_double_table_data *) ((gf_internal_t *) (gf->scratch))->private; - return std->mult[a][b]; -} - -static -inline -gf_val_32_t -gf_w4_double_table_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - struct gf_double_table_data *std; - - std = (struct gf_double_table_data *) ((gf_internal_t *) (gf->scratch))->private; - return std->div[a][b]; -} - -static -void -gf_w4_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - int i; - uint8_t *s8, *d8, *base; - gf_region_data rd; - struct gf_double_table_data *std; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); - - std = (struct gf_double_table_data *) ((gf_internal_t *) (gf->scratch))->private; - s8 = (uint8_t *) src; - d8 = (uint8_t *) dest; - base = (uint8_t *) std->mult; - base += (val << GF_DOUBLE_WIDTH); - - if (xor) { - for (i = 0; i < bytes; i++) d8[i] ^= base[s8[i]]; - } else { - for (i = 0; i < bytes; i++) d8[i] = base[s8[i]]; - } -} - -static -int gf_w4_double_table_init(gf_t *gf) -{ - gf_internal_t *h; - struct gf_double_table_data *std; - int a, b, c, prod, ab; - uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE]; - - h = (gf_internal_t *) gf->scratch; - std = (struct gf_double_table_data *)h->private; - - bzero(mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); - bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); - - for (a = 1; a < GF_FIELD_SIZE; a++) { - for (b = 1; b < GF_FIELD_SIZE; b++) { - prod = gf_w4_shift_multiply(gf, a, b); - mult[a][b] = prod; - std->div[prod][b] = a; - } - } - bzero(std->mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE * GF_FIELD_SIZE); - for (a = 0; a < GF_FIELD_SIZE; a++) { - for (b = 0; b < GF_FIELD_SIZE; b++) { - ab = mult[a][b]; - for (c = 0; c < GF_FIELD_SIZE; c++) { - std->mult[a][(b << 4) | c] = ((ab << 4) | mult[a][c]); - } - } - } - - gf->inverse.w32 = NULL; - gf->divide.w32 = gf_w4_double_table_divide; - gf->multiply.w32 = gf_w4_double_table_multiply; - gf->multiply_region.w32 = gf_w4_double_table_multiply_region; - return 1; -} - - -static -inline -gf_val_32_t -gf_w4_quad_table_lazy_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - struct gf_quad_table_lazy_data *std; - - std = (struct gf_quad_table_lazy_data *) ((gf_internal_t *) (gf->scratch))->private; - return std->div[a][b]; -} - -static -inline -gf_val_32_t -gf_w4_quad_table_lazy_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - struct gf_quad_table_lazy_data *std; - - std = (struct gf_quad_table_lazy_data *) ((gf_internal_t *) (gf->scratch))->private; - return std->smult[a][b]; -} - -static -inline -gf_val_32_t -gf_w4_quad_table_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - struct gf_quad_table_data *std; - - std = (struct gf_quad_table_data *) ((gf_internal_t *) (gf->scratch))->private; - return std->div[a][b]; -} - -static -inline -gf_val_32_t -gf_w4_quad_table_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - struct gf_quad_table_data *std; - uint16_t v; - - std = (struct gf_quad_table_data *) ((gf_internal_t *) (gf->scratch))->private; - v = std->mult[a][b]; - return v; -} - -static -void -gf_w4_quad_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - uint16_t *base; - gf_region_data rd; - 
struct gf_quad_table_data *std; - struct gf_quad_table_lazy_data *ltd; - gf_internal_t *h; - int a, b, c, d, va, vb, vc, vd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - h = (gf_internal_t *) (gf->scratch); - if (h->region_type & GF_REGION_LAZY) { - ltd = (struct gf_quad_table_lazy_data *) ((gf_internal_t *) (gf->scratch))->private; - base = ltd->mult; - for (a = 0; a < 16; a++) { - va = (ltd->smult[val][a] << 12); - for (b = 0; b < 16; b++) { - vb = (ltd->smult[val][b] << 8); - for (c = 0; c < 16; c++) { - vc = (ltd->smult[val][c] << 4); - for (d = 0; d < 16; d++) { - vd = ltd->smult[val][d]; - base[(a << 12) | (b << 8) | (c << 4) | d ] = (va | vb | vc | vd); - } - } - } - } - } else { - std = (struct gf_quad_table_data *) ((gf_internal_t *) (gf->scratch))->private; - base = &(std->mult[val][0]); - } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); - gf_do_initial_region_alignment(&rd); - gf_two_byte_region_table_multiply(&rd, base); - gf_do_final_region_alignment(&rd); -} - -static -int gf_w4_quad_table_init(gf_t *gf) -{ - gf_internal_t *h; - struct gf_quad_table_data *std; - int prod, val, a, b, c, d, va, vb, vc, vd; - uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE]; - - h = (gf_internal_t *) gf->scratch; - std = (struct gf_quad_table_data *)h->private; - - bzero(mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); - bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); - - for (a = 1; a < GF_FIELD_SIZE; a++) { - for (b = 1; b < GF_FIELD_SIZE; b++) { - prod = gf_w4_shift_multiply(gf, a, b); - mult[a][b] = prod; - std->div[prod][b] = a; - } - } - - for (val = 0; val < 16; val++) { - for (a = 0; a < 16; a++) { - va = (mult[val][a] << 12); - for (b = 0; b < 16; b++) { - vb = (mult[val][b] << 8); - for (c = 0; c < 16; c++) { - vc = (mult[val][c] << 4); - for (d = 0; d < 16; d++) { - vd = mult[val][d]; - std->mult[val][(a << 12) | (b << 8) | (c << 4) | d ] = (va | vb | vc | vd); - } - } - } - } - } - - gf->inverse.w32 = NULL; - gf->divide.w32 = gf_w4_quad_table_divide; - gf->multiply.w32 = gf_w4_quad_table_multiply; - gf->multiply_region.w32 = gf_w4_quad_table_multiply_region; - return 1; -} -static -int gf_w4_quad_table_lazy_init(gf_t *gf) -{ - gf_internal_t *h; - struct gf_quad_table_lazy_data *std; - int a, b, prod, loga, logb; - uint8_t log_tbl[GF_FIELD_SIZE]; - uint8_t antilog_tbl[GF_FIELD_SIZE*2]; - - h = (gf_internal_t *) gf->scratch; - std = (struct gf_quad_table_lazy_data *)h->private; - - b = 1; - for (a = 0; a < GF_MULT_GROUP_SIZE; a++) { - log_tbl[b] = a; - antilog_tbl[a] = b; - antilog_tbl[a+GF_MULT_GROUP_SIZE] = b; - b <<= 1; - if (b & GF_FIELD_SIZE) { - b = b ^ h->prim_poly; - } - } - - bzero(std->smult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); - bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); - - for (a = 1; a < GF_FIELD_SIZE; a++) { - loga = log_tbl[a]; - for (b = 1; b < GF_FIELD_SIZE; b++) { - logb = log_tbl[b]; - prod = antilog_tbl[loga+logb]; - std->smult[a][b] = prod; - std->div[prod][b] = a; - } - } - - gf->inverse.w32 = NULL; - gf->divide.w32 = gf_w4_quad_table_lazy_divide; - gf->multiply.w32 = gf_w4_quad_table_lazy_multiply; - gf->multiply_region.w32 = gf_w4_quad_table_multiply_region; - return 1; -} - -static -int gf_w4_table_init(gf_t *gf) -{ - int rt; - gf_internal_t *h; - int issse3 = 0; - -#ifdef INTEL_SSSE3 - issse3 = 1; -#endif - - h = (gf_internal_t *) gf->scratch; - rt = (h->region_type); - - if (h->mult_type == 
GF_MULT_DEFAULT && !issse3) rt |= GF_REGION_DOUBLE_TABLE; - - if (rt & GF_REGION_DOUBLE_TABLE) { - return gf_w4_double_table_init(gf); - } else if (rt & GF_REGION_QUAD_TABLE) { - if (rt & GF_REGION_LAZY) { - return gf_w4_quad_table_lazy_init(gf); - } else { - return gf_w4_quad_table_init(gf); - } - return gf_w4_double_table_init(gf); - } else { - return gf_w4_single_table_init(gf); - } - return 0; -} - -/* ------------------------------------------------------------ - JSP: GF_MULT_BYTWO_p and _b: See the paper. -*/ - -static -inline -gf_val_32_t -gf_w4_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - uint32_t prod, pp, pmask, amask; - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - - prod = 0; - pmask = 0x8; - amask = 0x8; - - while (amask != 0) { - if (prod & pmask) { - prod = ((prod << 1) ^ pp); - } else { - prod <<= 1; - } - if (a & amask) prod ^= b; - amask >>= 1; - } - return prod; -} - -static -inline -gf_val_32_t -gf_w4_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - uint32_t prod, pp, bmask; - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - prod = 0; - bmask = 0x8; - - while (1) { - if (a & 1) prod ^= b; - a >>= 1; - if (a == 0) return prod; - if (b & bmask) { - b = ((b << 1) ^ pp); - } else { - b <<= 1; - } - } -} - -static -void -gf_w4_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - uint64_t *s64, *d64, t1, t2, ta, prod, amask; - gf_region_data rd; - struct gf_bytwo_data *btd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); - gf_do_initial_region_alignment(&rd); - - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - - if (xor) { - while (s64 < (uint64_t *) rd.s_top) { - prod = 0; - amask = 0x8; - ta = *s64; - while (amask != 0) { - AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); - if (val & amask) prod ^= ta; - amask >>= 1; - } - *d64 ^= prod; - d64++; - s64++; - } - } else { - while (s64 < (uint64_t *) rd.s_top) { - prod = 0; - amask = 0x8; - ta = *s64; - while (amask != 0) { - AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); - if (val & amask) prod ^= ta; - amask >>= 1; - } - *d64 = prod; - d64++; - s64++; - } - } - gf_do_final_region_alignment(&rd); -} - -#define BYTWO_P_ONESTEP {\ - SSE_AB2(pp, m1, prod, t1, t2); \ - t1 = _mm_and_si128(v, one); \ - t1 = _mm_sub_epi8(t1, one); \ - t1 = _mm_and_si128(t1, ta); \ - prod = _mm_xor_si128(prod, t1); \ - v = _mm_srli_epi64(v, 1); } - -#ifdef INTEL_SSE2 -static -void -gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - int i; - uint8_t *s8, *d8; - uint8_t vrev; - __m128i pp, m1, ta, prod, t1, t2, tp, one, v; - struct gf_bytwo_data *btd; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); - gf_do_initial_region_alignment(&rd); - - vrev = 0; - for (i = 0; i < 4; i++) { - vrev <<= 1; - if (!(val & (1 << i))) vrev |= 1; - } - - s8 = (uint8_t *) rd.s_start; - d8 = (uint8_t *) rd.d_start; - - pp = _mm_set1_epi8(btd->prim_poly&0xff); - m1 = 
_mm_set1_epi8((btd->mask1)&0xff); - one = _mm_set1_epi8(1); - - while (d8 < (uint8_t *) rd.d_top) { - prod = _mm_setzero_si128(); - v = _mm_set1_epi8(vrev); - ta = _mm_load_si128((__m128i *) s8); - tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8); - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp)); - d8 += 16; - s8 += 16; - } - gf_do_final_region_alignment(&rd); -} -#endif - -/* -static -void -gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ -#ifdef INTEL_SSE2 - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; - struct gf_bytwo_data *btd; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); - gf_do_initial_region_alignment(&rd); - - s8 = (uint8_t *) rd.s_start; - d8 = (uint8_t *) rd.d_start; - - btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; - - pp = _mm_set1_epi8(btd->prim_poly&0xff); - m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); - - if (xor) { - while (d8 < (uint8_t *) rd.d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - vb = _mm_load_si128 ((__m128i *)(d8)); - tb = val; - while (1) { - if (tb & 1) vb = _mm_xor_si128(vb, va); - tb >>= 1; - if (tb == 0) break; - SSE_AB2(pp, m1, m2, va, t1, t2); - } - _mm_store_si128((__m128i *)d8, vb); - d8 += 16; - s8 += 16; - } - } else { - while (d8 < (uint8_t *) rd.d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - vb = _mm_setzero_si128 (); - tb = val; - while (1) { - if (tb & 1) vb = _mm_xor_si128(vb, va); - tb >>= 1; - if (tb == 0) break; - t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); - t2 = _mm_and_si128(va, m2); - t2 = _mm_sub_epi64 ( - _mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); - va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); - } - _mm_store_si128((__m128i *)d8, vb); - d8 += 16; - s8 += 16; - } - } - gf_do_final_region_alignment(&rd); -#endif -} -*/ - -#ifdef INTEL_SSE2 -static -void -gf_w4_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) -{ - uint8_t *d8, *s8; - __m128i pp, m1, t1, t2, va; - - s8 = (uint8_t *) rd->s_start; - d8 = (uint8_t *) rd->d_start; - - pp = _mm_set1_epi8(btd->prim_poly&0xff); - m1 = _mm_set1_epi8((btd->mask1)&0xff); - - while (d8 < (uint8_t *) rd->d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, va, t1, t2); - _mm_store_si128((__m128i *)d8, va); - d8 += 16; - s8 += 16; - } -} -#endif - -#ifdef INTEL_SSE2 -static -void -gf_w4_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_bytwo_data *btd) -{ - uint8_t *d8, *s8; - __m128i pp, m1, t1, t2, va, vb; - - s8 = (uint8_t *) rd->s_start; - d8 = (uint8_t *) rd->d_start; - - pp = _mm_set1_epi8(btd->prim_poly&0xff); - m1 = _mm_set1_epi8((btd->mask1)&0xff); - - while (d8 < (uint8_t *) rd->d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, va, t1, t2); - vb = _mm_load_si128 ((__m128i *)(d8)); - vb = _mm_xor_si128(vb, va); - _mm_store_si128((__m128i *)d8, vb); - d8 += 16; - s8 += 16; - } -} -#endif - -#ifdef INTEL_SSE2 -static -void -gf_w4_bytwo_b_sse_region_4_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) -{ - uint8_t *d8, *s8; - __m128i pp, m1, t1, t2, va; - - s8 = (uint8_t *) rd->s_start; - d8 = (uint8_t *) rd->d_start; - - pp = _mm_set1_epi8(btd->prim_poly&0xff); - m1 = 
_mm_set1_epi8((btd->mask1)&0xff); - - while (d8 < (uint8_t *) rd->d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, va, t1, t2); - SSE_AB2(pp, m1, va, t1, t2); - _mm_store_si128((__m128i *)d8, va); - d8 += 16; - s8 += 16; - } -} -#endif - -#ifdef INTEL_SSE2 -static -void -gf_w4_bytwo_b_sse_region_4_xor(gf_region_data *rd, struct gf_bytwo_data *btd) -{ - uint8_t *d8, *s8; - __m128i pp, m1, t1, t2, va, vb; - - s8 = (uint8_t *) rd->s_start; - d8 = (uint8_t *) rd->d_start; - - pp = _mm_set1_epi8(btd->prim_poly&0xff); - m1 = _mm_set1_epi8((btd->mask1)&0xff); - - while (d8 < (uint8_t *) rd->d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, va, t1, t2); - SSE_AB2(pp, m1, va, t1, t2); - vb = _mm_load_si128 ((__m128i *)(d8)); - vb = _mm_xor_si128(vb, va); - _mm_store_si128((__m128i *)d8, vb); - d8 += 16; - s8 += 16; - } -} -#endif - - -#ifdef INTEL_SSE2 -static -void -gf_w4_bytwo_b_sse_region_3_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) -{ - uint8_t *d8, *s8; - __m128i pp, m1, t1, t2, va, vb; - - s8 = (uint8_t *) rd->s_start; - d8 = (uint8_t *) rd->d_start; - - pp = _mm_set1_epi8(btd->prim_poly&0xff); - m1 = _mm_set1_epi8((btd->mask1)&0xff); - - while (d8 < (uint8_t *) rd->d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - vb = va; - SSE_AB2(pp, m1, va, t1, t2); - va = _mm_xor_si128(va, vb); - _mm_store_si128((__m128i *)d8, va); - d8 += 16; - s8 += 16; - } -} -#endif - -#ifdef INTEL_SSE2 -static -void -gf_w4_bytwo_b_sse_region_3_xor(gf_region_data *rd, struct gf_bytwo_data *btd) -{ - uint8_t *d8, *s8; - __m128i pp, m1, t1, t2, va, vb; - - s8 = (uint8_t *) rd->s_start; - d8 = (uint8_t *) rd->d_start; - - pp = _mm_set1_epi8(btd->prim_poly&0xff); - m1 = _mm_set1_epi8((btd->mask1)&0xff); - - while (d8 < (uint8_t *) rd->d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va); - SSE_AB2(pp, m1, va, t1, t2); - vb = _mm_xor_si128(vb, va); - _mm_store_si128((__m128i *)d8, vb); - d8 += 16; - s8 += 16; - } -} -#endif - -#ifdef INTEL_SSE2 -static -void -gf_w4_bytwo_b_sse_region_5_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) -{ - uint8_t *d8, *s8; - __m128i pp, m1, t1, t2, va, vb; - - s8 = (uint8_t *) rd->s_start; - d8 = (uint8_t *) rd->d_start; - - pp = _mm_set1_epi8(btd->prim_poly&0xff); - m1 = _mm_set1_epi8((btd->mask1)&0xff); - - while (d8 < (uint8_t *) rd->d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - vb = va; - SSE_AB2(pp, m1, va, t1, t2); - SSE_AB2(pp, m1, va, t1, t2); - va = _mm_xor_si128(va, vb); - _mm_store_si128((__m128i *)d8, va); - d8 += 16; - s8 += 16; - } -} -#endif - -#ifdef INTEL_SSE2 -static -void -gf_w4_bytwo_b_sse_region_5_xor(gf_region_data *rd, struct gf_bytwo_data *btd) -{ - uint8_t *d8, *s8; - __m128i pp, m1, t1, t2, va, vb; - - s8 = (uint8_t *) rd->s_start; - d8 = (uint8_t *) rd->d_start; - - pp = _mm_set1_epi8(btd->prim_poly&0xff); - m1 = _mm_set1_epi8((btd->mask1)&0xff); - - while (d8 < (uint8_t *) rd->d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va); - SSE_AB2(pp, m1, va, t1, t2); - SSE_AB2(pp, m1, va, t1, t2); - vb = _mm_xor_si128(vb, va); - _mm_store_si128((__m128i *)d8, vb); - d8 += 16; - s8 += 16; - } -} -#endif - -#ifdef INTEL_SSE2 -static -void -gf_w4_bytwo_b_sse_region_7_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) -{ - uint8_t *d8, *s8; - __m128i pp, m1, t1, t2, va, vb; - - s8 = (uint8_t *) rd->s_start; - d8 = (uint8_t *) rd->d_start; - - pp = _mm_set1_epi8(btd->prim_poly&0xff); - m1 = 
_mm_set1_epi8((btd->mask1)&0xff); - - while (d8 < (uint8_t *) rd->d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - vb = va; - SSE_AB2(pp, m1, va, t1, t2); - vb = _mm_xor_si128(va, vb); - SSE_AB2(pp, m1, va, t1, t2); - va = _mm_xor_si128(va, vb); - _mm_store_si128((__m128i *)d8, va); - d8 += 16; - s8 += 16; - } -} -#endif - -#ifdef INTEL_SSE2 -static -void -gf_w4_bytwo_b_sse_region_7_xor(gf_region_data *rd, struct gf_bytwo_data *btd) -{ - uint8_t *d8, *s8; - __m128i pp, m1, t1, t2, va, vb; - - s8 = (uint8_t *) rd->s_start; - d8 = (uint8_t *) rd->d_start; - - pp = _mm_set1_epi8(btd->prim_poly&0xff); - m1 = _mm_set1_epi8((btd->mask1)&0xff); - - while (d8 < (uint8_t *) rd->d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va); - SSE_AB2(pp, m1, va, t1, t2); - vb = _mm_xor_si128(vb, va); - SSE_AB2(pp, m1, va, t1, t2); - vb = _mm_xor_si128(vb, va); - _mm_store_si128((__m128i *)d8, vb); - d8 += 16; - s8 += 16; - } -} -#endif - -#ifdef INTEL_SSE2 -static -void -gf_w4_bytwo_b_sse_region_6_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) -{ - uint8_t *d8, *s8; - __m128i pp, m1, t1, t2, va, vb; - - s8 = (uint8_t *) rd->s_start; - d8 = (uint8_t *) rd->d_start; - - pp = _mm_set1_epi8(btd->prim_poly&0xff); - m1 = _mm_set1_epi8((btd->mask1)&0xff); - - while (d8 < (uint8_t *) rd->d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, va, t1, t2); - vb = va; - SSE_AB2(pp, m1, va, t1, t2); - va = _mm_xor_si128(va, vb); - _mm_store_si128((__m128i *)d8, va); - d8 += 16; - s8 += 16; - } -} -#endif - -#ifdef INTEL_SSE2 -static -void -gf_w4_bytwo_b_sse_region_6_xor(gf_region_data *rd, struct gf_bytwo_data *btd) -{ - uint8_t *d8, *s8; - __m128i pp, m1, t1, t2, va, vb; - - s8 = (uint8_t *) rd->s_start; - d8 = (uint8_t *) rd->d_start; - - pp = _mm_set1_epi8(btd->prim_poly&0xff); - m1 = _mm_set1_epi8((btd->mask1)&0xff); - - while (d8 < (uint8_t *) rd->d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, va, t1, t2); - vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va); - SSE_AB2(pp, m1, va, t1, t2); - vb = _mm_xor_si128(vb, va); - _mm_store_si128((__m128i *)d8, vb); - d8 += 16; - s8 += 16; - } -} -#endif - -#ifdef INTEL_SSE2 -static -void -gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; - struct gf_bytwo_data *btd; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); - gf_do_initial_region_alignment(&rd); - - s8 = (uint8_t *) rd.s_start; - d8 = (uint8_t *) rd.d_start; - - btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; - - switch (val) { - case 2: - if (!xor) { - gf_w4_bytwo_b_sse_region_2_noxor(&rd, btd); - } else { - gf_w4_bytwo_b_sse_region_2_xor(&rd, btd); - } - gf_do_final_region_alignment(&rd); - return; - case 3: - if (!xor) { - gf_w4_bytwo_b_sse_region_3_noxor(&rd, btd); - } else { - gf_w4_bytwo_b_sse_region_3_xor(&rd, btd); - } - gf_do_final_region_alignment(&rd); - return; - case 4: - if (!xor) { - gf_w4_bytwo_b_sse_region_4_noxor(&rd, btd); - } else { - gf_w4_bytwo_b_sse_region_4_xor(&rd, btd); - } - gf_do_final_region_alignment(&rd); - return; - case 5: - if (!xor) { - gf_w4_bytwo_b_sse_region_5_noxor(&rd, btd); - } else { - gf_w4_bytwo_b_sse_region_5_xor(&rd, btd); - } - 
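For reference: every region_N helper above is an unrolled instance of the same bytwo_b idea, expressing multiplication by a small constant as a fixed schedule of doublings and XORs over packed 4-bit elements. A minimal portable sketch, assuming the replicated masks and polynomial that gf_w4_bytwo_init() builds for prim_poly 0x13; names are illustrative:

#include <stdint.h>
#include <stdio.h>

/* Double every packed 4-bit element of a 64-bit word in GF(16),
 * reducing by the primitive polynomial.  Scalar analogue of the
 * SSE_AB2 step used above; the replicated constants are the ones an
 * init like gf_w4_bytwo_init() would produce. */
static uint64_t ab2_w4(uint64_t a)
{
  const uint64_t pp = 0x3333333333333333ULL; /* low 4 bits of poly 0x13, per nibble */
  const uint64_t m1 = 0xeeeeeeeeeeeeeeeeULL; /* keeps shifted bits inside each nibble */
  const uint64_t m2 = 0x8888888888888888ULL; /* isolates each nibble's high bit */
  uint64_t t1 = (a << 1) & m1;
  uint64_t t2 = a & m2;
  t2 = (t2 << 1) - (t2 >> 3);                /* high bit -> 0xf mask in that nibble */
  return t1 ^ (t2 & pp);
}

int main(void)
{
  uint64_t a  = 0x7;           /* one element, value 7 */
  uint64_t x2 = ab2_w4(a);     /* 7*2 = 0xe: the region_2 schedule */
  uint64_t x3 = x2 ^ a;        /* 7*3 = 7*2 ^ 7 = 0x9: the region_3 schedule */
  printf("%llx %llx\n", (unsigned long long)x2, (unsigned long long)x3);
  return 0;
}

region_4 is two doublings, region_5 two doublings XORed with the source, and so on, which is exactly what the switch below dispatches on.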
gf_do_final_region_alignment(&rd); - return; - case 6: - if (!xor) { - gf_w4_bytwo_b_sse_region_6_noxor(&rd, btd); - } else { - gf_w4_bytwo_b_sse_region_6_xor(&rd, btd); - } - gf_do_final_region_alignment(&rd); - return; - case 7: - if (!xor) { - gf_w4_bytwo_b_sse_region_7_noxor(&rd, btd); - } else { - gf_w4_bytwo_b_sse_region_7_xor(&rd, btd); - } - gf_do_final_region_alignment(&rd); - return; - } - - pp = _mm_set1_epi8(btd->prim_poly&0xff); - m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); - - if (xor) { - while (d8 < (uint8_t *) rd.d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - vb = _mm_load_si128 ((__m128i *)(d8)); - tb = val; - while (1) { - if (tb & 1) vb = _mm_xor_si128(vb, va); - tb >>= 1; - if (tb == 0) break; - SSE_AB2(pp, m1, va, t1, t2); - } - _mm_store_si128((__m128i *)d8, vb); - d8 += 16; - s8 += 16; - } - } else { - while (d8 < (uint8_t *) rd.d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - vb = _mm_setzero_si128 (); - tb = val; - while (1) { - if (tb & 1) vb = _mm_xor_si128(vb, va); - tb >>= 1; - if (tb == 0) break; - t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); - t2 = _mm_and_si128(va, m2); - t2 = _mm_sub_epi64 ( - _mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); - va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); - } - _mm_store_si128((__m128i *)d8, vb); - d8 += 16; - s8 += 16; - } - } - gf_do_final_region_alignment(&rd); -} -#endif - -static -void -gf_w4_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - uint64_t *s64, *d64, t1, t2, ta, tb, prod; - struct gf_bytwo_data *btd; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); - gf_do_initial_region_alignment(&rd); - - btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - - switch (val) { - case 1: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - *d64 ^= *s64; - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - *d64 = *s64; - d64++; - s64++; - } - } - break; - case 2: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= ta; - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta; - d64++; - s64++; - } - } - break; - case 3: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 4: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= ta; - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta; - d64++; - s64++; - } - } - break; - case 5: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, 
btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta ^ prod; - d64++; - s64++; - } - } - case 6: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta ^ prod; - d64++; - s64++; - } - } - case 7: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta ^ prod; - d64++; - s64++; - } - } - break; - case 8: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= ta; - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta; - d64++; - s64++; - } - } - break; - case 9: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 10: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 11: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } 
- } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 12: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 13: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 14: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 15: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - default: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - prod = *d64 ; - ta = *s64; - tb = val; - while (1) { - if (tb & 1) prod ^= ta; - tb >>= 1; - if (tb == 0) break; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - } - *d64 = prod; - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - prod = 0 ; - ta = *s64; - tb = val; - while (1) { - if (tb & 1) prod ^= ta; - tb >>= 1; - if (tb == 0) break; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - } - *d64 = prod; - d64++; - s64++; - } - } - break; - } - gf_do_final_region_alignment(&rd); -} - -static -int 
gf_w4_bytwo_init(gf_t *gf) -{ - gf_internal_t *h; - uint64_t ip, m1, m2; - struct gf_bytwo_data *btd; - - h = (gf_internal_t *) gf->scratch; - btd = (struct gf_bytwo_data *) (h->private); - ip = h->prim_poly & 0xf; - m1 = 0xe; - m2 = 0x8; - btd->prim_poly = 0; - btd->mask1 = 0; - btd->mask2 = 0; - - while (ip != 0) { - btd->prim_poly |= ip; - btd->mask1 |= m1; - btd->mask2 |= m2; - ip <<= GF_FIELD_WIDTH; - m1 <<= GF_FIELD_WIDTH; - m2 <<= GF_FIELD_WIDTH; - } - - if (h->mult_type == GF_MULT_BYTWO_p) { - gf->multiply.w32 = gf_w4_bytwo_p_multiply; - #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSSE) - gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region; - else - gf->multiply_region.w32 = gf_w4_bytwo_p_sse_multiply_region; - #else - gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region; - if (h->region_type & GF_REGION_SSE) - return 0; - #endif - } else { - gf->multiply.w32 = gf_w4_bytwo_b_multiply; - #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSSE) - gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region; - else - gf->multiply_region.w32 = gf_w4_bytwo_b_sse_multiply_region; - #else - gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region; - if (h->region_type & GF_REGION_SSE) - return 0; - #endif - } - return 1; -} - - -static -int gf_w4_cfm_init(gf_t *gf) -{ -#if defined(INTEL_SSE4_PCLMUL) - gf->multiply.w32 = gf_w4_clm_multiply; - return 1; -#endif - return 0; -} - -static -int gf_w4_shift_init(gf_t *gf) -{ - gf->multiply.w32 = gf_w4_shift_multiply; - return 1; -} - -/* JSP: I'm putting all error-checking into gf_error_check(), so you don't - have to do error checking in scratch_size or in init */ - -int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) -{ - int issse3 = 0; - -#ifdef INTEL_SSSE3 - issse3 = 1; -#endif - - switch(mult_type) - { - case GF_MULT_BYTWO_p: - case GF_MULT_BYTWO_b: - return sizeof(gf_internal_t) + sizeof(struct gf_bytwo_data); - break; - case GF_MULT_DEFAULT: - case GF_MULT_TABLE: - if (region_type == GF_REGION_CAUCHY) { - return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64; - } - - if (mult_type == GF_MULT_DEFAULT && !issse3) region_type = GF_REGION_DOUBLE_TABLE; - - if (region_type & GF_REGION_DOUBLE_TABLE) { - return sizeof(gf_internal_t) + sizeof(struct gf_double_table_data) + 64; - } else if (region_type & GF_REGION_QUAD_TABLE) { - if ((region_type & GF_REGION_LAZY) == 0) { - return sizeof(gf_internal_t) + sizeof(struct gf_quad_table_data) + 64; - } else { - return sizeof(gf_internal_t) + sizeof(struct gf_quad_table_lazy_data) + 64; - } - } else { - return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64; - } - break; - - case GF_MULT_LOG_TABLE: - return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64; - break; - case GF_MULT_CARRY_FREE: - return sizeof(gf_internal_t); - break; - case GF_MULT_SHIFT: - return sizeof(gf_internal_t); - break; - default: - return 0; - } - return 0; -} - -int -gf_w4_init (gf_t *gf) -{ - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - if (h->prim_poly == 0) h->prim_poly = 0x13; - h->prim_poly |= 0x10; - gf->multiply.w32 = NULL; - gf->divide.w32 = NULL; - gf->inverse.w32 = NULL; - gf->multiply_region.w32 = NULL; - gf->extract_word.w32 = gf_w4_extract_word; - - switch(h->mult_type) { - case GF_MULT_CARRY_FREE: if (gf_w4_cfm_init(gf) == 0) return 0; break; - case GF_MULT_SHIFT: if (gf_w4_shift_init(gf) == 0) return 0; break; - case GF_MULT_BYTWO_p: - case GF_MULT_BYTWO_b: if 
(gf_w4_bytwo_init(gf) == 0) return 0; break; - case GF_MULT_LOG_TABLE: if (gf_w4_log_init(gf) == 0) return 0; break; - case GF_MULT_DEFAULT: - case GF_MULT_TABLE: if (gf_w4_table_init(gf) == 0) return 0; break; - default: return 0; - } - - if (h->divide_type == GF_DIVIDE_EUCLID) { - gf->divide.w32 = gf_w4_divide_from_inverse; - gf->inverse.w32 = gf_w4_euclid; - } else if (h->divide_type == GF_DIVIDE_MATRIX) { - gf->divide.w32 = gf_w4_divide_from_inverse; - gf->inverse.w32 = gf_w4_matrix; - } - - if (gf->divide.w32 == NULL) { - gf->divide.w32 = gf_w4_divide_from_inverse; - if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w4_euclid; - } - - if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w4_inverse_from_divide; - - if (h->region_type == GF_REGION_CAUCHY) { - gf->multiply_region.w32 = gf_wgen_cauchy_region; - gf->extract_word.w32 = gf_wgen_extract_word; - } - - if (gf->multiply_region.w32 == NULL) { - gf->multiply_region.w32 = gf_w4_multiply_region_from_single; - } - - return 1; -} - -/* Inline setup functions */ - -uint8_t *gf_w4_get_mult_table(gf_t *gf) -{ - gf_internal_t *h; - struct gf_single_table_data *std; - - h = (gf_internal_t *) gf->scratch; - if (gf->multiply.w32 == gf_w4_single_table_multiply) { - std = (struct gf_single_table_data *) h->private; - return (uint8_t *) std->mult; - } - return NULL; -} - -uint8_t *gf_w4_get_div_table(gf_t *gf) -{ - gf_internal_t *h; - struct gf_single_table_data *std; - - h = (gf_internal_t *) gf->scratch; - if (gf->multiply.w32 == gf_w4_single_table_multiply) { - std = (struct gf_single_table_data *) h->private; - return (uint8_t *) std->div; - } - return NULL; -} - diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w64.c b/src/erasure-code/jerasure/gf-complete/src/gf_w64.c deleted file mode 100644 index f04daf05df676..0000000000000 --- a/src/erasure-code/jerasure/gf-complete/src/gf_w64.c +++ /dev/null @@ -1,2244 +0,0 @@ -/* - * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic - * James S. Plank, Ethan L. Miller, Kevin M. Greenan, - * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. - * - * gf_w64.c - * - * Routines for 64-bit Galois fields - */ - -#include "gf_int.h" -#include -#include - -#define GF_FIELD_WIDTH (64) -#define GF_FIRST_BIT (1ULL << 63) - -#define GF_BASE_FIELD_WIDTH (32) -#define GF_BASE_FIELD_SIZE (1ULL << GF_BASE_FIELD_WIDTH) -#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1 - -struct gf_w64_group_data { - uint64_t *reduce; - uint64_t *shift; - uint64_t *memory; -}; - -struct gf_split_4_64_lazy_data { - uint64_t tables[16][16]; - uint64_t last_value; -}; - -struct gf_split_8_64_lazy_data { - uint64_t tables[8][(1<<8)]; - uint64_t last_value; -}; - -struct gf_split_16_64_lazy_data { - uint64_t tables[4][(1<<16)]; - uint64_t last_value; -}; - -struct gf_split_8_8_data { - uint64_t tables[15][256][256]; -}; - -static -inline -gf_val_64_t gf_w64_inverse_from_divide (gf_t *gf, gf_val_64_t a) -{ - return gf->divide.w64(gf, 1, a); -} - -#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? 
" " : " ", blah[15-ii]); printf("\n"); } - -static -inline -gf_val_64_t gf_w64_divide_from_inverse (gf_t *gf, gf_val_64_t a, gf_val_64_t b) -{ - b = gf->inverse.w64(gf, b); - return gf->multiply.w64(gf, a, b); -} - -static -void -gf_w64_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int -xor) -{ - int i; - gf_val_64_t *s64; - gf_val_64_t *d64; - - s64 = (gf_val_64_t *) src; - d64 = (gf_val_64_t *) dest; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - if (xor) { - for (i = 0; i < bytes/sizeof(gf_val_64_t); i++) { - d64[i] ^= gf->multiply.w64(gf, val, s64[i]); - } - } else { - for (i = 0; i < bytes/sizeof(gf_val_64_t); i++) { - d64[i] = gf->multiply.w64(gf, val, s64[i]); - } - } -} - -#if defined(INTEL_SSE4_PCLMUL) -static -void -gf_w64_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int -xor) -{ - gf_val_64_t *s64, *d64, *top; - gf_region_data rd; - - __m128i a, b; - __m128i result, r1; - __m128i prim_poly; - __m128i w; - __m128i m1, m2, m3, m4; - gf_internal_t * h = gf->scratch; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); - gf_do_initial_region_alignment(&rd); - - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); - b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0); - m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff); - m2 = _mm_slli_si128(m1, 4); - m2 = _mm_or_si128(m1, m2); - m3 = _mm_slli_si128(m1, 8); - m4 = _mm_slli_si128(m3, 4); - - s64 = (gf_val_64_t *) rd.s_start; - d64 = (gf_val_64_t *) rd.d_start; - top = (gf_val_64_t *) rd.d_top; - - if (xor) { - while (d64 != top) { - a = _mm_load_si128((__m128i *) s64); - result = _mm_clmulepi64_si128 (a, b, 1); - - w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); - r1 = _mm_xor_si128 (result, w); - - result = _mm_clmulepi64_si128 (a, b, 0); - - w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); - result = _mm_xor_si128 (result, w); - - w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); - result = _mm_xor_si128 (result, w); - - result = _mm_unpacklo_epi64(result, r1); - - r1 = _mm_load_si128((__m128i *) d64); - result = _mm_xor_si128(r1, result); - _mm_store_si128((__m128i *) d64, result); - d64 += 2; - s64 += 2; - } - } else { - while (d64 != top) { - - a = _mm_load_si128((__m128i *) s64); - result = _mm_clmulepi64_si128 (a, b, 1); - - w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); - r1 = _mm_xor_si128 (result, w); - - result = _mm_clmulepi64_si128 (a, b, 0); - - w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); - result = _mm_xor_si128 (result, w); - - result = _mm_unpacklo_epi64(result, r1); - - _mm_store_si128((__m128i *) d64, result); - d64 += 2; - s64 += 2; - } - } - gf_do_final_region_alignment(&rd); -} -#endif - -#if defined(INTEL_SSE4_PCLMUL) -static -void -gf_w64_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int -xor) -{ - 
gf_val_64_t *s64, *d64, *top; - gf_region_data rd; - - __m128i a, b; - __m128i result, r1; - __m128i prim_poly; - __m128i w; - __m128i m1, m3, m4; - gf_internal_t * h = gf->scratch; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); - gf_do_initial_region_alignment(&rd); - - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); - b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0); - m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff); - m3 = _mm_slli_si128(m1, 8); - m4 = _mm_slli_si128(m3, 4); - - s64 = (gf_val_64_t *) rd.s_start; - d64 = (gf_val_64_t *) rd.d_start; - top = (gf_val_64_t *) rd.d_top; - - if (xor) { - while (d64 != top) { - a = _mm_load_si128((__m128i *) s64); - result = _mm_clmulepi64_si128 (a, b, 1); - - w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); - r1 = _mm_xor_si128 (result, w); - - result = _mm_clmulepi64_si128 (a, b, 0); - - w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); - result = _mm_xor_si128 (result, w); - - w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); - result = _mm_xor_si128 (result, w); - - result = _mm_unpacklo_epi64(result, r1); - - r1 = _mm_load_si128((__m128i *) d64); - result = _mm_xor_si128(r1, result); - _mm_store_si128((__m128i *) d64, result); - d64 += 2; - s64 += 2; - } - } else { - while (d64 != top) { - a = _mm_load_si128((__m128i *) s64); - result = _mm_clmulepi64_si128 (a, b, 1); - - w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); - r1 = _mm_xor_si128 (result, w); - - result = _mm_clmulepi64_si128 (a, b, 0); - - w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); - result = _mm_xor_si128 (result, w); - - result = _mm_unpacklo_epi64(result, r1); - - _mm_store_si128((__m128i *) d64, result); - d64 += 2; - s64 += 2; - } - } - gf_do_final_region_alignment(&rd); -} -#endif - -static - inline -gf_val_64_t gf_w64_euclid (gf_t *gf, gf_val_64_t b) -{ - gf_val_64_t e_i, e_im1, e_ip1; - gf_val_64_t d_i, d_im1, d_ip1; - gf_val_64_t y_i, y_im1, y_ip1; - gf_val_64_t c_i; - gf_val_64_t one = 1; - - if (b == 0) return -1; - e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; - e_i = b; - d_im1 = 64; - for (d_i = d_im1-1; ((one << d_i) & e_i) == 0; d_i--) ; - y_i = 1; - y_im1 = 0; - - while (e_i != 1) { - - e_ip1 = e_im1; - d_ip1 = d_im1; - c_i = 0; - - while (d_ip1 >= d_i) { - c_i ^= (one << (d_ip1 - d_i)); - e_ip1 ^= (e_i << (d_ip1 - d_i)); - d_ip1--; - if (e_ip1 == 0) return 0; - while ((e_ip1 & (one << d_ip1)) == 0) d_ip1--; - } - - y_ip1 = y_im1 ^ gf->multiply.w64(gf, c_i, y_i); - y_im1 = y_i; - y_i = y_ip1; - - e_im1 = e_i; - d_im1 = d_i; - e_i = e_ip1; - d_i = d_ip1; - } - - return y_i; -} - -/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. I only - include it for completeness. It does have the feature that it requires no - extra memory. 
-*/ - -static -inline -gf_val_64_t -gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) -{ - uint64_t pl, pr, ppl, ppr, i, a, bl, br, one, lbit; - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - ppr = h->prim_poly; - - /* Allen: set leading one of primitive polynomial */ - - ppl = 1; - - a = a64; - bl = 0; - br = b64; - one = 1; - lbit = (one << 63); - - pl = 0; /* Allen: left side of product */ - pr = 0; /* Allen: right side of product */ - - /* Allen: unlike the corresponding functions for smaller word sizes, - * this loop carries out the initial carryless multiply by - * shifting b itself rather than simply looking at successively - * higher shifts of b */ - - for (i = 0; i < GF_FIELD_WIDTH; i++) { - if (a & (one << i)) { - pl ^= bl; - pr ^= br; - } - - bl <<= 1; - if (br & lbit) bl ^= 1; - br <<= 1; - } - - /* Allen: the name of the variable "one" is no longer descriptive at this point */ - - one = lbit >> 1; - ppl = (h->prim_poly >> 2) | one; - ppr = (h->prim_poly << (GF_FIELD_WIDTH-2)); - while (one != 0) { - if (pl & one) { - pl ^= ppl; - pr ^= ppr; - } - one >>= 1; - ppr >>= 1; - if (ppl & 1) ppr ^= lbit; - ppl >>= 1; - } - return pr; -} - -/* - * ELM: Use the Intel carryless multiply instruction to do very fast 64x64 multiply. - */ - -static -inline -gf_val_64_t -gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) -{ - gf_val_64_t rv = 0; - -#if defined(INTEL_SSE4_PCLMUL) - - __m128i a, b; - __m128i result; - __m128i prim_poly; - __m128i v, w; - gf_internal_t * h = gf->scratch; - - a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0); - b = _mm_insert_epi64 (a, b64, 0); - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); - /* Do the initial multiply */ - - result = _mm_clmulepi64_si128 (a, b, 0); - - /* Mask off the high order 32 bits using subtraction of the polynomial. - * NOTE: this part requires that the polynomial have at least 32 leading 0 bits. 
- */ - - /* Adam: We cant include the leading one in the 64 bit pclmul, - so we need to split up the high 8 bytes of the result into two - parts before we multiply them with the prim_poly.*/ - - v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); - w = _mm_clmulepi64_si128 (prim_poly, v, 0); - result = _mm_xor_si128 (result, w); - v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); - w = _mm_clmulepi64_si128 (prim_poly, v, 0); - result = _mm_xor_si128 (result, w); - - rv = ((gf_val_64_t)_mm_extract_epi64(result, 0)); -#endif - return rv; -} - -static -inline -gf_val_64_t -gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) -{ - gf_val_64_t rv = 0; - -#if defined(INTEL_SSE4_PCLMUL) - - __m128i a, b; - __m128i result; - __m128i prim_poly; - __m128i v, w; - gf_internal_t * h = gf->scratch; - - a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0); - b = _mm_insert_epi64 (a, b64, 0); - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); - - /* Do the initial multiply */ - - result = _mm_clmulepi64_si128 (a, b, 0); - - v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); - w = _mm_clmulepi64_si128 (prim_poly, v, 0); - result = _mm_xor_si128 (result, w); - v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); - w = _mm_clmulepi64_si128 (prim_poly, v, 0); - result = _mm_xor_si128 (result, w); - - v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); - w = _mm_clmulepi64_si128 (prim_poly, v, 0); - result = _mm_xor_si128 (result, w); - v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); - w = _mm_clmulepi64_si128 (prim_poly, v, 0); - result = _mm_xor_si128 (result, w); - - rv = ((gf_val_64_t)_mm_extract_epi64(result, 0)); -#endif - return rv; -} - - - void -gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) -{ -#if defined(INTEL_SSE4_PCLMUL) - gf_internal_t *h; - uint8_t *s8, *d8, *dtop; - gf_region_data rd; - __m128i v, b, m, prim_poly, c, fr, w, result; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - h = (gf_internal_t *) gf->scratch; - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); - gf_do_initial_region_alignment(&rd); - - s8 = (uint8_t *) rd.s_start; - d8 = (uint8_t *) rd.d_start; - dtop = (uint8_t *) rd.d_top; - - v = _mm_insert_epi64(_mm_setzero_si128(), val, 0); - m = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); - - if (xor) { - while (d8 != dtop) { - b = _mm_load_si128((__m128i *) s8); - result = _mm_clmulepi64_si128 (b, v, 0); - c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); - w = _mm_clmulepi64_si128 (prim_poly, c, 0); - result = _mm_xor_si128 (result, w); - c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); - w = _mm_clmulepi64_si128 (prim_poly, c, 0); - fr = _mm_xor_si128 (result, w); - fr = _mm_and_si128 (fr, m); - - result = _mm_clmulepi64_si128 (b, v, 1); - c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); - w = _mm_clmulepi64_si128 (prim_poly, c, 0); - result = _mm_xor_si128 (result, w); - c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); - w = _mm_clmulepi64_si128 (prim_poly, c, 0); - result = _mm_xor_si128 (result, w); - result = _mm_slli_si128 (result, 8); - fr = _mm_xor_si128 (result, fr); - result = _mm_load_si128((__m128i *) d8); - fr = _mm_xor_si128 (result, fr); - - _mm_store_si128((__m128i *) d8, fr); - d8 += 16; - s8 += 16; - } - } else { - while (d8 < dtop) { 
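The folds above all rely on one identity: x^64 == prim_poly (mod the field polynomial), so every set bit i of the unreduced high half can be replaced by prim_poly * x^i. A bit-serial sketch of that reduction, illustrative only:

#include <stdint.h>

/* Fold the high 64 bits of a 128-bit carry-less product back into the
 * low 64 bits.  Because the polynomials accepted here have at least 32
 * leading zero bits, each clmul-based fold in the code above shrinks
 * the unreduced part by 32 or more bits, which is why two folds
 * suffice for the *_2 variants (the *_4 variants tolerate polynomials
 * with fewer leading zeros by folding more times). */
static uint64_t gf64_reduce(uint64_t hi, uint64_t lo, uint64_t pp)
{
  for (int i = 63; i >= 0; i--) {
    if ((hi >> i) & 1) {
      if (i) hi ^= pp >> (64 - i);  /* part of pp * x^i landing back above bit 63 */
      lo ^= pp << i;                /* part landing in the result word */
    }
  }
  return lo;
}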
- b = _mm_load_si128((__m128i *) s8); - result = _mm_clmulepi64_si128 (b, v, 0); - c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); - w = _mm_clmulepi64_si128 (prim_poly, c, 0); - result = _mm_xor_si128 (result, w); - c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); - w = _mm_clmulepi64_si128 (prim_poly, c, 0); - fr = _mm_xor_si128 (result, w); - fr = _mm_and_si128 (fr, m); - - result = _mm_clmulepi64_si128 (b, v, 1); - c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); - w = _mm_clmulepi64_si128 (prim_poly, c, 0); - result = _mm_xor_si128 (result, w); - c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); - w = _mm_clmulepi64_si128 (prim_poly, c, 0); - result = _mm_xor_si128 (result, w); - result = _mm_slli_si128 (result, 8); - fr = _mm_xor_si128 (result, fr); - - _mm_store_si128((__m128i *) d8, fr); - d8 += 16; - s8 += 16; - } - } - gf_do_final_region_alignment(&rd); -#endif -} - -void -gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) -{ - gf_internal_t *h; - struct gf_split_4_64_lazy_data *ld; - int i, j, k; - uint64_t pp, v, s, *s64, *d64, *top; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - ld = (struct gf_split_4_64_lazy_data *) h->private; - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); - gf_do_initial_region_alignment(&rd); - - if (ld->last_value != val) { - v = val; - for (i = 0; i < 16; i++) { - ld->tables[i][0] = 0; - for (j = 1; j < 16; j <<= 1) { - for (k = 0; k < j; k++) { - ld->tables[i][k^j] = (v ^ ld->tables[i][k]); - } - v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); - } - } - } - ld->last_value = val; - - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - top = (uint64_t *) rd.d_top; - - while (d64 != top) { - v = (xor) ? *d64 : 0; - s = *s64; - i = 0; - while (s != 0) { - v ^= ld->tables[i][s&0xf]; - s >>= 4; - i++; - } - *d64 = v; - d64++; - s64++; - } - gf_do_final_region_alignment(&rd); -} - -static -inline -uint64_t -gf_w64_split_8_8_multiply (gf_t *gf, uint64_t a64, uint64_t b64) -{ - uint64_t product, i, j, mask, tb; - gf_internal_t *h; - struct gf_split_8_8_data *d8; - - h = (gf_internal_t *) gf->scratch; - d8 = (struct gf_split_8_8_data *) h->private; - product = 0; - mask = 0xff; - - for (i = 0; a64 != 0; i++) { - tb = b64; - for (j = 0; tb != 0; j++) { - product ^= d8->tables[i+j][a64&mask][tb&mask]; - tb >>= 8; - } - a64 >>= 8; - } - return product; -} - -void -gf_w64_split_8_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) -{ - gf_internal_t *h; - struct gf_split_8_64_lazy_data *ld; - int i, j, k; - uint64_t pp, v, s, *s64, *d64, *top; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - ld = (struct gf_split_8_64_lazy_data *) h->private; - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); - gf_do_initial_region_alignment(&rd); - - if (ld->last_value != val) { - v = val; - for (i = 0; i < 8; i++) { - ld->tables[i][0] = 0; - for (j = 1; j < 256; j <<= 1) { - for (k = 0; k < j; k++) { - ld->tables[i][k^j] = (v ^ ld->tables[i][k]); - } - v = (v & GF_FIRST_BIT) ? 
((v << 1) ^ pp) : (v << 1); - } - } - } - ld->last_value = val; - - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - top = (uint64_t *) rd.d_top; - - while (d64 != top) { - v = (xor) ? *d64 : 0; - s = *s64; - i = 0; - while (s != 0) { - v ^= ld->tables[i][s&0xff]; - s >>= 8; - i++; - } - *d64 = v; - d64++; - s64++; - } - gf_do_final_region_alignment(&rd); -} - -void -gf_w64_split_16_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) -{ - gf_internal_t *h; - struct gf_split_16_64_lazy_data *ld; - int i, j, k; - uint64_t pp, v, s, *s64, *d64, *top; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - ld = (struct gf_split_16_64_lazy_data *) h->private; - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); - gf_do_initial_region_alignment(&rd); - - if (ld->last_value != val) { - v = val; - for (i = 0; i < 4; i++) { - ld->tables[i][0] = 0; - for (j = 1; j < (1<<16); j <<= 1) { - for (k = 0; k < j; k++) { - ld->tables[i][k^j] = (v ^ ld->tables[i][k]); - } - v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); - } - } - } - ld->last_value = val; - - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - top = (uint64_t *) rd.d_top; - - while (d64 != top) { - v = (xor) ? *d64 : 0; - s = *s64; - i = 0; - while (s != 0) { - v ^= ld->tables[i][s&0xffff]; - s >>= 16; - i++; - } - *d64 = v; - d64++; - s64++; - } - gf_do_final_region_alignment(&rd); -} - -static -int gf_w64_shift_init(gf_t *gf) -{ - gf->multiply.w64 = gf_w64_shift_multiply; - gf->inverse.w64 = gf_w64_euclid; - gf->multiply_region.w64 = gf_w64_multiply_region_from_single; - return 1; -} - -static -int gf_w64_cfm_init(gf_t *gf) -{ - gf->inverse.w64 = gf_w64_euclid; - gf->multiply_region.w64 = gf_w64_multiply_region_from_single; - -#if defined(INTEL_SSE4_PCLMUL) - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - - if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ - gf->multiply.w64 = gf_w64_clm_multiply_2; - gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2; - }else if((0xfffe000000000000ULL & h->prim_poly) == 0){ - gf->multiply.w64 = gf_w64_clm_multiply_4; - gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_4; - } else { - return 0; - } - return 1; -#endif - - return 0; -} - -static -void -gf_w64_group_set_shift_tables(uint64_t *shift, uint64_t val, gf_internal_t *h) -{ - int i; - uint64_t j; - uint64_t one = 1; - int g_s; - - g_s = h->arg1; - shift[0] = 0; - - for (i = 1; i < (1 << g_s); i <<= 1) { - for (j = 0; j < i; j++) shift[i|j] = shift[j]^val; - if (val & (one << 63)) { - val <<= 1; - val ^= h->prim_poly; - } else { - val <<= 1; - } - } -} - -static -inline -gf_val_64_t -gf_w64_group_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) -{ - uint64_t top, bot, mask, tp; - int g_s, g_r, lshift, rshift; - struct gf_w64_group_data *gd; - - gf_internal_t *h = (gf_internal_t *) gf->scratch; - g_s = h->arg1; - g_r = h->arg2; - gd = (struct gf_w64_group_data *) h->private; - gf_w64_group_set_shift_tables(gd->shift, b, h); - - mask = ((1 << g_s) - 1); - top = 0; - bot = gd->shift[a&mask]; - a >>= g_s; - - if (a == 0) return bot; - lshift = 0; - rshift = 64; - - do { /* Shifting out is straightfoward */ - lshift += g_s; - rshift -= g_s; - tp = gd->shift[a&mask]; - top ^= (tp >> rshift); - bot ^= (tp << lshift); - a >>= g_s; - } while (a != 0); - - /* Reducing is a 
bit gross, because I don't zero out the index bits of top. - The reason is that we throw top away. Even better, that last (tp >> rshift) - is going to be ignored, so it doesn't matter how (tp >> 64) is implemented. */ - - lshift = ((lshift-1) / g_r) * g_r; - rshift = 64 - lshift; - mask = (1 << g_r) - 1; - while (lshift >= 0) { - tp = gd->reduce[(top >> lshift) & mask]; - top ^= (tp >> rshift); - bot ^= (tp << lshift); - lshift -= g_r; - rshift += g_r; - } - - return bot; -} - -static -void gf_w64_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) -{ - int i, fzb; - uint64_t a64, smask, rmask, top, bot, tp; - int lshift, rshift, g_s, g_r; - gf_region_data rd; - uint64_t *s64, *d64, *dtop; - struct gf_w64_group_data *gd; - gf_internal_t *h = (gf_internal_t *) gf->scratch; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gd = (struct gf_w64_group_data *) h->private; - g_s = h->arg1; - g_r = h->arg2; - gf_w64_group_set_shift_tables(gd->shift, val, h); - - for (i = 63; !(val & (1ULL << i)); i--) ; - i += g_s; - - /* i is the bit position of the first zero bit in any element of - gd->shift[] */ - - if (i > 64) i = 64; - - fzb = i; - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); - - gf_do_initial_region_alignment(&rd); - - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - dtop = (uint64_t *) rd.d_top; - - smask = (1 << g_s) - 1; - rmask = (1 << g_r) - 1; - - while (d64 < dtop) { - a64 = *s64; - - top = 0; - bot = gd->shift[a64&smask]; - a64 >>= g_s; - i = fzb; - - if (a64 != 0) { - lshift = 0; - rshift = 64; - - do { - lshift += g_s; - rshift -= g_s; - tp = gd->shift[a64&smask]; - top ^= (tp >> rshift); - bot ^= (tp << lshift); - a64 >>= g_s; - } while (a64 != 0); - i += lshift; - - lshift = ((i-64-1) / g_r) * g_r; - rshift = 64 - lshift; - while (lshift >= 0) { - tp = gd->reduce[(top >> lshift) & rmask]; - top ^= (tp >> rshift); - bot ^= (tp << lshift); - lshift -= g_r; - rshift += g_r; - } - } - - if (xor) bot ^= *d64; - *d64 = bot; - d64++; - s64++; - } - gf_do_final_region_alignment(&rd); -} - -static -inline -gf_val_64_t -gf_w64_group_s_equals_r_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) -{ - int leftover, rs; - uint64_t p, l, ind, a64; - int bits_left; - int g_s; - - struct gf_w64_group_data *gd; - gf_internal_t *h = (gf_internal_t *) gf->scratch; - g_s = h->arg1; - - gd = (struct gf_w64_group_data *) h->private; - gf_w64_group_set_shift_tables(gd->shift, b, h); - - leftover = 64 % g_s; - if (leftover == 0) leftover = g_s; - - rs = 64 - leftover; - a64 = a; - ind = a64 >> rs; - a64 <<= leftover; - p = gd->shift[ind]; - - bits_left = rs; - rs = 64 - g_s; - - while (bits_left > 0) { - bits_left -= g_s; - ind = a64 >> rs; - a64 <<= g_s; - l = p >> rs; - p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s)); - } - return p; -} - -static -void gf_w64_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) -{ - int leftover, rs; - uint64_t p, l, ind, a64; - int bits_left; - int g_s; - gf_region_data rd; - uint64_t *s64, *d64, *top; - struct gf_w64_group_data *gd; - gf_internal_t *h = (gf_internal_t *) gf->scratch; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gd = (struct gf_w64_group_data *) h->private; - g_s = h->arg1; - gf_w64_group_set_shift_tables(gd->shift, val, h); - - gf_set_region_data(&rd, gf, 
src, dest, bytes, val, xor, 4); - gf_do_initial_region_alignment(&rd); - - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - top = (uint64_t *) rd.d_top; - - leftover = 64 % g_s; - if (leftover == 0) leftover = g_s; - - while (d64 < top) { - rs = 64 - leftover; - a64 = *s64; - ind = a64 >> rs; - a64 <<= leftover; - p = gd->shift[ind]; - - bits_left = rs; - rs = 64 - g_s; - - while (bits_left > 0) { - bits_left -= g_s; - ind = a64 >> rs; - a64 <<= g_s; - l = p >> rs; - p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s)); - } - if (xor) p ^= *d64; - *d64 = p; - d64++; - s64++; - } - gf_do_final_region_alignment(&rd); -} - - -static -int gf_w64_group_init(gf_t *gf) -{ - uint64_t i, j, p, index; - struct gf_w64_group_data *gd; - gf_internal_t *h = (gf_internal_t *) gf->scratch; - int g_r, g_s; - - g_s = h->arg1; - g_r = h->arg2; - - gd = (struct gf_w64_group_data *) h->private; - gd->shift = (uint64_t *) (&(gd->memory)); - gd->reduce = gd->shift + (1 << g_s); - - gd->reduce[0] = 0; - for (i = 0; i < (1 << g_r); i++) { - p = 0; - index = 0; - for (j = 0; j < g_r; j++) { - if (i & (1 << j)) { - p ^= (h->prim_poly << j); - index ^= (1 << j); - if (j > 0) index ^= (h->prim_poly >> (64-j)); - } - } - gd->reduce[index] = p; - } - - if (g_s == g_r) { - gf->multiply.w64 = gf_w64_group_s_equals_r_multiply; - gf->multiply_region.w64 = gf_w64_group_s_equals_r_multiply_region; - } else { - gf->multiply.w64 = gf_w64_group_multiply; - gf->multiply_region.w64 = gf_w64_group_multiply_region; - } - gf->divide.w64 = NULL; - gf->inverse.w64 = gf_w64_euclid; - - return 1; -} - -static -gf_val_64_t gf_w64_extract_word(gf_t *gf, void *start, int bytes, int index) -{ - uint64_t *r64, rv; - - r64 = (uint64_t *) start; - rv = r64[index]; - return rv; -} - -static -gf_val_64_t gf_w64_composite_extract_word(gf_t *gf, void *start, int bytes, int index) -{ - int sub_size; - gf_internal_t *h; - uint8_t *r8, *top; - uint64_t a, b, *r64; - gf_region_data rd; - - h = (gf_internal_t *) gf->scratch; - gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32); - r64 = (uint64_t *) start; - if (r64 + index < (uint64_t *) rd.d_start) return r64[index]; - if (r64 + index >= (uint64_t *) rd.d_top) return r64[index]; - index -= (((uint64_t *) rd.d_start) - r64); - r8 = (uint8_t *) rd.d_start; - top = (uint8_t *) rd.d_top; - sub_size = (top-r8)/2; - - a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index); - b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index); - return (a | ((uint64_t)b << 32)); -} - -static -gf_val_64_t gf_w64_split_extract_word(gf_t *gf, void *start, int bytes, int index) -{ - int i; - uint64_t *r64, rv; - uint8_t *r8; - gf_region_data rd; - - gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 128); - r64 = (uint64_t *) start; - if (r64 + index < (uint64_t *) rd.d_start) return r64[index]; - if (r64 + index >= (uint64_t *) rd.d_top) return r64[index]; - index -= (((uint64_t *) rd.d_start) - r64); - r8 = (uint8_t *) rd.d_start; - r8 += ((index & 0xfffffff0)*8); - r8 += (index & 0xf); - r8 += 112; - rv =0; - for (i = 0; i < 8; i++) { - rv <<= 8; - rv |= *r8; - r8 -= 16; - } - return rv; -} - -static -inline -gf_val_64_t -gf_w64_bytwo_b_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b) -{ - uint64_t prod, pp, bmask; - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - prod = 0; - bmask = 0x8000000000000000ULL; - - while (1) { - if (a & 1) prod ^= b; - a >>= 1; - if (a == 0) return prod; - if (b & bmask) { - b = ((b << 1) ^ pp); - } else 
{ - b <<= 1; - } - } -} - -static -inline -gf_val_64_t -gf_w64_bytwo_p_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b) -{ - uint64_t prod, pp, pmask, amask; - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - prod = 0; - - /* changed from declare then shift to just declare.*/ - - pmask = 0x8000000000000000ULL; - amask = 0x8000000000000000ULL; - - while (amask != 0) { - if (prod & pmask) { - prod = ((prod << 1) ^ pp); - } else { - prod <<= 1; - } - if (a & amask) prod ^= b; - amask >>= 1; - } - return prod; -} - -static -void -gf_w64_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) -{ - uint64_t *s64, *d64, ta, prod, amask, pmask, pp; - gf_region_data rd; - gf_internal_t *h; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); - gf_do_initial_region_alignment(&rd); - - h = (gf_internal_t *) gf->scratch; - - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - pmask = 0x80000000; - pmask <<= 32; - pp = h->prim_poly; - - if (xor) { - while (s64 < (uint64_t *) rd.s_top) { - prod = 0; - amask = pmask; - ta = *s64; - while (amask != 0) { - prod = (prod & pmask) ? ((prod << 1) ^ pp) : (prod << 1); - if (val & amask) prod ^= ta; - amask >>= 1; - } - *d64 ^= prod; - d64++; - s64++; - } - } else { - while (s64 < (uint64_t *) rd.s_top) { - prod = 0; - amask = pmask; - ta = *s64; - while (amask != 0) { - prod = (prod & pmask) ? ((prod << 1) ^ pp) : (prod << 1); - if (val & amask) prod ^= ta; - amask >>= 1; - } - *d64 = prod; - d64++; - s64++; - } - } - gf_do_final_region_alignment(&rd); -} - -static -void -gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) -{ - uint64_t *s64, *d64, ta, tb, prod, bmask, pp; - gf_region_data rd; - gf_internal_t *h; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); - gf_do_initial_region_alignment(&rd); - - h = (gf_internal_t *) gf->scratch; - - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - bmask = 0x80000000; - bmask <<= 32; - pp = h->prim_poly; - - if (xor) { - while (s64 < (uint64_t *) rd.s_top) { - prod = 0; - tb = val; - ta = *s64; - while (1) { - if (tb & 1) prod ^= ta; - tb >>= 1; - if (tb == 0) break; - ta = (ta & bmask) ? ((ta << 1) ^ pp) : (ta << 1); - } - *d64 ^= prod; - d64++; - s64++; - } - } else { - while (s64 < (uint64_t *) rd.s_top) { - prod = 0; - tb = val; - ta = *s64; - while (1) { - if (tb & 1) prod ^= ta; - tb >>= 1; - if (tb == 0) break; - ta = (ta & bmask) ? 
((ta << 1) ^ pp) : (ta << 1); - } - *d64 = prod; - d64++; - s64++; - } - } - gf_do_final_region_alignment(&rd); -} - -#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\ - t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \ - t2 = _mm_and_si128(va, m2); \ - t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \ - va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); } - -#define BYTWO_P_ONESTEP {\ - SSE_AB2(pp, m1 ,m2, prod, t1, t2); \ - t1 = _mm_and_si128(v, one); \ - t1 = _mm_sub_epi64(t1, one); \ - t1 = _mm_and_si128(t1, ta); \ - prod = _mm_xor_si128(prod, t1); \ - v = _mm_srli_epi64(v, 1); } - - -void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) -{ -#ifdef INTEL_SSE2 - int i; - uint8_t *s8, *d8; - uint64_t vrev, one64; - uint64_t amask; - __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; - gf_region_data rd; - gf_internal_t *h; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); - gf_do_initial_region_alignment(&rd); - - h = (gf_internal_t *) gf->scratch; - one64 = 1; - vrev = 0; - for (i = 0; i < 64; i++) { - vrev <<= 1; - if (!(val & (one64 << i))) vrev |= 1; - } - - s8 = (uint8_t *) rd.s_start; - d8 = (uint8_t *) rd.d_start; - - amask = -1; - amask ^= 1; - pp = _mm_set1_epi64x(h->prim_poly); - m1 = _mm_set1_epi64x(amask); - m2 = _mm_set1_epi64x(one64 << 63); - one = _mm_set1_epi64x(1); - - while (d8 < (uint8_t *) rd.d_top) { - prod = _mm_setzero_si128(); - v = _mm_set1_epi64x(vrev); - ta = _mm_load_si128((__m128i *) s8); - tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8); - BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; - _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp)); - d8 += 16; - s8 += 16; - } - gf_do_final_region_alignment(&rd); -#endif -} - -#ifdef INTEL_SSE2 -static -void -gf_w64_bytwo_b_sse_region_2_xor(gf_region_data *rd) -{ - uint64_t one64, amask; - uint8_t *d8, *s8; - __m128i pp, m1, m2, t1, t2, va, vb; - gf_internal_t *h; - - s8 = (uint8_t *) rd->s_start; - d8 = (uint8_t *) rd->d_start; - - h = (gf_internal_t *) rd->gf->scratch; - one64 = 1; - amask = -1; - amask ^= 1; - pp = _mm_set1_epi64x(h->prim_poly); - m1 = _mm_set1_epi64x(amask); - m2 = _mm_set1_epi64x(one64 << 63); - - while (d8 < (uint8_t *) rd->d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, m2, va, t1, 
t2); - vb = _mm_load_si128 ((__m128i *)(d8)); - vb = _mm_xor_si128(vb, va); - _mm_store_si128((__m128i *)d8, vb); - d8 += 16; - s8 += 16; - } -} -#endif - -#ifdef INTEL_SSE2 -static -void -gf_w64_bytwo_b_sse_region_2_noxor(gf_region_data *rd) -{ - uint64_t one64, amask; - uint8_t *d8, *s8; - __m128i pp, m1, m2, t1, t2, va; - gf_internal_t *h; - - s8 = (uint8_t *) rd->s_start; - d8 = (uint8_t *) rd->d_start; - - h = (gf_internal_t *) rd->gf->scratch; - one64 = 1; - amask = -1; - amask ^= 1; - pp = _mm_set1_epi64x(h->prim_poly); - m1 = _mm_set1_epi64x(amask); - m2 = _mm_set1_epi64x(one64 << 63); - - while (d8 < (uint8_t *) rd->d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, m2, va, t1, t2); - _mm_store_si128((__m128i *)d8, va); - d8 += 16; - s8 += 16; - } -} -#endif - -#ifdef INTEL_SSE2 -static -void -gf_w64_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) -{ - uint64_t itb, amask, one64; - uint8_t *d8, *s8; - __m128i pp, m1, m2, t1, t2, va, vb; - gf_region_data rd; - gf_internal_t *h; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); - gf_do_initial_region_alignment(&rd); - - if (val == 2) { - if (xor) { - gf_w64_bytwo_b_sse_region_2_xor(&rd); - } else { - gf_w64_bytwo_b_sse_region_2_noxor(&rd); - } - gf_do_final_region_alignment(&rd); - return; - } - - s8 = (uint8_t *) rd.s_start; - d8 = (uint8_t *) rd.d_start; - h = (gf_internal_t *) gf->scratch; - - one64 = 1; - amask = -1; - amask ^= 1; - pp = _mm_set1_epi64x(h->prim_poly); - m1 = _mm_set1_epi64x(amask); - m2 = _mm_set1_epi64x(one64 << 63); - - while (d8 < (uint8_t *) rd.d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - vb = (!xor) ? 
_mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8)); - itb = val; - while (1) { - if (itb & 1) vb = _mm_xor_si128(vb, va); - itb >>= 1; - if (itb == 0) break; - SSE_AB2(pp, m1, m2, va, t1, t2); - } - _mm_store_si128((__m128i *)d8, vb); - d8 += 16; - s8 += 16; - } - - gf_do_final_region_alignment(&rd); -} -#endif - - -static -int gf_w64_bytwo_init(gf_t *gf) -{ - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - - if (h->mult_type == GF_MULT_BYTWO_p) { - gf->multiply.w64 = gf_w64_bytwo_p_multiply; - #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSSE) - gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region; - else - gf->multiply_region.w64 = gf_w64_bytwo_p_sse_multiply_region; - #else - gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region; - if(h->region_type & GF_REGION_SSE) - return 0; - #endif - } else { - gf->multiply.w64 = gf_w64_bytwo_b_multiply; - #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSSE) - gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region; - else - gf->multiply_region.w64 = gf_w64_bytwo_b_sse_multiply_region; - #else - gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region; - if(h->region_type & GF_REGION_SSE) - return 0; - #endif - } - gf->inverse.w64 = gf_w64_euclid; - return 1; -} - - -static -gf_val_64_t -gf_w64_composite_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint32_t b0 = b & 0x00000000ffffffff; - uint32_t b1 = (b & 0xffffffff00000000) >> 32; - uint32_t a0 = a & 0x00000000ffffffff; - uint32_t a1 = (a & 0xffffffff00000000) >> 32; - uint32_t a1b1; - - a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - - return ((uint64_t)(base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32)); -} - -/* - * Composite field division trick (explained in 2007 tech report) - * - * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1 - * - * let c = b^-1 - * - * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0) - * - * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1 - * - * let d = b1c1 and d+1 = b0c0 - * - * solve s*b1c1+b1c0+b0c1 = 0 - * - * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1 - * - * c0 = (d+1)b0^-1 - * c1 = d*b1^-1 - * - * a / b = a * c - */ - -static -gf_val_64_t -gf_w64_composite_inverse(gf_t *gf, gf_val_64_t a) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint32_t a0 = a & 0x00000000ffffffff; - uint32_t a1 = (a & 0xffffffff00000000) >> 32; - uint32_t c0, c1, d, tmp; - uint64_t c; - uint32_t a0inv, a1inv; - - if (a0 == 0) { - a1inv = base_gf->inverse.w32(base_gf, a1); - c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly); - c1 = a1inv; - } else if (a1 == 0) { - c0 = base_gf->inverse.w32(base_gf, a0); - c1 = 0; - } else { - a1inv = base_gf->inverse.w32(base_gf, a1); - a0inv = base_gf->inverse.w32(base_gf, a0); - - d = base_gf->multiply.w32(base_gf, a1, a0inv); - - tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly); - tmp = base_gf->inverse.w32(base_gf, tmp); - - d = base_gf->multiply.w32(base_gf, d, tmp); - - c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv); - c1 = base_gf->multiply.w32(base_gf, d, a1inv); - } - - c = c0 | ((uint64_t)c1 << 32); - - return c; -} - -static -void -gf_w64_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) -{ - 
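A compact sketch of the pairwise product this region loop applies per 64-bit word, under the representation derived above (reduction polynomial x^2 + s*x + 1, with s = h->prim_poly, so x^2 == s*x + 1); base_mul is a stand-in for the base field's multiply.w32:

#include <stdint.h>

typedef uint32_t (*base_mul_fn)(uint32_t, uint32_t);

/* Composite-field multiply: elements of GF(2^64) are pairs (a1, a0)
 * over GF(2^32).  (a1 x + a0)(b1 x + b0) = a1b1 x^2 + (a1b0 + a0b1) x
 * + a0b0, and substituting x^2 = s x + 1 gives the two halves below. */
static uint64_t composite_mul64(uint64_t a, uint64_t b, uint32_t s,
                                base_mul_fn base_mul)
{
  uint32_t a0 = (uint32_t) a, a1 = (uint32_t)(a >> 32);
  uint32_t b0 = (uint32_t) b, b1 = (uint32_t)(b >> 32);
  uint32_t a1b1 = base_mul(a1, b1);
  uint32_t lo = base_mul(a0, b0) ^ a1b1;              /* constant term */
  uint32_t hi = base_mul(a1, b0) ^ base_mul(a0, b1)   /* x term ...    */
              ^ base_mul(a1b1, s);                    /* ... plus s*a1b1 from x^2 */
  return ((uint64_t)hi << 32) | lo;
}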
gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint32_t b0 = val & 0x00000000ffffffff; - uint32_t b1 = (val & 0xffffffff00000000) >> 32; - uint64_t *s64, *d64; - uint64_t *top; - uint64_t a0, a1, a1b1; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); - - s64 = rd.s_start; - d64 = rd.d_start; - top = rd.d_top; - - if (xor) { - while (d64 < top) { - a0 = *s64 & 0x00000000ffffffff; - a1 = (*s64 & 0xffffffff00000000) >> 32; - a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - - *d64 ^= ((uint64_t)(base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32)); - s64++; - d64++; - } - } else { - while (d64 < top) { - a0 = *s64 & 0x00000000ffffffff; - a1 = (*s64 & 0xffffffff00000000) >> 32; - a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - - *d64 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32)); - s64++; - d64++; - } - } -} - -static -void -gf_w64_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - gf_val_32_t val0 = val & 0x00000000ffffffff; - gf_val_32_t val1 = (val & 0xffffffff00000000) >> 32; - uint8_t *slow, *shigh; - uint8_t *dlow, *dhigh, *top; - int sub_reg_size; - gf_region_data rd; - - if (!xor) { - memset(dest, 0, bytes); - } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); - gf_do_initial_region_alignment(&rd); - - slow = (uint8_t *) rd.s_start; - dlow = (uint8_t *) rd.d_start; - top = (uint8_t*) rd.d_top; - sub_reg_size = (top - dlow)/2; - shigh = slow + sub_reg_size; - dhigh = dlow + sub_reg_size; - - base_gf->multiply_region.w32(base_gf, slow, dlow, val0, sub_reg_size, xor); - base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1); - base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor); - base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1); - base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); - - gf_do_final_region_alignment(&rd); -} - - - -static -int gf_w64_composite_init(gf_t *gf) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - - if (h->region_type & GF_REGION_ALTMAP) { - gf->multiply_region.w64 = gf_w64_composite_multiply_region_alt; - } else { - gf->multiply_region.w64 = gf_w64_composite_multiply_region; - } - - gf->multiply.w64 = gf_w64_composite_multiply; - gf->divide.w64 = NULL; - gf->inverse.w64 = gf_w64_composite_inverse; - - return 1; -} - -#ifdef INTEL_SSSE3 -static - void -gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) -{ - gf_internal_t *h; - int i, j, k; - uint64_t pp, v, *s64, *d64, *top; - __m128i si, tables[16][8], p[8], v0, mask1; - struct gf_split_4_64_lazy_data *ld; - uint8_t btable[16]; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128); - 
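The _alt region routine above is the same composite identity restated as five base-field region operations over the split halves of an ALTMAP region: two passes build the low half of every product, three build the high half (the fifth folds in the s * val1 coefficient). A scalar rendering under the same toy-field assumptions as the previous sketch:

#include <stdint.h>
#include <stdio.h>

static uint8_t gf4_mult(uint8_t a, uint8_t b)
{
  uint8_t p = 0;
  for (int i = 0; i < 4; i++) {
    if (b & 1) p ^= a;
    b >>= 1;
    a <<= 1;
    if (a & 0x10) a ^= 0x13;
  }
  return p;
}

int main(void)
{
  /* hypothetical split layout: slow/shigh and dlow/dhigh are the two
   * halves of an ALTMAP region; s, val0, val1 as in the alt routine */
  enum { N = 4 };
  uint8_t slow[N] = {1, 7, 9, 14}, shigh[N] = {2, 0, 5, 11};
  uint8_t dlow[N] = {0}, dhigh[N] = {0};
  uint8_t s = 3, val0 = 0x6, val1 = 0xd;

  for (int i = 0; i < N; i++) {
    /* region passes 1+2: dlow = val0*slow ^ val1*shigh */
    dlow[i]  ^= gf4_mult(slow[i], val0) ^ gf4_mult(shigh[i], val1);
    /* region passes 3+4+5: dhigh = val1*slow ^ val0*shigh ^ (s*val1)*shigh */
    dhigh[i] ^= gf4_mult(slow[i], val1) ^ gf4_mult(shigh[i], val0)
              ^ gf4_mult(gf4_mult(s, val1), shigh[i]);
  }
  for (int i = 0; i < N; i++) printf("%x%x ", dhigh[i], dlow[i]);
  printf("\n");
  return 0;
}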
gf_do_initial_region_alignment(&rd); - - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - top = (uint64_t *) rd.d_top; - - ld = (struct gf_split_4_64_lazy_data *) h->private; - - v = val; - for (i = 0; i < 16; i++) { - ld->tables[i][0] = 0; - for (j = 1; j < 16; j <<= 1) { - for (k = 0; k < j; k++) { - ld->tables[i][k^j] = (v ^ ld->tables[i][k]); - } - v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); - } - for (j = 0; j < 8; j++) { - for (k = 0; k < 16; k++) { - btable[k] = (uint8_t) ld->tables[i][k]; - ld->tables[i][k] >>= 8; - } - tables[i][j] = _mm_loadu_si128((__m128i *) btable); - } - } - - mask1 = _mm_set1_epi8(0xf); - - while (d64 != top) { - - if (xor) { - for (i = 0; i < 8; i++) p[i] = _mm_load_si128 ((__m128i *) (d64+i*2)); - } else { - for (i = 0; i < 8; i++) p[i] = _mm_setzero_si128(); - } - i = 0; - for (k = 0; k < 8; k++) { - v0 = _mm_load_si128((__m128i *) s64); - /* MM_PRINT8("v", v0); */ - s64 += 2; - - si = _mm_and_si128(v0, mask1); - - for (j = 0; j < 8; j++) { - p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); - } - i++; - v0 = _mm_srli_epi32(v0, 4); - si = _mm_and_si128(v0, mask1); - for (j = 0; j < 8; j++) { - p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); - } - i++; - } - for (i = 0; i < 8; i++) { - /* MM_PRINT8("v", p[i]); */ - _mm_store_si128((__m128i *) d64, p[i]); - d64 += 2; - } - } - gf_do_final_region_alignment(&rd); -} -#endif - -#ifdef INTEL_SSE4 -static - void -gf_w64_split_4_64_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) -{ - gf_internal_t *h; - int i, j, k; - uint64_t pp, v, *s64, *d64, *top; - __m128i si, tables[16][8], p[8], st[8], mask1, mask8, mask16, t1; - struct gf_split_4_64_lazy_data *ld; - uint8_t btable[16]; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128); - gf_do_initial_region_alignment(&rd); - - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - top = (uint64_t *) rd.d_top; - - ld = (struct gf_split_4_64_lazy_data *) h->private; - - v = val; - for (i = 0; i < 16; i++) { - ld->tables[i][0] = 0; - for (j = 1; j < 16; j <<= 1) { - for (k = 0; k < j; k++) { - ld->tables[i][k^j] = (v ^ ld->tables[i][k]); - } - v = (v & GF_FIRST_BIT) ? 
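Both split 4,64 routines share the lazy table build that opens them: once it has run, tables[i][n] holds (n << 4i) * val, so a full 64-bit product is sixteen lookups XORed together; the SSE bodies perform those lookups with PSHUFB across eight byte planes. A scalar sketch of the build and the lookup (POLY and helper names are assumptions matching the deleted gf_w64_init):

#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

#define POLY 0x1bULL

static uint64_t times_two(uint64_t v)
{ return (v << 1) ^ ((uint64_t)(-(int64_t)(v >> 63)) & POLY); }

/* lazy build: tables[i][n] == (n << 4*i) * val in GF(2^64) */
static void build(uint64_t val, uint64_t tables[16][16])
{
  uint64_t v = val;
  for (int i = 0; i < 16; i++) {
    tables[i][0] = 0;
    for (int j = 1; j < 16; j <<= 1) {   /* same loop shape as above */
      for (int k = 0; k < j; k++) tables[i][k ^ j] = v ^ tables[i][k];
      v = times_two(v);
    }
  }
}

static uint64_t split4_mult(uint64_t a, uint64_t tables[16][16])
{
  uint64_t p = 0;
  for (int i = 0; i < 16; i++) p ^= tables[i][(a >> (4 * i)) & 0xf];
  return p;
}

int main(void)
{
  static uint64_t tables[16][16];
  build(0x123456789abcdef0ULL, tables);
  printf("%016" PRIx64 "\n", split4_mult(0xfedcba9876543210ULL, tables));
  return 0;
}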
((v << 1) ^ pp) : (v << 1); - } - for (j = 0; j < 8; j++) { - for (k = 0; k < 16; k++) { - btable[k] = (uint8_t) ld->tables[i][k]; - ld->tables[i][k] >>= 8; - } - tables[i][j] = _mm_loadu_si128((__m128i *) btable); - } - } - - mask1 = _mm_set1_epi8(0xf); - mask8 = _mm_set1_epi16(0xff); - mask16 = _mm_set1_epi32(0xffff); - - while (d64 != top) { - - for (i = 0; i < 8; i++) p[i] = _mm_setzero_si128(); - - for (k = 0; k < 8; k++) { - st[k] = _mm_load_si128((__m128i *) s64); - s64 += 2; - } - - for (k = 0; k < 4; k ++) { - st[k] = _mm_shuffle_epi32(st[k], _MM_SHUFFLE(3,1,2,0)); - st[k+4] = _mm_shuffle_epi32(st[k+4], _MM_SHUFFLE(2,0,3,1)); - t1 = _mm_blend_epi16(st[k], st[k+4], 0xf0); - st[k] = _mm_srli_si128(st[k], 8); - st[k+4] = _mm_slli_si128(st[k+4], 8); - st[k+4] = _mm_blend_epi16(st[k], st[k+4], 0xf0); - st[k] = t1; - } - -/* - printf("After pack pass 1\n"); - for (k = 0; k < 8; k++) { - MM_PRINT8("v", st[k]); - } - printf("\n"); - */ - - t1 = _mm_packus_epi32(_mm_and_si128(st[0], mask16), _mm_and_si128(st[2], mask16)); - st[2] = _mm_packus_epi32(_mm_srli_epi32(st[0], 16), _mm_srli_epi32(st[2], 16)); - st[0] = t1; - t1 = _mm_packus_epi32(_mm_and_si128(st[1], mask16), _mm_and_si128(st[3], mask16)); - st[3] = _mm_packus_epi32(_mm_srli_epi32(st[1], 16), _mm_srli_epi32(st[3], 16)); - st[1] = t1; - t1 = _mm_packus_epi32(_mm_and_si128(st[4], mask16), _mm_and_si128(st[6], mask16)); - st[6] = _mm_packus_epi32(_mm_srli_epi32(st[4], 16), _mm_srli_epi32(st[6], 16)); - st[4] = t1; - t1 = _mm_packus_epi32(_mm_and_si128(st[5], mask16), _mm_and_si128(st[7], mask16)); - st[7] = _mm_packus_epi32(_mm_srli_epi32(st[5], 16), _mm_srli_epi32(st[7], 16)); - st[5] = t1; - -/* - printf("After pack pass 2\n"); - for (k = 0; k < 8; k++) { - MM_PRINT8("v", st[k]); - } - printf("\n"); - */ - t1 = _mm_packus_epi16(_mm_and_si128(st[0], mask8), _mm_and_si128(st[1], mask8)); - st[1] = _mm_packus_epi16(_mm_srli_epi16(st[0], 8), _mm_srli_epi16(st[1], 8)); - st[0] = t1; - t1 = _mm_packus_epi16(_mm_and_si128(st[2], mask8), _mm_and_si128(st[3], mask8)); - st[3] = _mm_packus_epi16(_mm_srli_epi16(st[2], 8), _mm_srli_epi16(st[3], 8)); - st[2] = t1; - t1 = _mm_packus_epi16(_mm_and_si128(st[4], mask8), _mm_and_si128(st[5], mask8)); - st[5] = _mm_packus_epi16(_mm_srli_epi16(st[4], 8), _mm_srli_epi16(st[5], 8)); - st[4] = t1; - t1 = _mm_packus_epi16(_mm_and_si128(st[6], mask8), _mm_and_si128(st[7], mask8)); - st[7] = _mm_packus_epi16(_mm_srli_epi16(st[6], 8), _mm_srli_epi16(st[7], 8)); - st[6] = t1; - -/* - printf("After final pack pass 2\n"); - for (k = 0; k < 8; k++) { - MM_PRINT8("v", st[k]); - } - */ - i = 0; - for (k = 0; k < 8; k++) { - si = _mm_and_si128(st[k], mask1); - - for (j = 0; j < 8; j++) { - p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); - } - i++; - st[k] = _mm_srli_epi32(st[k], 4); - si = _mm_and_si128(st[k], mask1); - for (j = 0; j < 8; j++) { - p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); - } - i++; - } - - t1 = _mm_unpacklo_epi8(p[0], p[1]); - p[1] = _mm_unpackhi_epi8(p[0], p[1]); - p[0] = t1; - t1 = _mm_unpacklo_epi8(p[2], p[3]); - p[3] = _mm_unpackhi_epi8(p[2], p[3]); - p[2] = t1; - t1 = _mm_unpacklo_epi8(p[4], p[5]); - p[5] = _mm_unpackhi_epi8(p[4], p[5]); - p[4] = t1; - t1 = _mm_unpacklo_epi8(p[6], p[7]); - p[7] = _mm_unpackhi_epi8(p[6], p[7]); - p[6] = t1; - -/* - printf("After unpack pass 1:\n"); - for (i = 0; i < 8; i++) { - MM_PRINT8("v", p[i]); - } - */ - - t1 = _mm_unpacklo_epi16(p[0], p[2]); - p[2] = _mm_unpackhi_epi16(p[0], p[2]); - p[0] = t1; - t1 = 
_mm_unpacklo_epi16(p[1], p[3]); - p[3] = _mm_unpackhi_epi16(p[1], p[3]); - p[1] = t1; - t1 = _mm_unpacklo_epi16(p[4], p[6]); - p[6] = _mm_unpackhi_epi16(p[4], p[6]); - p[4] = t1; - t1 = _mm_unpacklo_epi16(p[5], p[7]); - p[7] = _mm_unpackhi_epi16(p[5], p[7]); - p[5] = t1; - -/* - printf("After unpack pass 2:\n"); - for (i = 0; i < 8; i++) { - MM_PRINT8("v", p[i]); - } - */ - - t1 = _mm_unpacklo_epi32(p[0], p[4]); - p[4] = _mm_unpackhi_epi32(p[0], p[4]); - p[0] = t1; - t1 = _mm_unpacklo_epi32(p[1], p[5]); - p[5] = _mm_unpackhi_epi32(p[1], p[5]); - p[1] = t1; - t1 = _mm_unpacklo_epi32(p[2], p[6]); - p[6] = _mm_unpackhi_epi32(p[2], p[6]); - p[2] = t1; - t1 = _mm_unpacklo_epi32(p[3], p[7]); - p[7] = _mm_unpackhi_epi32(p[3], p[7]); - p[3] = t1; - - if (xor) { - for (i = 0; i < 8; i++) { - t1 = _mm_load_si128((__m128i *) d64); - _mm_store_si128((__m128i *) d64, _mm_xor_si128(p[i], t1)); - d64 += 2; - } - } else { - for (i = 0; i < 8; i++) { - _mm_store_si128((__m128i *) d64, p[i]); - d64 += 2; - } - } - - } - - gf_do_final_region_alignment(&rd); -} -#endif - -#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1); - -static -int gf_w64_split_init(gf_t *gf) -{ - gf_internal_t *h; - struct gf_split_4_64_lazy_data *d4; - struct gf_split_8_64_lazy_data *d8; - struct gf_split_8_8_data *d88; - struct gf_split_16_64_lazy_data *d16; - uint64_t p, basep; - int exp, i, j; - - h = (gf_internal_t *) gf->scratch; - - /* Defaults */ - - gf->multiply_region.w64 = gf_w64_multiply_region_from_single; - - gf->multiply.w64 = gf_w64_bytwo_p_multiply; - -#if defined(INTEL_SSE4_PCLMUL) - if ((!(h->region_type & GF_REGION_NOSSE) && - (h->arg1 == 64 || h->arg2 == 64)) || - h->mult_type == GF_MULT_DEFAULT){ - - if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ - gf->multiply.w64 = gf_w64_clm_multiply_2; - gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2; - }else if((0xfffe000000000000ULL & h->prim_poly) == 0){ - gf->multiply.w64 = gf_w64_clm_multiply_4; - gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_4; - }else{ - return 0; - } - } -#endif - - gf->inverse.w64 = gf_w64_euclid; - - /* Allen: set region pointers for default mult type. Single pointers are - * taken care of above (explicitly for sse, implicitly for no sse). 
*/ - -#ifdef INTEL_SSE4 - if (h->mult_type == GF_MULT_DEFAULT) { - d4 = (struct gf_split_4_64_lazy_data *) h->private; - d4->last_value = 0; - gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; - } -#else - if (h->mult_type == GF_MULT_DEFAULT) { - d8 = (struct gf_split_8_64_lazy_data *) h->private; - d8->last_value = 0; - gf->multiply_region.w64 = gf_w64_split_8_64_lazy_multiply_region; - } -#endif - - if ((h->arg1 == 4 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 4)) { - d4 = (struct gf_split_4_64_lazy_data *) h->private; - d4->last_value = 0; - - if((h->region_type & GF_REGION_ALTMAP) && (h->region_type & GF_REGION_NOSSE)) return 0; - if(h->region_type & GF_REGION_ALTMAP) - { - #ifdef INTEL_SSSE3 - gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_altmap_multiply_region; - #else - return 0; - #endif - } - else //no altmap - { - #ifdef INTEL_SSE4 - if(h->region_type & GF_REGION_NOSSE) - gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region; - else - gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; - #else - gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region; - if(h->region_type & GF_REGION_SSE) - return 0; - #endif - } - } - if ((h->arg1 == 8 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 8)) { - d8 = (struct gf_split_8_64_lazy_data *) h->private; - d8->last_value = 0; - gf->multiply_region.w64 = gf_w64_split_8_64_lazy_multiply_region; - } - if ((h->arg1 == 16 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 16)) { - d16 = (struct gf_split_16_64_lazy_data *) h->private; - d16->last_value = 0; - gf->multiply_region.w64 = gf_w64_split_16_64_lazy_multiply_region; - } - if ((h->arg1 == 8 && h->arg2 == 8)) { - d88 = (struct gf_split_8_8_data *) h->private; - gf->multiply.w64 = gf_w64_split_8_8_multiply; - - /* The performance of this guy sucks, so don't bother with a region op */ - - basep = 1; - for (exp = 0; exp < 15; exp++) { - for (j = 0; j < 256; j++) d88->tables[exp][0][j] = 0; - for (i = 0; i < 256; i++) d88->tables[exp][i][0] = 0; - d88->tables[exp][1][1] = basep; - for (i = 2; i < 256; i++) { - if (i&1) { - p = d88->tables[exp][i^1][1]; - d88->tables[exp][i][1] = p ^ basep; - } else { - p = d88->tables[exp][i>>1][1]; - d88->tables[exp][i][1] = GF_MULTBY_TWO(p); - } - } - for (i = 1; i < 256; i++) { - p = d88->tables[exp][i][1]; - for (j = 1; j < 256; j++) { - if (j&1) { - d88->tables[exp][i][j] = d88->tables[exp][i][j^1] ^ p; - } else { - d88->tables[exp][i][j] = GF_MULTBY_TWO(d88->tables[exp][i][j>>1]); - } - } - } - for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep); - } - } - return 1; -} - -int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) -{ - switch(mult_type) - { - case GF_MULT_SHIFT: - return sizeof(gf_internal_t); - break; - case GF_MULT_CARRY_FREE: - return sizeof(gf_internal_t); - break; - case GF_MULT_BYTWO_p: - case GF_MULT_BYTWO_b: - return sizeof(gf_internal_t); - break; - - case GF_MULT_DEFAULT: - - /* Allen: set the *local* arg1 and arg2, just for scratch size purposes, - * then fall through to split table scratch size code. 
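For the split 8,8 case above, the fifteen exponent tables satisfy tables[exp][i][j] == (i*j) * x^(8*exp), so a product is 64 lookups indexed by byte pairs whose positions sum to exp -- table-build cost is why the deleted comment is content to skip a region op. A sketch reproducing the same doubling recurrence (POLY and names assumed, as before):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>

#define POLY 0x1bULL

static uint64_t times_two(uint64_t v)
{ return (v << 1) ^ ((uint64_t)(-(int64_t)(v >> 63)) & POLY); }

int main(void)
{
  /* tables[exp][i][j] == (i * j) * x^(8*exp); exp runs 0..14 because the
   * two operand byte positions sum to at most 7+7 */
  uint64_t (*tables)[256][256] = malloc(15 * sizeof *tables);
  if (!tables) return 1;

  uint64_t basep = 1;                      /* x^(8*exp) */
  for (int exp = 0; exp < 15; exp++) {
    for (int i = 0; i < 256; i++) { tables[exp][i][0] = 0; tables[exp][0][i] = 0; }
    tables[exp][1][1] = basep;
    for (int i = 2; i < 256; i++)          /* column 1 by double-or-add */
      tables[exp][i][1] = (i & 1) ? (tables[exp][i ^ 1][1] ^ basep)
                                  : times_two(tables[exp][i >> 1][1]);
    for (int i = 1; i < 256; i++)          /* each row from its column-1 entry */
      for (int j = 2; j < 256; j++)
        tables[exp][i][j] = (j & 1) ? (tables[exp][i][j ^ 1] ^ tables[exp][i][1])
                                    : times_two(tables[exp][i][j >> 1]);
    for (int i = 0; i < 8; i++) basep = times_two(basep);
  }

  uint64_t a = 0x0123456789abcdefULL, b = 0xfedcba9876543210ULL, p = 0;
  for (int i = 0; i < 8; i++)
    for (int j = 0; j < 8; j++)
      p ^= tables[i + j][(a >> 8 * i) & 0xff][(b >> 8 * j) & 0xff];
  printf("%016" PRIx64 "\n", p);
  free(tables);
  return 0;
}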
*/ - -#ifdef INTEL_SSE4 - arg1 = 64; - arg2 = 4; -#else - arg1 = 64; - arg2 = 8; -#endif - - case GF_MULT_SPLIT_TABLE: - if (arg1 == 8 && arg2 == 8) { - return sizeof(gf_internal_t) + sizeof(struct gf_split_8_8_data) + 64; - } - if ((arg1 == 16 && arg2 == 64) || (arg2 == 16 && arg1 == 64)) { - return sizeof(gf_internal_t) + sizeof(struct gf_split_16_64_lazy_data) + 64; - } - if ((arg1 == 8 && arg2 == 64) || (arg2 == 8 && arg1 == 64)) { - return sizeof(gf_internal_t) + sizeof(struct gf_split_8_64_lazy_data) + 64; - } - - if ((arg1 == 64 && arg2 == 4) || (arg1 == 4 && arg2 == 64)) { - return sizeof(gf_internal_t) + sizeof(struct gf_split_4_64_lazy_data) + 64; - } - return 0; - case GF_MULT_GROUP: - return sizeof(gf_internal_t) + sizeof(struct gf_w64_group_data) + - sizeof(uint64_t) * (1 << arg1) + - sizeof(uint64_t) * (1 << arg2) + 64; - break; - case GF_MULT_COMPOSITE: - if (arg1 == 2) return sizeof(gf_internal_t) + 64; - return 0; - break; - default: - return 0; - } -} - -int gf_w64_init(gf_t *gf) -{ - gf_internal_t *h; - int no_default_flag = 0; - - h = (gf_internal_t *) gf->scratch; - - /* Allen: set default primitive polynomial / irreducible polynomial if needed */ - - /* Omitting the leftmost 1 as in w=32 */ - - if (h->prim_poly == 0) { - if (h->mult_type == GF_MULT_COMPOSITE) { - h->prim_poly = gf_composite_get_default_poly(h->base_gf); - if (h->prim_poly == 0) return 0; /* This shouldn't happen */ - } else { - h->prim_poly = 0x1b; - } - if (no_default_flag == 1) { - fprintf(stderr,"Code contains no default irreducible polynomial for given base field\n"); - return 0; - } - } - - gf->multiply.w64 = NULL; - gf->divide.w64 = NULL; - gf->inverse.w64 = NULL; - gf->multiply_region.w64 = NULL; - - switch(h->mult_type) { - case GF_MULT_CARRY_FREE: if (gf_w64_cfm_init(gf) == 0) return 0; break; - case GF_MULT_SHIFT: if (gf_w64_shift_init(gf) == 0) return 0; break; - case GF_MULT_COMPOSITE: if (gf_w64_composite_init(gf) == 0) return 0; break; - case GF_MULT_DEFAULT: - case GF_MULT_SPLIT_TABLE: if (gf_w64_split_init(gf) == 0) return 0; break; - case GF_MULT_GROUP: if (gf_w64_group_init(gf) == 0) return 0; break; - case GF_MULT_BYTWO_p: - case GF_MULT_BYTWO_b: if (gf_w64_bytwo_init(gf) == 0) return 0; break; - default: return 0; - } - if (h->divide_type == GF_DIVIDE_EUCLID) { - gf->divide.w64 = gf_w64_divide_from_inverse; - gf->inverse.w64 = gf_w64_euclid; - } - - if (gf->inverse.w64 != NULL && gf->divide.w64 == NULL) { - gf->divide.w64 = gf_w64_divide_from_inverse; - } - if (gf->inverse.w64 == NULL && gf->divide.w64 != NULL) { - gf->inverse.w64 = gf_w64_inverse_from_divide; - } - - if (h->region_type == GF_REGION_CAUCHY) return 0; - - if (h->region_type & GF_REGION_ALTMAP) { - if (h->mult_type == GF_MULT_COMPOSITE) { - gf->extract_word.w64 = gf_w64_composite_extract_word; - } else if (h->mult_type == GF_MULT_SPLIT_TABLE) { - gf->extract_word.w64 = gf_w64_split_extract_word; - } - } else { - gf->extract_word.w64 = gf_w64_extract_word; - } - - return 1; -} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w8.c b/src/erasure-code/jerasure/gf-complete/src/gf_w8.c deleted file mode 100644 index 89ef6a2eda67a..0000000000000 --- a/src/erasure-code/jerasure/gf-complete/src/gf_w8.c +++ /dev/null @@ -1,2456 +0,0 @@ -/* - * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic - * James S. Plank, Ethan L. Miller, Kevin M. Greenan, - * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. 
- * - * gf_w8.c - * - * Routines for 8-bit Galois fields - */ - -#include "gf_int.h" -#include -#include - -#define GF_FIELD_WIDTH (8) -#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH) -#define GF_HALF_SIZE (1 << (GF_FIELD_WIDTH/2)) -#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1 - -#define GF_BASE_FIELD_WIDTH (4) -#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) - -struct gf_w8_logtable_data { - uint8_t log_tbl[GF_FIELD_SIZE]; - uint8_t antilog_tbl[GF_FIELD_SIZE * 2]; - uint8_t inv_tbl[GF_FIELD_SIZE]; -}; - -struct gf_w8_logzero_table_data { - short log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */ - uint8_t antilog_tbl[512+512+1]; - uint8_t *div_tbl; - uint8_t *inv_tbl; -}; - -struct gf_w8_logzero_small_table_data { - short log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */ - uint8_t antilog_tbl[255*3]; - uint8_t inv_tbl[GF_FIELD_SIZE]; - uint8_t *div_tbl; -}; - -struct gf_w8_composite_data { - uint8_t *mult_table; -}; - -/* Don't change the order of these relative to gf_w8_half_table_data */ - -struct gf_w8_default_data { - uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE]; - uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE]; - uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE]; -}; - -struct gf_w8_half_table_data { - uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE]; - uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE]; -}; - -struct gf_w8_single_table_data { - uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE]; -}; - -struct gf_w8_double_table_data { - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint16_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE]; -}; - -struct gf_w8_double_table_lazy_data { - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint16_t mult[GF_FIELD_SIZE*GF_FIELD_SIZE]; -}; - -struct gf_w4_logtable_data { - uint8_t log_tbl[GF_BASE_FIELD_SIZE]; - uint8_t antilog_tbl[GF_BASE_FIELD_SIZE * 2]; - uint8_t *antilog_tbl_div; -}; - -struct gf_w4_single_table_data { - uint8_t div[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE]; - uint8_t mult[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE]; -}; - -struct gf_w8_bytwo_data { - uint64_t prim_poly; - uint64_t mask1; - uint64_t mask2; -}; - -#define AB2(ip, am1 ,am2, b, t1, t2) {\ - t1 = (b << 1) & am1;\ - t2 = b & am2; \ - t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \ - b = (t1 ^ (t2 & ip));} - -#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\ - t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \ - t2 = _mm_and_si128(va, m2); \ - t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \ - va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); } - -#define MM_PRINT(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 2) printf(" %02x %02x", blah[15-ii], blah[14-ii]); printf("\n"); } - -static -inline -uint32_t gf_w8_inverse_from_divide (gf_t *gf, uint32_t a) -{ - return gf->divide.w32(gf, 1, a); -} - -static -inline -uint32_t gf_w8_divide_from_inverse (gf_t *gf, uint32_t a, uint32_t b) -{ - b = gf->inverse.w32(gf, b); - return gf->multiply.w32(gf, a, b); -} - -static -inline -uint32_t gf_w8_euclid (gf_t *gf, uint32_t b) -{ - uint32_t e_i, e_im1, e_ip1; - uint32_t d_i, d_im1, d_ip1; - uint32_t y_i, y_im1, y_ip1; - uint32_t c_i; - - if (b == 0) return -1; - e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; - e_i = b; - d_im1 = 8; - for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ; - y_i = 1; - y_im1 = 0; - - while 
(e_i != 1) { - - e_ip1 = e_im1; - d_ip1 = d_im1; - c_i = 0; - - while (d_ip1 >= d_i) { - c_i ^= (1 << (d_ip1 - d_i)); - e_ip1 ^= (e_i << (d_ip1 - d_i)); - if (e_ip1 == 0) return 0; - while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; - } - - y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i); - y_im1 = y_i; - y_i = y_ip1; - - e_im1 = e_i; - d_im1 = d_i; - e_i = e_ip1; - d_i = d_ip1; - } - - return y_i; -} - -static -gf_val_32_t gf_w8_extract_word(gf_t *gf, void *start, int bytes, int index) -{ - uint8_t *r8; - - r8 = (uint8_t *) start; - return r8[index]; -} - -static -gf_val_32_t gf_w8_composite_extract_word(gf_t *gf, void *start, int bytes, int index) -{ - int sub_size; - gf_internal_t *h; - uint8_t *r8, *top; - uint8_t a, b; - gf_region_data rd; - - h = (gf_internal_t *) gf->scratch; - gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32); - r8 = (uint8_t *) start; - if (r8 + index < (uint8_t *) rd.d_start) return r8[index]; - if (r8 + index >= (uint8_t *) rd.d_top) return r8[index]; - index -= (((uint8_t *) rd.d_start) - r8); - r8 = (uint8_t *) rd.d_start; - top = (uint8_t *) rd.d_top; - sub_size = (top-r8)/2; - - a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index); - b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index); - return (a | (b << 4)); -} - -static -inline -uint32_t gf_w8_matrix (gf_t *gf, uint32_t b) -{ - return gf_bitmatrix_inverse(b, 8, ((gf_internal_t *) (gf->scratch))->prim_poly); -} - - -static -inline -gf_val_32_t -gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) -{ - gf_val_32_t rv = 0; - -#if defined(INTEL_SSE4_PCLMUL) - - __m128i a, b; - __m128i result; - __m128i prim_poly; - __m128i w; - gf_internal_t * h = gf->scratch; - - a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0); - b = _mm_insert_epi32 (a, b8, 0); - - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); - - /* Do the initial multiply */ - - result = _mm_clmulepi64_si128 (a, b, 0); - - /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only - have to do the reduction at most twice, because (w-2)/z == 2. Where - z is equal to the number of zeros after the leading 1 - - _mm_clmulepi64_si128 is the carryless multiply operation. Here - _mm_srli_si128 shifts the result to the right by 1 byte. This allows - us to multiply the prim_poly by the leading bits of the result. We - then xor the result of that operation back with the result.*/ - - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - - /* Extracts 32 bit value from result. 
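gf_w8_euclid above is the polynomial extended Euclidean algorithm: the e variables track remainders starting from the primitive polynomial and b, y tracks the Bezout coefficient, and the inner loop is one long-division step. A hedged standalone version for w=8 (0x11d and the helper names are illustrative; the original threads the c_i * y_i product through gf->multiply instead):

#include <stdint.h>
#include <stdio.h>

static uint8_t gf8_mult(uint8_t a, uint8_t b)      /* shift multiply, 0x11d */
{
  uint16_t p = 0;
  for (int i = 0; i < 8; i++)
    if (a & (1 << i)) p ^= (uint16_t)(b << i);
  for (int i = 14; i >= 8; i--)
    if (p & (1 << i)) p ^= (uint16_t)(0x11d << (i - 8));
  return (uint8_t)p;
}

static int deg(uint32_t v) { int d = -1; while (v) { d++; v >>= 1; } return d; }

static uint8_t gf8_euclid_inverse(uint8_t b)
{
  if (b < 2) return b;               /* 0 has no inverse; 1 is its own */
  uint32_t e_im1 = 0x11d, e_i = b;   /* remainder sequence */
  uint8_t  y_im1 = 0, y_i = 1;       /* Bezout coefficients */
  while (e_i != 1) {
    uint32_t e_ip1 = e_im1, c_i = 0; /* c_i: quotient of one division */
    int d_i = deg(e_i);
    while (deg(e_ip1) >= d_i) {      /* one polynomial long-division step */
      int shift = deg(e_ip1) - d_i;
      c_i ^= 1u << shift;
      e_ip1 ^= e_i << shift;
    }
    uint8_t y_ip1 = (uint8_t)(y_im1 ^ gf8_mult((uint8_t)c_i, y_i));
    y_im1 = y_i; y_i = y_ip1;
    e_im1 = e_i; e_i = e_ip1;
  }
  return y_i;
}

int main(void)
{
  uint8_t a = 0x53, inv = gf8_euclid_inverse(a);
  printf("inv(0x53) = 0x%02x, a*inv == 1: %d\n", inv, gf8_mult(a, inv) == 1);
  return 0;
}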
*/ - - rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); - -#endif - return rv; -} - -static -inline -gf_val_32_t -gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) -{ - gf_val_32_t rv = 0; - -#if defined(INTEL_SSE4_PCLMUL) - - __m128i a, b; - __m128i result; - __m128i prim_poly; - __m128i w; - gf_internal_t * h = gf->scratch; - - a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0); - b = _mm_insert_epi32 (a, b8, 0); - - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); - - /* Do the initial multiply */ - - result = _mm_clmulepi64_si128 (a, b, 0); - - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - - /* Extracts 32 bit value from result. */ - - rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); - -#endif - return rv; -} - -static -inline -gf_val_32_t -gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) -{ - gf_val_32_t rv = 0; - -#if defined(INTEL_SSE4_PCLMUL) - - __m128i a, b; - __m128i result; - __m128i prim_poly; - __m128i w; - gf_internal_t * h = gf->scratch; - - a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0); - b = _mm_insert_epi32 (a, b8, 0); - - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); - - /* Do the initial multiply */ - - result = _mm_clmulepi64_si128 (a, b, 0); - - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - - /* Extracts 32 bit value from result. 
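The clm_multiply_2/3/4 variants above differ only in how many reduction folds follow the carry-less product; Ben's comment gives the bound (w-2)/z, where z is the zero run above the polynomial's leading 1. For 0x11d that run is z == 3, so two folds suffice. A portable model of the per-byte arithmetic the PCLMUL sequence performs (names assumed):

#include <stdint.h>
#include <stdio.h>

/* carry-less (XOR) multiply, wide enough for an 8x9-bit product */
static uint32_t clmul(uint32_t a, uint32_t b)
{
  uint32_t r = 0;
  for (int i = 0; i < 16; i++)
    if (a & (1u << i)) r ^= b << i;
  return r;
}

static uint8_t gf8_clm(uint8_t a, uint8_t b)
{
  uint32_t r = clmul(a, b);       /* 15-bit product */
  r ^= clmul(0x11d, r >> 8);      /* fold 1: clears bits 8..14, may re-set 8..10 */
  r ^= clmul(0x11d, r >> 8);      /* fold 2: clears what fold 1 re-created */
  return (uint8_t)r;
}

int main(void)
{
  printf("0x53 * 0xca = 0x%02x\n", gf8_clm(0x53, 0xca));
  return 0;
}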
*/ - rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); - -#endif - return rv; -} - - -static -void -gf_w8_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int - xor) -{ - gf_region_data rd; - uint8_t *s8; - uint8_t *d8; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); - gf_do_initial_region_alignment(&rd); - - s8 = (uint8_t *) rd.s_start; - d8 = (uint8_t *) rd.d_start; - - if (xor) { - while (d8 < ((uint8_t *) rd.d_top)) { - *d8 ^= gf->multiply.w32(gf, val, *s8); - d8++; - s8++; - } - } else { - while (d8 < ((uint8_t *) rd.d_top)) { - *d8 = gf->multiply.w32(gf, val, *s8); - d8++; - s8++; - } - } - gf_do_final_region_alignment(&rd); -} - -#if defined(INTEL_SSE4_PCLMUL) -static -void -gf_w8_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int - xor) -{ - gf_region_data rd; - uint8_t *s8; - uint8_t *d8; - - __m128i a, b; - __m128i result; - __m128i prim_poly; - __m128i w; - gf_internal_t * h = gf->scratch; - - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); - gf_do_initial_region_alignment(&rd); - - s8 = (uint8_t *) rd.s_start; - d8 = (uint8_t *) rd.d_start; - - if (xor) { - while (d8 < ((uint8_t *) rd.d_top)) { - b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); - result = _mm_clmulepi64_si128 (a, b, 0); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); - d8++; - s8++; - } - } else { - while (d8 < ((uint8_t *) rd.d_top)) { - b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); - result = _mm_clmulepi64_si128 (a, b, 0); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); - d8++; - s8++; - } - } - gf_do_final_region_alignment(&rd); -} -#endif - -#if defined(INTEL_SSE4_PCLMUL) -static -void -gf_w8_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int - xor) -{ - gf_region_data rd; - uint8_t *s8; - uint8_t *d8; - - __m128i a, b; - __m128i result; - __m128i prim_poly; - __m128i w; - gf_internal_t * h = gf->scratch; - - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); - gf_do_initial_region_alignment(&rd); - - s8 = (uint8_t *) rd.s_start; - d8 = (uint8_t *) rd.d_start; - - if (xor) { - while (d8 < ((uint8_t *) rd.d_top)) { - b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); - result = _mm_clmulepi64_si128 (a, b, 0); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, 
_mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); - d8++; - s8++; - } - } else { - while (d8 < ((uint8_t *) rd.d_top)) { - b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); - result = _mm_clmulepi64_si128 (a, b, 0); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); - d8++; - s8++; - } - } - gf_do_final_region_alignment(&rd); -} -#endif - -#if defined(INTEL_SSE4_PCLMUL) -static -void -gf_w8_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int - xor) -{ - gf_region_data rd; - uint8_t *s8; - uint8_t *d8; - - __m128i a, b; - __m128i result; - __m128i prim_poly; - __m128i w; - gf_internal_t * h = gf->scratch; - - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); - gf_do_initial_region_alignment(&rd); - - s8 = (uint8_t *) rd.s_start; - d8 = (uint8_t *) rd.d_start; - - if (xor) { - while (d8 < ((uint8_t *) rd.d_top)) { - b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); - result = _mm_clmulepi64_si128 (a, b, 0); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); - d8++; - s8++; - } - } else { - while (d8 < ((uint8_t *) rd.d_top)) { - b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); - result = _mm_clmulepi64_si128 (a, b, 0); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); - result = _mm_xor_si128 (result, w); - *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); - d8++; - s8++; - } - } - gf_do_final_region_alignment(&rd); -} -#endif - -/* ------------------------------------------------------------ -IMPLEMENTATION: SHIFT: - -JSP: The world's dumbest multiplication algorithm. I only -include it for completeness. It does have the feature that it requires no -extra memory. 
- */ - -static -inline - uint32_t -gf_w8_shift_multiply (gf_t *gf, uint32_t a8, uint32_t b8) -{ - uint16_t product, i, pp, a, b; - gf_internal_t *h; - - a = a8; - b = b8; - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - product = 0; - - for (i = 0; i < GF_FIELD_WIDTH; i++) { - if (a & (1 << i)) product ^= (b << i); - } - for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) { - if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); - } - return product; -} - -static -int gf_w8_cfm_init(gf_t *gf) -{ -#if defined(INTEL_SSE4_PCLMUL) - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - - if ((0xe0 & h->prim_poly) == 0){ - gf->multiply.w32 = gf_w8_clm_multiply_2; - gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_2; - }else if ((0xc0 & h->prim_poly) == 0){ - gf->multiply.w32 = gf_w8_clm_multiply_3; - gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_3; - }else if ((0x80 & h->prim_poly) == 0){ - gf->multiply.w32 = gf_w8_clm_multiply_4; - gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_4; - }else{ - return 0; - } - return 1; -#endif - - return 0; - -} - -static -int gf_w8_shift_init(gf_t *gf) -{ - gf->multiply.w32 = gf_w8_shift_multiply; /* The others will be set automatically */ - return 1; -} - -/* ------------------------------------------------------------ -IMPLEMENTATION: LOG_TABLE: - -JSP: Kevin wrote this, and I'm converting it to my structure. -*/ - -static -inline - uint32_t -gf_w8_logzero_multiply (gf_t *gf, uint32_t a, uint32_t b) -{ - struct gf_w8_logzero_table_data *ltd; - - ltd = (struct gf_w8_logzero_table_data *) ((gf_internal_t *) gf->scratch)->private; - return ltd->antilog_tbl[ltd->log_tbl[a] + ltd->log_tbl[b]]; -} - -static -inline - uint32_t -gf_w8_logzero_divide (gf_t *gf, uint32_t a, uint32_t b) -{ - struct gf_w8_logzero_table_data *ltd; - - ltd = (struct gf_w8_logzero_table_data *) ((gf_internal_t *) gf->scratch)->private; - return ltd->div_tbl[ltd->log_tbl[a] - ltd->log_tbl[b]]; -} - -static -inline - uint32_t -gf_w8_logzero_small_multiply (gf_t *gf, uint32_t a, uint32_t b) -{ - struct gf_w8_logzero_small_table_data *std; - - std = (struct gf_w8_logzero_small_table_data *) ((gf_internal_t *) gf->scratch)->private; - if (b == 0) return 0; - return std->antilog_tbl[std->log_tbl[a] + std->log_tbl[b]]; -} - -static -inline - uint32_t -gf_w8_logzero_small_divide (gf_t *gf, uint32_t a, uint32_t b) -{ - struct gf_w8_logzero_small_table_data *std; - - std = (struct gf_w8_logzero_small_table_data *) ((gf_internal_t *) gf->scratch)->private; - return std->div_tbl[std->log_tbl[a] - std->log_tbl[b]]; -} - -static -inline - uint32_t -gf_w8_log_multiply (gf_t *gf, uint32_t a, uint32_t b) -{ - struct gf_w8_logtable_data *ltd; - - ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private; - return (a == 0 || b == 0) ? 
0 : ltd->antilog_tbl[(unsigned)(ltd->log_tbl[a] + ltd->log_tbl[b])]; -} - -static -inline - uint32_t -gf_w8_log_divide (gf_t *gf, uint32_t a, uint32_t b) -{ - int log_sum = 0; - struct gf_w8_logtable_data *ltd; - - if (a == 0 || b == 0) return 0; - ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private; - - log_sum = ltd->log_tbl[a] - ltd->log_tbl[b] + (GF_MULT_GROUP_SIZE); - return (ltd->antilog_tbl[log_sum]); -} - -static - uint32_t -gf_w8_log_inverse (gf_t *gf, uint32_t a) -{ - struct gf_w8_logtable_data *ltd; - - ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private; - return (ltd->inv_tbl[a]); -} - -static - uint32_t -gf_w8_logzero_inverse (gf_t *gf, uint32_t a) -{ - struct gf_w8_logzero_table_data *ltd; - - ltd = (struct gf_w8_logzero_table_data *) ((gf_internal_t *) gf->scratch)->private; - return (ltd->inv_tbl[a]); -} - -static - uint32_t -gf_w8_logzero_small_inverse (gf_t *gf, uint32_t a) -{ - struct gf_w8_logzero_small_table_data *std; - - std = (struct gf_w8_logzero_small_table_data *) ((gf_internal_t *) gf->scratch)->private; - return (std->inv_tbl[a]); -} - -static - void -gf_w8_log_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) -{ - int i; - uint8_t lv; - uint8_t *s8, *d8; - struct gf_w8_logtable_data *ltd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private; - s8 = (uint8_t *) src; - d8 = (uint8_t *) dest; - - lv = ltd->log_tbl[val]; - - if (xor) { - for (i = 0; i < bytes; i++) { - d8[i] ^= (s8[i] == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[s8[i]]]); - } - } else { - for (i = 0; i < bytes; i++) { - d8[i] = (s8[i] == 0 ? 
0 : ltd->antilog_tbl[lv + ltd->log_tbl[s8[i]]]); - } - } -} - -static - void -gf_w8_logzero_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) -{ - int i; - uint8_t lv; - uint8_t *s8, *d8; - struct gf_w8_logzero_table_data *ltd; - struct gf_w8_logzero_small_table_data *std; - short *log; - uint8_t *alt; - gf_internal_t *h; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - h = (gf_internal_t *) gf->scratch; - - if (h->arg1 == 1) { - std = (struct gf_w8_logzero_small_table_data *) h->private; - log = std->log_tbl; - alt = std->antilog_tbl; - } else { - ltd = (struct gf_w8_logzero_table_data *) h->private; - log = ltd->log_tbl; - alt = ltd->antilog_tbl; - } - s8 = (uint8_t *) src; - d8 = (uint8_t *) dest; - - lv = log[val]; - - if (xor) { - for (i = 0; i < bytes; i++) { - d8[i] ^= (alt[lv + log[s8[i]]]); - } - } else { - for (i = 0; i < bytes; i++) { - d8[i] = (alt[lv + log[s8[i]]]); - } - } -} - - static -int gf_w8_log_init(gf_t *gf) -{ - gf_internal_t *h; - struct gf_w8_logtable_data *ltd = NULL; - struct gf_w8_logzero_table_data *ztd = NULL; - struct gf_w8_logzero_small_table_data *std = NULL; - uint8_t *alt; - uint8_t *inv; - int i, b; - int check = 0; - - h = (gf_internal_t *) gf->scratch; - if (h->mult_type == GF_MULT_LOG_TABLE) { - ltd = h->private; - alt = ltd->antilog_tbl; - inv = ltd->inv_tbl; - } else if (h->mult_type == GF_MULT_LOG_ZERO) { - std = h->private; - alt = std->antilog_tbl; - std->div_tbl = (alt + 255); - inv = std->inv_tbl; - } else { - ztd = h->private; - alt = ztd->antilog_tbl; - ztd->inv_tbl = (alt + 512 + 256); - ztd->div_tbl = (alt + 255); - inv = ztd->inv_tbl; - } - - for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++) { - if (h->mult_type == GF_MULT_LOG_TABLE) - ltd->log_tbl[i] = 0; - else if (h->mult_type == GF_MULT_LOG_ZERO) - std->log_tbl[i] = 0; - else - ztd->log_tbl[i] = 0; - } - - if (h->mult_type == GF_MULT_LOG_TABLE) { - ltd->log_tbl[0] = 0; - } else if (h->mult_type == GF_MULT_LOG_ZERO) { - std->log_tbl[0] = 510; - } else { - ztd->log_tbl[0] = 512; - } - - b = 1; - for (i = 0; i < GF_MULT_GROUP_SIZE; i++) { - if (h->mult_type == GF_MULT_LOG_TABLE) { - if (ltd->log_tbl[b] != 0) check = 1; - ltd->log_tbl[b] = i; - } else if (h->mult_type == GF_MULT_LOG_ZERO) { - if (std->log_tbl[b] != 0) check = 1; - std->log_tbl[b] = i; - } else { - if (ztd->log_tbl[b] != 0) check = 1; - ztd->log_tbl[b] = i; - } - alt[i] = b; - alt[i+GF_MULT_GROUP_SIZE] = b; - b <<= 1; - if (b & GF_FIELD_SIZE) { - b = b ^ h->prim_poly; - } - } - if (check) { - _gf_errno = GF_E_LOGPOLY; - return 0; - } - - if (h->mult_type == GF_MULT_LOG_ZERO) bzero(alt+510, 255); - - if (h->mult_type == GF_MULT_LOG_ZERO_EXT) { - bzero(alt+512, 255); - alt[512+512] = 0; - } - - inv[0] = 0; /* Not really, but we need to fill it with something */ - i = 1; - b = GF_MULT_GROUP_SIZE; - do { - inv[i] = alt[b]; - i <<= 1; - if (i & (1 << 8)) i ^= h->prim_poly; - b--; - } while (i != 1); - - if (h->mult_type == GF_MULT_LOG_TABLE) { - gf->inverse.w32 = gf_w8_log_inverse; - gf->divide.w32 = gf_w8_log_divide; - gf->multiply.w32 = gf_w8_log_multiply; - gf->multiply_region.w32 = gf_w8_log_multiply_region; - } else if (h->mult_type == GF_MULT_LOG_ZERO) { - gf->inverse.w32 = gf_w8_logzero_small_inverse; - gf->divide.w32 = gf_w8_logzero_small_divide; - gf->multiply.w32 = gf_w8_logzero_small_multiply; - gf->multiply_region.w32 = gf_w8_logzero_multiply_region; - } else { - gf->inverse.w32 = gf_w8_logzero_inverse; - 
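The table build in gf_w8_log_init above amounts to: walk the powers of x, record both directions, and lay the antilog table down twice so that log_a + log_b (at most 508) indexes without a mod-255 wrap. That is also what the check flag guards: a non-primitive polynomial would revisit an element before the walk completes. A compact standalone version (0x11d, for which x generates the multiplicative group, is assumed):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
  uint8_t log_tbl[256] = {0};
  uint8_t anti[255 * 2];
  int b = 1;
  for (int i = 0; i < 255; i++) {
    log_tbl[b] = (uint8_t)i;
    anti[i] = anti[i + 255] = (uint8_t)b;  /* doubled to skip the wrap */
    b <<= 1;
    if (b & 0x100) b ^= 0x11d;             /* multiply the walker by x */
  }

  uint8_t x = 0x53, y = 0xca;              /* a*b = antilog[log a + log b] */
  printf("0x53 * 0xca = 0x%02x\n", anti[log_tbl[x] + log_tbl[y]]);
  return 0;
}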
gf->divide.w32 = gf_w8_logzero_divide; - gf->multiply.w32 = gf_w8_logzero_multiply; - gf->multiply_region.w32 = gf_w8_logzero_multiply_region; - } - return 1; -} - -/* ------------------------------------------------------------ -IMPLEMENTATION: FULL_TABLE: - -JSP: Kevin wrote this, and I'm converting it to my structure. - */ - -static - gf_val_32_t -gf_w8_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - struct gf_w8_single_table_data *ftd; - - ftd = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private; - return (ftd->multtable[a][b]); -} - -static - gf_val_32_t -gf_w8_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - struct gf_w8_single_table_data *ftd; - - ftd = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private; - return (ftd->divtable[a][b]); -} - -static - gf_val_32_t -gf_w8_default_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - struct gf_w8_default_data *ftd; - - ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private; - return (ftd->multtable[a][b]); -} - -#ifdef INTEL_SSSE3 -static - gf_val_32_t -gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - struct gf_w8_default_data *ftd; - - ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private; - return (ftd->divtable[a][b]); -} -#endif - -static - gf_val_32_t -gf_w8_double_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - struct gf_w8_double_table_data *ftd; - - ftd = (struct gf_w8_double_table_data *) ((gf_internal_t *) gf->scratch)->private; - return (ftd->mult[a][b]); -} - -static - gf_val_32_t -gf_w8_double_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - struct gf_w8_double_table_data *ftd; - - ftd = (struct gf_w8_double_table_data *) ((gf_internal_t *) gf->scratch)->private; - return (ftd->div[a][b]); -} - -static - void -gf_w8_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - uint16_t *base; - uint32_t b, c, vc, vb; - gf_internal_t *h; - struct gf_w8_double_table_data *dtd; - struct gf_w8_double_table_lazy_data *ltd; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - h = (gf_internal_t *) (gf->scratch); - if (h->region_type & GF_REGION_LAZY) { - ltd = (struct gf_w8_double_table_lazy_data *) h->private; - base = ltd->mult; - for (b = 0; b < GF_FIELD_SIZE; b++) { - vb = (ltd->smult[val][b] << 8); - for (c = 0; c < GF_FIELD_SIZE; c++) { - vc = ltd->smult[val][c]; - base[(b << 8)| c] = (vb | vc); - } - } - - } else { - dtd = (struct gf_w8_double_table_data *) h->private; - base = &(dtd->mult[val][0]); - } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); - gf_do_initial_region_alignment(&rd); - gf_two_byte_region_table_multiply(&rd, base); - gf_do_final_region_alignment(&rd); -} - -static - gf_val_32_t -gf_w8_double_table_lazy_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - struct gf_w8_double_table_lazy_data *ftd; - - ftd = (struct gf_w8_double_table_lazy_data *) ((gf_internal_t *) gf->scratch)->private; - return (ftd->smult[a][b]); -} - -static - gf_val_32_t -gf_w8_double_table_lazy_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - struct gf_w8_double_table_lazy_data *ftd; - - ftd = (struct gf_w8_double_table_lazy_data *) ((gf_internal_t *) gf->scratch)->private; - return (ftd->div[a][b]); -} - -static - void -gf_w8_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ 
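The double-table region op above trades a 64K-entry table for processing two source bytes per lookup; the lazy variant refills base[] on each call from the small smult table. A sketch of the layout and the uint16 region loop, which is endian-agnostic because high byte maps to high byte (helper names and the polynomial are assumptions):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint8_t gf8_mult(uint8_t a, uint8_t b)      /* shift multiply, 0x11d */
{
  uint16_t p = 0;
  for (int i = 0; i < 8; i++)
    if (a & (1 << i)) p ^= (uint16_t)(b << i);
  for (int i = 14; i >= 8; i--)
    if (p & (1 << i)) p ^= (uint16_t)(0x11d << (i - 8));
  return (uint8_t)p;
}

int main(void)
{
  static uint16_t base[65536];
  uint8_t val = 0x37;
  for (int b = 0; b < 256; b++)     /* base[(b<<8)|c] = (val*b << 8) | val*c */
    for (int c = 0; c < 256; c++)
      base[(b << 8) | c] = (uint16_t)((gf8_mult(val, (uint8_t)b) << 8) |
                                       gf8_mult(val, (uint8_t)c));

  uint8_t src[8] = {1, 2, 3, 4, 5, 6, 7, 8}, dst[8];
  uint16_t s16[4], d16[4];
  memcpy(s16, src, 8);
  for (int i = 0; i < 4; i++) d16[i] = base[s16[i]];  /* two bytes per step */
  memcpy(dst, d16, 8);
  for (int i = 0; i < 8; i++)
    printf("%02x%s", dst[i], dst[i] == gf8_mult(val, src[i]) ? " " : "! ");
  printf("\n");
  return 0;
}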
- int i; - uint8_t *s8, *d8; - struct gf_w8_single_table_data *ftd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - ftd = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private; - s8 = (uint8_t *) src; - d8 = (uint8_t *) dest; - - if (xor) { - for (i = 0; i < bytes; i++) { - d8[i] ^= ftd->multtable[s8[i]][val]; - } - } else { - for (i = 0; i < bytes; i++) { - d8[i] = ftd->multtable[s8[i]][val]; - } - } -} - -#ifdef INTEL_SSSE3 -static - void -gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - uint8_t *bh, *bl, *sptr, *dptr; - __m128i loset, t1, r, va, mth, mtl; - struct gf_w8_half_table_data *htd; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) (gf->scratch))->private; - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); - gf_do_initial_region_alignment(&rd); - - bh = (uint8_t *) htd->high; - bh += (val << 4); - bl = (uint8_t *) htd->low; - bl += (val << 4); - - sptr = rd.s_start; - dptr = rd.d_start; - - mth = _mm_loadu_si128 ((__m128i *)(bh)); - mtl = _mm_loadu_si128 ((__m128i *)(bl)); - loset = _mm_set1_epi8 (0x0f); - - if (xor) { - while (sptr < (uint8_t *) rd.s_top) { - va = _mm_load_si128 ((__m128i *)(sptr)); - t1 = _mm_and_si128 (loset, va); - r = _mm_shuffle_epi8 (mtl, t1); - va = _mm_srli_epi64 (va, 4); - t1 = _mm_and_si128 (loset, va); - r = _mm_xor_si128 (r, _mm_shuffle_epi8 (mth, t1)); - va = _mm_load_si128 ((__m128i *)(dptr)); - r = _mm_xor_si128 (r, va); - _mm_store_si128 ((__m128i *)(dptr), r); - dptr += 16; - sptr += 16; - } - } else { - while (sptr < (uint8_t *) rd.s_top) { - va = _mm_load_si128 ((__m128i *)(sptr)); - t1 = _mm_and_si128 (loset, va); - r = _mm_shuffle_epi8 (mtl, t1); - va = _mm_srli_epi64 (va, 4); - t1 = _mm_and_si128 (loset, va); - r = _mm_xor_si128 (r, _mm_shuffle_epi8 (mth, t1)); - _mm_store_si128 ((__m128i *)(dptr), r); - dptr += 16; - sptr += 16; - } - } - - gf_do_final_region_alignment(&rd); -} -#endif - - -/* ------------------------------------------------------------ -IMPLEMENTATION: FULL_TABLE: - */ - -static - gf_val_32_t -gf_w8_split_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - struct gf_w8_half_table_data *htd; - htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) gf->scratch)->private; - - return htd->high[b][a>>4] ^ htd->low[b][a&0xf]; -} - -static - void -gf_w8_split_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - int i; - uint8_t *s8, *d8; - struct gf_w8_half_table_data *htd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) gf->scratch)->private; - s8 = (uint8_t *) src; - d8 = (uint8_t *) dest; - - if (xor) { - for (i = 0; i < bytes; i++) { - d8[i] ^= (htd->high[val][s8[i]>>4] ^ htd->low[val][s8[i]&0xf]); - } - } else { - for (i = 0; i < bytes; i++) { - d8[i] = (htd->high[val][s8[i]>>4] ^ htd->low[val][s8[i]&0xf]); - } - } -} - - - static -int gf_w8_split_init(gf_t *gf) -{ - gf_internal_t *h; - struct gf_w8_half_table_data *htd; - int a, b; - - h = (gf_internal_t *) gf->scratch; - htd = (struct gf_w8_half_table_data *)h->private; - - bzero(htd->high, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE); - 
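gf_w8_split_multiply_region_sse above is the classic SSSE3 nibble trick: one 16-entry table of val times each low nibble, one of val times each high nibble, and PSHUFB performs sixteen lookups at once. A self-contained demo (compile with -mssse3; unaligned loads are used here where the original relies on region alignment):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <tmmintrin.h>

static uint8_t gf8_mult(uint8_t a, uint8_t b)      /* shift multiply, 0x11d */
{
  uint16_t p = 0;
  for (int i = 0; i < 8; i++)
    if (a & (1 << i)) p ^= (uint16_t)(b << i);
  for (int i = 14; i >= 8; i--)
    if (p & (1 << i)) p ^= (uint16_t)(0x11d << (i - 8));
  return (uint8_t)p;
}

int main(void)
{
  uint8_t val = 0x37, lo[16], hi[16], src[16], dst[16], ref[16];
  for (int i = 0; i < 16; i++) {
    lo[i] = gf8_mult(val, (uint8_t)i);          /* val * low nibble  */
    hi[i] = gf8_mult(val, (uint8_t)(i << 4));   /* val * high nibble */
    src[i] = (uint8_t)(i * 17 + 3);
  }
  __m128i mtl = _mm_loadu_si128((__m128i *)lo);
  __m128i mth = _mm_loadu_si128((__m128i *)hi);
  __m128i loset = _mm_set1_epi8(0x0f);
  __m128i va = _mm_loadu_si128((__m128i *)src);
  __m128i r = _mm_shuffle_epi8(mtl, _mm_and_si128(loset, va));
  va = _mm_srli_epi64(va, 4);                   /* expose high nibbles */
  r = _mm_xor_si128(r, _mm_shuffle_epi8(mth, _mm_and_si128(loset, va)));
  _mm_storeu_si128((__m128i *)dst, r);

  for (int i = 0; i < 16; i++) ref[i] = gf8_mult(val, src[i]);
  printf("match = %d\n", memcmp(dst, ref, 16) == 0);
  return 0;
}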
bzero(htd->low, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE); - - for (a = 1; a < GF_FIELD_SIZE; a++) { - for (b = 1; b < GF_HALF_SIZE; b++) { - htd->low[a][b] = gf_w8_shift_multiply(gf,a,b); - htd->high[a][b] = gf_w8_shift_multiply(gf,a,b<<4); - } - } - - gf->multiply.w32 = gf_w8_split_multiply; - - #ifdef INTEL_SSSE3 - if (h->region_type & GF_REGION_NOSSE) - gf->multiply_region.w32 = gf_w8_split_multiply_region; - else - gf->multiply_region.w32 = gf_w8_split_multiply_region_sse; - #else - gf->multiply_region.w32 = gf_w8_split_multiply_region; - if(h->region_type & GF_REGION_SSE) - return 0; - #endif - - return 1; -} - -/* JSP: This is disgusting, but it is what it is. If there is no SSE, - then the default is equivalent to single table. If there is SSE, then - we use the "gf_w8_default_data" which is a hybrid of SPLIT & TABLE. */ - -static -int gf_w8_table_init(gf_t *gf) -{ - gf_internal_t *h; - struct gf_w8_single_table_data *ftd = NULL; - struct gf_w8_double_table_data *dtd = NULL; - struct gf_w8_double_table_lazy_data *ltd = NULL; - struct gf_w8_default_data *dd = NULL; - int a, b, c, prod, scase, issse; - - h = (gf_internal_t *) gf->scratch; - - issse = 0; -#ifdef INTEL_SSSE3 - issse = 1; -#endif - - if (h->mult_type == GF_MULT_DEFAULT && issse) { - dd = (struct gf_w8_default_data *)h->private; - scase = 3; - bzero(dd->high, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE); - bzero(dd->low, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE); - bzero(dd->divtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); - bzero(dd->multtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); - } else if (h->mult_type == GF_MULT_DEFAULT || - h->region_type == 0 || (h->region_type & GF_REGION_CAUCHY)) { - ftd = (struct gf_w8_single_table_data *)h->private; - bzero(ftd->divtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); - bzero(ftd->multtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); - scase = 0; - } else if (h->region_type == GF_REGION_DOUBLE_TABLE) { - dtd = (struct gf_w8_double_table_data *)h->private; - bzero(dtd->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); - bzero(dtd->mult, sizeof(uint16_t) * GF_FIELD_SIZE * GF_FIELD_SIZE * GF_FIELD_SIZE); - scase = 1; - } else if (h->region_type == (GF_REGION_DOUBLE_TABLE | GF_REGION_LAZY)) { - ltd = (struct gf_w8_double_table_lazy_data *)h->private; - bzero(ltd->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); - bzero(ltd->smult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); - scase = 2; - } else { - fprintf(stderr, "Internal error in gf_w8_table_init\n"); - exit(0); - } - - for (a = 1; a < GF_FIELD_SIZE; a++) { - for (b = 1; b < GF_FIELD_SIZE; b++) { - prod = gf_w8_shift_multiply(gf,a,b); - switch (scase) { - case 0: - ftd->multtable[a][b] = prod; - ftd->divtable[prod][b] = a; - break; - case 1: - dtd->div[prod][b] = a; - for (c = 0; c < GF_FIELD_SIZE; c++) { - dtd->mult[a][(c<<8)|b] |= prod; - dtd->mult[a][(b<<8)|c] |= (prod<<8); - } - break; - case 2: - ltd->div[prod][b] = a; - ltd->smult[a][b] = prod; - break; - case 3: - dd->multtable[a][b] = prod; - dd->divtable[prod][b] = a; - if ((b & 0xf) == b) { dd->low[a][b] = prod; } - if ((b & 0xf0) == b) { dd->high[a][b>>4] = prod; } - break; - } - } - } - - gf->inverse.w32 = NULL; /* Will set from divide */ - switch (scase) { - case 0: - gf->divide.w32 = gf_w8_table_divide; - gf->multiply.w32 = gf_w8_table_multiply; - gf->multiply_region.w32 = gf_w8_table_multiply_region; - break; - case 1: - gf->divide.w32 = gf_w8_double_table_divide; - gf->multiply.w32 = 
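A small point worth noting in the table build above: the division tables come for free from multiplication, because every row written as multtable[a][b] = prod also pins divtable[prod][b] = a. A sketch with a round-trip self-check (names and polynomial assumed):

#include <stdint.h>
#include <stdio.h>

static uint8_t gf8_mult(uint8_t a, uint8_t b)      /* shift multiply, 0x11d */
{
  uint16_t p = 0;
  for (int i = 0; i < 8; i++)
    if (a & (1 << i)) p ^= (uint16_t)(b << i);
  for (int i = 14; i >= 8; i--)
    if (p & (1 << i)) p ^= (uint16_t)(0x11d << (i - 8));
  return (uint8_t)p;
}

static uint8_t multtable[256][256], divtable[256][256];

int main(void)
{
  for (int a = 1; a < 256; a++)
    for (int b = 1; b < 256; b++) {
      uint8_t prod = gf8_mult((uint8_t)a, (uint8_t)b);
      multtable[a][b] = prod;
      divtable[prod][b] = (uint8_t)a;   /* prod / b == a, by construction */
    }
  int bad = 0;                          /* (a*b)/b should round-trip */
  for (int a = 1; a < 256; a++)
    for (int b = 1; b < 256; b++)
      if (divtable[multtable[a][b]][b] != a) bad++;
  printf("round-trip failures: %d\n", bad);
  return 0;
}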
gf_w8_double_table_multiply; - gf->multiply_region.w32 = gf_w8_double_table_multiply_region; - break; - case 2: - gf->divide.w32 = gf_w8_double_table_lazy_divide; - gf->multiply.w32 = gf_w8_double_table_lazy_multiply; - gf->multiply_region.w32 = gf_w8_double_table_multiply_region; - break; - case 3: -#ifdef INTEL_SSSE3 - gf->divide.w32 = gf_w8_default_divide; - gf->multiply.w32 = gf_w8_default_multiply; - gf->multiply_region.w32 = gf_w8_split_multiply_region_sse; -#endif - break; - } - return 1; -} - -static - void -gf_w8_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint8_t val0 = val & 0x0f; - uint8_t val1 = (val & 0xf0) >> 4; - gf_region_data rd; - int sub_reg_size; - - if (val == 0) { - if (xor) return; - bzero(dest, bytes); - return; - } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); - gf_do_initial_region_alignment(&rd); - - sub_reg_size = ((char*)rd.d_top - (char*)rd.d_start) / 2; - - base_gf->multiply_region.w32(base_gf, rd.s_start, rd.d_start, val0, sub_reg_size, xor); - base_gf->multiply_region.w32(base_gf, (char*)rd.s_start+sub_reg_size, rd.d_start, val1, sub_reg_size, 1); - base_gf->multiply_region.w32(base_gf, rd.s_start, (char*)rd.d_start+sub_reg_size, val1, sub_reg_size, xor); - base_gf->multiply_region.w32(base_gf, (char*)rd.s_start+sub_reg_size, (char*)rd.d_start+sub_reg_size, val0, sub_reg_size, 1); - base_gf->multiply_region.w32(base_gf, (char*)rd.s_start+sub_reg_size, (char*)rd.d_start+sub_reg_size, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); - - gf_do_final_region_alignment(&rd); -} - -static -gf_val_32_t -gf_w8_composite_multiply_recursive(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint8_t b0 = b & 0x0f; - uint8_t b1 = (b & 0xf0) >> 4; - uint8_t a0 = a & 0x0f; - uint8_t a1 = (a & 0xf0) >> 4; - uint8_t a1b1; - - a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - - return ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w32(base_gf, a1, b0) ^ - base_gf->multiply.w32(base_gf, a0, b1) ^ - base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4)); -} - -static -gf_val_32_t -gf_w8_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - uint8_t b0 = b & 0x0f; - uint8_t b1 = (b & 0xf0) >> 4; - uint8_t a0 = a & 0x0f; - uint8_t a1 = (a & 0xf0) >> 4; - uint8_t a1b1, *mt; - struct gf_w8_composite_data *cd; - - cd = (struct gf_w8_composite_data *) h->private; - mt = cd->mult_table; - - a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1); - - return ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | - ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^ - GF_W4_INLINE_MULTDIV(mt, a0, b1) ^ - GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4)); -} - -/* - * Composite field division trick (explained in 2007 tech report) - * - * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1 - * - * let c = b^-1 - * - * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0) - * - * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1 - * - * let d = b1c1 and d+1 = b0c0 - * - * solve s*b1c1+b1c0+b0c1 = 0 - * - * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1 - * - * c0 = (d+1)b0^-1 - * c1 = d*b1^-1 - * - * a / b = a * c - */ - -static -gf_val_32_t -gf_w8_composite_inverse(gf_t *gf, gf_val_32_t a) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint8_t a0 = a & 
0x0f; - uint8_t a1 = (a & 0xf0) >> 4; - uint8_t c0, c1, c, d, tmp; - uint8_t a0inv, a1inv; - - if (a0 == 0) { - a1inv = base_gf->inverse.w32(base_gf, a1) & 0xf; - c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly); - c1 = a1inv; - } else if (a1 == 0) { - c0 = base_gf->inverse.w32(base_gf, a0); - c1 = 0; - } else { - a1inv = base_gf->inverse.w32(base_gf, a1) & 0xf; - a0inv = base_gf->inverse.w32(base_gf, a0) & 0xf; - - d = base_gf->multiply.w32(base_gf, a1, a0inv) & 0xf; - - tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly) & 0xf; - tmp = base_gf->inverse.w32(base_gf, tmp) & 0xf; - - d = base_gf->multiply.w32(base_gf, d, tmp) & 0xf; - - c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv) & 0xf; - c1 = base_gf->multiply.w32(base_gf, d, a1inv) & 0xf; - } - - c = c0 | (c1 << 4); - - return c; -} - -static -void -gf_w8_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - gf_region_data rd; - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint8_t b0 = val & 0x0f; - uint8_t b1 = (val & 0xf0) >> 4; - uint8_t *s8; - uint8_t *d8; - uint8_t *mt; - uint8_t a0, a1, a1b1; - struct gf_w8_composite_data *cd; - - cd = (struct gf_w8_composite_data *) h->private; - - if (val == 0) { - if (xor) return; - bzero(dest, bytes); - return; - } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); - gf_do_initial_region_alignment(&rd); - - - s8 = (uint8_t *) rd.s_start; - d8 = (uint8_t *) rd.d_start; - - mt = cd->mult_table; - if (mt == NULL) { - if (xor) { - while (d8 < (uint8_t *) rd.d_top) { - a0 = *s8 & 0x0f; - a1 = (*s8 & 0xf0) >> 4; - a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - - *d8 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w32(base_gf, a1, b0) ^ - base_gf->multiply.w32(base_gf, a0, b1) ^ - base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4)); - s8++; - d8++; - } - } else { - while (d8 < (uint8_t *) rd.d_top) { - a0 = *s8 & 0x0f; - a1 = (*s8 & 0xf0) >> 4; - a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - - *d8 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w32(base_gf, a1, b0) ^ - base_gf->multiply.w32(base_gf, a0, b1) ^ - base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4)); - s8++; - d8++; - } - } - } else { - if (xor) { - while (d8 < (uint8_t *) rd.d_top) { - a0 = *s8 & 0x0f; - a1 = (*s8 & 0xf0) >> 4; - a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1); - - *d8 ^= ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | - ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^ - GF_W4_INLINE_MULTDIV(mt, a0, b1) ^ - GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4)); - s8++; - d8++; - } - } else { - while (d8 < (uint8_t *) rd.d_top) { - a0 = *s8 & 0x0f; - a1 = (*s8 & 0xf0) >> 4; - a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1); - - *d8 = ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | - ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^ - GF_W4_INLINE_MULTDIV(mt, a0, b1) ^ - GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4)); - s8++; - d8++; - } - } - } - gf_do_final_region_alignment(&rd); - return; -} - -static -int gf_w8_composite_init(gf_t *gf) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - struct gf_w8_composite_data *cd; - - if (h->base_gf == NULL) return 0; - - cd = (struct gf_w8_composite_data *) h->private; - cd->mult_table = gf_w4_get_mult_table(h->base_gf); - - if (h->region_type & GF_REGION_ALTMAP) { - gf->multiply_region.w32 = gf_w8_composite_multiply_region_alt; - } else { - gf->multiply_region.w32 = 
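The composite inverse above follows the 2007 tech report recipe quoted in the comment, and its branch cases are easy to get wrong. Here is a standalone version over the toy GF((2^4)^2) field that brute-forces a valid s (one making x^2 + sx + 1 irreducible) and then verifies b * b^-1 == 1 for all 255 nonzero elements (all names are assumptions):

#include <stdint.h>
#include <stdio.h>

static uint8_t gf4_mult(uint8_t a, uint8_t b)
{
  uint8_t p = 0;
  for (int i = 0; i < 4; i++) {
    if (b & 1) p ^= a;
    b >>= 1;
    a <<= 1;
    if (a & 0x10) a ^= 0x13;
  }
  return p;
}

static uint8_t gf4_inv(uint8_t a)      /* brute force is fine for 16 elements */
{
  for (uint8_t c = 1; c < 16; c++)
    if (gf4_mult(a, c) == 1) return c;
  return 0;
}

static uint8_t comp_mult(uint8_t a, uint8_t b, uint8_t s)
{
  uint8_t a0 = a & 0xf, a1 = a >> 4, b0 = b & 0xf, b1 = b >> 4;
  uint8_t a1b1 = gf4_mult(a1, b1);
  return (uint8_t)((gf4_mult(a0, b0) ^ a1b1) |
                   ((gf4_mult(a1, b0) ^ gf4_mult(a0, b1) ^ gf4_mult(s, a1b1)) << 4));
}

/* the tech-report recipe the inverse routine above implements */
static uint8_t comp_inv(uint8_t a, uint8_t s)
{
  uint8_t a0 = a & 0xf, a1 = a >> 4, c0, c1;
  if (a0 == 0) {
    c0 = gf4_mult(gf4_inv(a1), s);
    c1 = gf4_inv(a1);
  } else if (a1 == 0) {
    c0 = gf4_inv(a0);
    c1 = 0;
  } else {
    uint8_t a0inv = gf4_inv(a0), a1inv = gf4_inv(a1);
    uint8_t d = gf4_mult(a1, a0inv);
    uint8_t t = gf4_inv((uint8_t)(gf4_mult(a1, a0inv) ^ gf4_mult(a0, a1inv) ^ s));
    d = gf4_mult(d, t);
    c0 = gf4_mult((uint8_t)(d ^ 1), a0inv);
    c1 = gf4_mult(d, a1inv);
  }
  return (uint8_t)(c0 | (c1 << 4));
}

int main(void)
{
  uint8_t s = 0;   /* find s with no root r of r^2 + s r + 1 in GF(16) */
  for (uint8_t t = 1; t < 16 && !s; t++) {
    int root = 0;
    for (uint8_t r = 0; r < 16; r++)
      if ((gf4_mult(r, r) ^ gf4_mult(t, r) ^ 1) == 0) root = 1;
    if (!root) s = t;
  }
  int bad = 0;
  for (int a = 1; a < 256; a++)
    if (comp_mult((uint8_t)a, comp_inv((uint8_t)a, s), s) != 1) bad++;
  printf("s = %u, failures = %d\n", s, bad);
  return 0;
}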
gf_w8_composite_multiply_region; - } - - if (cd->mult_table == NULL) { - gf->multiply.w32 = gf_w8_composite_multiply_recursive; - } else { - gf->multiply.w32 = gf_w8_composite_multiply_inline; - } - gf->divide.w32 = NULL; - gf->inverse.w32 = gf_w8_composite_inverse; - - return 1; -} - -static -inline - gf_val_32_t -gf_w8_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - uint32_t prod, pp, pmask, amask; - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - - prod = 0; - pmask = 0x80; - amask = 0x80; - - while (amask != 0) { - if (prod & pmask) { - prod = ((prod << 1) ^ pp); - } else { - prod <<= 1; - } - if (a & amask) prod ^= b; - amask >>= 1; - } - return prod; -} - -static -inline - gf_val_32_t -gf_w8_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - uint32_t prod, pp, bmask; - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - prod = 0; - bmask = 0x80; - - while (1) { - if (a & 1) prod ^= b; - a >>= 1; - if (a == 0) return prod; - if (b & bmask) { - b = ((b << 1) ^ pp); - } else { - b <<= 1; - } - } -} - -static - void -gf_w8_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - uint64_t *s64, *d64, t1, t2, ta, prod, amask; - gf_region_data rd; - struct gf_w8_bytwo_data *btd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); - gf_do_initial_region_alignment(&rd); - - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - - if (xor) { - while (s64 < (uint64_t *) rd.s_top) { - prod = 0; - amask = 0x80; - ta = *s64; - while (amask != 0) { - AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); - if (val & amask) prod ^= ta; - amask >>= 1; - } - *d64 ^= prod; - d64++; - s64++; - } - } else { - while (s64 < (uint64_t *) rd.s_top) { - prod = 0; - amask = 0x80; - ta = *s64; - while (amask != 0) { - AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); - if (val & amask) prod ^= ta; - amask >>= 1; - } - *d64 = prod; - d64++; - s64++; - } - } - gf_do_final_region_alignment(&rd); -} - -#define BYTWO_P_ONESTEP {\ - SSE_AB2(pp, m1 ,m2, prod, t1, t2); \ - t1 = _mm_and_si128(v, one); \ - t1 = _mm_sub_epi8(t1, one); \ - t1 = _mm_and_si128(t1, ta); \ - prod = _mm_xor_si128(prod, t1); \ - v = _mm_srli_epi64(v, 1); } - -#ifdef INTEL_SSE2 -static - void -gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - int i; - uint8_t *s8, *d8; - uint8_t vrev; - __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; - struct gf_w8_bytwo_data *btd; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); - gf_do_initial_region_alignment(&rd); - - vrev = 0; - for (i = 0; i < 8; i++) { - vrev <<= 1; - if (!(val & (1 << i))) vrev |= 1; - } - - s8 = (uint8_t *) rd.s_start; - d8 = (uint8_t *) rd.d_start; - - pp = _mm_set1_epi8(btd->prim_poly&0xff); - m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); - one = _mm_set1_epi8(1); - - while (d8 < (uint8_t *) rd.d_top) { - prod = _mm_setzero_si128(); - v = _mm_set1_epi8(vrev); - ta = 
_mm_load_si128((__m128i *) s8); - tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8); - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - BYTWO_P_ONESTEP; - _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp)); - d8 += 16; - s8 += 16; - } - gf_do_final_region_alignment(&rd); -} -#endif - -#ifdef INTEL_SSE2 -static - void -gf_w8_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w8_bytwo_data *btd) -{ - uint8_t *d8, *s8; - __m128i pp, m1, m2, t1, t2, va; - - s8 = (uint8_t *) rd->s_start; - d8 = (uint8_t *) rd->d_start; - - pp = _mm_set1_epi8(btd->prim_poly&0xff); - m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); - - while (d8 < (uint8_t *) rd->d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, m2, va, t1, t2); - _mm_store_si128((__m128i *)d8, va); - d8 += 16; - s8 += 16; - } -} -#endif - -#ifdef INTEL_SSE2 -static - void -gf_w8_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w8_bytwo_data *btd) -{ - uint8_t *d8, *s8; - __m128i pp, m1, m2, t1, t2, va, vb; - - s8 = (uint8_t *) rd->s_start; - d8 = (uint8_t *) rd->d_start; - - pp = _mm_set1_epi8(btd->prim_poly&0xff); - m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); - - while (d8 < (uint8_t *) rd->d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, m2, va, t1, t2); - vb = _mm_load_si128 ((__m128i *)(d8)); - vb = _mm_xor_si128(vb, va); - _mm_store_si128((__m128i *)d8, vb); - d8 += 16; - s8 += 16; - } -} -#endif - - -#ifdef INTEL_SSE2 -static - void -gf_w8_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - int itb; - uint8_t *d8, *s8; - __m128i pp, m1, m2, t1, t2, va, vb; - struct gf_w8_bytwo_data *btd; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); - gf_do_initial_region_alignment(&rd); - - btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; - - if (val == 2) { - if (xor) { - gf_w8_bytwo_b_sse_region_2_xor(&rd, btd); - } else { - gf_w8_bytwo_b_sse_region_2_noxor(&rd, btd); - } - gf_do_final_region_alignment(&rd); - return; - } - - s8 = (uint8_t *) rd.s_start; - d8 = (uint8_t *) rd.d_start; - - pp = _mm_set1_epi8(btd->prim_poly&0xff); - m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); - - while (d8 < (uint8_t *) rd.d_top) { - va = _mm_load_si128 ((__m128i *)(s8)); - vb = (!xor) ? 
_mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8)); - itb = val; - while (1) { - if (itb & 1) vb = _mm_xor_si128(vb, va); - itb >>= 1; - if (itb == 0) break; - SSE_AB2(pp, m1, m2, va, t1, t2); - } - _mm_store_si128((__m128i *)d8, vb); - d8 += 16; - s8 += 16; - } - - gf_do_final_region_alignment(&rd); -} -#endif - -static - void -gf_w8_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - uint64_t *s64, *d64, t1, t2, ta, tb, prod; - struct gf_w8_bytwo_data *btd; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } - - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); - gf_do_initial_region_alignment(&rd); - - btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; - s64 = (uint64_t *) rd.s_start; - d64 = (uint64_t *) rd.d_start; - - switch (val) { - case 2: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= ta; - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta; - d64++; - s64++; - } - } - break; - case 3: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 4: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= ta; - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta; - d64++; - s64++; - } - } - break; - case 5: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta ^ prod; - d64++; - s64++; - } - } - case 6: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta ^ prod; - d64++; - s64++; - } - } - /* - case 7: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - 
AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta ^ prod; - d64++; - s64++; - } - } - break; - */ - case 8: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= ta; - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta; - d64++; - s64++; - } - } - break; - /* - case 9: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 10: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 11: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 12: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 13: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else 
{ - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 14: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 15: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - */ - default: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - prod = *d64 ; - ta = *s64; - tb = val; - while (1) { - if (tb & 1) prod ^= ta; - tb >>= 1; - if (tb == 0) break; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - } - *d64 = prod; - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - prod = 0 ; - ta = *s64; - tb = val; - while (1) { - if (tb & 1) prod ^= ta; - tb >>= 1; - if (tb == 0) break; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - } - *d64 = prod; - d64++; - s64++; - } - } - break; - } - gf_do_final_region_alignment(&rd); -} - - static -int gf_w8_bytwo_init(gf_t *gf) -{ - gf_internal_t *h; - uint64_t ip, m1, m2; - struct gf_w8_bytwo_data *btd; - - h = (gf_internal_t *) gf->scratch; - btd = (struct gf_w8_bytwo_data *) (h->private); - ip = h->prim_poly & 0xff; - m1 = 0xfe; - m2 = 0x80; - btd->prim_poly = 0; - btd->mask1 = 0; - btd->mask2 = 0; - - while (ip != 0) { - btd->prim_poly |= ip; - btd->mask1 |= m1; - btd->mask2 |= m2; - ip <<= GF_FIELD_WIDTH; - m1 <<= GF_FIELD_WIDTH; - m2 <<= GF_FIELD_WIDTH; - } - - if (h->mult_type == GF_MULT_BYTWO_p) { - gf->multiply.w32 = gf_w8_bytwo_p_multiply; -#ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSSE) - gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region; - else - gf->multiply_region.w32 = gf_w8_bytwo_p_sse_multiply_region; -#else - gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region; - if(h->region_type & GF_REGION_SSE) - return 0; -#endif - } else { - gf->multiply.w32 = gf_w8_bytwo_b_multiply; -#ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSSE) - gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region; - else - gf->multiply_region.w32 = gf_w8_bytwo_b_sse_multiply_region; -#else - gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region; - if(h->region_type & GF_REGION_SSE) - return 0; -#endif - } - return 1; -} - - -/* 
------------------------------------------------------------ - General procedures. - You don't need to error check here on in init, because it's done - for you in gf_error_check(). - */ - -int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) -{ - switch(mult_type) - { - case GF_MULT_DEFAULT: -#ifdef INTEL_SSSE3 - return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64; -#endif - return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64; - case GF_MULT_TABLE: - if (region_type == GF_REGION_CAUCHY) { - return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64; - } - - if (region_type == GF_REGION_DEFAULT) { - return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64; - } - if (region_type & GF_REGION_DOUBLE_TABLE) { - if (region_type == GF_REGION_DOUBLE_TABLE) { - return sizeof(gf_internal_t) + sizeof(struct gf_w8_double_table_data) + 64; - } else if (region_type == (GF_REGION_DOUBLE_TABLE | GF_REGION_LAZY)) { - return sizeof(gf_internal_t) + sizeof(struct gf_w8_double_table_lazy_data) + 64; - } else { - return 0; - } - } - return 0; - break; - case GF_MULT_BYTWO_p: - case GF_MULT_BYTWO_b: - return sizeof(gf_internal_t) + sizeof(struct gf_w8_bytwo_data); - break; - case GF_MULT_SPLIT_TABLE: - if ((arg1 == 4 && arg2 == 8) || (arg1 == 8 && arg2 == 4)) { - return sizeof(gf_internal_t) + sizeof(struct gf_w8_half_table_data) + 64; - } - break; - case GF_MULT_LOG_TABLE: - return sizeof(gf_internal_t) + sizeof(struct gf_w8_logtable_data) + 64; - break; - case GF_MULT_LOG_ZERO: - return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_small_table_data) + 64; - break; - case GF_MULT_LOG_ZERO_EXT: - return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_table_data) + 64; - break; - case GF_MULT_CARRY_FREE: - return sizeof(gf_internal_t); - break; - case GF_MULT_SHIFT: - return sizeof(gf_internal_t); - break; - case GF_MULT_COMPOSITE: - return sizeof(gf_internal_t) + sizeof(struct gf_w8_composite_data) + 64; - default: - return 0; - } - return 0; -} - -int gf_w8_init(gf_t *gf) -{ - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - - /* Allen: set default primitive polynomial / irreducible polynomial if needed */ - - if (h->prim_poly == 0) { - if (h->mult_type == GF_MULT_COMPOSITE) { - h->prim_poly = gf_composite_get_default_poly(h->base_gf); - if (h->prim_poly == 0) return 0; /* JSP: This shouldn't happen, but just in case. 
*/ - } else { - h->prim_poly = 0x11d; - } - } - if (h->mult_type != GF_MULT_COMPOSITE) { - h->prim_poly |= 0x100; - } - - gf->multiply.w32 = NULL; - gf->divide.w32 = NULL; - gf->inverse.w32 = NULL; - gf->multiply_region.w32 = NULL; - gf->extract_word.w32 = gf_w8_extract_word; - - switch(h->mult_type) { - case GF_MULT_DEFAULT: - case GF_MULT_TABLE: if (gf_w8_table_init(gf) == 0) return 0; break; - case GF_MULT_BYTWO_p: - case GF_MULT_BYTWO_b: if (gf_w8_bytwo_init(gf) == 0) return 0; break; - case GF_MULT_LOG_ZERO: - case GF_MULT_LOG_ZERO_EXT: - case GF_MULT_LOG_TABLE: if (gf_w8_log_init(gf) == 0) return 0; break; - case GF_MULT_CARRY_FREE: if (gf_w8_cfm_init(gf) == 0) return 0; break; - case GF_MULT_SHIFT: if (gf_w8_shift_init(gf) == 0) return 0; break; - case GF_MULT_SPLIT_TABLE: if (gf_w8_split_init(gf) == 0) return 0; break; - case GF_MULT_COMPOSITE: if (gf_w8_composite_init(gf) == 0) return 0; break; - default: return 0; - } - - if (h->divide_type == GF_DIVIDE_EUCLID) { - gf->divide.w32 = gf_w8_divide_from_inverse; - gf->inverse.w32 = gf_w8_euclid; - } else if (h->divide_type == GF_DIVIDE_MATRIX) { - gf->divide.w32 = gf_w8_divide_from_inverse; - gf->inverse.w32 = gf_w8_matrix; - } - - if (gf->divide.w32 == NULL) { - gf->divide.w32 = gf_w8_divide_from_inverse; - if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w8_euclid; - } - - if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w8_inverse_from_divide; - - if (h->mult_type == GF_MULT_COMPOSITE && (h->region_type & GF_REGION_ALTMAP)) { - gf->extract_word.w32 = gf_w8_composite_extract_word; - } - - if (h->region_type == GF_REGION_CAUCHY) { - gf->multiply_region.w32 = gf_wgen_cauchy_region; - gf->extract_word.w32 = gf_wgen_extract_word; - } - - if (gf->multiply_region.w32 == NULL) { - gf->multiply_region.w32 = gf_w8_multiply_region_from_single; - } - - return 1; -} - - -/* Inline setup functions */ - -uint8_t *gf_w8_get_mult_table(gf_t *gf) -{ - gf_internal_t *h; - struct gf_w8_default_data *ftd; - struct gf_w8_single_table_data *std; - - h = (gf_internal_t *) gf->scratch; - if (gf->multiply.w32 == gf_w8_default_multiply) { - ftd = (struct gf_w8_default_data *) h->private; - return (uint8_t *) ftd->multtable; - } else if (gf->multiply.w32 == gf_w8_table_multiply) { - std = (struct gf_w8_single_table_data *) h->private; - return (uint8_t *) std->multtable; - } - return NULL; -} - -uint8_t *gf_w8_get_div_table(gf_t *gf) -{ - struct gf_w8_default_data *ftd; - struct gf_w8_single_table_data *std; - - if (gf->multiply.w32 == gf_w8_default_multiply) { - ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private; - return (uint8_t *) ftd->divtable; - } else if (gf->multiply.w32 == gf_w8_table_multiply) { - std = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private; - return (uint8_t *) std->divtable; - } - return NULL; -} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_wgen.c b/src/erasure-code/jerasure/gf-complete/src/gf_wgen.c deleted file mode 100644 index 68c6bb0785801..0000000000000 --- a/src/erasure-code/jerasure/gf-complete/src/gf_wgen.c +++ /dev/null @@ -1,1019 +0,0 @@ -/* - * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic - * James S. Plank, Ethan L. Miller, Kevin M. Greenan, - * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. - * - * gf_wgen.c - * - * Routines for Galois fields for general w < 32. For specific w, - like 4, 8, 16, 32, 64 and 128, see the other files. 
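As a quick orientation, this is how a caller typically reaches these routines; the sketch below is illustrative only (it is not part of the original header) and assumes the public gf_init_easy() and gf_free() entry points declared in gf_complete.h:

    gf_t gf;
    uint32_t c;
    if (gf_init_easy(&gf, 13) == 0) exit(1);  // w = 13 has no specialized file, so gf_wgen handles it
    c = gf.multiply.w32(&gf, 1234, 567);      // multiply two 13-bit field elements
    gf_free(&gf, 0);                          // 0 = do not recursively free a base field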
- */ - -#include "gf_int.h" -#include <stdio.h> -#include <stdlib.h> - -struct gf_wgen_table_w8_data { - uint8_t *mult; - uint8_t *div; - uint8_t base; -}; - -struct gf_wgen_table_w16_data { - uint16_t *mult; - uint16_t *div; - uint16_t base; -}; - -struct gf_wgen_log_w8_data { - uint8_t *log; - uint8_t *anti; - uint8_t *danti; - uint8_t base; -}; - -struct gf_wgen_log_w16_data { - uint16_t *log; - uint16_t *anti; - uint16_t *danti; - uint16_t base; -}; - -struct gf_wgen_log_w32_data { - uint32_t *log; - uint32_t *anti; - uint32_t *danti; - uint32_t base; -}; - -struct gf_wgen_group_data { - uint32_t *reduce; - uint32_t *shift; - uint32_t mask; - uint64_t rmask; - int tshift; - uint32_t memory; -}; - -static -inline -gf_val_32_t gf_wgen_inverse_from_divide (gf_t *gf, gf_val_32_t a) -{ - return gf->divide.w32(gf, 1, a); -} - -static -inline -gf_val_32_t gf_wgen_divide_from_inverse (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - b = gf->inverse.w32(gf, b); - return gf->multiply.w32(gf, a, b); -} - -static -inline -gf_val_32_t gf_wgen_euclid (gf_t *gf, gf_val_32_t b) -{ - - gf_val_32_t e_i, e_im1, e_ip1; - gf_val_32_t d_i, d_im1, d_ip1; - gf_val_32_t y_i, y_im1, y_ip1; - gf_val_32_t c_i; - - if (b == 0) return -1; - e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; - e_i = b; - d_im1 = ((gf_internal_t *) (gf->scratch))->w; - for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ; - y_i = 1; - y_im1 = 0; - - while (e_i != 1) { - - e_ip1 = e_im1; - d_ip1 = d_im1; - c_i = 0; - - while (d_ip1 >= d_i) { - c_i ^= (1 << (d_ip1 - d_i)); - e_ip1 ^= (e_i << (d_ip1 - d_i)); - if (e_ip1 == 0) return 0; - while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; - } - - y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i); - y_im1 = y_i; - y_i = y_ip1; - - e_im1 = e_i; - d_im1 = d_i; - e_i = e_ip1; - d_i = d_ip1; - } - - return y_i; -} - -gf_val_32_t gf_wgen_extract_word(gf_t *gf, void *start, int bytes, int index) -{ - uint8_t *ptr; - uint32_t rv; - int rs; - int byte, bit, i; - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - rs = bytes / h->w; - byte = index/8; - bit = index%8; - - ptr = (uint8_t *) start; - ptr += bytes; - ptr -= rs; - ptr += byte; - - rv = 0; - for (i = 0; i < h->w; i++) { - rv <<= 1; - if ((*ptr) & (1 << bit)) rv |= 1; - ptr -= rs; - } - - return rv; -} - -static -inline -gf_val_32_t gf_wgen_matrix (gf_t *gf, gf_val_32_t b) -{ - return gf_bitmatrix_inverse(b, ((gf_internal_t *) (gf->scratch))->w, - ((gf_internal_t *) (gf->scratch))->prim_poly); -} - -static -inline -uint32_t -gf_wgen_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32) -{ - uint64_t product, i, pp, a, b, one; - gf_internal_t *h; - - a = a32; - b = b32; - h = (gf_internal_t *) gf->scratch; - one = 1; - pp = h->prim_poly | (one << h->w); - - product = 0; - - for (i = 0; i < h->w; i++) { - if (a & (one << i)) product ^= (b << i); - } - for (i = h->w*2-1; i >= h->w; i--) { - if (product & (one << i)) product ^= (pp << (i-h->w)); - } - return product; -} - -static -int gf_wgen_shift_init(gf_t *gf) -{ - gf->multiply.w32 = gf_wgen_shift_multiply; - gf->inverse.w32 = gf_wgen_euclid; - return 1; -} - -static -gf_val_32_t -gf_wgen_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - uint32_t prod, pp, bmask; - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - prod = 0; - bmask = (1 << (h->w-1)); - - while (1) { - if (a & 1) prod ^= b; - a >>= 1; - if (a == 0) return prod; - if (b & bmask) { - b = ((b << 1) ^ pp); - } else { - b <<= 1; - } - } -} - -static -int gf_wgen_bytwo_b_init(gf_t *gf) -{ - gf->multiply.w32 = 
gf_wgen_bytwo_b_multiply; - gf->inverse.w32 = gf_wgen_euclid; - return 1; -} - -static -inline -gf_val_32_t -gf_wgen_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - uint32_t prod, pp, pmask, amask; - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - prod = 0; - pmask = (1 << ((h->w)-1)); /*Ben: Had an operator precedence warning here*/ - amask = pmask; - - while (amask != 0) { - if (prod & pmask) { - prod = ((prod << 1) ^ pp); - } else { - prod <<= 1; - } - if (a & amask) prod ^= b; - amask >>= 1; - } - return prod; -} - - -static -int gf_wgen_bytwo_p_init(gf_t *gf) -{ - gf->multiply.w32 = gf_wgen_bytwo_p_multiply; - gf->inverse.w32 = gf_wgen_euclid; - return 1; -} - -static -void -gf_wgen_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h) -{ - int i; - uint32_t j; - int g_s; - - if (h->mult_type == GF_MULT_DEFAULT) { - g_s = 2; - } else { - g_s = h->arg1; - } - - shift[0] = 0; - - for (i = 1; i < (1 << g_s); i <<= 1) { - for (j = 0; j < i; j++) shift[i|j] = shift[j]^val; - if (val & (1 << (h->w-1))) { - val <<= 1; - val ^= h->prim_poly; - } else { - val <<= 1; - } - } -} - -static -inline -gf_val_32_t -gf_wgen_group_s_equals_r_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - int leftover, rs; - uint32_t p, l, ind, a32; - int bits_left; - int g_s; - int w; - - struct gf_wgen_group_data *gd; - gf_internal_t *h = (gf_internal_t *) gf->scratch; - g_s = h->arg1; - w = h->w; - - gd = (struct gf_wgen_group_data *) h->private; - gf_wgen_group_set_shift_tables(gd->shift, b, h); - - leftover = w % g_s; - if (leftover == 0) leftover = g_s; - - rs = w - leftover; - a32 = a; - ind = a32 >> rs; - a32 <<= leftover; - a32 &= gd->mask; - p = gd->shift[ind]; - - bits_left = rs; - rs = w - g_s; - - while (bits_left > 0) { - bits_left -= g_s; - ind = a32 >> rs; - a32 <<= g_s; - a32 &= gd->mask; - l = p >> rs; - p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s)) & gd->mask; - } - return p; -} - -char *bits(uint32_t v) -{ - char *rv; - int i, j; - - rv = malloc(30); - j = 0; - for (i = 27; i >= 0; i--) { - rv[j] = '0' + ((v & (1 << i)) ? 1 : 0); - j++; - } - rv[j] = '\0'; - return rv; -} -char *bits_56(uint64_t v) -{ - char *rv; - int i, j; - uint64_t one; - - one = 1; - - rv = malloc(60); - j = 0; - for (i = 55; i >= 0; i--) { - rv[j] = '0' + ((v & (one << i)) ? 
1 : 0); - j++; - } - rv[j] = '\0'; - return rv; -} - -static -inline -gf_val_32_t -gf_wgen_group_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - int i; - int leftover; - uint64_t p, l, r; - uint32_t a32, ind; - int g_s, g_r; - struct gf_wgen_group_data *gd; - int w; - - gf_internal_t *h = (gf_internal_t *) gf->scratch; - if (h->mult_type == GF_MULT_DEFAULT) { - g_s = 2; - g_r = 8; - } else { - g_s = h->arg1; - g_r = h->arg2; - } - w = h->w; - gd = (struct gf_wgen_group_data *) h->private; - gf_wgen_group_set_shift_tables(gd->shift, b, h); - - leftover = w % g_s; - if (leftover == 0) leftover = g_s; - - a32 = a; - ind = a32 >> (w - leftover); - p = gd->shift[ind]; - p <<= g_s; - a32 <<= leftover; - a32 &= gd->mask; - - i = (w - leftover); - while (i > g_s) { - ind = a32 >> (w-g_s); - p ^= gd->shift[ind]; - a32 <<= g_s; - a32 &= gd->mask; - p <<= g_s; - i -= g_s; - } - - ind = a32 >> (h->w-g_s); - p ^= gd->shift[ind]; - - for (i = gd->tshift ; i >= 0; i -= g_r) { - l = p & (gd->rmask << i); - r = gd->reduce[l >> (i+w)]; - r <<= (i); - p ^= r; - } - return p & gd->mask; -} - -static -int gf_wgen_group_init(gf_t *gf) -{ - uint32_t i, j, p, index; - struct gf_wgen_group_data *gd; - gf_internal_t *h = (gf_internal_t *) gf->scratch; - int g_s, g_r; - - if (h->mult_type == GF_MULT_DEFAULT) { - g_s = 2; - g_r = 8; - } else { - g_s = h->arg1; - g_r = h->arg2; - } - gd = (struct gf_wgen_group_data *) h->private; - gd->shift = &(gd->memory); - gd->reduce = gd->shift + (1 << g_s); - gd->mask = (h->w != 31) ? ((1 << h->w)-1) : 0x7fffffff; - - gd->rmask = (1 << g_r) - 1; - gd->rmask <<= h->w; - - gd->tshift = h->w % g_s; - if (gd->tshift == 0) gd->tshift = g_s; - gd->tshift = (h->w - gd->tshift); - gd->tshift = ((gd->tshift-1)/g_r) * g_r; - - gd->reduce[0] = 0; - for (i = 0; i < (1 << g_r); i++) { - p = 0; - index = 0; - for (j = 0; j < g_r; j++) { - if (i & (1 << j)) { - p ^= (h->prim_poly << j); - index ^= (h->prim_poly >> (h->w-j)); - } - } - gd->reduce[index] = (p & gd->mask); - } - - if (g_s == g_r) { - gf->multiply.w32 = gf_wgen_group_s_equals_r_multiply; - } else { - gf->multiply.w32 = gf_wgen_group_multiply; - } - gf->divide.w32 = NULL; - gf->divide.w32 = NULL; - return 1; -} - - -static -gf_val_32_t -gf_wgen_table_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - gf_internal_t *h; - struct gf_wgen_table_w8_data *std; - - h = (gf_internal_t *) gf->scratch; - std = (struct gf_wgen_table_w8_data *) h->private; - - return (std->mult[(a<<h->w)+b]); -} - -static -gf_val_32_t -gf_wgen_table_8_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - gf_internal_t *h; - struct gf_wgen_table_w8_data *std; - - h = (gf_internal_t *) gf->scratch; - std = (struct gf_wgen_table_w8_data *) h->private; - - return (std->div[(a<<h->w)+b]); -} - -static -int gf_wgen_table_8_init(gf_t *gf) -{ - gf_internal_t *h; - int w; - struct gf_wgen_table_w8_data *std; - uint32_t a, b, p; - - h = (gf_internal_t *) gf->scratch; - w = h->w; - std = (struct gf_wgen_table_w8_data *) h->private; - - std->mult = &(std->base); - std->div = std->mult + ((1<<w)*(1<<w)); - - for (a = 0; a < (1 << w); a++) { - std->mult[a] = 0; - std->mult[a<<w] = 0; - std->div[a] = 0; - std->div[a<<w] = 0; - } - - for (a = 1; a < (1 << w); a++) { - for (b = 1; b < (1 << w); b++) { - p = gf_wgen_shift_multiply(gf, a, b); - std->mult[(a<<w)|b] = p; - std->div[(p<<w)|a] = b; - } - } - - gf->multiply.w32 = gf_wgen_table_8_multiply; - gf->divide.w32 = gf_wgen_table_8_divide; - return 1; -} - -static -gf_val_32_t -gf_wgen_table_16_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - gf_internal_t *h; - struct gf_wgen_table_w16_data *std; - - h = (gf_internal_t *) gf->scratch; - std = (struct gf_wgen_table_w16_data *) h->private; - - return (std->mult[(a<<h->w)+b]); -} - -static -gf_val_32_t -gf_wgen_table_16_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - gf_internal_t *h; - struct gf_wgen_table_w16_data *std; - - h = (gf_internal_t *) gf->scratch; - std = (struct gf_wgen_table_w16_data *) h->private; - - return (std->div[(a<<h->w)+b]); -} - -static -int gf_wgen_table_16_init(gf_t *gf) -{ - gf_internal_t *h; - int w; - struct gf_wgen_table_w16_data *std; - uint32_t a, b, p; - - h = (gf_internal_t *) gf->scratch; - w = h->w; - std = (struct gf_wgen_table_w16_data *) h->private; - - std->mult = &(std->base); - std->div = std->mult + ((1<<w)*(1<<w)); - - for (a = 0; a < (1 << w); a++) { - std->mult[a] = 0; - std->mult[a<<w] = 0; - std->div[a] = 0; - std->div[a<<w] = 0; - } - - for (a = 1; a < (1 << w); a++) { - for (b = 1; b < (1 << w); b++) { - p = gf_wgen_shift_multiply(gf, a, b); - std->mult[(a<<w)|b] = p; - std->div[(p<<w)|a] = b; - } - } - - gf->multiply.w32 = gf_wgen_table_16_multiply; - gf->divide.w32 = gf_wgen_table_16_divide; - return 1; -} - -static -int gf_wgen_table_init(gf_t *gf) -{ - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - if (h->w <= 8) return gf_wgen_table_8_init(gf); - if (h->w <= 14) return gf_wgen_table_16_init(gf); - - /* Returning zero to make the compiler happy, but this won't get - executed, because it is tested in _scratch_space. */ - - return 0; -} - -static -gf_val_32_t -gf_wgen_log_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - gf_internal_t *h; - struct gf_wgen_log_w8_data *std; - - h = (gf_internal_t *) gf->scratch; - std = (struct gf_wgen_log_w8_data *) h->private; - - if (a == 0 || b == 0) return 0; - return (std->anti[std->log[a]+std->log[b]]); -} - -static -gf_val_32_t -gf_wgen_log_8_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - gf_internal_t *h; - struct gf_wgen_log_w8_data *std; - int index; - - h = (gf_internal_t *) gf->scratch; - std = (struct gf_wgen_log_w8_data *) h->private; - - if (a == 0 || b == 0) return 0; - index = std->log[a]; - index -= std->log[b]; - - return (std->danti[index]); -} - -static -int gf_wgen_log_8_init(gf_t *gf) -{ - gf_internal_t *h; - struct gf_wgen_log_w8_data *std; - int w; - uint32_t a, i; - int check = 0; - - h = (gf_internal_t *) gf->scratch; - w = h->w; - std = (struct gf_wgen_log_w8_data *) h->private; - - std->log = &(std->base); - std->anti = std->log + (1<<w); - std->danti = std->anti + (1<<w)-1; - - for (i = 0; i < (1 << w); i++) - std->log[i] = 0; - - a = 1; - for(i=0; i < (1<<w)-1; i++) - { - if (std->log[a] != 0) check = 1; - std->log[a] = i; - std->anti[i] = a; - std->danti[i] = a; - a <<= 1; - if(a & (1<<w)) - a ^= h->prim_poly; - //a &= ((1 << w)-1); - } - - if (check != 0) { - _gf_errno = GF_E_LOGPOLY; - return 0; - } - - gf->multiply.w32 = gf_wgen_log_8_multiply; - gf->divide.w32 = gf_wgen_log_8_divide; - return 1; -} - -static -gf_val_32_t -gf_wgen_log_16_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - gf_internal_t *h; - struct gf_wgen_log_w16_data *std; - - h = (gf_internal_t *) gf->scratch; - std = (struct gf_wgen_log_w16_data *) h->private; - - if (a == 0 || b == 0) return 0; - return (std->anti[std->log[a]+std->log[b]]); -} - -static -gf_val_32_t -gf_wgen_log_16_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - gf_internal_t *h; - struct gf_wgen_log_w16_data *std; - int index; - - h = (gf_internal_t *) gf->scratch; - std = (struct gf_wgen_log_w16_data *) h->private; - - if (a == 0 || b == 0) return 0; - index = std->log[a]; - index -= std->log[b]; - - return (std->danti[index]); -} - -static -int gf_wgen_log_16_init(gf_t *gf) -{ - gf_internal_t *h; - struct gf_wgen_log_w16_data *std; - int w; - uint32_t a, i; - int check = 0; - - h = (gf_internal_t *) gf->scratch; - w = h->w; - std = (struct gf_wgen_log_w16_data *) h->private; - - std->log = &(std->base); - std->anti = std->log + (1<<w); - std->danti = std->anti + (1<<w)-1; - - for (i = 0; i < (1 << w); i++) - std->log[i] = 0; - - a = 1; - for(i=0; i < (1<<w)-1; i++) - { - if (std->log[a] != 0) check = 1; - std->log[a] = i; - std->anti[i] = a; - std->danti[i] = a; - a <<= 1; - if(a & (1<<w)) - a ^= h->prim_poly; - //a &= ((1 << w)-1); - } - - if (check) { - if (h->mult_type != GF_MULT_LOG_TABLE) return gf_wgen_shift_init(gf); - _gf_errno = GF_E_LOGPOLY; - return 0; - } - - gf->multiply.w32 = gf_wgen_log_16_multiply; - gf->divide.w32 = gf_wgen_log_16_divide; - return 1; -} - -static -gf_val_32_t -gf_wgen_log_32_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - gf_internal_t *h; - struct gf_wgen_log_w32_data *std; - - h = (gf_internal_t *) gf->scratch; - std = (struct gf_wgen_log_w32_data *) h->private; - - if (a == 0 || b == 0) return 0; - return (std->anti[std->log[a]+std->log[b]]); -} - -static -gf_val_32_t -gf_wgen_log_32_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - gf_internal_t *h; - struct gf_wgen_log_w32_data *std; - int index; - - h = (gf_internal_t *) gf->scratch; - std = (struct gf_wgen_log_w32_data *) h->private; - - if (a == 0 || b == 0) return 0; - index = std->log[a]; - index -= std->log[b]; - - return (std->danti[index]); -} - -static -int gf_wgen_log_32_init(gf_t *gf) -{ - gf_internal_t *h; - struct gf_wgen_log_w32_data *std; - int w; - uint32_t a, i; - int check = 0; - - h = (gf_internal_t *) gf->scratch; - w = h->w; - std = (struct gf_wgen_log_w32_data *) h->private; - - std->log = &(std->base); - std->anti = std->log + (1<<w); - std->danti = std->anti + (1<<w)-1; - - for (i = 0; i < (1 << w); i++) - std->log[i] = 0; - - a = 1; - for(i=0; i < (1<<w)-1; i++) - { - if (std->log[a] != 0) check = 1; - std->log[a] = i; - std->anti[i] = a; - std->danti[i] = a; - a <<= 1; - if(a & (1<<w)) - a ^= h->prim_poly; - //a &= ((1 << w)-1); - } - - if (check != 0) { - _gf_errno = GF_E_LOGPOLY; - return 0; - } - - gf->multiply.w32 = gf_wgen_log_32_multiply; - gf->divide.w32 = gf_wgen_log_32_divide; - return 1; -} - -static -int gf_wgen_log_init(gf_t *gf) -{ - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - if (h->w <= 8) return gf_wgen_log_8_init(gf); - if (h->w <= 16) return gf_wgen_log_16_init(gf); - if (h->w <= 32) return gf_wgen_log_32_init(gf); - - /* Returning zero to make the compiler happy, but this won't get - executed, because it is tested in _scratch_space. */ - - return 0; -} - -int gf_wgen_scratch_size(int w, int mult_type, int region_type, int divide_type, int arg1, int arg2) -{ - - switch(mult_type) - { - case GF_MULT_DEFAULT: - if (w <= 8) { - return sizeof(gf_internal_t) + sizeof(struct gf_wgen_table_w8_data) + - sizeof(uint8_t)*(1 << w)*(1<scratch; - rs = bytes / (h->w); - - written = (xor) ? 
0xffffffff : 0; - for (i = 0; i < h->w; i++) { - for (j = 0; j < h->w; j++) { - if (val & (1 << j)) { - gf_multby_one(src, ((char*)dest) + j*rs, rs, (written & (1 << j))); - written |= (1 << j); - } - } - src = (char*)src + rs; - val = gf->multiply.w32(gf, val, 2); - } -} - -int gf_wgen_init(gf_t *gf) -{ - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - if (h->prim_poly == 0) { - switch (h->w) { - case 1: h->prim_poly = 1; break; - case 2: h->prim_poly = 7; break; - case 3: h->prim_poly = 013; break; - case 4: h->prim_poly = 023; break; - case 5: h->prim_poly = 045; break; - case 6: h->prim_poly = 0103; break; - case 7: h->prim_poly = 0211; break; - case 8: h->prim_poly = 0435; break; - case 9: h->prim_poly = 01021; break; - case 10: h->prim_poly = 02011; break; - case 11: h->prim_poly = 04005; break; - case 12: h->prim_poly = 010123; break; - case 13: h->prim_poly = 020033; break; - case 14: h->prim_poly = 042103; break; - case 15: h->prim_poly = 0100003; break; - case 16: h->prim_poly = 0210013; break; - case 17: h->prim_poly = 0400011; break; - case 18: h->prim_poly = 01000201; break; - case 19: h->prim_poly = 02000047; break; - case 20: h->prim_poly = 04000011; break; - case 21: h->prim_poly = 010000005; break; - case 22: h->prim_poly = 020000003; break; - case 23: h->prim_poly = 040000041; break; - case 24: h->prim_poly = 0100000207; break; - case 25: h->prim_poly = 0200000011; break; - case 26: h->prim_poly = 0400000107; break; - case 27: h->prim_poly = 01000000047; break; - case 28: h->prim_poly = 02000000011; break; - case 29: h->prim_poly = 04000000005; break; - case 30: h->prim_poly = 010040000007; break; - case 31: h->prim_poly = 020000000011; break; - case 32: h->prim_poly = 00020000007; break; - default: fprintf(stderr, "gf_wgen_init: w not defined yet\n"); exit(1); - } - } else { - if (h->w == 32) { - h->prim_poly &= 0xffffffff; - } else { - h->prim_poly |= (1 << h->w); - if (h->prim_poly & ~((1ULL<<(h->w+1))-1)) return 0; - } - } - - gf->multiply.w32 = NULL; - gf->divide.w32 = NULL; - gf->inverse.w32 = NULL; - gf->multiply_region.w32 = gf_wgen_cauchy_region; - gf->extract_word.w32 = gf_wgen_extract_word; - - switch(h->mult_type) { - case GF_MULT_DEFAULT: - if (h->w <= 8) { - if (gf_wgen_table_init(gf) == 0) return 0; - } else if (h->w <= 16) { - if (gf_wgen_log_init(gf) == 0) return 0; - } else { - if (gf_wgen_bytwo_p_init(gf) == 0) return 0; - } - break; - case GF_MULT_SHIFT: if (gf_wgen_shift_init(gf) == 0) return 0; break; - case GF_MULT_BYTWO_b: if (gf_wgen_bytwo_b_init(gf) == 0) return 0; break; - case GF_MULT_BYTWO_p: if (gf_wgen_bytwo_p_init(gf) == 0) return 0; break; - case GF_MULT_GROUP: if (gf_wgen_group_init(gf) == 0) return 0; break; - case GF_MULT_TABLE: if (gf_wgen_table_init(gf) == 0) return 0; break; - case GF_MULT_LOG_TABLE: if (gf_wgen_log_init(gf) == 0) return 0; break; - default: return 0; - } - if (h->divide_type == GF_DIVIDE_EUCLID) { - gf->divide.w32 = gf_wgen_divide_from_inverse; - gf->inverse.w32 = gf_wgen_euclid; - } else if (h->divide_type == GF_DIVIDE_MATRIX) { - gf->divide.w32 = gf_wgen_divide_from_inverse; - gf->inverse.w32 = gf_wgen_matrix; - } - - if (gf->inverse.w32== NULL && gf->divide.w32 == NULL) gf->inverse.w32 = gf_wgen_euclid; - - if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) { - gf->divide.w32 = gf_wgen_divide_from_inverse; - } - if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) { - gf->inverse.w32 = gf_wgen_inverse_from_divide; - } - return 1; -} diff --git a/src/erasure-code/jerasure/jerasure/include/cauchy.h 
b/src/erasure-code/jerasure/jerasure/include/cauchy.h deleted file mode 100644 index a4fad6bd6ee7f..0000000000000 --- a/src/erasure-code/jerasure/jerasure/include/cauchy.h +++ /dev/null @@ -1,45 +0,0 @@ -/* * - * Copyright (c) 2013, James S. Plank and Kevin Greenan - * All rights reserved. - * - * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure - * Coding Techniques - * - * Revision 2.0: Galois Field backend now links to GF-Complete - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * - Neither the name of the University of Tennessee nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS - * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY - * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - - -extern int *cauchy_original_coding_matrix(int k, int m, int w); -extern int *cauchy_xy_coding_matrix(int k, int m, int w, int *x, int *y); -extern void cauchy_improve_coding_matrix(int k, int m, int w, int *matrix); -extern int *cauchy_good_general_coding_matrix(int k, int m, int w); -extern int cauchy_n_ones(int n, int w); diff --git a/src/erasure-code/jerasure/jerasure/include/galois.h b/src/erasure-code/jerasure/jerasure/include/galois.h deleted file mode 100644 index d75be6a5d9a0a..0000000000000 --- a/src/erasure-code/jerasure/jerasure/include/galois.h +++ /dev/null @@ -1,99 +0,0 @@ -/* * - * Copyright (c) 2013, James S. Plank and Kevin Greenan - * All rights reserved. - * - * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure - * Coding Techniques - * - * Revision 2.0: Galois Field backend now links to GF-Complete - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. 
- * - * - Neither the name of the University of Tennessee nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS - * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY - * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - - -#ifndef _GALOIS_H -#define _GALOIS_H - -#include <stdio.h> -#include <stdlib.h> -#include <gf_complete.h> - -extern void galois_change_technique(gf_t *gf, int w); - -extern int galois_single_multiply(int a, int b, int w); -extern int galois_single_divide(int a, int b, int w); -extern int galois_inverse(int x, int w); - -void galois_region_xor( char *src, /* Source Region */ - char *dest, /* Dest Region (holds result) */ - int nbytes); /* Number of bytes in region */ - -/* These multiply regions in w=8, w=16 and w=32. They are much faster - than calling galois_single_multiply. The regions must be long word aligned. */ - -void galois_w08_region_multiply(char *region, /* Region to multiply */ - int multby, /* Number to multiply by */ - int nbytes, /* Number of bytes in region */ - char *r2, /* If r2 != NULL, products go here. - Otherwise region is overwritten */ - int add); /* If (r2 != NULL && add) the product is XOR'd with r2 */ - -void galois_w16_region_multiply(char *region, /* Region to multiply */ - int multby, /* Number to multiply by */ - int nbytes, /* Number of bytes in region */ - char *r2, /* If r2 != NULL, products go here. - Otherwise region is overwritten */ - int add); /* If (r2 != NULL && add) the product is XOR'd with r2 */ - -void galois_w32_region_multiply(char *region, /* Region to multiply */ - int multby, /* Number to multiply by */ - int nbytes, /* Number of bytes in region */ - char *r2, /* If r2 != NULL, products go here. - Otherwise region is overwritten */ - int add); /* If (r2 != NULL && add) the product is XOR'd with r2 */ - -gf_t* galois_init_field(int w, - int mult_type, - int region_type, - int divide_type, - uint64_t prim_poly, - int arg1, - int arg2); - -gf_t* galois_init_composite_field(int w, - int region_type, - int divide_type, - int degree, - gf_t* base_gf); - -gf_t * galois_get_field_ptr(int w); - - -#endif diff --git a/src/erasure-code/jerasure/jerasure/include/jerasure.h b/src/erasure-code/jerasure/jerasure/include/jerasure.h deleted file mode 100644 index 08367809b7152..0000000000000 --- a/src/erasure-code/jerasure/jerasure/include/jerasure.h +++ /dev/null @@ -1,294 +0,0 @@ -/* * - * Copyright (c) 2013, James S. Plank and Kevin Greenan - * All rights reserved. 
- * - * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure - * Coding Techniques - * - * Revision 2.0: Galois Field backend now links to GF-Complete - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * - Neither the name of the University of Tennessee nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS - * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY - * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - - -#ifndef _JERASURE_H -#define _JERASURE_H - -/* This uses procedures from the Galois Field arithmetic library */ - -#include "galois.h" - -/* ------------------------------------------------------------ */ -/* In all of the routines below: - - k = Number of data devices - m = Number of coding devices - w = Word size - - data_ptrs = An array of k pointers to data which is size bytes. - Size must be a multiple of sizeof(long). - Pointers must also be longword aligned. - - coding_ptrs = An array of m pointers to coding data which is size bytes. - - packetsize = The size of a coding block with bitmatrix coding. - When you code with a bitmatrix, you will use w packets - of size packetsize. - - matrix = an array of k*m integers. - It represents an m by k matrix. - Element i,j is in matrix[i*k+j]; - - bitmatrix = an array of k*m*w*w integers. - It represents an mw by kw matrix. - Element i,j is in matrix[i*k*w+j]; - - erasures = an array of id's of erased devices. - Id's are integers between 0 and k+m-1. - Id's 0 to k-1 are id's of data devices. - Id's k to k+m-1 are id's of coding devices: - Coding device id = id-k. - If there are e erasures, erasures[e] = -1. - - schedule = an array of schedule operations. - - If there are m operations, then schedule[m][0] = -1. - - operation = an array of 5 integers: - - 0 = operation: 0 for copy, 1 for xor (-1 for end) - 1 = source device (0 - k+m-1) - 2 = source packet (0 - w-1) - 3 = destination device (0 - k+m-1) - 4 = destination packet (0 - w-1) - */ - -/* --------------------------------------------------------------- */ -/* Bitmatrices / schedules ---------------------------------------- */ -/* - - jerasure_matrix_to_bitmatrix turns a m X k matrix in GF(2^w) into a - wm X wk bitmatrix (in GF(2)). This is - explained in the Cauchy Reed-Solomon coding - paper. 
- - - jerasure_dumb_bitmatrix_to_schedule turns a bitmatrix into a schedule - using the straightforward algorithm -- just - schedule the dot products defined by each - row of the matrix. - - - jerasure_smart_bitmatrix_to_schedule turns a bitmatrix into a schedule, - but tries to use previous dot products to - calculate new ones. This is the optimization - explained in the original Liberation code paper. - - - jerasure_generate_schedule_cache precalculates all the schedules for the - given distribution bitmatrix. M must equal 2. - - - jerasure_free_schedule frees a schedule that was allocated with - jerasure_XXX_bitmatrix_to_schedule. - - - jerasure_free_schedule_cache frees a schedule cache that was created with - jerasure_generate_schedule_cache. - */ - -int *jerasure_matrix_to_bitmatrix(int k, int m, int w, int *matrix); -int **jerasure_dumb_bitmatrix_to_schedule(int k, int m, int w, int *bitmatrix); -int **jerasure_smart_bitmatrix_to_schedule(int k, int m, int w, int *bitmatrix); -int ***jerasure_generate_schedule_cache(int k, int m, int w, int *bitmatrix, int smart); - -void jerasure_free_schedule(int **schedule); -void jerasure_free_schedule_cache(int k, int m, int ***cache); - - -/* ------------------------------------------------------------ */ -/* Encoding - these are all straightforward. jerasure_matrix_encode only - works with w = 8|16|32. */ - -void jerasure_do_parity(int k, char **data_ptrs, char *parity_ptr, int size); - -void jerasure_matrix_encode(int k, int m, int w, int *matrix, - char **data_ptrs, char **coding_ptrs, int size); - -void jerasure_bitmatrix_encode(int k, int m, int w, int *bitmatrix, - char **data_ptrs, char **coding_ptrs, int size, int packetsize); - -void jerasure_schedule_encode(int k, int m, int w, int **schedule, - char **data_ptrs, char **coding_ptrs, int size, int packetsize); - -/* ------------------------------------------------------------ */ -/* Decoding. -------------------------------------------------- */ - -/* These return integers, because the matrix may not be invertible. - - The parameter row_k_ones should be set to 1 if row k of the matrix - (or rows kw to (k+1)w+1) of the distribution matrix are all ones - (or all identity matrices). Then you can improve the performance - of decoding when there is more than one failure, and the parity - device didn't fail. You do it by decoding all but one of the data - devices, and then decoding the last data device from the data devices - and the parity device. - - jerasure_schedule_decode_lazy generates the schedule on the fly. - - jerasure_matrix_decode only works when w = 8|16|32. - - jerasure_make_decoding_matrix/bitmatrix make the k*k decoding matrix - (or wk*wk bitmatrix) by taking the rows corresponding to k - non-erased devices of the distribution matrix, and then - inverting that matrix. - - You should already have allocated the decoding matrix and - dm_ids, which is a vector of k integers. These will be - filled in appropriately. dm_ids[i] is the id of element - i of the survivors vector. I.e. row i of the decoding matrix - times dm_ids equals data drive i. - - Both of these routines take "erased" instead of "erasures". - Erased is a vector with k+m elements, which has 0 or 1 for - each device's id, according to whether the device is erased. - - jerasure_erasures_to_erased allocates and returns erased from erasures. 
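To make the calling convention concrete, the following sketch (illustrative, not from the original header) repairs a single lost data device; it assumes k, m, w, the distribution matrix, and the data/coding pointer arrays are already set up as described above, and passes row_k_ones = 1 on the assumption that row k of the matrix is all ones:

    int erasures[2];
    erasures[0] = 3;    // id of the failed data device
    erasures[1] = -1;   // the erasure list is terminated by -1
    if (jerasure_matrix_decode(k, m, w, matrix, 1, erasures,
                               data_ptrs, coding_ptrs, size) < 0) {
      // decoding failed: the surviving rows did not form an invertible matrix
    }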
- - */ - -int jerasure_matrix_decode(int k, int m, int w, - int *matrix, int row_k_ones, int *erasures, - char **data_ptrs, char **coding_ptrs, int size); - -int jerasure_bitmatrix_decode(int k, int m, int w, - int *bitmatrix, int row_k_ones, int *erasures, - char **data_ptrs, char **coding_ptrs, int size, int packetsize); - -int jerasure_schedule_decode_lazy(int k, int m, int w, int *bitmatrix, int *erasures, - char **data_ptrs, char **coding_ptrs, int size, int packetsize, - int smart); - -int jerasure_schedule_decode_cache(int k, int m, int w, int ***scache, int *erasures, - char **data_ptrs, char **coding_ptrs, int size, int packetsize); - -int jerasure_make_decoding_matrix(int k, int m, int w, int *matrix, int *erased, - int *decoding_matrix, int *dm_ids); - -int jerasure_make_decoding_bitmatrix(int k, int m, int w, int *matrix, int *erased, - int *decoding_matrix, int *dm_ids); - -int *jerasure_erasures_to_erased(int k, int m, int *erasures); - -/* ------------------------------------------------------------ */ -/* These perform dot products and schedules. -------------------*/ -/* - src_ids is a matrix of k id's (0 - k-1 for data devices, k - k+m-1 - for coding devices) that identify the source devices. Dest_id is - the id of the destination device. - - jerasure_matrix_dotprod only works when w = 8|16|32. - - jerasure_do_scheduled_operations executes the schedule on w*packetsize worth of - bytes from each device. ptrs is an array of pointers which should have as many - elements as the highest referenced device in the schedule. - - */ - -void jerasure_matrix_dotprod(int k, int w, int *matrix_row, - int *src_ids, int dest_id, - char **data_ptrs, char **coding_ptrs, int size); - -void jerasure_bitmatrix_dotprod(int k, int w, int *bitmatrix_row, - int *src_ids, int dest_id, - char **data_ptrs, char **coding_ptrs, int size, int packetsize); - -void jerasure_do_scheduled_operations(char **ptrs, int **schedule, int packetsize); - -/* ------------------------------------------------------------ */ -/* Matrix Inversion ------------------------------------------- */ -/* - The two matrix inversion functions work on rows*rows matrices of - ints. If a bitmatrix, then each int will just be zero or one. - Otherwise, they will be elements of gf(2^w). Obviously, you can - do bit matrices with crs_invert_matrix() and set w = 1, but - crs_invert_bitmatrix will be more efficient. - - The two invertible functions return whether a matrix is invertible. - They are more efficient than the inversion functions. - - Mat will be destroyed when the matrix inversion or invertible - testing is done. Sorry. - - Inv must be allocated by the caller. - - The two invert_matrix functions return 0 on success, and -1 if the - matrix is uninvertible. - - The two invertible functions simply return whether the matrix is - invertible (0 or 1). Mat will be destroyed. - */ - -int jerasure_invert_matrix(int *mat, int *inv, int rows, int w); -int jerasure_invert_bitmatrix(int *mat, int *inv, int rows); -int jerasure_invertible_matrix(int *mat, int rows, int w); -int jerasure_invertible_bitmatrix(int *mat, int rows); - -/* ------------------------------------------------------------ */ -/* Basic matrix operations -------------------------------------*/ -/* - Each of the print_matrix routines requires a w. In jerasure_print_matrix, - this is to calculate the field width. In jerasure_print_bitmatrix, it is - to put spaces between the bits. - - jerasure_matrix_multiply is a simple matrix multiplier in GF(2^w). 
It returns an r1*c2 matrix, which is the product of
-   the two input matrices.  It allocates the product.  Obviously, c1
-   should equal r2.  However, this is not validated by the procedure.
-*/
-
-void jerasure_print_matrix(int *matrix, int rows, int cols, int w);
-void jerasure_print_bitmatrix(int *matrix, int rows, int cols, int w);
-
-
-int *jerasure_matrix_multiply(int *m1, int *m2, int r1, int c1, int r2, int c2, int w);
-
-/* ------------------------------------------------------------ */
-/* Stats ------------------------------------------------------ */
-/*
-  jerasure_get_stats fills in a vector of three doubles:
-
-      fill_in[0] is the number of bytes that have been XOR'd
-      fill_in[1] is the number of bytes that have been copied
-      fill_in[2] is the number of bytes that have been multiplied
-                 by a constant in GF(2^w)
-
-  When jerasure_get_stats() is called, it resets its values.
- */
-
-void jerasure_get_stats(double *fill_in);
-
-int jerasure_autoconf_test();
-
-#endif
diff --git a/src/erasure-code/jerasure/jerasure/include/liberation.h b/src/erasure-code/jerasure/jerasure/include/liberation.h
deleted file mode 100644
index f2fb7233fcf99..0000000000000
--- a/src/erasure-code/jerasure/jerasure/include/liberation.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* *
- * Copyright (c) 2013, James S. Plank and Kevin Greenan
- * All rights reserved.
- *
- * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure
- * Coding Techniques
- *
- * Revision 2.0: Galois Field backend now links to GF-Complete
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- *  - Neither the name of the University of Tennessee nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
- * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-
-#ifndef _LIBERATION
-#define _LIBERATION
-
-extern int *liberation_coding_bitmatrix(int k, int w);
-extern int *liber8tion_coding_bitmatrix(int k);
-extern int *blaum_roth_coding_bitmatrix(int k, int w);
-
-#endif
diff --git a/src/erasure-code/jerasure/jerasure/include/reed_sol.h b/src/erasure-code/jerasure/jerasure/include/reed_sol.h
deleted file mode 100644
index d2d8fe8caf2f1..0000000000000
--- a/src/erasure-code/jerasure/jerasure/include/reed_sol.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* *
- * Copyright (c) 2013, James S. Plank and Kevin Greenan
- * All rights reserved.
- *
- * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure
- * Coding Techniques
- *
- * Revision 2.0: Galois Field backend now links to GF-Complete
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- *  - Neither the name of the University of Tennessee nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
- * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-
-extern int *reed_sol_vandermonde_coding_matrix(int k, int m, int w);
-extern int *reed_sol_extended_vandermonde_matrix(int rows, int cols, int w);
-extern int *reed_sol_big_vandermonde_distribution_matrix(int rows, int cols, int w);
-
-extern int reed_sol_r6_encode(int k, int w, char **data_ptrs, char **coding_ptrs, int size);
-extern int *reed_sol_r6_coding_matrix(int k, int w);
-
-extern void reed_sol_galois_w08_region_multby_2(char *region, int nbytes);
-extern void reed_sol_galois_w16_region_multby_2(char *region, int nbytes);
-extern void reed_sol_galois_w32_region_multby_2(char *region, int nbytes);
diff --git a/src/erasure-code/jerasure/jerasure/src/cauchy.c b/src/erasure-code/jerasure/jerasure/src/cauchy.c
deleted file mode 100644
index f63dfb7eab48f..0000000000000
--- a/src/erasure-code/jerasure/jerasure/src/cauchy.c
+++ /dev/null
@@ -1,405 +0,0 @@
-/* *
- * Copyright (c) 2014, James S. Plank and Kevin Greenan
- * All rights reserved.
- * - * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure - * Coding Techniques - * - * Revision 2.0: Galois Field backend now links to GF-Complete - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * - Neither the name of the University of Tennessee nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS - * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY - * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/* Jerasure's authors: - - Revision 2.x - 2014: James S. Plank and Kevin M. Greenan - Revision 1.2 - 2008: James S. Plank, Scott Simmerman and Catherine D. Schuman. - Revision 1.0 - 2007: James S. 
Plank
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "galois.h"
-#include "jerasure.h"
-#include "cauchy.h"
-
-static int PPs[33] = { -1, -1, -1, -1, -1, -1, -1, -1,
-                       -1, -1, -1, -1, -1, -1, -1, -1,
-                       -1, -1, -1, -1, -1, -1, -1, -1,
-                       -1, -1, -1, -1, -1, -1, -1, -1, -1 };
-static int NOs[33];
-static int ONEs[33][33];
-
-static int *cbest_0;
-static int *cbest_1;
-static int cbest_2[3];
-static int cbest_3[7];
-static int cbest_4[15];
-static int cbest_5[31];
-static int cbest_6[63];
-static int cbest_7[127];
-static int cbest_8[255];
-static int cbest_9[511];
-static int cbest_10[1023];
-static int cbest_11[1023];
-static int *cbest_12, *cbest_13, *cbest_14, *cbest_15, *cbest_16, *cbest_17, *cbest_18, *cbest_19, *cbest_20,
-  *cbest_21, *cbest_22, *cbest_23, *cbest_24, *cbest_25, *cbest_26, *cbest_27, *cbest_28, *cbest_29, *cbest_30,
-  *cbest_31, *cbest_32;
-
-static int cbest_max_k[33] = { -1, -1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 1023, -1,
-                               -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                               -1, -1, -1, -1 };
-
-static int cbest_init = 0;
-
-static int *cbest_all[33];
-
-
-#define talloc(type, num) (type *) malloc(sizeof(type)*(num))
-
-int cauchy_n_ones(int n, int w)
-{
-  int no;
-  int cno;
-  int nones;
-  int i, j;
-  int highbit;
-
-  highbit = (1 << (w-1));
-
-  if (PPs[w] == -1) {
-    nones = 0;
-    PPs[w] = galois_single_multiply(highbit, 2, w);
-    for (i = 0; i < w; i++) {
-      if (PPs[w] & (1 << i)) {
-        ONEs[w][nones] = (1 << i);
-        nones++;
-      }
-    }
-    NOs[w] = nones;
-  }
-
-  no = 0;
-  for (i = 0; i < w; i++) if (n & (1 << i)) no++;
-  cno = no;
-  for (i = 1; i < w; i++) {
-    if (n & highbit) {
-      n ^= highbit;
-      n <<= 1;
-      n ^= PPs[w];
-      cno--;
-      for (j = 0; j < NOs[w]; j++) {
-        cno += (n & ONEs[w][j]) ? 1 : -1;
-      }
-    } else {
-      n <<= 1;
-    }
-    no += cno;
-  }
-  return no;
-}
-
-int *cauchy_original_coding_matrix(int k, int m, int w)
-{
-  int *matrix;
-  int i, j, index;
-
-  if (w < 31 && (k+m) > (1 << w)) return NULL;
-  matrix = talloc(int, k*m);
-  if (matrix == NULL) return NULL;
-  index = 0;
-  for (i = 0; i < m; i++) {
-    for (j = 0; j < k; j++) {
-      matrix[index] = galois_single_divide(1, (i ^ (m+j)), w);
-      index++;
-    }
-  }
-  return matrix;
-}
-
-int *cauchy_xy_coding_matrix(int k, int m, int w, int *X, int *Y)
-{
-  int index, i, j;
-  int *matrix;
-
-  matrix = talloc(int, k*m);
-  if (matrix == NULL) { return NULL; }
-  index = 0;
-  for (i = 0; i < m; i++) {
-    for (j = 0; j < k; j++) {
-      matrix[index] = galois_single_divide(1, (X[i] ^ Y[j]), w);
-      index++;
-    }
-  }
-  return matrix;
-}
-
-void cauchy_improve_coding_matrix(int k, int m, int w, int *matrix)
-{
-  int index, i, j, x;
-  int tmp;
-  int bno, tno, bno_index;
-
-  for (j = 0; j < k; j++) {
-    if (matrix[j] != 1) {
-      tmp = galois_single_divide(1, matrix[j], w);
-      index = j;
-      for (i = 0; i < m; i++) {
-        matrix[index] = galois_single_multiply(matrix[index], tmp, w);
-        index += k;
-      }
-    }
-  }
-  for (i = 1; i < m; i++) {
-    bno = 0;
-    index = i*k;
-    for (j = 0; j < k; j++) bno += cauchy_n_ones(matrix[index+j], w);
-    bno_index = -1;
-    for (j = 0; j < k; j++) {
-      if (matrix[index+j] != 1) {
-        tmp = galois_single_divide(1, matrix[index+j], w);
-        tno = 0;
-        for (x = 0; x < k; x++) {
-          tno += cauchy_n_ones(galois_single_multiply(matrix[index+x], tmp, w), w);
-        }
-        if (tno < bno) {
-          bno = tno;
-          bno_index = j;
-        }
-      }
-    }
-    if (bno_index != -1) {
-      tmp = galois_single_divide(1, matrix[index+bno_index], w);
-      for (j = 0; j < k; j++) {
-        matrix[index+j] = galois_single_multiply(matrix[index+j], tmp, w);
-      }
-    }
-  }
-}
-
-int
*cauchy_good_general_coding_matrix(int k, int m, int w) -{ - int *matrix, i; - - if (m == 2 && k <= cbest_max_k[w]) { - matrix = talloc(int, k*m); - if (matrix == NULL) return NULL; - if (!cbest_init) { - cbest_init = 1; - cbest_all[0] = cbest_0; cbest_all[1] = cbest_1; cbest_all[2] = cbest_2; cbest_all[3] = cbest_3; cbest_all[4] = - cbest_4; cbest_all[5] = cbest_5; cbest_all[6] = cbest_6; cbest_all[7] = cbest_7; cbest_all[8] = cbest_8; - cbest_all[9] = cbest_9; cbest_all[10] = cbest_10; cbest_all[11] = cbest_11; cbest_all[12] = cbest_12; - cbest_all[13] = cbest_13; cbest_all[14] = cbest_14; cbest_all[15] = cbest_15; cbest_all[16] = cbest_16; - cbest_all[17] = cbest_17; cbest_all[18] = cbest_18; cbest_all[19] = cbest_19; cbest_all[20] = cbest_20; - cbest_all[21] = cbest_21; cbest_all[22] = cbest_22; cbest_all[23] = cbest_23; cbest_all[24] = cbest_24; - cbest_all[25] = cbest_25; cbest_all[26] = cbest_26; cbest_all[27] = cbest_27; cbest_all[28] = cbest_28; - cbest_all[29] = cbest_29; cbest_all[30] = cbest_30; cbest_all[31] = cbest_31; cbest_all[32] = (int *) cbest_32; - } - for (i = 0; i < k; i++) { - matrix[i] = 1; - matrix[i+k] = cbest_all[w][i]; - } - return matrix; - } else { - matrix = cauchy_original_coding_matrix(k, m, w); - if (matrix == NULL) return NULL; - cauchy_improve_coding_matrix(k, m, w, matrix); - return matrix; - } -} - -static int cbest_2[3] = { 1, 2, 3 }; -static int cbest_3[7] = { 1, 2, 5, 4, 7, 3, 6 }; - -static int cbest_4[15] = { 1, 2, 9, 4, 8, 13, 3, 6, 12, 5, 11, 15, 10, 14, 7 }; - -static int cbest_5[31] = { 1, 2, 18, 4, 9, 8, 22, 16, 3, 11, 19, 5, 10, 6, 20, 27, 13, 23, 26, 12, - 17, 25, 24, 31, 30, 7, 15, 21, 29, 14, 28 }; - -static int cbest_6[63] = { 1, 2, 33, 4, 8, 49, 16, 32, 57, 3, 6, 12, 24, 48, 5, 35, 9, 37, 10, 17, - 41, 51, 56, 61, 18, 28, 53, 14, 20, 34, 7, 13, 25, 36, 59, 26, 39, 40, 45, 50, 60, 52, 63, - 11, 30, 55, 19, 22, 29, 43, 58, 15, 21, 38, 44, 47, 62, 27, 54, 42, 31, 23, 46 }; - -static int cbest_7[127] = { 1, 2, 68, 4, 34, 8, 17, 16, 76, 32, 38, 3, 64, 69, 5, 19, 35, 70, 6, 9, - 18, 102, 10, 36, 85, 12, 21, 42, 51, 72, 77, 84, 20, 25, 33, 50, 78, 98, 24, 39, 49, 100, 110 - , 48, 65, 93, 40, 66, 71, 92, 7, 46, 55, 87, 96, 103, 106, 11, 23, 37, 54, 81, 86, 108, 13, - 22, 27, 43, 53, 73, 80, 14, 26, 52, 74, 79, 99, 119, 44, 95, 101, 104, 111, 118, 29, 59, 89, - 94, 117, 28, 41, 58, 67, 88, 115, 116, 47, 57, 83, 97, 107, 114, 127, 56, 82, 109, 113, 126, - 112, 125, 15, 63, 75, 123, 124, 31, 45, 62, 91, 105, 122, 30, 61, 90, 121, 60, 120 }; - -static int cbest_8[255] = { 1, 2, 142, 4, 71, 8, 70, 173, 3, 35, 143, 16, 17, 67, 134, 140, 172, 6, 34 - , 69, 201, 216, 5, 33, 86, 12, 65, 138, 158, 159, 175, 10, 32, 43, 66, 108, 130, 193, 234, 9, - 24, 25, 50, 68, 79, 100, 132, 174, 200, 217, 20, 21, 42, 48, 87, 169, 41, 54, 64, 84, 96, 117 - , 154, 155, 165, 226, 77, 82, 135, 136, 141, 168, 192, 218, 238, 7, 18, 19, 39, 40, 78, 113, - 116, 128, 164, 180, 195, 205, 220, 232, 14, 26, 27, 58, 109, 156, 157, 203, 235, 13, 28, 29, 38 - , 51, 56, 75, 85, 90, 101, 110, 112, 139, 171, 11, 37, 49, 52, 76, 83, 102, 119, 131, 150, 151 - , 167, 182, 184, 188, 197, 219, 224, 45, 55, 80, 94, 97, 133, 170, 194, 204, 221, 227, 236, 36, - 47, 73, 92, 98, 104, 118, 152, 153, 166, 202, 207, 239, 251, 22, 23, 44, 74, 91, 148, 149, 161 - , 181, 190, 233, 46, 59, 88, 137, 146, 147, 163, 196, 208, 212, 222, 250, 57, 81, 95, 106, 111, - 129, 160, 176, 199, 243, 249, 15, 53, 72, 93, 103, 115, 125, 162, 183, 185, 189, 206, 225, 255, - 186, 210, 230, 237, 242, 248, 30, 31, 62, 89, 
99, 105, 114, 121, 124, 178, 209, 213, 223, 228, - 241, 254, 60, 191, 198, 247, 120, 240, 107, 127, 144, 145, 177, 211, 214, 246, 245, 123, 126, - 187, 231, 253, 63, 179, 229, 244, 61, 122, 215, 252 }; - -static int cbest_9[511] = { 1, 2, 264, 4, 132, 8, 66, 16, 33, 32, 280, 64, 140, 128, 3, 70, 265, 5, - 133, 256, 266, 6, 9, 35, 67, 134, 268, 396, 10, 17, 34, 330, 12, 18, 68, 198, 297, 20, 37, 74 - , 136, 148, 165, 281, 296, 24, 36, 41, 65, 82, 99, 164, 272, 282, 388, 40, 49, 98, 141, 194, - 284, 328, 412, 48, 97, 129, 142, 196, 346, 71, 72, 96, 130, 313, 392, 80, 206, 257, 267, 312, - 334, 7, 135, 156, 173, 192, 258, 269, 397, 404, 11, 78, 144, 161, 172, 260, 270, 299, 331, 344, - 398, 13, 19, 39, 69, 86, 103, 160, 167, 199, 202, 298, 322, 384, 14, 21, 38, 43, 75, 102, 137, - 149, 166, 204, 289, 332, 408, 462, 22, 25, 42, 51, 83, 101, 138, 150, 273, 283, 288, 301, 350, - 389, 429, 26, 50, 76, 100, 195, 274, 285, 300, 329, 363, 390, 413, 428, 28, 45, 84, 143, 197, - 200, 214, 231, 276, 286, 315, 320, 347, 362, 414, 458, 44, 53, 73, 90, 107, 131, 152, 169, 181, - 230, 314, 338, 361, 393, 400, 454, 460, 52, 57, 81, 106, 115, 168, 175, 180, 207, 229, 305, 335 - , 348, 360, 394, 421, 478, 56, 105, 114, 157, 163, 174, 193, 210, 227, 228, 259, 304, 317, 326, - 405, 420, 445, 79, 104, 113, 145, 158, 162, 212, 226, 261, 271, 316, 345, 379, 399, 406, 444, - 450, 456, 87, 88, 112, 146, 203, 225, 262, 291, 323, 336, 378, 385, 425, 452, 474, 15, 205, 222 - , 224, 239, 290, 303, 333, 367, 377, 386, 409, 424, 431, 463, 470, 476, 23, 139, 151, 189, 208, - 238, 302, 324, 351, 366, 376, 410, 430, 437, 27, 47, 77, 94, 111, 177, 188, 237, 275, 293, 342, - 365, 391, 436, 448, 29, 46, 55, 85, 110, 119, 171, 176, 183, 201, 215, 218, 235, 236, 277, 287, - 292, 321, 355, 364, 415, 417, 459, 466, 472, 30, 54, 59, 91, 109, 118, 153, 170, 182, 220, 234, - 278, 307, 339, 354, 401, 416, 423, 441, 455, 461, 468, 495, 58, 108, 117, 154, 233, 306, 319, - 349, 353, 383, 395, 402, 422, 440, 447, 479, 494, 92, 116, 211, 232, 318, 327, 340, 352, 382, - 446, 493, 61, 159, 213, 216, 247, 309, 381, 407, 427, 451, 457, 464, 491, 492, 60, 89, 123, 147 - , 185, 246, 263, 308, 337, 371, 380, 426, 433, 453, 475, 487, 490, 122, 184, 191, 223, 245, 370, - 387, 432, 439, 471, 477, 486, 489, 511, 121, 179, 190, 209, 243, 244, 295, 325, 359, 369, 411, - 438, 485, 488, 510, 95, 120, 178, 242, 294, 343, 358, 368, 419, 449, 483, 484, 509, 219, 241, - 357, 418, 443, 467, 473, 482, 507, 508, 31, 221, 240, 255, 279, 356, 442, 469, 481, 503, 506, - 155, 254, 403, 480, 502, 505, 63, 93, 127, 253, 311, 341, 375, 501, 504, 62, 126, 187, 217, 251 - , 252, 310, 374, 435, 465, 499, 500, 125, 186, 250, 373, 434, 498, 124, 249, 372, 497, 248, 496 - }; - -static int cbest_10[1023] = { 1, 2, 516, 4, 258, 8, 129, 16, 32, 580, 64, 128, 290, 145, 256, 3, 512, - 517, 5, 259, 518, 588, 6, 9, 18, 36, 72, 144, 774, 10, 17, 131, 262, 288, 524, 645, 12, 33, - 133, 266, 294, 387, 532, 576, 581, 20, 34, 65, 137, 274, 548, 582, 24, 66, 291, 838, 40, 68, - 130, 147, 161, 322, 644, 709, 806, 48, 132, 193, 257, 386, 596, 80, 136, 298, 419, 612, 661, 772 - , 96, 149, 260, 272, 306, 403, 513, 146, 153, 160, 264, 292, 385, 514, 519, 544, 584, 589, 708, - 870, 7, 19, 37, 73, 192, 354, 590, 770, 775, 11, 38, 74, 177, 263, 289, 418, 520, 525, 534, 641 - , 660, 725, 802, 836, 846, 13, 22, 76, 148, 209, 267, 295, 320, 330, 402, 526, 528, 533, 577, - 647, 717, 804, 14, 21, 26, 35, 44, 135, 152, 165, 201, 275, 304, 384, 401, 435, 549, 578, 583, - 604, 608, 782, 903, 25, 52, 67, 
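-
-/* Editor's illustration, not part of the original source: for m = 2 and
-   k <= cbest_max_k[w], cauchy_good_general_coding_matrix() above builds its
-   matrix directly from these tables -- row 0 is all ones and row 1 is the
-   first k entries of cbest_<w>.  For example, k = 4 and w = 4 yield
-
-       1  1  1  1
-       1  2  9  4
-
-   which matches the first four entries of cbest_4. */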
88, 139, 270, 296, 391, 417, 550, 620, 653, 790, 834, 839, 41, - 50, 69, 104, 141, 176, 278, 302, 323, 395, 423, 540, 598, 640, 705, 724, 807, 866, 28, 42, 49, - 70, 82, 100, 163, 208, 282, 310, 556, 592, 597, 646, 663, 677, 711, 716, 868, 878, 81, 134, 151 - , 164, 195, 200, 299, 326, 352, 362, 400, 434, 564, 613, 657, 768, 773, 902, 967, 97, 138, 155, - 169, 197, 261, 273, 307, 358, 390, 416, 433, 451, 614, 652, 733, 800, 814, 844, 854, 935, 56, 84 - , 98, 140, 181, 217, 265, 293, 328, 338, 394, 422, 515, 545, 585, 704, 788, 822, 871, 919, 162, - 179, 276, 355, 407, 427, 546, 586, 591, 616, 662, 669, 676, 710, 727, 741, 771, 780, 901, 39, 75 - , 150, 157, 194, 211, 225, 268, 280, 308, 314, 389, 411, 439, 521, 530, 535, 628, 656, 721, 803, - 832, 837, 842, 847, 966, 23, 77, 112, 154, 168, 196, 300, 321, 331, 393, 421, 432, 450, 522, 527 - , 529, 552, 606, 643, 673, 693, 713, 732, 805, 864, 874, 934, 999, 15, 27, 45, 54, 78, 90, 108, - 180, 216, 305, 483, 560, 579, 600, 605, 609, 719, 778, 783, 852, 876, 886, 899, 918, 983, 46, 53 - , 89, 167, 178, 185, 203, 213, 271, 297, 324, 334, 336, 360, 370, 406, 426, 467, 542, 551, 610, - 621, 649, 668, 726, 740, 786, 791, 810, 820, 835, 900, 917, 931, 951, 965, 975, 30, 51, 105, 156 - , 205, 210, 224, 279, 303, 356, 366, 388, 405, 410, 438, 449, 459, 536, 541, 594, 599, 622, 655, - 720, 812, 818, 862, 867, 933, 29, 43, 71, 83, 92, 101, 106, 143, 173, 283, 311, 312, 346, 392, - 409, 420, 437, 443, 557, 566, 593, 642, 659, 672, 692, 707, 712, 737, 757, 869, 879, 911, 998, - 60, 102, 241, 327, 353, 363, 399, 425, 482, 558, 565, 624, 679, 718, 735, 749, 769, 798, 898, - 963, 982, 58, 86, 166, 183, 184, 202, 212, 219, 233, 286, 359, 431, 466, 615, 636, 648, 689, 729 - , 801, 815, 840, 845, 850, 855, 884, 916, 930, 950, 964, 974, 981, 995, 1015, 57, 85, 99, 120, - 171, 199, 204, 229, 318, 329, 339, 368, 404, 448, 458, 465, 499, 654, 671, 685, 784, 789, 823, - 872, 882, 915, 932, 949, 997, 1007, 116, 142, 159, 172, 277, 408, 436, 442, 455, 481, 491, 547, - 572, 587, 617, 630, 658, 665, 706, 723, 736, 756, 776, 781, 816, 860, 894, 897, 910, 947, 991, - 114, 221, 240, 269, 281, 309, 315, 332, 342, 344, 378, 398, 424, 441, 475, 487, 531, 618, 629, - 678, 695, 734, 743, 748, 808, 833, 843, 929, 943, 962, 973, 113, 182, 189, 218, 227, 232, 301, - 364, 374, 430, 457, 523, 553, 562, 602, 607, 688, 728, 753, 796, 830, 865, 875, 927, 980, 994, - 1014, 55, 79, 91, 109, 170, 187, 198, 215, 228, 284, 415, 464, 498, 554, 561, 601, 670, 675, 684 - , 715, 745, 765, 779, 848, 853, 877, 887, 909, 914, 948, 979, 996, 1006, 1013, 47, 110, 158, 249 - , 316, 325, 335, 337, 361, 371, 397, 447, 454, 480, 490, 497, 538, 543, 611, 632, 664, 722, 787, - 811, 821, 880, 896, 913, 946, 961, 971, 990, 1011, 31, 94, 220, 245, 357, 367, 429, 440, 474, - 486, 537, 595, 623, 651, 681, 694, 701, 742, 759, 813, 819, 858, 863, 892, 928, 942, 945, 972, - 989, 993, 1003, 1023, 62, 93, 107, 188, 207, 226, 237, 243, 313, 340, 347, 376, 456, 471, 473, - 507, 567, 568, 626, 752, 890, 907, 926, 1005, 61, 103, 124, 175, 186, 214, 372, 414, 453, 463, - 489, 503, 559, 625, 638, 674, 691, 714, 731, 739, 744, 764, 794, 799, 828, 908, 925, 939, 959, - 978, 1012, 59, 87, 122, 248, 287, 350, 396, 413, 446, 485, 495, 496, 637, 751, 826, 841, 851, - 885, 912, 941, 960, 970, 977, 1010, 118, 121, 235, 244, 319, 369, 382, 428, 445, 574, 650, 667, - 680, 700, 758, 761, 785, 873, 883, 944, 988, 992, 1002, 1009, 1022, 117, 206, 223, 231, 236, 242 - , 470, 472, 506, 573, 631, 687, 777, 817, 856, 861, 895, 906, 987, 
1004, 1021, 115, 174, 191, 333 - , 343, 345, 379, 452, 462, 469, 488, 502, 505, 619, 690, 697, 730, 738, 755, 809, 888, 924, 938, - 958, 969, 1019, 253, 365, 375, 412, 484, 494, 501, 563, 603, 750, 767, 792, 797, 831, 923, 940, - 957, 976, 1001, 234, 251, 285, 348, 444, 479, 555, 634, 666, 760, 824, 849, 905, 955, 1008, 111, - 222, 230, 247, 317, 380, 461, 511, 539, 633, 686, 703, 747, 881, 937, 986, 1020, 95, 190, 468, - 493, 504, 570, 696, 754, 859, 893, 968, 985, 1018, 63, 126, 252, 341, 377, 500, 569, 627, 683, - 766, 891, 922, 956, 1000, 1017, 125, 239, 250, 373, 478, 639, 795, 829, 904, 921, 954, 123, 246, - 351, 460, 477, 510, 702, 746, 763, 827, 936, 953, 119, 383, 492, 509, 575, 984, 682, 699, 857, - 1016, 238, 255, 889, 920, 476, 762, 793, 952, 349, 508, 635, 825, 381, 698, 254, 571, 127 }; - -static int cbest_11[1023] = { 1, - 2, 1026, 4, 513, 8, 16, 1282, 32, 64, 641, 128, 256, 512, 1346, 1024, 3, 673, 1027, 5, 10, 20, 40, 80, 160, 320, - 640, 6, 9, 515, 1030, 1280, 1539, 17, 517, 1034, 1283, 12, 18, 33, 521, 1042, 1362, 34, 65, 529, 1058, 1286, 1795, - 24, 36, 66, 129, 545, 643, 1090, 1290, 1667, 68, 130, 257, 577, 645, 672, 1154, 1298, 1344, 48, 72, 132, 258, 336, - 649, 681, 1314, 1347, 136, 168, 260, 514, 657, 769, 1538, 1923, 84, 96, 144, 264, 516, 1025, 1350, 1410, 1859, 42, - 272, 520, 705, 1032, 1354, 11, 21, 41, 81, 161, 192, 288, 321, 528, 675, 1028, 1537, 1699, 1794, 7, 22, 82, 162, - 322, 544, 642, 677, 897, 1031, 1046, 1066, 1106, 1186, 1281, 1366, 1378, 1666, 14, 44, 164, 324, 384, 523, 533, - 553, 576, 593, 644, 833, 1035, 1040, 1288, 1360, 1987, 13, 19, 28, 88, 328, 519, 648, 680, 689, 1043, 1056, 1284, - 1363, 1474, 1543, 1793, 1955, 26, 35, 56, 176, 656, 768, 1038, 1059, 1088, 1287, 1302, 1322, 1442, 1547, 1665, - 1922, 25, 37, 52, 67, 112, 340, 352, 525, 531, 737, 1091, 1152, 1291, 1296, 1555, 1858, 1875, 38, 69, 74, 104, 131, - 224, 547, 651, 661, 683, 704, 721, 961, 1050, 1062, 1155, 1299, 1312, 1345, 1370, 1571, 1799, 49, 70, 73, 133, 138, - 148, 170, 208, 259, 337, 448, 537, 549, 579, 647, 674, 929, 1094, 1294, 1315, 1352, 1536, 1603, 1671, 1698, 1803, - 1921, 50, 134, 137, 169, 261, 266, 276, 296, 338, 416, 581, 676, 896, 1074, 1098, 1158, 1348, 1394, 1408, 1675, - 1707, 1811, 1857, 2019, 76, 85, 97, 145, 262, 265, 522, 532, 552, 561, 585, 592, 653, 659, 685, 771, 832, 849, - 1064, 1162, 1194, 1306, 1318, 1351, 1386, 1411, 1506, 1683, 1827, 1986, 2003, 43, 86, 98, 140, 146, 172, 273, 344, - 518, 688, 773, 1033, 1110, 1122, 1170, 1355, 1490, 1542, 1697, 1792, 1927, 1954, 100, 193, 268, 274, 289, 597, 609, - 665, 697, 707, 777, 1029, 1044, 1104, 1184, 1330, 1364, 1376, 1414, 1546, 1664, 1731, 1863, 1931, 1963, 23, 46, 83, - 92, 152, 163, 184, 194, 290, 323, 368, 524, 530, 555, 693, 709, 736, 753, 785, 993, 1036, 1047, 1067, 1107, 1187, - 1218, 1320, 1358, 1367, 1379, 1418, 1450, 1545, 1554, 1867, 1874, 1939, 1985, 15, 30, 45, 60, 90, 120, 165, 180, - 196, 240, 280, 292, 325, 330, 360, 385, 480, 546, 650, 660, 679, 682, 713, 720, 745, 801, 899, 960, 977, 1041, - 1289, 1361, 1426, 1472, 1541, 1570, 1703, 1798, 1953, 29, 58, 89, 116, 166, 200, 232, 326, 329, 386, 464, 535, 536, - 548, 578, 595, 646, 835, 901, 928, 1048, 1057, 1070, 1190, 1285, 1300, 1368, 1382, 1440, 1475, 1559, 1579, 1602, - 1619, 1670, 1802, 1879, 1891, 1920, 27, 57, 177, 304, 388, 527, 557, 580, 691, 725, 837, 905, 937, 1039, 1054, - 1089, 1114, 1292, 1303, 1323, 1374, 1443, 1553, 1674, 1706, 1715, 1801, 1810, 1856, 1873, 1991, 2018, 2035, 53, - 106, 113, 178, 212, 332, 341, 353, 392, 424, 
541, 560, 584, 601, 652, 658, 684, 770, 841, 848, 913, 1060, 1082, - 1096, 1153, 1202, 1297, 1402, 1478, 1522, 1569, 1673, 1682, 1705, 1797, 1826, 1959, 1995, 2002, 2027, 39, 54, 75, - 105, 114, 225, 342, 354, 400, 539, 569, 739, 772, 1051, 1063, 1078, 1092, 1138, 1160, 1192, 1304, 1313, 1326, 1371, - 1384, 1398, 1446, 1482, 1514, 1551, 1601, 1669, 1696, 1763, 1815, 1835, 1926, 71, 139, 149, 171, 209, 226, 298, - 356, 449, 565, 596, 608, 625, 663, 664, 696, 706, 723, 741, 776, 853, 865, 963, 1072, 1095, 1130, 1156, 1250, 1295, - 1310, 1353, 1392, 1687, 1730, 1747, 1809, 1862, 1930, 1962, 1971, 2007, 2017, 51, 78, 108, 135, 150, 210, 228, 267, - 277, 297, 339, 348, 417, 450, 551, 554, 587, 617, 655, 687, 692, 708, 752, 784, 931, 965, 992, 1009, 1075, 1099, - 1159, 1174, 1234, 1316, 1338, 1349, 1395, 1409, 1458, 1494, 1504, 1544, 1563, 1575, 1681, 1825, 1866, 1883, 1929, - 1938, 1961, 1984, 2001, 77, 142, 174, 263, 278, 346, 376, 418, 452, 496, 583, 669, 678, 701, 712, 729, 744, 761, - 800, 898, 933, 969, 976, 1001, 1065, 1108, 1120, 1163, 1168, 1195, 1307, 1319, 1334, 1356, 1387, 1416, 1448, 1488, - 1507, 1540, 1607, 1702, 1807, 1865, 1925, 1952, 87, 99, 141, 147, 156, 173, 188, 216, 248, 270, 300, 345, 372, 420, - 456, 488, 534, 563, 594, 667, 699, 757, 779, 789, 809, 834, 851, 900, 1102, 1111, 1123, 1171, 1328, 1412, 1491, - 1558, 1578, 1587, 1611, 1618, 1679, 1711, 1729, 1861, 1878, 1890, 1907, 1943, 2023, 94, 101, 124, 154, 186, 244, - 269, 275, 284, 526, 556, 589, 690, 724, 775, 836, 904, 936, 945, 981, 1045, 1068, 1105, 1166, 1185, 1198, 1216, - 1331, 1365, 1377, 1390, 1415, 1430, 1510, 1552, 1577, 1714, 1800, 1819, 1831, 1872, 1899, 1937, 1990, 2034, 47, 62, - 93, 102, 122, 153, 185, 195, 282, 291, 312, 362, 369, 432, 468, 540, 599, 600, 611, 715, 747, 840, 857, 912, 1037, - 1052, 1112, 1126, 1219, 1321, 1359, 1372, 1419, 1424, 1451, 1568, 1623, 1635, 1672, 1691, 1701, 1704, 1723, 1796, - 1958, 1994, 2011, 2026, 2043, 31, 61, 91, 121, 181, 197, 202, 234, 241, 281, 293, 308, 331, 361, 370, 481, 538, - 568, 613, 695, 711, 738, 755, 781, 787, 995, 1080, 1118, 1178, 1188, 1210, 1380, 1400, 1427, 1473, 1498, 1530, - 1550, 1557, 1600, 1617, 1668, 1719, 1735, 1762, 1779, 1814, 1834, 1843, 1877, 1889, 1935, 1967, 1993, 2025, 2039, - 59, 117, 167, 182, 198, 201, 233, 242, 294, 327, 387, 465, 482, 559, 564, 605, 624, 662, 722, 740, 803, 852, 864, - 881, 907, 917, 939, 962, 979, 997, 1049, 1071, 1086, 1146, 1191, 1206, 1222, 1266, 1301, 1324, 1369, 1383, 1406, - 1422, 1441, 1454, 1480, 1512, 1526, 1549, 1686, 1713, 1739, 1746, 1771, 1808, 1833, 1871, 1970, 1989, 2006, 2016, - 2033, 118, 305, 334, 364, 389, 394, 404, 426, 466, 484, 543, 550, 573, 586, 603, 616, 633, 654, 686, 717, 749, 793, - 805, 843, 873, 903, 930, 964, 1008, 1055, 1115, 1128, 1142, 1200, 1226, 1258, 1293, 1308, 1375, 1476, 1520, 1562, - 1574, 1680, 1824 }; - diff --git a/src/erasure-code/jerasure/jerasure/src/galois.c b/src/erasure-code/jerasure/jerasure/src/galois.c deleted file mode 100644 index 398a64944f797..0000000000000 --- a/src/erasure-code/jerasure/jerasure/src/galois.c +++ /dev/null @@ -1,353 +0,0 @@ -/* * - * Copyright (c) 2014, James S. Plank and Kevin Greenan - * All rights reserved. 
- * - * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure - * Coding Techniques - * - * Revision 2.0: Galois Field backend now links to GF-Complete - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * - Neither the name of the University of Tennessee nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS - * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY - * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/* Jerasure's authors: - - Revision 2.x - 2014: James S. Plank and Kevin M. Greenan - Revision 1.2 - 2008: James S. Plank, Scott Simmerman and Catherine D. Schuman. - Revision 1.0 - 2007: James S. 
Plank
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "galois.h"
-
-#define MAX_GF_INSTANCES 64
-gf_t *gfp_array[MAX_GF_INSTANCES] = { 0 };
-int gfp_is_composite[MAX_GF_INSTANCES] = { 0 };
-
-gf_t *galois_get_field_ptr(int w)
-{
-  if (gfp_array[w] != NULL) {
-    return gfp_array[w];
-  }
-
-  return NULL;
-}
-
-gf_t* galois_init_field(int w,
-                        int mult_type,
-                        int region_type,
-                        int divide_type,
-                        uint64_t prim_poly,
-                        int arg1,
-                        int arg2)
-{
-  int scratch_size;
-  void *scratch_memory;
-  gf_t *gfp;
-
-  if (w <= 0 || w > 32) {
-    fprintf(stderr, "ERROR -- cannot init default Galois field for w=%d\n", w);
-    exit(1);
-  }
-
-  gfp = (gf_t *) malloc(sizeof(gf_t));
-  if (!gfp) {
-    fprintf(stderr, "ERROR -- cannot allocate memory for Galois field w=%d\n", w);
-    exit(1);
-  }
-
-  scratch_size = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
-  if (!scratch_size) {
-    fprintf(stderr, "ERROR -- cannot get scratch size for base field w=%d\n", w);
-    exit(1);
-  }
-
-  scratch_memory = malloc(scratch_size);
-  if (!scratch_memory) {
-    fprintf(stderr, "ERROR -- cannot get scratch memory for base field w=%d\n", w);
-    exit(1);
-  }
-
-  if(!gf_init_hard(gfp,
-                   w,
-                   mult_type,
-                   region_type,
-                   divide_type,
-                   prim_poly,
-                   arg1,
-                   arg2,
-                   NULL,
-                   scratch_memory))
-  {
-    fprintf(stderr, "ERROR -- cannot init default Galois field for w=%d\n", w);
-    exit(1);
-  }
-
-  gfp_is_composite[w] = 0;
-  return gfp;
-}
-
-gf_t* galois_init_composite_field(int w,
-                                  int region_type,
-                                  int divide_type,
-                                  int degree,
-                                  gf_t* base_gf)
-{
-  int scratch_size;
-  void *scratch_memory;
-  gf_t *gfp;
-
-  if (w <= 0 || w > 32) {
-    fprintf(stderr, "ERROR -- cannot init composite field for w=%d\n", w);
-    exit(1);
-  }
-
-  gfp = (gf_t *) malloc(sizeof(gf_t));
-  if (!gfp) {
-    fprintf(stderr, "ERROR -- cannot allocate memory for Galois field w=%d\n", w);
-    exit(1);
-  }
-
-  scratch_size = gf_scratch_size(w, GF_MULT_COMPOSITE, region_type, divide_type, degree, 0);
-  if (!scratch_size) {
-    fprintf(stderr, "ERROR -- cannot get scratch size for composite field w=%d\n", w);
-    exit(1);
-  }
-
-  scratch_memory = malloc(scratch_size);
-  if (!scratch_memory) {
-    fprintf(stderr, "ERROR -- cannot get scratch memory for composite field w=%d\n", w);
-    exit(1);
-  }
-
-  if(!gf_init_hard(gfp,
-                   w,
-                   GF_MULT_COMPOSITE,
-                   region_type,
-                   divide_type,
-                   0,
-                   degree,
-                   0,
-                   base_gf,
-                   scratch_memory))
-  {
-    fprintf(stderr, "ERROR -- cannot init default composite field for w=%d\n", w);
-    exit(1);
-  }
-  gfp_is_composite[w] = 1;
-  return gfp;
-}
-
-static void galois_init_default_field(int w)
-{
-  if (w <= 0 || w > 32) {
-    fprintf(stderr, "ERROR -- cannot init default Galois field for w=%d\n", w);
-    exit(1);
-  }
-
-  if (gfp_array[w] == NULL) {
-    gfp_array[w] = (gf_t*)malloc(sizeof(gf_t));
-    if (gfp_array[w] == NULL) {
-      fprintf(stderr, "ERROR -- cannot allocate memory for Galois field w=%d\n", w);
-      exit(1);
-    }
-  }
-
-  if (!gf_init_easy(gfp_array[w], w)) {
-    fprintf(stderr, "ERROR -- cannot init default Galois field for w=%d\n", w);
-    exit(1);
-  }
-}
-
-
-static int is_valid_gf(gf_t *gf, int w)
-{
-  // TODO: I assume we may eventually
-  // want to do w=64 and 128, so w
-  // will be needed to perform this check
-  (void)w;
-
-  if (gf == NULL) {
-    return 0;
-  }
-  if (gf->multiply.w32 == NULL) {
-    return 0;
-  }
-  if (gf->multiply_region.w32 == NULL) {
-    return 0;
-  }
-  if (gf->divide.w32 == NULL) {
-    return 0;
-  }
-  if (gf->inverse.w32 == NULL) {
-    return 0;
-  }
-  if (gf->extract_word.w32 == NULL) {
-    return 0;
-  }
-
-  return 1;
-}
-
-void
galois_change_technique(gf_t *gf, int w)
-{
-  if (w <= 0 || w > 32) {
-    fprintf(stderr, "ERROR -- cannot support Galois field for w=%d\n", w);
-    exit(1);
-  }
-
-  if (!is_valid_gf(gf, w)) {
-    fprintf(stderr, "ERROR -- overriding with invalid Galois field for w=%d\n", w);
-    exit(1);
-  }
-
-  if (gfp_array[w] != NULL) {
-    gf_free(gfp_array[w], gfp_is_composite[w]);
-  }
-
-  gfp_array[w] = gf;
-}
-
-int galois_single_multiply(int x, int y, int w)
-{
-  if (x == 0 || y == 0) return 0;
-
-  if (gfp_array[w] == NULL) {
-    galois_init_default_field(w);
-  }
-
-  if (w <= 32) {
-    return gfp_array[w]->multiply.w32(gfp_array[w], x, y);
-  } else {
-    fprintf(stderr, "ERROR -- Galois field not implemented for w=%d\n", w);
-    return 0;
-  }
-}
-
-int galois_single_divide(int x, int y, int w)
-{
-  if (x == 0) return 0;
-  if (y == 0) return -1;
-
-  if (gfp_array[w] == NULL) {
-    galois_init_default_field(w);
-  }
-
-  if (w <= 32) {
-    return gfp_array[w]->divide.w32(gfp_array[w], x, y);
-  } else {
-    fprintf(stderr, "ERROR -- Galois field not implemented for w=%d\n", w);
-    return 0;
-  }
-}
-
-void galois_w08_region_multiply(char *region,  /* Region to multiply */
-                                int multby,    /* Number to multiply by */
-                                int nbytes,    /* Number of bytes in region */
-                                char *r2,      /* If r2 != NULL, products go here */
-                                int add)
-{
-  if (gfp_array[8] == NULL) {
-    galois_init_default_field(8);
-  }
-  gfp_array[8]->multiply_region.w32(gfp_array[8], region, r2, multby, nbytes, add);
-}
-
-void galois_w16_region_multiply(char *region,  /* Region to multiply */
-                                int multby,    /* Number to multiply by */
-                                int nbytes,    /* Number of bytes in region */
-                                char *r2,      /* If r2 != NULL, products go here */
-                                int add)
-{
-  if (gfp_array[16] == NULL) {
-    galois_init_default_field(16);
-  }
-  gfp_array[16]->multiply_region.w32(gfp_array[16], region, r2, multby, nbytes, add);
-}
-
-
-void galois_w32_region_multiply(char *region,  /* Region to multiply */
-                                int multby,    /* Number to multiply by */
-                                int nbytes,    /* Number of bytes in region */
-                                char *r2,      /* If r2 != NULL, products go here */
-                                int add)
-{
-  if (gfp_array[32] == NULL) {
-    galois_init_default_field(32);
-  }
-  gfp_array[32]->multiply_region.w32(gfp_array[32], region, r2, multby, nbytes, add);
-}
-
-void galois_w8_region_xor(void *src, void *dest, int nbytes)
-{
-  if (gfp_array[8] == NULL) {
-    galois_init_default_field(8);
-  }
-  gfp_array[8]->multiply_region.w32(gfp_array[8], src, dest, 1, nbytes, 1);
-}
-
-void galois_w16_region_xor(void *src, void *dest, int nbytes)
-{
-  if (gfp_array[16] == NULL) {
-    galois_init_default_field(16);
-  }
-  gfp_array[16]->multiply_region.w32(gfp_array[16], src, dest, 1, nbytes, 1);
-}
-
-void galois_w32_region_xor(void *src, void *dest, int nbytes)
-{
-  if (gfp_array[32] == NULL) {
-    galois_init_default_field(32);
-  }
-  gfp_array[32]->multiply_region.w32(gfp_array[32], src, dest, 1, nbytes, 1);
-}
-
-void galois_region_xor(char *src, char *dest, int nbytes)
-{
-  if (nbytes >= 16) {
-    galois_w32_region_xor(src, dest, nbytes);
-  } else {
-    int i = 0;
-    for (i = 0; i < nbytes; i++) {
-      *dest ^= *src;
-      dest++;
-      src++;
-    }
-  }
-}
-
-int galois_inverse(int y, int w)
-{
-  if (y == 0) return -1;
-  return galois_single_divide(1, y, w);
-}
diff --git a/src/erasure-code/jerasure/jerasure/src/jerasure.c b/src/erasure-code/jerasure/jerasure/src/jerasure.c
deleted file mode 100644
index 571b156e72643..0000000000000
--- a/src/erasure-code/jerasure/jerasure/src/jerasure.c
+++ /dev/null
@@ -1,1387 +0,0 @@
-/* *
- * Copyright (c) 2014, James S.
Plank and Kevin Greenan - * All rights reserved. - * - * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure - * Coding Techniques - * - * Revision 2.0: Galois Field backend now links to GF-Complete - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * - Neither the name of the University of Tennessee nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS - * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY - * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/* Jerasure's authors: - - Revision 2.x - 2014: James S. Plank and Kevin M. Greenan - Revision 1.2 - 2008: James S. Plank, Scott Simmerman and Catherine D. Schuman. - Revision 1.0 - 2007: James S. 
Plank
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "galois.h"
-#include "jerasure.h"
-
-#define talloc(type, num) (type *) malloc(sizeof(type)*(num))
-
-static double jerasure_total_xor_bytes = 0;
-static double jerasure_total_gf_bytes = 0;
-static double jerasure_total_memcpy_bytes = 0;
-
-void jerasure_print_matrix(int *m, int rows, int cols, int w)
-{
-  int i, j;
-  int fw;
-  char s[30];
-  unsigned int w2;
-
-  if (w == 32) {
-    fw = 10;
-  } else {
-    w2 = (1 << w);
-    sprintf(s, "%u", w2-1);
-    fw = strlen(s);
-  }
-
-  for (i = 0; i < rows; i++) {
-    for (j = 0; j < cols; j++) {
-      if (j != 0) printf(" ");
-      printf("%*u", fw, m[i*cols+j]);
-    }
-    printf("\n");
-  }
-}
-
-void jerasure_print_bitmatrix(int *m, int rows, int cols, int w)
-{
-  int i, j;
-
-  for (i = 0; i < rows; i++) {
-    if (i != 0 && i%w == 0) printf("\n");
-    for (j = 0; j < cols; j++) {
-      if (j != 0 && j%w == 0) printf(" ");
-      printf("%d", m[i*cols+j]);
-    }
-    printf("\n");
-  }
-}
-
-int jerasure_make_decoding_matrix(int k, int m, int w, int *matrix, int *erased, int *decoding_matrix, int *dm_ids)
-{
-  int i, j, *tmpmat;
-
-  j = 0;
-  for (i = 0; j < k; i++) {
-    if (erased[i] == 0) {
-      dm_ids[j] = i;
-      j++;
-    }
-  }
-
-  tmpmat = talloc(int, k*k);
-  if (tmpmat == NULL) { return -1; }
-  for (i = 0; i < k; i++) {
-    if (dm_ids[i] < k) {
-      for (j = 0; j < k; j++) tmpmat[i*k+j] = 0;
-      tmpmat[i*k+dm_ids[i]] = 1;
-    } else {
-      for (j = 0; j < k; j++) {
-        tmpmat[i*k+j] = matrix[(dm_ids[i]-k)*k+j];
-      }
-    }
-  }
-
-  i = jerasure_invert_matrix(tmpmat, decoding_matrix, k, w);
-  free(tmpmat);
-  return i;
-}
-
-/* Internal Routine */
-int jerasure_make_decoding_bitmatrix(int k, int m, int w, int *matrix, int *erased, int *decoding_matrix, int *dm_ids)
-{
-  int i, j, *tmpmat;
-  int index, mindex;
-
-  j = 0;
-  for (i = 0; j < k; i++) {
-    if (erased[i] == 0) {
-      dm_ids[j] = i;
-      j++;
-    }
-  }
-
-  tmpmat = talloc(int, k*k*w*w);
-  if (tmpmat == NULL) { return -1; }
-  for (i = 0; i < k; i++) {
-    if (dm_ids[i] < k) {
-      index = i*k*w*w;
-      for (j = 0; j < k*w*w; j++) tmpmat[index+j] = 0;
-      index = i*k*w*w+dm_ids[i]*w;
-      for (j = 0; j < w; j++) {
-        tmpmat[index] = 1;
-        index += (k*w+1);
-      }
-    } else {
-      index = i*k*w*w;
-      mindex = (dm_ids[i]-k)*k*w*w;
-      for (j = 0; j < k*w*w; j++) {
-        tmpmat[index+j] = matrix[mindex+j];
-      }
-    }
-  }
-
-  i = jerasure_invert_bitmatrix(tmpmat, decoding_matrix, k*w);
-  free(tmpmat);
-  return i;
-}
-
-int jerasure_matrix_decode(int k, int m, int w, int *matrix, int row_k_ones, int *erasures,
-                           char **data_ptrs, char **coding_ptrs, int size)
-{
-  int i, edd, lastdrive;
-  int *tmpids;
-  int *erased, *decoding_matrix, *dm_ids;
-
-  if (w != 8 && w != 16 && w != 32) return -1;
-
-  erased = jerasure_erasures_to_erased(k, m, erasures);
-  if (erased == NULL) return -1;
-
-  /* Find the number of data drives failed */
-
-  lastdrive = k;
-
-  edd = 0;
-  for (i = 0; i < k; i++) {
-    if (erased[i]) {
-      edd++;
-      lastdrive = i;
-    }
-  }
-
-  /* You only need to create the decoding matrix in the following cases:
-
-      1. edd > 0 and row_k_ones is false.
-      2. edd > 0 and row_k_ones is true and coding device 0 has been erased.
-      3. edd > 1.
-
-     We're going to use lastdrive to denote when to stop decoding data.
-     At this point in the code, it is equal to the last erased data device.
-     However, if we can't use the parity row to decode it (i.e. row_k_ones=0
-     or erased[k] = 1), we're going to set it to k so that the decoding
-     pass will decode all data.
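-
-     A worked example (an editor's illustration, not in the original
-     comment): with k = 4, m = 2, row_k_ones = 1 and erasures = { 1, 3, -1 },
-     edd is 2 and lastdrive stays 3, since coding device 0 is intact.
-     Device 1 is rebuilt with the decoding matrix, and device 3 is then
-     rebuilt more cheaply from devices 0, 1, 2 and the all-ones parity row.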
-  */
-
-  if (!row_k_ones || erased[k]) lastdrive = k;
-
-  dm_ids = NULL;
-  decoding_matrix = NULL;
-
-  if (edd > 1 || (edd > 0 && (!row_k_ones || erased[k]))) {
-    dm_ids = talloc(int, k);
-    if (dm_ids == NULL) {
-      free(erased);
-      return -1;
-    }
-
-    decoding_matrix = talloc(int, k*k);
-    if (decoding_matrix == NULL) {
-      free(erased);
-      free(dm_ids);
-      return -1;
-    }
-
-    if (jerasure_make_decoding_matrix(k, m, w, matrix, erased, decoding_matrix, dm_ids) < 0) {
-      free(erased);
-      free(dm_ids);
-      free(decoding_matrix);
-      return -1;
-    }
-  }
-
-  /* Decode the data drives.
-     If row_k_ones is true and coding device 0 is intact, then only decode edd-1 drives.
-     This is done by stopping at lastdrive.
-     We test whether edd > 0 so that we can exit the loop early if we're done.
-   */
-
-  for (i = 0; edd > 0 && i < lastdrive; i++) {
-    if (erased[i]) {
-      jerasure_matrix_dotprod(k, w, decoding_matrix+(i*k), dm_ids, i, data_ptrs, coding_ptrs, size);
-      edd--;
-    }
-  }
-
-  /* Then if necessary, decode drive lastdrive */
-
-  if (edd > 0) {
-    tmpids = talloc(int, k);
-    for (i = 0; i < k; i++) {
-      tmpids[i] = (i < lastdrive) ? i : i+1;
-    }
-    jerasure_matrix_dotprod(k, w, matrix, tmpids, lastdrive, data_ptrs, coding_ptrs, size);
-    free(tmpids);
-  }
-
-  /* Finally, re-encode any erased coding devices */
-
-  for (i = 0; i < m; i++) {
-    if (erased[k+i]) {
-      jerasure_matrix_dotprod(k, w, matrix+(i*k), NULL, i+k, data_ptrs, coding_ptrs, size);
-    }
-  }
-
-  free(erased);
-  if (dm_ids != NULL) free(dm_ids);
-  if (decoding_matrix != NULL) free(decoding_matrix);
-
-  return 0;
-}
-
-
-int *jerasure_matrix_to_bitmatrix(int k, int m, int w, int *matrix)
-{
-  int *bitmatrix;
-  int rowelts, rowindex, colindex, elt, i, j, l, x;
-
-  bitmatrix = talloc(int, k*m*w*w);
-  if (bitmatrix == NULL) { return NULL; }
-
-  rowelts = k * w;
-  rowindex = 0;
-
-  for (i = 0; i < m; i++) {
-    colindex = rowindex;
-    for (j = 0; j < k; j++) {
-      elt = matrix[i*k+j];
-      for (x = 0; x < w; x++) {
-        for (l = 0; l < w; l++) {
-          bitmatrix[colindex+x+l*rowelts] = ((elt & (1 << l)) ? 1 : 0);
-        }
-        elt = galois_single_multiply(elt, 2, w);
-      }
-      colindex += w;
-    }
-    rowindex += rowelts * w;
-  }
-  return bitmatrix;
-}
-
-void jerasure_matrix_encode(int k, int m, int w, int *matrix,
-                            char **data_ptrs, char **coding_ptrs, int size)
-{
-  int i;
-
-  if (w != 8 && w != 16 && w != 32) {
-    fprintf(stderr, "ERROR: jerasure_matrix_encode() and w is not 8, 16 or 32\n");
-    exit(1);
-  }
-
-  for (i = 0; i < m; i++) {
-    jerasure_matrix_dotprod(k, w, matrix+(i*k), NULL, k+i, data_ptrs, coding_ptrs, size);
-  }
-}
-
-void jerasure_bitmatrix_dotprod(int k, int w, int *bitmatrix_row,
-                                int *src_ids, int dest_id,
-                                char **data_ptrs, char **coding_ptrs, int size, int packetsize)
-{
-  int j, sindex, pstarted, index, x, y;
-  char *dptr, *pptr, *bdptr, *bpptr;
-
-  if (size%(w*packetsize) != 0) {
-    fprintf(stderr, "jerasure_bitmatrix_dotprod - size%c(w*packetsize) must = 0\n", '%');
-    exit(1);
-  }
-
-  bpptr = (dest_id < k) ?
data_ptrs[dest_id] : coding_ptrs[dest_id-k];
-
-  for (sindex = 0; sindex < size; sindex += (packetsize*w)) {
-    index = 0;
-    for (j = 0; j < w; j++) {
-      pstarted = 0;
-      pptr = bpptr + sindex + j*packetsize;
-      for (x = 0; x < k; x++) {
-        if (src_ids == NULL) {
-          bdptr = data_ptrs[x];
-        } else if (src_ids[x] < k) {
-          bdptr = data_ptrs[src_ids[x]];
-        } else {
-          bdptr = coding_ptrs[src_ids[x]-k];
-        }
-        for (y = 0; y < w; y++) {
-          if (bitmatrix_row[index]) {
-            dptr = bdptr + sindex + y*packetsize;
-            if (!pstarted) {
-              memcpy(pptr, dptr, packetsize);
-              jerasure_total_memcpy_bytes += packetsize;
-              pstarted = 1;
-            } else {
-              galois_region_xor(dptr, pptr, packetsize);
-              jerasure_total_xor_bytes += packetsize;
-            }
-          }
-          index++;
-        }
-      }
-    }
-  }
-}
-
-void jerasure_do_parity(int k, char **data_ptrs, char *parity_ptr, int size)
-{
-  int i;
-
-  memcpy(parity_ptr, data_ptrs[0], size);
-  jerasure_total_memcpy_bytes += size;
-
-  for (i = 1; i < k; i++) {
-    galois_region_xor(data_ptrs[i], parity_ptr, size);
-    jerasure_total_xor_bytes += size;
-  }
-}
-
-int jerasure_invert_matrix(int *mat, int *inv, int rows, int w)
-{
-  int cols, i, j, k, x, rs2;
-  int row_start, tmp, inverse;
-
-  cols = rows;
-
-  k = 0;
-  for (i = 0; i < rows; i++) {
-    for (j = 0; j < cols; j++) {
-      inv[k] = (i == j) ? 1 : 0;
-      k++;
-    }
-  }
-
-  /* First -- convert into upper triangular */
-  for (i = 0; i < cols; i++) {
-    row_start = cols*i;
-
-    /* Swap rows if we have a zero i,i element.  If we can't swap, then the
-       matrix was not invertible */
-
-    if (mat[row_start+i] == 0) {
-      for (j = i+1; j < rows && mat[cols*j+i] == 0; j++) ;
-      if (j == rows) return -1;
-      rs2 = j*cols;
-      for (k = 0; k < cols; k++) {
-        tmp = mat[row_start+k];
-        mat[row_start+k] = mat[rs2+k];
-        mat[rs2+k] = tmp;
-        tmp = inv[row_start+k];
-        inv[row_start+k] = inv[rs2+k];
-        inv[rs2+k] = tmp;
-      }
-    }
-
-    /* Multiply the row by 1/element i,i */
-    tmp = mat[row_start+i];
-    if (tmp != 1) {
-      inverse = galois_single_divide(1, tmp, w);
-      for (j = 0; j < cols; j++) {
-        mat[row_start+j] = galois_single_multiply(mat[row_start+j], inverse, w);
-        inv[row_start+j] = galois_single_multiply(inv[row_start+j], inverse, w);
-      }
-    }
-
-    /* Now for each j>i, add A_ji*Ai to Aj */
-    k = row_start+i;
-    for (j = i+1; j != cols; j++) {
-      k += cols;
-      if (mat[k] != 0) {
-        if (mat[k] == 1) {
-          rs2 = cols*j;
-          for (x = 0; x < cols; x++) {
-            mat[rs2+x] ^= mat[row_start+x];
-            inv[rs2+x] ^= inv[row_start+x];
-          }
-        } else {
-          tmp = mat[k];
-          rs2 = cols*j;
-          for (x = 0; x < cols; x++) {
-            mat[rs2+x] ^= galois_single_multiply(tmp, mat[row_start+x], w);
-            inv[rs2+x] ^= galois_single_multiply(tmp, inv[row_start+x], w);
-          }
-        }
-      }
-    }
-  }
-
-  /* Now the matrix is upper triangular.  Start at the top and multiply down */
-
-  for (i = rows-1; i >= 0; i--) {
-    row_start = i*cols;
-    for (j = 0; j < i; j++) {
-      rs2 = j*cols;
-      if (mat[rs2+i] != 0) {
-        tmp = mat[rs2+i];
-        mat[rs2+i] = 0;
-        for (k = 0; k < cols; k++) {
-          inv[rs2+k] ^= galois_single_multiply(tmp, inv[row_start+k], w);
-        }
-      }
-    }
-  }
-  return 0;
-}
-
-int jerasure_invertible_matrix(int *mat, int rows, int w)
-{
-  int cols, i, j, k, x, rs2;
-  int row_start, tmp, inverse;
-
-  cols = rows;
-
-  /* First -- convert into upper triangular */
-  for (i = 0; i < cols; i++) {
-    row_start = cols*i;
-
-    /* Swap rows if we have a zero i,i element.
If we can't swap, then the - matrix was not invertible */ - - if (mat[row_start+i] == 0) { - for (j = i+1; j < rows && mat[cols*j+i] == 0; j++) ; - if (j == rows) return 0; - rs2 = j*cols; - for (k = 0; k < cols; k++) { - tmp = mat[row_start+k]; - mat[row_start+k] = mat[rs2+k]; - mat[rs2+k] = tmp; - } - } - - /* Multiply the row by 1/element i,i */ - tmp = mat[row_start+i]; - if (tmp != 1) { - inverse = galois_single_divide(1, tmp, w); - for (j = 0; j < cols; j++) { - mat[row_start+j] = galois_single_multiply(mat[row_start+j], inverse, w); - } - } - - /* Now for each j>i, add A_ji*Ai to Aj */ - k = row_start+i; - for (j = i+1; j != cols; j++) { - k += cols; - if (mat[k] != 0) { - if (mat[k] == 1) { - rs2 = cols*j; - for (x = 0; x < cols; x++) { - mat[rs2+x] ^= mat[row_start+x]; - } - } else { - tmp = mat[k]; - rs2 = cols*j; - for (x = 0; x < cols; x++) { - mat[rs2+x] ^= galois_single_multiply(tmp, mat[row_start+x], w); - } - } - } - } - } - return 1; -} - -/* Converts a list-style version of the erasures into an array of k+m elements - where the element = 1 if the index has been erased, and zero otherwise */ - -int *jerasure_erasures_to_erased(int k, int m, int *erasures) -{ - int td; - int t_non_erased; - int *erased; - int i; - - td = k+m; - erased = talloc(int, td); - if (erased == NULL) return NULL; - t_non_erased = td; - - for (i = 0; i < td; i++) erased[i] = 0; - - for (i = 0; erasures[i] != -1; i++) { - if (erased[erasures[i]] == 0) { - erased[erasures[i]] = 1; - t_non_erased--; - if (t_non_erased < k) { - free(erased); - return NULL; - } - } - } - return erased; -} - -void jerasure_free_schedule(int **schedule) -{ - int i; - - for (i = 0; schedule[i][0] >= 0; i++) free(schedule[i]); - free(schedule[i]); - free(schedule); -} - -void jerasure_free_schedule_cache(int k, int m, int ***cache) -{ - int e1, e2; - - if (m != 2) { - fprintf(stderr, "jerasure_free_schedule_cache(): m must equal 2\n"); - exit(1); - } - - for (e1 = 0; e1 < k+m; e1++) { - for (e2 = 0; e2 < e1; e2++) { - jerasure_free_schedule(cache[e1*(k+m)+e2]); - } - jerasure_free_schedule(cache[e1*(k+m)+e1]); - } - free(cache); -} - -void jerasure_matrix_dotprod(int k, int w, int *matrix_row, - int *src_ids, int dest_id, - char **data_ptrs, char **coding_ptrs, int size) -{ - int init; - char *dptr, *sptr; - int i; - - if (w != 1 && w != 8 && w != 16 && w != 32) { - fprintf(stderr, "ERROR: jerasure_matrix_dotprod() called and w is not 1, 8, 16 or 32\n"); - exit(1); - } - - init = 0; - - dptr = (dest_id < k) ? 
data_ptrs[dest_id] : coding_ptrs[dest_id-k]; - - /* First copy or xor any data that does not need to be multiplied by a factor */ - - for (i = 0; i < k; i++) { - if (matrix_row[i] == 1) { - if (src_ids == NULL) { - sptr = data_ptrs[i]; - } else if (src_ids[i] < k) { - sptr = data_ptrs[src_ids[i]]; - } else { - sptr = coding_ptrs[src_ids[i]-k]; - } - if (init == 0) { - memcpy(dptr, sptr, size); - jerasure_total_memcpy_bytes += size; - init = 1; - } else { - galois_region_xor(sptr, dptr, size); - jerasure_total_xor_bytes += size; - } - } - } - - /* Now do the data that needs to be multiplied by a factor */ - - for (i = 0; i < k; i++) { - if (matrix_row[i] != 0 && matrix_row[i] != 1) { - if (src_ids == NULL) { - sptr = data_ptrs[i]; - } else if (src_ids[i] < k) { - sptr = data_ptrs[src_ids[i]]; - } else { - sptr = coding_ptrs[src_ids[i]-k]; - } - switch (w) { - case 8: galois_w08_region_multiply(sptr, matrix_row[i], size, dptr, init); break; - case 16: galois_w16_region_multiply(sptr, matrix_row[i], size, dptr, init); break; - case 32: galois_w32_region_multiply(sptr, matrix_row[i], size, dptr, init); break; - } - jerasure_total_gf_bytes += size; - init = 1; - } - } -} - - -int jerasure_bitmatrix_decode(int k, int m, int w, int *bitmatrix, int row_k_ones, int *erasures, - char **data_ptrs, char **coding_ptrs, int size, int packetsize) -{ - int i; - int *erased; - int *decoding_matrix; - int *dm_ids; - int edd, *tmpids, lastdrive; - - erased = jerasure_erasures_to_erased(k, m, erasures); - if (erased == NULL) return -1; - - /* See jerasure_matrix_decode for the logic of this routine. This one works just like - it, but calls the bitmatrix ops instead */ - - lastdrive = k; - - edd = 0; - for (i = 0; i < k; i++) { - if (erased[i]) { - edd++; - lastdrive = i; - } - } - - if (row_k_ones != 1 || erased[k]) lastdrive = k; - - dm_ids = NULL; - decoding_matrix = NULL; - - if (edd > 1 || (edd > 0 && (row_k_ones != 1 || erased[k]))) { - - dm_ids = talloc(int, k); - if (dm_ids == NULL) { - free(erased); - return -1; - } - - decoding_matrix = talloc(int, k*k*w*w); - if (decoding_matrix == NULL) { - free(erased); - free(dm_ids); - return -1; - } - - if (jerasure_make_decoding_bitmatrix(k, m, w, bitmatrix, erased, decoding_matrix, dm_ids) < 0) { - free(erased); - free(dm_ids); - free(decoding_matrix); - return -1; - } - } - - for (i = 0; edd > 0 && i < lastdrive; i++) { - if (erased[i]) { - jerasure_bitmatrix_dotprod(k, w, decoding_matrix+i*k*w*w, dm_ids, i, data_ptrs, coding_ptrs, size, packetsize); - edd--; - } - } - - if (edd > 0) { - tmpids = talloc(int, k); - for (i = 0; i < k; i++) { - tmpids[i] = (i < lastdrive) ? i : i+1; - } - jerasure_bitmatrix_dotprod(k, w, bitmatrix, tmpids, lastdrive, data_ptrs, coding_ptrs, size, packetsize); - free(tmpids); - } - - for (i = 0; i < m; i++) { - if (erased[k+i]) { - jerasure_bitmatrix_dotprod(k, w, bitmatrix+i*k*w*w, NULL, k+i, data_ptrs, coding_ptrs, size, packetsize); - } - } - - free(erased); - if (dm_ids != NULL) free(dm_ids); - if (decoding_matrix != NULL) free(decoding_matrix); - - return 0; -} - -static char **set_up_ptrs_for_scheduled_decoding(int k, int m, int *erasures, char **data_ptrs, char **coding_ptrs) -{ - int ddf, cdf; - int *erased; - char **ptrs; - int i, j, x; - - ddf = 0; - cdf = 0; - for (i = 0; erasures[i] != -1; i++) { - if (erasures[i] < k) ddf++; else cdf++; - } - - erased = jerasure_erasures_to_erased(k, m, erasures); - if (erased == NULL) return NULL; - - /* Set up ptrs. 
It will be as follows:
-
-       - If data drive i has not failed, then ptrs[i] = data_ptrs[i].
-       - If data drive i has failed, then ptrs[i] = coding_ptrs[j], where j is the
-            lowest unused non-failed coding drive.
-       - Elements k to k+ddf-1 are data_ptrs[] of the failed data drives.
-       - Elements k+ddf to k+ddf+cdf-1 are coding_ptrs[] of the failed coding drives.
-
-     The array row_ids contains the ids of ptrs.
-     The array ind_to_row contains the row_id of drive i.
-
-     However, we're going to set row_ids and ind_to_row in a different procedure.
-   */
-
-  ptrs = talloc(char *, k+m);
-
-  j = k;
-  x = k;
-  for (i = 0; i < k; i++) {
-    if (erased[i] == 0) {
-      ptrs[i] = data_ptrs[i];
-    } else {
-      while (erased[j]) j++;
-      ptrs[i] = coding_ptrs[j-k];
-      j++;
-      ptrs[x] = data_ptrs[i];
-      x++;
-    }
-  }
-  for (i = k; i < k+m; i++) {
-    if (erased[i]) {
-      ptrs[x] = coding_ptrs[i-k];
-      x++;
-    }
-  }
-  free(erased);
-  return ptrs;
-}
-
-static int set_up_ids_for_scheduled_decoding(int k, int m, int *erasures, int *row_ids, int *ind_to_row)
-{
-  int ddf, cdf;
-  int *erased;
-  int i, j, x;
-
-  ddf = 0;
-  cdf = 0;
-  for (i = 0; erasures[i] != -1; i++) {
-    if (erasures[i] < k) ddf++; else cdf++;
-  }
-
-  erased = jerasure_erasures_to_erased(k, m, erasures);
-  if (erased == NULL) return -1;
-
-  /* See set_up_ptrs_for_scheduled_decoding for how these are set */
-
-  j = k;
-  x = k;
-  for (i = 0; i < k; i++) {
-    if (erased[i] == 0) {
-      row_ids[i] = i;
-      ind_to_row[i] = i;
-    } else {
-      while (erased[j]) j++;
-      row_ids[i] = j;
-      ind_to_row[j] = i;
-      j++;
-      row_ids[x] = i;
-      ind_to_row[i] = x;
-      x++;
-    }
-  }
-  for (i = k; i < k+m; i++) {
-    if (erased[i]) {
-      row_ids[x] = i;
-      ind_to_row[i] = x;
-      x++;
-    }
-  }
-  free(erased);
-  return 0;
-}
-
-static int **jerasure_generate_decoding_schedule(int k, int m, int w, int *bitmatrix, int *erasures, int smart)
-{
-  int i, j, x, drive, y, index, z;
-  int *decoding_matrix, *inverse, *real_decoding_matrix;
-  int *ptr;
-  int *row_ids;
-  int *ind_to_row;
-  int ddf, cdf;
-  int **schedule;
-  int *b1, *b2;
-
-  /* First, figure out the number of data drives that have failed, and the
-     number of coding drives that have failed: ddf and cdf */
-
-  ddf = 0;
-  cdf = 0;
-  for (i = 0; erasures[i] != -1; i++) {
-    if (erasures[i] < k) ddf++; else cdf++;
-  }
-
-  row_ids = talloc(int, k+m);
-  ind_to_row = talloc(int, k+m);
-
-  if (set_up_ids_for_scheduled_decoding(k, m, erasures, row_ids, ind_to_row) < 0) return NULL;
-
-  /* Now, we're going to create one decoding matrix which is going to
-     decode everything with one call.  The hope is that the scheduler
-     will do a good job.
This matrix has w*e rows, where e is the - number of erasures (ddf+cdf) */ - - real_decoding_matrix = talloc(int, k*w*(cdf+ddf)*w); - - /* First, if any data drives have failed, then initialize the first - ddf*w rows of the decoding matrix from the standard decoding - matrix inversion */ - - if (ddf > 0) { - - decoding_matrix = talloc(int, k*k*w*w); - ptr = decoding_matrix; - for (i = 0; i < k; i++) { - if (row_ids[i] == i) { - bzero(ptr, k*w*w*sizeof(int)); - for (x = 0; x < w; x++) { - ptr[x+i*w+x*k*w] = 1; - } - } else { - memcpy(ptr, bitmatrix+k*w*w*(row_ids[i]-k), k*w*w*sizeof(int)); - } - ptr += (k*w*w); - } - inverse = talloc(int, k*k*w*w); - jerasure_invert_bitmatrix(decoding_matrix, inverse, k*w); - -/* printf("\nMatrix to invert\n"); - jerasure_print_bitmatrix(decoding_matrix, k*w, k*w, w); - printf("\n"); - printf("\nInverse\n"); - jerasure_print_bitmatrix(inverse, k*w, k*w, w); - printf("\n"); */ - - free(decoding_matrix); - ptr = real_decoding_matrix; - for (i = 0; i < ddf; i++) { - memcpy(ptr, inverse+k*w*w*row_ids[k+i], sizeof(int)*k*w*w); - ptr += (k*w*w); - } - free(inverse); - } - - /* Next, here comes the hard part. For each coding node that needs - to be decoded, you start by putting its rows of the distribution - matrix into the decoding matrix. If there were no failed data - nodes, then you're done. However, if there have been failed - data nodes, then you need to modify the columns that correspond - to the data nodes. You do that by first zeroing them. Then - whereever there is a one in the distribution matrix, you XOR - in the corresponding row from the failed data node's entry in - the decoding matrix. The whole process kind of makes my head - spin, but it works. - */ - - for (x = 0; x < cdf; x++) { - drive = row_ids[x+ddf+k]-k; - ptr = real_decoding_matrix + k*w*w*(ddf+x); - memcpy(ptr, bitmatrix+drive*k*w*w, sizeof(int)*k*w*w); - - for (i = 0; i < k; i++) { - if (row_ids[i] != i) { - for (j = 0; j < w; j++) { - bzero(ptr+j*k*w+i*w, sizeof(int)*w); - } - } - } - - /* There's the yucky part */ - - index = drive*k*w*w; - for (i = 0; i < k; i++) { - if (row_ids[i] != i) { - b1 = real_decoding_matrix+(ind_to_row[i]-k)*k*w*w; - for (j = 0; j < w; j++) { - b2 = ptr + j*k*w; - for (y = 0; y < w; y++) { - if (bitmatrix[index+j*k*w+i*w+y]) { - for (z = 0; z < k*w; z++) { - b2[z] = b2[z] ^ b1[z+y*k*w]; - } - } - } - } - } - } - } - -/* - printf("\n\nReal Decoding Matrix\n\n"); - jerasure_print_bitmatrix(real_decoding_matrix, (ddf+cdf)*w, k*w, w); - printf("\n"); */ - if (smart) { - schedule = jerasure_smart_bitmatrix_to_schedule(k, ddf+cdf, w, real_decoding_matrix); - } else { - schedule = jerasure_dumb_bitmatrix_to_schedule(k, ddf+cdf, w, real_decoding_matrix); - } - free(row_ids); - free(ind_to_row); - free(real_decoding_matrix); - return schedule; -} - -int jerasure_schedule_decode_lazy(int k, int m, int w, int *bitmatrix, int *erasures, - char **data_ptrs, char **coding_ptrs, int size, int packetsize, - int smart) -{ - int i, tdone; - char **ptrs; - int **schedule; - - ptrs = set_up_ptrs_for_scheduled_decoding(k, m, erasures, data_ptrs, coding_ptrs); - if (ptrs == NULL) return -1; - - schedule = jerasure_generate_decoding_schedule(k, m, w, bitmatrix, erasures, smart); - if (schedule == NULL) { - free(ptrs); - return -1; - } - - for (tdone = 0; tdone < size; tdone += packetsize*w) { - jerasure_do_scheduled_operations(ptrs, schedule, packetsize); - for (i = 0; i < k+m; i++) ptrs[i] += (packetsize*w); - } - - jerasure_free_schedule(schedule); - free(ptrs); - - return 0; -} 
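(For readers tracing the decode path that ends above, here is a minimal sketch, not part of the removed sources, of how these entry points were typically combined. It assumes the jerasure 2.0 API that this patch deletes; the drive counts, packetsize, and fill patterns are illustrative only.)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "jerasure.h"
#include "liberation.h"

int main(void)
{
  int k = 6, m = 2, w = 8;              /* liber8tion requires w == 8 and k <= 8 */
  int packetsize = (int) sizeof(long);  /* encode checks packetsize % sizeof(long) == 0 */
  int size = packetsize * w;            /* encode checks size % (packetsize*w) == 0 */
  int erasures[3] = { 0, 7, -1 };       /* data drive 0 and coding drive 1 (k+1); -1-terminated */
  char *data[6], *coding[2];
  int i, *bitmatrix;

  bitmatrix = liber8tion_coding_bitmatrix(k);
  if (bitmatrix == NULL) return 1;

  for (i = 0; i < k; i++) { data[i] = malloc(size); memset(data[i], 'A' + i, size); }
  for (i = 0; i < m; i++) coding[i] = malloc(size);

  jerasure_bitmatrix_encode(k, m, w, bitmatrix, data, coding, size, packetsize);

  memset(data[0], 0, size);             /* simulate losing two drives */
  memset(coding[1], 0, size);

  if (jerasure_schedule_decode_lazy(k, m, w, bitmatrix, erasures,
                                    data, coding, size, packetsize, 1) < 0) {
    fprintf(stderr, "decode failed\n");
    return 1;
  }
  printf("drive 0 %s\n", data[0][0] == 'A' ? "recovered" : "corrupt");
  return 0;
}

Liber8tion is an m = 2, w = 8 code, so any two erased drives are recoverable. The final argument selects the smart scheduler, which (as jerasure_smart_bitmatrix_to_schedule below shows) derives each target row from a previously computed one when that costs fewer XORs than building it from scratch.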
- -int jerasure_schedule_decode_cache(int k, int m, int w, int ***scache, int *erasures, - char **data_ptrs, char **coding_ptrs, int size, int packetsize) -{ - int i, tdone; - char **ptrs; - int **schedule; - int index; - - if (erasures[1] == -1) { - index = erasures[0]*(k+m) + erasures[0]; - } else if (erasures[2] == -1) { - index = erasures[0]*(k+m) + erasures[1]; - } else { - return -1; - } - - schedule = scache[index]; - - ptrs = set_up_ptrs_for_scheduled_decoding(k, m, erasures, data_ptrs, coding_ptrs); - if (ptrs == NULL) return -1; - - - for (tdone = 0; tdone < size; tdone += packetsize*w) { - jerasure_do_scheduled_operations(ptrs, schedule, packetsize); - for (i = 0; i < k+m; i++) ptrs[i] += (packetsize*w); - } - - free(ptrs); - - return 0; -} - -/* This only works when m = 2 */ - -int ***jerasure_generate_schedule_cache(int k, int m, int w, int *bitmatrix, int smart) -{ - int ***scache; - int erasures[3]; - int e1, e2; - - /* Ok -- this is yucky, but it's how I'm doing it. You will make an index out - of erasures, which will be e1*(k+m)+(e2). If there is no e2, then e2 = e1. - Isn't that clever and confusing. Sorry. - - We're not going to worry about ordering -- in other words, the schedule for - e1,e2 will be the same as e2,e1. They will have the same pointer -- the - schedule will not be duplicated. */ - - if (m != 2) return NULL; - - scache = talloc(int **, (k+m)*(k+m+1)); - if (scache == NULL) return NULL; - - for (e1 = 0; e1 < k+m; e1++) { - erasures[0] = e1; - for (e2 = 0; e2 < e1; e2++) { - erasures[1] = e2; - erasures[2] = -1; - scache[e1*(k+m)+e2] = jerasure_generate_decoding_schedule(k, m, w, bitmatrix, erasures, smart); - scache[e2*(k+m)+e1] = scache[e1*(k+m)+e2]; - } - erasures[1] = -1; - scache[e1*(k+m)+e1] = jerasure_generate_decoding_schedule(k, m, w, bitmatrix, erasures, smart); - } - return scache; - -} - -int jerasure_invert_bitmatrix(int *mat, int *inv, int rows) -{ - int cols, i, j, k; - int tmp; - - cols = rows; - - k = 0; - for (i = 0; i < rows; i++) { - for (j = 0; j < cols; j++) { - inv[k] = (i == j) ? 1 : 0; - k++; - } - } - - /* First -- convert into upper triangular */ - - for (i = 0; i < cols; i++) { - - /* Swap rows if we have a zero i,i element. If we can't swap, then the - matrix was not invertible */ - - if ((mat[i*cols+i]) == 0) { - for (j = i+1; j < rows && (mat[j*cols+i]) == 0; j++) ; - if (j == rows) return -1; - for (k = 0; k < cols; k++) { - tmp = mat[i*cols+k]; mat[i*cols+k] = mat[j*cols+k]; mat[j*cols+k] = tmp; - tmp = inv[i*cols+k]; inv[i*cols+k] = inv[j*cols+k]; inv[j*cols+k] = tmp; - } - } - - /* Now for each j>i, add A_ji*Ai to Aj */ - for (j = i+1; j != rows; j++) { - if (mat[j*cols+i] != 0) { - for (k = 0; k < cols; k++) { - mat[j*cols+k] ^= mat[i*cols+k]; - inv[j*cols+k] ^= inv[i*cols+k]; - } - } - } - } - - /* Now the matrix is upper triangular. Start at the top and multiply down */ - - for (i = rows-1; i >= 0; i--) { - for (j = 0; j < i; j++) { - if (mat[j*cols+i]) { - for (k = 0; k < cols; k++) { - mat[j*cols+k] ^= mat[i*cols+k]; - inv[j*cols+k] ^= inv[i*cols+k]; - } - } - } - } - return 0; -} - -int jerasure_invertible_bitmatrix(int *mat, int rows) -{ - int cols, i, j, k; - int tmp; - - cols = rows; - - /* First -- convert into upper triangular */ - - for (i = 0; i < cols; i++) { - - /* Swap rows if we have a zero i,i element. 
If we can't swap, then the - matrix was not invertible */ - - if ((mat[i*cols+i]) == 0) { - for (j = i+1; j < rows && (mat[j*cols+i]) == 0; j++) ; - if (j == rows) return 0; - for (k = 0; k < cols; k++) { - tmp = mat[i*cols+k]; mat[i*cols+k] = mat[j*cols+k]; mat[j*cols+k] = tmp; - } - } - - /* Now for each j>i, add A_ji*Ai to Aj */ - for (j = i+1; j != rows; j++) { - if (mat[j*cols+i] != 0) { - for (k = 0; k < cols; k++) { - mat[j*cols+k] ^= mat[i*cols+k]; - } - } - } - } - return 1; -} - - -int *jerasure_matrix_multiply(int *m1, int *m2, int r1, int c1, int r2, int c2, int w) -{ - int *product, i, j, k; - - product = (int *) malloc(sizeof(int)*r1*c2); - for (i = 0; i < r1*c2; i++) product[i] = 0; - - for (i = 0; i < r1; i++) { - for (j = 0; j < c2; j++) { - for (k = 0; k < r2; k++) { - product[i*c2+j] ^= galois_single_multiply(m1[i*c1+k], m2[k*c2+j], w); - } - } - } - return product; -} - -void jerasure_get_stats(double *fill_in) -{ - fill_in[0] = jerasure_total_xor_bytes; - fill_in[1] = jerasure_total_gf_bytes; - fill_in[2] = jerasure_total_memcpy_bytes; - jerasure_total_xor_bytes = 0; - jerasure_total_gf_bytes = 0; - jerasure_total_memcpy_bytes = 0; -} - -void jerasure_do_scheduled_operations(char **ptrs, int **operations, int packetsize) -{ - char *sptr; - char *dptr; - int op; - - for (op = 0; operations[op][0] >= 0; op++) { - sptr = ptrs[operations[op][0]] + operations[op][1]*packetsize; - dptr = ptrs[operations[op][2]] + operations[op][3]*packetsize; - if (operations[op][4]) { -/* printf("%d,%d %d,%d\n", operations[op][0], - operations[op][1], - operations[op][2], - operations[op][3]); - printf("xor(0x%x, 0x%x -> 0x%x, %d)\n", sptr, dptr, dptr, packetsize); */ - galois_region_xor(sptr, dptr, packetsize); - jerasure_total_xor_bytes += packetsize; - } else { -/* printf("memcpy(0x%x <- 0x%x)\n", dptr, sptr); */ - memcpy(dptr, sptr, packetsize); - jerasure_total_memcpy_bytes += packetsize; - } - } -} - -void jerasure_schedule_encode(int k, int m, int w, int **schedule, - char **data_ptrs, char **coding_ptrs, int size, int packetsize) -{ - char **ptr_copy; - int i, tdone; - - ptr_copy = talloc(char *, (k+m)); - for (i = 0; i < k; i++) ptr_copy[i] = data_ptrs[i]; - for (i = 0; i < m; i++) ptr_copy[i+k] = coding_ptrs[i]; - for (tdone = 0; tdone < size; tdone += packetsize*w) { - jerasure_do_scheduled_operations(ptr_copy, schedule, packetsize); - for (i = 0; i < k+m; i++) ptr_copy[i] += (packetsize*w); - } - free(ptr_copy); -} - -int **jerasure_dumb_bitmatrix_to_schedule(int k, int m, int w, int *bitmatrix) -{ - int **operations; - int op; - int index, optodo, i, j; - - operations = talloc(int *, k*m*w*w+1); - op = 0; - - index = 0; - for (i = 0; i < m*w; i++) { - optodo = 0; - for (j = 0; j < k*w; j++) { - if (bitmatrix[index]) { - operations[op] = talloc(int, 5); - operations[op][4] = optodo; - operations[op][0] = j/w; - operations[op][1] = j%w; - operations[op][2] = k+i/w; - operations[op][3] = i%w; - optodo = 1; - op++; - - } - index++; - } - } - operations[op] = talloc(int, 5); - operations[op][0] = -1; - return operations; -} - -int **jerasure_smart_bitmatrix_to_schedule(int k, int m, int w, int *bitmatrix) -{ - int **operations; - int op; - int i, j; - int *diff, *from, *b1, *flink, *blink; - int *ptr, no, row; - int optodo; - int bestrow, bestdiff, top; - -/* printf("Scheduling:\n\n"); - jerasure_print_bitmatrix(bitmatrix, m*w, k*w, w); */ - - operations = talloc(int *, k*m*w*w+1); - op = 0; - - diff = talloc(int, m*w); - from = talloc(int, m*w); - flink = talloc(int, m*w); - blink 
= talloc(int, m*w); - - ptr = bitmatrix; - - bestdiff = k*w+1; - top = 0; - for (i = 0; i < m*w; i++) { - no = 0; - for (j = 0; j < k*w; j++) { - no += *ptr; - ptr++; - } - diff[i] = no; - from[i] = -1; - flink[i] = i+1; - blink[i] = i-1; - if (no < bestdiff) { - bestdiff = no; - bestrow = i; - } - } - - flink[m*w-1] = -1; - - while (top != -1) { - row = bestrow; - /* printf("Doing row %d - %d from %d\n", row, diff[row], from[row]); */ - - if (blink[row] == -1) { - top = flink[row]; - if (top != -1) blink[top] = -1; - } else { - flink[blink[row]] = flink[row]; - if (flink[row] != -1) { - blink[flink[row]] = blink[row]; - } - } - - ptr = bitmatrix + row*k*w; - if (from[row] == -1) { - optodo = 0; - for (j = 0; j < k*w; j++) { - if (ptr[j]) { - operations[op] = talloc(int, 5); - operations[op][4] = optodo; - operations[op][0] = j/w; - operations[op][1] = j%w; - operations[op][2] = k+row/w; - operations[op][3] = row%w; - optodo = 1; - op++; - } - } - } else { - operations[op] = talloc(int, 5); - operations[op][4] = 0; - operations[op][0] = k+from[row]/w; - operations[op][1] = from[row]%w; - operations[op][2] = k+row/w; - operations[op][3] = row%w; - op++; - b1 = bitmatrix + from[row]*k*w; - for (j = 0; j < k*w; j++) { - if (ptr[j] ^ b1[j]) { - operations[op] = talloc(int, 5); - operations[op][4] = 1; - operations[op][0] = j/w; - operations[op][1] = j%w; - operations[op][2] = k+row/w; - operations[op][3] = row%w; - optodo = 1; - op++; - } - } - } - bestdiff = k*w+1; - for (i = top; i != -1; i = flink[i]) { - no = 1; - b1 = bitmatrix + i*k*w; - for (j = 0; j < k*w; j++) no += (ptr[j] ^ b1[j]); - if (no < diff[i]) { - from[i] = row; - diff[i] = no; - } - if (diff[i] < bestdiff) { - bestdiff = diff[i]; - bestrow = i; - } - } - } - - operations[op] = talloc(int, 5); - operations[op][0] = -1; - free(from); - free(diff); - free(blink); - free(flink); - - return operations; -} - -void jerasure_bitmatrix_encode(int k, int m, int w, int *bitmatrix, - char **data_ptrs, char **coding_ptrs, int size, int packetsize) -{ - int i; - - if (packetsize%sizeof(long) != 0) { - fprintf(stderr, "jerasure_bitmatrix_encode - packetsize(%d) %c sizeof(long) != 0\n", packetsize, '%'); - exit(1); - } - if (size%(packetsize*w) != 0) { - fprintf(stderr, "jerasure_bitmatrix_encode - size(%d) %c (packetsize(%d)*w(%d))) != 0\n", - size, '%', packetsize, w); - exit(1); - } - - for (i = 0; i < m; i++) { - jerasure_bitmatrix_dotprod(k, w, bitmatrix+i*k*w*w, NULL, k+i, data_ptrs, coding_ptrs, size, packetsize); - } -} - -/* - * Exported function for use by autoconf to perform quick - * spot-check. - */ -int jerasure_autoconf_test() -{ - int x = galois_single_multiply(1, 2, 8); - if (x != 2) { - return -1; - } - return 0; -} - diff --git a/src/erasure-code/jerasure/jerasure/src/liberation.c b/src/erasure-code/jerasure/jerasure/src/liberation.c deleted file mode 100644 index 11a1c4fea7ee9..0000000000000 --- a/src/erasure-code/jerasure/jerasure/src/liberation.c +++ /dev/null @@ -1,262 +0,0 @@ -/* * - * Copyright (c) 2014, James S. Plank and Kevin Greenan - * All rights reserved. 
- * - * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure - * Coding Techniques - * - * Revision 2.0: Galois Field backend now links to GF-Complete - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * - Neither the name of the University of Tennessee nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS - * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY - * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/* Jerasure's authors: - - Revision 2.x - 2014: James S. Plank and Kevin M. Greenan - Revision 1.2 - 2008: James S. Plank, Scott Simmerman and Catherine D. Schuman. - Revision 1.0 - 2007: James S. 
Plank - */ - -#include <stdio.h> -#include <stdlib.h> -#include <strings.h> - -#include "galois.h" -#include "jerasure.h" -#include "liberation.h" - -#define talloc(type, num) (type *) malloc(sizeof(type)*(num)) - -int *liberation_coding_bitmatrix(int k, int w) -{ - int *matrix, i, j, index; - - if (k > w) return NULL; - matrix = talloc(int, 2*k*w*w); - if (matrix == NULL) return NULL; - bzero(matrix, sizeof(int)*2*k*w*w); - - /* Set up identity matrices */ - - for(i = 0; i < w; i++) { - index = i*k*w+i; - for (j = 0; j < k; j++) { - matrix[index] = 1; - index += w; - } - } - - /* Set up liberation matrices */ - - for (j = 0; j < k; j++) { - index = k*w*w+j*w; - for (i = 0; i < w; i++) { - matrix[index+(j+i)%w] = 1; - index += (k*w); - } - if (j > 0) { - i = (j*((w-1)/2))%w; - matrix[k*w*w+j*w+i*k*w+(i+j-1)%w] = 1; - } - } - return matrix; -} - - -int *liber8tion_coding_bitmatrix(int k) -{ - int *matrix, i, j, index; - int w; - - w = 8; - if (k > w) return NULL; - matrix = talloc(int, 2*k*w*w); - if (matrix == NULL) return NULL; - bzero(matrix, sizeof(int)*2*k*w*w); - - /* Set up identity matrices */ - - for(i = 0; i < w; i++) { - index = i*k*w+i; - for (j = 0; j < k; j++) { - matrix[index] = 1; - index += w; - } - } - - /* Set up liber8tion matrices */ - - index = k*w*w; - - if (k == 0) return matrix; - matrix[index+0*k*w+0*w+0] = 1; - matrix[index+1*k*w+0*w+1] = 1; - matrix[index+2*k*w+0*w+2] = 1; - matrix[index+3*k*w+0*w+3] = 1; - matrix[index+4*k*w+0*w+4] = 1; - matrix[index+5*k*w+0*w+5] = 1; - matrix[index+6*k*w+0*w+6] = 1; - matrix[index+7*k*w+0*w+7] = 1; - - if (k == 1) return matrix; - matrix[index+0*k*w+1*w+7] = 1; - matrix[index+1*k*w+1*w+3] = 1; - matrix[index+2*k*w+1*w+0] = 1; - matrix[index+3*k*w+1*w+2] = 1; - matrix[index+4*k*w+1*w+6] = 1; - matrix[index+5*k*w+1*w+1] = 1; - matrix[index+6*k*w+1*w+5] = 1; - matrix[index+7*k*w+1*w+4] = 1; - matrix[index+4*k*w+1*w+7] = 1; - - if (k == 2) return matrix; - matrix[index+0*k*w+2*w+6] = 1; - matrix[index+1*k*w+2*w+2] = 1; - matrix[index+2*k*w+2*w+4] = 1; - matrix[index+3*k*w+2*w+0] = 1; - matrix[index+4*k*w+2*w+7] = 1; - matrix[index+5*k*w+2*w+3] = 1; - matrix[index+6*k*w+2*w+1] = 1; - matrix[index+7*k*w+2*w+5] = 1; - matrix[index+1*k*w+2*w+3] = 1; - - if (k == 3) return matrix; - matrix[index+0*k*w+3*w+2] = 1; - matrix[index+1*k*w+3*w+5] = 1; - matrix[index+2*k*w+3*w+7] = 1; - matrix[index+3*k*w+3*w+6] = 1; - matrix[index+4*k*w+3*w+0] = 1; - matrix[index+5*k*w+3*w+3] = 1; - matrix[index+6*k*w+3*w+4] = 1; - matrix[index+7*k*w+3*w+1] = 1; - matrix[index+5*k*w+3*w+4] = 1; - - if (k == 4) return matrix; - matrix[index+0*k*w+4*w+5] = 1; - matrix[index+1*k*w+4*w+6] = 1; - matrix[index+2*k*w+4*w+1] = 1; - matrix[index+3*k*w+4*w+7] = 1; - matrix[index+4*k*w+4*w+2] = 1; - matrix[index+5*k*w+4*w+4] = 1; - matrix[index+6*k*w+4*w+3] = 1; - matrix[index+7*k*w+4*w+0] = 1; - matrix[index+2*k*w+4*w+0] = 1; - - if (k == 5) return matrix; - matrix[index+0*k*w+5*w+1] = 1; - matrix[index+1*k*w+5*w+2] = 1; - matrix[index+2*k*w+5*w+3] = 1; - matrix[index+3*k*w+5*w+4] = 1; - matrix[index+4*k*w+5*w+5] = 1; - matrix[index+5*k*w+5*w+6] = 1; - matrix[index+6*k*w+5*w+7] = 1; - matrix[index+7*k*w+5*w+0] = 1; - matrix[index+7*k*w+5*w+2] = 1; - - if (k == 6) return matrix; - matrix[index+0*k*w+6*w+3] = 1; - matrix[index+1*k*w+6*w+0] = 1; - matrix[index+2*k*w+6*w+6] = 1; - matrix[index+3*k*w+6*w+5] = 1; - matrix[index+4*k*w+6*w+1] = 1; - matrix[index+5*k*w+6*w+7] = 1; - matrix[index+6*k*w+6*w+4] = 1; - matrix[index+7*k*w+6*w+2] = 1; - matrix[index+6*k*w+6*w+5] = 1; - - if (k == 7) return matrix;
- matrix[index+0*k*w+7*w+4] = 1; - matrix[index+1*k*w+7*w+7] = 1; - matrix[index+2*k*w+7*w+1] = 1; - matrix[index+3*k*w+7*w+5] = 1; - matrix[index+4*k*w+7*w+3] = 1; - matrix[index+5*k*w+7*w+2] = 1; - matrix[index+6*k*w+7*w+0] = 1; - matrix[index+7*k*w+7*w+6] = 1; - matrix[index+3*k*w+7*w+1] = 1; - - return matrix; -} - -int *blaum_roth_coding_bitmatrix(int k, int w) -{ - int *matrix, i, j, index, l, m, p; - - if (k > w) return NULL ; - - matrix = talloc(int, 2*k*w*w); - if (matrix == NULL) return NULL; - bzero(matrix, sizeof(int)*2*k*w*w); - - /* Set up identity matrices */ - - for(i = 0; i < w; i++) { - index = i*k*w+i; - for (j = 0; j < k; j++) { - matrix[index] = 1; - index += w; - } - } - - /* Set up blaum_roth matrices -- Ignore identity */ - - p = w+1; - for (j = 0; j < k; j++) { - index = k*w*w+j*w; - if (j == 0) { - for (l = 0; l < w; l++) { - matrix[index+l] = 1; - index += k*w; - } - } else { - i = j; - for (l = 1; l <= w; l++) { - if (l != p-i) { - m = l+i; - if (m >= p) m -= p; - m--; - matrix[index+m] = 1; - } else { - matrix[index+i-1] = 1; - if (i%2 == 0) { - m = i/2; - } else { - m = (p/2) + 1 + (i/2); - } - m--; - matrix[index+m] = 1; - } - index += k*w; - } - } - } - - return matrix; -} diff --git a/src/erasure-code/jerasure/jerasure/src/reed_sol.c b/src/erasure-code/jerasure/jerasure/src/reed_sol.c deleted file mode 100644 index c0dfe83832a7e..0000000000000 --- a/src/erasure-code/jerasure/jerasure/src/reed_sol.c +++ /dev/null @@ -1,301 +0,0 @@ -/* * - * Copyright (c) 2014, James S. Plank and Kevin Greenan - * All rights reserved. - * - * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure - * Coding Techniques - * - * Revision 2.0: Galois Field backend now links to GF-Complete - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * - Neither the name of the University of Tennessee nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS - * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY - * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/* Jerasure's authors: - - Revision 2.x - 2014: James S. Plank and Kevin M. Greenan - Revision 1.2 - 2008: James S. Plank, Scott Simmerman and Catherine D. Schuman. - Revision 1.0 - 2007: James S. 
Plank - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#include <gf_complete.h> -#include "galois.h" -#include "jerasure.h" -#include "reed_sol.h" - -#define talloc(type, num) (type *) malloc(sizeof(type)*(num)) - -int *reed_sol_r6_coding_matrix(int k, int w) -{ - int *matrix; - int i, tmp; - - if (w != 8 && w != 16 && w != 32) return NULL; - - matrix = talloc(int, 2*k); - if (matrix == NULL) return NULL; - - for (i = 0; i < k; i++) matrix[i] = 1; - matrix[k] = 1; - tmp = 1; - for (i = 1; i < k; i++) { - tmp = galois_single_multiply(tmp, 2, w); - matrix[k+i] = tmp; - } - return matrix; -} - -int *reed_sol_vandermonde_coding_matrix(int k, int m, int w) -{ - int i, j; - int *vdm, *dist; - - vdm = reed_sol_big_vandermonde_distribution_matrix(k+m, k, w); - if (vdm == NULL) return NULL; - dist = talloc(int, m*k); - if (dist == NULL) { - free(vdm); - return NULL; - } - - i = k*k; - for (j = 0; j < m*k; j++) { - dist[j] = vdm[i]; - i++; - } - free(vdm); - return dist; -} - -static int prim08 = -1; -static gf_t GF08; - -void reed_sol_galois_w08_region_multby_2(char *region, int nbytes) -{ - if (prim08 == -1) { - prim08 = galois_single_multiply((1 << 7), 2, 8); - if (!gf_init_hard(&GF08, 8, GF_MULT_BYTWO_b, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, - prim08, 0, 0, NULL, NULL)) { - fprintf(stderr, "Error: Can't initialize the GF for reed_sol_galois_w08_region_multby_2\n"); - exit(1); - } - } - GF08.multiply_region.w32(&GF08, region, region, 2, nbytes, 0); -} - -static int prim16 = -1; -static gf_t GF16; - -void reed_sol_galois_w16_region_multby_2(char *region, int nbytes) -{ - if (prim16 == -1) { - prim16 = galois_single_multiply((1 << 15), 2, 16); - if (!gf_init_hard(&GF16, 16, GF_MULT_BYTWO_b, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, - prim16, 0, 0, NULL, NULL)) { - fprintf(stderr, "Error: Can't initialize the GF for reed_sol_galois_w16_region_multby_2\n"); - exit(1); - } - } - GF16.multiply_region.w32(&GF16, region, region, 2, nbytes, 0); -} - -static int prim32 = -1; -static gf_t GF32; - -void reed_sol_galois_w32_region_multby_2(char *region, int nbytes) -{ - if (prim32 == -1) { - prim32 = galois_single_multiply((1 << 31), 2, 32); - if (!gf_init_hard(&GF32, 32, GF_MULT_BYTWO_b, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, - prim32, 0, 0, NULL, NULL)) { - fprintf(stderr, "Error: Can't initialize the GF for reed_sol_galois_w32_region_multby_2\n"); - exit(1); - } - } - GF32.multiply_region.w32(&GF32, region, region, 2, nbytes, 0); -} - -int reed_sol_r6_encode(int k, int w, char **data_ptrs, char **coding_ptrs, int size) -{ - int i; - - /* First, put the XOR into coding region 0 */ - - memcpy(coding_ptrs[0], data_ptrs[0], size); - - for (i = 1; i < k; i++) galois_region_xor(data_ptrs[i], coding_ptrs[0], size); - - /* Next, put the sum of (2^j)*Dj into coding region 1 */ - - memcpy(coding_ptrs[1], data_ptrs[k-1], size); - - for (i = k-2; i >= 0; i--) { - switch (w) { - case 8: reed_sol_galois_w08_region_multby_2(coding_ptrs[1], size); break; - case 16: reed_sol_galois_w16_region_multby_2(coding_ptrs[1], size); break; - case 32: reed_sol_galois_w32_region_multby_2(coding_ptrs[1], size); break; - default: return 0; - } - - galois_region_xor(data_ptrs[i], coding_ptrs[1], size); - } - return 1; -} - -int *reed_sol_extended_vandermonde_matrix(int rows, int cols, int w) -{ - int *vdm; - int i, j, k; - - if (w < 30 && (1 << w) < rows) return NULL; - if (w < 30 && (1 << w) < cols) return NULL; - - vdm = talloc(int, rows*cols); - if (vdm == NULL) { return NULL; } - - vdm[0] = 1; - for (j = 1; j < cols; j++) vdm[j] = 0; - if (rows == 1) return vdm;
- - i=(rows-1)*cols; - for (j = 0; j < cols-1; j++) vdm[i+j] = 0; - vdm[i+j] = 1; - if (rows == 2) return vdm; - - for (i = 1; i < rows-1; i++) { - k = 1; - for (j = 0; j < cols; j++) { - vdm[i*cols+j] = k; - k = galois_single_multiply(k, i, w); - } - } - return vdm; -} - -int *reed_sol_big_vandermonde_distribution_matrix(int rows, int cols, int w) -{ - int *dist; - int i, j, k; - int sindex, srindex, siindex, tmp; - - if (cols >= rows) return NULL; - - dist = reed_sol_extended_vandermonde_matrix(rows, cols, w); - if (dist == NULL) return NULL; - - sindex = 0; - for (i = 1; i < cols; i++) { - sindex += cols; - - /* Find an appropriate row -- where i,i != 0 */ - srindex = sindex+i; - for (j = i; j < rows && dist[srindex] == 0; j++) srindex += cols; - if (j >= rows) { /* This should never happen if rows/w are correct */ - fprintf(stderr, "reed_sol_big_vandermonde_distribution_matrix(%d,%d,%d) - couldn't make matrix\n", - rows, cols, w); - exit(1); - } - - /* If necessary, swap rows */ - if (j != i) { - srindex -= i; - for (k = 0; k < cols; k++) { - tmp = dist[srindex+k]; - dist[srindex+k] = dist[sindex+k]; - dist[sindex+k] = tmp; - } - } - - /* If Element i,i is not equal to 1, multiply the column by 1/i */ - - if (dist[sindex+i] != 1) { - tmp = galois_single_divide(1, dist[sindex+i], w); - srindex = i; - for (j = 0; j < rows; j++) { - dist[srindex] = galois_single_multiply(tmp, dist[srindex], w); - srindex += cols; - } - } - - /* Now, for each element in row i that is not in column 1, you need - to make it zero. Suppose that this is column j, and the element - at i,j = e. Then you want to replace all of column j with - (col-j + col-i*e). Note, that in row i, col-i = 1 and col-j = e. - So (e + 1e) = 0, which is indeed what we want. */ - - for (j = 0; j < cols; j++) { - tmp = dist[sindex+j]; - if (j != i && tmp != 0) { - srindex = j; - siindex = i; - for (k = 0; k < rows; k++) { - dist[srindex] = dist[srindex] ^ galois_single_multiply(tmp, dist[siindex], w); - srindex += cols; - siindex += cols; - } - } - } - } - /* We desire to have row k be all ones. To do that, multiply - the entire column j by 1/dist[k,j]. Then row j by 1/dist[j,j]. */ - - sindex = cols*cols; - for (j = 0; j < cols; j++) { - tmp = dist[sindex]; - if (tmp != 1) { - tmp = galois_single_divide(1, tmp, w); - srindex = sindex; - for (i = cols; i < rows; i++) { - dist[srindex] = galois_single_multiply(tmp, dist[srindex], w); - srindex += cols; - } - } - sindex++; - } - - /* Finally, we'd like the first column of each row to be all ones. To - do that, we multiply the row by the inverse of the first element. */ - - sindex = cols*(cols+1); - for (i = cols+1; i < rows; i++) { - tmp = dist[sindex]; - if (tmp != 1) { - tmp = galois_single_divide(1, tmp, w); - for (j = 0; j < cols; j++) dist[sindex+j] = galois_single_multiply(dist[sindex+j], tmp, w); - } - sindex += cols; - } - - return dist; -} -
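(A closing sketch, likewise not part of the removed sources, assuming the deleted reed_sol.h API with illustrative parameters: it builds the m x k Vandermonde-derived coding matrix and verifies the two invariants the normalization steps above establish, namely that coding row 0 is all ones, making the first coding device plain XOR parity, and that column 0 of every coding row is one.)

#include <stdio.h>
#include <stdlib.h>

#include "reed_sol.h"

int main(void)
{
  int k = 6, m = 3, w = 8, i, j;
  int *mat = reed_sol_vandermonde_coding_matrix(k, m, w);

  if (mat == NULL) return 1;

  for (i = 0; i < m; i++) {              /* print the m x k coding matrix */
    for (j = 0; j < k; j++) printf("%4d", mat[i*k + j]);
    printf("\n");
  }

  for (j = 0; j < k; j++)
    if (mat[j] != 1) return 1;           /* row 0 is all ones: XOR parity */
  for (i = 0; i < m; i++)
    if (mat[i*k] != 1) return 1;         /* column 0 is all ones */

  free(mat);
  return 0;
}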