add c = add(mul(a, b), c) to la64.
pigirons committed Sep 13, 2024
1 parent 888d2af commit dbec1c1
Showing 3 changed files with 302 additions and 64 deletions.
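The two new add_mul kernels complement the existing fmadd kernels: fmadd measures peak throughput of the fused multiply-add instruction, while add_mul measures the same nominal computation, c = add(mul(a, b), c), issued as separate multiply and add instructions. A minimal scalar sketch of the difference in C (illustrative only; these function names are not from the repository):

/* fmadd pattern: one fused instruction, two FLOPs per element.
 * Maps to xvfmadd.s/.d (LASX) and vfmadd.s/.d (LSX). */
static float fmadd_step(float a, float b, float c)
{
    return a * b + c;
}

/* add_mul pattern: separate multiply and add, one FLOP each.
 * Maps to the xvfmul/xvfadd pair (LASX) and vfmul/vfadd (LSX). */
static float add_mul_step(float a, float b, float c)
{
    float t = a * b;
    return c + t;
}

In the assembly below, the destination register rotates across $xr0..$xr23 (or $vr0..$vr23) while the sources stay fixed, so consecutive instructions are independent and the loop measures issue throughput rather than the latency of a dependency chain.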
176 changes: 144 additions & 32 deletions loongarch64/asm/_LASX_.S
@@ -1,5 +1,7 @@
.globl lasx_fp32_fmadd_f32f32f32
.globl lasx_fp64_fmadd_f64f64f64
.globl lasx_fp32_add_mul_f32f32_f32
.globl lasx_fp64_add_mul_f64f64_f64

lasx_fp32_fmadd_f32f32f32:
xvxor.v $xr0, $xr0, $xr0
@@ -20,22 +22,22 @@ lasx_fp32_fmadd_f32f32f32:
xvxor.v $xr15, $xr15, $xr15
xvxor.v $xr16, $xr16, $xr16
.lasx.fp32.fmadd.f32f32f32:
xvfmadd.s $xr0, $xr24, $xr24, $xr0
xvfmadd.s $xr1, $xr24, $xr24, $xr1
xvfmadd.s $xr2, $xr24, $xr24, $xr2
xvfmadd.s $xr3, $xr24, $xr24, $xr3
xvfmadd.s $xr4, $xr24, $xr24, $xr4
xvfmadd.s $xr5, $xr24, $xr24, $xr5
xvfmadd.s $xr6, $xr24, $xr24, $xr6
xvfmadd.s $xr7, $xr24, $xr24, $xr7
xvfmadd.s $xr8, $xr24, $xr24, $xr8
xvfmadd.s $xr9, $xr24, $xr24, $xr9
xvfmadd.s $xr10, $xr24, $xr24, $xr10
xvfmadd.s $xr11, $xr24, $xr24, $xr11
xvfmadd.s $xr12, $xr24, $xr24, $xr12
xvfmadd.s $xr13, $xr24, $xr24, $xr13
xvfmadd.s $xr14, $xr24, $xr24, $xr14
xvfmadd.s $xr15, $xr24, $xr24, $xr15
xvfmadd.s $xr0, $xr16, $xr16, $xr0
xvfmadd.s $xr1, $xr16, $xr16, $xr1
xvfmadd.s $xr2, $xr16, $xr16, $xr2
xvfmadd.s $xr3, $xr16, $xr16, $xr3
xvfmadd.s $xr4, $xr16, $xr16, $xr4
xvfmadd.s $xr5, $xr16, $xr16, $xr5
xvfmadd.s $xr6, $xr16, $xr16, $xr6
xvfmadd.s $xr7, $xr16, $xr16, $xr7
xvfmadd.s $xr8, $xr16, $xr16, $xr8
xvfmadd.s $xr9, $xr16, $xr16, $xr9
xvfmadd.s $xr10, $xr16, $xr16, $xr10
xvfmadd.s $xr11, $xr16, $xr16, $xr11
xvfmadd.s $xr12, $xr16, $xr16, $xr12
xvfmadd.s $xr13, $xr16, $xr16, $xr13
xvfmadd.s $xr14, $xr16, $xr16, $xr14
xvfmadd.s $xr15, $xr16, $xr16, $xr15
addi.d $a0, $a0, -1
bne $a0, $r0, .lasx.fp32.fmadd.f32f32f32
jr $r1
@@ -59,23 +61,133 @@ lasx_fp64_fmadd_f64f64f64:
xvxor.v $xr15, $xr15, $xr15
xvxor.v $xr16, $xr16, $xr16
.lasx.fp64.fmadd.f64f64f64:
xvfmadd.d $xr0, $xr24, $xr24, $xr0
xvfmadd.d $xr1, $xr24, $xr24, $xr1
xvfmadd.d $xr2, $xr24, $xr24, $xr2
xvfmadd.d $xr3, $xr24, $xr24, $xr3
xvfmadd.d $xr4, $xr24, $xr24, $xr4
xvfmadd.d $xr5, $xr24, $xr24, $xr5
xvfmadd.d $xr6, $xr24, $xr24, $xr6
xvfmadd.d $xr7, $xr24, $xr24, $xr7
xvfmadd.d $xr8, $xr24, $xr24, $xr8
xvfmadd.d $xr9, $xr24, $xr24, $xr9
xvfmadd.d $xr10, $xr24, $xr24, $xr10
xvfmadd.d $xr11, $xr24, $xr24, $xr11
xvfmadd.d $xr12, $xr24, $xr24, $xr12
xvfmadd.d $xr13, $xr24, $xr24, $xr13
xvfmadd.d $xr14, $xr24, $xr24, $xr14
xvfmadd.d $xr15, $xr24, $xr24, $xr15
xvfmadd.d $xr0, $xr16, $xr16, $xr0
xvfmadd.d $xr1, $xr16, $xr16, $xr1
xvfmadd.d $xr2, $xr16, $xr16, $xr2
xvfmadd.d $xr3, $xr16, $xr16, $xr3
xvfmadd.d $xr4, $xr16, $xr16, $xr4
xvfmadd.d $xr5, $xr16, $xr16, $xr5
xvfmadd.d $xr6, $xr16, $xr16, $xr6
xvfmadd.d $xr7, $xr16, $xr16, $xr7
xvfmadd.d $xr8, $xr16, $xr16, $xr8
xvfmadd.d $xr9, $xr16, $xr16, $xr9
xvfmadd.d $xr10, $xr16, $xr16, $xr10
xvfmadd.d $xr11, $xr16, $xr16, $xr11
xvfmadd.d $xr12, $xr16, $xr16, $xr12
xvfmadd.d $xr13, $xr16, $xr16, $xr13
xvfmadd.d $xr14, $xr16, $xr16, $xr14
xvfmadd.d $xr15, $xr16, $xr16, $xr15
addi.d $a0, $a0, -1
bne $a0, $r0, .lasx.fp64.fmadd.f64f64f64
jr $r1

lasx_fp32_add_mul_f32f32_f32:
xvxor.v $xr0, $xr0, $xr0
xvxor.v $xr1, $xr1, $xr1
xvxor.v $xr2, $xr2, $xr2
xvxor.v $xr3, $xr3, $xr3
xvxor.v $xr4, $xr4, $xr4
xvxor.v $xr5, $xr5, $xr5
xvxor.v $xr6, $xr6, $xr6
xvxor.v $xr7, $xr7, $xr7
xvxor.v $xr8, $xr8, $xr8
xvxor.v $xr9, $xr9, $xr9
xvxor.v $xr10, $xr10, $xr10
xvxor.v $xr11, $xr11, $xr11
xvxor.v $xr12, $xr12, $xr12
xvxor.v $xr13, $xr13, $xr13
xvxor.v $xr14, $xr14, $xr14
xvxor.v $xr15, $xr15, $xr15
xvxor.v $xr16, $xr16, $xr16
xvxor.v $xr17, $xr17, $xr17
xvxor.v $xr18, $xr18, $xr18
xvxor.v $xr19, $xr19, $xr19
xvxor.v $xr20, $xr20, $xr20
xvxor.v $xr21, $xr21, $xr21
xvxor.v $xr22, $xr22, $xr22
xvxor.v $xr23, $xr23, $xr23
xvxor.v $xr24, $xr24, $xr24
.lasx.fp32.add.mul.f32f32.f32:
xvfmul.s $xr0, $xr24, $xr24
xvfadd.s $xr1, $xr24, $xr24
xvfmul.s $xr2, $xr24, $xr24
xvfadd.s $xr3, $xr24, $xr24
xvfmul.s $xr4, $xr24, $xr24
xvfadd.s $xr5, $xr24, $xr24
xvfmul.s $xr6, $xr24, $xr24
xvfadd.s $xr7, $xr24, $xr24
xvfmul.s $xr8, $xr24, $xr24
xvfadd.s $xr9, $xr24, $xr24
xvfmul.s $xr10, $xr24, $xr24
xvfadd.s $xr11, $xr24, $xr24
xvfmul.s $xr12, $xr24, $xr24
xvfadd.s $xr13, $xr24, $xr24
xvfmul.s $xr14, $xr24, $xr24
xvfadd.s $xr15, $xr24, $xr24
xvfmul.s $xr16, $xr24, $xr24
xvfadd.s $xr17, $xr24, $xr24
xvfmul.s $xr18, $xr24, $xr24
xvfadd.s $xr19, $xr24, $xr24
xvfmul.s $xr20, $xr24, $xr24
xvfadd.s $xr21, $xr24, $xr24
xvfmul.s $xr22, $xr24, $xr24
xvfadd.s $xr23, $xr24, $xr24
addi.d $a0, $a0, -1
bne $a0, $r0, .lasx.fp32.add.mul.f32f32.f32
jr $r1

lasx_fp64_add_mul_f64f64_f64:
xvxor.v $xr0, $xr0, $xr0
xvxor.v $xr1, $xr1, $xr1
xvxor.v $xr2, $xr2, $xr2
xvxor.v $xr3, $xr3, $xr3
xvxor.v $xr4, $xr4, $xr4
xvxor.v $xr5, $xr5, $xr5
xvxor.v $xr6, $xr6, $xr6
xvxor.v $xr7, $xr7, $xr7
xvxor.v $xr8, $xr8, $xr8
xvxor.v $xr9, $xr9, $xr9
xvxor.v $xr10, $xr10, $xr10
xvxor.v $xr11, $xr11, $xr11
xvxor.v $xr12, $xr12, $xr12
xvxor.v $xr13, $xr13, $xr13
xvxor.v $xr14, $xr14, $xr14
xvxor.v $xr15, $xr15, $xr15
xvxor.v $xr16, $xr16, $xr16
xvxor.v $xr17, $xr17, $xr17
xvxor.v $xr18, $xr18, $xr18
xvxor.v $xr19, $xr19, $xr19
xvxor.v $xr20, $xr20, $xr20
xvxor.v $xr21, $xr21, $xr21
xvxor.v $xr22, $xr22, $xr22
xvxor.v $xr23, $xr23, $xr23
xvxor.v $xr24, $xr24, $xr24
.lasx.fp64.add.mul.f64f64.f64:
xvfmul.d $xr0, $xr24, $xr24
xvfadd.d $xr1, $xr24, $xr24
xvfmul.d $xr2, $xr24, $xr24
xvfadd.d $xr3, $xr24, $xr24
xvfmul.d $xr4, $xr24, $xr24
xvfadd.d $xr5, $xr24, $xr24
xvfmul.d $xr6, $xr24, $xr24
xvfadd.d $xr7, $xr24, $xr24
xvfmul.d $xr8, $xr24, $xr24
xvfadd.d $xr9, $xr24, $xr24
xvfmul.d $xr10, $xr24, $xr24
xvfadd.d $xr11, $xr24, $xr24
xvfmul.d $xr12, $xr24, $xr24
xvfadd.d $xr13, $xr24, $xr24
xvfmul.d $xr14, $xr24, $xr24
xvfadd.d $xr15, $xr24, $xr24
xvfmul.d $xr16, $xr24, $xr24
xvfadd.d $xr17, $xr24, $xr24
xvfmul.d $xr18, $xr24, $xr24
xvfadd.d $xr19, $xr24, $xr24
xvfmul.d $xr20, $xr24, $xr24
xvfadd.d $xr21, $xr24, $xr24
xvfmul.d $xr22, $xr24, $xr24
xvfadd.d $xr23, $xr24, $xr24
addi.d $a0, $a0, -1
bne $a0, $r0, .lasx.fp64.add.mul.f64f64.f64
jr $r1
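
FLOP accounting for the LASX loops above (assuming 256-bit LASX vectors, i.e. 8 fp32 or 4 fp64 lanes): each fmadd iteration issues 16 fused ops, for 16 * 8 * 2 = 256 fp32 FLOPs (128 for fp64), while each add_mul iteration issues 12 multiplies plus 12 adds, for 24 * 8 = 192 fp32 FLOPs (96 for fp64). A small C helper showing the conversion to GFLOPS — a sketch, not code from this repository:

#include <stdint.h>

/* FLOPs per iteration of the LASX fp32 loops above:
 * fmadd:   16 xvfmadd.s * 8 lanes * 2 FLOPs = 256
 * add_mul: (12 xvfmul.s + 12 xvfadd.s) * 8 lanes * 1 FLOP = 192 */
enum {
    LASX_FP32_FMADD_FLOPS   = 16 * 8 * 2,
    LASX_FP32_ADD_MUL_FLOPS = 24 * 8
};

/* Convert a timed run of `iters` loop iterations into GFLOPS. */
static double gflops(uint64_t iters, int flops_per_iter, double seconds)
{
    return (double)iters * flops_per_iter / seconds * 1e-9;
}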

176 changes: 144 additions & 32 deletions loongarch64/asm/_LSX_.S
@@ -1,5 +1,7 @@
.globl lsx_fp32_fmadd_f32f32f32
.globl lsx_fp64_fmadd_f64f64f64
.globl lsx_fp32_add_mul_f32f32_f32
.globl lsx_fp64_add_mul_f64f64_f64

lsx_fp32_fmadd_f32f32f32:
vxor.v $vr0, $vr0, $vr0
@@ -20,22 +22,22 @@ lsx_fp32_fmadd_f32f32f32:
vxor.v $vr15, $vr15, $vr15
vxor.v $vr16, $vr16, $vr16
.lsx.fp32.fmadd.f32f32f32:
vfmadd.s $vr0, $vr24, $vr24, $vr0
vfmadd.s $vr1, $vr24, $vr24, $vr1
vfmadd.s $vr2, $vr24, $vr24, $vr2
vfmadd.s $vr3, $vr24, $vr24, $vr3
vfmadd.s $vr4, $vr24, $vr24, $vr4
vfmadd.s $vr5, $vr24, $vr24, $vr5
vfmadd.s $vr6, $vr24, $vr24, $vr6
vfmadd.s $vr7, $vr24, $vr24, $vr7
vfmadd.s $vr8, $vr24, $vr24, $vr8
vfmadd.s $vr9, $vr24, $vr24, $vr9
vfmadd.s $vr10, $vr24, $vr24, $vr10
vfmadd.s $vr11, $vr24, $vr24, $vr11
vfmadd.s $vr12, $vr24, $vr24, $vr12
vfmadd.s $vr13, $vr24, $vr24, $vr13
vfmadd.s $vr14, $vr24, $vr24, $vr14
vfmadd.s $vr15, $vr24, $vr24, $vr15
vfmadd.s $vr0, $vr16, $vr16, $vr0
vfmadd.s $vr1, $vr16, $vr16, $vr1
vfmadd.s $vr2, $vr16, $vr16, $vr2
vfmadd.s $vr3, $vr16, $vr16, $vr3
vfmadd.s $vr4, $vr16, $vr16, $vr4
vfmadd.s $vr5, $vr16, $vr16, $vr5
vfmadd.s $vr6, $vr16, $vr16, $vr6
vfmadd.s $vr7, $vr16, $vr16, $vr7
vfmadd.s $vr8, $vr16, $vr16, $vr8
vfmadd.s $vr9, $vr16, $vr16, $vr9
vfmadd.s $vr10, $vr16, $vr16, $vr10
vfmadd.s $vr11, $vr16, $vr16, $vr11
vfmadd.s $vr12, $vr16, $vr16, $vr12
vfmadd.s $vr13, $vr16, $vr16, $vr13
vfmadd.s $vr14, $vr16, $vr16, $vr14
vfmadd.s $vr15, $vr16, $vr16, $vr15
addi.d $a0, $a0, -1
bne $a0, $r0, .lsx.fp32.fmadd.f32f32f32
jr $r1
@@ -59,23 +61,133 @@ lsx_fp64_fmadd_f64f64f64:
vxor.v $vr15, $vr15, $vr15
vxor.v $vr16, $vr16, $vr16
.lsx.fp64.fmadd.f64f64f64:
vfmadd.d $vr0, $vr24, $vr24, $vr0
vfmadd.d $vr1, $vr24, $vr24, $vr1
vfmadd.d $vr2, $vr24, $vr24, $vr2
vfmadd.d $vr3, $vr24, $vr24, $vr3
vfmadd.d $vr4, $vr24, $vr24, $vr4
vfmadd.d $vr5, $vr24, $vr24, $vr5
vfmadd.d $vr6, $vr24, $vr24, $vr6
vfmadd.d $vr7, $vr24, $vr24, $vr7
vfmadd.d $vr8, $vr24, $vr24, $vr8
vfmadd.d $vr9, $vr24, $vr24, $vr9
vfmadd.d $vr10, $vr24, $vr24, $vr10
vfmadd.d $vr11, $vr24, $vr24, $vr11
vfmadd.d $vr12, $vr24, $vr24, $vr12
vfmadd.d $vr13, $vr24, $vr24, $vr13
vfmadd.d $vr14, $vr24, $vr24, $vr14
vfmadd.d $vr15, $vr24, $vr24, $vr15
vfmadd.d $vr0, $vr16, $vr16, $vr0
vfmadd.d $vr1, $vr16, $vr16, $vr1
vfmadd.d $vr2, $vr16, $vr16, $vr2
vfmadd.d $vr3, $vr16, $vr16, $vr3
vfmadd.d $vr4, $vr16, $vr16, $vr4
vfmadd.d $vr5, $vr16, $vr16, $vr5
vfmadd.d $vr6, $vr16, $vr16, $vr6
vfmadd.d $vr7, $vr16, $vr16, $vr7
vfmadd.d $vr8, $vr16, $vr16, $vr8
vfmadd.d $vr9, $vr16, $vr16, $vr9
vfmadd.d $vr10, $vr16, $vr16, $vr10
vfmadd.d $vr11, $vr16, $vr16, $vr11
vfmadd.d $vr12, $vr16, $vr16, $vr12
vfmadd.d $vr13, $vr16, $vr16, $vr13
vfmadd.d $vr14, $vr16, $vr16, $vr14
vfmadd.d $vr15, $vr16, $vr16, $vr15
addi.d $a0, $a0, -1
bne $a0, $r0, .lsx.fp64.fmadd.f64f64f64
jr $r1

lsx_fp32_add_mul_f32f32_f32:
vxor.v $vr0, $vr0, $vr0
vxor.v $vr1, $vr1, $vr1
vxor.v $vr2, $vr2, $vr2
vxor.v $vr3, $vr3, $vr3
vxor.v $vr4, $vr4, $vr4
vxor.v $vr5, $vr5, $vr5
vxor.v $vr6, $vr6, $vr6
vxor.v $vr7, $vr7, $vr7
vxor.v $vr8, $vr8, $vr8
vxor.v $vr9, $vr9, $vr9
vxor.v $vr10, $vr10, $vr10
vxor.v $vr11, $vr11, $vr11
vxor.v $vr12, $vr12, $vr12
vxor.v $vr13, $vr13, $vr13
vxor.v $vr14, $vr14, $vr14
vxor.v $vr15, $vr15, $vr15
vxor.v $vr16, $vr16, $vr16
vxor.v $vr17, $vr17, $vr17
vxor.v $vr18, $vr18, $vr18
vxor.v $vr19, $vr19, $vr19
vxor.v $vr20, $vr20, $vr20
vxor.v $vr21, $vr21, $vr21
vxor.v $vr22, $vr22, $vr22
vxor.v $vr23, $vr23, $vr23
vxor.v $vr24, $vr24, $vr24
.lsx.fp32.add.mul.f32f32.f32:
vfmul.s $vr0, $vr24, $vr24
vfadd.s $vr1, $vr24, $vr24
vfmul.s $vr2, $vr24, $vr24
vfadd.s $vr3, $vr24, $vr24
vfmul.s $vr4, $vr24, $vr24
vfadd.s $vr5, $vr24, $vr24
vfmul.s $vr6, $vr24, $vr24
vfadd.s $vr7, $vr24, $vr24
vfmul.s $vr8, $vr24, $vr24
vfadd.s $vr9, $vr24, $vr24
vfmul.s $vr10, $vr24, $vr24
vfadd.s $vr11, $vr24, $vr24
vfmul.s $vr12, $vr24, $vr24
vfadd.s $vr13, $vr24, $vr24
vfmul.s $vr14, $vr24, $vr24
vfadd.s $vr15, $vr24, $vr24
vfmul.s $vr16, $vr24, $vr24
vfadd.s $vr17, $vr24, $vr24
vfmul.s $vr18, $vr24, $vr24
vfadd.s $vr19, $vr24, $vr24
vfmul.s $vr20, $vr24, $vr24
vfadd.s $vr21, $vr24, $vr24
vfmul.s $vr22, $vr24, $vr24
vfadd.s $vr23, $vr24, $vr24
addi.d $a0, $a0, -1
bne $a0, $r0, .lsx.fp32.add.mul.f32f32.f32
jr $r1

lsx_fp64_add_mul_f64f64_f64:
vxor.v $vr0, $vr0, $vr0
vxor.v $vr1, $vr1, $vr1
vxor.v $vr2, $vr2, $vr2
vxor.v $vr3, $vr3, $vr3
vxor.v $vr4, $vr4, $vr4
vxor.v $vr5, $vr5, $vr5
vxor.v $vr6, $vr6, $vr6
vxor.v $vr7, $vr7, $vr7
vxor.v $vr8, $vr8, $vr8
vxor.v $vr9, $vr9, $vr9
vxor.v $vr10, $vr10, $vr10
vxor.v $vr11, $vr11, $vr11
vxor.v $vr12, $vr12, $vr12
vxor.v $vr13, $vr13, $vr13
vxor.v $vr14, $vr14, $vr14
vxor.v $vr15, $vr15, $vr15
vxor.v $vr16, $vr16, $vr16
vxor.v $vr17, $vr17, $vr17
vxor.v $vr18, $vr18, $vr18
vxor.v $vr19, $vr19, $vr19
vxor.v $vr20, $vr20, $vr20
vxor.v $vr21, $vr21, $vr21
vxor.v $vr22, $vr22, $vr22
vxor.v $vr23, $vr23, $vr23
vxor.v $vr24, $vr24, $vr24
.lsx.fp64.add.mul.f64f64.f64:
vfmul.d $vr0, $vr24, $vr24
vfadd.d $vr1, $vr24, $vr24
vfmul.d $vr2, $vr24, $vr24
vfadd.d $vr3, $vr24, $vr24
vfmul.d $vr4, $vr24, $vr24
vfadd.d $vr5, $vr24, $vr24
vfmul.d $vr6, $vr24, $vr24
vfadd.d $vr7, $vr24, $vr24
vfmul.d $vr8, $vr24, $vr24
vfadd.d $vr9, $vr24, $vr24
vfmul.d $vr10, $vr24, $vr24
vfadd.d $vr11, $vr24, $vr24
vfmul.d $vr12, $vr24, $vr24
vfadd.d $vr13, $vr24, $vr24
vfmul.d $vr14, $vr24, $vr24
vfadd.d $vr15, $vr24, $vr24
vfmul.d $vr16, $vr24, $vr24
vfadd.d $vr17, $vr24, $vr24
vfmul.d $vr18, $vr24, $vr24
vfadd.d $vr19, $vr24, $vr24
vfmul.d $vr20, $vr24, $vr24
vfadd.d $vr21, $vr24, $vr24
vfmul.d $vr22, $vr24, $vr24
vfadd.d $vr23, $vr24, $vr24
addi.d $a0, $a0, -1
bne $a0, $r0, .lsx.fp64.add.mul.f64f64.f64
jr $r1
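
The C-side harness lives in the third changed file, which is not expanded on this page, but the calling convention can be read off the assembly: the loop count arrives in $a0 (the first integer argument register in the LoongArch64 LP64 ABI) and is counted down to zero, and jr $r1 returns through the link register. A hypothetical driver under that assumption (the prototype is inferred, not shown in this diff):

#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Exported by _LASX_.S; takes the iteration count in $a0. */
extern void lasx_fp32_add_mul_f32f32_f32(uint64_t iters);

int main(void)
{
    const uint64_t iters = 1ULL << 26; /* arbitrary run length */
    struct timespec t0, t1;

    clock_gettime(CLOCK_MONOTONIC, &t0);
    lasx_fp32_add_mul_f32f32_f32(iters);
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double sec = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
    /* 24 vector FP instructions * 8 fp32 lanes per iteration */
    printf("%.2f GFLOPS\n", (double)iters * 24.0 * 8.0 / sec * 1e-9);
    return 0;
}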

