copypadmul2

kidandcat · Sep 14, 2016 · 5496e33 · 5496e33
1 parent 98028f3
commit 5496e33
Show file tree

Hide file tree

Showing 17 changed files with 1,750 additions and 1,585 deletions.
diff --git a/cuda/conv_copypad.go b/cuda/conv_copypad.go
@@ -1,8 +1,6 @@
 package cuda
 
 import (
-	"unsafe"
-
 	"github.com/mumax/3/data"
 	"github.com/mumax/3/util"
 )
@@ -22,13 +20,13 @@ func copyUnPad(dst, src *data.Slice, dstsize, srcsize [3]int) {
 // Copies src into dst, which is larger, and multiplies by vol*Bsat.
 // The remainder of dst is not filled with zeros.
 // Used to zero-pad magnetization before convolution and in the meanwhile multiply m by its length.
-func copyPadMul(dst, src, vol *data.Slice, dstsize, srcsize [3]int, Bsat LUTPtr, regions *Bytes) {
+func copyPadMul(dst, src, vol *data.Slice, dstsize, srcsize [3]int, Msat MSlice) {
 	util.Argument(dst.NComp() == 1 && src.NComp() == 1)
 	util.Assert(dst.Len() == prod(dstsize) && src.Len() == prod(srcsize))
 
 	cfg := make3DConf(srcsize)
 
-	k_copypadmul_async(dst.DevPtr(0), dstsize[X], dstsize[Y], dstsize[Z],
-		src.DevPtr(0), vol.DevPtr(0), srcsize[X], srcsize[Y], srcsize[Z],
-		unsafe.Pointer(Bsat), regions.Ptr, cfg)
+	k_copypadmul2_async(dst.DevPtr(0), dstsize[X], dstsize[Y], dstsize[Z],
+		src.DevPtr(0), srcsize[X], srcsize[Y], srcsize[Z],
+		Msat.DevPtr(0), Msat.Mul(0), vol.DevPtr(0), cfg)
 }
diff --git a/cuda/conv_demag.go b/cuda/conv_demag.go
@@ -37,18 +37,18 @@ func NewDemag(inputSize, PBC [3]int, kernel [3][3]*data.Slice, test bool) *Demag
 // 	vol:  unitless mask used to scale m's length, may be nil
 // 	Bsat: saturation magnetization in Tesla
 // 	B:    resulting demag field, in Tesla
-func (c *DemagConvolution) Exec(B, m, vol *data.Slice, Bsat LUTPtr, regions *Bytes) {
+func (c *DemagConvolution) Exec(B, m, vol *data.Slice, Msat MSlice) {
 	util.Argument(B.Size() == c.inputSize && m.Size() == c.inputSize)
 	if c.is2D() {
-		c.exec2D(B, m, vol, Bsat, regions)
+		c.exec2D(B, m, vol, Msat)
 	} else {
-		c.exec3D(B, m, vol, Bsat, regions)
+		c.exec3D(B, m, vol, Msat)
 	}
 }
 
-func (c *DemagConvolution) exec3D(outp, inp, vol *data.Slice, Bsat LUTPtr, regions *Bytes) {
+func (c *DemagConvolution) exec3D(outp, inp, vol *data.Slice, Msat MSlice) {
 	for i := 0; i < 3; i++ { // FW FFT
-		c.fwFFT(i, inp, vol, Bsat, regions)
+		c.fwFFT(i, inp, vol, Msat)
 	}
 
 	// kern mul
@@ -62,20 +62,20 @@ func (c *DemagConvolution) exec3D(outp, inp, vol *data.Slice, Bsat LUTPtr, regio
 	}
 }
 
-func (c *DemagConvolution) exec2D(outp, inp, vol *data.Slice, Bsat LUTPtr, regions *Bytes) {
+func (c *DemagConvolution) exec2D(outp, inp, vol *data.Slice, Msat MSlice) {
 	// Convolution is separated into
 	// a 1D convolution for z and a 2D convolution for xy.
 	// So only 2 FFT buffers are needed at the same time.
 	Nx, Ny := c.fftKernLogicSize[X], c.fftKernLogicSize[Y]
 
 	// Z
-	c.fwFFT(Z, inp, vol, Bsat, regions)
+	c.fwFFT(Z, inp, vol, Msat)
 	kernMulRSymm2Dz_async(c.fftCBuf[Z], c.kern[Z][Z], Nx, Ny)
 	c.bwFFT(Z, outp)
 
 	// XY
-	c.fwFFT(X, inp, vol, Bsat, regions)
-	c.fwFFT(Y, inp, vol, Bsat, regions)
+	c.fwFFT(X, inp, vol, Msat)
+	c.fwFFT(Y, inp, vol, Msat)
 	kernMulRSymm2Dxy_async(c.fftCBuf[X], c.fftCBuf[Y],
 		c.kern[X][X], c.kern[Y][Y], c.kern[X][Y], Nx, Ny)
 	c.bwFFT(X, outp)
@@ -92,10 +92,10 @@ func zero1_async(dst *data.Slice) {
 }
 
 // forward FFT component i
-func (c *DemagConvolution) fwFFT(i int, inp, vol *data.Slice, Bsat LUTPtr, regions *Bytes) {
+func (c *DemagConvolution) fwFFT(i int, inp, vol *data.Slice, Msat MSlice) {
 	zero1_async(c.fftRBuf[i])
 	in := inp.Comp(i)
-	copyPadMul(c.fftRBuf[i], in, vol, c.realKernSize, c.inputSize, Bsat, regions)
+	copyPadMul(c.fftRBuf[i], in, vol, c.realKernSize, c.inputSize, Msat)
 	c.fwPlan.ExecAsync(c.fftRBuf[i], c.fftCBuf[i])
 }
 

diff --git a/cuda/conv_mfm.go b/cuda/conv_mfm.go
@@ -72,10 +72,10 @@ func (c *MFMConvolution) initFFTKern3D() {
 }
 
 // store MFM image in output, based on magnetization in inp.
-func (c *MFMConvolution) Exec(outp, inp, vol *data.Slice, Bsat LUTPtr, regions *Bytes) {
+func (c *MFMConvolution) Exec(outp, inp, vol *data.Slice, Msat MSlice) {
 	for i := 0; i < 3; i++ {
 		zero1_async(c.fftRBuf)
-		copyPadMul(c.fftRBuf, inp.Comp(i), vol, c.kernSize, c.size, Bsat, regions)
+		copyPadMul(c.fftRBuf, inp.Comp(i), vol, c.kernSize, c.size, Msat)
 		c.fwPlan.ExecAsync(c.fftRBuf, c.fftCBuf)
 
 		Nx, Ny := c.fftKernSize[X]/2, c.fftKernSize[Y] //   ??

diff --git a/cuda/conv_selftest.go b/cuda/conv_selftest.go
@@ -24,15 +24,12 @@ func testConvolution(c *DemagConvolution, PBC [3]int, realKern [3][3]*data.Slice
 	defer gpu.Free()
 	data.Copy(gpu, inhost)
 
-	regions := NewBytes(prod(c.inputSize))
-	defer regions.Free()
-	Bsat := NewSlice(1, [3]int{1, 1, 256})
-	defer Bsat.Free()
-	Memset(Bsat, 1)
-	BsatLUT := LUTPtr(Bsat.DevPtr(0))
+	Msat := NewSlice(1, [3]int{1, 1, 256})
+	defer Msat.Free()
+	Memset(Msat, 1)
 
 	vol := data.NilSlice(1, c.inputSize)
-	c.Exec(gpu, gpu, vol, BsatLUT, regions)
+	c.Exec(gpu, gpu, vol, ToMSlice(Msat))
 
 	output := gpu.HostCopy()
 

diff --git a/cuda/copypadmul.cu → cuda/copypadmul2.cu b/cuda/copypadmul.cu → cuda/copypadmul2.cu
@@ -1,21 +1,24 @@
+#include "amul.h"
+#include "constants.h"
 #include "stencil.h"
 #include <stdint.h>
 
 // Copy src (size S, smaller) into dst (size D, larger),
-// and multiply by Bsat as defined in regions.
+// and multiply by Bsat * vol
 extern "C" __global__ void
-copypadmul(float* __restrict__ dst, int Dx, int Dy, int Dz,
-           float* __restrict__ src, float* __restrict__ vol, int Sx, int Sy, int Sz,
-           float* __restrict__ BsatLUT, uint8_t* __restrict__ regions) {
+copypadmul2(float* __restrict__ dst, int Dx, int Dy, int Dz,
+            float* __restrict__ src, int Sx, int Sy, int Sz,
+            float* __restrict__ Ms_, float Ms_mul, 
+		    float* __restrict__ vol) {
 
 	int ix = blockIdx.x * blockDim.x + threadIdx.x;
 	int iy = blockIdx.y * blockDim.y + threadIdx.y;
 	int iz = blockIdx.z * blockDim.z + threadIdx.z;
 
 	if (ix<Sx && iy<Sy && iz<Sz) {
 		int sI = index(ix, iy, iz, Sx, Sy, Sz);  // source index
-		float Bsat = BsatLUT[regions[sI]];
-		float v = (vol == NULL? 1.0f: vol[sI]);
+		float Bsat = MU0 * amul(Ms_, Ms_mul, sI);
+		float v = amul(vol, 1.0f, sI);
 		dst[index(ix, iy, iz, Dx, Dy, Dz)] = Bsat * v * src[sI];
 	}
 }