Skip to content

Commit

Permalink
copypadmul2
Browse files Browse the repository at this point in the history
  • Loading branch information
barnex committed Sep 14, 2016
1 parent 98028f3 commit 5496e33
Show file tree
Hide file tree
Showing 17 changed files with 1,750 additions and 1,585 deletions.
10 changes: 4 additions & 6 deletions cuda/conv_copypad.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
package cuda

import (
"unsafe"

"github.com/mumax/3/data"
"github.com/mumax/3/util"
)
Expand All @@ -22,13 +20,13 @@ func copyUnPad(dst, src *data.Slice, dstsize, srcsize [3]int) {
// Copies src into dst, which is larger, and multiplies by vol*Bsat.
// The remainder of dst is not filled with zeros.
// Used to zero-pad magnetization before convolution and in the meanwhile multiply m by its length.
func copyPadMul(dst, src, vol *data.Slice, dstsize, srcsize [3]int, Bsat LUTPtr, regions *Bytes) {
func copyPadMul(dst, src, vol *data.Slice, dstsize, srcsize [3]int, Msat MSlice) {
util.Argument(dst.NComp() == 1 && src.NComp() == 1)
util.Assert(dst.Len() == prod(dstsize) && src.Len() == prod(srcsize))

cfg := make3DConf(srcsize)

k_copypadmul_async(dst.DevPtr(0), dstsize[X], dstsize[Y], dstsize[Z],
src.DevPtr(0), vol.DevPtr(0), srcsize[X], srcsize[Y], srcsize[Z],
unsafe.Pointer(Bsat), regions.Ptr, cfg)
k_copypadmul2_async(dst.DevPtr(0), dstsize[X], dstsize[Y], dstsize[Z],
src.DevPtr(0), srcsize[X], srcsize[Y], srcsize[Z],
Msat.DevPtr(0), Msat.Mul(0), vol.DevPtr(0), cfg)
}
22 changes: 11 additions & 11 deletions cuda/conv_demag.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,18 +37,18 @@ func NewDemag(inputSize, PBC [3]int, kernel [3][3]*data.Slice, test bool) *Demag
// vol: unitless mask used to scale m's length, may be nil
// Bsat: saturation magnetization in Tesla
// B: resulting demag field, in Tesla
func (c *DemagConvolution) Exec(B, m, vol *data.Slice, Bsat LUTPtr, regions *Bytes) {
func (c *DemagConvolution) Exec(B, m, vol *data.Slice, Msat MSlice) {
util.Argument(B.Size() == c.inputSize && m.Size() == c.inputSize)
if c.is2D() {
c.exec2D(B, m, vol, Bsat, regions)
c.exec2D(B, m, vol, Msat)
} else {
c.exec3D(B, m, vol, Bsat, regions)
c.exec3D(B, m, vol, Msat)
}
}

func (c *DemagConvolution) exec3D(outp, inp, vol *data.Slice, Bsat LUTPtr, regions *Bytes) {
func (c *DemagConvolution) exec3D(outp, inp, vol *data.Slice, Msat MSlice) {
for i := 0; i < 3; i++ { // FW FFT
c.fwFFT(i, inp, vol, Bsat, regions)
c.fwFFT(i, inp, vol, Msat)
}

// kern mul
Expand All @@ -62,20 +62,20 @@ func (c *DemagConvolution) exec3D(outp, inp, vol *data.Slice, Bsat LUTPtr, regio
}
}

func (c *DemagConvolution) exec2D(outp, inp, vol *data.Slice, Bsat LUTPtr, regions *Bytes) {
func (c *DemagConvolution) exec2D(outp, inp, vol *data.Slice, Msat MSlice) {
// Convolution is separated into
// a 1D convolution for z and a 2D convolution for xy.
// So only 2 FFT buffers are needed at the same time.
Nx, Ny := c.fftKernLogicSize[X], c.fftKernLogicSize[Y]

// Z
c.fwFFT(Z, inp, vol, Bsat, regions)
c.fwFFT(Z, inp, vol, Msat)
kernMulRSymm2Dz_async(c.fftCBuf[Z], c.kern[Z][Z], Nx, Ny)
c.bwFFT(Z, outp)

// XY
c.fwFFT(X, inp, vol, Bsat, regions)
c.fwFFT(Y, inp, vol, Bsat, regions)
c.fwFFT(X, inp, vol, Msat)
c.fwFFT(Y, inp, vol, Msat)
kernMulRSymm2Dxy_async(c.fftCBuf[X], c.fftCBuf[Y],
c.kern[X][X], c.kern[Y][Y], c.kern[X][Y], Nx, Ny)
c.bwFFT(X, outp)
Expand All @@ -92,10 +92,10 @@ func zero1_async(dst *data.Slice) {
}

// forward FFT component i
func (c *DemagConvolution) fwFFT(i int, inp, vol *data.Slice, Bsat LUTPtr, regions *Bytes) {
func (c *DemagConvolution) fwFFT(i int, inp, vol *data.Slice, Msat MSlice) {
zero1_async(c.fftRBuf[i])
in := inp.Comp(i)
copyPadMul(c.fftRBuf[i], in, vol, c.realKernSize, c.inputSize, Bsat, regions)
copyPadMul(c.fftRBuf[i], in, vol, c.realKernSize, c.inputSize, Msat)
c.fwPlan.ExecAsync(c.fftRBuf[i], c.fftCBuf[i])
}

Expand Down
4 changes: 2 additions & 2 deletions cuda/conv_mfm.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,10 @@ func (c *MFMConvolution) initFFTKern3D() {
}

// store MFM image in output, based on magnetization in inp.
func (c *MFMConvolution) Exec(outp, inp, vol *data.Slice, Bsat LUTPtr, regions *Bytes) {
func (c *MFMConvolution) Exec(outp, inp, vol *data.Slice, Msat MSlice) {
for i := 0; i < 3; i++ {
zero1_async(c.fftRBuf)
copyPadMul(c.fftRBuf, inp.Comp(i), vol, c.kernSize, c.size, Bsat, regions)
copyPadMul(c.fftRBuf, inp.Comp(i), vol, c.kernSize, c.size, Msat)
c.fwPlan.ExecAsync(c.fftRBuf, c.fftCBuf)

Nx, Ny := c.fftKernSize[X]/2, c.fftKernSize[Y] // ??
Expand Down
11 changes: 4 additions & 7 deletions cuda/conv_selftest.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,12 @@ func testConvolution(c *DemagConvolution, PBC [3]int, realKern [3][3]*data.Slice
defer gpu.Free()
data.Copy(gpu, inhost)

regions := NewBytes(prod(c.inputSize))
defer regions.Free()
Bsat := NewSlice(1, [3]int{1, 1, 256})
defer Bsat.Free()
Memset(Bsat, 1)
BsatLUT := LUTPtr(Bsat.DevPtr(0))
Msat := NewSlice(1, [3]int{1, 1, 256})
defer Msat.Free()
Memset(Msat, 1)

vol := data.NilSlice(1, c.inputSize)
c.Exec(gpu, gpu, vol, BsatLUT, regions)
c.Exec(gpu, gpu, vol, ToMSlice(Msat))

output := gpu.HostCopy()

Expand Down
15 changes: 9 additions & 6 deletions cuda/copypadmul.cu → cuda/copypadmul2.cu
Original file line number Diff line number Diff line change
@@ -1,21 +1,24 @@
#include "amul.h"
#include "constants.h"
#include "stencil.h"
#include <stdint.h>

// Copy src (size S, smaller) into dst (size D, larger),
// and multiply by Bsat as defined in regions.
// and multiply by Bsat * vol
extern "C" __global__ void
copypadmul(float* __restrict__ dst, int Dx, int Dy, int Dz,
float* __restrict__ src, float* __restrict__ vol, int Sx, int Sy, int Sz,
float* __restrict__ BsatLUT, uint8_t* __restrict__ regions) {
copypadmul2(float* __restrict__ dst, int Dx, int Dy, int Dz,
float* __restrict__ src, int Sx, int Sy, int Sz,
float* __restrict__ Ms_, float Ms_mul,
float* __restrict__ vol) {

int ix = blockIdx.x * blockDim.x + threadIdx.x;
int iy = blockIdx.y * blockDim.y + threadIdx.y;
int iz = blockIdx.z * blockDim.z + threadIdx.z;

if (ix<Sx && iy<Sy && iz<Sz) {
int sI = index(ix, iy, iz, Sx, Sy, Sz); // source index
float Bsat = BsatLUT[regions[sI]];
float v = (vol == NULL? 1.0f: vol[sI]);
float Bsat = MU0 * amul(Ms_, Ms_mul, sI);
float v = amul(vol, 1.0f, sI);
dst[index(ix, iy, iz, Dx, Dy, Dz)] = Bsat * v * src[sI];
}
}
Expand Down
Loading

0 comments on commit 5496e33

Please sign in to comment.