
Commit e1a38c4
increase the size of host arrays when using more powerful GPUs
git-svn-id: https://msieve.svn.sourceforge.net/svnroot/msieve/trunk@236 8c02911e-0617-4013-80d0-c3bcbcae04d5
jasonp_sf committed Mar 11, 2010
1 parent c60eeb8 commit e1a38c4
Showing 7 changed files with 155 additions and 31 deletions.
Changes: 2 additions & 0 deletions
@@ -7,6 +7,8 @@ Version 1.45:
for inputs < 135 digits about 35% faster
- Fixed some degree 5 synchronization issues (thanks
Jayson King)
- Added code to increase the size of host arrays when using
more powerful GPUs (thanks Paul Zimmermann)
- Added code to automatically randomize the search for
inputs that are large enough
- Made the cutoff E-value more aggressive for the largest
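The common change in each GPU sieve file below is that the compile-time HOST_BATCH_SIZE limits are removed and the host-side p and q batch sizes are instead computed from L->found_array_size, so larger cards fill larger host arrays per pass. A minimal standalone sketch of that sizing rule; the found_array_size values here are hypothetical, chosen only to illustrate the arithmetic, and MAX is redefined locally to keep the sketch self-contained:

#include <stdio.h>
#include <stdint.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))   /* msieve has its own MAX macro */

int main(void)
{
	/* hypothetical stand-ins for L->found_array_size on small vs. large cards */
	uint32_t found_array_sizes[] = { 8192, 49152, 196608 };
	size_t j;

	for (j = 0; j < sizeof(found_array_sizes) / sizeof(found_array_sizes[0]); j++) {
		uint32_t found_array_size = found_array_sizes[j];

		/* the formulas added in this commit: floors of 10000 and 50000
		   keep small GPUs near the old fixed batch sizes, while a large
		   found array scales the host batches up */
		uint32_t host_p_batch_size = MAX(10000, found_array_size / 3);
		uint32_t host_q_batch_size = MAX(50000, 12 * found_array_size);

		printf("found_array_size %u -> p batch %u, q batch %u\n",
				found_array_size, host_p_batch_size,
				host_q_batch_size);
	}
	return 0;
}

For the (made-up) found_array_size of 196608 this would give a q batch of 2359296 entries, roughly 47 times the old fixed HOST_BATCH_SIZE of 50000 used in the degree-4/6 files.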
gnfs/poly/stage1/stage1_core_gpu/stage1_sieve_deg46_64.c: 7 additions & 4 deletions
@@ -15,8 +15,6 @@ benefit from your work.
#include <stage1.h>
#include "stage1_core_deg46_64.h"

#define HOST_BATCH_SIZE 50000

/*------------------------------------------------------------------------*/
typedef struct {
uint32 num_roots;
@@ -421,6 +419,8 @@ sieve_lattice_deg46_64(msieve_obj *obj, lattice_fb_t *L,
uint32 threads_per_block;
gpu_info_t *gpu_info = L->gpu_info;
CUfunction gpu_kernel = L->gpu_kernel;
uint32 host_p_batch_size;
uint32 host_q_batch_size;

L->q_marshall = (q_soa_t *)xmalloc(sizeof(q_soa_t));
q_array = L->q_array = (q_soa_array_t *)xmalloc(
@@ -446,6 +446,9 @@ sieve_lattice_deg46_64(msieve_obj *obj, lattice_fb_t *L,
CUDA_TRY(cuMemAlloc(&L->gpu_found_array,
L->found_array_size * sizeof(found_t)))

host_p_batch_size = MAX(10000, L->found_array_size / 3);
host_q_batch_size = MAX(50000, 12 * L->found_array_size);

printf("------- %u-%u %u-%u\n",
small_p_min, small_p_max,
large_p_min, large_p_max);
@@ -472,7 +475,7 @@ sieve_lattice_deg46_64(msieve_obj *obj, lattice_fb_t *L,

q_soa_array_reset(q_array);

for (i = 0; i < HOST_BATCH_SIZE &&
for (i = 0; i < host_q_batch_size &&
min_large != (uint32)P_SEARCH_DONE; i++) {
min_large = sieve_fb_next(sieve_small, L->poly,
store_p_soa, L);
@@ -492,7 +495,7 @@ sieve_lattice_deg46_64(msieve_obj *obj, lattice_fb_t *L,

p_packed_reset(p_array);

for (i = 0; i < HOST_BATCH_SIZE &&
for (i = 0; i < host_p_batch_size &&
min_small != (uint32)P_SEARCH_DONE; i++) {
min_small = sieve_fb_next(sieve_large, L->poly,
store_p_packed, L);
gnfs/poly/stage1/stage1_core_gpu/stage1_sieve_deg5_128.c: 49 additions & 5 deletions
@@ -22,11 +22,46 @@ typedef struct {
uint32 num_p;
uint64 last_p;

uint64 p[HOST_BATCH_SIZE];
uint64 lattice_size[HOST_BATCH_SIZE];
uint32 roots[4*POLY_BATCH_SIZE][HOST_BATCH_SIZE];
uint64 *p;
uint64 *lattice_size;
uint32 *roots[4*POLY_BATCH_SIZE];
} p_soa_var_t;

static void
p_soa_var_init(p_soa_var_t *soa, uint32 batch_size)
{
uint32 i;

memset(soa, 0, sizeof(*soa));
soa->p = (uint64 *)xmalloc(batch_size * sizeof(uint64));
soa->lattice_size = (uint64 *)xmalloc(batch_size * sizeof(uint64));
for (i = 0; i < 4 * POLY_BATCH_SIZE; i += 4) {
soa->roots[i] = (uint32 *)xmalloc(batch_size *
sizeof(uint32));
soa->roots[i+1] = (uint32 *)xmalloc(batch_size *
sizeof(uint32));
soa->roots[i+2] = (uint32 *)xmalloc(batch_size *
sizeof(uint32));
soa->roots[i+3] = (uint32 *)xmalloc(batch_size *
sizeof(uint32));
}
}

static void
p_soa_var_free(p_soa_var_t *soa)
{
uint32 i;

free(soa->p);
free(soa->lattice_size);
for (i = 0; i < 4 * POLY_BATCH_SIZE; i += 4) {
free(soa->roots[i]);
free(soa->roots[i+1]);
free(soa->roots[i+2]);
free(soa->roots[i+3]);
}
}

static void
p_soa_var_reset(p_soa_var_t *soa)
{
@@ -243,6 +278,8 @@ sieve_lattice_deg5_128(msieve_obj *obj, lattice_fb_t *L,
uint32 threads_per_block;
gpu_info_t *gpu_info = L->gpu_info;
CUfunction gpu_kernel = L->gpu_kernel;
uint32 host_p_batch_size;
uint32 host_q_batch_size;

L->p_marshall = (p_soa_t *)xmalloc(sizeof(p_soa_t));
L->q_marshall = (q_soa_t *)xmalloc(sizeof(q_soa_t));
@@ -268,6 +305,11 @@ sieve_lattice_deg5_128(msieve_obj *obj, lattice_fb_t *L,
CUDA_TRY(cuMemAlloc(&L->gpu_found_array,
L->found_array_size * sizeof(found_t)))

host_p_batch_size = MAX(10000, L->found_array_size / 3);
host_q_batch_size = MAX(50000, 12 * L->found_array_size);
p_soa_var_init(p_array, host_p_batch_size);
p_soa_var_init(q_array, host_q_batch_size);

printf("------- %" PRIu64 "-%" PRIu64 " %" PRIu64 "-%" PRIu64 "\n",
small_p_min, small_p_max,
large_p_min, large_p_max);
@@ -280,7 +322,7 @@ sieve_lattice_deg5_128(msieve_obj *obj, lattice_fb_t *L,

L->fill_p = 0;
p_soa_var_reset(q_array);
for (i = 0; i < HOST_BATCH_SIZE &&
for (i = 0; i < host_q_batch_size &&
min_large != P_SEARCH_DONE; i++) {
min_large = sieve_fb_next(sieve_small, L->poly,
store_p_soa, L);
@@ -297,7 +339,7 @@ sieve_lattice_deg5_128(msieve_obj *obj, lattice_fb_t *L,

L->fill_p = 1;
p_soa_var_reset(p_array);
for (i = 0; i < HOST_BATCH_SIZE &&
for (i = 0; i < host_p_batch_size &&
min_small != P_SEARCH_DONE; i++) {
min_small = sieve_fb_next(sieve_large, L->poly,
store_p_soa, L);
@@ -325,6 +367,8 @@ sieve_lattice_deg5_128(msieve_obj *obj, lattice_fb_t *L,
CUDA_TRY(cuMemFree(L->gpu_p_array))
CUDA_TRY(cuMemFree(L->gpu_q_array))
CUDA_TRY(cuMemFree(L->gpu_found_array))
p_soa_var_free(p_array);
p_soa_var_free(q_array);
free(p_array);
free(q_array);
free(L->p_marshall);
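Because the degree-5 structures can no longer be sized at compile time, the file above converts the fixed structure-of-arrays members to pointers and brackets the existing reset-and-fill loops with p_soa_var_init / p_soa_var_free. A reduced sketch of that lifecycle; the struct here is a simplified stand-in (one p array and one roots array, not the real per-polynomial layout of p_soa_var_t), and plain malloc stands in for msieve's xmalloc:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

/* simplified stand-in for the heap-backed structure-of-arrays above */
typedef struct {
	uint32_t num_p;
	uint32_t *p;        /* was: uint32 p[HOST_BATCH_SIZE] */
	uint64_t *roots;    /* was: a fixed 2-D array, one row per polynomial */
} soa_sketch_t;

static void soa_sketch_init(soa_sketch_t *soa, uint32_t batch_size)
{
	memset(soa, 0, sizeof(*soa));
	soa->p = (uint32_t *)malloc(batch_size * sizeof(uint32_t));
	soa->roots = (uint64_t *)malloc(batch_size * sizeof(uint64_t));
}

static void soa_sketch_free(soa_sketch_t *soa)
{
	free(soa->p);
	free(soa->roots);
}

int main(void)
{
	uint32_t found_array_size = 49152;   /* hypothetical GPU-derived value */
	uint32_t host_q_batch_size = 12 * found_array_size > 50000 ?
				12 * found_array_size : 50000;
	soa_sketch_t q_array;
	uint32_t i;

	soa_sketch_init(&q_array, host_q_batch_size);

	/* the fill loops are now bounded by the run-time batch size
	   instead of a compile-time HOST_BATCH_SIZE */
	for (i = 0; i < host_q_batch_size; i++) {
		q_array.p[i] = 2 * i + 3;        /* placeholder entries */
		q_array.roots[i] = (uint64_t)i;
	}
	q_array.num_p = i;

	printf("filled %u entries\n", q_array.num_p);
	soa_sketch_free(&q_array);
	return 0;
}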
gnfs/poly/stage1/stage1_core_gpu/stage1_sieve_deg5_64.c: 37 additions & 7 deletions
@@ -15,18 +15,39 @@ benefit from your work.
#include <stage1.h>
#include "stage1_core_deg5_64.h"

#define HOST_BATCH_SIZE (104*384)

/*------------------------------------------------------------------------*/
typedef struct {
uint32 num_p;
uint32 last_p;

uint32 p[HOST_BATCH_SIZE];
uint32 lattice_size[HOST_BATCH_SIZE];
uint64 roots[POLY_BATCH_SIZE][HOST_BATCH_SIZE];
uint32 *p;
uint32 *lattice_size;
uint64 *roots[POLY_BATCH_SIZE];
} p_soa_var_t;

static void
p_soa_var_init(p_soa_var_t *soa, uint32 batch_size)
{
uint32 i;

memset(soa, 0, sizeof(*soa));
soa->p = (uint32 *)xmalloc(batch_size * sizeof(uint32));
soa->lattice_size = (uint32 *)xmalloc(batch_size * sizeof(uint32));
for (i = 0; i < POLY_BATCH_SIZE; i++)
soa->roots[i] = (uint64 *)xmalloc(batch_size * sizeof(uint64));
}

static void
p_soa_var_free(p_soa_var_t *soa)
{
uint32 i;

free(soa->p);
free(soa->lattice_size);
for (i = 0; i < POLY_BATCH_SIZE; i++)
free(soa->roots[i]);
}

static void
p_soa_var_reset(p_soa_var_t *soa)
{
@@ -244,6 +265,8 @@ sieve_lattice_deg5_64(msieve_obj *obj, lattice_fb_t *L,
p_soa_var_t * p_array;
p_soa_var_t * q_array;
uint32 num_poly = L->poly->num_poly;
uint32 host_p_batch_size;
uint32 host_q_batch_size;

uint32 threads_per_block;
gpu_info_t *gpu_info = L->gpu_info;
@@ -273,6 +296,11 @@ sieve_lattice_deg5_64(msieve_obj *obj, lattice_fb_t *L,
CUDA_TRY(cuMemAlloc(&L->gpu_found_array,
L->found_array_size * sizeof(found_t)))

host_p_batch_size = MAX(10000, L->found_array_size / 3);
host_q_batch_size = MAX(50000, 12 * L->found_array_size);
p_soa_var_init(p_array, host_p_batch_size);
p_soa_var_init(q_array, host_q_batch_size);

printf("------- %u-%u %u-%u\n",
small_p_min, small_p_max,
large_p_min, large_p_max);
@@ -285,7 +313,7 @@ sieve_lattice_deg5_64(msieve_obj *obj, lattice_fb_t *L,

L->fill_p = 0;
p_soa_var_reset(q_array);
for (i = 0; i < HOST_BATCH_SIZE &&
for (i = 0; i < host_q_batch_size &&
min_large != (uint32)P_SEARCH_DONE; i++) {
min_large = sieve_fb_next(sieve_small, L->poly,
store_p_soa, L);
@@ -302,7 +330,7 @@ sieve_lattice_deg5_64(msieve_obj *obj, lattice_fb_t *L,

L->fill_p = 1;
p_soa_var_reset(p_array);
for (i = 0; i < HOST_BATCH_SIZE &&
for (i = 0; i < host_p_batch_size &&
min_small != (uint32)P_SEARCH_DONE; i++) {
min_small = sieve_fb_next(sieve_large, L->poly,
store_p_soa, L);
@@ -330,6 +358,8 @@ sieve_lattice_deg5_64(msieve_obj *obj, lattice_fb_t *L,
CUDA_TRY(cuMemFree(L->gpu_p_array))
CUDA_TRY(cuMemFree(L->gpu_q_array))
CUDA_TRY(cuMemFree(L->gpu_found_array))
p_soa_var_free(p_array);
p_soa_var_free(q_array);
free(p_array);
free(q_array);
free(L->p_marshall);
gnfs/poly/stage1/stage1_core_gpu/stage1_sieve_deg5_96.c: 46 additions & 7 deletions
@@ -15,18 +15,48 @@ benefit from your work.
#include <stage1.h>
#include "stage1_core_deg5_96.h"

#define HOST_BATCH_SIZE (156*256)

/*------------------------------------------------------------------------*/
typedef struct {
uint32 num_p;
uint64 last_p;

uint64 p[HOST_BATCH_SIZE];
uint64 lattice_size[HOST_BATCH_SIZE];
uint32 roots[3*POLY_BATCH_SIZE][HOST_BATCH_SIZE];
uint64 *p;
uint64 *lattice_size;
uint32 *roots[3*POLY_BATCH_SIZE];
} p_soa_var_t;

static void
p_soa_var_init(p_soa_var_t *soa, uint32 batch_size)
{
uint32 i;

memset(soa, 0, sizeof(*soa));
soa->p = (uint64 *)xmalloc(batch_size * sizeof(uint64));
soa->lattice_size = (uint64 *)xmalloc(batch_size * sizeof(uint64));
for (i = 0; i < 3 * POLY_BATCH_SIZE; i += 3) {
soa->roots[i] = (uint32 *)xmalloc(batch_size *
sizeof(uint32));
soa->roots[i+1] = (uint32 *)xmalloc(batch_size *
sizeof(uint32));
soa->roots[i+2] = (uint32 *)xmalloc(batch_size *
sizeof(uint32));
}
}

static void
p_soa_var_free(p_soa_var_t *soa)
{
uint32 i;

free(soa->p);
free(soa->lattice_size);
for (i = 0; i < 3 * POLY_BATCH_SIZE; i += 3) {
free(soa->roots[i]);
free(soa->roots[i+1]);
free(soa->roots[i+2]);
}
}

static void
p_soa_var_reset(p_soa_var_t *soa)
{
@@ -248,6 +278,8 @@ sieve_lattice_deg5_96(msieve_obj *obj, lattice_fb_t *L,
p_soa_var_t * p_array;
p_soa_var_t * q_array;
uint32 num_poly = L->poly->num_poly;
uint32 host_p_batch_size;
uint32 host_q_batch_size;

uint32 threads_per_block;
gpu_info_t *gpu_info = L->gpu_info;
@@ -277,6 +309,11 @@ sieve_lattice_deg5_96(msieve_obj *obj, lattice_fb_t *L,
CUDA_TRY(cuMemAlloc(&L->gpu_found_array,
L->found_array_size * sizeof(found_t)))

host_p_batch_size = MAX(10000, L->found_array_size / 3);
host_q_batch_size = MAX(50000, 12 * L->found_array_size);
p_soa_var_init(p_array, host_p_batch_size);
p_soa_var_init(q_array, host_q_batch_size);

printf("------- %" PRIu64 "-%" PRIu64 " %" PRIu64 "-%" PRIu64 "\n",
small_p_min, small_p_max,
large_p_min, large_p_max);
@@ -289,7 +326,7 @@ sieve_lattice_deg5_96(msieve_obj *obj, lattice_fb_t *L,

L->fill_p = 0;
p_soa_var_reset(q_array);
for (i = 0; i < HOST_BATCH_SIZE &&
for (i = 0; i < host_q_batch_size &&
min_large != P_SEARCH_DONE; i++) {
min_large = sieve_fb_next(sieve_small, L->poly,
store_p_soa, L);
@@ -306,7 +343,7 @@ sieve_lattice_deg5_96(msieve_obj *obj, lattice_fb_t *L,

L->fill_p = 1;
p_soa_var_reset(p_array);
for (i = 0; i < HOST_BATCH_SIZE &&
for (i = 0; i < host_p_batch_size &&
min_small != P_SEARCH_DONE; i++) {
min_small = sieve_fb_next(sieve_large, L->poly,
store_p_soa, L);
@@ -334,6 +371,8 @@ sieve_lattice_deg5_96(msieve_obj *obj, lattice_fb_t *L,
CUDA_TRY(cuMemFree(L->gpu_p_array))
CUDA_TRY(cuMemFree(L->gpu_q_array))
CUDA_TRY(cuMemFree(L->gpu_found_array))
p_soa_var_free(p_array);
p_soa_var_free(q_array);
free(p_array);
free(q_array);
free(L->p_marshall);
gnfs/poly/stage1/stage1_core_gpu/stage1_sieve_deg6_128.c: 7 additions & 4 deletions
@@ -15,8 +15,6 @@ benefit from your work.
#include <stage1.h>
#include "stage1_core_deg6_128.h"

#define HOST_BATCH_SIZE 50000

/*------------------------------------------------------------------------*/
typedef struct {
uint32 num_roots;
@@ -411,6 +409,8 @@ sieve_lattice_deg6_128(msieve_obj *obj, lattice_fb_t *L,
q_soa_array_t * q_array;
uint32 p_min_roots, p_max_roots;
uint32 q_min_roots, q_max_roots;
uint32 host_p_batch_size;
uint32 host_q_batch_size;

uint32 threads_per_block;
gpu_info_t *gpu_info = L->gpu_info;
@@ -440,6 +440,9 @@ sieve_lattice_deg6_128(msieve_obj *obj, lattice_fb_t *L,
CUDA_TRY(cuMemAlloc(&L->gpu_found_array,
L->found_array_size * sizeof(found_t)))

host_p_batch_size = MAX(10000, L->found_array_size / 3);
host_q_batch_size = MAX(50000, 12 * L->found_array_size);

printf("------- %" PRIu64 "-%" PRIu64 " %" PRIu64 "-%" PRIu64 "\n",
small_p_min, small_p_max,
large_p_min, large_p_max);
@@ -456,7 +459,7 @@ sieve_lattice_deg6_128(msieve_obj *obj, lattice_fb_t *L,

q_soa_array_reset(q_array);

for (i = 0; i < HOST_BATCH_SIZE &&
for (i = 0; i < host_q_batch_size &&
min_large != P_SEARCH_DONE; i++) {
min_large = sieve_fb_next(sieve_small, L->poly,
store_p_soa, L);
@@ -477,7 +480,7 @@ sieve_lattice_deg6_128(msieve_obj *obj, lattice_fb_t *L,

p_packed_reset(p_array);

for (i = 0; i < HOST_BATCH_SIZE &&
for (i = 0; i < host_p_batch_size &&
min_small != P_SEARCH_DONE; i++) {
min_small = sieve_fb_next(sieve_large, L->poly,
store_p_packed, L);
(diff for the seventh changed file not shown in this view)
