
Commit e1a38c4
increase the size of host arrays when using more powerful GPUs
git-svn-id: https://msieve.svn.sourceforge.net/svnroot/msieve/trunk@236 8c02911e-0617-4013-80d0-c3bcbcae04d5
jasonp_sf committed Mar 11, 2010
1 parent c60eeb8 commit e1a38c4
Showing 7 changed files with 155 additions and 31 deletions.
Changes: 2 additions & 0 deletions
@@ -7,6 +7,8 @@ Version 1.45:
for inputs < 135 digits about 35% faster
- Fixed some degree 5 synchronization issues (thanks
Jayson King)
- Added code to increase the size of host arrays when using
more powerful GPUs (thanks Paul Zimmermann)
- Added code to automatically randomize the search for
inputs that are large enough
- Made the cutoff E-value more aggressive for the largest
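The common change in each GPU sieve file below is that the compile-time HOST_BATCH_SIZE limits are removed and the host-side p and q batch sizes are instead computed from L->found_array_size, so larger cards fill larger host arrays per pass. A minimal standalone sketch of that sizing rule; the found_array_size values here are hypothetical, chosen only to illustrate the arithmetic, and MAX is redefined locally to keep the sketch self-contained:

#include <stdio.h>
#include <stdint.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))   /* msieve has its own MAX macro */

int main(void)
{
	/* hypothetical stand-ins for L->found_array_size on small vs. large cards */
	uint32_t found_array_sizes[] = { 8192, 49152, 196608 };
	size_t j;

	for (j = 0; j < sizeof(found_array_sizes) / sizeof(found_array_sizes[0]); j++) {
		uint32_t found_array_size = found_array_sizes[j];

		/* the formulas added in this commit: floors of 10000 and 50000
		   keep small GPUs near the old fixed batch sizes, while a large
		   found array scales the host batches up */
		uint32_t host_p_batch_size = MAX(10000, found_array_size / 3);
		uint32_t host_q_batch_size = MAX(50000, 12 * found_array_size);

		printf("found_array_size %u -> p batch %u, q batch %u\n",
				found_array_size, host_p_batch_size,
				host_q_batch_size);
	}
	return 0;
}

For the (made-up) found_array_size of 196608 this would give a q batch of 2359296 entries, roughly 47 times the old fixed HOST_BATCH_SIZE of 50000 used in the degree-4/6 files.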
gnfs/poly/stage1/stage1_core_gpu/stage1_sieve_deg46_64.c: 7 additions & 4 deletions
@@ -15,8 +15,6 @@ benefit from your work.
#include <stage1.h>
#include "stage1_core_deg46_64.h"

#define HOST_BATCH_SIZE 50000

/*------------------------------------------------------------------------*/
typedef struct {
uint32 num_roots;
@@ -421,6 +419,8 @@ sieve_lattice_deg46_64(msieve_obj *obj, lattice_fb_t *L,
uint32 threads_per_block;
gpu_info_t *gpu_info = L->gpu_info;
CUfunction gpu_kernel = L->gpu_kernel;
uint32 host_p_batch_size;
uint32 host_q_batch_size;

L->q_marshall = (q_soa_t *)xmalloc(sizeof(q_soa_t));
q_array = L->q_array = (q_soa_array_t *)xmalloc(
@@ -446,6 +446,9 @@ sieve_lattice_deg46_64(msieve_obj *obj, lattice_fb_t *L,
CUDA_TRY(cuMemAlloc(&L->gpu_found_array,
L->found_array_size * sizeof(found_t)))

host_p_batch_size = MAX(10000, L->found_array_size / 3);
host_q_batch_size = MAX(50000, 12 * L->found_array_size);

printf("------- %u-%u %u-%u\n",
small_p_min, small_p_max,
large_p_min, large_p_max);
@@ -472,7 +475,7 @@ sieve_lattice_deg46_64(msieve_obj *obj, lattice_fb_t *L,

q_soa_array_reset(q_array);

for (i = 0; i < HOST_BATCH_SIZE &&
for (i = 0; i < host_q_batch_size &&
min_large != (uint32)P_SEARCH_DONE; i++) {
min_large = sieve_fb_next(sieve_small, L->poly,
store_p_soa, L);
@@ -492,7 +495,7 @@ sieve_lattice_deg46_64(msieve_obj *obj, lattice_fb_t *L,

p_packed_reset(p_array);

for (i = 0; i < HOST_BATCH_SIZE &&
for (i = 0; i < host_p_batch_size &&
min_small != (uint32)P_SEARCH_DONE; i++) {
min_small = sieve_fb_next(sieve_large, L->poly,
store_p_packed, L);
gnfs/poly/stage1/stage1_core_gpu/stage1_sieve_deg5_128.c: 49 additions & 5 deletions
@@ -22,11 +22,46 @@ typedef struct {
uint32 num_p;
uint64 last_p;

uint64 p[HOST_BATCH_SIZE];
uint64 lattice_size[HOST_BATCH_SIZE];
uint32 roots[4*POLY_BATCH_SIZE][HOST_BATCH_SIZE];
uint64 *p;
uint64 *lattice_size;
uint32 *roots[4*POLY_BATCH_SIZE];
} p_soa_var_t;

static void
p_soa_var_init(p_soa_var_t *soa, uint32 batch_size)
{
uint32 i;

memset(soa, 0, sizeof(*soa));
soa->p = (uint64 *)xmalloc(batch_size * sizeof(uint64));
soa->lattice_size = (uint64 *)xmalloc(batch_size * sizeof(uint64));
for (i = 0; i < 4 * POLY_BATCH_SIZE; i += 4) {
soa->roots[i] = (uint32 *)xmalloc(batch_size *
sizeof(uint32));
soa->roots[i+1] = (uint32 *)xmalloc(batch_size *
sizeof(uint32));
soa->roots[i+2] = (uint32 *)xmalloc(batch_size *
sizeof(uint32));
soa->roots[i+3] = (uint32 *)xmalloc(batch_size *
sizeof(uint32));
}
}

static void
p_soa_var_free(p_soa_var_t *soa)
{
uint32 i;

free(soa->p);
free(soa->lattice_size);
for (i = 0; i < 4 * POLY_BATCH_SIZE; i += 4) {
free(soa->roots[i]);
free(soa->roots[i+1]);
free(soa->roots[i+2]);
free(soa->roots[i+3]);
}
}

static void
p_soa_var_reset(p_soa_var_t *soa)
{
@@ -243,6 +278,8 @@ sieve_lattice_deg5_128(msieve_obj *obj, lattice_fb_t *L,
uint32 threads_per_block;
gpu_info_t *gpu_info = L->gpu_info;
CUfunction gpu_kernel = L->gpu_kernel;
uint32 host_p_batch_size;
uint32 host_q_batch_size;

L->p_marshall = (p_soa_t *)xmalloc(sizeof(p_soa_t));
L->q_marshall = (q_soa_t *)xmalloc(sizeof(q_soa_t));
@@ -268,6 +305,11 @@ sieve_lattice_deg5_128(msieve_obj *obj, lattice_fb_t *L,
CUDA_TRY(cuMemAlloc(&L->gpu_found_array,
L->found_array_size * sizeof(found_t)))

host_p_batch_size = MAX(10000, L->found_array_size / 3);
host_q_batch_size = MAX(50000, 12 * L->found_array_size);
p_soa_var_init(p_array, host_p_batch_size);
p_soa_var_init(q_array, host_q_batch_size);

printf("------- %" PRIu64 "-%" PRIu64 " %" PRIu64 "-%" PRIu64 "\n",
small_p_min, small_p_max,
large_p_min, large_p_max);
@@ -280,7 +322,7 @@ sieve_lattice_deg5_128(msieve_obj *obj, lattice_fb_t *L,

L->fill_p = 0;
p_soa_var_reset(q_array);
for (i = 0; i < HOST_BATCH_SIZE &&
for (i = 0; i < host_q_batch_size &&
min_large != P_SEARCH_DONE; i++) {
min_large = sieve_fb_next(sieve_small, L->poly,
store_p_soa, L);
@@ -297,7 +339,7 @@ sieve_lattice_deg5_128(msieve_obj *obj, lattice_fb_t *L,

L->fill_p = 1;
p_soa_var_reset(p_array);
for (i = 0; i < HOST_BATCH_SIZE &&
for (i = 0; i < host_p_batch_size &&
min_small != P_SEARCH_DONE; i++) {
min_small = sieve_fb_next(sieve_large, L->poly,
store_p_soa, L);
@@ -325,6 +367,8 @@ sieve_lattice_deg5_128(msieve_obj *obj, lattice_fb_t *L,
CUDA_TRY(cuMemFree(L->gpu_p_array))
CUDA_TRY(cuMemFree(L->gpu_q_array))
CUDA_TRY(cuMemFree(L->gpu_found_array))
p_soa_var_free(p_array);
p_soa_var_free(q_array);
free(p_array);
free(q_array);
free(L->p_marshall);
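Because the degree-5 structures can no longer be sized at compile time, the file above converts the fixed structure-of-arrays members to pointers and brackets the existing reset-and-fill loops with p_soa_var_init / p_soa_var_free. A reduced sketch of that lifecycle; the struct here is a simplified stand-in (one p array and one roots array, not the real per-polynomial layout of p_soa_var_t), and plain malloc stands in for msieve's xmalloc:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

/* simplified stand-in for the heap-backed structure-of-arrays above */
typedef struct {
	uint32_t num_p;
	uint32_t *p;        /* was: uint32 p[HOST_BATCH_SIZE] */
	uint64_t *roots;    /* was: a fixed 2-D array, one row per polynomial */
} soa_sketch_t;

static void soa_sketch_init(soa_sketch_t *soa, uint32_t batch_size)
{
	memset(soa, 0, sizeof(*soa));
	soa->p = (uint32_t *)malloc(batch_size * sizeof(uint32_t));
	soa->roots = (uint64_t *)malloc(batch_size * sizeof(uint64_t));
}

static void soa_sketch_free(soa_sketch_t *soa)
{
	free(soa->p);
	free(soa->roots);
}

int main(void)
{
	uint32_t found_array_size = 49152;   /* hypothetical GPU-derived value */
	uint32_t host_q_batch_size = 12 * found_array_size > 50000 ?
				12 * found_array_size : 50000;
	soa_sketch_t q_array;
	uint32_t i;

	soa_sketch_init(&q_array, host_q_batch_size);

	/* the fill loops are now bounded by the run-time batch size
	   instead of a compile-time HOST_BATCH_SIZE */
	for (i = 0; i < host_q_batch_size; i++) {
		q_array.p[i] = 2 * i + 3;        /* placeholder entries */
		q_array.roots[i] = (uint64_t)i;
	}
	q_array.num_p = i;

	printf("filled %u entries\n", q_array.num_p);
	soa_sketch_free(&q_array);
	return 0;
}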
gnfs/poly/stage1/stage1_core_gpu/stage1_sieve_deg5_64.c: 37 additions & 7 deletions
@@ -15,18 +15,39 @@ benefit from your work.
#include <stage1.h>
#include "stage1_core_deg5_64.h"

#define HOST_BATCH_SIZE (104*384)

/*------------------------------------------------------------------------*/
typedef struct {
uint32 num_p;
uint32 last_p;

uint32 p[HOST_BATCH_SIZE];
uint32 lattice_size[HOST_BATCH_SIZE];
uint64 roots[POLY_BATCH_SIZE][HOST_BATCH_SIZE];
uint32 *p;
uint32 *lattice_size;
uint64 *roots[POLY_BATCH_SIZE];
} p_soa_var_t;

static void
p_soa_var_init(p_soa_var_t *soa, uint32 batch_size)
{
uint32 i;

memset(soa, 0, sizeof(*soa));
soa->p = (uint32 *)xmalloc(batch_size * sizeof(uint32));
soa->lattice_size = (uint32 *)xmalloc(batch_size * sizeof(uint32));
for (i = 0; i < POLY_BATCH_SIZE; i++)
soa->roots[i] = (uint64 *)xmalloc(batch_size * sizeof(uint64));
}

static void
p_soa_var_free(p_soa_var_t *soa)
{
uint32 i;

free(soa->p);
free(soa->lattice_size);
for (i = 0; i < POLY_BATCH_SIZE; i++)
free(soa->roots[i]);
}

static void
p_soa_var_reset(p_soa_var_t *soa)
{
@@ -244,6 +265,8 @@ sieve_lattice_deg5_64(msieve_obj *obj, lattice_fb_t *L,
p_soa_var_t * p_array;
p_soa_var_t * q_array;
uint32 num_poly = L->poly->num_poly;
uint32 host_p_batch_size;
uint32 host_q_batch_size;

uint32 threads_per_block;
gpu_info_t *gpu_info = L->gpu_info;
@@ -273,6 +296,11 @@ sieve_lattice_deg5_64(msieve_obj *obj, lattice_fb_t *L,
CUDA_TRY(cuMemAlloc(&L->gpu_found_array,
L->found_array_size * sizeof(found_t)))

host_p_batch_size = MAX(10000, L->found_array_size / 3);
host_q_batch_size = MAX(50000, 12 * L->found_array_size);
p_soa_var_init(p_array, host_p_batch_size);
p_soa_var_init(q_array, host_q_batch_size);

printf("------- %u-%u %u-%u\n",
small_p_min, small_p_max,
large_p_min, large_p_max);
@@ -285,7 +313,7 @@ sieve_lattice_deg5_64(msieve_obj *obj, lattice_fb_t *L,

L->fill_p = 0;
p_soa_var_reset(q_array);
for (i = 0; i < HOST_BATCH_SIZE &&
for (i = 0; i < host_q_batch_size &&
min_large != (uint32)P_SEARCH_DONE; i++) {
min_large = sieve_fb_next(sieve_small, L->poly,
store_p_soa, L);
@@ -302,7 +330,7 @@ sieve_lattice_deg5_64(msieve_obj *obj, lattice_fb_t *L,

L->fill_p = 1;
p_soa_var_reset(p_array);
for (i = 0; i < HOST_BATCH_SIZE &&
for (i = 0; i < host_p_batch_size &&
min_small != (uint32)P_SEARCH_DONE; i++) {
min_small = sieve_fb_next(sieve_large, L->poly,
store_p_soa, L);
@@ -330,6 +358,8 @@ sieve_lattice_deg5_64(msieve_obj *obj, lattice_fb_t *L,
CUDA_TRY(cuMemFree(L->gpu_p_array))
CUDA_TRY(cuMemFree(L->gpu_q_array))
CUDA_TRY(cuMemFree(L->gpu_found_array))
p_soa_var_free(p_array);
p_soa_var_free(q_array);
free(p_array);
free(q_array);
free(L->p_marshall);
gnfs/poly/stage1/stage1_core_gpu/stage1_sieve_deg5_96.c: 46 additions & 7 deletions
@@ -15,18 +15,48 @@ benefit from your work.
#include <stage1.h>
#include "stage1_core_deg5_96.h"

#define HOST_BATCH_SIZE (156*256)

/*------------------------------------------------------------------------*/
typedef struct {
uint32 num_p;
uint64 last_p;

uint64 p[HOST_BATCH_SIZE];
uint64 lattice_size[HOST_BATCH_SIZE];
uint32 roots[3*POLY_BATCH_SIZE][HOST_BATCH_SIZE];
uint64 *p;
uint64 *lattice_size;
uint32 *roots[3*POLY_BATCH_SIZE];
} p_soa_var_t;

static void
p_soa_var_init(p_soa_var_t *soa, uint32 batch_size)
{
uint32 i;

memset(soa, 0, sizeof(*soa));
soa->p = (uint64 *)xmalloc(batch_size * sizeof(uint64));
soa->lattice_size = (uint64 *)xmalloc(batch_size * sizeof(uint64));
for (i = 0; i < 3 * POLY_BATCH_SIZE; i += 3) {
soa->roots[i] = (uint32 *)xmalloc(batch_size *
sizeof(uint32));
soa->roots[i+1] = (uint32 *)xmalloc(batch_size *
sizeof(uint32));
soa->roots[i+2] = (uint32 *)xmalloc(batch_size *
sizeof(uint32));
}
}

static void
p_soa_var_free(p_soa_var_t *soa)
{
uint32 i;

free(soa->p);
free(soa->lattice_size);
for (i = 0; i < 3 * POLY_BATCH_SIZE; i += 3) {
free(soa->roots[i]);
free(soa->roots[i+1]);
free(soa->roots[i+2]);
}
}

static void
p_soa_var_reset(p_soa_var_t *soa)
{
@@ -248,6 +278,8 @@ sieve_lattice_deg5_96(msieve_obj *obj, lattice_fb_t *L,
p_soa_var_t * p_array;
p_soa_var_t * q_array;
uint32 num_poly = L->poly->num_poly;
uint32 host_p_batch_size;
uint32 host_q_batch_size;

uint32 threads_per_block;
gpu_info_t *gpu_info = L->gpu_info;
@@ -277,6 +309,11 @@ sieve_lattice_deg5_96(msieve_obj *obj, lattice_fb_t *L,
CUDA_TRY(cuMemAlloc(&L->gpu_found_array,
L->found_array_size * sizeof(found_t)))

host_p_batch_size = MAX(10000, L->found_array_size / 3);
host_q_batch_size = MAX(50000, 12 * L->found_array_size);
p_soa_var_init(p_array, host_p_batch_size);
p_soa_var_init(q_array, host_q_batch_size);

printf("------- %" PRIu64 "-%" PRIu64 " %" PRIu64 "-%" PRIu64 "\n",
small_p_min, small_p_max,
large_p_min, large_p_max);
@@ -289,7 +326,7 @@ sieve_lattice_deg5_96(msieve_obj *obj, lattice_fb_t *L,

L->fill_p = 0;
p_soa_var_reset(q_array);
for (i = 0; i < HOST_BATCH_SIZE &&
for (i = 0; i < host_q_batch_size &&
min_large != P_SEARCH_DONE; i++) {
min_large = sieve_fb_next(sieve_small, L->poly,
store_p_soa, L);
@@ -306,7 +343,7 @@ sieve_lattice_deg5_96(msieve_obj *obj, lattice_fb_t *L,

L->fill_p = 1;
p_soa_var_reset(p_array);
for (i = 0; i < HOST_BATCH_SIZE &&
for (i = 0; i < host_p_batch_size &&
min_small != P_SEARCH_DONE; i++) {
min_small = sieve_fb_next(sieve_large, L->poly,
store_p_soa, L);
@@ -334,6 +371,8 @@ sieve_lattice_deg5_96(msieve_obj *obj, lattice_fb_t *L,
CUDA_TRY(cuMemFree(L->gpu_p_array))
CUDA_TRY(cuMemFree(L->gpu_q_array))
CUDA_TRY(cuMemFree(L->gpu_found_array))
p_soa_var_free(p_array);
p_soa_var_free(q_array);
free(p_array);
free(q_array);
free(L->p_marshall);
gnfs/poly/stage1/stage1_core_gpu/stage1_sieve_deg6_128.c: 7 additions & 4 deletions
@@ -15,8 +15,6 @@ benefit from your work.
#include <stage1.h>
#include "stage1_core_deg6_128.h"

#define HOST_BATCH_SIZE 50000

/*------------------------------------------------------------------------*/
typedef struct {
uint32 num_roots;
@@ -411,6 +409,8 @@ sieve_lattice_deg6_128(msieve_obj *obj, lattice_fb_t *L,
q_soa_array_t * q_array;
uint32 p_min_roots, p_max_roots;
uint32 q_min_roots, q_max_roots;
uint32 host_p_batch_size;
uint32 host_q_batch_size;

uint32 threads_per_block;
gpu_info_t *gpu_info = L->gpu_info;
@@ -440,6 +440,9 @@ sieve_lattice_deg6_128(msieve_obj *obj, lattice_fb_t *L,
CUDA_TRY(cuMemAlloc(&L->gpu_found_array,
L->found_array_size * sizeof(found_t)))

host_p_batch_size = MAX(10000, L->found_array_size / 3);
host_q_batch_size = MAX(50000, 12 * L->found_array_size);

printf("------- %" PRIu64 "-%" PRIu64 " %" PRIu64 "-%" PRIu64 "\n",
small_p_min, small_p_max,
large_p_min, large_p_max);
@@ -456,7 +459,7 @@ sieve_lattice_deg6_128(msieve_obj *obj, lattice_fb_t *L,

q_soa_array_reset(q_array);

for (i = 0; i < HOST_BATCH_SIZE &&
for (i = 0; i < host_q_batch_size &&
min_large != P_SEARCH_DONE; i++) {
min_large = sieve_fb_next(sieve_small, L->poly,
store_p_soa, L);
@@ -477,7 +480,7 @@ sieve_lattice_deg6_128(msieve_obj *obj, lattice_fb_t *L,

p_packed_reset(p_array);

for (i = 0; i < HOST_BATCH_SIZE &&
for (i = 0; i < host_p_batch_size &&
min_small != P_SEARCH_DONE; i++) {
min_small = sieve_fb_next(sieve_large, L->poly,
store_p_packed, L);
(diff for the seventh changed file not shown in this view)
