@@ -34,6 +34,7 @@
 #include "miscadmin.h"
 #include "portability/mem.h"
 #include "storage/dsm.h"
+#include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/pg_shmem.h"
 #include "utils/guc.h"
@@ -349,6 +350,80 @@ PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
 
 #ifdef USE_ANONYMOUS_SHMEM
 
+#ifdef MAP_HUGETLB
+
+/*
+ * Identify the huge page size to use.
+ *
+ * Some Linux kernel versions have a bug causing mmap() to fail on requests
+ * that are not a multiple of the hugepage size.  Versions without that bug
+ * instead silently round the request up to the next hugepage multiple ---
+ * and then munmap() fails when we give it a size different from that.
+ * So we have to round our request up to a multiple of the actual hugepage
+ * size to avoid trouble.
+ *
+ * Doing the round-up ourselves also lets us make use of the extra memory,
+ * rather than just wasting it.  Currently, we just increase the available
+ * space recorded in the shmem header, which will make the extra usable for
+ * purposes such as additional locktable entries.  Someday, for very large
+ * hugepage sizes, we might want to think about more invasive strategies,
+ * such as increasing shared_buffers to absorb the extra space.
+ *
+ * Returns the (real or assumed) page size into *hugepagesize,
+ * and the hugepage-related mmap flags to use into *mmap_flags.
+ *
+ * Currently *mmap_flags is always just MAP_HUGETLB.  Someday, on systems
+ * that support it, we might OR in additional bits to specify a particular
+ * non-default huge page size.
+ */
+static void
+GetHugePageSize(Size *hugepagesize, int *mmap_flags)
+{
+    /*
+     * If we fail to find out the system's default huge page size, assume it
+     * is 2MB.  This will work fine when the actual size is less.  If it's
+     * more, we might get mmap() or munmap() failures due to unaligned
+     * requests; but at this writing, there are no reports of any non-Linux
+     * systems being picky about that.
+     */
+    *hugepagesize = 2 * 1024 * 1024;
+    *mmap_flags = MAP_HUGETLB;
+
+    /*
+     * System-dependent code to find out the default huge page size.
+     *
+     * On Linux, read /proc/meminfo looking for a line like "Hugepagesize:
+     * nnnn kB".  Ignore any failures, falling back to the preset default.
+     */
+#ifdef __linux__
+    {
+        FILE       *fp = AllocateFile("/proc/meminfo", "r");
+        char        buf[128];
+        unsigned int sz;
+        char        ch;
+
+        if (fp)
+        {
+            while (fgets(buf, sizeof(buf), fp))
+            {
+                if (sscanf(buf, "Hugepagesize: %u %c", &sz, &ch) == 2)
+                {
+                    if (ch == 'k')
+                    {
+                        *hugepagesize = sz * (Size) 1024;
+                        break;
+                    }
+                    /* We could accept other units besides kB, if needed */
+                }
+            }
+            FreeFile(fp);
+        }
+    }
+#endif   /* __linux__ */
+}
+
+#endif   /* MAP_HUGETLB */
+
 /*
  * Creates an anonymous mmap()ed shared memory segment.
  *
@@ -371,27 +446,17 @@ CreateAnonymousSegment(Size *size)
 {
     /*
      * Round up the request size to a suitable large value.
-     *
-     * Some Linux kernel versions are known to have a bug, which causes
-     * mmap() with MAP_HUGETLB to fail if the request size is not a
-     * multiple of any supported huge page size. To work around that, we
-     * round up the request size to nearest 2MB. 2MB is the most common
-     * huge page page size on affected systems.
-     *
-     * Aside from that bug, even with a kernel that does the allocation
-     * correctly, rounding it up ourselves avoids wasting memory. Without
-     * it, if we for example make an allocation of 2MB + 1 bytes, the
-     * kernel might decide to use two 2MB huge pages for that, and waste 2
-     * MB - 1 of memory. When we do the rounding ourselves, we can use
-     * that space for allocations.
      */
-    int         hugepagesize = 2 * 1024 * 1024;
+    Size        hugepagesize;
+    int         mmap_flags;
+
+    GetHugePageSize(&hugepagesize, &mmap_flags);
 
     if (allocsize % hugepagesize != 0)
         allocsize += hugepagesize - (allocsize % hugepagesize);
 
     ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
-               PG_MMAP_FLAGS | MAP_HUGETLB, -1, 0);
+               PG_MMAP_FLAGS | mmap_flags, -1, 0);
     mmap_errno = errno;
     if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
         elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m",
@@ -402,7 +467,7 @@ CreateAnonymousSegment(Size *size)
     if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON)
     {
         /*
-         * use the original size, not the rounded up value, when falling back
+         * Use the original size, not the rounded-up value, when falling back
          * to non-huge pages.
          */
         allocsize = *size;
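For readers following along outside the PostgreSQL tree, here is a minimal standalone sketch of the approach this commit gives CreateAnonymousSegment under huge_pages=try: detect the default huge page size from /proc/meminfo, round the request up to a multiple of it, attempt mmap() with MAP_HUGETLB, and fall back to normal pages with the original, un-rounded size if that fails. This is illustrative, not the commit's code: the helper name default_huge_page_size and the 100MB request are made up, and plain fopen()/fclose() stand in for the backend's AllocateFile()/FreeFile().

/*
 * Standalone demo of huge-page-size detection, round-up, and fallback.
 * Build on Linux with: cc -o hugedemo hugedemo.c
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

/* Read the default huge page size from /proc/meminfo; assume 2MB on failure. */
static size_t
default_huge_page_size(void)
{
    size_t      sz = 2 * 1024 * 1024;
    FILE       *fp = fopen("/proc/meminfo", "r");
    char        buf[128];
    unsigned int kb;
    char        unit;

    if (fp)
    {
        while (fgets(buf, sizeof(buf), fp))
        {
            /* Look for a line like "Hugepagesize:    2048 kB" */
            if (sscanf(buf, "Hugepagesize: %u %c", &kb, &unit) == 2 &&
                unit == 'k')
            {
                sz = (size_t) kb * 1024;
                break;
            }
        }
        fclose(fp);
    }
    return sz;
}

int
main(void)
{
    size_t      request = 100 * 1024 * 1024 + 1;    /* deliberately unaligned */
    size_t      allocsize = request;
    size_t      pagesz = default_huge_page_size();
    void       *ptr;

    /* Round up to a multiple of the huge page size, as the commit does. */
    if (allocsize % pagesz != 0)
        allocsize += pagesz - (allocsize % pagesz);

    ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
               MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
    if (ptr == MAP_FAILED)
    {
        /*
         * Fall back to normal pages with the original, un-rounded size,
         * mirroring the huge_pages=try path.
         */
        allocsize = request;
        ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    }
    if (ptr == MAP_FAILED)
    {
        perror("mmap");
        return 1;
    }
    printf("mapped %zu bytes (detected huge page size: %zu)\n",
           allocsize, pagesz);
    munmap(ptr, allocsize);
    return 0;
}

On a host with no huge pages reserved (vm.nr_hugepages = 0), the MAP_HUGETLB attempt typically fails with ENOMEM and the fallback path is taken, which is the same situation the DEBUG1 message in the patch reports.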