Merge tag 'slab-for-6.13-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab

Pull slab updates from Vlastimil Babka:

 - Add new slab_strict_numa boot parameter to enforce per-object memory
   policies on top of slab folio policies, for systems where saving the
   cost of remote accesses is more important than minimizing slab
   allocation overhead (Christoph Lameter)

 - Fix for freeptr_offset alignment check being too strict for m68k
   (Geert Uytterhoeven)

 - krealloc() fixes so that __GFP_ZERO guarantees are not violated when
   slub_debug (redzone and object tracking) is enabled (Feng Tang)

 - Fix a memory leak in case sysfs registration fails for a slab cache,
   and also no longer fail to create the cache in that case (Hyeonggon
   Yoo)

 - Fix handling of detected consistency problems (due to buggy slab
   user) with slub_debug enabled, so that it does not cause further list
   corruption bugs (yuan.gao)

 - Code cleanup and kerneldocs polishing (Zhen Lei, Vlastimil Babka)

* tag 'slab-for-6.13-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab:
  slab: Fix too strict alignment check in create_cache()
  mm/slab: Allow cache creation to proceed even if sysfs registration fails
  mm/slub: Avoid list corruption when removing a slab from the full list
  mm/slub, kunit: Add testcase for krealloc redzone and zeroing
  mm/slub: Improve redzone check and zeroing for krealloc()
  mm/slub: Consider kfence case for get_orig_size()
  SLUB: Add support for per object memory policies
  mm, slab: add kerneldocs for common SLAB_ flags
  mm/slab: remove duplicate check in create_cache()
  mm/slub: Move krealloc() and related code to slub.c
  mm/kasan: Don't store metadata inside kmalloc object when slub_debug_orig_size is on
torvalds committed Nov 26, 2024
2 parents f5f4745 + 9008fe8 commit e06635e
Showing 8 changed files with 324 additions and 136 deletions.
10 changes: 10 additions & 0 deletions Documentation/admin-guide/kernel-parameters.txt
@@ -6158,6 +6158,16 @@
For more information see Documentation/mm/slub.rst.
(slub_nomerge legacy name also accepted for now)

slab_strict_numa [MM]
Support memory policies on a per object level
in the slab allocator. The default is for memory
policies to be applied at the folio level when
a new folio is needed or a partial folio is
retrieved from the lists. Increases overhead
in the slab fastpaths but gains more accurate
NUMA kernel object placement which helps with slow
interconnects in NUMA systems.

slram= [HW,MTD]

smart2= [HW]
9 changes: 9 additions & 0 deletions Documentation/mm/slub.rst
@@ -175,6 +175,15 @@ can be influenced by kernel parameters:
``slab_max_order`` to 0, which causes the minimum possible order of
slab allocation.

``slab_strict_numa``
Enables the application of memory policies on each
allocation. This results in more accurate placement of
objects which may result in the reduction of accesses
to remote nodes. The default is to only apply memory
policies at the folio level when a new folio is acquired
or a folio is retrieved from the lists. Enabling this
option reduces the fastpath performance of the slab allocator.

SLUB Debug output
=================

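As a rough illustration of what the two documentation entries above describe, the sketch below shows how a per-object policy check in the allocation fastpath could be gated by a static key flipped from the boot parameter. This is a simplified sketch, not the actual mm/slub.c change from this series: the key name strict_numa and the helper alloc_from_node() are assumptions for illustration, while mempolicy_slab_node() is the existing kernel helper that picks a node from the current task's memory policy.

#ifdef CONFIG_NUMA
static DEFINE_STATIC_KEY_FALSE(strict_numa);	/* assumed name, set from the boot parameter */
#endif

static __always_inline void *slab_alloc_sketch(struct kmem_cache *s,
					       gfp_t gfpflags, int node)
{
#ifdef CONFIG_NUMA
	/*
	 * With slab_strict_numa enabled, consult the task's memory policy
	 * for every object, not only when a new slab folio is taken.
	 */
	if (static_branch_unlikely(&strict_numa) && node == NUMA_NO_NODE)
		node = mempolicy_slab_node();
#endif
	return alloc_from_node(s, gfpflags, node);	/* hypothetical helper */
}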
60 changes: 41 additions & 19 deletions include/linux/slab.h
@@ -77,7 +77,17 @@ enum _slab_flag_bits {
#define SLAB_POISON __SLAB_FLAG_BIT(_SLAB_POISON)
/* Indicate a kmalloc slab */
#define SLAB_KMALLOC __SLAB_FLAG_BIT(_SLAB_KMALLOC)
/* Align objs on cache lines */
/**
* define SLAB_HWCACHE_ALIGN - Align objects on cache line boundaries.
*
* Sufficiently large objects are aligned on a cache line boundary. For an
* object size smaller than half of the cache line size, the alignment is
* half of the cache line size. In general, if the object size is smaller than
* 1/2^n of the cache line size, the alignment is adjusted to 1/2^n of the
* cache line size.
*
* If explicit alignment is also requested by the respective
* &struct kmem_cache_args field, the greater of the two alignments is applied.
*/
#define SLAB_HWCACHE_ALIGN __SLAB_FLAG_BIT(_SLAB_HWCACHE_ALIGN)
/* Use GFP_DMA memory */
#define SLAB_CACHE_DMA __SLAB_FLAG_BIT(_SLAB_CACHE_DMA)
@@ -87,8 +97,8 @@ enum _slab_flag_bits {
#define SLAB_STORE_USER __SLAB_FLAG_BIT(_SLAB_STORE_USER)
/* Panic if kmem_cache_create() fails */
#define SLAB_PANIC __SLAB_FLAG_BIT(_SLAB_PANIC)
/*
* SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS!
/**
* define SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS!
*
* This delays freeing the SLAB page by a grace period, it does _NOT_
* delay object freeing. This means that if you do kmem_cache_free()
Expand All @@ -99,20 +109,22 @@ enum _slab_flag_bits {
* stays valid, the trick to using this is relying on an independent
* object validation pass. Something like:
*
* ::
*
*  begin:
*   rcu_read_lock();
*   obj = lockless_lookup(key);
*   if (obj) {
*     if (!try_get_ref(obj)) // might fail for free objects
*       rcu_read_unlock();
*       goto begin;
*
*     if (obj->key != key) { // not the object we expected
*       put_ref(obj);
*       rcu_read_unlock();
*       goto begin;
*     }
*   }
*  rcu_read_unlock();
*
* This is useful if we need to approach a kernel structure obliquely,
Expand All @@ -137,7 +149,6 @@ enum _slab_flag_bits {
*
* Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU.
*/
/* Defer freeing slabs to RCU */
#define SLAB_TYPESAFE_BY_RCU __SLAB_FLAG_BIT(_SLAB_TYPESAFE_BY_RCU)
/* Trace allocations and frees */
#define SLAB_TRACE __SLAB_FLAG_BIT(_SLAB_TRACE)
@@ -170,7 +181,12 @@ enum _slab_flag_bits {
#else
# define SLAB_FAILSLAB __SLAB_FLAG_UNUSED
#endif
/* Account to memcg */
/**
* define SLAB_ACCOUNT - Account allocations to memcg.
*
* All object allocations from this cache will be memcg accounted, regardless
* of whether __GFP_ACCOUNT is passed to individual allocations.
*/
#ifdef CONFIG_MEMCG
# define SLAB_ACCOUNT __SLAB_FLAG_BIT(_SLAB_ACCOUNT)
#else
@@ -197,7 +213,13 @@ enum _slab_flag_bits {
#endif

/* The following flags affect the page allocator grouping pages by mobility */
/* Objects are reclaimable */
/**
* define SLAB_RECLAIM_ACCOUNT - Objects are reclaimable.
*
* Use this flag for caches that have an associated shrinker. As a result, slab
* pages are allocated with __GFP_RECLAIMABLE, which affects grouping pages by
* mobility, and are accounted in the SReclaimable counter in /proc/meminfo.
*/
#ifndef CONFIG_SLUB_TINY
#define SLAB_RECLAIM_ACCOUNT __SLAB_FLAG_BIT(_SLAB_RECLAIM_ACCOUNT)
#else
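To make the SLAB_HWCACHE_ALIGN rule documented above concrete, here is a small sketch of the 1/2^n halving rule. The helper name is made up for the example; the kernel computes this internally when a cache is created.

/*
 * Halve the cache line size while the object still fits into the half.
 * With a 64-byte cache line this yields 64-byte alignment for a 40-byte
 * object, 32 for a 20-byte object and 16 for a 10-byte object.
 */
static unsigned int hwcache_align_sketch(unsigned int object_size,
					 unsigned int cache_line)
{
	unsigned int align = cache_line;

	while (object_size <= align / 2)
		align /= 2;
	return align;
}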
42 changes: 42 additions & 0 deletions lib/slub_kunit.c
@@ -192,6 +192,47 @@ static void test_leak_destroy(struct kunit *test)
KUNIT_EXPECT_EQ(test, 2, slab_errors);
}

static void test_krealloc_redzone_zeroing(struct kunit *test)
{
u8 *p;
int i;
struct kmem_cache *s = test_kmem_cache_create("TestSlub_krealloc", 64,
SLAB_KMALLOC|SLAB_STORE_USER|SLAB_RED_ZONE);

p = alloc_hooks(__kmalloc_cache_noprof(s, GFP_KERNEL, 48));
memset(p, 0xff, 48);

kasan_disable_current();
OPTIMIZER_HIDE_VAR(p);

/* Test shrink */
p = krealloc(p, 40, GFP_KERNEL | __GFP_ZERO);
for (i = 40; i < 64; i++)
KUNIT_EXPECT_EQ(test, p[i], SLUB_RED_ACTIVE);

/* Test grow within the same 64B kmalloc object */
p = krealloc(p, 56, GFP_KERNEL | __GFP_ZERO);
for (i = 40; i < 56; i++)
KUNIT_EXPECT_EQ(test, p[i], 0);
for (i = 56; i < 64; i++)
KUNIT_EXPECT_EQ(test, p[i], SLUB_RED_ACTIVE);

validate_slab_cache(s);
KUNIT_EXPECT_EQ(test, 0, slab_errors);

memset(p, 0xff, 56);
/* Test grow with allocating a bigger 128B object */
p = krealloc(p, 112, GFP_KERNEL | __GFP_ZERO);
for (i = 0; i < 56; i++)
KUNIT_EXPECT_EQ(test, p[i], 0xff);
for (i = 56; i < 112; i++)
KUNIT_EXPECT_EQ(test, p[i], 0);

kfree(p);
kasan_enable_current();
kmem_cache_destroy(s);
}

static int test_init(struct kunit *test)
{
slab_errors = 0;
Expand All @@ -214,6 +255,7 @@ static struct kunit_case test_cases[] = {
KUNIT_CASE(test_kmalloc_redzone_access),
KUNIT_CASE(test_kfree_rcu),
KUNIT_CASE(test_leak_destroy),
KUNIT_CASE(test_krealloc_redzone_zeroing),
{}
};

7 changes: 5 additions & 2 deletions mm/kasan/generic.c
@@ -392,9 +392,12 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
* 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can
* be touched after it was freed, or
* 2. Object has a constructor, which means it's expected to
* retain its content until the next allocation.
* retain its content until the next allocation, or
* 3. It is from a kmalloc cache which enables the debug option
* to store original size.
*/
if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor) {
if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor ||
slub_debug_orig_size(cache)) {
cache->kasan_info.free_meta_offset = *size;
*size += sizeof(struct kasan_free_meta);
goto free_meta_added;
11 changes: 11 additions & 0 deletions mm/slab.h
@@ -73,6 +73,11 @@ struct slab {
struct {
unsigned inuse:16;
unsigned objects:15;
/*
* If slab debugging is enabled then the
* frozen bit can be reused to indicate
* that the slab was corrupted
*/
unsigned frozen:1;
};
};
@@ -695,6 +700,12 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
void __check_heap_object(const void *ptr, unsigned long n,
const struct slab *slab, bool to_user);

static inline bool slub_debug_orig_size(struct kmem_cache *s)
{
return (kmem_cache_debug_flags(s, SLAB_STORE_USER) &&
(s->flags & SLAB_KMALLOC));
}

#ifdef CONFIG_SLUB_DEBUG
void skip_orig_size_check(struct kmem_cache *s, const void *object);
#endif
103 changes: 14 additions & 89 deletions mm/slab_common.c
@@ -222,15 +222,12 @@ static struct kmem_cache *create_cache(const char *name,
struct kmem_cache *s;
int err;

if (WARN_ON(args->useroffset + args->usersize > object_size))
args->useroffset = args->usersize = 0;

/* If a custom freelist pointer is requested make sure it's sane. */
err = -EINVAL;
if (args->use_freeptr_offset &&
(args->freeptr_offset >= object_size ||
!(flags & SLAB_TYPESAFE_BY_RCU) ||
!IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t))))
!IS_ALIGNED(args->freeptr_offset, __alignof__(freeptr_t))))
goto out;

err = -ENOMEM;
@@ -257,11 +254,23 @@
* @object_size: The size of objects to be created in this cache.
* @args: Additional arguments for the cache creation (see
* &struct kmem_cache_args).
* @flags: See %SLAB_* flags for an explanation of individual @flags.
* @flags: See the descriptions of individual flags. The common ones are listed
* in the description below.
*
* Not to be called directly, use the kmem_cache_create() wrapper with the same
* parameters.
*
* Commonly used @flags:
*
* &SLAB_ACCOUNT - Account allocations to memcg.
*
* &SLAB_HWCACHE_ALIGN - Align objects on cache line boundaries.
*
* &SLAB_RECLAIM_ACCOUNT - Objects are reclaimable.
*
* &SLAB_TYPESAFE_BY_RCU - Slab page (not individual objects) freeing delayed
* by a grace period - see the full description before using.
*
* Context: Cannot be called within an interrupt, but can be interrupted.
*
* Return: a pointer to the cache on success, NULL on failure.
@@ -1199,90 +1208,6 @@ module_init(slab_proc_init);

#endif /* CONFIG_SLUB_DEBUG */

static __always_inline __realloc_size(2) void *
__do_krealloc(const void *p, size_t new_size, gfp_t flags)
{
void *ret;
size_t ks;

/* Check for double-free before calling ksize. */
if (likely(!ZERO_OR_NULL_PTR(p))) {
if (!kasan_check_byte(p))
return NULL;
ks = ksize(p);
} else
ks = 0;

/* If the object still fits, repoison it precisely. */
if (ks >= new_size) {
/* Zero out spare memory. */
if (want_init_on_alloc(flags)) {
kasan_disable_current();
memset(kasan_reset_tag(p) + new_size, 0, ks - new_size);
kasan_enable_current();
}

p = kasan_krealloc((void *)p, new_size, flags);
return (void *)p;
}

ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_);
if (ret && p) {
/* Disable KASAN checks as the object's redzone is accessed. */
kasan_disable_current();
memcpy(ret, kasan_reset_tag(p), ks);
kasan_enable_current();
}

return ret;
}

/**
* krealloc - reallocate memory. The contents will remain unchanged.
* @p: object to reallocate memory for.
* @new_size: how many bytes of memory are required.
* @flags: the type of memory to allocate.
*
* If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size
* is 0 and @p is not a %NULL pointer, the object pointed to is freed.
*
* If __GFP_ZERO logic is requested, callers must ensure that, starting with the
* initial memory allocation, every subsequent call to this API for the same
* memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
* __GFP_ZERO is not fully honored by this API.
*
* This is the case, since krealloc() only knows about the bucket size of an
* allocation (but not the exact size it was allocated with) and hence
* implements the following semantics for shrinking and growing buffers with
* __GFP_ZERO.
*
*            new              bucket
* 0          size             size
* |----------|----------------|
* |   keep   |      zero      |
*
* In any case, the contents of the object pointed to are preserved up to the
* lesser of the new and old sizes.
*
* Return: pointer to the allocated memory or %NULL in case of error
*/
void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags)
{
void *ret;

if (unlikely(!new_size)) {
kfree(p);
return ZERO_SIZE_PTR;
}

ret = __do_krealloc(p, new_size, flags);
if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret))
kfree(p);

return ret;
}
EXPORT_SYMBOL(krealloc_noprof);

/**
* kfree_sensitive - Clear sensitive information in memory before freeing
* @p: object to free memory of
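The krealloc() kerneldoc above (the implementation itself moves to mm/slub.c in this series) puts a requirement on callers that rely on zeroing. A brief sketch of that caller contract follows; the buffer sizes and the function name are invented for the example.

/*
 * To rely on __GFP_ZERO, pass it consistently: on the initial allocation
 * and on every subsequent krealloc() of the same buffer.
 */
static u8 *alloc_then_grow_zeroed(void)
{
	u8 *buf, *tmp;

	buf = kzalloc(32, GFP_KERNEL);		/* zeroed initial buffer */
	if (!buf)
		return NULL;

	tmp = krealloc(buf, 128, GFP_KERNEL | __GFP_ZERO);
	if (!tmp) {
		kfree(buf);			/* old buffer is untouched on failure */
		return NULL;
	}
	return tmp;				/* bytes beyond the old bucket size are zeroed */
}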
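As a usage illustration of the commonly used flags now listed in the __kmem_cache_create_args() kerneldoc above, here is a minimal sketch of a cache creation call; struct foo and the cache name are made up for the example.

struct foo {
	int id;
	struct list_head node;
};

static struct kmem_cache *foo_cachep;

static int __init foo_cache_init(void)
{
	/* Cache-line align objects and charge allocations to memcg. */
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
				       SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
	return foo_cachep ? 0 : -ENOMEM;
}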
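Finally, the relaxed IS_ALIGNED() check in create_cache() above concerns caches that ask for a custom freelist pointer location via struct kmem_cache_args. A minimal sketch of such a request, with the structure and names invented for the example: the offset must stay below the object size, the cache must be SLAB_TYPESAFE_BY_RCU, and after this fix the offset only needs __alignof__(freeptr_t) alignment rather than sizeof(freeptr_t).

struct foo_rcu {
	int id;
	freeptr_t freeptr;	/* dedicated slot for the freelist pointer */
};

static struct kmem_cache *foo_rcu_cachep;

static int __init foo_rcu_cache_init(void)
{
	struct kmem_cache_args args = {
		.use_freeptr_offset = true,
		.freeptr_offset = offsetof(struct foo_rcu, freeptr),
	};

	foo_rcu_cachep = kmem_cache_create("foo_rcu", sizeof(struct foo_rcu),
					   &args, SLAB_TYPESAFE_BY_RCU);
	return foo_rcu_cachep ? 0 : -ENOMEM;
}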