extstore: make defaults more aggressive
extstore has a background thread which examines slab classes for items
to flush to disk. The thresholds for flushing to disk are managed by a
specialized "slab automove" algorithm, which was written in 2017 and
has not been tuned since.

Most serious users set "ext_item_age=0" to force-flush all items. This
is partly because the defaults do not flush aggressively enough, so
memory runs out and evictions kick in.

This change simplifies the slab automove portion: instead of balancing
free chunks of memory per slab class, it targets a set number of free
global pages.
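
A condensed sketch of that policy, distilled from the
slab_automove_extstore.c hunks below (the helper names here are
illustrative, not taken from the source):

#include <stdbool.h>

/* Hold a fixed ratio of all allocated pages free in the global pool,
 * but never fewer than two spare pages (matching the diff's lower
 * bound on global_pool_watermark). */
static unsigned int global_pool_watermark(unsigned int total_pages,
                                          double free_ratio) {
    unsigned int w = total_pages * free_ratio;
    return w < 2 ? 2 : w;
}

/* The automover reclaims pages back to the global pool (and the
 * flusher starts writing to disk) once the free page count in the
 * global pool drops below the watermark. */
static bool global_pool_low(unsigned int free_pages,
                            unsigned int watermark) {
    return free_pages < watermark;
}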

The extstore flusher thread also uses the page pool, plus some
low-chunk limits, to decide when to start flushing. Its sleep routines
have also been adjusted, as the thread could previously oversleep too
easily.
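
A sketch of both decisions, condensed from the storage.c hunks below
(to_sleep is in microseconds; effective_item_age and next_sleep are
illustrative helper names, not functions in the source):

#include <stdbool.h>

#define WRITE_SLEEP_MIN 500 /* microseconds */
#define MIN_PAGES_FREE 3

/* Flush with item_age forced to 0 when a slab class is low on free
 * chunks and the global page pool is at or below its minimum. */
static int effective_item_age(unsigned int chunks_free,
                              unsigned int chunks_perpage,
                              unsigned int global_pages,
                              unsigned int global_pool_min,
                              int configured_age) {
    unsigned int target = chunks_perpage * MIN_PAGES_FREE;
    if (chunks_free < target && global_pages <= global_pool_min)
        return 0;
    return configured_age;
}

/* Sleep tuning: halve the sleep while writes are landing, and grow it
 * by one microsecond per idle pass instead of doubling it, so the
 * thread ramps back up slowly rather than oversleeping. */
static unsigned int next_sleep(unsigned int to_sleep, bool did_work,
                               unsigned int max_sleep) {
    if (did_work) {
        if (to_sleep > WRITE_SLEEP_MIN)
            to_sleep /= 2;
    } else {
        to_sleep++;
    }
    return to_sleep > max_sleep ? max_sleep : to_sleep;
}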

A few other small changes were required to avoid over-moving slab
pages.
dormando committed Aug 26, 2022
1 parent a102df4 commit 3d6d74a
Showing 5 changed files with 36 additions and 113 deletions.
8 changes: 1 addition & 7 deletions items.c
@@ -1669,13 +1669,7 @@ static void *lru_maintainer_thread(void *arg) {
LOGGER_LOG(l, LOG_SYSEVENTS, LOGGER_SLAB_MOVE, NULL,
src, dst);
}
// dst == 0 means reclaim to global pool, be more aggressive
if (dst != 0) {
last_automove_check = current_time;
} else if (dst == 0) {
// also ensure we minimize the thread sleep
to_sleep = 1000;
}
last_automove_check = current_time;
}
}
pthread_mutex_unlock(&lru_maintainer_lock);
4 changes: 2 additions & 2 deletions memcached.h
@@ -507,8 +507,8 @@ struct settings {
double ext_max_frag; /* ideal maximum page fragmentation */
double slab_automove_freeratio; /* % of memory to hold free as buffer */
bool ext_drop_unread; /* skip unread items during compaction */
/* per-slab-class free chunk limit */
unsigned int ext_free_memchunks[MAX_NUMBER_OF_SLAB_CLASSES];
/* start flushing to extstore after memory below this */
unsigned int ext_global_pool_min;
#endif
#ifdef TLS
bool ssl_enabled; /* indicates whether SSL is enabled */
4 changes: 2 additions & 2 deletions proto_text.c
@@ -2410,15 +2410,15 @@ static void process_extstore_command(conn *c, token_t *tokens, const size_t ntok
if (ntokens < 4) {
ok = false;
} else if (strcmp(tokens[1].value, "free_memchunks") == 0 && ntokens > 4) {
/* per-slab-class free chunk setting. */
// setting is deprecated and ignored, but accepted for backcompat
unsigned int clsid = 0;
unsigned int limit = 0;
if (!safe_strtoul(tokens[2].value, &clsid) ||
!safe_strtoul(tokens[3].value, &limit)) {
ok = false;
} else {
if (clsid < MAX_NUMBER_OF_SLAB_CLASSES) {
settings.ext_free_memchunks[clsid] = limit;
ok = true;
} else {
ok = false;
}
113 changes: 21 additions & 92 deletions slab_automove_extstore.c
@@ -13,7 +13,6 @@
#define MIN_PAGES_FOR_SOURCE 2
#define MIN_PAGES_FOR_RECLAIM 2.5
#define MIN_PAGES_FREE 1.5
#define MEMCHECK_PERIOD 60

struct window_data {
uint64_t age;
@@ -23,23 +22,16 @@ struct window_data {
unsigned int relaxed;
};

struct window_global {
uint32_t pool_low;
uint32_t pool_high;
};

typedef struct {
struct window_data *window_data;
struct window_global *window_global;
struct settings *settings;
uint32_t window_size;
uint32_t window_cur;
uint32_t item_size;
rel_time_t last_memcheck_run;
double max_age_ratio;
double free_ratio;
bool pool_filled_once;
unsigned int free_mem[MAX_NUMBER_OF_SLAB_CLASSES];
unsigned int global_pool_watermark;
item_stats_automove iam_before[MAX_NUMBER_OF_SLAB_CLASSES];
item_stats_automove iam_after[MAX_NUMBER_OF_SLAB_CLASSES];
slab_stats_automove sam_before[MAX_NUMBER_OF_SLAB_CLASSES];
@@ -53,19 +45,15 @@ void *slab_automove_extstore_init(struct settings *settings) {
if (a == NULL)
return NULL;
a->window_data = calloc(window_size * MAX_NUMBER_OF_SLAB_CLASSES, sizeof(struct window_data));
a->window_global = calloc(window_size, sizeof(struct window_global));
a->window_size = window_size;
a->max_age_ratio = max_age_ratio;
a->free_ratio = settings->slab_automove_freeratio;
a->item_size = settings->ext_item_size;
a->last_memcheck_run = 0;
a->settings = settings;
a->pool_filled_once = false;
if (a->window_data == NULL || a->window_global == NULL) {
if (a->window_data == NULL) {
if (a->window_data)
free(a->window_data);
if (a->window_global)
free(a->window_global);
free(a);
return NULL;
}
@@ -80,7 +68,6 @@ void *slab_automove_extstore_init(struct settings *settings) {
void slab_automove_extstore_free(void *arg) {
slab_automove *a = (slab_automove *)arg;
free(a->window_data);
free(a->window_global);
free(a);
}

@@ -96,32 +83,19 @@ static void window_sum(struct window_data *wd, struct window_data *w,
}
}

/* This could potentially merge with above */
static void window_global_sum(struct window_global *wg,
struct window_global *w, uint32_t size) {
for (int x = 0; x < size; x++) {
struct window_global *d = &wg[x];
w->pool_high += d->pool_high;
w->pool_low += d->pool_low;
}
}

static void global_pool_check(slab_automove *a) {
static int global_pool_check(slab_automove *a) {
bool mem_limit_reached;
uint32_t free = a->free_mem[0];
struct window_global *wg = &a->window_global[a->window_cur % a->window_size];
unsigned int free = a->global_pool_watermark;
unsigned int count = global_page_pool_size(&mem_limit_reached);
memset(wg, 0, sizeof(struct window_global));
if (!mem_limit_reached)
return;
if (count < free / 2) {
wg->pool_low = 1;
return 0;
if (count < free) {
a->pool_filled_once = true;
} else if (count > free) {
wg->pool_high = 1;
return 1;
} else {
a->pool_filled_once = true;
}
return 0;
}

/* A percentage of memory is configured to be held "free" as buffers for the
@@ -135,24 +109,20 @@ static void global_pool_check(slab_automove *a) {
*/
static void memcheck(slab_automove *a) {
unsigned int total_pages = 0;
if (current_time < a->last_memcheck_run + MEMCHECK_PERIOD)
return;
a->last_memcheck_run = current_time;

// FIXME: is there a cached counter for total pages alloced?
// technically we only really need to do this once as the pages are
// prefilled and ratio isn't a runtime change.
for (int n = 1; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
slab_stats_automove *sam = &a->sam_after[n];
total_pages += sam->total_pages;
unsigned int hold_free = (sam->total_pages * sam->chunks_per_page)
* a->free_ratio;
if (sam->chunks_per_page * MIN_PAGES_FREE > hold_free)
hold_free = sam->chunks_per_page * MIN_PAGES_FREE;
a->free_mem[n] = hold_free;
if (a->settings->ext_free_memchunks[n] != hold_free && a->pool_filled_once) {
a->settings->ext_free_memchunks[n] = hold_free;
}
}
// remember to add what remains in global pool.
// always update what remains in the global page pool
total_pages += a->sam_after[0].total_pages;
a->free_mem[0] = total_pages * a->free_ratio;
a->global_pool_watermark = total_pages * a->free_ratio;
if (a->global_pool_watermark < 2)
a->global_pool_watermark = 2;
settings.ext_global_pool_min = a->global_pool_watermark;
}

static struct window_data *get_window_data(slab_automove *a, int class) {
Expand All @@ -166,16 +136,11 @@ void slab_automove_extstore_run(void *arg, int *src, int *dst) {
struct window_data w_sum;
int oldest = -1;
uint64_t oldest_age = 0;
int youngest = -1;
uint64_t youngest_age = ~0;
bool too_free = false;
*src = -1;
*dst = -1;

global_pool_check(a);
struct window_global wg_sum;
memset(&wg_sum, 0, sizeof(struct window_global));
window_global_sum(a->window_global, &wg_sum, a->window_size);
int global_low = global_pool_check(a);
// fill after structs
fill_item_stats_automove(a->iam_after);
fill_slab_stats_automove(a->sam_after);
@@ -187,13 +152,13 @@ void slab_automove_extstore_run(void *arg, int *src, int *dst) {
for (n = POWER_SMALLEST; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
bool small_slab = a->sam_before[n].chunk_size < a->item_size
? true : false;
bool free_enough = false;
struct window_data *wd = get_window_data(a, n);
// summarize the window-up-to-now.
memset(&w_sum, 0, sizeof(struct window_data));
int w_offset = n * a->window_size;
window_sum(&a->window_data[w_offset], &w_sum, a->window_size);
memset(wd, 0, sizeof(struct window_data));
unsigned int free_target = a->sam_after[n].chunks_per_page * MIN_PAGES_FREE;

// if page delta, oom, or evicted delta, mark window dirty
// classes marked dirty cannot donate memory back to global pool.
@@ -205,15 +170,9 @@ void slab_automove_extstore_run(void *arg, int *src, int *dst) {
if (a->sam_after[n].total_pages - a->sam_before[n].total_pages > 0) {
wd->dirty = 1;
}
// Mark excess free if we're over the free mem limit for too long.
// "free_enough" means it is either wobbling, recently received a new
// page of memory, or the crawler is freeing memory.
if (a->sam_after[n].free_chunks > a->free_mem[n]) {
free_enough = true;
}
// double the free requirements means we may have memory we can
// reclaim to global, if it stays this way for the whole window.
if (a->sam_after[n].free_chunks > (a->free_mem[n] * 2) && a->free_mem[n] > 0) {
if (a->sam_after[n].free_chunks > (free_target * 2)) {
wd->excess_free = 1;
}

@@ -249,14 +208,6 @@ void slab_automove_extstore_run(void *arg, int *src, int *dst) {
oldest_age = age;
}

// don't count as youngest if it hasn't been using new chunks.
// (if it was relaxed recently, and is currently "free enough")
if (age < youngest_age && a->sam_after[n].total_pages != 0
&& w_sum.excess_free < a->window_size
&& !(w_sum.relaxed && free_enough)) {
youngest = n;
youngest_age = age;
}
}
}

@@ -268,31 +219,9 @@ void slab_automove_extstore_run(void *arg, int *src, int *dst) {
if (a->window_cur < a->window_size)
return;

if (wg_sum.pool_high >= a->window_size && !wg_sum.pool_low && youngest != -1) {
if (a->sam_after[youngest].free_chunks <= a->free_mem[youngest]) {
*src = 0;
*dst = youngest;
}
struct window_data *wd = get_window_data(a, youngest);
// "relaxing" here and below allows us to skip classes which will
// never grow or are growing slowly, more quickly finding other
// classes which violate the age ratio.
wd->relaxed = 1;
} else if (!too_free && wg_sum.pool_low && oldest != -1) {
if (!too_free && global_low && oldest != -1) {
*src = oldest;
*dst = 0;
} else if (!too_free && youngest != -1 && oldest != -1 && youngest != oldest) {
// if we have a youngest and oldest, and oldest is outside the ratio.
if (youngest_age < ((double)oldest_age * a->max_age_ratio)) {
struct window_data *wd = get_window_data(a, youngest);
wd->relaxed = 1;
// only actually assign more memory if it's absorbed what it has
if (a->sam_after[youngest].free_chunks <= a->free_mem[youngest]) {
*src = 0;
*dst = youngest;

}
}
}
return;
}
20 changes: 10 additions & 10 deletions storage.c
@@ -572,6 +572,7 @@ static int storage_write(void *storage, const int clsid, const int item_age) {
static pthread_t storage_write_tid;
static pthread_mutex_t storage_write_plock;
#define WRITE_SLEEP_MIN 500
#define MIN_PAGES_FREE 3

static void *storage_write_thread(void *arg) {
void *storage = arg;
@@ -591,6 +592,7 @@ static void *storage_write_thread(void *arg) {
while (1) {
// cache per-loop to avoid calls to the slabs_clsid() search loop
int min_class = slabs_clsid(settings.ext_item_size);
unsigned int global_pages = global_page_pool_size(NULL);
bool do_sleep = true;
counter++;
if (to_sleep > settings.ext_max_sleep)
@@ -601,7 +603,6 @@ static void *storage_write_thread(void *arg) {
bool mem_limit_reached = false;
unsigned int chunks_free;
int item_age;
int target = settings.ext_free_memchunks[x];
if (min_class > x || (backoff[x] && (counter % backoff[x] != 0))) {
// Long sleeps means we should retry classes sooner.
if (to_sleep > WRITE_SLEEP_MIN * 10)
@@ -610,21 +611,22 @@ static void *storage_write_thread(void *arg) {
}

// Avoid extra slab lock calls during heavy writing.
unsigned int chunks_perpage = 0;
chunks_free = slabs_available_chunks(x, &mem_limit_reached,
NULL);
&chunks_perpage);
unsigned int target = chunks_perpage * MIN_PAGES_FREE;

// storage_write() will fail and cut loop after filling write buffer.
while (1) {
// if we are low on chunks and no spare, push out early.
if (chunks_free < target && mem_limit_reached) {
if (chunks_free < target && global_pages <= settings.ext_global_pool_min) {
item_age = 0;
} else {
item_age = settings.ext_item_age;
}
if (storage_write(storage, x, item_age)) {
chunks_free++; // Allow stopping if we've done enough this loop
did_move = true;
do_sleep = false;
if (to_sleep > WRITE_SLEEP_MIN)
to_sleep /= 2;
} else {
@@ -635,15 +637,15 @@ static void *storage_write_thread(void *arg) {
if (!did_move) {
backoff[x]++;
} else if (backoff[x]) {
backoff[x] /= 2;
backoff[x] = 1;
}
}

// flip lock so we can be paused or stopped
pthread_mutex_unlock(&storage_write_plock);
if (do_sleep) {
usleep(to_sleep);
to_sleep *= 2;
to_sleep++;
}
pthread_mutex_lock(&storage_write_plock);
}
@@ -1379,10 +1381,8 @@ void *storage_init(void *conf) {
settings.ext_drop_under = cf->storage_file->page_count / 4;
}
crc32c_init();
/* Init free chunks to zero. */
for (int x = 0; x < MAX_NUMBER_OF_SLAB_CLASSES; x++) {
settings.ext_free_memchunks[x] = 0;
}

settings.ext_global_pool_min = 0;
storage = extstore_init(cf->storage_file, ext_cf, &eres);
if (storage == NULL) {
fprintf(stderr, "Failed to initialize external storage: %s\n",
