diff --git a/HOWTO b/HOWTO
index 8073240ef1..c46b883424 100644
--- a/HOWTO
+++ b/HOWTO
@@ -1693,3 +1693,18 @@ write		Write 'length' bytes beginning from 'offset'
 sync		fsync() the file
 datasync	fdatasync() the file
 trim		trim the given file from the given 'offset' for 'length' bytes
+
+
+9.0 CPU idleness profiling
+
+In some cases, we want to understand the CPU overhead of a test. For
+example, we test patches with the specific goal of reducing CPU usage.
+For this, fio implements a balloon approach: it creates a thread per
+CPU that runs at idle priority, meaning that it only runs when nobody
+else needs the CPU. By measuring the amount of work completed by each
+such thread, the idleness of each CPU can be derived.
+
+A unit of work is defined as touching a full page of unsigned characters.
+The mean and standard deviation of the time to complete a unit of work
+are reported in the "unit work" section. Options select either detailed
+percpu idleness, or overall system idleness aggregated from the percpu stats.
diff --git a/Makefile b/Makefile
index 52ec4a2dad..a4a478d9b4 100644
--- a/Makefile
+++ b/Makefile
@@ -28,7 +28,8 @@ SOURCE := gettime.c fio.c ioengines.c init.c stat.c log.c time.c filesetup.c \
 		engines/mmap.c engines/sync.c engines/null.c engines/net.c \
 		memalign.c server.c client.c iolog.c backend.c libfio.c flow.c \
 		json.c lib/zipf.c lib/axmap.c lib/lfsr.c gettime-thread.c \
-		helpers.c lib/flist_sort.c lib/hweight.c lib/getrusage.c
+		helpers.c lib/flist_sort.c lib/hweight.c lib/getrusage.c \
+		idletime.c
 
 ifdef CONFIG_64BIT_LLP64
 	CFLAGS += -DBITS_PER_LONG=32
diff --git a/README b/README
index 7c4552d98b..c43b795c21 100644
--- a/README
+++ b/README
@@ -145,6 +145,9 @@ $ fio
 	--max-jobs		Maximum number of threads/processes to support
 	--server=args		Start backend server. See Client/Server section.
 	--client=host		Connect to specified backend.
+	--idle-prof=option	Report cpu idleness on a system or percpu basis
+				(option=system,percpu) or run unit work
+				calibration only (option=calibrate).
 
 
 Any parameters following the options will be assumed to be job files,
diff --git a/backend.c b/backend.c
index 87810e89d6..218ae2545d 100644
--- a/backend.c
+++ b/backend.c
@@ -51,6 +51,7 @@
 #include "memalign.h"
 #include "server.h"
 #include "lib/getrusage.h"
+#include "idletime.h"
 
 static pthread_t disk_util_thread;
 static struct fio_mutex *disk_thread_mutex;
@@ -1499,6 +1500,8 @@ static void run_threads(void)
 	if (fio_gtod_offload && fio_start_gtod_thread())
 		return;
 
+	fio_idle_prof_init();
+
 	set_sig_handlers();
@@ -1556,6 +1559,9 @@
 		}
 	}
 
+	/* start the idle threads before the io threads start running */
+	fio_idle_prof_start();
+
 	set_genesis_time();
 
 	while (todo) {
@@ -1718,6 +1724,8 @@
 		usleep(10000);
 	}
 
+	fio_idle_prof_stop();
+
 	update_io_ticks();
 	fio_unpin_memory();
 }
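
The balloon approach described in the HOWTO hunk above can be illustrated
outside of fio. The demo below is a sketch, not part of this patch: it
assumes Linux's SCHED_IDLE policy and the Linux behavior that
sched_setscheduler() with pid 0 applies to the calling thread; all names
in it are made up. Compile with `cc -pthread`.

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

static volatile int stop;
static volatile unsigned long units;	/* pages touched by the balloon */

static void *balloon(void *unused)
{
	static unsigned char page[4096];
	struct sched_param p = { .sched_priority = 0 };
	unsigned long j = 0;

	(void) unused;

	/* SCHED_IDLE: only run when nobody else wants this CPU */
	sched_setscheduler(0, SCHED_IDLE, &p);

	while (!stop) {
		/* one unit of work: touch a full page */
		for (unsigned k = 0; k < sizeof(page); k++)
			page[(k + j) % sizeof(page)] = k % 256;
		j++;
		units++;
	}
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, balloon, NULL);
	sleep(5);	/* run the workload to be measured here */
	stop = 1;
	pthread_join(t, NULL);

	/* dividing by an unloaded baseline yields an idleness estimate */
	printf("balloon completed %lu units of work in 5s\n", units);
	return 0;
}
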
diff --git a/fio.1 b/fio.1
index 1f81ea7bae..6b7e945909 100644
--- a/fio.1
+++ b/fio.1
@@ -77,6 +77,9 @@ Background a fio server, writing the pid to the given pid file.
 .TP
 .BI \-\-client \fR=\fPhost
 Instead of running the jobs locally, send and run them on the given host.
+.TP
+.BI \-\-idle\-prof \fR=\fPoption
+Report cpu idleness on a system or percpu basis (\fIoption\fP=system,percpu) or run unit work calibration only (\fIoption\fP=calibrate).
 .SH "JOB FILE FORMAT"
 Job files are in `ini' format. They consist of one or more
 job definitions, which begin with a job name in square brackets and
diff --git a/idletime.c b/idletime.c
new file mode 100644
index 0000000000..6100134f69
--- /dev/null
+++ b/idletime.c
@@ -0,0 +1,472 @@
+#include <math.h>
+#include "json.h"
+#include "idletime.h"
+
+static volatile struct idle_prof_common ipc;
+
+/* Get the time to complete a unit of work on a particular cpu.
+ * The minimum time over CALIBRATE_RUNS runs is returned.
+ */
+static double calibrate_unit(unsigned char *data)
+{
+	unsigned long t, i, j, k;
+	struct timeval tps;
+	double tunit = 0.0;
+
+	for (i = 0; i < CALIBRATE_RUNS; i++) {
+
+		fio_gettime(&tps, NULL);
+
+		/* scale for less variance */
+		for (j = 0; j < CALIBRATE_SCALE; j++) {
+			/* unit of work: touch a full page */
+			for (k = 0; k < page_size; k++) {
+				data[(k + j) % page_size] = k % 256;
+				/* stop is not expected here; this matches
+				 * the same check in the profiling loop.
+				 */
+				if (ipc.status == IDLE_PROF_STATUS_PROF_STOP)
+					return 0.0;
+			}
+		}
+
+		t = utime_since_now(&tps);
+		if (!t)
+			continue;
+
+		/* get the minimum time to complete CALIBRATE_SCALE units */
+		if ((i == 0) || ((double)t < tunit))
+			tunit = (double)t;
+	}
+
+	return tunit / CALIBRATE_SCALE;
+}
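+
+/*
+ * Note on the calibration above: the minimum across CALIBRATE_RUNS is
+ * kept rather than the mean, because preemption can only make a run
+ * slower, never faster, so the fastest run is closest to the true cost
+ * of a unit of work. Illustrative numbers: if a unit costs 0.25us and
+ * a balloon thread later completes 1.2M units over a 1,000,000us
+ * profiling window, that CPU was 1.2e6 * 0.25 / 1e6 = 30% idle.
+ */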
+
+static void *idle_prof_thread_fn(void *data)
+{
+	int retval;
+	unsigned long j, k;
+	struct idle_prof_thread *ipt = data;
+
+	/* wait for all threads to be spawned */
+	pthread_mutex_lock(&ipt->init_lock);
+
+	/* exit if any other thread failed to start */
+	if (ipc.status == IDLE_PROF_STATUS_ABORT)
+		return NULL;
+
+#if defined(FIO_HAVE_CPU_AFFINITY)
+	os_cpu_mask_t cpu_mask;
+	memset(&cpu_mask, 0, sizeof(cpu_mask));
+	fio_cpu_set(&cpu_mask, ipt->cpu);
+
+	if ((retval = fio_setaffinity(gettid(), cpu_mask)) == -1)
+		log_err("fio: fio_setaffinity failed\n");
+#else
+	retval = -1;
+	log_err("fio: fio_setaffinity not supported\n");
+#endif
+	if (retval == -1) {
+		ipt->state = TD_EXITED;
+		pthread_mutex_unlock(&ipt->init_lock);
+		return NULL;
+	}
+
+	ipt->cali_time = calibrate_unit(ipt->data);
+
+	/* delay setting the IDLE class until now, for better calibration accuracy */
+#if defined(FIO_HAVE_SCHED_IDLE)
+	if ((retval = fio_set_sched_idle()))
+		log_err("fio: fio_set_sched_idle failed\n");
+#else
+	retval = -1;
+	log_err("fio: fio_set_sched_idle not supported\n");
+#endif
+	if (retval == -1) {
+		ipt->state = TD_EXITED;
+		pthread_mutex_unlock(&ipt->init_lock);
+		return NULL;
+	}
+
+	ipt->state = TD_INITIALIZED;
+
+	/* signal the main thread that calibration is done */
+	pthread_cond_signal(&ipt->cond);
+	pthread_mutex_unlock(&ipt->init_lock);
+
+	/* wait for the other threads to finish calibrating */
+	pthread_mutex_lock(&ipt->start_lock);
+
+	/* exit if other threads failed to initialize */
+	if (ipc.status == IDLE_PROF_STATUS_ABORT)
+		return NULL;
+
+	/* exit if we are doing calibration only */
+	if (ipc.status == IDLE_PROF_STATUS_CALI_STOP)
+		return NULL;
+
+	fio_gettime(&ipt->tps, NULL);
+	ipt->state = TD_RUNNING;
+
+	j = 0;
+	while (1) {
+		for (k = 0; k < page_size; k++) {
+			ipt->data[(k + j) % page_size] = k % 256;
+			if (ipc.status == IDLE_PROF_STATUS_PROF_STOP) {
+				fio_gettime(&ipt->tpe, NULL);
+				goto idle_prof_done;
+			}
+		}
+		j++;
+	}
+
+idle_prof_done:
+
+	ipt->loops = j + (double)k / page_size;
+	ipt->state = TD_EXITED;
+	pthread_mutex_unlock(&ipt->start_lock);
+
+	return NULL;
+}
+
+/* calculate the mean and standard deviation of the time to complete a
+ * unit of work
+ */
+static void calibration_stats(void)
+{
+	int i;
+	double sum = 0.0, var = 0.0;
+	struct idle_prof_thread *ipt;
+
+	for (i = 0; i < ipc.nr_cpus; i++) {
+		ipt = &ipc.ipts[i];
+		sum += ipt->cali_time;
+	}
+
+	ipc.cali_mean = sum / ipc.nr_cpus;
+
+	for (i = 0; i < ipc.nr_cpus; i++) {
+		ipt = &ipc.ipts[i];
+		var += pow(ipt->cali_time - ipc.cali_mean, 2);
+	}
+
+	/* sample standard deviation, hence nr_cpus - 1 */
+	ipc.cali_stddev = sqrt(var / (ipc.nr_cpus - 1));
+}
+
+void fio_idle_prof_init(void)
+{
+	int i, ret;
+	struct timeval tp;
+	struct timespec ts;
+	pthread_attr_t tattr;
+	struct idle_prof_thread *ipt;
+
+	ipc.nr_cpus = cpus_online();
+	ipc.status = IDLE_PROF_STATUS_OK;
+
+	if (ipc.opt == IDLE_PROF_OPT_NONE)
+		return;
+
+	if ((ret = pthread_attr_init(&tattr))) {
+		log_err("fio: pthread_attr_init %s\n", strerror(ret));
+		return;
+	}
+	if ((ret = pthread_attr_setscope(&tattr, PTHREAD_SCOPE_SYSTEM))) {
+		log_err("fio: pthread_attr_setscope %s\n", strerror(ret));
+		return;
+	}
+
+	ipc.ipts = malloc(ipc.nr_cpus * sizeof(struct idle_prof_thread));
+	if (!ipc.ipts) {
+		log_err("fio: malloc failed\n");
+		return;
+	}
+
+	ipc.buf = malloc(ipc.nr_cpus * page_size);
+	if (!ipc.buf) {
+		log_err("fio: malloc failed\n");
+		free(ipc.ipts);
+		return;
+	}
+
+	/* profiling aborts on any single thread failure, since the
+	 * result won't be accurate if any cpu is left unused.
+	 */
+	for (i = 0; i < ipc.nr_cpus; i++) {
+		ipt = &ipc.ipts[i];
+
+		ipt->cpu = i;
+		ipt->state = TD_NOT_CREATED;
+		ipt->data = (unsigned char *)(ipc.buf + page_size * i);
+
+		if ((ret = pthread_mutex_init(&ipt->init_lock, NULL))) {
+			ipc.status = IDLE_PROF_STATUS_ABORT;
+			log_err("fio: pthread_mutex_init %s\n", strerror(ret));
+			break;
+		}
+
+		if ((ret = pthread_mutex_init(&ipt->start_lock, NULL))) {
+			ipc.status = IDLE_PROF_STATUS_ABORT;
+			log_err("fio: pthread_mutex_init %s\n", strerror(ret));
+			break;
+		}
+
+		if ((ret = pthread_cond_init(&ipt->cond, NULL))) {
+			ipc.status = IDLE_PROF_STATUS_ABORT;
+			log_err("fio: pthread_cond_init %s\n", strerror(ret));
+			break;
+		}
+
+		/* make sure all threads are spawned before they start */
+		pthread_mutex_lock(&ipt->init_lock);
+
+		/* make sure all threads finish init before profiling starts */
+		pthread_mutex_lock(&ipt->start_lock);
+
+		if ((ret = pthread_create(&ipt->thread, &tattr, idle_prof_thread_fn, ipt))) {
+			ipc.status = IDLE_PROF_STATUS_ABORT;
+			log_err("fio: pthread_create %s\n", strerror(ret));
+			break;
+		} else {
+			ipt->state = TD_CREATED;
+		}
+
+		if ((ret = pthread_detach(ipt->thread))) {
+			/* log the error and let the thread spin */
+			log_err("fio: pthread_detach %s\n", strerror(ret));
+		}
+	}
+
+	/* let the good threads continue, so that they can exit
+	 * if errors occurred on other threads previously.
+	 */
+	for (i = 0; i < ipc.nr_cpus; i++) {
+		ipt = &ipc.ipts[i];
+		pthread_mutex_unlock(&ipt->init_lock);
+	}
+
+	if (ipc.status == IDLE_PROF_STATUS_ABORT)
+		return;
+
+	/* wait for calibration to finish */
+	for (i = 0; i < ipc.nr_cpus; i++) {
+		ipt = &ipc.ipts[i];
+		pthread_mutex_lock(&ipt->init_lock);
+		while ((ipt->state != TD_EXITED) &&
+		       (ipt->state != TD_INITIALIZED)) {
+			fio_gettime(&tp, NULL);
+			ts.tv_sec = tp.tv_sec + 1;
+			ts.tv_nsec = tp.tv_usec * 1000;
+			pthread_cond_timedwait(&ipt->cond, &ipt->init_lock, &ts);
+		}
+		pthread_mutex_unlock(&ipt->init_lock);
+
+		/* a thread that failed to initialize will make the other
+		 * threads abort later, after fio_idle_prof_start.
+		 */
+		if (ipt->state == TD_EXITED)
+			ipc.status = IDLE_PROF_STATUS_ABORT;
+	}
+
+	if (ipc.status != IDLE_PROF_STATUS_ABORT)
+		calibration_stats();
+	else
+		ipc.cali_mean = ipc.cali_stddev = 0.0;
+
+	if (ipc.opt == IDLE_PROF_OPT_CALI)
+		ipc.status = IDLE_PROF_STATUS_CALI_STOP;
+}
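+
+/*
+ * How the hooks below are driven: backend.c calls fio_idle_prof_init()
+ * before spawning the job threads (each worker calibrates, then parks
+ * on its pre-locked start_lock), fio_idle_prof_start() right before
+ * the jobs begin (all balloons are released at once), and
+ * fio_idle_prof_stop() after the jobs have been reaped, so the
+ * profiled window matches the time the workload actually had the CPUs.
+ */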
+
+void fio_idle_prof_start(void)
+{
+	int i;
+	struct idle_prof_thread *ipt;
+
+	if (ipc.opt == IDLE_PROF_OPT_NONE)
+		return;
+
+	/* unlock regardless of whether abort is set or not */
+	for (i = 0; i < ipc.nr_cpus; i++) {
+		ipt = &ipc.ipts[i];
+		pthread_mutex_unlock(&ipt->start_lock);
+	}
+}
+
+void fio_idle_prof_stop(void)
+{
+	int i;
+	uint64_t runt;
+	struct timeval tp;
+	struct timespec ts;
+	struct idle_prof_thread *ipt;
+
+	if (ipc.opt == IDLE_PROF_OPT_NONE)
+		return;
+
+	if (ipc.opt == IDLE_PROF_OPT_CALI)
+		return;
+
+	ipc.status = IDLE_PROF_STATUS_PROF_STOP;
+
+	/* wait for all threads to exit from profiling */
+	for (i = 0; i < ipc.nr_cpus; i++) {
+		ipt = &ipc.ipts[i];
+		pthread_mutex_lock(&ipt->start_lock);
+		while ((ipt->state != TD_EXITED) &&
+		       (ipt->state != TD_NOT_CREATED)) {
+			fio_gettime(&tp, NULL);
+			ts.tv_sec = tp.tv_sec + 1;
+			ts.tv_nsec = tp.tv_usec * 1000;
+			/* timed wait in case a signal is not received */
+			pthread_cond_timedwait(&ipt->cond, &ipt->start_lock, &ts);
+		}
+		pthread_mutex_unlock(&ipt->start_lock);
+
+		/* calculate idleness */
+		if (ipc.cali_mean != 0.0) {
+			runt = utime_since(&ipt->tps, &ipt->tpe);
+			ipt->idleness = ipt->loops * ipc.cali_mean / runt;
+		} else
+			ipt->idleness = 0.0;
+	}
+
+	/* memory allocations are freed via the explicit
+	 * fio_idle_prof_cleanup() call, after the profiling stats
+	 * have been collected by their consumers.
+	 */
+}
+
+/* returns the system idle percentage when cpu is -1;
+ * returns the idle percentage of one cpu otherwise.
+ */
+static double fio_idle_prof_cpu_stat(int cpu)
+{
+	int i, nr_cpus = ipc.nr_cpus;
+	struct idle_prof_thread *ipt;
+	double p = 0.0;
+
+	if (ipc.opt == IDLE_PROF_OPT_NONE)
+		return 0.0;
+
+	if ((cpu >= nr_cpus) || (cpu < -1)) {
+		log_err("fio: idle profiling invalid cpu index\n");
+		return 0.0;
+	}
+
+	if (cpu == -1) {
+		for (i = 0; i < nr_cpus; i++) {
+			ipt = &ipc.ipts[i];
+			p += ipt->idleness;
+		}
+		p /= nr_cpus;
+	} else {
+		ipt = &ipc.ipts[cpu];
+		p = ipt->idleness;
+	}
+
+	return p * 100.0;
+}
+
+void fio_idle_prof_cleanup(void)
+{
+	if (ipc.ipts) {
+		free(ipc.ipts);
+		ipc.ipts = NULL;
+	}
+
+	if (ipc.buf) {
+		free(ipc.buf);
+		ipc.buf = NULL;
+	}
+}
+
+int fio_idle_prof_parse_opt(const char *args)
+{
+	ipc.opt = IDLE_PROF_OPT_NONE; /* default */
+
+	if (!args) {
+		log_err("fio: empty idle-prof option string\n");
+		return -1;
+	}
+
+#if defined(FIO_HAVE_CPU_AFFINITY) && defined(FIO_HAVE_SCHED_IDLE)
+	if (strcmp("calibrate", args) == 0) {
+		ipc.opt = IDLE_PROF_OPT_CALI;
+		fio_idle_prof_init();
+		fio_idle_prof_start();
+		fio_idle_prof_stop();
+		show_idle_prof_stats(FIO_OUTPUT_NORMAL, NULL);
+		return 1;
+	} else if (strcmp("system", args) == 0) {
+		ipc.opt = IDLE_PROF_OPT_SYSTEM;
+		return 0;
+	} else if (strcmp("percpu", args) == 0) {
+		ipc.opt = IDLE_PROF_OPT_PERCPU;
+		return 0;
+	} else {
+		log_err("fio: incorrect idle-prof option: %s\n", args);
+		return -1;
+	}
+#else
+	log_err("fio: idle-prof not supported on this platform\n");
+	return -1;
+#endif
+}
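+
+/*
+ * Return-value contract relied upon by parse_cmd_line() in init.c:
+ * 1 means fio should exit without running a job (calibration only),
+ * -1 means bad input (mapped to a non-zero exit value), and 0 means
+ * profiling is armed and will run alongside the job.
+ */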
idleness:\n"); + + if (ipc.opt >= IDLE_PROF_OPT_SYSTEM) + log_info(" system: %3.2f%%\n", fio_idle_prof_cpu_stat(-1)); + + if (ipc.opt == IDLE_PROF_OPT_PERCPU) { + log_info(" percpu: %3.2f%%", fio_idle_prof_cpu_stat(0)); + for (i=1; i= IDLE_PROF_OPT_CALI) { + log_info(" unit work: mean=%3.2fus,", ipc.cali_mean); + log_info(" stddev=%3.2f\n", ipc.cali_stddev); + } + + /* dynamic mem allocations can now be freed */ + if (ipc.opt != IDLE_PROF_OPT_NONE) + fio_idle_prof_cleanup(); + + return; + } + + if ((ipc.opt != IDLE_PROF_OPT_NONE) && (output == FIO_OUTPUT_JSON)) { + if (!parent) + return; + + tmp = json_create_object(); + if (!tmp) + return; + + json_object_add_value_object(parent, "cpu_idleness", tmp); + json_object_add_value_float(tmp, "system", fio_idle_prof_cpu_stat(-1)); + + if (ipc.opt == IDLE_PROF_OPT_PERCPU) { + for (i=0; i"); printf("\n Jens Axboe \n"); } @@ -1611,6 +1620,14 @@ int parse_cmd_line(int argc, char *argv[]) case 'D': pid_file = strdup(optarg); break; + case 'I': + if ((ret = fio_idle_prof_parse_opt(optarg))) { + /* exit on error and calibration only */ + do_exit++; + if (ret == -1) + exit_val = 1; + } + break; case 'C': if (is_backend) { log_err("fio: can't be both client and server\n"); diff --git a/os/os-linux.h b/os/os-linux.h index 4e837da937..acc5a202aa 100644 --- a/os/os-linux.h +++ b/os/os-linux.h @@ -35,6 +35,7 @@ #define FIO_HAVE_TRIM #define FIO_HAVE_BINJECT #define FIO_HAVE_GETTID +#define FIO_HAVE_SCHED_IDLE #define FIO_USE_GENERIC_INIT_RANDOM_STATE #ifdef MAP_HUGETLB @@ -246,4 +247,10 @@ static inline int os_trim(int fd, unsigned long long start, return errno; } +static inline int fio_set_sched_idle(void) +{ + struct sched_param p = { .sched_priority = 0, }; + return sched_setscheduler(gettid(), SCHED_IDLE, &p); +} + #endif diff --git a/os/os-windows.h b/os/os-windows.h index ef71dd708c..2d57891f41 100644 --- a/os/os-windows.h +++ b/os/os-windows.h @@ -21,6 +21,7 @@ #define FIO_HAVE_CPU_AFFINITY #define FIO_HAVE_CHARDEV_SIZE #define FIO_HAVE_GETTID +#define FIO_HAVE_SCHED_IDLE #define FIO_USE_GENERIC_RAND #define FIO_PREFERRED_ENGINE "windowsaio" @@ -247,4 +248,11 @@ static inline int init_random_state(struct thread_data *td, unsigned long *rand_ } +static inline int fio_set_sched_idle(void) +{ + /* SetThreadPriority returns nonzero for success */ + return (SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_IDLE))? 0 : -1; +} + + #endif /* FIO_OS_WINDOWS_H */ diff --git a/stat.c b/stat.c index 5cc6b7b914..7e2feea25b 100644 --- a/stat.c +++ b/stat.c @@ -12,6 +12,7 @@ #include "lib/ieee754.h" #include "json.h" #include "lib/getrusage.h" +#include "idletime.h" void update_rusage_stat(struct thread_data *td) { @@ -1335,6 +1336,8 @@ void show_run_stats(void) /* disk util stats, if any */ show_disk_util(1, root); + show_idle_prof_stats(FIO_OUTPUT_JSON, root); + json_print_object(root); log_info("\n"); json_free_object(root); @@ -1355,6 +1358,8 @@ void show_run_stats(void) else if (output_format == FIO_OUTPUT_NORMAL) show_disk_util(0, NULL); + show_idle_prof_stats(FIO_OUTPUT_NORMAL, NULL); + free(runstats); free(threadstats); }