Skip to content

Commit 6cb9d51

Browse files
committed
Fix leakage of cost_limit when multiple autovacuum workers are active.
When using default autovacuum_vac_cost_limit, autovac_balance_cost relied on VacuumCostLimit to contain the correct global value ... but after the first time through in a particular worker process, it didn't, because we'd trashed it in previous iterations. Depending on the state of other autovac workers, this could result in a steady reduction of the effective cost_limit setting as a particular worker processed more and more tables, causing it to go slower and slower. Spotted by Simon Poole (bug #5759). Fix by saving and restoring the GUC variables in the loop in do_autovacuum. In passing, improve a few comments. Back-patch to 8.3 ... the cost rebalancing code has been buggy since it was put in.
1 parent f068299 commit 6cb9d51

File tree

1 file changed

+45
-18
lines changed

1 file changed

+45
-18
lines changed

src/backend/postmaster/autovacuum.c

+45-18
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ typedef struct autovac_table
184184
*
185185
* wi_links entry into free list or running list
186186
* wi_dboid OID of the database this worker is supposed to work on
187-
* wi_tableoid OID of the table currently being vacuumed
187+
* wi_tableoid OID of the table currently being vacuumed, if any
188188
* wi_proc pointer to PGPROC of the running worker, NULL if not started
189189
* wi_launchtime Time at which this worker was launched
190190
* wi_cost_* Vacuum cost-based delay parameters current in this worker
@@ -1651,7 +1651,7 @@ FreeWorkerInfo(int code, Datum arg)
16511651
* limit setting of the remaining workers.
16521652
*
16531653
* We somewhat ignore the risk that the launcher changes its PID
1654-
* between we reading it and the actual kill; we expect ProcKill to be
1654+
* between us reading it and the actual kill; we expect ProcKill to be
16551655
* called shortly after us, and we assume that PIDs are not reused too
16561656
* quickly after a process exits.
16571657
*/
@@ -1695,16 +1695,18 @@ AutoVacuumUpdateDelay(void)
16951695

16961696
/*
16971697
* autovac_balance_cost
1698-
* Recalculate the cost limit setting for each active workers.
1698+
* Recalculate the cost limit setting for each active worker.
16991699
*
17001700
* Caller must hold the AutovacuumLock in exclusive mode.
17011701
*/
17021702
static void
17031703
autovac_balance_cost(void)
17041704
{
1705-
WorkerInfo worker;
1706-
17071705
/*
1706+
* The idea here is that we ration out I/O equally. The amount of I/O
1707+
* that a worker can consume is determined by cost_limit/cost_delay, so
1708+
* we try to equalize those ratios rather than the raw limit settings.
1709+
*
17081710
* note: in cost_limit, zero also means use value from elsewhere, because
17091711
* zero is not a valid value.
17101712
*/
@@ -1714,6 +1716,7 @@ autovac_balance_cost(void)
17141716
autovacuum_vac_cost_delay : VacuumCostDelay);
17151717
double cost_total;
17161718
double cost_avail;
1719+
WorkerInfo worker;
17171720

17181721
/* not set? nothing to do */
17191722
if (vac_cost_limit <= 0 || vac_cost_delay <= 0)
@@ -1740,7 +1743,7 @@ autovac_balance_cost(void)
17401743
return;
17411744

17421745
/*
1743-
* Adjust each cost limit of active workers to balance the total of cost
1746+
* Adjust cost limit of each active worker to balance the total of cost
17441747
* limit to autovacuum_vacuum_cost_limit.
17451748
*/
17461749
cost_avail = (double) vac_cost_limit / vac_cost_delay;
@@ -1756,14 +1759,19 @@ autovac_balance_cost(void)
17561759
(cost_avail * worker->wi_cost_limit_base / cost_total);
17571760

17581761
/*
1759-
* We put a lower bound of 1 to the cost_limit, to avoid division-
1760-
* by-zero in the vacuum code.
1762+
* We put a lower bound of 1 on the cost_limit, to avoid division-
1763+
* by-zero in the vacuum code. Also, in case of roundoff trouble
1764+
* in these calculations, let's be sure we don't ever set
1765+
* cost_limit to more than the base value.
17611766
*/
1762-
worker->wi_cost_limit = Max(Min(limit, worker->wi_cost_limit_base), 1);
1763-
1764-
elog(DEBUG2, "autovac_balance_cost(pid=%u db=%u, rel=%u, cost_limit=%d, cost_delay=%d)",
1765-
worker->wi_proc->pid, worker->wi_dboid,
1766-
worker->wi_tableoid, worker->wi_cost_limit, worker->wi_cost_delay);
1767+
worker->wi_cost_limit = Max(Min(limit,
1768+
worker->wi_cost_limit_base),
1769+
1);
1770+
1771+
elog(DEBUG2, "autovac_balance_cost(pid=%u db=%u, rel=%u, cost_limit=%d, cost_limit_base=%d, cost_delay=%d)",
1772+
worker->wi_proc->pid, worker->wi_dboid, worker->wi_tableoid,
1773+
worker->wi_cost_limit, worker->wi_cost_limit_base,
1774+
worker->wi_cost_delay);
17671775
}
17681776

17691777
worker = (WorkerInfo) SHMQueueNext(&AutoVacuumShmem->av_runningWorkers,
@@ -2015,6 +2023,8 @@ do_autovacuum(void)
20152023
char *datname,
20162024
*nspname,
20172025
*relname;
2026+
int stdVacuumCostDelay;
2027+
int stdVacuumCostLimit;
20182028

20192029
CHECK_FOR_INTERRUPTS();
20202030

@@ -2086,11 +2096,15 @@ do_autovacuum(void)
20862096
MyWorkerInfo->wi_tableoid = relid;
20872097
LWLockRelease(AutovacuumScheduleLock);
20882098

2089-
/* Set the initial vacuum cost parameters for this table */
2090-
VacuumCostDelay = tab->at_vacuum_cost_delay;
2091-
VacuumCostLimit = tab->at_vacuum_cost_limit;
2099+
/*
2100+
* Remember the prevailing values of the vacuum cost GUCs. We have
2101+
* to restore these at the bottom of the loop, else we'll compute
2102+
* wrong values in the next iteration of autovac_balance_cost().
2103+
*/
2104+
stdVacuumCostDelay = VacuumCostDelay;
2105+
stdVacuumCostLimit = VacuumCostLimit;
20922106

2093-
/* Last fixups before actually starting to work */
2107+
/* Must hold AutovacuumLock while mucking with cost balance info */
20942108
LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
20952109

20962110
/* advertise my cost delay parameters for the balancing algorithm */
@@ -2101,6 +2115,9 @@ do_autovacuum(void)
21012115
/* do a balance */
21022116
autovac_balance_cost();
21032117

2118+
/* set the active cost parameters from the result of that */
2119+
AutoVacuumUpdateDelay();
2120+
21042121
/* done */
21052122
LWLockRelease(AutovacuumLock);
21062123

@@ -2182,10 +2199,20 @@ do_autovacuum(void)
21822199
if (relname)
21832200
pfree(relname);
21842201

2185-
/* remove my info from shared memory */
2202+
/*
2203+
* Remove my info from shared memory. We could, but intentionally
2204+
* don't, clear wi_cost_limit and friends --- this is on the
2205+
* assumption that we probably have more to do with similar cost
2206+
* settings, so we don't want to give up our share of I/O for a very
2207+
* short interval and thereby thrash the global balance.
2208+
*/
21862209
LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
21872210
MyWorkerInfo->wi_tableoid = InvalidOid;
21882211
LWLockRelease(AutovacuumLock);
2212+
2213+
/* restore vacuum cost GUCs for the next iteration */
2214+
VacuumCostDelay = stdVacuumCostDelay;
2215+
VacuumCostLimit = stdVacuumCostLimit;
21892216
}
21902217

21912218
/*

0 commit comments

Comments
 (0)