Skip to content

Commit

Permalink
[batch] Dont grab an X lock on the batches table in attempts after up…
Browse files Browse the repository at this point in the history
…date (hail-is#14379)

By joining against the `batches` row in the insert `FOR UPDATE`, these
queries try to grab an exclusive lock on the row in the `batches` table.
This is a problem if the transaction already has a shared lock on that
row, like it does in `mark_job_complete`. In that case any two
concurrent MJCs from the same batch will deadlock.
  • Loading branch information
daniel-goldstein authored Feb 29, 2024
1 parent 358542f commit c547bd8
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 5 deletions.
10 changes: 5 additions & 5 deletions batch/sql/estimated-current.sql
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,7 @@ FOR EACH ROW
BEGIN
DECLARE job_cores_mcpu INT;
DECLARE cur_billing_project VARCHAR(100);
DECLARE cur_user VARCHAR(100);
DECLARE msec_diff_rollup BIGINT;
DECLARE cur_n_tokens INT;
DECLARE rand_token INT;
Expand All @@ -549,6 +550,7 @@ BEGIN
WHERE batch_id = NEW.batch_id AND job_id = NEW.job_id;

SELECT billing_project INTO cur_billing_project FROM batches WHERE id = NEW.batch_id;
SELECT `user` INTO cur_user FROM batches WHERE id = NEW.batch_id;

SET msec_diff_rollup = (GREATEST(COALESCE(NEW.rollup_time - NEW.start_time, 0), 0) -
GREATEST(COALESCE(OLD.rollup_time - OLD.start_time, 0), 0));
Expand All @@ -557,12 +559,11 @@ BEGIN

IF msec_diff_rollup != 0 THEN
INSERT INTO aggregated_billing_project_user_resources_v3 (billing_project, user, resource_id, token, `usage`)
SELECT batches.billing_project, batches.`user`,
SELECT cur_billing_project, cur_user,
attempt_resources.deduped_resource_id,
rand_token,
msec_diff_rollup * quantity
FROM attempt_resources
JOIN batches ON batches.id = attempt_resources.batch_id
WHERE attempt_resources.batch_id = NEW.batch_id AND attempt_resources.job_id = NEW.job_id AND attempt_id = NEW.attempt_id
FOR UPDATE
ON DUPLICATE KEY UPDATE `usage` = aggregated_billing_project_user_resources_v3.`usage` + msec_diff_rollup * quantity;
Expand Down Expand Up @@ -591,13 +592,12 @@ BEGIN

INSERT INTO aggregated_billing_project_user_resources_by_date_v3 (billing_date, billing_project, user, resource_id, token, `usage`)
SELECT cur_billing_date,
batches.billing_project,
batches.`user`,
cur_billing_project,
cur_user,
attempt_resources.deduped_resource_id,
rand_token,
msec_diff_rollup * quantity
FROM attempt_resources
JOIN batches ON batches.id = attempt_resources.batch_id
WHERE attempt_resources.batch_id = NEW.batch_id AND attempt_resources.job_id = NEW.job_id AND attempt_id = NEW.attempt_id
FOR UPDATE
ON DUPLICATE KEY UPDATE `usage` = aggregated_billing_project_user_resources_by_date_v3.`usage` + msec_diff_rollup * quantity;
Expand Down
76 changes: 76 additions & 0 deletions batch/sql/fix-mark-job-complete-deadlocks.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
DELIMITER $$

DROP TRIGGER IF EXISTS attempts_after_update $$
CREATE TRIGGER attempts_after_update AFTER UPDATE ON attempts
FOR EACH ROW
BEGIN
DECLARE job_cores_mcpu INT;
DECLARE cur_billing_project VARCHAR(100);
DECLARE cur_user VARCHAR(100);
DECLARE msec_diff_rollup BIGINT;
DECLARE cur_n_tokens INT;
DECLARE rand_token INT;
DECLARE cur_billing_date DATE;

SELECT n_tokens INTO cur_n_tokens FROM globals LOCK IN SHARE MODE;
SET rand_token = FLOOR(RAND() * cur_n_tokens);

SELECT cores_mcpu INTO job_cores_mcpu FROM jobs
WHERE batch_id = NEW.batch_id AND job_id = NEW.job_id;

SELECT billing_project INTO cur_billing_project FROM batches WHERE id = NEW.batch_id;
SELECT `user` INTO cur_user FROM batches WHERE id = NEW.batch_id;

SET msec_diff_rollup = (GREATEST(COALESCE(NEW.rollup_time - NEW.start_time, 0), 0) -
GREATEST(COALESCE(OLD.rollup_time - OLD.start_time, 0), 0));

SET cur_billing_date = CAST(UTC_DATE() AS DATE);

IF msec_diff_rollup != 0 THEN
INSERT INTO aggregated_billing_project_user_resources_v3 (billing_project, user, resource_id, token, `usage`)
SELECT cur_billing_project, cur_user,
attempt_resources.deduped_resource_id,
rand_token,
msec_diff_rollup * quantity
FROM attempt_resources
WHERE attempt_resources.batch_id = NEW.batch_id AND attempt_resources.job_id = NEW.job_id AND attempt_id = NEW.attempt_id
FOR UPDATE
ON DUPLICATE KEY UPDATE `usage` = aggregated_billing_project_user_resources_v3.`usage` + msec_diff_rollup * quantity;

INSERT INTO aggregated_job_group_resources_v3 (batch_id, job_group_id, resource_id, token, `usage`)
SELECT attempt_resources.batch_id,
job_group_self_and_ancestors.ancestor_id,
attempt_resources.deduped_resource_id,
rand_token,
msec_diff_rollup * quantity
FROM attempt_resources
LEFT JOIN jobs ON attempt_resources.batch_id = jobs.batch_id AND attempt_resources.job_id = jobs.job_id
LEFT JOIN job_group_self_and_ancestors ON jobs.batch_id = job_group_self_and_ancestors.batch_id AND jobs.job_group_id = job_group_self_and_ancestors.job_group_id
WHERE attempt_resources.batch_id = NEW.batch_id AND attempt_resources.job_id = NEW.job_id AND attempt_resources.attempt_id = NEW.attempt_id
FOR UPDATE
ON DUPLICATE KEY UPDATE `usage` = aggregated_job_group_resources_v3.`usage` + msec_diff_rollup * quantity;

INSERT INTO aggregated_job_resources_v3 (batch_id, job_id, resource_id, `usage`)
SELECT attempt_resources.batch_id, attempt_resources.job_id,
attempt_resources.deduped_resource_id,
msec_diff_rollup * quantity
FROM attempt_resources
WHERE attempt_resources.batch_id = NEW.batch_id AND attempt_resources.job_id = NEW.job_id AND attempt_id = NEW.attempt_id
FOR UPDATE
ON DUPLICATE KEY UPDATE `usage` = aggregated_job_resources_v3.`usage` + msec_diff_rollup * quantity;

INSERT INTO aggregated_billing_project_user_resources_by_date_v3 (billing_date, billing_project, user, resource_id, token, `usage`)
SELECT cur_billing_date,
cur_billing_project,
cur_user,
attempt_resources.deduped_resource_id,
rand_token,
msec_diff_rollup * quantity
FROM attempt_resources
WHERE attempt_resources.batch_id = NEW.batch_id AND attempt_resources.job_id = NEW.job_id AND attempt_id = NEW.attempt_id
FOR UPDATE
ON DUPLICATE KEY UPDATE `usage` = aggregated_billing_project_user_resources_by_date_v3.`usage` + msec_diff_rollup * quantity;
END IF;
END $$

DELIMITER ;
3 changes: 3 additions & 0 deletions build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2365,6 +2365,9 @@ steps:
- name: finalize-job-groups
script: /io/sql/finalize-job-groups.sql
online: true
- name: fix-mark-job-complete-deadlocks
script: /io/sql/fix-mark-job-complete-deadlocks.sql
online: true
inputs:
- from: /repo/batch/sql
to: /io/sql
Expand Down

0 comments on commit c547bd8

Please sign in to comment.