Skip to content

Commit 0049350

Browse files
committed
PG-1604: Improve last key LSN calculation logic
Previously we simply set the LSN for the new key to the first write location. This is, however, not correct, as there are several corner cases around this:

* recovery / replication might write old LSNs
* we can't handle multiple keys with the same TLI/LSN, which can happen with quick restarts without writes

To support these cases, this commit makes the following changes:

* We only activate new keys outside crash recovery, or immediately if encryption is turned off
* We also take the already existing last key into account (if it exists), and only activate a new key once we have progressed past its start location

The remaining changes are just supporting infrastructure for this:

* Since we might rewrite old records, we use the already existing keys for those writes, not the active last key
* We prefetch existing keys during initialization, so that fetching doesn't accidentally happen in the critical section during a write

There is a remaining bug with stopping WAL encryption, which is also mentioned in a TODO comment in the code. It will be addressed in a later PR, as this fix already took too long.
1 parent a65c272 commit 0049350

File tree

7 files changed

+1400
-16
lines changed

7 files changed

+1400
-16
lines changed

contrib/pg_tde/meson.build

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,8 @@ tap_tests = [
128128
't/wal_archiving.pl',
129129
't/wal_encrypt.pl',
130130
't/wal_key_tli.pl',
131+
't/059_tde_2pc_replication.pl',
132+
't/stream_rep.pl',
131133
]
132134

133135
tests += {

contrib/pg_tde/src/access/pg_tde_xlog_smgr.c

Lines changed: 96 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,8 @@ void
226226
TDEXLogSmgrInitWrite(bool encrypt_xlog)
227227
{
228228
WalEncryptionKey *key = pg_tde_read_last_wal_key();
229+
WalLocation start = {.tli = 1,.lsn = 0};
230+
WALKeyCacheRec *keys;
229231

230232
/*
231233
* Always generate a new key on starting PostgreSQL to protect against
@@ -246,6 +248,14 @@ TDEXLogSmgrInitWrite(bool encrypt_xlog)
246248
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
247249
}
248250

251+
keys = pg_tde_get_wal_cache_keys();
252+
253+
if (keys == NULL)
254+
{
255+
/* TODO cache is empty, try to preread keys from disk */
256+
keys = pg_tde_fetch_wal_keys(start);
257+
}
258+
249259
if (key)
250260
pfree(key);
251261
}
@@ -263,6 +273,32 @@ TDEXLogSmgrInitWriteReuseKey()
263273
}
264274
}
265275

276+
/*
277+
* Encrypt XLog page(s) from the buf with the cached WAL keys (via TDEXLogCryptBuffer) and write to the segment file.
278+
*/
279+
static ssize_t
280+
TDEXLogWriteEncryptedPagesOldKeys(int fd, const void *buf, size_t count, off_t offset,
281+
TimeLineID tli, XLogSegNo segno, int segSize)
282+
{
283+
char *enc_buff = EncryptionBuf;
284+
285+
#ifndef FRONTEND
286+
Assert(count <= TDEXLogEncryptBuffSize());
287+
#endif
288+
289+
/* Copy the data as-is, as we might have unencrypted parts */
290+
memcpy(enc_buff, buf, count);
291+
292+
/*
293+
* This call may allocate, but only very early in execution.
294+
* It should not allocate during a write, where we are in a critical section.
295+
*/
296+
TDEXLogCryptBuffer(buf, enc_buff, count, offset, tli, segno, segSize);
297+
298+
return pg_pwrite(fd, enc_buff, count, offset);
299+
}
300+
301+
266302
/*
267303
* Encrypt XLog page(s) from the buf and write to the segment file.
268304
*/
@@ -284,6 +320,7 @@ TDEXLogWriteEncryptedPages(int fd, const void *buf, size_t count, off_t offset,
284320
#endif
285321

286322
CalcXLogPageIVPrefix(tli, segno, key->base_iv, iv_prefix);
323+
287324
pg_tde_stream_crypt(iv_prefix,
288325
offset,
289326
(char *) buf,
@@ -299,26 +336,59 @@ static ssize_t
299336
tdeheap_xlog_seg_write(int fd, const void *buf, size_t count, off_t offset,
300337
TimeLineID tli, XLogSegNo segno, int segSize)
301338
{
339+
bool lastKeyUsable;
340+
bool afterLastKey;
341+
#ifdef FRONTEND
342+
bool crashRecovery = false;
343+
#else
344+
bool crashRecovery = GetRecoveryState() == RECOVERY_STATE_CRASH;
345+
#endif
346+
347+
WalLocation loc = {.tli = tli};
348+
349+
XLogSegNoOffsetToRecPtr(segno, offset, segSize, loc.lsn);
350+
302351
/*
303352
* Set the last (most recent) key's start LSN if not set.
304353
*
305354
* This function is called with WALWriteLock held, so no extra synchronization is needed.
306355
*/
307-
if (EncryptionKey.type != WAL_KEY_TYPE_INVALID && TDEXLogGetEncKeyLsn() == 0)
308-
{
309-
WalLocation loc = {.tli = tli};
310356

311-
XLogSegNoOffsetToRecPtr(segno, offset, segSize, loc.lsn);
357+
lastKeyUsable = (TDEXLogGetEncKeyLsn() != 0);
358+
afterLastKey = (TDEXLogGetEncKeyLsn() <= loc.lsn);
312359

313-
pg_tde_wal_last_key_set_location(loc);
314-
EncryptionKey.wal_start = loc;
315-
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
360+
if (EncryptionKey.type != WAL_KEY_TYPE_INVALID && !lastKeyUsable)
361+
{
362+
WALKeyCacheRec *last_key = pg_tde_get_last_wal_key();
363+
364+
if (!crashRecovery || EncryptionKey.type == WAL_KEY_TYPE_UNENCRYPTED)
365+
{
366+
/*
367+
* TODO: the unencrypted case is still not perfect, we need to
368+
* report an error in some corner cases
369+
*/
370+
if (last_key == NULL || last_key->start.lsn < loc.lsn)
371+
{
372+
pg_tde_wal_last_key_set_location(loc);
373+
EncryptionKey.wal_start = loc;
374+
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
375+
lastKeyUsable = true;
376+
}
377+
}
316378
}
317379

318-
if (EncryptionKey.type == WAL_KEY_TYPE_ENCRYPTED)
380+
if ((!afterLastKey || !lastKeyUsable) && EncryptionKey.type == WAL_KEY_TYPE_ENCRYPTED)
381+
{
382+
return TDEXLogWriteEncryptedPagesOldKeys(fd, buf, count, offset, tli, segno, segSize);
383+
}
384+
else if (EncryptionKey.type == WAL_KEY_TYPE_ENCRYPTED)
385+
{
319386
return TDEXLogWriteEncryptedPages(fd, buf, count, offset, tli, segno);
387+
}
320388
else
389+
{
321390
return pg_pwrite(fd, buf, count, offset);
391+
}
322392
}
323393

324394
/*
@@ -340,7 +410,7 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
340410
if (readsz <= 0)
341411
return readsz;
342412

343-
TDEXLogCryptBuffer(buf, count, offset, tli, segno, segSize);
413+
TDEXLogCryptBuffer(buf, buf, count, offset, tli, segno, segSize);
344414

345415
return readsz;
346416
}
@@ -349,20 +419,22 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
349419
* [De]Crypt buffer if needed based on provided segment offset, number and TLI
350420
*/
351421
void
352-
TDEXLogCryptBuffer(void *buf, size_t count, off_t offset,
422+
TDEXLogCryptBuffer(const void *buf, void *out_buf, size_t count, off_t offset,
353423
TimeLineID tli, XLogSegNo segno, int segSize)
354424
{
355425
WALKeyCacheRec *keys = pg_tde_get_wal_cache_keys();
356426
XLogRecPtr write_key_lsn;
357427
WalLocation data_end = {.tli = tli};
358428
WalLocation data_start = {.tli = tli};
359429

360-
if (!keys)
430+
if (keys == NULL)
361431
{
362432
WalLocation start = {.tli = 1,.lsn = 0};
363433

364434
/* cache is empty, try to read keys from disk */
365-
keys = pg_tde_fetch_wal_keys(start);
435+
pg_tde_fetch_wal_keys(start);
436+
437+
keys = pg_tde_get_wal_cache_keys();
366438
}
367439

368440
/*
@@ -421,6 +493,7 @@ TDEXLogCryptBuffer(void *buf, size_t count, off_t offset,
421493
off_t dec_end = XLogSegmentOffset(minlsn, segSize);
422494
size_t dec_sz;
423495
char *dec_buf = (char *) buf + (dec_off - offset);
496+
char *o_buf = (char *) out_buf + (dec_off - offset);
424497

425498
Assert(dec_off >= offset);
426499

@@ -432,17 +505,26 @@ TDEXLogCryptBuffer(void *buf, size_t count, off_t offset,
432505
dec_end = offset + count;
433506
}
434507

435-
dec_sz = dec_end - dec_off;
508+
if (dec_end > dec_off)
509+
{
510+
dec_sz = dec_end - dec_off;
511+
}
512+
else
513+
{
514+
/* Empty or inverted range, so there is nothing to process. TODO: should this be an assert? */
515+
dec_sz = 0;
516+
}
436517

437518
#ifdef TDE_XLOG_DEBUG
438519
elog(DEBUG1, "decrypt WAL, dec_off: %lu [buff_off %lu], sz: %lu | key %u_%X/%X",
439520
dec_off, dec_off - offset, dec_sz, curr_key->key.wal_start.tli, LSN_FORMAT_ARGS(curr_key->key.wal_start.lsn));
440521
#endif
522+
441523
pg_tde_stream_crypt(iv_prefix,
442524
dec_off,
443525
dec_buf,
444526
dec_sz,
445-
dec_buf,
527+
o_buf,
446528
curr_key->key.key,
447529
&curr_key->crypt_ctx);
448530
}

contrib/pg_tde/src/include/access/pg_tde_xlog_smgr.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ extern void TDEXLogSmgrInit(void);
1313
extern void TDEXLogSmgrInitWrite(bool encrypt_xlog);
1414
extern void TDEXLogSmgrInitWriteReuseKey(void);
1515

16-
extern void TDEXLogCryptBuffer(void *buf, size_t count, off_t offset,
16+
extern void TDEXLogCryptBuffer(const void *buf, void *out_buf, size_t count, off_t offset,
1717
TimeLineID tli, XLogSegNo segno, int segSize);
1818

1919
#endif /* PG_TDE_XLOGSMGR_H */

contrib/pg_tde/t/.pgtde.pm.swp

20 KB
Binary file not shown.

0 commit comments

Comments
 (0)