Skip to content

Commit

Permalink
Fix level 0 GC dataloss bug (dgraph-io#1090)
Browse files Browse the repository at this point in the history
When all level 0 tables are kept in memory and GC is triggered,
it is possible that GC removes data from the value log file which
has not yet been persisted to disk. This PR fixes this issue by garbage
collecting only the data which has been written to SSTs on the disk.

When L0 is in memory, we GC only the data which is stored in Level 1 onwards.
When L0 is not in memory, we GC data from Level 0 onwards.
  • Loading branch information
Ibrahim Jarif authored Oct 23, 2019
1 parent 1afe2d0 commit 12dea02
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 5 deletions.
12 changes: 10 additions & 2 deletions db.go
Original file line number Diff line number Diff line change
Expand Up @@ -596,7 +596,7 @@ func (db *DB) get(key []byte) (y.ValueStruct, error) {
*maxVs = vs
}
}
return db.lc.get(key, maxVs)
return db.lc.get(key, maxVs, 0)
}

func (db *DB) updateHead(ptrs []valuePointer) {
Expand Down Expand Up @@ -1107,10 +1107,18 @@ func (db *DB) RunValueLogGC(discardRatio float64) error {
return ErrInvalidRequest
}

// startLevel is the level from which we should search for the head key. When badger is running
// with KeepL0InMemory flag, all tables on L0 are kept in memory. This means we should pick head
key from Level 1 onwards because if we pick the head key from Level 0 we might end up losing
// data. See test TestL0GCBug.
startLevel := 0
if db.opt.KeepL0InMemory {
startLevel = 1
}
// Find head on disk
headKey := y.KeyWithTs(head, math.MaxUint64)
// Need to pass with timestamp, lsm get removes the last 8 bytes and compares key
val, err := db.lc.get(headKey, nil)
val, err := db.lc.get(headKey, nil, startLevel)
if err != nil {
return errors.Wrap(err, "Retrieving head from on-disk LSM")
}
Expand Down
72 changes: 72 additions & 0 deletions db2_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -587,3 +587,75 @@ func TestReadSameVlog(t *testing.T) {
})
})
}

// The test ensures we don't lose data when badger is opened with KeepL0InMemory and GC is being
// done.
// TestL0GCBug ensures we don't lose data when badger is opened with
// KeepL0InMemory and value log GC runs: with L0 held in memory, GC must only
// discard vlog entries that have been persisted to SSTs (Level 1 onwards).
func TestL0GCBug(t *testing.T) {
	dir, err := ioutil.TempDir("", "badger-test")
	require.NoError(t, err)
	defer removeDir(dir)

	// Do not change any of the options below unless it's necessary.
	opts := getTestOptions(dir)
	opts.MaxTableSize = 1920
	opts.NumMemtables = 1
	opts.ValueLogMaxEntries = 2
	opts.ValueThreshold = 2
	opts.KeepL0InMemory = true
	// Setting LoadingMode to mmap seems to cause segmentation fault while closing DB.
	opts.ValueLogLoadingMode = options.FileIO
	opts.TableLoadingMode = options.FileIO

	db1, err := Open(opts)
	require.NoError(t, err)
	// key yields a fixed-width key so every entry has the same size.
	key := func(i int) []byte {
		return []byte(fmt.Sprintf("%10d", i))
	}
	val := []byte{1, 1, 1, 1, 1, 1, 1, 1}
	// Insert 100 entries. This will create about 50 vlog files and 2 SST files.
	for i := 0; i < 100; i++ {
		err = db1.Update(func(txn *Txn) error {
			return txn.SetEntry(NewEntry(key(i), val))
		})
		require.NoError(t, err)
	}
	// Run value log GC multiple times. This would ensure at least
	// one value log file is garbage collected.
	for i := 0; i < 10; i++ {
		err := db1.RunValueLogGC(0.01)
		// ErrNoRewrite just means this GC pass found nothing worth collecting.
		if err != nil && err != ErrNoRewrite {
			// Use t.Fatal(err), not t.Fatalf(err.Error()): passing a dynamic
			// string as a format string trips `go vet` and garbles any '%'
			// that appears in the error text.
			t.Fatal(err)
		}
	}

	// checkKeys reads back all previously stored keys and verifies their values.
	checkKeys := func(db *DB) {
		for i := 0; i < 100; i++ {
			err := db.View(func(txn *Txn) error {
				item, err := txn.Get(key(i))
				require.NoError(t, err)
				val1 := getItemValue(t, item)
				require.Equal(t, val, val1)
				return nil
			})
			require.NoError(t, err)
		}
	}

	checkKeys(db1)
	// Simulate a crash by not closing db1 but releasing the locks.
	if db1.dirLockGuard != nil {
		require.NoError(t, db1.dirLockGuard.release())
	}
	if db1.valueDirGuard != nil {
		require.NoError(t, db1.valueDirGuard.release())
	}
	require.NoError(t, db1.vlog.Close())

	db2, err := Open(opts)
	require.NoError(t, err)

	// Ensure we still have all the keys.
	checkKeys(db2)
	require.NoError(t, db2.Close())
}
11 changes: 8 additions & 3 deletions levels.go
Original file line number Diff line number Diff line change
Expand Up @@ -939,14 +939,19 @@ func (s *levelsController) close() error {
}

// get returns the found value if any. If not found, we return nil.
func (s *levelsController) get(key []byte, maxVs *y.ValueStruct) (y.ValueStruct, error) {
// It's important that we iterate the levels from 0 on upward. The reason is, if we iterated
func (s *levelsController) get(key []byte, maxVs *y.ValueStruct, startLevel int) (
y.ValueStruct, error) {
// It's important that we iterate the levels from 0 on upward. The reason is, if we iterated
// in opposite order, or in parallel (naively calling all the h.RLock() in some order) we could
// read level L's tables post-compaction and level L+1's tables pre-compaction. (If we do
// read level L's tables post-compaction and level L+1's tables pre-compaction. (If we do
// parallelize this, we will need to call the h.RLock() function by increasing order of level
// number.)
version := y.ParseTs(key)
for _, h := range s.levels {
// Ignore all levels below startLevel. This is useful for GC when L0 is kept in memory.
if h.level < startLevel {
continue
}
vs, err := h.get(key) // Calls h.RLock() and h.RUnlock().
if err != nil {
return y.ValueStruct{}, errors.Wrapf(err, "get key: %q", key)
Expand Down

0 comments on commit 12dea02

Please sign in to comment.