Skip to content

Commit

Permalink
Fix bug of newer ingested data assigned with an older seqno (#12257)
Browse files Browse the repository at this point in the history
Summary:
**Context:**
We found an edge case where newly ingested data is assigned an older seqno than the existing data for the same key. This causes the older data of that key to be returned on reads.

Consider the following lsm shape:
![image](https://github.com/facebook/rocksdb/assets/83968999/973fd160-5065-49cd-8b7b-b6ab4badae23)
Then ingest a file to L5 containing new data of key_overlap. Because of [this](https://l.facebook.com/l.php?u=https%3A%2F%2Fgithub.com%2Ffacebook%2Frocksdb%2Fblob%2F5a26f392ca640818da0b8590be6119699e852b07%2Fdb%2Fexternal_sst_file_ingestion_job.cc%3Ffbclid%3DIwAR10clXxpUSrt6sYg12sUMeHfShS7XigFrsJHvZoUDroQpbj_Sb3dG_JZFc%23L951-L956&h=AT0m56P7O0ZML7jk1sdjgnZZyGPMXg9HkKvBEb8mE9ZM3fpJjPrArAMsaHWZQPt9Ki-Pn7lv7x-RT9NEd_202Y6D2juIVHOIt3EjCZptDKBLRBMG49F8iBUSM9ypiKe8XCfM-FNW2Hl4KbVq2e3nZRbMvUM), the file is assigned with seqno 2, older than the old data's seqno 4. After just another compaction, we will drop the new_v for key_overlap because of the seqno and cause older data to be returned.
![image](https://github.com/facebook/rocksdb/assets/83968999/a3ef95e4-e7ae-4c30-8d03-955cd4b5ed42)

**Summary:**
This PR removes the incorrect seqno assignment

Pull Request resolved: facebook/rocksdb#12257

Test Plan:
- New unit test failed before the fix but passes after
- `python3 tools/db_crashtest.py --compaction_style=1 --ingest_external_file_one_in=10 --preclude_last_level_data_seconds=36000 --compact_files_one_in=10 --enable_blob_files=0 blackbox`
- Rehearsal stress test

Reviewed By: cbi42

Differential Revision: D52926092

Pulled By: hx235

fbshipit-source-id: 9e4dade0f6cc44e548db8fca27ccbc81a621cd6f
(cherry picked from commit 1b2b16b38ef760252d61b123e7e39c26306cd1c7)
  • Loading branch information
hx235 authored and rkhachatryan committed Jul 22, 2024
1 parent 9a10201 commit 8fe04d8
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 20 deletions.
76 changes: 76 additions & 0 deletions db/external_sst_file_basic_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#include "db/version_edit.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/advanced_options.h"
#include "rocksdb/options.h"
#include "rocksdb/sst_file_writer.h"
#include "test_util/testharness.h"
#include "test_util/testutil.h"
Expand Down Expand Up @@ -1292,6 +1294,80 @@ TEST_F(ExternalSSTFileBasicTest, VerifyChecksumReadahead) {
Destroy(options);
}

// Regression test: a value ingested for a key that already exists in a deeper
// level must be the value returned by Get() after a full compaction.
//
// The test uses universal compaction with three levels and auto compactions
// disabled, so that the LSM shape is fully controlled through CompactFiles().
// It places "k1" in L1 and the old value of "k2overlap" in L2 (with a larger
// seqno than the L1 file), ingests a file holding a new value of "k2overlap",
// compacts everything, and checks that the new value wins.
TEST_F(ExternalSSTFileBasicTest, ReadOldValueOfIngestedKeyBug) {
  Options options = CurrentOptions();
  options.compaction_style = kCompactionStyleUniversal;
  options.disable_auto_compactions = true;
  options.num_levels = 3;
  options.preserve_internal_time_seconds = 36000;
  DestroyAndReopen(options);

  // To create the following LSM tree to trigger the bug:
  //   L0
  //   L1 with seqno [1, 2]
  //   L2 with seqno [3, 4]
  // (The actual seqnos are shifted by a reserved range; the assertions below
  // expect largest seqnos of 102 for L1 and 104 for L2.)

  // To create L1 shape: two flushes of "k1" produce two L0 files, which are
  // then compacted into a single L1 file.
  ASSERT_OK(
      db_->Put(WriteOptions(), db_->DefaultColumnFamily(), "k1", "seqno1"));
  ASSERT_OK(db_->Flush(FlushOptions()));
  ASSERT_OK(
      db_->Put(WriteOptions(), db_->DefaultColumnFamily(), "k1", "seqno2"));
  ASSERT_OK(db_->Flush(FlushOptions()));
  ColumnFamilyMetaData meta_1;
  db_->GetColumnFamilyMetaData(&meta_1);
  auto& files_1 = meta_1.levels[0].files;
  ASSERT_EQ(files_1.size(), 2);
  std::string file1 = files_1[0].db_path + files_1[0].name;
  std::string file2 = files_1[1].db_path + files_1[1].name;
  ASSERT_OK(db_->CompactFiles(CompactionOptions(), {file1, file2}, 1));
  // To confirm L1 shape
  ColumnFamilyMetaData meta_2;
  db_->GetColumnFamilyMetaData(&meta_2);
  ASSERT_EQ(meta_2.levels[0].files.size(), 0);
  ASSERT_EQ(meta_2.levels[1].files.size(), 1);
  // Seqno starts from non-zero due to seqno reservation for
  // preserve_internal_time_seconds greater than 0;
  ASSERT_EQ(meta_2.levels[1].files[0].largest_seqno, 102);
  ASSERT_EQ(meta_2.levels[2].files.size(), 0);

  // To create L2 shape: two flushes of "k2overlap" produce two L0 files,
  // which are then compacted into a single L2 file.
  ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), "k2overlap",
                     "old_value"));
  ASSERT_OK(db_->Flush(FlushOptions()));
  ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), "k2overlap",
                     "old_value"));
  ASSERT_OK(db_->Flush(FlushOptions()));
  ColumnFamilyMetaData meta_3;
  db_->GetColumnFamilyMetaData(&meta_3);
  auto& files_3 = meta_3.levels[0].files;
  // Guard the indexing below, mirroring the check done for files_1; without
  // it an unexpected file count would be undefined behavior instead of a
  // clean test failure.
  ASSERT_EQ(files_3.size(), 2);
  std::string file3 = files_3[0].db_path + files_3[0].name;
  std::string file4 = files_3[1].db_path + files_3[1].name;
  ASSERT_OK(db_->CompactFiles(CompactionOptions(), {file3, file4}, 2));
  // To confirm L2 shape
  ColumnFamilyMetaData meta_4;
  db_->GetColumnFamilyMetaData(&meta_4);
  ASSERT_EQ(meta_4.levels[0].files.size(), 0);
  ASSERT_EQ(meta_4.levels[1].files.size(), 1);
  ASSERT_EQ(meta_4.levels[2].files.size(), 1);
  ASSERT_EQ(meta_4.levels[2].files[0].largest_seqno, 104);

  // Ingest a file with new value of the key "k2overlap"
  SstFileWriter sst_file_writer(EnvOptions(), options);
  std::string f = sst_files_dir_ + "f.sst";
  ASSERT_OK(sst_file_writer.Open(f));
  ASSERT_OK(sst_file_writer.Put("k2overlap", "new_value"));
  ExternalSstFileInfo f_info;
  ASSERT_OK(sst_file_writer.Finish(&f_info));
  ASSERT_OK(db_->IngestExternalFile({f}, IngestExternalFileOptions()));

  // To verify new value of the key "k2overlap" is correctly returned.
  // The full-range compaction merges the ingested entry with the old one, so
  // whichever carries the larger seqno survives.
  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
  std::string value;
  ASSERT_OK(db_->Get(ReadOptions(), "k2overlap", &value));
  // Before the fix, the value would be "old_value" and assertion failed
  ASSERT_EQ(value, "new_value");
}

TEST_F(ExternalSSTFileBasicTest, IngestRangeDeletionTombstoneWithGlobalSeqno) {
for (int i = 5; i < 25; i++) {
ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(i),
Expand Down
20 changes: 0 additions & 20 deletions db/external_sst_file_ingestion_job.cc
Original file line number Diff line number Diff line change
Expand Up @@ -937,26 +937,6 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
overlap_with_db = true;
break;
}

if (compaction_style == kCompactionStyleUniversal && lvl != 0) {
const std::vector<FileMetaData*>& level_files =
vstorage->LevelFiles(lvl);
const SequenceNumber level_largest_seqno =
(*std::max_element(level_files.begin(), level_files.end(),
[](FileMetaData* f1, FileMetaData* f2) {
return f1->fd.largest_seqno <
f2->fd.largest_seqno;
}))
->fd.largest_seqno;
// should only assign seqno to current level's largest seqno when
// the file fits
if (level_largest_seqno != 0 &&
IngestedFileFitInLevel(file_to_ingest, lvl)) {
*assigned_seqno = level_largest_seqno;
} else {
continue;
}
}
} else if (compaction_style == kCompactionStyleUniversal) {
continue;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix a bug where older data of an ingested key could be returned on reads when universal compaction is used.

0 comments on commit 8fe04d8

Please sign in to comment.