Support read rate-limiting in SequentialFileReader (facebook#9973)
Summary:
Added a rate limiter and read rate-limiting support to SequentialFileReader. I've updated call sites of SequentialFileReader::Read with an appropriate IO priority (or left a TODO and specified IO_TOTAL for now).

The PR is separated into four commits: the first added the rate-limiting support, along with some fixes in the unit tests, since the number of bytes requested from the rate limiter by SequentialFileReader is not accurate (there is overcharge at EOF). The second commit fixed this by allowing SequentialFileReader to check the file size and determine how many bytes are left in the file to read. The third commit added benchmark-related code. The fourth commit moved the logic of using the file size to avoid overcharging the rate limiter into the backup engine (the main user of SequentialFileReader).
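For illustration, a minimal sketch of the overcharge-avoidance idea, not the literal backup engine code (`reader`, `file_size`, `bytes_read`, and the `Env::IO_LOW` priority are placeholder assumptions):

```
// Sketch: avoid overcharging the rate limiter by capping each request at the
// bytes remaining before EOF. `reader` is a SequentialFileReader created with
// a non-null rate limiter; `file_size` was obtained up front, e.g. via
// FileSystem::GetFileSize.
char buf[64 * 1024];
uint64_t bytes_read = 0;
while (bytes_read < file_size) {
  size_t n = static_cast<size_t>(
      std::min<uint64_t>(sizeof(buf), file_size - bytes_read));
  Slice result;
  IOStatus io_s = reader->Read(n, &result, buf,
                               Env::IO_LOW /* rate_limiter_priority */);
  if (!io_s.ok() || result.empty()) break;
  bytes_read += result.size();
}
```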

Pull Request resolved: facebook#9973

Test Plan:
- `make check`; backup_engine_test covers usage of SequentialFileReader with a rate limiter.
- Ran db_bench to check that rate limiting throttles as expected: verified that reads and writes together are throttled at 2MB/s, in 0.2MB chunks spaced 100ms apart (see the note after the benchmark commands).
  - Set up: `./db_bench --benchmarks=fillrandom -db=/dev/shm/test_rocksdb`
  - Benchmark:
```
strace -ttfe read,write ./db_bench --benchmarks=backup -db=/dev/shm/test_rocksdb --backup_rate_limit=2097152 --use_existing_db
strace -ttfe read,write ./db_bench --benchmarks=restore -db=/dev/shm/test_rocksdb --restore_rate_limit=2097152 --use_existing_db
```
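  - Note: the observed cadence matches the expected behavior, assuming the rate limiter's default 100ms refill period is in effect: 2097152 B/s × 0.1 s ≈ 0.2MB per refill.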
- Ran db_bench on backup and restore to ensure no performance regression.
  - backup (avg over 50 runs): pre-change: 1.90443e+06 micros/op; post-change: 1.8993e+06 micros/op (improved by ~0.2%)
  - restore (avg over 50 runs): pre-change: 1.79105e+06 micros/op; post-change: 1.78192e+06 micros/op (improved by ~0.5%)

```
# Set up
./db_bench --benchmarks=fillrandom -db=/tmp/test_rocksdb -num=10000000

# Benchmark
TEST_TMPDIR=/tmp/test_rocksdb
NUM_RUN=50
for ((j=0;j<$NUM_RUN;j++))
do
   ./db_bench -db=$TEST_TMPDIR -num=10000000 -benchmarks=backup -use_existing_db | egrep 'backup'
  # Restore
  #./db_bench -db=$TEST_TMPDIR -num=10000000 -benchmarks=restore -use_existing_db
done > rate_limit.txt && awk -v NUM_RUN=$NUM_RUN '{sum+=$3;sum_sqrt+=$3^2}END{print sum/NUM_RUN, sqrt(sum_sqrt/NUM_RUN-(sum/NUM_RUN)^2)}' rate_limit.txt >> rate_limit_2.txt
```

Reviewed By: hx235

Differential Revision: D36327418

Pulled By: cbi42

fbshipit-source-id: e75d4307cff815945482df5ba630c1e88d064691
cbi42 authored and facebook-github-bot committed May 24, 2022
1 parent fd24e44 commit 8515bd5
Showing 17 changed files with 265 additions and 125 deletions.
26 changes: 22 additions & 4 deletions db/log_reader.cc
@@ -303,8 +303,14 @@ void Reader::UnmarkEOFInternal() {
}

Slice read_buffer;
Status status = file_->Read(remaining, &read_buffer,
backing_store_ + eof_offset_);
// TODO: rate limit log reader with appropriate priority.
// TODO: avoid overcharging rate limiter:
// Note that the Read here might overcharge SequentialFileReader's internal
// rate limiter if priority is not IO_TOTAL, e.g., when there is not enough
// content left until EOF to read.
Status status =
file_->Read(remaining, &read_buffer, backing_store_ + eof_offset_,
Env::IO_TOTAL /* rate_limiter_priority */);

size_t added = read_buffer.size();
end_of_buffer_offset_ += added;
@@ -349,7 +355,13 @@ bool Reader::ReadMore(size_t* drop_size, int *error) {
if (!eof_ && !read_error_) {
// Last read was a full read, so this is a trailer to skip
buffer_.clear();
Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
// TODO: rate limit log reader with appropriate priority.
// TODO: avoid overcharging rate limiter:
// Note that the Read here might overcharge SequentialFileReader's internal
// rate limiter if priority is not IO_TOTAL, e.g., when there is not enough
// content left until EOF to read.
Status status = file_->Read(kBlockSize, &buffer_, backing_store_,
Env::IO_TOTAL /* rate_limiter_priority */);
TEST_SYNC_POINT_CALLBACK("LogReader::ReadMore:AfterReadFile", &status);
end_of_buffer_offset_ += buffer_.size();
if (!status.ok()) {
@@ -639,7 +651,13 @@ bool FragmentBufferedReader::TryReadMore(size_t* drop_size, int* error) {
if (!eof_ && !read_error_) {
// Last read was a full read, so this is a trailer to skip
buffer_.clear();
Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
// TODO: rate limit log reader with appropriate priority.
// TODO: avoid overcharging rate limiter:
// Note that the Read here might overcharge SequentialFileReader's internal
// rate limiter if priority is not IO_TOTAL, e.g., when there is not enough
// content left until EOF to read.
Status status = file_->Read(kBlockSize, &buffer_, backing_store_,
Env::IO_TOTAL /* rate_limiter_priority */);
end_of_buffer_offset_ += buffer_.size();
if (!status.ok()) {
buffer_.clear();
2 changes: 1 addition & 1 deletion db/repair.cc
@@ -358,7 +358,7 @@ class Repairer {
std::unique_ptr<SequentialFileReader> lfile_reader;
Status status = SequentialFileReader::Create(
fs, logname, fs->OptimizeForLogRead(file_options_), &lfile_reader,
nullptr);
nullptr /* dbg */, nullptr /* rate limiter */);
if (!status.ok()) {
return status;
}
5 changes: 3 additions & 2 deletions db/transaction_log_impl.cc
@@ -67,8 +67,9 @@ Status TransactionLogIteratorImpl::OpenLogFile(
}
}
if (s.ok()) {
file_reader->reset(new SequentialFileReader(
std::move(file), fname, io_tracer_, options_->listeners));
file_reader->reset(new SequentialFileReader(std::move(file), fname,
io_tracer_, options_->listeners,
options_->rate_limiter.get()));
}
return s;
}
3 changes: 1 addition & 2 deletions env/io_posix.h
@@ -187,8 +187,7 @@ class PosixSequentialFile : public FSSequentialFile {

public:
PosixSequentialFile(const std::string& fname, FILE* file, int fd,
size_t logical_block_size,
const EnvOptions& options);
size_t logical_block_size, const EnvOptions& options);
virtual ~PosixSequentialFile();

virtual IOStatus Read(size_t n, const IOOptions& opts, Slice* result,
5 changes: 4 additions & 1 deletion file/file_util.cc
@@ -49,7 +49,10 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source,
Slice slice;
while (size > 0) {
size_t bytes_to_read = std::min(sizeof(buffer), static_cast<size_t>(size));
io_s = status_to_io_status(src_reader->Read(bytes_to_read, &slice, buffer));
// TODO: rate limit copy file
io_s = status_to_io_status(
src_reader->Read(bytes_to_read, &slice, buffer,
Env::IO_TOTAL /* rate_limiter_priority */));
if (!io_s.ok()) {
return io_s;
}
13 changes: 9 additions & 4 deletions file/line_file_reader.cc
@@ -15,16 +15,20 @@ IOStatus LineFileReader::Create(const std::shared_ptr<FileSystem>& fs,
const std::string& fname,
const FileOptions& file_opts,
std::unique_ptr<LineFileReader>* reader,
IODebugContext* dbg) {
IODebugContext* dbg,
RateLimiter* rate_limiter) {
std::unique_ptr<FSSequentialFile> file;
IOStatus io_s = fs->NewSequentialFile(fname, file_opts, &file, dbg);
if (io_s.ok()) {
reader->reset(new LineFileReader(std::move(file), fname));
reader->reset(new LineFileReader(
std::move(file), fname, nullptr,
std::vector<std::shared_ptr<EventListener>>{}, rate_limiter));
}
return io_s;
}

bool LineFileReader::ReadLine(std::string* out) {
bool LineFileReader::ReadLine(std::string* out,
Env::IOPriority rate_limiter_priority) {
assert(out);
if (!io_status_.ok()) {
// Status should be checked (or permit unchecked) any time we return false.
@@ -50,7 +54,8 @@ bool LineFileReader::ReadLine(std::string* out) {
// else flush and reload buffer
out->append(buf_begin_, buf_end_ - buf_begin_);
Slice result;
io_status_ = sfr_.Read(buf_.size(), &result, buf_.data());
io_status_ =
sfr_.Read(buf_.size(), &result, buf_.data(), rate_limiter_priority);
IOSTATS_ADD(bytes_read, result.size());
if (!io_status_.ok()) {
io_status_.MustCheck();
5 changes: 3 additions & 2 deletions file/line_file_reader.h
@@ -32,7 +32,7 @@ class LineFileReader {
static IOStatus Create(const std::shared_ptr<FileSystem>& fs,
const std::string& fname, const FileOptions& file_opts,
std::unique_ptr<LineFileReader>* reader,
IODebugContext* dbg);
IODebugContext* dbg, RateLimiter* rate_limiter);

LineFileReader(const LineFileReader&) = delete;
LineFileReader& operator=(const LineFileReader&) = delete;
@@ -41,7 +41,8 @@
// the line to `out`, without delimiter, or returning false on failure. You
// must check GetStatus() to determine whether the failure was just
// end-of-file (OK status) or an I/O error (another status).
bool ReadLine(std::string* out);
// The internal rate limiter will be charged at the specified priority.
bool ReadLine(std::string* out, Env::IOPriority rate_limiter_priority);

// Returns the number of the line most recently returned from ReadLine.
// Return value is unspecified if ReadLine has returned false due to
2 changes: 1 addition & 1 deletion file/random_access_file_reader.cc
@@ -283,7 +283,7 @@ IOStatus RandomAccessFileReader::MultiRead(
#endif // !NDEBUG

// To be paranoid modify scratch a little bit, so in case underlying
// FileSystem doesn't fill the buffer but return succee and `scratch` returns
// FileSystem doesn't fill the buffer but return success and `scratch` returns
// contains a previous block, returned value will not pass checksum.
// contains a previous block, returned value will not pass checksum.
// This byte might not change anything for direct I/O case, but it's OK.
for (size_t i = 0; i < num_reqs; i++) {
100 changes: 68 additions & 32 deletions file/sequence_file_reader.cc
@@ -25,16 +25,18 @@ namespace ROCKSDB_NAMESPACE {
IOStatus SequentialFileReader::Create(
const std::shared_ptr<FileSystem>& fs, const std::string& fname,
const FileOptions& file_opts, std::unique_ptr<SequentialFileReader>* reader,
IODebugContext* dbg) {
IODebugContext* dbg, RateLimiter* rate_limiter) {
std::unique_ptr<FSSequentialFile> file;
IOStatus io_s = fs->NewSequentialFile(fname, file_opts, &file, dbg);
if (io_s.ok()) {
reader->reset(new SequentialFileReader(std::move(file), fname));
reader->reset(new SequentialFileReader(std::move(file), fname, nullptr, {},
rate_limiter));
}
return io_s;
}

IOStatus SequentialFileReader::Read(size_t n, Slice* result, char* scratch) {
IOStatus SequentialFileReader::Read(size_t n, Slice* result, char* scratch,
Env::IOPriority rate_limiter_priority) {
IOStatus io_s;
if (use_direct_io()) {
#ifndef ROCKSDB_LITE
@@ -55,53 +57,87 @@ IOStatus SequentialFileReader::Read(size_t n, Slice* result, char* scratch) {
buf.Alignment(alignment);
buf.AllocateNewBuffer(size);

Slice tmp;
uint64_t orig_offset = 0;
FileOperationInfo::StartTimePoint start_ts;
if (ShouldNotifyListeners()) {
orig_offset = aligned_offset + buf.CurrentSize();
start_ts = FileOperationInfo::StartNow();
while (buf.CurrentSize() < size) {
size_t allowed;
if (rate_limiter_priority != Env::IO_TOTAL && rate_limiter_ != nullptr) {
allowed = rate_limiter_->RequestToken(
buf.Capacity() - buf.CurrentSize(), buf.Alignment(),
rate_limiter_priority, nullptr /* stats */,
RateLimiter::OpType::kRead);
} else {
assert(buf.CurrentSize() == 0);
allowed = size;
}

Slice tmp;
uint64_t orig_offset = 0;
FileOperationInfo::StartTimePoint start_ts;
if (ShouldNotifyListeners()) {
orig_offset = aligned_offset + buf.CurrentSize();
start_ts = FileOperationInfo::StartNow();
}
io_s = file_->PositionedRead(aligned_offset + buf.CurrentSize(), allowed,
IOOptions(), &tmp, buf.Destination(),
nullptr /* dbg */);
if (ShouldNotifyListeners()) {
auto finish_ts = FileOperationInfo::FinishNow();
NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts,
io_s);
}
buf.Size(buf.CurrentSize() + tmp.size());
if (!io_s.ok() || tmp.size() < allowed) {
break;
}
}
io_s = file_->PositionedRead(aligned_offset, size, IOOptions(), &tmp,
buf.BufferStart(), nullptr);
if (io_s.ok() && offset_advance < tmp.size()) {
buf.Size(tmp.size());

if (io_s.ok() && offset_advance < buf.CurrentSize()) {
r = buf.Read(scratch, offset_advance,
std::min(tmp.size() - offset_advance, n));
std::min(buf.CurrentSize() - offset_advance, n));
}
*result = Slice(scratch, r);
if (ShouldNotifyListeners()) {
auto finish_ts = FileOperationInfo::FinishNow();
NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts,
io_s);
}
#endif // !ROCKSDB_LITE
} else {
// To be paranoid, modify scratch a little bit, so in case underlying
// FileSystem doesn't fill the buffer but return succee and `scratch`
// FileSystem doesn't fill the buffer but return success and `scratch`
// returns contains a previous block, returned value will not pass
// checksum.
// It's hard to find useful byte for direct I/O case, so we skip it.
if (n > 0 && scratch != nullptr) {
scratch[0]++;
}

size_t read = 0;
while (read < n) {
size_t allowed;
if (rate_limiter_priority != Env::IO_TOTAL && rate_limiter_ != nullptr) {
allowed = rate_limiter_->RequestToken(
n - read, 0 /* alignment */, rate_limiter_priority,
nullptr /* stats */, RateLimiter::OpType::kRead);
} else {
allowed = n;
}
#ifndef ROCKSDB_LITE
FileOperationInfo::StartTimePoint start_ts;
if (ShouldNotifyListeners()) {
start_ts = FileOperationInfo::StartNow();
}
FileOperationInfo::StartTimePoint start_ts;
if (ShouldNotifyListeners()) {
start_ts = FileOperationInfo::StartNow();
}
#endif

io_s = file_->Read(n, IOOptions(), result, scratch, nullptr);

Slice tmp;
io_s = file_->Read(allowed, IOOptions(), &tmp, scratch + read,
nullptr /* dbg */);
#ifndef ROCKSDB_LITE
if (ShouldNotifyListeners()) {
auto finish_ts = FileOperationInfo::FinishNow();
size_t offset = offset_.fetch_add(result->size());
NotifyOnFileReadFinish(offset, result->size(), start_ts, finish_ts, io_s);
}
if (ShouldNotifyListeners()) {
auto finish_ts = FileOperationInfo::FinishNow();
size_t offset = offset_.fetch_add(tmp.size());
NotifyOnFileReadFinish(offset, tmp.size(), start_ts, finish_ts, io_s);
}
#endif
read += tmp.size();
if (!io_s.ok() || tmp.size() < allowed) {
break;
}
}
*result = Slice(scratch, read);
}
IOSTATS_ADD(bytes_read, result->size());
return io_s;
26 changes: 20 additions & 6 deletions file/sequence_file_reader.h
@@ -57,15 +57,19 @@ class SequentialFileReader {
FSSequentialFilePtr file_;
std::atomic<size_t> offset_{0}; // read offset
std::vector<std::shared_ptr<EventListener>> listeners_{};
RateLimiter* rate_limiter_;

public:
explicit SequentialFileReader(
std::unique_ptr<FSSequentialFile>&& _file, const std::string& _file_name,
const std::shared_ptr<IOTracer>& io_tracer = nullptr,
const std::vector<std::shared_ptr<EventListener>>& listeners = {})
const std::vector<std::shared_ptr<EventListener>>& listeners = {},
RateLimiter* rate_limiter =
nullptr) // TODO: migrate call sites to provide rate limiter
: file_name_(_file_name),
file_(std::move(_file), io_tracer, _file_name),
listeners_() {
listeners_(),
rate_limiter_(rate_limiter) {
#ifndef ROCKSDB_LITE
AddFileIOListeners(listeners);
#else
@@ -77,11 +81,14 @@
std::unique_ptr<FSSequentialFile>&& _file, const std::string& _file_name,
size_t _readahead_size,
const std::shared_ptr<IOTracer>& io_tracer = nullptr,
const std::vector<std::shared_ptr<EventListener>>& listeners = {})
const std::vector<std::shared_ptr<EventListener>>& listeners = {},
RateLimiter* rate_limiter =
nullptr) // TODO: migrate call sites to provide rate limiter
: file_name_(_file_name),
file_(NewReadaheadSequentialFile(std::move(_file), _readahead_size),
io_tracer, _file_name),
listeners_() {
listeners_(),
rate_limiter_(rate_limiter) {
#ifndef ROCKSDB_LITE
AddFileIOListeners(listeners);
#else
@@ -91,12 +98,19 @@
static IOStatus Create(const std::shared_ptr<FileSystem>& fs,
const std::string& fname, const FileOptions& file_opts,
std::unique_ptr<SequentialFileReader>* reader,
IODebugContext* dbg);
IODebugContext* dbg, RateLimiter* rate_limiter);

SequentialFileReader(const SequentialFileReader&) = delete;
SequentialFileReader& operator=(const SequentialFileReader&) = delete;

IOStatus Read(size_t n, Slice* result, char* scratch);
// `rate_limiter_priority` is used to charge the internal rate limiter when
// enabled. The special value `Env::IO_TOTAL` makes this operation bypass the
// rate limiter. The amount charged to the internal rate limiter is n, even
// when less than n bytes are actually read (e.g. at end of file). To avoid
// overcharging the rate limiter, the caller can use file size to cap n to
// read until end of file.
IOStatus Read(size_t n, Slice* result, char* scratch,
Env::IOPriority rate_limiter_priority);

IOStatus Skip(uint64_t n);

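For orientation, a hedged usage sketch of the updated API based on the signatures above (`fs`, `fname`, `rate_limiter`, and the `Env::IO_LOW` priority are illustrative placeholders):

```
std::unique_ptr<SequentialFileReader> reader;
IOStatus io_s = SequentialFileReader::Create(
    fs, fname, FileOptions(), &reader, nullptr /* dbg */,
    rate_limiter /* nullptr disables rate limiting */);
if (io_s.ok()) {
  char buf[4096];
  Slice result;
  // Charged to the reader's internal rate limiter at IO_LOW; passing
  // Env::IO_TOTAL instead bypasses the rate limiter entirely.
  io_s = reader->Read(sizeof(buf), &result, buf,
                      Env::IO_LOW /* rate_limiter_priority */);
}
```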
2 changes: 1 addition & 1 deletion options/options_parser.cc
@@ -271,7 +271,7 @@ Status RocksDBOptionsParser::Parse(const ConfigOptions& config_options_in,
std::unordered_map<std::string, std::string> opt_map;
std::string line;
// we only support single-lined statement.
while (lf_reader.ReadLine(&line)) {
while (lf_reader.ReadLine(&line, Env::IO_TOTAL /* rate_limiter_priority */)) {
int line_num = static_cast<int>(lf_reader.GetLineNumber());
line = TrimAndRemoveComment(line);
if (line.empty()) {