vitastor/src/cluster_client_wb.cpp

499 lines
16 KiB
C++

// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#include <cassert>
#include "cluster_client_impl.h"
writeback_cache_t::~writeback_cache_t()
{
for (auto & bp: dirty_buffers)
{
if (!--(*bp.second.refcnt))
{
free(bp.second.refcnt); // refcnt is allocated with the buffer
}
}
dirty_buffers.clear();
}
dirty_buf_it_t writeback_cache_t::find_dirty(uint64_t inode, uint64_t offset)
{
auto dirty_it = dirty_buffers.lower_bound((object_id){
.inode = inode,
.stripe = offset,
});
while (dirty_it != dirty_buffers.begin())
{
dirty_it--;
if (dirty_it->first.inode != inode ||
(dirty_it->first.stripe + dirty_it->second.len) <= offset)
{
dirty_it++;
break;
}
}
return dirty_it;
}
bool writeback_cache_t::is_left_merged(dirty_buf_it_t dirty_it)
{
if (dirty_it != dirty_buffers.begin())
{
auto prev_it = dirty_it;
prev_it--;
if (prev_it->first.inode == dirty_it->first.inode &&
prev_it->first.stripe+prev_it->second.len == dirty_it->first.stripe &&
prev_it->second.state == CACHE_DIRTY)
{
return true;
}
}
return false;
}
bool writeback_cache_t::is_right_merged(dirty_buf_it_t dirty_it)
{
auto next_it = dirty_it;
next_it++;
if (next_it != dirty_buffers.end() &&
next_it->first.inode == dirty_it->first.inode &&
next_it->first.stripe == dirty_it->first.stripe+dirty_it->second.len &&
next_it->second.state == CACHE_DIRTY)
{
return true;
}
return false;
}
bool writeback_cache_t::is_merged(const dirty_buf_it_t & dirty_it)
{
return is_left_merged(dirty_it) || is_right_merged(dirty_it);
}
void writeback_cache_t::copy_write(cluster_op_t *op, int state)
{
// Save operation for replay when one of PGs goes out of sync
// (primary OSD drops our connection in this case)
// ...or just save it for writeback if write buffering is enabled
if (op->len == 0)
{
return;
}
auto dirty_it = find_dirty(op->inode, op->offset);
auto new_end = op->offset + op->len;
while (dirty_it != dirty_buffers.end() &&
dirty_it->first.inode == op->inode &&
dirty_it->first.stripe < op->offset+op->len)
{
assert(dirty_it->first.stripe + dirty_it->second.len > op->offset);
// Remove overlapping part(s) of buffers
auto old_end = dirty_it->first.stripe + dirty_it->second.len;
if (dirty_it->first.stripe < op->offset)
{
if (old_end > new_end)
{
// Split into end and start
dirty_it->second.len = op->offset - dirty_it->first.stripe;
dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
.inode = op->inode,
.stripe = new_end,
}, (cluster_buffer_t){
.buf = dirty_it->second.buf + new_end - dirty_it->first.stripe,
.len = old_end - new_end,
.state = dirty_it->second.state,
.flush_id = dirty_it->second.flush_id,
.refcnt = dirty_it->second.refcnt,
});
(*dirty_it->second.refcnt)++;
if (dirty_it->second.state == CACHE_DIRTY)
{
writeback_bytes -= op->len;
writeback_queue_size++;
}
break;
}
else
{
// Only leave the beginning
if (dirty_it->second.state == CACHE_DIRTY)
{
writeback_bytes -= old_end - op->offset;
if (is_left_merged(dirty_it) && !is_right_merged(dirty_it))
{
writeback_queue_size++;
}
}
dirty_it->second.len = op->offset - dirty_it->first.stripe;
dirty_it++;
}
}
else if (old_end > new_end)
{
// Only leave the end
if (dirty_it->second.state == CACHE_DIRTY)
{
writeback_bytes -= new_end - dirty_it->first.stripe;
if (!is_left_merged(dirty_it) && is_right_merged(dirty_it))
{
writeback_queue_size++;
}
}
auto new_dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
.inode = op->inode,
.stripe = new_end,
}, (cluster_buffer_t){
.buf = dirty_it->second.buf + new_end - dirty_it->first.stripe,
.len = old_end - new_end,
.state = dirty_it->second.state,
.flush_id = dirty_it->second.flush_id,
.refcnt = dirty_it->second.refcnt,
});
dirty_buffers.erase(dirty_it);
dirty_it = new_dirty_it;
break;
}
else
{
// Remove the whole buffer
if (dirty_it->second.state == CACHE_DIRTY && !is_merged(dirty_it))
{
writeback_bytes -= dirty_it->second.len;
assert(writeback_queue_size > 0);
writeback_queue_size--;
}
if (!--(*dirty_it->second.refcnt))
{
free(dirty_it->second.refcnt);
}
dirty_buffers.erase(dirty_it++);
}
}
// Overlapping buffers are removed, just insert the new one
uint64_t *refcnt = (uint64_t*)malloc_or_die(sizeof(uint64_t) + op->len);
uint8_t *buf = (uint8_t*)refcnt + sizeof(uint64_t);
*refcnt = 1;
dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
.inode = op->inode,
.stripe = op->offset,
}, (cluster_buffer_t){
.buf = buf,
.len = op->len,
.state = state,
.refcnt = refcnt,
});
if (state == CACHE_DIRTY)
{
writeback_bytes += op->len;
// Track consecutive write-back operations
if (!is_merged(dirty_it))
{
// <writeback_queue> is OK to contain more than actual number of consecutive
// requests as long as it doesn't miss anything. But <writeback_queue_size>
// is always calculated correctly.
writeback_queue_size++;
writeback_queue.push_back((object_id){
.inode = op->inode,
.stripe = op->offset,
});
}
}
uint64_t pos = 0, len = op->len, iov_idx = 0;
while (len > 0 && iov_idx < op->iov.count)
{
auto & iov = op->iov.buf[iov_idx];
memcpy(buf + pos, iov.iov_base, iov.iov_len);
pos += iov.iov_len;
iov_idx++;
}
}
int writeback_cache_t::repeat_ops_for(cluster_client_t *cli, osd_num_t peer_osd)
{
int repeated = 0;
if (dirty_buffers.size())
{
// peer_osd just dropped connection
// determine WHICH dirty_buffers are now obsolete and repeat them
for (auto wr_it = dirty_buffers.begin(), flush_it = wr_it, last_it = wr_it; ; )
{
bool end = wr_it == dirty_buffers.end();
bool flush_this = !end && wr_it->second.state != CACHE_REPEATING &&
cli->affects_osd(wr_it->first.inode, wr_it->first.stripe, wr_it->second.len, peer_osd);
if (flush_it != wr_it && (end || !flush_this ||
wr_it->first.inode != flush_it->first.inode ||
wr_it->first.stripe != last_it->first.stripe+last_it->second.len))
{
repeated++;
flush_buffers(cli, flush_it, wr_it);
flush_it = wr_it;
}
if (end)
break;
last_it = wr_it;
wr_it++;
if (!flush_this)
flush_it = wr_it;
}
}
return repeated;
}
void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from_it, dirty_buf_it_t to_it)
{
auto prev_it = to_it;
prev_it--;
bool is_writeback = from_it->second.state == CACHE_DIRTY;
cluster_op_t *op = new cluster_op_t;
op->flags = OSD_OP_IGNORE_READONLY|OP_FLUSH_BUFFER;
op->opcode = OSD_OP_WRITE;
op->cur_inode = op->inode = from_it->first.inode;
op->offset = from_it->first.stripe;
op->len = prev_it->first.stripe + prev_it->second.len - from_it->first.stripe;
uint32_t calc_len = 0;
uint64_t flush_id = ++last_flush_id;
for (auto it = from_it; it != to_it; it++)
{
it->second.state = CACHE_REPEATING;
it->second.flush_id = flush_id;
(*it->second.refcnt)++;
flushed_buffers.emplace(flush_id, it->second.refcnt);
op->iov.push_back(it->second.buf, it->second.len);
calc_len += it->second.len;
}
assert(calc_len == op->len);
writebacks_active++;
op->callback = [this, flush_id](cluster_op_t* op)
{
// Buffer flushes should be always retried, regardless of the error,
// so they should never result in an error here
assert(op->retval == op->len);
for (auto fl_it = flushed_buffers.find(flush_id);
fl_it != flushed_buffers.end() && fl_it->first == flush_id; )
{
if (!--(*fl_it->second)) // refcnt
{
free(fl_it->second);
}
flushed_buffers.erase(fl_it++);
}
for (auto dirty_it = find_dirty(op->inode, op->offset);
dirty_it != dirty_buffers.end() && dirty_it->first.inode == op->inode &&
dirty_it->first.stripe < op->offset+op->len; dirty_it++)
{
if (dirty_it->second.flush_id == flush_id && dirty_it->second.state == CACHE_REPEATING)
{
dirty_it->second.flush_id = 0;
dirty_it->second.state = CACHE_WRITTEN;
}
}
delete op;
writebacks_active--;
// We can't call execute_internal because it affects an invalid copy of the list here
// (erase_op remembers `next` after writeback callback)
};
if (is_writeback)
{
cli->execute_internal(op);
}
else
{
// Insert repeated flushes into the beginning
cli->unshift_op(op);
cli->continue_rw(op);
}
}
void writeback_cache_t::start_writebacks(cluster_client_t *cli, int count)
{
if (!writeback_queue.size())
{
return;
}
std::vector<object_id> queue_copy;
queue_copy.swap(writeback_queue);
int started = 0, i = 0;
for (i = 0; i < queue_copy.size() && (!count || started < count); i++)
{
object_id & req = queue_copy[i];
auto dirty_it = find_dirty(req.inode, req.stripe);
if (dirty_it == dirty_buffers.end() ||
dirty_it->first.inode != req.inode ||
dirty_it->second.state != CACHE_DIRTY)
{
continue;
}
auto from_it = dirty_it;
uint64_t off = dirty_it->first.stripe;
while (from_it != dirty_buffers.begin())
{
from_it--;
if (from_it->second.state != CACHE_DIRTY ||
from_it->first.inode != req.inode ||
from_it->first.stripe+from_it->second.len != off)
{
from_it++;
break;
}
off = from_it->first.stripe;
}
off = dirty_it->first.stripe + dirty_it->second.len;
auto to_it = dirty_it;
to_it++;
while (to_it != dirty_buffers.end())
{
if (to_it->second.state != CACHE_DIRTY ||
to_it->first.inode != req.inode ||
to_it->first.stripe != off)
{
break;
}
off = to_it->first.stripe + to_it->second.len;
to_it++;
}
started++;
assert(writeback_queue_size > 0);
writeback_queue_size--;
writeback_bytes -= off - from_it->first.stripe;
flush_buffers(cli, from_it, to_it);
}
queue_copy.erase(queue_copy.begin(), queue_copy.begin()+i);
if (writeback_queue.size())
{
queue_copy.insert(queue_copy.end(), writeback_queue.begin(), writeback_queue.end());
}
queue_copy.swap(writeback_queue);
}
static void copy_to_op(cluster_op_t *op, uint64_t offset, uint8_t *buf, uint64_t len, uint32_t bitmap_granularity)
{
if (op->opcode == OSD_OP_READ)
{
// Not OSD_OP_READ_BITMAP or OSD_OP_READ_CHAIN_BITMAP
int iov_idx = 0;
uint64_t cur_offset = op->offset;
while (iov_idx < op->iov.count && cur_offset+op->iov.buf[iov_idx].iov_len <= offset)
{
cur_offset += op->iov.buf[iov_idx].iov_len;
iov_idx++;
}
while (iov_idx < op->iov.count && cur_offset < offset+len)
{
auto & v = op->iov.buf[iov_idx];
auto begin = (cur_offset < offset ? offset : cur_offset);
auto end = (cur_offset+v.iov_len > offset+len ? offset+len : cur_offset+v.iov_len);
memcpy(
(uint8_t*)v.iov_base + begin - cur_offset,
buf + (cur_offset <= offset ? 0 : cur_offset-offset),
end - begin
);
cur_offset += v.iov_len;
iov_idx++;
}
}
// Set bitmap bits
int start_bit = (offset-op->offset)/bitmap_granularity;
int end_bit = (offset-op->offset+len)/bitmap_granularity;
for (int bit = start_bit; bit < end_bit;)
{
if (!(bit%8) && bit <= end_bit-8)
{
((uint8_t*)op->bitmap_buf)[bit/8] = 0xFF;
bit += 8;
}
else
{
((uint8_t*)op->bitmap_buf)[bit/8] |= (1 << (bit%8));
bit++;
}
}
}
bool writeback_cache_t::read_from_cache(cluster_op_t *op, uint32_t bitmap_granularity)
{
bool dirty_copied = false;
if (dirty_buffers.size() && (op->opcode == OSD_OP_READ ||
op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP))
{
// We also have to return reads from CACHE_REPEATING buffers - they are not
// guaranteed to be present on target OSDs at the moment of repeating
// And we're also free to return data from other cached buffers just
// because it's faster
auto dirty_it = find_dirty(op->cur_inode, op->offset);
while (dirty_it != dirty_buffers.end() && dirty_it->first.inode == op->cur_inode &&
dirty_it->first.stripe < op->offset+op->len)
{
uint64_t begin = dirty_it->first.stripe, end = dirty_it->first.stripe + dirty_it->second.len;
if (begin < op->offset)
begin = op->offset;
if (end > op->offset+op->len)
end = op->offset+op->len;
bool skip_prev = true;
uint64_t cur = begin, prev = begin;
while (cur < end)
{
unsigned bmp_loc = (cur - op->offset)/bitmap_granularity;
bool skip = (((*((uint8_t*)op->bitmap_buf + bmp_loc/8)) >> (bmp_loc%8)) & 0x1);
if (skip_prev != skip)
{
if (cur > prev && !skip)
{
// Copy data
dirty_copied = true;
copy_to_op(op, prev, dirty_it->second.buf + prev - dirty_it->first.stripe, cur-prev, bitmap_granularity);
}
skip_prev = skip;
prev = cur;
}
cur += bitmap_granularity;
}
assert(cur > prev);
if (!skip_prev)
{
// Copy data
dirty_copied = true;
copy_to_op(op, prev, dirty_it->second.buf + prev - dirty_it->first.stripe, cur-prev, bitmap_granularity);
}
dirty_it++;
}
}
return dirty_copied;
}
void writeback_cache_t::fsync_start()
{
for (auto & prev_op: dirty_buffers)
{
if (prev_op.second.state == CACHE_WRITTEN)
{
prev_op.second.state = CACHE_FLUSHING;
}
}
}
void writeback_cache_t::fsync_error()
{
for (auto & prev_op: dirty_buffers)
{
if (prev_op.second.state == CACHE_FLUSHING)
{
prev_op.second.state = CACHE_WRITTEN;
}
}
}
void writeback_cache_t::fsync_ok()
{
for (auto uw_it = dirty_buffers.begin(); uw_it != dirty_buffers.end(); )
{
if (uw_it->second.state == CACHE_FLUSHING)
{
if (!--(*uw_it->second.refcnt))
free(uw_it->second.refcnt);
dirty_buffers.erase(uw_it++);
}
else
uw_it++;
}
}