Implement graceful stopping of PGs

Branch: trace-sqes
Vitaliy Filippov 2020-04-03 13:03:42 +03:00
parent afe2e76c87
commit dfb6e15eaa
6 changed files with 86 additions and 18 deletions
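
In short: every PG now counts its in-flight primary operations (pg.inflight, incremented in prepare_primary_rw() and for every dirty PG pinned by a SYNC, decremented in finish_op() and at resume_6 for sync-pinned PGs). stop_pg() only flips an active PG from PG_ACTIVE to PG_STOPPING; the actual transition to PG_OFFLINE happens in finish_stop_pg(), either immediately when nothing is in flight or from finish_op() once the last in-flight op completes. The standalone sketch below models just this protocol; the PG_* bit values and the op_finished() helper are illustrative stand-ins, not the real OSD code.

#include <cassert>
#include <cstdio>

// Illustrative state bits (values are made up; only the names match the diff)
#define PG_OFFLINE  (1<<0)
#define PG_ACTIVE   (1<<2)
#define PG_STOPPING (1<<3)

struct pg_t
{
    int state = PG_OFFLINE;
    int inflight = 0; // in-flight primary ops, including queued writes
};

void finish_stop_pg(pg_t & pg)
{
    pg.state = PG_OFFLINE;
}

// Mark an active PG as stopping; it goes offline as soon as inflight drains
bool stop_pg(pg_t & pg)
{
    if (!(pg.state & PG_ACTIVE))
        return false;
    pg.state = (pg.state & ~PG_ACTIVE) | PG_STOPPING;
    if (pg.inflight == 0)
        finish_stop_pg(pg);
    return true;
}

// What finish_op() does for the PG of a completed primary op
void op_finished(pg_t & pg)
{
    assert(pg.inflight > 0);
    if (--pg.inflight == 0 && (pg.state & PG_STOPPING))
        finish_stop_pg(pg);
}

int main()
{
    pg_t pg;
    pg.state = PG_ACTIVE;
    pg.inflight = 2;     // two ops still running when the stop is requested
    stop_pg(pg);         // PG_ACTIVE -> PG_STOPPING, PG stays up
    op_finished(pg);     // one op left
    op_finished(pg);     // last op done -> finish_stop_pg() -> PG_OFFLINE
    printf("%s\n", pg.state == PG_OFFLINE ? "PG_OFFLINE" : "still stopping");
    return 0;
}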


@@ -370,6 +370,7 @@ void osd_t::exec_op(osd_op_t *cur_op)
         delete cur_op;
         return;
     }
+    inflight_ops++;
     cur_op->send_list.push_back(cur_op->reply.buf, OSD_PACKET_SIZE);
     if (cur_op->req.hdr.magic != SECONDARY_OSD_OP_MAGIC ||
         cur_op->req.hdr.opcode < OSD_OP_MIN || cur_op->req.hdr.opcode > OSD_OP_MAX ||
@@ -382,7 +383,6 @@ void osd_t::exec_op(osd_op_t *cur_op)
         finish_op(cur_op, -EINVAL);
         return;
     }
-    inflight_ops++;
     if (cur_op->req.hdr.opcode == OSD_OP_TEST_SYNC_STAB_ALL)
     {
         exec_sync_stab_all(cur_op);

osd.h

@@ -200,6 +200,7 @@ class osd_t
     std::map<uint64_t, int> osd_peer_fds;
     std::map<pg_num_t, pg_t> pgs;
+    std::set<pg_num_t> dirty_pgs;
     uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
     int peering_state = 0;
     unsigned pg_count = 0;
@@ -265,6 +266,8 @@ class osd_t
     void handle_peers();
     void repeer_pgs(osd_num_t osd_num, bool is_connected);
     void start_pg_peering(pg_num_t pg_num);
+    bool stop_pg(pg_num_t pg_num);
+    void finish_stop_pg(pg_t & pg);
     // flushing, recovery and backfill
     void submit_pg_flush_ops(pg_num_t pg_num);


@@ -265,6 +265,7 @@ void osd_t::start_pg_peering(pg_num_t pg_num)
     auto & pg = pgs[pg_num];
     pg.state = PG_PEERING;
     pg.print_state();
+    // Reset PG state
     pg.state_dict.clear();
     incomplete_objects -= pg.incomplete_objects.size();
     misplaced_objects -= pg.misplaced_objects.size();
@@ -284,15 +285,18 @@ void osd_t::start_pg_peering(pg_num_t pg_num)
         cancel_op(p.second);
     }
     pg.write_queue.clear();
+    // Forget this PG's unstable writes
     for (auto it = unstable_writes.begin(); it != unstable_writes.end(); )
     {
-        // Forget this PG's unstable writes
         pg_num_t n = (it->first.oid.inode + it->first.oid.stripe / pg_stripe_size) % pg_count + 1;
         if (n == pg.pg_num)
             unstable_writes.erase(it++);
         else
             it++;
     }
+    pg.inflight = 0;
+    dirty_pgs.erase(pg.pg_num);
+    // Start peering
     pg.pg_cursize = 0;
     for (int role = 0; role < pg.cur_set.size(); role++)
     {
@@ -472,3 +476,28 @@ void osd_t::start_pg_peering(pg_num_t pg_num)
     }
     ringloop->wakeup();
 }
+
+bool osd_t::stop_pg(pg_num_t pg_num)
+{
+    auto pg_it = pgs.find(pg_num);
+    if (pg_it == pgs.end())
+    {
+        return false;
+    }
+    auto & pg = pg_it->second;
+    if (!(pg.state & PG_ACTIVE))
+    {
+        return false;
+    }
+    pg.state = pg.state & ~PG_ACTIVE | PG_STOPPING;
+    if (pg.inflight == 0)
+    {
+        finish_stop_pg(pg);
+    }
+    return true;
+}
+
+void osd_t::finish_stop_pg(pg_t & pg)
+{
+    pg.state = PG_OFFLINE;
+}


@@ -92,7 +92,7 @@ struct pg_flush_batch_t
 struct pg_t
 {
-    int state;
+    int state = PG_OFFLINE;
     uint64_t pg_cursize = 3, pg_size = 3, pg_minsize = 2;
     pg_num_t pg_num;
     uint64_t clean_count = 0, total_count = 0;
@@ -112,6 +112,7 @@ struct pg_t
     pg_peering_state_t *peering_state = NULL;
     pg_flush_batch_t *flush_batch = NULL;
+    int inflight = 0; // including write_queue
     std::multimap<object_id, osd_op_t*> write_queue;
     void calc_object_states();


@@ -35,11 +35,24 @@ struct osd_primary_op_data_t
     uint64_t *prev_set = NULL;
     // for sync. oops, requires freeing
     std::vector<unstable_osd_num_t> *unstable_write_osds = NULL;
+    pg_num_t *dirty_pgs = NULL;
+    int dirty_pg_count = 0;
     obj_ver_id *unstable_writes = NULL;
 };

 void osd_t::finish_op(osd_op_t *cur_op, int retval)
 {
+    inflight_ops--;
+    if (cur_op->op_data && cur_op->op_data->pg_num > 0)
+    {
+        auto & pg = pgs[cur_op->op_data->pg_num];
+        int n = --pg.inflight;
+        assert(n >= 0);
+        if (n == 0 && (pg.state & PG_STOPPING))
+        {
+            finish_stop_pg(pg);
+        }
+    }
     if (!cur_op->peer_fd)
     {
         // Copy lambda to be unaffected by `delete op`
@@ -71,12 +84,14 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
     // But we must not use K in the process of calculating the PG number
     // So we calculate the PG number using a separate setting which should be per-inode (FIXME)
     pg_num_t pg_num = (cur_op->req.rw.inode + cur_op->req.rw.offset / pg_stripe_size) % pg_count + 1;
-    if (pgs.find(pg_num) == pgs.end() || !(pgs[pg_num].state & PG_ACTIVE))
+    auto pg_it = pgs.find(pg_num);
+    if (pg_it == pgs.end() || !(pg_it->second.state & PG_ACTIVE))
     {
+        // This OSD is not primary for this PG or the PG is inactive
         finish_op(cur_op, -EPIPE);
         return false;
     }
-    uint64_t pg_block_size = bs_block_size * pgs[pg_num].pg_minsize;
+    uint64_t pg_block_size = bs_block_size * pg_it->second.pg_minsize;
     object_id oid = {
         .inode = cur_op->req.rw.inode,
         // oid.stripe = starting offset of the parity stripe, so it can be mapped back to the PG
@@ -91,13 +106,14 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
         return false;
     }
     osd_primary_op_data_t *op_data = (osd_primary_op_data_t*)calloc(
-        sizeof(osd_primary_op_data_t) + sizeof(osd_rmw_stripe_t) * pgs[pg_num].pg_size, 1
+        sizeof(osd_primary_op_data_t) + sizeof(osd_rmw_stripe_t) * pg_it->second.pg_size, 1
     );
     op_data->pg_num = pg_num;
     op_data->oid = oid;
     op_data->stripes = ((osd_rmw_stripe_t*)(op_data+1));
     cur_op->op_data = op_data;
-    split_stripes(pgs[pg_num].pg_minsize, bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
+    split_stripes(pg_it->second.pg_minsize, bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
+    pg_it->second.inflight++;
     return true;
 }
@@ -601,6 +617,7 @@ resume_7:
         // Remember PG as dirty to drop the connection when PG goes offline
         // (this is required because of the "lazy sync")
         this->clients[cur_op->peer_fd].dirty_pgs.insert(op_data->pg_num);
+        dirty_pgs.insert(op_data->pg_num);
     }
     // Remove version override
     object_id oid = op_data->oid;
@@ -683,11 +700,13 @@ resume_2:
         goto finish;
     }
     // Save and clear unstable_writes
-    // FIXME: This is possible to do it on a per-client basis
-    // It would be cool not to copy them here at all, but someone has to deduplicate them by object IDs anyway
-    op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
-    op_data->unstable_writes = new obj_ver_id[this->unstable_writes.size()];
+    // In theory it is possible to do it on a per-client basis, but this seems to be an unnecessary complication
+    // It would be cool not to copy these here at all, but someone has to deduplicate them by object IDs anyway
     {
+        op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
+        op_data->unstable_writes = new obj_ver_id[this->unstable_writes.size()];
+        op_data->dirty_pgs = new pg_num_t[dirty_pgs.size()];
+        op_data->dirty_pg_count = dirty_pgs.size();
         osd_num_t last_osd = 0;
         int last_start = 0, last_end = 0;
         for (auto it = this->unstable_writes.begin(); it != this->unstable_writes.end(); it++)
@@ -719,8 +738,15 @@ resume_2:
                 .len = last_end - last_start,
             });
         }
+        int dpg = 0;
+        for (auto dirty_pg_num: dirty_pgs)
+        {
+            pgs[dirty_pg_num].inflight++;
+            op_data->dirty_pgs[dpg++] = dirty_pg_num;
+        }
+        dirty_pgs.clear();
+        this->unstable_writes.clear();
     }
-    this->unstable_writes.clear();
     if (immediate_commit != IMMEDIATE_ALL)
     {
         // SYNC
@@ -740,6 +766,10 @@ resume_5:
     op_data->st = 5;
     return;
 resume_6:
+    for (int i = 0; i < op_data->dirty_pg_count; i++)
+    {
+        pgs[op_data->dirty_pgs[i]].inflight--;
+    }
     if (op_data->errors > 0)
     {
         // Return objects back into the unstable write set
@@ -747,20 +777,23 @@ resume_6:
         {
             for (int i = 0; i < unstable_osd.len; i++)
             {
-                // Expect those from peered PGs
+                // Except those from peered PGs
                 auto & w = op_data->unstable_writes[i];
-                if (pgs[map_to_pg(w.oid)].state & PG_ACTIVE)
+                pg_num_t wpg = map_to_pg(w.oid);
+                if (pgs[wpg].state & PG_ACTIVE)
                 {
                     uint64_t & dest = this->unstable_writes[(osd_object_id_t){
                         .osd_num = unstable_osd.osd_num,
                         .oid = w.oid,
                     }];
                     dest = dest < w.version ? w.version : dest;
+                    dirty_pgs.insert(wpg);
                 }
             }
         }
     }
     // FIXME: Free those in the destructor?
+    delete op_data->dirty_pgs;
     delete op_data->unstable_write_osds;
     delete[] op_data->unstable_writes;
     op_data->unstable_writes = NULL;
@@ -772,9 +805,12 @@ resume_6:
     else
     {
 finish:
-        auto it = clients.find(cur_op->peer_fd);
-        if (it != clients.end())
-            it->second.dirty_pgs.clear();
+        if (cur_op->peer_fd)
+        {
+            auto it = clients.find(cur_op->peer_fd);
+            if (it != clients.end())
+                it->second.dirty_pgs.clear();
+        }
         finish_op(cur_op, 0);
     }
     assert(syncs_in_progress.front() == cur_op);
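
The SYNC path above pins several PGs at once: writes record their PG in the global dirty_pgs set, a sync copies the current dirty set into op_data->dirty_pgs and increments pg.inflight for each of them so none can finish stopping while its unstable writes are being stabilized, and resume_6 drops the pins (re-inserting still-active PGs into dirty_pgs on errors). A condensed illustration of that bookkeeping, with simplified containers and hypothetical begin_sync()/end_sync() helpers (only dirty_pgs, op_data->dirty_pgs and pg.inflight are names from the diff):

#include <map>
#include <set>
#include <vector>

typedef unsigned pg_num_t;
struct pg_t { int state = 0; int inflight = 0; };

std::map<pg_num_t, pg_t> pgs;
std::set<pg_num_t> dirty_pgs; // PGs that have unstable (not yet stabilized) writes

// Start of a SYNC: snapshot the dirty set and pin each PG so it cannot
// finish stopping while its unstable writes are being stabilized
std::vector<pg_num_t> begin_sync()
{
    std::vector<pg_num_t> pinned(dirty_pgs.begin(), dirty_pgs.end());
    for (pg_num_t n: pinned)
        pgs[n].inflight++;
    dirty_pgs.clear();
    return pinned;
}

// End of the stabilize round: drop the pins; on errors the caller re-inserts
// still-active PGs into dirty_pgs, as resume_6 does above
void end_sync(const std::vector<pg_num_t> & pinned)
{
    for (pg_num_t n: pinned)
        pgs[n].inflight--;
}

int main()
{
    dirty_pgs = { 1, 2 };          // two PGs were written to before this sync
    auto pinned = begin_sync();    // both get inflight = 1
    end_sync(pinned);              // stabilize done, pins dropped
    return 0;
}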


@@ -4,7 +4,6 @@
 void osd_t::secondary_op_callback(osd_op_t *op)
 {
-    inflight_ops--;
     if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ ||
         op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
     {