Make deletions instantly stable

"2-phase" (write->stabilize) process is pointless for deletions because it
doesn't protect us from incomplete objects. This happens because it removes
the version information from metadata after stabilization. Deletions require
a "3-phase" process with a potentially very long 3rd phase.

So, deletions will be allowed to generate degraded and incomplete objects,
and so that this does not affect users' ability to delete something, the
cluster will allow deleting whole inodes while storing a list of them in etcd.
Proper TRIM will be impossible until the implementation of the aforementioned
"3-phase" process, though.

By the way, this change also fixes a possible write stall after rebalancing
which was caused by the lack of "stabilize delete" operations.
sync-io-test
Vitaliy Filippov 2020-06-02 20:43:28 +03:00
parent 985c309d7f
commit 571be0f380
5 changed files with 21 additions and 12 deletions

View File

@ -624,6 +624,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.journal_sector = proc_pos,
});
bs->journal.used_sectors[proc_pos]++;
// Deletions are treated as immediately stable, because
// "2-phase commit" (write->stabilize) isn't sufficient for them anyway
bs->mark_stable(ov);
}
}
started = true;

View File

@ -275,7 +275,16 @@ void blockstore_impl_t::ack_one_sync(blockstore_op_t *op)
#endif
auto & unstab = unstable_writes[it->oid];
unstab = unstab < it->version ? it->version : unstab;
dirty_db[*it].state = dirty_db[*it].state == ST_DEL_WRITTEN ? ST_DEL_SYNCED : ST_J_SYNCED;
if (dirty_db[*it].state == ST_DEL_WRITTEN)
{
dirty_db[*it].state = ST_DEL_SYNCED;
// Deletions are treated as immediately stable
mark_stable(*it);
}
else /* == ST_J_WRITTEN */
{
dirty_db[*it].state = ST_J_SYNCED;
}
}
in_progress_syncs.erase(PRIV(op)->in_progress_ptr);
op->retval = 0;

View File

@ -355,6 +355,11 @@ resume_4:
else if (dirty_it->second.state == ST_DEL_SUBMITTED)
{
dirty_it->second.state = imm ? ST_DEL_SYNCED : ST_DEL_WRITTEN;
if (imm)
{
// Deletions are treated as immediately stable
mark_stable(dirty_it->first);
}
}
if (immediate_commit == IMMEDIATE_ALL)
{

2
osd.h
View File

@ -193,7 +193,7 @@ class osd_t
void continue_primary_del(osd_op_t *cur_op);
bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
bool finalize_primary_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
void handle_primary_bs_subop(osd_op_t *subop);
void add_bs_subop_stats(osd_op_t *subop);

View File

@ -284,7 +284,7 @@ resume_9:
// FIXME: Check for immediate_commit == IMMEDIATE_SMALL
resume_6:
resume_7:
if (!finalize_primary_write(cur_op, pg, pg.cur_loc_set, 6))
if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
{
return;
}
@ -305,7 +305,7 @@ resume_7:
}
}
bool osd_t::finalize_primary_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
bool osd_t::remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
{
osd_primary_op_data_t *op_data = cur_op->op_data;
if (op_data->st == base_state)
@ -598,8 +598,6 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
else if (op_data->st == 3) goto resume_3;
else if (op_data->st == 4) goto resume_4;
else if (op_data->st == 5) goto resume_5;
else if (op_data->st == 6) goto resume_6;
else if (op_data->st == 7) goto resume_7;
assert(op_data->st == 0);
// Delete is forbidden even in active PGs if they're also degraded or have previous dead OSDs
if (pg.state & (PG_DEGRADED | PG_LEFT_ON_DEAD))
@ -641,12 +639,6 @@ resume_5:
}
// Remove version override
pg.ver_override.erase(op_data->oid);
resume_6:
resume_7:
if (!finalize_primary_write(cur_op, pg, op_data->object_state ? op_data->object_state->osd_set : pg.cur_loc_set, 6))
{
return;
}
// Adjust PG stats after "instant stabilize", because we need object_state above
if (!op_data->object_state)
{