forked from vitalif/vitastor
Fix sync confirmations and some pipeline-stall bugs
parent
78807eb244
commit
e2b91968c5
|
@ -108,18 +108,30 @@ void blockstore::loop()
|
||||||
continue_sync(*cur_sync++);
|
continue_sync(*cur_sync++);
|
||||||
}
|
}
|
||||||
auto cur = submit_queue.begin();
|
auto cur = submit_queue.begin();
|
||||||
bool has_writes = false;
|
int has_writes = 0;
|
||||||
while (cur != submit_queue.end())
|
while (cur != submit_queue.end())
|
||||||
{
|
{
|
||||||
auto op_ptr = cur;
|
auto op_ptr = cur;
|
||||||
auto op = *(cur++);
|
auto op = *(cur++);
|
||||||
|
// FIXME: This needs some simplification
|
||||||
|
// Writes should not block reads if the ring is not null and if reads don't depend on them
|
||||||
|
// In all other cases we should stop submission
|
||||||
if (op->wait_for)
|
if (op->wait_for)
|
||||||
{
|
{
|
||||||
check_wait(op);
|
check_wait(op);
|
||||||
if (op->wait_for == WAIT_SQE)
|
if (op->wait_for == WAIT_SQE)
|
||||||
|
{
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
else if (op->wait_for)
|
else if (op->wait_for)
|
||||||
|
{
|
||||||
|
if ((op->flags & OP_TYPE_MASK) == OP_WRITE ||
|
||||||
|
(op->flags & OP_TYPE_MASK) == OP_DELETE)
|
||||||
|
{
|
||||||
|
has_writes = 2;
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
unsigned ring_space = io_uring_sq_space_left(&ringloop->ring);
|
unsigned ring_space = io_uring_sq_space_left(&ringloop->ring);
|
||||||
unsigned prev_sqe_pos = ringloop->ring.sq.sqe_tail;
|
unsigned prev_sqe_pos = ringloop->ring.sq.sqe_tail;
|
||||||
|
@ -131,8 +143,13 @@ void blockstore::loop()
|
||||||
else if ((op->flags & OP_TYPE_MASK) == OP_WRITE ||
|
else if ((op->flags & OP_TYPE_MASK) == OP_WRITE ||
|
||||||
(op->flags & OP_TYPE_MASK) == OP_DELETE)
|
(op->flags & OP_TYPE_MASK) == OP_DELETE)
|
||||||
{
|
{
|
||||||
has_writes = true;
|
if (has_writes == 2)
|
||||||
|
{
|
||||||
|
// Some writes could not be submitted
|
||||||
|
break;
|
||||||
|
}
|
||||||
dequeue_op = dequeue_write(op);
|
dequeue_op = dequeue_write(op);
|
||||||
|
has_writes = dequeue_op ? 1 : 2;
|
||||||
}
|
}
|
||||||
else if ((op->flags & OP_TYPE_MASK) == OP_SYNC)
|
else if ((op->flags & OP_TYPE_MASK) == OP_SYNC)
|
||||||
{
|
{
|
||||||
|
@ -229,7 +246,7 @@ void blockstore::check_wait(blockstore_operation *op)
|
||||||
}
|
}
|
||||||
else if (op->wait_for == WAIT_JOURNAL)
|
else if (op->wait_for == WAIT_JOURNAL)
|
||||||
{
|
{
|
||||||
if (journal.used_start < op->wait_detail)
|
if (journal.used_start == op->wait_detail)
|
||||||
{
|
{
|
||||||
// do not submit
|
// do not submit
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -312,6 +312,7 @@ class blockstore
|
||||||
int dequeue_sync(blockstore_operation *op);
|
int dequeue_sync(blockstore_operation *op);
|
||||||
void handle_sync_event(ring_data_t *data, blockstore_operation *op);
|
void handle_sync_event(ring_data_t *data, blockstore_operation *op);
|
||||||
int continue_sync(blockstore_operation *op);
|
int continue_sync(blockstore_operation *op);
|
||||||
|
void ack_one_sync(blockstore_operation *op);
|
||||||
int ack_sync(blockstore_operation *op);
|
int ack_sync(blockstore_operation *op);
|
||||||
|
|
||||||
// Stabilize
|
// Stabilize
|
||||||
|
|
|
@ -63,7 +63,8 @@ void journal_flusher_t::queue_flush(obj_ver_id ov)
|
||||||
auto it = flush_versions.find(ov.oid);
|
auto it = flush_versions.find(ov.oid);
|
||||||
if (it != flush_versions.end())
|
if (it != flush_versions.end())
|
||||||
{
|
{
|
||||||
it->second = ov.version;
|
if (it->second < ov.version)
|
||||||
|
it->second = ov.version;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -77,7 +78,8 @@ void journal_flusher_t::unshift_flush(obj_ver_id ov)
|
||||||
auto it = flush_versions.find(ov.oid);
|
auto it = flush_versions.find(ov.oid);
|
||||||
if (it != flush_versions.end())
|
if (it != flush_versions.end())
|
||||||
{
|
{
|
||||||
it->second = ov.version;
|
if (it->second < ov.version)
|
||||||
|
it->second = ov.version;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -199,7 +201,12 @@ resume_0:
|
||||||
}
|
}
|
||||||
else if (!IS_STABLE(dirty_it->second.state))
|
else if (!IS_STABLE(dirty_it->second.state))
|
||||||
{
|
{
|
||||||
throw std::runtime_error("BUG: Unexpected dirty_entry state during flush: " + std::to_string(dirty_it->second.state));
|
char err[1024];
|
||||||
|
snprintf(
|
||||||
|
err, 1024, "BUG: Unexpected dirty_entry %lu:%lu v%lu state during flush: %d",
|
||||||
|
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, dirty_it->second.state
|
||||||
|
);
|
||||||
|
throw std::runtime_error(err);
|
||||||
}
|
}
|
||||||
if (dirty_it == bs->dirty_db.begin())
|
if (dirty_it == bs->dirty_db.begin())
|
||||||
{
|
{
|
||||||
|
@ -217,7 +224,7 @@ resume_0:
|
||||||
flusher->active_flushers--;
|
flusher->active_flushers--;
|
||||||
flusher->active_until_sync--;
|
flusher->active_until_sync--;
|
||||||
repeat_it = flusher->sync_to_repeat.find(cur.oid);
|
repeat_it = flusher->sync_to_repeat.find(cur.oid);
|
||||||
if (repeat_it->second != 0)
|
if (repeat_it->second > cur.version)
|
||||||
{
|
{
|
||||||
// Requeue version
|
// Requeue version
|
||||||
flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second });
|
flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second });
|
||||||
|
@ -475,12 +482,12 @@ resume_0:
|
||||||
wait_state = 0;
|
wait_state = 0;
|
||||||
flusher->active_flushers--;
|
flusher->active_flushers--;
|
||||||
repeat_it = flusher->sync_to_repeat.find(cur.oid);
|
repeat_it = flusher->sync_to_repeat.find(cur.oid);
|
||||||
if (repeat_it->second != 0)
|
if (repeat_it->second > cur.version)
|
||||||
{
|
{
|
||||||
// Requeue version
|
// Requeue version
|
||||||
flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second });
|
flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second });
|
||||||
flusher->sync_to_repeat.erase(repeat_it);
|
|
||||||
}
|
}
|
||||||
|
flusher->sync_to_repeat.erase(repeat_it);
|
||||||
goto resume_0;
|
goto resume_0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -55,9 +55,9 @@ int blockstore_journal_check_t::check_available(blockstore_operation *op, int re
|
||||||
}
|
}
|
||||||
if (!right_dir && next_pos >= bs->journal.used_start-512)
|
if (!right_dir && next_pos >= bs->journal.used_start-512)
|
||||||
{
|
{
|
||||||
// No space in the journal. Wait for it.
|
// No space in the journal. Wait until used_start changes.
|
||||||
op->wait_for = WAIT_JOURNAL;
|
op->wait_for = WAIT_JOURNAL;
|
||||||
op->wait_detail = next_pos;
|
op->wait_detail = bs->journal.used_start;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
return 1;
|
return 1;
|
||||||
|
|
|
@ -140,6 +140,7 @@ void blockstore::handle_sync_event(ring_data_t *data, blockstore_operation *op)
|
||||||
if (op->sync_state == SYNC_DATA_SYNC_SENT)
|
if (op->sync_state == SYNC_DATA_SYNC_SENT)
|
||||||
{
|
{
|
||||||
op->sync_state = SYNC_DATA_SYNC_DONE;
|
op->sync_state = SYNC_DATA_SYNC_DONE;
|
||||||
|
// FIXME: This is not needed, in fact
|
||||||
for (auto it = op->sync_big_writes.begin(); it != op->sync_big_writes.end(); it++)
|
for (auto it = op->sync_big_writes.begin(); it != op->sync_big_writes.end(); it++)
|
||||||
{
|
{
|
||||||
dirty_db[*it].state = ST_D_SYNCED;
|
dirty_db[*it].state = ST_D_SYNCED;
|
||||||
|
@ -148,18 +149,6 @@ void blockstore::handle_sync_event(ring_data_t *data, blockstore_operation *op)
|
||||||
else if (op->sync_state == SYNC_JOURNAL_SYNC_SENT)
|
else if (op->sync_state == SYNC_JOURNAL_SYNC_SENT)
|
||||||
{
|
{
|
||||||
op->sync_state = SYNC_DONE;
|
op->sync_state = SYNC_DONE;
|
||||||
for (auto it = op->sync_big_writes.begin(); it != op->sync_big_writes.end(); it++)
|
|
||||||
{
|
|
||||||
auto & unstab = unstable_writes[it->oid];
|
|
||||||
unstab = !unstab || unstab > it->version ? it->version : unstab;
|
|
||||||
dirty_db[*it].state = ST_D_META_SYNCED;
|
|
||||||
}
|
|
||||||
for (auto it = op->sync_small_writes.begin(); it != op->sync_small_writes.end(); it++)
|
|
||||||
{
|
|
||||||
auto & unstab = unstable_writes[it->oid];
|
|
||||||
unstab = !unstab || unstab > it->version ? it->version : unstab;
|
|
||||||
dirty_db[*it].state = ST_J_SYNCED;
|
|
||||||
}
|
|
||||||
ack_sync(op);
|
ack_sync(op);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -177,6 +166,8 @@ int blockstore::ack_sync(blockstore_operation *op)
|
||||||
auto it = op->in_progress_ptr;
|
auto it = op->in_progress_ptr;
|
||||||
int done_syncs = 1;
|
int done_syncs = 1;
|
||||||
++it;
|
++it;
|
||||||
|
// Acknowledge sync
|
||||||
|
ack_one_sync(op);
|
||||||
while (it != in_progress_syncs.end())
|
while (it != in_progress_syncs.end())
|
||||||
{
|
{
|
||||||
auto & next_sync = *it++;
|
auto & next_sync = *it++;
|
||||||
|
@ -185,16 +176,30 @@ int blockstore::ack_sync(blockstore_operation *op)
|
||||||
{
|
{
|
||||||
done_syncs++;
|
done_syncs++;
|
||||||
// Acknowledge next_sync
|
// Acknowledge next_sync
|
||||||
in_progress_syncs.erase(next_sync->in_progress_ptr);
|
ack_one_sync(next_sync);
|
||||||
next_sync->retval = 0;
|
|
||||||
next_sync->callback(next_sync);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Acknowledge sync
|
|
||||||
in_progress_syncs.erase(op->in_progress_ptr);
|
|
||||||
op->retval = 0;
|
|
||||||
op->callback(op);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void blockstore::ack_one_sync(blockstore_operation *op)
|
||||||
|
{
|
||||||
|
// Handle states
|
||||||
|
for (auto it = op->sync_big_writes.begin(); it != op->sync_big_writes.end(); it++)
|
||||||
|
{
|
||||||
|
auto & unstab = unstable_writes[it->oid];
|
||||||
|
unstab = unstab < it->version ? it->version : unstab;
|
||||||
|
dirty_db[*it].state = ST_D_META_SYNCED;
|
||||||
|
}
|
||||||
|
for (auto it = op->sync_small_writes.begin(); it != op->sync_small_writes.end(); it++)
|
||||||
|
{
|
||||||
|
auto & unstab = unstable_writes[it->oid];
|
||||||
|
unstab = unstab < it->version ? it->version : unstab;
|
||||||
|
dirty_db[*it].state = ST_J_SYNCED;
|
||||||
|
}
|
||||||
|
in_progress_syncs.erase(op->in_progress_ptr);
|
||||||
|
op->retval = 0;
|
||||||
|
op->callback(op);
|
||||||
|
}
|
||||||
|
|
|
@ -41,24 +41,6 @@ void blockstore::enqueue_write(blockstore_operation *op)
|
||||||
.len = op->len,
|
.len = op->len,
|
||||||
.journal_sector = 0,
|
.journal_sector = 0,
|
||||||
});
|
});
|
||||||
// Remember write as unsynced here, so external consumers could get
|
|
||||||
// the list of dirty objects to sync just before issuing a SYNC request
|
|
||||||
if (op->len == block_size || op->version == 1)
|
|
||||||
{
|
|
||||||
// Remember big write as unsynced
|
|
||||||
unsynced_big_writes.push_back((obj_ver_id){
|
|
||||||
.oid = op->oid,
|
|
||||||
.version = op->version,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// Remember small write as unsynced
|
|
||||||
unsynced_small_writes.push_back((obj_ver_id){
|
|
||||||
.oid = op->oid,
|
|
||||||
.version = op->version,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// First step of the write algorithm: dequeue operation and submit initial write(s)
|
// First step of the write algorithm: dequeue operation and submit initial write(s)
|
||||||
|
@ -109,6 +91,11 @@ int blockstore::dequeue_write(blockstore_operation *op)
|
||||||
);
|
);
|
||||||
op->pending_ops = 1;
|
op->pending_ops = 1;
|
||||||
op->min_used_journal_sector = op->max_used_journal_sector = 0;
|
op->min_used_journal_sector = op->max_used_journal_sector = 0;
|
||||||
|
// Remember big write as unsynced
|
||||||
|
unsynced_big_writes.push_back((obj_ver_id){
|
||||||
|
.oid = op->oid,
|
||||||
|
.version = op->version,
|
||||||
|
});
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -154,6 +141,11 @@ int blockstore::dequeue_write(blockstore_operation *op)
|
||||||
dirty_it->second.state = ST_J_SUBMITTED;
|
dirty_it->second.state = ST_J_SUBMITTED;
|
||||||
journal.next_free += op->len;
|
journal.next_free += op->len;
|
||||||
op->pending_ops = 2;
|
op->pending_ops = 2;
|
||||||
|
// Remember small write as unsynced
|
||||||
|
unsynced_small_writes.push_back((obj_ver_id){
|
||||||
|
.oid = op->oid,
|
||||||
|
.version = op->version,
|
||||||
|
});
|
||||||
}
|
}
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
|
@ -178,7 +178,7 @@ static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
|
||||||
.version = it->second,
|
.version = it->second,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
bsd->bs->enqueue_op(op);
|
bsd->bs->unstable_writes.clear();
|
||||||
op->callback = [io, n](blockstore_operation *op)
|
op->callback = [io, n](blockstore_operation *op)
|
||||||
{
|
{
|
||||||
io->error = op->retval < 0 ? -op->retval : 0;
|
io->error = op->retval < 0 ? -op->retval : 0;
|
||||||
|
@ -191,6 +191,7 @@ static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
|
||||||
printf("--- OP_SYNC %llx n=%d retval=%d\n", io, n, op->retval);
|
printf("--- OP_SYNC %llx n=%d retval=%d\n", io, n, op->retval);
|
||||||
delete op;
|
delete op;
|
||||||
};
|
};
|
||||||
|
bsd->bs->enqueue_op(op);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in New Issue