Implement immediate commit mode
parent
c3737ae3ff
commit
3f522c66e6
|
@ -355,7 +355,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
|
||||||
op->callback(op);
|
op->callback(op);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (0 && op->opcode == BS_OP_SYNC && immediate_commit)
|
if (op->opcode == BS_OP_SYNC && immediate_commit == IMMEDIATE_ALL)
|
||||||
{
|
{
|
||||||
op->retval = 0;
|
op->retval = 0;
|
||||||
op->callback(op);
|
op->callback(op);
|
||||||
|
|
|
@ -34,8 +34,7 @@
|
||||||
#define ST_D_IN_FLIGHT 15
|
#define ST_D_IN_FLIGHT 15
|
||||||
#define ST_D_SUBMITTED 16
|
#define ST_D_SUBMITTED 16
|
||||||
#define ST_D_WRITTEN 17
|
#define ST_D_WRITTEN 17
|
||||||
#define ST_D_META_WRITTEN 19
|
#define ST_D_SYNCED 20
|
||||||
#define ST_D_META_SYNCED 20
|
|
||||||
#define ST_D_STABLE 21
|
#define ST_D_STABLE 21
|
||||||
|
|
||||||
#define ST_DEL_IN_FLIGHT 31
|
#define ST_DEL_IN_FLIGHT 31
|
||||||
|
@ -46,13 +45,17 @@
|
||||||
|
|
||||||
#define ST_CURRENT 48
|
#define ST_CURRENT 48
|
||||||
|
|
||||||
|
#define IMMEDIATE_NONE 0
|
||||||
|
#define IMMEDIATE_SMALL 1
|
||||||
|
#define IMMEDIATE_ALL 2
|
||||||
|
|
||||||
#define IS_IN_FLIGHT(st) (st == ST_J_IN_FLIGHT || st == ST_D_IN_FLIGHT || st == ST_DEL_IN_FLIGHT || st == ST_J_SUBMITTED || st == ST_D_SUBMITTED || st == ST_DEL_SUBMITTED)
|
#define IS_IN_FLIGHT(st) (st == ST_J_IN_FLIGHT || st == ST_D_IN_FLIGHT || st == ST_DEL_IN_FLIGHT || st == ST_J_SUBMITTED || st == ST_D_SUBMITTED || st == ST_DEL_SUBMITTED)
|
||||||
#define IS_STABLE(st) (st == ST_J_STABLE || st == ST_D_STABLE || st == ST_DEL_STABLE || st == ST_CURRENT)
|
#define IS_STABLE(st) (st == ST_J_STABLE || st == ST_D_STABLE || st == ST_DEL_STABLE || st == ST_CURRENT)
|
||||||
#define IS_SYNCED(st) (IS_STABLE(st) || st == ST_J_SYNCED || st == ST_D_META_SYNCED || st == ST_DEL_SYNCED)
|
#define IS_SYNCED(st) (IS_STABLE(st) || st == ST_J_SYNCED || st == ST_D_SYNCED || st == ST_DEL_SYNCED)
|
||||||
#define IS_JOURNAL(st) (st >= ST_J_SUBMITTED && st <= ST_J_STABLE)
|
#define IS_JOURNAL(st) (st >= ST_J_SUBMITTED && st <= ST_J_STABLE)
|
||||||
#define IS_BIG_WRITE(st) (st >= ST_D_SUBMITTED && st <= ST_D_STABLE)
|
#define IS_BIG_WRITE(st) (st >= ST_D_SUBMITTED && st <= ST_D_STABLE)
|
||||||
#define IS_DELETE(st) (st >= ST_DEL_SUBMITTED && st <= ST_DEL_STABLE)
|
#define IS_DELETE(st) (st >= ST_DEL_SUBMITTED && st <= ST_DEL_STABLE)
|
||||||
#define IS_UNSYNCED(st) (st >= ST_J_SUBMITTED && st <= ST_J_WRITTEN || st >= ST_D_SUBMITTED && st <= ST_D_META_WRITTEN || st >= ST_DEL_SUBMITTED && st <= ST_DEL_WRITTEN)
|
#define IS_UNSYNCED(st) (st >= ST_J_SUBMITTED && st <= ST_J_WRITTEN || st >= ST_D_SUBMITTED && st <= ST_D_WRITTEN|| st >= ST_DEL_SUBMITTED && st <= ST_DEL_WRITTEN)
|
||||||
|
|
||||||
#define BS_SUBMIT_GET_SQE(sqe, data) \
|
#define BS_SUBMIT_GET_SQE(sqe, data) \
|
||||||
BS_SUBMIT_GET_ONLY_SQE(sqe); \
|
BS_SUBMIT_GET_ONLY_SQE(sqe); \
|
||||||
|
@ -195,8 +198,8 @@ class blockstore_impl_t
|
||||||
// It is safe to disable fsync() if drive write cache is writethrough
|
// It is safe to disable fsync() if drive write cache is writethrough
|
||||||
bool disable_data_fsync = false, disable_meta_fsync = false, disable_journal_fsync = false;
|
bool disable_data_fsync = false, disable_meta_fsync = false, disable_journal_fsync = false;
|
||||||
// Enable if you want every operation to be executed with an "implicit fsync"
|
// Enable if you want every operation to be executed with an "implicit fsync"
|
||||||
// FIXME Not implemented yet
|
// Suitable only for server SSDs with capacitors, requires disabled data and journal fsyncs
|
||||||
bool immediate_commit = false;
|
int immediate_commit = IMMEDIATE_NONE;
|
||||||
bool inmemory_meta = false;
|
bool inmemory_meta = false;
|
||||||
int flusher_count;
|
int flusher_count;
|
||||||
/******* END OF OPTIONS *******/
|
/******* END OF OPTIONS *******/
|
||||||
|
@ -268,7 +271,7 @@ class blockstore_impl_t
|
||||||
bool enqueue_write(blockstore_op_t *op);
|
bool enqueue_write(blockstore_op_t *op);
|
||||||
int dequeue_write(blockstore_op_t *op);
|
int dequeue_write(blockstore_op_t *op);
|
||||||
int dequeue_del(blockstore_op_t *op);
|
int dequeue_del(blockstore_op_t *op);
|
||||||
void ack_write(blockstore_op_t *op);
|
int continue_write(blockstore_op_t *op);
|
||||||
void release_journal_sectors(blockstore_op_t *op);
|
void release_journal_sectors(blockstore_op_t *op);
|
||||||
void handle_write_event(ring_data_t *data, blockstore_op_t *op);
|
void handle_write_event(ring_data_t *data, blockstore_op_t *op);
|
||||||
|
|
||||||
|
|
|
@ -558,7 +558,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
.version = je->big_write.version,
|
.version = je->big_write.version,
|
||||||
};
|
};
|
||||||
bs->dirty_db.emplace(ov, (dirty_entry){
|
bs->dirty_db.emplace(ov, (dirty_entry){
|
||||||
.state = ST_D_META_SYNCED,
|
.state = ST_D_SYNCED,
|
||||||
.flags = 0,
|
.flags = 0,
|
||||||
.location = je->big_write.location,
|
.location = je->big_write.location,
|
||||||
.offset = je->big_write.offset,
|
.offset = je->big_write.offset,
|
||||||
|
@ -595,7 +595,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
{
|
{
|
||||||
while (1)
|
while (1)
|
||||||
{
|
{
|
||||||
it->second.state = (it->second.state == ST_D_META_SYNCED
|
it->second.state = (it->second.state == ST_D_SYNCED
|
||||||
? ST_D_STABLE
|
? ST_D_STABLE
|
||||||
: (it->second.state == ST_DEL_SYNCED ? ST_DEL_STABLE : ST_J_STABLE));
|
: (it->second.state == ST_DEL_SYNCED ? ST_DEL_STABLE : ST_J_STABLE));
|
||||||
if (it == bs->dirty_db.begin())
|
if (it == bs->dirty_db.begin())
|
||||||
|
|
|
@ -34,6 +34,14 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
|
||||||
{
|
{
|
||||||
disable_journal_fsync = true;
|
disable_journal_fsync = true;
|
||||||
}
|
}
|
||||||
|
if (config["immediate_commit"] == "all")
|
||||||
|
{
|
||||||
|
immediate_commit = IMMEDIATE_ALL;
|
||||||
|
}
|
||||||
|
else if (config["immediate_commit"] == "small")
|
||||||
|
{
|
||||||
|
immediate_commit = IMMEDIATE_SMALL;
|
||||||
|
}
|
||||||
metadata_buf_size = strtoull(config["meta_buf_size"].c_str(), NULL, 10);
|
metadata_buf_size = strtoull(config["meta_buf_size"].c_str(), NULL, 10);
|
||||||
cfg_journal_size = strtoull(config["journal_size"].c_str(), NULL, 10);
|
cfg_journal_size = strtoull(config["journal_size"].c_str(), NULL, 10);
|
||||||
data_device = config["data_device"];
|
data_device = config["data_device"];
|
||||||
|
@ -129,6 +137,22 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
|
||||||
{
|
{
|
||||||
metadata_buf_size = 4*1024*1024;
|
metadata_buf_size = 4*1024*1024;
|
||||||
}
|
}
|
||||||
|
if (meta_device == "")
|
||||||
|
{
|
||||||
|
disable_meta_fsync = disable_data_fsync;
|
||||||
|
}
|
||||||
|
if (journal_device == "")
|
||||||
|
{
|
||||||
|
disable_journal_fsync = disable_meta_fsync;
|
||||||
|
}
|
||||||
|
if (immediate_commit != IMMEDIATE_NONE && !disable_journal_fsync)
|
||||||
|
{
|
||||||
|
throw std::runtime_error("immediate_commit requires disable_journal_fsync");
|
||||||
|
}
|
||||||
|
if (immediate_commit == IMMEDIATE_ALL && !disable_data_fsync)
|
||||||
|
{
|
||||||
|
throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync");
|
||||||
|
}
|
||||||
// init some fields
|
// init some fields
|
||||||
clean_entry_bitmap_size = block_size / bitmap_granularity / 8;
|
clean_entry_bitmap_size = block_size / bitmap_granularity / 8;
|
||||||
clean_entry_size = sizeof(clean_disk_entry) + clean_entry_bitmap_size;
|
clean_entry_size = sizeof(clean_disk_entry) + clean_entry_bitmap_size;
|
||||||
|
@ -283,7 +307,6 @@ void blockstore_impl_t::open_meta()
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
meta_fd = data_fd;
|
meta_fd = data_fd;
|
||||||
disable_meta_fsync = disable_data_fsync;
|
|
||||||
meta_size = 0;
|
meta_size = 0;
|
||||||
if (meta_offset >= data_size)
|
if (meta_offset >= data_size)
|
||||||
{
|
{
|
||||||
|
@ -306,7 +329,6 @@ void blockstore_impl_t::open_journal()
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
journal.fd = meta_fd;
|
journal.fd = meta_fd;
|
||||||
disable_journal_fsync = disable_meta_fsync;
|
|
||||||
journal.device_size = 0;
|
journal.device_size = 0;
|
||||||
if (journal.offset >= data_size)
|
if (journal.offset >= data_size)
|
||||||
{
|
{
|
||||||
|
|
|
@ -181,7 +181,7 @@ resume_5:
|
||||||
{
|
{
|
||||||
dirty_it->second.state = ST_J_STABLE;
|
dirty_it->second.state = ST_J_STABLE;
|
||||||
}
|
}
|
||||||
else if (dirty_it->second.state == ST_D_META_SYNCED)
|
else if (dirty_it->second.state == ST_D_SYNCED)
|
||||||
{
|
{
|
||||||
dirty_it->second.state = ST_D_STABLE;
|
dirty_it->second.state = ST_D_STABLE;
|
||||||
}
|
}
|
||||||
|
|
|
@ -252,7 +252,7 @@ void blockstore_impl_t::ack_one_sync(blockstore_op_t *op)
|
||||||
#endif
|
#endif
|
||||||
auto & unstab = unstable_writes[it->oid];
|
auto & unstab = unstable_writes[it->oid];
|
||||||
unstab = unstab < it->version ? it->version : unstab;
|
unstab = unstab < it->version ? it->version : unstab;
|
||||||
dirty_db[*it].state = ST_D_META_SYNCED;
|
dirty_db[*it].state = ST_D_SYNCED;
|
||||||
}
|
}
|
||||||
for (auto it = PRIV(op)->sync_small_writes.begin(); it != PRIV(op)->sync_small_writes.end(); it++)
|
for (auto it = PRIV(op)->sync_small_writes.begin(); it != PRIV(op)->sync_small_writes.end(); it++)
|
||||||
{
|
{
|
||||||
|
|
|
@ -75,6 +75,10 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||||
// First step of the write algorithm: dequeue operation and submit initial write(s)
|
// First step of the write algorithm: dequeue operation and submit initial write(s)
|
||||||
int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
{
|
{
|
||||||
|
if (PRIV(op)->op_state)
|
||||||
|
{
|
||||||
|
return continue_write(op);
|
||||||
|
}
|
||||||
auto dirty_it = dirty_db.find((obj_ver_id){
|
auto dirty_it = dirty_db.find((obj_ver_id){
|
||||||
.oid = op->oid,
|
.oid = op->oid,
|
||||||
.version = op->version,
|
.version = op->version,
|
||||||
|
@ -129,11 +133,19 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
);
|
);
|
||||||
PRIV(op)->pending_ops = 1;
|
PRIV(op)->pending_ops = 1;
|
||||||
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
|
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
|
||||||
// Remember big write as unsynced
|
if (immediate_commit != IMMEDIATE_ALL)
|
||||||
unsynced_big_writes.push_back((obj_ver_id){
|
{
|
||||||
.oid = op->oid,
|
// Remember big write as unsynced
|
||||||
.version = op->version,
|
unsynced_big_writes.push_back((obj_ver_id){
|
||||||
});
|
.oid = op->oid,
|
||||||
|
.version = op->version,
|
||||||
|
});
|
||||||
|
PRIV(op)->op_state = 3;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
PRIV(op)->op_state = 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -147,10 +159,11 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
}
|
}
|
||||||
// There is sufficient space. Get SQE(s)
|
// There is sufficient space. Get SQE(s)
|
||||||
struct io_uring_sqe *sqe1 = NULL;
|
struct io_uring_sqe *sqe1 = NULL;
|
||||||
if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_small_write) &&
|
if (immediate_commit != IMMEDIATE_NONE ||
|
||||||
|
(journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_small_write) &&
|
||||||
journal.sector_info[journal.cur_sector].dirty)
|
journal.sector_info[journal.cur_sector].dirty)
|
||||||
{
|
{
|
||||||
// Write current journal sector only if it's dirty and full
|
// Write current journal sector only if it's dirty and full, or in the immediate_commit mode
|
||||||
BS_SUBMIT_GET_SQE_DECL(sqe1);
|
BS_SUBMIT_GET_SQE_DECL(sqe1);
|
||||||
}
|
}
|
||||||
struct io_uring_sqe *sqe2 = NULL;
|
struct io_uring_sqe *sqe2 = NULL;
|
||||||
|
@ -160,15 +173,18 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
}
|
}
|
||||||
// Got SQEs. Prepare previous journal sector write if required
|
// Got SQEs. Prepare previous journal sector write if required
|
||||||
auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
||||||
if (sqe1)
|
if (immediate_commit == IMMEDIATE_NONE)
|
||||||
{
|
{
|
||||||
prepare_journal_sector_write(journal, journal.cur_sector, sqe1, cb);
|
if (sqe1)
|
||||||
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
|
{
|
||||||
PRIV(op)->pending_ops++;
|
prepare_journal_sector_write(journal, journal.cur_sector, sqe1, cb);
|
||||||
}
|
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
|
||||||
else
|
PRIV(op)->pending_ops++;
|
||||||
{
|
}
|
||||||
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
|
else
|
||||||
|
{
|
||||||
|
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// Then pre-fill journal entry
|
// Then pre-fill journal entry
|
||||||
journal_entry_small_write *je = (journal_entry_small_write*)
|
journal_entry_small_write *je = (journal_entry_small_write*)
|
||||||
|
@ -188,6 +204,12 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
je->crc32_data = crc32c(0, op->buf, op->len);
|
je->crc32_data = crc32c(0, op->buf, op->len);
|
||||||
je->crc32 = je_crc32((journal_entry*)je);
|
je->crc32 = je_crc32((journal_entry*)je);
|
||||||
journal.crc32_last = je->crc32;
|
journal.crc32_last = je->crc32;
|
||||||
|
if (immediate_commit != IMMEDIATE_NONE)
|
||||||
|
{
|
||||||
|
prepare_journal_sector_write(journal, journal.cur_sector, sqe1, cb);
|
||||||
|
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
|
||||||
|
PRIV(op)->pending_ops++;
|
||||||
|
}
|
||||||
if (op->len > 0)
|
if (op->len > 0)
|
||||||
{
|
{
|
||||||
// Prepare journal data write
|
// Prepare journal data write
|
||||||
|
@ -215,19 +237,99 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
{
|
{
|
||||||
journal.next_free = journal_block_size;
|
journal.next_free = journal_block_size;
|
||||||
}
|
}
|
||||||
// Remember small write as unsynced
|
if (immediate_commit == IMMEDIATE_NONE)
|
||||||
unsynced_small_writes.push_back((obj_ver_id){
|
{
|
||||||
.oid = op->oid,
|
// Remember small write as unsynced
|
||||||
.version = op->version,
|
unsynced_small_writes.push_back((obj_ver_id){
|
||||||
});
|
.oid = op->oid,
|
||||||
|
.version = op->version,
|
||||||
|
});
|
||||||
|
}
|
||||||
if (!PRIV(op)->pending_ops)
|
if (!PRIV(op)->pending_ops)
|
||||||
{
|
{
|
||||||
ack_write(op);
|
PRIV(op)->op_state = 4;
|
||||||
|
continue_write(op);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
PRIV(op)->op_state = 3;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int blockstore_impl_t::continue_write(blockstore_op_t *op)
|
||||||
|
{
|
||||||
|
io_uring_sqe *sqe = NULL;
|
||||||
|
journal_entry_big_write *je;
|
||||||
|
auto & dirty_entry = dirty_db[(obj_ver_id){
|
||||||
|
.oid = op->oid,
|
||||||
|
.version = op->version,
|
||||||
|
}];
|
||||||
|
if (PRIV(op)->op_state == 2)
|
||||||
|
goto resume_2;
|
||||||
|
else if (PRIV(op)->op_state == 4)
|
||||||
|
goto resume_4;
|
||||||
|
else
|
||||||
|
return 1;
|
||||||
|
resume_2:
|
||||||
|
// Only for the immediate_commit mode: prepare and submit big_write journal entry
|
||||||
|
sqe = get_sqe();
|
||||||
|
if (!sqe)
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
je = (journal_entry_big_write*)prefill_single_journal_entry(journal, JE_BIG_WRITE, sizeof(journal_entry_big_write));
|
||||||
|
dirty_entry.journal_sector = journal.sector_info[journal.cur_sector].offset;
|
||||||
|
journal.sector_info[journal.cur_sector].dirty = false;
|
||||||
|
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
|
||||||
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
printf("journal offset %lu is used by %lu:%lu v%lu\n", journal.sector_info[journal.cur_sector].offset, op->oid.inode, op->oid.stripe, op->version);
|
||||||
|
#endif
|
||||||
|
je->oid = op->oid;
|
||||||
|
je->version = op->version;
|
||||||
|
je->offset = op->offset;
|
||||||
|
je->len = op->len;
|
||||||
|
je->location = dirty_entry.location;
|
||||||
|
je->crc32 = je_crc32((journal_entry*)je);
|
||||||
|
journal.crc32_last = je->crc32;
|
||||||
|
prepare_journal_sector_write(journal, journal.cur_sector, sqe,
|
||||||
|
[this, op](ring_data_t *data) { handle_write_event(data, op); });
|
||||||
|
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
|
||||||
|
PRIV(op)->pending_ops = 1;
|
||||||
|
PRIV(op)->op_state = 3;
|
||||||
|
return 1;
|
||||||
|
resume_4:
|
||||||
|
// Switch object state
|
||||||
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
printf("Ack write %lu:%lu v%lu = %d\n", op->oid.inode, op->oid.stripe, op->version, dirty_entry.state);
|
||||||
|
#endif
|
||||||
|
bool imm = dirty_entry.state == ST_D_SUBMITTED
|
||||||
|
? (immediate_commit == IMMEDIATE_ALL)
|
||||||
|
: (immediate_commit != IMMEDIATE_NONE);
|
||||||
|
if (imm)
|
||||||
|
{
|
||||||
|
auto & unstab = unstable_writes[op->oid];
|
||||||
|
unstab = unstab < op->version ? op->version : unstab;
|
||||||
|
}
|
||||||
|
if (dirty_entry.state == ST_J_SUBMITTED)
|
||||||
|
{
|
||||||
|
dirty_entry.state = imm ? ST_J_SYNCED : ST_J_WRITTEN;
|
||||||
|
}
|
||||||
|
else if (dirty_entry.state == ST_D_SUBMITTED)
|
||||||
|
{
|
||||||
|
dirty_entry.state = imm ? ST_D_SYNCED : ST_D_WRITTEN;
|
||||||
|
}
|
||||||
|
else if (dirty_entry.state == ST_DEL_SUBMITTED)
|
||||||
|
{
|
||||||
|
dirty_entry.state = imm ? ST_DEL_SYNCED : ST_DEL_WRITTEN;
|
||||||
|
}
|
||||||
|
// Acknowledge write
|
||||||
|
op->retval = op->len;
|
||||||
|
FINISH_OP(op);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *op)
|
void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *op)
|
||||||
{
|
{
|
||||||
live = true;
|
live = true;
|
||||||
|
@ -243,7 +345,11 @@ void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *o
|
||||||
if (PRIV(op)->pending_ops == 0)
|
if (PRIV(op)->pending_ops == 0)
|
||||||
{
|
{
|
||||||
release_journal_sectors(op);
|
release_journal_sectors(op);
|
||||||
ack_write(op);
|
PRIV(op)->op_state++;
|
||||||
|
if (!continue_write(op))
|
||||||
|
{
|
||||||
|
submit_queue.push_front(op);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -275,33 +381,6 @@ void blockstore_impl_t::release_journal_sectors(blockstore_op_t *op)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void blockstore_impl_t::ack_write(blockstore_op_t *op)
|
|
||||||
{
|
|
||||||
// Switch object state
|
|
||||||
auto & dirty_entry = dirty_db[(obj_ver_id){
|
|
||||||
.oid = op->oid,
|
|
||||||
.version = op->version,
|
|
||||||
}];
|
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
|
||||||
printf("Ack write %lu:%lu v%lu = %d\n", op->oid.inode, op->oid.stripe, op->version, dirty_entry.state);
|
|
||||||
#endif
|
|
||||||
if (dirty_entry.state == ST_J_SUBMITTED)
|
|
||||||
{
|
|
||||||
dirty_entry.state = ST_J_WRITTEN;
|
|
||||||
}
|
|
||||||
else if (dirty_entry.state == ST_D_SUBMITTED)
|
|
||||||
{
|
|
||||||
dirty_entry.state = ST_D_WRITTEN;
|
|
||||||
}
|
|
||||||
else if (dirty_entry.state == ST_DEL_SUBMITTED)
|
|
||||||
{
|
|
||||||
dirty_entry.state = ST_DEL_WRITTEN;
|
|
||||||
}
|
|
||||||
// Acknowledge write without sync
|
|
||||||
op->retval = op->len;
|
|
||||||
FINISH_OP(op);
|
|
||||||
}
|
|
||||||
|
|
||||||
int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
|
int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
|
||||||
{
|
{
|
||||||
auto dirty_it = dirty_db.find((obj_ver_id){
|
auto dirty_it = dirty_db.find((obj_ver_id){
|
||||||
|
@ -313,8 +392,30 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
BS_SUBMIT_GET_ONLY_SQE(sqe);
|
io_uring_sqe *sqe = NULL;
|
||||||
|
if (immediate_commit != IMMEDIATE_NONE ||
|
||||||
|
(journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
|
||||||
|
journal.sector_info[journal.cur_sector].dirty)
|
||||||
|
{
|
||||||
|
// Write current journal sector only if it's dirty and full, or in the immediate_commit mode
|
||||||
|
BS_SUBMIT_GET_SQE_DECL(sqe);
|
||||||
|
}
|
||||||
|
auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
||||||
// Prepare journal sector write
|
// Prepare journal sector write
|
||||||
|
if (immediate_commit == IMMEDIATE_NONE)
|
||||||
|
{
|
||||||
|
if (sqe)
|
||||||
|
{
|
||||||
|
prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
|
||||||
|
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
|
||||||
|
PRIV(op)->pending_ops++;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Pre-fill journal entry
|
||||||
journal_entry_del *je = (journal_entry_del*)
|
journal_entry_del *je = (journal_entry_del*)
|
||||||
prefill_single_journal_entry(journal, JE_DELETE, sizeof(struct journal_entry_del));
|
prefill_single_journal_entry(journal, JE_DELETE, sizeof(struct journal_entry_del));
|
||||||
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
|
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
|
||||||
|
@ -326,15 +427,26 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
|
||||||
je->version = op->version;
|
je->version = op->version;
|
||||||
je->crc32 = je_crc32((journal_entry*)je);
|
je->crc32 = je_crc32((journal_entry*)je);
|
||||||
journal.crc32_last = je->crc32;
|
journal.crc32_last = je->crc32;
|
||||||
auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
|
||||||
prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
|
|
||||||
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
|
|
||||||
PRIV(op)->pending_ops = 1;
|
|
||||||
dirty_it->second.state = ST_DEL_SUBMITTED;
|
dirty_it->second.state = ST_DEL_SUBMITTED;
|
||||||
// Remember small write as unsynced
|
if (immediate_commit != IMMEDIATE_NONE)
|
||||||
unsynced_small_writes.push_back((obj_ver_id){
|
{
|
||||||
.oid = op->oid,
|
prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
|
||||||
.version = op->version,
|
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
|
||||||
});
|
PRIV(op)->pending_ops++;
|
||||||
|
// Remember small write as unsynced
|
||||||
|
unsynced_small_writes.push_back((obj_ver_id){
|
||||||
|
.oid = op->oid,
|
||||||
|
.version = op->version,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if (!PRIV(op)->pending_ops)
|
||||||
|
{
|
||||||
|
PRIV(op)->op_state = 4;
|
||||||
|
continue_write(op);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
PRIV(op)->op_state = 3;
|
||||||
|
}
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
2
test.cpp
2
test.cpp
|
@ -181,7 +181,7 @@ int main0(int argc, char *argv[])
|
||||||
},
|
},
|
||||||
.version = 1,
|
.version = 1,
|
||||||
}] = (dirty_entry){
|
}] = (dirty_entry){
|
||||||
.state = ST_D_META_SYNCED,
|
.state = ST_D_SYNCED,
|
||||||
.flags = 0,
|
.flags = 0,
|
||||||
.location = (uint64_t)i << 17,
|
.location = (uint64_t)i << 17,
|
||||||
.offset = 0,
|
.offset = 0,
|
||||||
|
|
Loading…
Reference in New Issue