forked from vitalif/vitastor
Allow zero-length overwrites
parent
4de5290619
commit
4677ace4cc
|
@ -207,7 +207,7 @@ bool journal_flusher_co::loop()
|
||||||
for (; it != v.end(); it++)
|
for (; it != v.end(); it++)
|
||||||
if (it->offset >= offset)
|
if (it->offset >= offset)
|
||||||
break;
|
break;
|
||||||
if (it == v.end() || it->offset > offset)
|
if (it == v.end() || it->offset > offset && it->len > 0)
|
||||||
{
|
{
|
||||||
submit_offset = dirty_it->second.location + offset - dirty_it->second.offset;
|
submit_offset = dirty_it->second.location + offset - dirty_it->second.offset;
|
||||||
submit_len = it == v.end() || it->offset >= end_offset ? end_offset-offset : it->offset-offset;
|
submit_len = it == v.end() || it->offset >= end_offset ? end_offset-offset : it->offset-offset;
|
||||||
|
|
|
@ -3,6 +3,11 @@
|
||||||
int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len,
|
int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len,
|
||||||
uint32_t item_state, uint64_t item_version)
|
uint32_t item_state, uint64_t item_version)
|
||||||
{
|
{
|
||||||
|
if (!len)
|
||||||
|
{
|
||||||
|
// Zero-length version - skip
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
if (IS_IN_FLIGHT(item_state))
|
if (IS_IN_FLIGHT(item_state))
|
||||||
{
|
{
|
||||||
// Pause until it's written somewhere
|
// Pause until it's written somewhere
|
||||||
|
|
|
@ -147,7 +147,13 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
}
|
}
|
||||||
// There is sufficient space. Get SQE(s)
|
// There is sufficient space. Get SQE(s)
|
||||||
BS_SUBMIT_GET_ONLY_SQE(sqe1);
|
BS_SUBMIT_GET_ONLY_SQE(sqe1);
|
||||||
BS_SUBMIT_GET_SQE(sqe2, data2);
|
struct io_uring_sqe *sqe2 = NULL;
|
||||||
|
struct ring_data_t *data2 = NULL;
|
||||||
|
if (op->len > 0)
|
||||||
|
{
|
||||||
|
BS_SUBMIT_GET_SQE_DECL(sqe2);
|
||||||
|
data2 = ((ring_data_t*)sqe2->user_data);
|
||||||
|
}
|
||||||
// Got SQEs. Prepare journal sector write
|
// Got SQEs. Prepare journal sector write
|
||||||
journal_entry_small_write *je = (journal_entry_small_write*)
|
journal_entry_small_write *je = (journal_entry_small_write*)
|
||||||
prefill_single_journal_entry(journal, JE_SMALL_WRITE, sizeof(struct journal_entry_small_write));
|
prefill_single_journal_entry(journal, JE_SMALL_WRITE, sizeof(struct journal_entry_small_write));
|
||||||
|
@ -169,23 +175,31 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
||||||
prepare_journal_sector_write(journal, sqe1, cb);
|
prepare_journal_sector_write(journal, sqe1, cb);
|
||||||
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
|
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
|
||||||
// Prepare journal data write
|
if (op->len > 0)
|
||||||
if (journal.inmemory)
|
|
||||||
{
|
{
|
||||||
// Copy data
|
// Prepare journal data write
|
||||||
memcpy(journal.buffer + journal.next_free, op->buf, op->len);
|
if (journal.inmemory)
|
||||||
|
{
|
||||||
|
// Copy data
|
||||||
|
memcpy(journal.buffer + journal.next_free, op->buf, op->len);
|
||||||
|
}
|
||||||
|
data2->iov = (struct iovec){ op->buf, op->len };
|
||||||
|
data2->callback = cb;
|
||||||
|
my_uring_prep_writev(
|
||||||
|
sqe2, journal.fd, &data2->iov, 1, journal.offset + journal.next_free
|
||||||
|
);
|
||||||
|
PRIV(op)->pending_ops = 2;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Zero-length overwrite. Allowed to bump object version in EC placement groups without actually writing data
|
||||||
|
PRIV(op)->pending_ops = 1;
|
||||||
}
|
}
|
||||||
data2->iov = (struct iovec){ op->buf, op->len };
|
|
||||||
data2->callback = cb;
|
|
||||||
my_uring_prep_writev(
|
|
||||||
sqe2, journal.fd, &data2->iov, 1, journal.offset + journal.next_free
|
|
||||||
);
|
|
||||||
dirty_it->second.location = journal.next_free;
|
dirty_it->second.location = journal.next_free;
|
||||||
dirty_it->second.state = ST_J_SUBMITTED;
|
dirty_it->second.state = ST_J_SUBMITTED;
|
||||||
journal.next_free += op->len;
|
journal.next_free += op->len;
|
||||||
if (journal.next_free >= journal.len)
|
if (journal.next_free >= journal.len)
|
||||||
journal.next_free = 512;
|
journal.next_free = 512;
|
||||||
PRIV(op)->pending_ops = 2;
|
|
||||||
// Remember small write as unsynced
|
// Remember small write as unsynced
|
||||||
unsynced_small_writes.push_back((obj_ver_id){
|
unsynced_small_writes.push_back((obj_ver_id){
|
||||||
.oid = op->oid,
|
.oid = op->oid,
|
||||||
|
|
19
osd_ops.h
19
osd_ops.h
|
@ -116,6 +116,23 @@ struct __attribute__((__packed__)) osd_reply_secondary_list_t
|
||||||
uint64_t stable_count;
|
uint64_t stable_count;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// read or write to the primary OSD
|
||||||
|
struct __attribute__((__packed__)) osd_op_rw_t
|
||||||
|
{
|
||||||
|
osd_op_header_t header;
|
||||||
|
// inode
|
||||||
|
uint64_t inode;
|
||||||
|
// offset
|
||||||
|
uint64_t offset;
|
||||||
|
// length
|
||||||
|
uint64_t len;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct __attribute__((__packed__)) osd_reply_rw_t
|
||||||
|
{
|
||||||
|
osd_reply_header_t header;
|
||||||
|
};
|
||||||
|
|
||||||
union osd_any_op_t
|
union osd_any_op_t
|
||||||
{
|
{
|
||||||
osd_op_header_t hdr;
|
osd_op_header_t hdr;
|
||||||
|
@ -124,6 +141,7 @@ union osd_any_op_t
|
||||||
osd_op_secondary_sync_t sec_sync;
|
osd_op_secondary_sync_t sec_sync;
|
||||||
osd_op_secondary_stabilize_t sec_stabilize;
|
osd_op_secondary_stabilize_t sec_stabilize;
|
||||||
osd_op_secondary_list_t sec_list;
|
osd_op_secondary_list_t sec_list;
|
||||||
|
osd_op_rw_t rw;
|
||||||
};
|
};
|
||||||
|
|
||||||
union osd_any_reply_t
|
union osd_any_reply_t
|
||||||
|
@ -134,4 +152,5 @@ union osd_any_reply_t
|
||||||
osd_reply_secondary_sync_t sec_sync;
|
osd_reply_secondary_sync_t sec_sync;
|
||||||
osd_reply_secondary_stabilize_t sec_stabilize;
|
osd_reply_secondary_stabilize_t sec_stabilize;
|
||||||
osd_reply_secondary_list_t sec_list;
|
osd_reply_secondary_list_t sec_list;
|
||||||
|
osd_reply_rw_t rw;
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in New Issue