Make basic primary-write work

blocking-uring-test
Vitaliy Filippov 2020-02-25 01:20:45 +03:00
parent 09588a349f
commit 74673c761f
13 changed files with 184 additions and 87 deletions

View File

@ -1,7 +1,7 @@
BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o blockstore_open.o blockstore_journal.o blockstore_read.o \
blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o timerfd_interval.o
CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always
all: $(BLOCKSTORE_OBJS) libfio_blockstore.so osd libfio_sec_osd.so test_blockstore stub_osd test_osd
all: $(BLOCKSTORE_OBJS) libfio_blockstore.so osd libfio_sec_osd.so test_blockstore stub_osd osd_test
clean:
rm -f *.o
@ -49,8 +49,8 @@ stub_osd: stub_osd.cpp osd_ops.h rw_blocking.o
g++ $(CXXFLAGS) -o stub_osd stub_osd.cpp rw_blocking.o -ltcmalloc_minimal
rw_blocking.o: rw_blocking.cpp rw_blocking.h
g++ $(CXXFLAGS) -c -o $@ $<
test_osd: test_osd.cpp osd_ops.h rw_blocking.o
g++ $(CXXFLAGS) -o test_osd test_osd.cpp rw_blocking.o -ltcmalloc_minimal
osd_test: osd_test.cpp osd_ops.h rw_blocking.o
g++ $(CXXFLAGS) -o osd_test osd_test.cpp rw_blocking.o -ltcmalloc_minimal
libfio_sec_osd.so: fio_sec_osd.cpp osd_ops.h rw_blocking.o
g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o libfio_sec_osd.so fio_sec_osd.cpp rw_blocking.o -luring

View File

@ -535,7 +535,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
else if (je->type == JE_BIG_WRITE)
{
#ifdef BLOCKSTORE_DEBUG
printf("je_big_write oid=%lu:%lu ver=%lu\n", je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version);
printf("je_big_write oid=%lu:%lu ver=%lu loc=%lu\n", je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location);
#endif
auto clean_it = bs->clean_db.find(je->big_write.oid);
if (clean_it == bs->clean_db.end() ||

View File

@ -310,15 +310,17 @@ void osd_t::exec_op(osd_op_t *cur_op)
}
if (cur_op->req.hdr.magic != SECONDARY_OSD_OP_MAGIC ||
cur_op->req.hdr.opcode < OSD_OP_MIN || cur_op->req.hdr.opcode > OSD_OP_MAX ||
(cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ || cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE ||
cur_op->req.hdr.opcode == OSD_OP_READ || cur_op->req.hdr.opcode == OSD_OP_WRITE) &&
(cur_op->req.sec_rw.len > OSD_RW_MAX || cur_op->req.sec_rw.len % OSD_RW_ALIGN || cur_op->req.sec_rw.offset % OSD_RW_ALIGN))
(cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ || cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE) &&
(cur_op->req.sec_rw.len > OSD_RW_MAX || cur_op->req.sec_rw.len % OSD_RW_ALIGN || cur_op->req.sec_rw.offset % OSD_RW_ALIGN) ||
(cur_op->req.hdr.opcode == OSD_OP_READ || cur_op->req.hdr.opcode == OSD_OP_WRITE) &&
(cur_op->req.rw.len > OSD_RW_MAX || cur_op->req.rw.len % OSD_RW_ALIGN || cur_op->req.rw.offset % OSD_RW_ALIGN))
{
// Bad command
cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
cur_op->reply.hdr.id = cur_op->req.hdr.id;
cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
cur_op->reply.hdr.retval = -EINVAL;
outbox_push(this->clients[cur_op->peer_fd], cur_op);
return;
}
inflight_ops++;

8
osd.h
View File

@ -157,7 +157,6 @@ struct osd_client_t
int write_state = 0;
};
struct osd_primary_read_t;
struct osd_rmw_stripe_t;
class osd_t
@ -181,6 +180,9 @@ class osd_t
unsigned pg_count = 0;
uint64_t next_subop_id = 1;
// Unstable writes
spp::sparse_hash_map<osd_num_t, spp::sparse_hash_map<object_id, uint64_t>> unstable_writes;
// client & peer I/O
bool stopping = false;
@ -207,8 +209,8 @@ class osd_t
int handle_epoll_events();
void read_requests();
void handle_read(ring_data_t *data, int peer_fd);
void handle_read_op(osd_client_t *cl);
void handle_read_reply(osd_client_t *cl);
void handle_op_hdr(osd_client_t *cl);
void handle_reply_hdr(osd_client_t *cl);
void send_replies();
void handle_send(ring_data_t *data, int peer_fd);
void outbox_push(osd_client_t & cl, osd_op_t *op);

View File

@ -332,7 +332,7 @@ void osd_t::start_pg_peering(int pg_idx)
throw std::runtime_error("local OP_LIST failed");
}
printf(
"Got object list from OSD %lu (local): %d objects (%lu of them stable)\n",
"Got object list from OSD %lu (local): %d object versions (%lu of them stable)\n",
role_osd, bs_op->retval, bs_op->version
);
ps->list_results[role_osd] = {
@ -377,7 +377,7 @@ void osd_t::start_pg_peering(int pg_idx)
return;
}
printf(
"Got object list from OSD %lu: %ld objects (%lu of them stable)\n",
"Got object list from OSD %lu: %ld object versions (%lu of them stable)\n",
role_osd, op->reply.hdr.retval, op->reply.sec_list.stable_count
);
ps->list_results[role_osd] = {

View File

@ -17,11 +17,13 @@ void pg_t::remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &
}
else if (st.n_roles < pg.pg_minsize)
{
printf("Object is unfound: inode=%lu stripe=%lu version=%lu/%lu\n", st.oid.inode, st.oid.stripe, st.target_ver, st.max_ver);
state = OBJ_INCOMPLETE;
pg.state = pg.state | PG_HAS_UNFOUND;
}
else
{
printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", st.oid.inode, st.oid.stripe, st.target_ver, st.max_ver);
state = OBJ_DEGRADED;
pg.state = pg.state | PG_HAS_DEGRADED;
}
@ -133,6 +135,7 @@ void pg_t::remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &
pg.clean_count++;
}
// FIXME: Write at least some tests for this function
void pg_t::calc_object_states()
{
auto & pg = *this;
@ -188,7 +191,7 @@ void pg_t::calc_object_states()
{
if (st.n_stable > 0 || st.n_roles >= pg.pg_minsize)
{
// Version is either recoverable or stable, choose it as target and skip previous versions
// Last processed version is either recoverable or stable, choose it as target and skip previous versions
st.ver_end = i;
i++;
while (i < all.size() && st.oid.inode == all[i].oid.inode &&
@ -201,13 +204,13 @@ void pg_t::calc_object_states()
i++;
}
st.obj_end = i;
remember_object(st, all);
i--;
continue;
}
else
{
// Remember that there are newer unrecoverable versions
// Last processed version is unstable and unrecoverable
// We'll know that because target_ver < max_ver
st.ver_start = i;
st.target_ver = all[i].version;
st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0;

View File

@ -127,6 +127,8 @@ struct pg_t
spp::sparse_hash_map<object_id, uint64_t> ver_override;
pg_peering_state_t *peering_state = NULL;
std::multimap<object_id, osd_op_t*> write_queue;
void calc_object_states();
void remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &all);
};
@ -139,11 +141,13 @@ inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
// Ordering of object version records:
// ORDER BY inode ASC, stripe & ~STRIPE_MASK ASC, version DESC, osd_num ASC
// The stripe bits selected by STRIPE_MASK take no part in the comparison, so
// records that differ only in those bits (and share version and osd_num)
// compare as equivalent.
// The comparison is strict: an element is never "less than" itself, which is
// what sorting algorithms and ordered containers require from a comparator.
inline bool operator < (const obj_ver_role & a, const obj_ver_role & b)
{
    if (a.oid.inode != b.oid.inode)
    {
        return a.oid.inode < b.oid.inode;
    }
    const auto a_stripe = a.oid.stripe & ~STRIPE_MASK;
    const auto b_stripe = b.oid.stripe & ~STRIPE_MASK;
    if (a_stripe != b_stripe)
    {
        return a_stripe < b_stripe;
    }
    if (a.version != b.version)
    {
        // higher versions sort first
        return a.version > b.version;
    }
    return a.osd_num < b.osd_num;
}
inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b)

View File

@ -174,8 +174,7 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
{
zero_read = role;
}
if (osd_set[role] != 0 &&
(w ? stripes[role].write_end : stripes[role].read_end) != 0)
if (osd_set[role] != 0 && (w || stripes[role].read_end != 0))
{
n_subops++;
}
@ -195,11 +194,12 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
int subop = 0;
for (int role = 0; role < pg_size; role++)
{
if ((submit_type == SUBMIT_WRITE ? stripes[role].write_end : stripes[role].read_end) == 0 && zero_read != role)
// We always submit zero-length writes to all replicas, even if the stripe is not modified
if (!(w || stripes[role].read_end != 0 || zero_read == role))
{
continue;
}
auto role_osd_num = osd_set[role];
osd_num_t role_osd_num = osd_set[role];
if (role_osd_num != 0)
{
if (role_osd_num == this->osd_num)
@ -240,6 +240,10 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
.len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
};
subops[subop].buf = w ? stripes[role].write_buf : stripes[role].read_buf;
if (w && stripes[role].write_end > 0)
{
subops[subop].send_list.push_back(stripes[role].write_buf, stripes[role].write_end - stripes[role].write_start);
}
subops[subop].callback = [cur_op, this](osd_op_t *subop)
{
// so it doesn't get freed
@ -318,7 +322,7 @@ resume_1:
if (vo_it != pg.ver_override.end())
{
op_data->st = 1;
//pg.write_queue.push_back(cur_op);
pg.write_queue.emplace(op_data->oid, cur_op);
return;
}
}
@ -326,25 +330,48 @@ resume_1:
cur_op->rmw_buf = calc_rmw_reads(cur_op->buf, op_data->stripes, pg.cur_set.data(), pg.pg_size, pg.pg_minsize, pg.pg_cursize);
// Read required blocks
submit_primary_subops(SUBMIT_RMW_READ, pg.pg_size, pg.cur_set.data(), cur_op);
op_data->st = 2;
resume_2:
op_data->st = 2;
return;
resume_3:
// Save version override
// Save version override for parallel reads
pg.ver_override[op_data->oid] = op_data->fact_ver;
// Calculate parity
calc_rmw_parity(op_data->stripes, op_data->pg_size);
calc_rmw_parity(op_data->stripes, pg.pg_size);
// Send writes
submit_primary_subops(SUBMIT_WRITE, pg.pg_size, pg.cur_set.data(), cur_op);
op_data->st = 4;
resume_4:
op_data->st = 4;
return;
resume_5:
// Remember version as unstable
// Remove version override if degraded
osd_num_t *osd_set = pg.cur_set.data();
for (int role = 0; role < pg.pg_size; role++)
{
if (osd_set[role] != 0)
{
this->unstable_writes[osd_set[role]][(object_id){
.inode = op_data->oid.inode,
.stripe = op_data->oid.stripe | role,
}] = op_data->fact_ver;
}
}
// Remember PG as dirty to drop the connection when PG goes offline
// (this is required because of the "lazy sync")
this->clients[cur_op->peer_fd].dirty_pgs.insert(op_data->pg_num);
// Remove version override
pg.ver_override.erase(op_data->oid);
finish_primary_op(cur_op, cur_op->req.rw.len);
// Continue other write operations to the same object
{
auto next_it = pg.write_queue.find(op_data->oid);
if (next_it != pg.write_queue.end())
{
osd_op_t *next_op = next_it->second;
pg.write_queue.erase(next_it);
continue_primary_write(next_op);
}
}
}
void osd_t::exec_primary_sync(osd_op_t *cur_op)

View File

@ -68,11 +68,11 @@ void osd_t::handle_read(ring_data_t *data, int peer_fd)
{
if (cl.read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
{
handle_read_reply(&cl);
handle_reply_hdr(&cl);
}
else
{
handle_read_op(&cl);
handle_op_hdr(&cl);
}
}
else if (cl.read_state == CL_READ_DATA)
@ -97,32 +97,39 @@ void osd_t::handle_read(ring_data_t *data, int peer_fd)
}
}
void osd_t::handle_read_op(osd_client_t *cl)
void osd_t::handle_op_hdr(osd_client_t *cl)
{
osd_op_t *cur_op = cl->read_op;
if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ ||
cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE ||
if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ)
{
if (cur_op->req.sec_rw.len > 0)
cur_op->buf = memalign(512, cur_op->req.sec_rw.len);
cl->read_remaining = 0;
}
else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE ||
cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE ||
cur_op->req.hdr.opcode == OSD_OP_SECONDARY_ROLLBACK)
{
// Allocate a buffer
cur_op->buf = memalign(512, cur_op->req.sec_rw.len);
if (cur_op->req.sec_rw.len > 0)
cur_op->buf = memalign(512, cur_op->req.sec_rw.len);
cl->read_remaining = cur_op->req.sec_rw.len;
}
else if (cur_op->req.hdr.opcode == OSD_OP_READ ||
cur_op->req.hdr.opcode == OSD_OP_WRITE)
else if (cur_op->req.hdr.opcode == OSD_OP_READ)
{
cur_op->buf = memalign(512, cur_op->req.rw.len);
if (cur_op->req.rw.len > 0)
cur_op->buf = memalign(512, cur_op->req.rw.len);
cl->read_remaining = 0;
}
if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE ||
cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE ||
cur_op->req.hdr.opcode == OSD_OP_SECONDARY_ROLLBACK ||
cur_op->req.hdr.opcode == OSD_OP_WRITE)
else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
{
if (cur_op->req.rw.len > 0)
cur_op->buf = memalign(512, cur_op->req.rw.len);
cl->read_remaining = cur_op->req.rw.len;
}
if (cl->read_remaining > 0)
{
// Read data
cl->read_buf = cur_op->buf;
cl->read_remaining = (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE
? cur_op->req.sec_rw.len
: cur_op->req.rw.len);
cl->read_state = CL_READ_DATA;
}
else
@ -134,7 +141,7 @@ void osd_t::handle_read_op(osd_client_t *cl)
}
}
void osd_t::handle_read_reply(osd_client_t *cl)
void osd_t::handle_reply_hdr(osd_client_t *cl)
{
osd_op_t *cur_op = cl->read_op;
auto req_it = cl->sent_ops.find(cur_op->req.hdr.id);

View File

@ -170,6 +170,8 @@ void* calc_rmw_reads(void *write_buf, osd_rmw_stripe_t *stripes, uint64_t *osd_s
{
start = !end || stripes[role].req_start < start ? stripes[role].req_start : start;
end = std::max(stripes[role].req_end, end);
stripes[role].write_start = stripes[role].req_start;
stripes[role].write_end = stripes[role].req_end;
}
}
for (int role = 0; role < pg_minsize; role++)
@ -251,7 +253,7 @@ static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, ui
stripe.read_start < wr_end)
{
os = std::max(stripe.read_start, wr_start);
oe = std::min(stripe.req_end, wr_end);
oe = std::min(stripe.read_end, wr_end);
}
if (ne && (!oe || ns <= os))
{

View File

@ -1,13 +1,6 @@
#include <string.h>
#include "osd_rmw.cpp"
#define PATTERN0 0x8c4641acc762840e
#define PATTERN1 0x70a549add9a2280a
#define PATTERN2 0xffe3bad5f578a78e
#define PATTERN3 0x426bd7854eb08509
#define set_pattern(buf, len, pattern) for (uint64_t i = 0; i < len; i += 8) { *(uint64_t*)((void*)buf + i) = pattern; }
#define check_pattern(buf, len, pattern) for (uint64_t i = 0; i < len; i += 8) { assert(*(uint64_t*)(buf + i) == pattern); }
#include "test_pattern.h"
int main(int narg, char *args[])
{

View File

@ -15,6 +15,7 @@
#include "osd_ops.h"
#include "rw_blocking.h"
#include "test_pattern.h"
int connect_osd(const char *osd_address, int osd_port);
@ -22,11 +23,9 @@ uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t ve
void* test_primary_read(int connect_fd, uint64_t inode, uint64_t offset, uint64_t len);
bool check_pattern(void *buf, uint64_t offset, uint64_t len, uint64_t pattern);
void test_primary_write(int connect_fd, uint64_t inode, uint64_t offset, uint64_t len, uint64_t pattern);
#define PATTERN0 0x8c4641acc762840e
#define PATTERN1 0x70a549add9a2280a
#define PATTERN2 (PATTERN0 ^ PATTERN1)
void test_sync_stab_all(int connect_fd);
int main0(int narg, char *args[])
{
@ -39,7 +38,32 @@ int main0(int narg, char *args[])
test_write(connect_fd, 2, 1, 1, PATTERN1);
close(connect_fd);
connect_fd = connect_osd("127.0.0.1", 11205);
test_write(connect_fd, 2, 2, 1, PATTERN2);
test_write(connect_fd, 2, 2, 1, PATTERN0^PATTERN1);
close(connect_fd);
return 0;
}
// Read-back check against the OSD listening on 127.0.0.1:11203.
// Reads inode 2 twice (first 128K, then first 256K) and checks that the first
// 128K hold PATTERN0 and the second 128K hold PATTERN1. A read that returns
// NULL is silently skipped. Always returns 0; a pattern mismatch trips the
// assert inside check_pattern.
int main1(int narg, char *args[])
{
    int connect_fd;
    void *data;
    // Cluster read
    connect_fd = connect_osd("127.0.0.1", 11203);
    data = test_primary_read(connect_fd, 2, 0, 128*1024);
    if (data)
    {
        // Address the buffer through a byte pointer: arithmetic on void* is
        // only a compiler extension, not standard C++
        uint8_t *bytes = (uint8_t*)data;
        check_pattern(bytes, 128*1024, PATTERN0);
        printf("inode=2 0-128K OK\n");
        free(data);
    }
    data = test_primary_read(connect_fd, 2, 0, 256*1024);
    if (data)
    {
        uint8_t *bytes = (uint8_t*)data;
        check_pattern(bytes, 128*1024, PATTERN0);
        check_pattern(bytes + 128*1024, 128*1024, PATTERN1);
        printf("inode=2 0-256K OK\n");
        free(data);
    }
    close(connect_fd);
    return 0;
}
@ -47,21 +71,24 @@ int main0(int narg, char *args[])
// Write test entry point.
// Writes PATTERN0 to bytes 0..128K and PATTERN1 to bytes 128K..256K of inode 2
// through the OSD at 127.0.0.1:11203, then sends the test sync/stabilize
// command to that OSD and to the ones on ports 11204 and 11205 when a
// connection to them is obtained. Always returns 0.
int main(int narg, char *args[])
{
    int connect_fd;
    // Cluster write (sync not implemented yet)
    // NOTE(review): unlike the two connections below, this descriptor is used
    // without a >= 0 check - confirm how connect_osd reports failure
    connect_fd = connect_osd("127.0.0.1", 11203);
    test_primary_write(connect_fd, 2, 0, 128*1024, PATTERN0);
    test_primary_write(connect_fd, 2, 128*1024, 128*1024, PATTERN1);
    test_sync_stab_all(connect_fd);
    close(connect_fd);
    connect_fd = connect_osd("127.0.0.1", 11204);
    if (connect_fd >= 0)
    {
        test_sync_stab_all(connect_fd);
        close(connect_fd);
    }
    connect_fd = connect_osd("127.0.0.1", 11205);
    if (connect_fd >= 0)
    {
        test_sync_stab_all(connect_fd);
        close(connect_fd);
    }
    return 0;
}
@ -182,15 +209,33 @@ void* test_primary_read(int connect_fd, uint64_t inode, uint64_t offset, uint64_
return data;
}
bool check_pattern(void *buf, uint64_t offset, uint64_t len, uint64_t pattern)
void test_primary_write(int connect_fd, uint64_t inode, uint64_t offset, uint64_t len, uint64_t pattern)
{
for (int i = 0; i < len/sizeof(uint64_t); i++)
{
if (((uint64_t*)(buf+offset))[i] != pattern)
{
printf("(result + %lu bytes = %lx) != %lx\n", i*sizeof(uint64_t)+offset, ((uint64_t*)buf+offset)[i], pattern);
return false;
}
}
return true;
osd_any_op_t op;
osd_any_reply_t reply;
op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
op.hdr.id = 1;
op.hdr.opcode = OSD_OP_WRITE;
op.rw.inode = inode;
op.rw.offset = offset;
op.rw.len = len;
void *data = memalign(512, len);
set_pattern(data, len, pattern);
write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
write_blocking(connect_fd, data, len);
free(data);
int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
assert(check_reply(r, op, reply, len));
}
// Send a header-only OSD_OP_TEST_SYNC_STAB_ALL request over `connect_fd`,
// read back one reply packet and assert that check_reply() accepts it with an
// expected result of 0.
void test_sync_stab_all(int connect_fd)
{
    osd_any_op_t op;
    osd_any_reply_t reply;

    // Request: only the header fields are set, no payload follows the packet
    op.hdr.opcode = OSD_OP_TEST_SYNC_STAB_ALL;
    op.hdr.id = 1;
    op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
    write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);

    // Reply: exactly one packet is read and validated
    int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
    assert(check_reply(r, op, reply, 0));
}

12
test_pattern.h Normal file
View File

@ -0,0 +1,12 @@
#pragma once
#include <assert.h>
#include <stdint.h>
// Fixed 64-bit fill values used by the tests
#define PATTERN0 0x8c4641acc762840e
#define PATTERN1 0x70a549add9a2280a
#define PATTERN2 0xffe3bad5f578a78e
#define PATTERN3 0x426bd7854eb08509
// set_pattern(buf, len, pattern): store `pattern` into every 64-bit word of
// the first `len` bytes of `buf`.
// check_pattern(buf, len, pattern): assert that every 64-bit word of the first
// `len` bytes of `buf` equals `pattern`.
// For both macros `len` is a byte count and is expected to be a multiple of 8.
// `buf` may be a pointer of any type: it is always addressed byte-wise through
// uint8_t*, so no void* arithmetic is needed and both macros use the same
// stride. Arguments are parenthesized, and the loop counter has a name that is
// unlikely to collide with identifiers used in the arguments.
#define set_pattern(buf, len, pattern) for (uint64_t pattern_pos_ = 0; pattern_pos_ < (len); pattern_pos_ += 8) { *(uint64_t*)((uint8_t*)(buf) + pattern_pos_) = (pattern); }
#define check_pattern(buf, len, pattern) for (uint64_t pattern_pos_ = 0; pattern_pos_ < (len); pattern_pos_ += 8) { assert(*(uint64_t*)((uint8_t*)(buf) + pattern_pos_) == (pattern)); }