Implement "describe object(s)" operation

Required to implement fixing inconsistent objects in vitastor-cli
test-double-alloc
Vitaliy Filippov 2023-04-15 16:06:17 +03:00
parent 6648f6bb6e
commit 0439981a66
9 changed files with 193 additions and 11 deletions

View File

@ -111,7 +111,7 @@ target_compile_options(vitastor_common PUBLIC -fPIC)
add_executable(vitastor-osd add_executable(vitastor-osd
osd_main.cpp osd.cpp osd_secondary.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp osd_main.cpp osd.cpp osd_secondary.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp
osd_primary.cpp osd_primary_chain.cpp osd_primary_sync.cpp osd_primary_write.cpp osd_primary_subops.cpp osd_primary.cpp osd_primary_chain.cpp osd_primary_sync.cpp osd_primary_write.cpp osd_primary_subops.cpp
osd_cluster.cpp osd_rmw.cpp osd_scrub.cpp osd_cluster.cpp osd_rmw.cpp osd_scrub.cpp osd_primary_describe.cpp
) )
target_link_libraries(vitastor-osd target_link_libraries(vitastor-osd
vitastor_common vitastor_common

View File

@ -251,10 +251,6 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
} }
cl->read_remaining = cur_op->req.sec_read_bmp.len; cl->read_remaining = cur_op->req.sec_read_bmp.len;
} }
else if (cur_op->req.hdr.opcode == OSD_OP_READ)
{
cl->read_remaining = 0;
}
else if (cur_op->req.hdr.opcode == OSD_OP_WRITE) else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
{ {
if (cur_op->req.rw.len > 0) if (cur_op->req.rw.len > 0)
@ -274,6 +270,12 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
} }
cl->read_remaining = cur_op->req.show_conf.json_len; cl->read_remaining = cur_op->req.show_conf.json_len;
} }
/*else if (cur_op->req.hdr.opcode == OSD_OP_READ ||
cur_op->req.hdr.opcode == OSD_OP_SCRUB ||
cur_op->req.hdr.opcode == OSD_OP_DESCRIBE)
{
cl->read_remaining = 0;
}*/
if (cl->read_remaining > 0) if (cl->read_remaining > 0)
{ {
// Read data // Read data
@ -367,6 +369,16 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
op->buf = malloc_or_die(op->reply.hdr.retval); op->buf = malloc_or_die(op->reply.hdr.retval);
cl->recv_list.push_back(op->buf, op->reply.hdr.retval); cl->recv_list.push_back(op->buf, op->reply.hdr.retval);
} }
else if (op->reply.hdr.opcode == OSD_OP_DESCRIBE && op->reply.hdr.retval > 0)
{
delete cl->read_op;
cl->read_op = op;
cl->read_state = CL_READ_REPLY_DATA;
cl->read_remaining = op->reply.describe.result_bytes;
free(op->buf);
op->buf = malloc_or_die(op->reply.describe.result_bytes);
cl->recv_list.push_back(op->buf, op->reply.describe.result_bytes);
}
else else
{ {
reuse: reuse:

View File

@ -73,7 +73,8 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
? (cur_op->req.hdr.opcode == OSD_OP_READ || ? (cur_op->req.hdr.opcode == OSD_OP_READ ||
cur_op->req.hdr.opcode == OSD_OP_SEC_READ || cur_op->req.hdr.opcode == OSD_OP_SEC_READ ||
cur_op->req.hdr.opcode == OSD_OP_SEC_LIST || cur_op->req.hdr.opcode == OSD_OP_SEC_LIST ||
cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG) cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG ||
cur_op->req.hdr.opcode == OSD_OP_DESCRIBE)
: (cur_op->req.hdr.opcode == OSD_OP_WRITE || : (cur_op->req.hdr.opcode == OSD_OP_WRITE ||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE || cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE || cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE ||

View File

@ -363,6 +363,7 @@ void osd_t::exec_op(osd_op_t *cur_op)
cur_op->req.hdr.opcode != OSD_OP_READ && cur_op->req.hdr.opcode != OSD_OP_READ &&
cur_op->req.hdr.opcode != OSD_OP_SEC_READ_BMP && cur_op->req.hdr.opcode != OSD_OP_SEC_READ_BMP &&
cur_op->req.hdr.opcode != OSD_OP_SCRUB && cur_op->req.hdr.opcode != OSD_OP_SCRUB &&
cur_op->req.hdr.opcode != OSD_OP_DESCRIBE &&
cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG) cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG)
{ {
// Readonly mode // Readonly mode
@ -397,6 +398,10 @@ void osd_t::exec_op(osd_op_t *cur_op)
{ {
continue_primary_scrub(cur_op); continue_primary_scrub(cur_op);
} }
else if (cur_op->req.hdr.opcode == OSD_OP_DESCRIBE)
{
continue_primary_describe(cur_op);
}
else else
{ {
exec_secondary(cur_op); exec_secondary(cur_op);

View File

@ -260,6 +260,7 @@ class osd_t
bool prepare_primary_rw(osd_op_t *cur_op); bool prepare_primary_rw(osd_op_t *cur_op);
void continue_primary_read(osd_op_t *cur_op); void continue_primary_read(osd_op_t *cur_op);
void continue_primary_scrub(osd_op_t *cur_op); void continue_primary_scrub(osd_op_t *cur_op);
void continue_primary_describe(osd_op_t *cur_op);
void continue_primary_write(osd_op_t *cur_op); void continue_primary_write(osd_op_t *cur_op);
void cancel_primary_write(osd_op_t *cur_op); void cancel_primary_write(osd_op_t *cur_op);
void continue_primary_sync(osd_op_t *cur_op); void continue_primary_sync(osd_op_t *cur_op);

View File

@ -22,4 +22,5 @@ const char* osd_op_names[] = {
"ping", "ping",
"sec_read_bmp", "sec_read_bmp",
"scrub", "scrub",
"describe",
}; };

View File

@ -30,7 +30,8 @@
#define OSD_OP_PING 15 #define OSD_OP_PING 15
#define OSD_OP_SEC_READ_BMP 16 #define OSD_OP_SEC_READ_BMP 16
#define OSD_OP_SCRUB 17 #define OSD_OP_SCRUB 17
#define OSD_OP_MAX 17 #define OSD_OP_DESCRIBE 18
#define OSD_OP_MAX 18
#define OSD_RW_MAX 64*1024*1024 #define OSD_RW_MAX 64*1024*1024
#define OSD_PROTOCOL_VERSION 1 #define OSD_PROTOCOL_VERSION 1
@ -44,6 +45,11 @@
#define MEM_ALIGNMENT 4096 #define MEM_ALIGNMENT 4096
#endif #endif
// Constants for osd_reply_describe_item_t.loc_bad
#define LOC_OUTDATED 1
#define LOC_CORRUPTED 2
#define LOC_INCONSISTENT 4
// common request and reply headers // common request and reply headers
struct __attribute__((__packed__)) osd_op_header_t struct __attribute__((__packed__)) osd_op_header_t
{ {
@ -229,6 +235,36 @@ struct __attribute__((__packed__)) osd_reply_sync_t
osd_reply_header_t header; osd_reply_header_t header;
}; };
// describe unclean object states in detail
struct __attribute__((__packed__)) osd_op_describe_t
{
osd_op_header_t header;
// state mask to filter objects by state (0 or 0xfff..ff = all objects)
uint64_t object_state;
// minimum inode and offset
uint64_t min_inode, min_offset;
// maximum inode and offset
uint64_t max_inode, max_offset;
// limit
uint64_t limit;
};
struct __attribute__((__packed__)) osd_reply_describe_t
{
osd_reply_header_t header;
// size of the resulting <osd_reply_describe_item_t> array in bytes
uint64_t result_bytes;
};
struct __attribute__((__packed__)) osd_reply_describe_item_t
{
uint64_t inode;
uint64_t stripe;
uint32_t role; // part number: 0 for replicas, 0..pg_size-1 for EC
uint32_t loc_bad; // LOC_OUTDATED / LOC_CORRUPTED / LOC_INCONSISTENT
osd_num_t osd_num; // OSD number
};
// FIXME it would be interesting to try to unify blockstore_op and osd_op formats // FIXME it would be interesting to try to unify blockstore_op and osd_op formats
union osd_any_op_t union osd_any_op_t
{ {
@ -242,6 +278,7 @@ union osd_any_op_t
osd_op_show_config_t show_conf; osd_op_show_config_t show_conf;
osd_op_rw_t rw; osd_op_rw_t rw;
osd_op_sync_t sync; osd_op_sync_t sync;
osd_op_describe_t describe;
uint8_t buf[OSD_PACKET_SIZE]; uint8_t buf[OSD_PACKET_SIZE];
}; };
@ -257,6 +294,7 @@ union osd_any_reply_t
osd_reply_show_config_t show_conf; osd_reply_show_config_t show_conf;
osd_reply_rw_t rw; osd_reply_rw_t rw;
osd_reply_sync_t sync; osd_reply_sync_t sync;
osd_reply_describe_t describe;
uint8_t buf[OSD_PACKET_SIZE]; uint8_t buf[OSD_PACKET_SIZE];
}; };

View File

@ -13,10 +13,6 @@
#define PG_EPOCH_BITS 48 #define PG_EPOCH_BITS 48
#define LOC_OUTDATED 1
#define LOC_CORRUPTED 2
#define LOC_INCONSISTENT 4
struct pg_obj_loc_t struct pg_obj_loc_t
{ {
uint64_t role; uint64_t role;

View File

@ -0,0 +1,128 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include <queue>
#include "osd_primary.h"
struct unclean_list_t
{
btree::btree_map<object_id, pg_osd_set_state_t*>::iterator it, end;
uint64_t state_mask, state;
};
struct desc_item_list_t
{
int alloc, size;
osd_reply_describe_item_t *items;
};
static void include_list(std::vector<unclean_list_t> & lists,
btree::btree_map<object_id, pg_osd_set_state_t*> & from,
osd_op_describe_t & desc, uint64_t state_mask, uint64_t state)
{
auto it = desc.min_inode || desc.min_offset ? from.lower_bound((object_id){
.inode = desc.min_inode,
.stripe = desc.min_offset,
}) : from.begin();
auto end_it = desc.max_inode || desc.max_offset ? from.upper_bound((object_id){
.inode = desc.max_inode,
.stripe = desc.max_offset,
}) : from.end();
lists.push_back((unclean_list_t){
.it = it,
.end = end_it,
.state_mask = state_mask,
.state = state,
});
}
struct obj_list_t
{
object_id oid;
int list_id;
};
static inline bool operator < (const obj_list_t & a, const obj_list_t & b)
{
return b.oid < a.oid;
}
static void scan_lists(std::vector<unclean_list_t> & lists, uint64_t limit, desc_item_list_t & res)
{
if (limit > 1048576)
{
limit = 1048576;
}
std::priority_queue<obj_list_t> min;
for (int i = 0; i < lists.size(); i++)
{
if (lists[i].it != lists[i].end)
{
min.push((obj_list_t){ .oid = lists[i].it->first, .list_id = i });
}
}
while (min.size() && (!limit || res.size < limit))
{
auto i = min.top().list_id;
min.pop();
for (auto & chunk: lists[i].it->second->osd_set)
{
if (res.size >= res.alloc)
{
res.alloc = !res.alloc ? 128 : (res.alloc*2);
res.items = (osd_reply_describe_item_t*)realloc_or_die(res.items, res.alloc * sizeof(osd_reply_describe_item_t));
}
res.items[res.size++] = (osd_reply_describe_item_t){
.inode = lists[i].it->first.inode,
.stripe = lists[i].it->first.stripe,
.role = (uint32_t)chunk.role,
.loc_bad = chunk.loc_bad,
.osd_num = chunk.osd_num,
};
}
lists[i].it++;
if (lists[i].it != lists[i].end)
{
min.push((obj_list_t){ .oid = lists[i].it->first, .list_id = i });
}
}
}
// Describe unclean objects
void osd_t::continue_primary_describe(osd_op_t *cur_op)
{
auto & desc = cur_op->req.describe;
if (!desc.object_state)
desc.object_state = ~desc.object_state;
std::vector<unclean_list_t> lists;
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
{
auto & pg = pg_it->second;
if (desc.object_state & OBJ_INCONSISTENT)
include_list(lists, pg.inconsistent_objects, desc, 0, 0);
if (desc.object_state & OBJ_CORRUPTED)
{
if (!(desc.object_state & OBJ_INCOMPLETE))
include_list(lists, pg.incomplete_objects, desc, OBJ_CORRUPTED, OBJ_CORRUPTED);
if (!(desc.object_state & OBJ_DEGRADED))
include_list(lists, pg.degraded_objects, desc, OBJ_CORRUPTED, OBJ_CORRUPTED);
if (!(desc.object_state & OBJ_MISPLACED))
include_list(lists, pg.misplaced_objects, desc, OBJ_CORRUPTED, OBJ_CORRUPTED);
}
uint64_t skip_corrupted = !(desc.object_state & OBJ_CORRUPTED) ? OBJ_CORRUPTED : 0;
if (desc.object_state & OBJ_INCOMPLETE)
include_list(lists, pg.incomplete_objects, desc, skip_corrupted, 0);
if (desc.object_state & OBJ_DEGRADED)
include_list(lists, pg.degraded_objects, desc, skip_corrupted, 0);
if (desc.object_state & OBJ_MISPLACED)
include_list(lists, pg.misplaced_objects, desc, skip_corrupted, 0);
}
desc_item_list_t res = {};
scan_lists(lists, desc.limit, res);
assert(!cur_op->buf);
cur_op->buf = res.items;
cur_op->reply.describe.result_bytes = res.size * sizeof(osd_reply_describe_item_t);
if (res.items)
cur_op->iov.push_back(res.items, res.size * sizeof(osd_reply_describe_item_t));
finish_op(cur_op, res.size);
}