forked from vitalif/vitastor
Implement "describe object(s)" operation
Required to implement fixing inconsistent objects in vitastor-clitest-double-alloc
parent
6648f6bb6e
commit
0439981a66
|
@ -111,7 +111,7 @@ target_compile_options(vitastor_common PUBLIC -fPIC)
|
|||
add_executable(vitastor-osd
|
||||
osd_main.cpp osd.cpp osd_secondary.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp
|
||||
osd_primary.cpp osd_primary_chain.cpp osd_primary_sync.cpp osd_primary_write.cpp osd_primary_subops.cpp
|
||||
osd_cluster.cpp osd_rmw.cpp osd_scrub.cpp
|
||||
osd_cluster.cpp osd_rmw.cpp osd_scrub.cpp osd_primary_describe.cpp
|
||||
)
|
||||
target_link_libraries(vitastor-osd
|
||||
vitastor_common
|
||||
|
|
|
@ -251,10 +251,6 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
|
|||
}
|
||||
cl->read_remaining = cur_op->req.sec_read_bmp.len;
|
||||
}
|
||||
else if (cur_op->req.hdr.opcode == OSD_OP_READ)
|
||||
{
|
||||
cl->read_remaining = 0;
|
||||
}
|
||||
else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
|
||||
{
|
||||
if (cur_op->req.rw.len > 0)
|
||||
|
@ -274,6 +270,12 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
|
|||
}
|
||||
cl->read_remaining = cur_op->req.show_conf.json_len;
|
||||
}
|
||||
/*else if (cur_op->req.hdr.opcode == OSD_OP_READ ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SCRUB ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_DESCRIBE)
|
||||
{
|
||||
cl->read_remaining = 0;
|
||||
}*/
|
||||
if (cl->read_remaining > 0)
|
||||
{
|
||||
// Read data
|
||||
|
@ -367,6 +369,16 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
|
|||
op->buf = malloc_or_die(op->reply.hdr.retval);
|
||||
cl->recv_list.push_back(op->buf, op->reply.hdr.retval);
|
||||
}
|
||||
else if (op->reply.hdr.opcode == OSD_OP_DESCRIBE && op->reply.hdr.retval > 0)
|
||||
{
|
||||
delete cl->read_op;
|
||||
cl->read_op = op;
|
||||
cl->read_state = CL_READ_REPLY_DATA;
|
||||
cl->read_remaining = op->reply.describe.result_bytes;
|
||||
free(op->buf);
|
||||
op->buf = malloc_or_die(op->reply.describe.result_bytes);
|
||||
cl->recv_list.push_back(op->buf, op->reply.describe.result_bytes);
|
||||
}
|
||||
else
|
||||
{
|
||||
reuse:
|
||||
|
|
|
@ -73,7 +73,8 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
|||
? (cur_op->req.hdr.opcode == OSD_OP_READ ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_READ ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_LIST ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG)
|
||||
cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_DESCRIBE)
|
||||
: (cur_op->req.hdr.opcode == OSD_OP_WRITE ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE ||
|
||||
|
|
|
@ -363,6 +363,7 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
|||
cur_op->req.hdr.opcode != OSD_OP_READ &&
|
||||
cur_op->req.hdr.opcode != OSD_OP_SEC_READ_BMP &&
|
||||
cur_op->req.hdr.opcode != OSD_OP_SCRUB &&
|
||||
cur_op->req.hdr.opcode != OSD_OP_DESCRIBE &&
|
||||
cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG)
|
||||
{
|
||||
// Readonly mode
|
||||
|
@ -397,6 +398,10 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
|||
{
|
||||
continue_primary_scrub(cur_op);
|
||||
}
|
||||
else if (cur_op->req.hdr.opcode == OSD_OP_DESCRIBE)
|
||||
{
|
||||
continue_primary_describe(cur_op);
|
||||
}
|
||||
else
|
||||
{
|
||||
exec_secondary(cur_op);
|
||||
|
|
|
@ -260,6 +260,7 @@ class osd_t
|
|||
bool prepare_primary_rw(osd_op_t *cur_op);
|
||||
void continue_primary_read(osd_op_t *cur_op);
|
||||
void continue_primary_scrub(osd_op_t *cur_op);
|
||||
void continue_primary_describe(osd_op_t *cur_op);
|
||||
void continue_primary_write(osd_op_t *cur_op);
|
||||
void cancel_primary_write(osd_op_t *cur_op);
|
||||
void continue_primary_sync(osd_op_t *cur_op);
|
||||
|
|
|
@ -22,4 +22,5 @@ const char* osd_op_names[] = {
|
|||
"ping",
|
||||
"sec_read_bmp",
|
||||
"scrub",
|
||||
"describe",
|
||||
};
|
||||
|
|
|
@ -30,7 +30,8 @@
|
|||
#define OSD_OP_PING 15
|
||||
#define OSD_OP_SEC_READ_BMP 16
|
||||
#define OSD_OP_SCRUB 17
|
||||
#define OSD_OP_MAX 17
|
||||
#define OSD_OP_DESCRIBE 18
|
||||
#define OSD_OP_MAX 18
|
||||
#define OSD_RW_MAX 64*1024*1024
|
||||
#define OSD_PROTOCOL_VERSION 1
|
||||
|
||||
|
@ -44,6 +45,11 @@
|
|||
#define MEM_ALIGNMENT 4096
|
||||
#endif
|
||||
|
||||
// Constants for osd_reply_describe_item_t.loc_bad
|
||||
#define LOC_OUTDATED 1
|
||||
#define LOC_CORRUPTED 2
|
||||
#define LOC_INCONSISTENT 4
|
||||
|
||||
// common request and reply headers
|
||||
struct __attribute__((__packed__)) osd_op_header_t
|
||||
{
|
||||
|
@ -229,6 +235,36 @@ struct __attribute__((__packed__)) osd_reply_sync_t
|
|||
osd_reply_header_t header;
|
||||
};
|
||||
|
||||
// describe unclean object states in detail
|
||||
struct __attribute__((__packed__)) osd_op_describe_t
|
||||
{
|
||||
osd_op_header_t header;
|
||||
// state mask to filter objects by state (0 or 0xfff..ff = all objects)
|
||||
uint64_t object_state;
|
||||
// minimum inode and offset
|
||||
uint64_t min_inode, min_offset;
|
||||
// maximum inode and offset
|
||||
uint64_t max_inode, max_offset;
|
||||
// limit
|
||||
uint64_t limit;
|
||||
};
|
||||
|
||||
struct __attribute__((__packed__)) osd_reply_describe_t
|
||||
{
|
||||
osd_reply_header_t header;
|
||||
// size of the resulting <osd_reply_describe_item_t> array in bytes
|
||||
uint64_t result_bytes;
|
||||
};
|
||||
|
||||
struct __attribute__((__packed__)) osd_reply_describe_item_t
|
||||
{
|
||||
uint64_t inode;
|
||||
uint64_t stripe;
|
||||
uint32_t role; // part number: 0 for replicas, 0..pg_size-1 for EC
|
||||
uint32_t loc_bad; // LOC_OUTDATED / LOC_CORRUPTED / LOC_INCONSISTENT
|
||||
osd_num_t osd_num; // OSD number
|
||||
};
|
||||
|
||||
// FIXME it would be interesting to try to unify blockstore_op and osd_op formats
|
||||
union osd_any_op_t
|
||||
{
|
||||
|
@ -242,6 +278,7 @@ union osd_any_op_t
|
|||
osd_op_show_config_t show_conf;
|
||||
osd_op_rw_t rw;
|
||||
osd_op_sync_t sync;
|
||||
osd_op_describe_t describe;
|
||||
uint8_t buf[OSD_PACKET_SIZE];
|
||||
};
|
||||
|
||||
|
@ -257,6 +294,7 @@ union osd_any_reply_t
|
|||
osd_reply_show_config_t show_conf;
|
||||
osd_reply_rw_t rw;
|
||||
osd_reply_sync_t sync;
|
||||
osd_reply_describe_t describe;
|
||||
uint8_t buf[OSD_PACKET_SIZE];
|
||||
};
|
||||
|
||||
|
|
|
@ -13,10 +13,6 @@
|
|||
|
||||
#define PG_EPOCH_BITS 48
|
||||
|
||||
#define LOC_OUTDATED 1
|
||||
#define LOC_CORRUPTED 2
|
||||
#define LOC_INCONSISTENT 4
|
||||
|
||||
struct pg_obj_loc_t
|
||||
{
|
||||
uint64_t role;
|
||||
|
|
|
@ -0,0 +1,128 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include <queue>
|
||||
#include "osd_primary.h"
|
||||
|
||||
struct unclean_list_t
|
||||
{
|
||||
btree::btree_map<object_id, pg_osd_set_state_t*>::iterator it, end;
|
||||
uint64_t state_mask, state;
|
||||
};
|
||||
|
||||
struct desc_item_list_t
|
||||
{
|
||||
int alloc, size;
|
||||
osd_reply_describe_item_t *items;
|
||||
};
|
||||
|
||||
static void include_list(std::vector<unclean_list_t> & lists,
|
||||
btree::btree_map<object_id, pg_osd_set_state_t*> & from,
|
||||
osd_op_describe_t & desc, uint64_t state_mask, uint64_t state)
|
||||
{
|
||||
auto it = desc.min_inode || desc.min_offset ? from.lower_bound((object_id){
|
||||
.inode = desc.min_inode,
|
||||
.stripe = desc.min_offset,
|
||||
}) : from.begin();
|
||||
auto end_it = desc.max_inode || desc.max_offset ? from.upper_bound((object_id){
|
||||
.inode = desc.max_inode,
|
||||
.stripe = desc.max_offset,
|
||||
}) : from.end();
|
||||
lists.push_back((unclean_list_t){
|
||||
.it = it,
|
||||
.end = end_it,
|
||||
.state_mask = state_mask,
|
||||
.state = state,
|
||||
});
|
||||
}
|
||||
|
||||
struct obj_list_t
|
||||
{
|
||||
object_id oid;
|
||||
int list_id;
|
||||
};
|
||||
|
||||
static inline bool operator < (const obj_list_t & a, const obj_list_t & b)
|
||||
{
|
||||
return b.oid < a.oid;
|
||||
}
|
||||
|
||||
static void scan_lists(std::vector<unclean_list_t> & lists, uint64_t limit, desc_item_list_t & res)
|
||||
{
|
||||
if (limit > 1048576)
|
||||
{
|
||||
limit = 1048576;
|
||||
}
|
||||
std::priority_queue<obj_list_t> min;
|
||||
for (int i = 0; i < lists.size(); i++)
|
||||
{
|
||||
if (lists[i].it != lists[i].end)
|
||||
{
|
||||
min.push((obj_list_t){ .oid = lists[i].it->first, .list_id = i });
|
||||
}
|
||||
}
|
||||
while (min.size() && (!limit || res.size < limit))
|
||||
{
|
||||
auto i = min.top().list_id;
|
||||
min.pop();
|
||||
for (auto & chunk: lists[i].it->second->osd_set)
|
||||
{
|
||||
if (res.size >= res.alloc)
|
||||
{
|
||||
res.alloc = !res.alloc ? 128 : (res.alloc*2);
|
||||
res.items = (osd_reply_describe_item_t*)realloc_or_die(res.items, res.alloc * sizeof(osd_reply_describe_item_t));
|
||||
}
|
||||
res.items[res.size++] = (osd_reply_describe_item_t){
|
||||
.inode = lists[i].it->first.inode,
|
||||
.stripe = lists[i].it->first.stripe,
|
||||
.role = (uint32_t)chunk.role,
|
||||
.loc_bad = chunk.loc_bad,
|
||||
.osd_num = chunk.osd_num,
|
||||
};
|
||||
}
|
||||
lists[i].it++;
|
||||
if (lists[i].it != lists[i].end)
|
||||
{
|
||||
min.push((obj_list_t){ .oid = lists[i].it->first, .list_id = i });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Describe unclean objects
|
||||
void osd_t::continue_primary_describe(osd_op_t *cur_op)
|
||||
{
|
||||
auto & desc = cur_op->req.describe;
|
||||
if (!desc.object_state)
|
||||
desc.object_state = ~desc.object_state;
|
||||
std::vector<unclean_list_t> lists;
|
||||
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
|
||||
{
|
||||
auto & pg = pg_it->second;
|
||||
if (desc.object_state & OBJ_INCONSISTENT)
|
||||
include_list(lists, pg.inconsistent_objects, desc, 0, 0);
|
||||
if (desc.object_state & OBJ_CORRUPTED)
|
||||
{
|
||||
if (!(desc.object_state & OBJ_INCOMPLETE))
|
||||
include_list(lists, pg.incomplete_objects, desc, OBJ_CORRUPTED, OBJ_CORRUPTED);
|
||||
if (!(desc.object_state & OBJ_DEGRADED))
|
||||
include_list(lists, pg.degraded_objects, desc, OBJ_CORRUPTED, OBJ_CORRUPTED);
|
||||
if (!(desc.object_state & OBJ_MISPLACED))
|
||||
include_list(lists, pg.misplaced_objects, desc, OBJ_CORRUPTED, OBJ_CORRUPTED);
|
||||
}
|
||||
uint64_t skip_corrupted = !(desc.object_state & OBJ_CORRUPTED) ? OBJ_CORRUPTED : 0;
|
||||
if (desc.object_state & OBJ_INCOMPLETE)
|
||||
include_list(lists, pg.incomplete_objects, desc, skip_corrupted, 0);
|
||||
if (desc.object_state & OBJ_DEGRADED)
|
||||
include_list(lists, pg.degraded_objects, desc, skip_corrupted, 0);
|
||||
if (desc.object_state & OBJ_MISPLACED)
|
||||
include_list(lists, pg.misplaced_objects, desc, skip_corrupted, 0);
|
||||
}
|
||||
desc_item_list_t res = {};
|
||||
scan_lists(lists, desc.limit, res);
|
||||
assert(!cur_op->buf);
|
||||
cur_op->buf = res.items;
|
||||
cur_op->reply.describe.result_bytes = res.size * sizeof(osd_reply_describe_item_t);
|
||||
if (res.items)
|
||||
cur_op->iov.push_back(res.items, res.size * sizeof(osd_reply_describe_item_t));
|
||||
finish_op(cur_op, res.size);
|
||||
}
|
Loading…
Reference in New Issue