diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index aaf222e5..bacab75a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -111,7 +111,7 @@ target_compile_options(vitastor_common PUBLIC -fPIC) add_executable(vitastor-osd osd_main.cpp osd.cpp osd_secondary.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp osd_primary.cpp osd_primary_chain.cpp osd_primary_sync.cpp osd_primary_write.cpp osd_primary_subops.cpp - osd_cluster.cpp osd_rmw.cpp osd_scrub.cpp + osd_cluster.cpp osd_rmw.cpp osd_scrub.cpp osd_primary_describe.cpp ) target_link_libraries(vitastor-osd vitastor_common diff --git a/src/msgr_receive.cpp b/src/msgr_receive.cpp index 1c964dd2..9c7e6d7a 100644 --- a/src/msgr_receive.cpp +++ b/src/msgr_receive.cpp @@ -251,10 +251,6 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl) } cl->read_remaining = cur_op->req.sec_read_bmp.len; } - else if (cur_op->req.hdr.opcode == OSD_OP_READ) - { - cl->read_remaining = 0; - } else if (cur_op->req.hdr.opcode == OSD_OP_WRITE) { if (cur_op->req.rw.len > 0) @@ -274,6 +270,12 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl) } cl->read_remaining = cur_op->req.show_conf.json_len; } + /*else if (cur_op->req.hdr.opcode == OSD_OP_READ || + cur_op->req.hdr.opcode == OSD_OP_SCRUB || + cur_op->req.hdr.opcode == OSD_OP_DESCRIBE) + { + cl->read_remaining = 0; + }*/ if (cl->read_remaining > 0) { // Read data @@ -367,6 +369,16 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl) op->buf = malloc_or_die(op->reply.hdr.retval); cl->recv_list.push_back(op->buf, op->reply.hdr.retval); } + else if (op->reply.hdr.opcode == OSD_OP_DESCRIBE && op->reply.hdr.retval > 0) + { + delete cl->read_op; + cl->read_op = op; + cl->read_state = CL_READ_REPLY_DATA; + cl->read_remaining = op->reply.describe.result_bytes; + free(op->buf); + op->buf = malloc_or_die(op->reply.describe.result_bytes); + cl->recv_list.push_back(op->buf, op->reply.describe.result_bytes); + } else { reuse: diff --git a/src/msgr_send.cpp b/src/msgr_send.cpp index 5248e641..32348e17 100644 --- a/src/msgr_send.cpp +++ b/src/msgr_send.cpp @@ -73,7 +73,8 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op) ? (cur_op->req.hdr.opcode == OSD_OP_READ || cur_op->req.hdr.opcode == OSD_OP_SEC_READ || cur_op->req.hdr.opcode == OSD_OP_SEC_LIST || - cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG) + cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG || + cur_op->req.hdr.opcode == OSD_OP_DESCRIBE) : (cur_op->req.hdr.opcode == OSD_OP_WRITE || cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE || cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE || diff --git a/src/osd.cpp b/src/osd.cpp index 1b3dcb0a..e6c7bfab 100644 --- a/src/osd.cpp +++ b/src/osd.cpp @@ -363,6 +363,7 @@ void osd_t::exec_op(osd_op_t *cur_op) cur_op->req.hdr.opcode != OSD_OP_READ && cur_op->req.hdr.opcode != OSD_OP_SEC_READ_BMP && cur_op->req.hdr.opcode != OSD_OP_SCRUB && + cur_op->req.hdr.opcode != OSD_OP_DESCRIBE && cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG) { // Readonly mode @@ -397,6 +398,10 @@ void osd_t::exec_op(osd_op_t *cur_op) { continue_primary_scrub(cur_op); } + else if (cur_op->req.hdr.opcode == OSD_OP_DESCRIBE) + { + continue_primary_describe(cur_op); + } else { exec_secondary(cur_op); diff --git a/src/osd.h b/src/osd.h index 3f0b504e..23f9f5e0 100644 --- a/src/osd.h +++ b/src/osd.h @@ -260,6 +260,7 @@ class osd_t bool prepare_primary_rw(osd_op_t *cur_op); void continue_primary_read(osd_op_t *cur_op); void continue_primary_scrub(osd_op_t *cur_op); + void continue_primary_describe(osd_op_t *cur_op); void continue_primary_write(osd_op_t *cur_op); void cancel_primary_write(osd_op_t *cur_op); void continue_primary_sync(osd_op_t *cur_op); diff --git a/src/osd_ops.cpp b/src/osd_ops.cpp index 812babeb..4cc60f37 100644 --- a/src/osd_ops.cpp +++ b/src/osd_ops.cpp @@ -22,4 +22,5 @@ const char* osd_op_names[] = { "ping", "sec_read_bmp", "scrub", + "describe", }; diff --git a/src/osd_ops.h b/src/osd_ops.h index 986a5c6b..7522e0bf 100644 --- a/src/osd_ops.h +++ b/src/osd_ops.h @@ -30,7 +30,8 @@ #define OSD_OP_PING 15 #define OSD_OP_SEC_READ_BMP 16 #define OSD_OP_SCRUB 17 -#define OSD_OP_MAX 17 +#define OSD_OP_DESCRIBE 18 +#define OSD_OP_MAX 18 #define OSD_RW_MAX 64*1024*1024 #define OSD_PROTOCOL_VERSION 1 @@ -44,6 +45,11 @@ #define MEM_ALIGNMENT 4096 #endif +// Constants for osd_reply_describe_item_t.loc_bad +#define LOC_OUTDATED 1 +#define LOC_CORRUPTED 2 +#define LOC_INCONSISTENT 4 + // common request and reply headers struct __attribute__((__packed__)) osd_op_header_t { @@ -229,6 +235,36 @@ struct __attribute__((__packed__)) osd_reply_sync_t osd_reply_header_t header; }; +// describe unclean object states in detail +struct __attribute__((__packed__)) osd_op_describe_t +{ + osd_op_header_t header; + // state mask to filter objects by state (0 or 0xfff..ff = all objects) + uint64_t object_state; + // minimum inode and offset + uint64_t min_inode, min_offset; + // maximum inode and offset + uint64_t max_inode, max_offset; + // limit + uint64_t limit; +}; + +struct __attribute__((__packed__)) osd_reply_describe_t +{ + osd_reply_header_t header; + // size of the resulting array in bytes + uint64_t result_bytes; +}; + +struct __attribute__((__packed__)) osd_reply_describe_item_t +{ + uint64_t inode; + uint64_t stripe; + uint32_t role; // part number: 0 for replicas, 0..pg_size-1 for EC + uint32_t loc_bad; // LOC_OUTDATED / LOC_CORRUPTED / LOC_INCONSISTENT + osd_num_t osd_num; // OSD number +}; + // FIXME it would be interesting to try to unify blockstore_op and osd_op formats union osd_any_op_t { @@ -242,6 +278,7 @@ union osd_any_op_t osd_op_show_config_t show_conf; osd_op_rw_t rw; osd_op_sync_t sync; + osd_op_describe_t describe; uint8_t buf[OSD_PACKET_SIZE]; }; @@ -257,6 +294,7 @@ union osd_any_reply_t osd_reply_show_config_t show_conf; osd_reply_rw_t rw; osd_reply_sync_t sync; + osd_reply_describe_t describe; uint8_t buf[OSD_PACKET_SIZE]; }; diff --git a/src/osd_peering_pg.h b/src/osd_peering_pg.h index 45b17eba..c7397794 100644 --- a/src/osd_peering_pg.h +++ b/src/osd_peering_pg.h @@ -13,10 +13,6 @@ #define PG_EPOCH_BITS 48 -#define LOC_OUTDATED 1 -#define LOC_CORRUPTED 2 -#define LOC_INCONSISTENT 4 - struct pg_obj_loc_t { uint64_t role; diff --git a/src/osd_primary_describe.cpp b/src/osd_primary_describe.cpp new file mode 100644 index 00000000..b259975b --- /dev/null +++ b/src/osd_primary_describe.cpp @@ -0,0 +1,128 @@ +// Copyright (c) Vitaliy Filippov, 2019+ +// License: VNPL-1.1 (see README.md for details) + +#include +#include "osd_primary.h" + +struct unclean_list_t +{ + btree::btree_map::iterator it, end; + uint64_t state_mask, state; +}; + +struct desc_item_list_t +{ + int alloc, size; + osd_reply_describe_item_t *items; +}; + +static void include_list(std::vector & lists, + btree::btree_map & from, + osd_op_describe_t & desc, uint64_t state_mask, uint64_t state) +{ + auto it = desc.min_inode || desc.min_offset ? from.lower_bound((object_id){ + .inode = desc.min_inode, + .stripe = desc.min_offset, + }) : from.begin(); + auto end_it = desc.max_inode || desc.max_offset ? from.upper_bound((object_id){ + .inode = desc.max_inode, + .stripe = desc.max_offset, + }) : from.end(); + lists.push_back((unclean_list_t){ + .it = it, + .end = end_it, + .state_mask = state_mask, + .state = state, + }); +} + +struct obj_list_t +{ + object_id oid; + int list_id; +}; + +static inline bool operator < (const obj_list_t & a, const obj_list_t & b) +{ + return b.oid < a.oid; +} + +static void scan_lists(std::vector & lists, uint64_t limit, desc_item_list_t & res) +{ + if (limit > 1048576) + { + limit = 1048576; + } + std::priority_queue min; + for (int i = 0; i < lists.size(); i++) + { + if (lists[i].it != lists[i].end) + { + min.push((obj_list_t){ .oid = lists[i].it->first, .list_id = i }); + } + } + while (min.size() && (!limit || res.size < limit)) + { + auto i = min.top().list_id; + min.pop(); + for (auto & chunk: lists[i].it->second->osd_set) + { + if (res.size >= res.alloc) + { + res.alloc = !res.alloc ? 128 : (res.alloc*2); + res.items = (osd_reply_describe_item_t*)realloc_or_die(res.items, res.alloc * sizeof(osd_reply_describe_item_t)); + } + res.items[res.size++] = (osd_reply_describe_item_t){ + .inode = lists[i].it->first.inode, + .stripe = lists[i].it->first.stripe, + .role = (uint32_t)chunk.role, + .loc_bad = chunk.loc_bad, + .osd_num = chunk.osd_num, + }; + } + lists[i].it++; + if (lists[i].it != lists[i].end) + { + min.push((obj_list_t){ .oid = lists[i].it->first, .list_id = i }); + } + } +} + +// Describe unclean objects +void osd_t::continue_primary_describe(osd_op_t *cur_op) +{ + auto & desc = cur_op->req.describe; + if (!desc.object_state) + desc.object_state = ~desc.object_state; + std::vector lists; + for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++) + { + auto & pg = pg_it->second; + if (desc.object_state & OBJ_INCONSISTENT) + include_list(lists, pg.inconsistent_objects, desc, 0, 0); + if (desc.object_state & OBJ_CORRUPTED) + { + if (!(desc.object_state & OBJ_INCOMPLETE)) + include_list(lists, pg.incomplete_objects, desc, OBJ_CORRUPTED, OBJ_CORRUPTED); + if (!(desc.object_state & OBJ_DEGRADED)) + include_list(lists, pg.degraded_objects, desc, OBJ_CORRUPTED, OBJ_CORRUPTED); + if (!(desc.object_state & OBJ_MISPLACED)) + include_list(lists, pg.misplaced_objects, desc, OBJ_CORRUPTED, OBJ_CORRUPTED); + } + uint64_t skip_corrupted = !(desc.object_state & OBJ_CORRUPTED) ? OBJ_CORRUPTED : 0; + if (desc.object_state & OBJ_INCOMPLETE) + include_list(lists, pg.incomplete_objects, desc, skip_corrupted, 0); + if (desc.object_state & OBJ_DEGRADED) + include_list(lists, pg.degraded_objects, desc, skip_corrupted, 0); + if (desc.object_state & OBJ_MISPLACED) + include_list(lists, pg.misplaced_objects, desc, skip_corrupted, 0); + } + desc_item_list_t res = {}; + scan_lists(lists, desc.limit, res); + assert(!cur_op->buf); + cur_op->buf = res.items; + cur_op->reply.describe.result_bytes = res.size * sizeof(osd_reply_describe_item_t); + if (res.items) + cur_op->iov.push_back(res.items, res.size * sizeof(osd_reply_describe_item_t)); + finish_op(cur_op, res.size); +}