diff --git a/src/rm_inode.cpp b/src/rm_inode.cpp index cfc3132cd..f4d66bf43 100644 --- a/src/rm_inode.cpp +++ b/src/rm_inode.cpp @@ -19,25 +19,14 @@ const char *exe_name = NULL; -struct rm_pg_t; - -struct rm_pg_osd_t -{ - rm_pg_t *pg = NULL; - osd_num_t osd_num; - bool sent = false; -}; - struct rm_pg_t { pg_num_t pg_num; osd_num_t rm_osd_num; - std::vector list_osds; - int state = 0; - int to_list; std::set objects; std::set::iterator obj_pos; uint64_t obj_count = 0, obj_done = 0, obj_prev_done = 0; + int state = 0; int in_flight = 0; }; @@ -47,6 +36,7 @@ protected: uint64_t inode = 0; pool_id_t pool_id = 0; uint64_t iodepth = 0, parallel_osds = 0; + inode_list_t *lister = NULL; ring_loop_t *ringloop = NULL; epoll_manager_t *epmgr = NULL; @@ -58,7 +48,7 @@ protected: uint64_t pgs_to_list = 0; bool started = false; bool progress = true; - bool list_first = false; + bool list_first = false, lists_done = false; int log_level = 0; public: @@ -137,149 +127,37 @@ public: void start_delete() { - if (cli->st_cli.pool_config.find(pool_id) == cli->st_cli.pool_config.end()) + lister = cli->list_inode_start(inode, [this](std::set&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status) { - fprintf(stderr, "Pool %u does not exist\n", pool_id); + rm_pg_t *rm = new rm_pg_t({ + .pg_num = pg_num, + .rm_osd_num = primary_osd, + .objects = objects, + .obj_count = objects.size(), + .obj_done = 0, + .obj_prev_done = 0, + }); + rm->obj_pos = rm->objects.begin(); + lists.push_back(rm); + if (list_first) + { + cli->list_inode_next(lister, 1); + } + if (status & INODE_LIST_DONE) + { + lists_done = true; + } + pgs_to_list--; + continue_delete(); + }); + if (!lister) + { + fprintf(stderr, "Failed to list inode %lx objects\n", inode); exit(1); } - auto pool_cfg = cli->st_cli.pool_config[pool_id]; - for (auto & pg_item: pool_cfg.pg_config) - { - auto & pg = pg_item.second; - if (pg.pause || !pg.cur_primary || !(pg.cur_state & PG_ACTIVE)) - { - fprintf(stderr, "PG %u is inactive, skipping\n", pg_item.first); - continue; - } - rm_pg_t *r = new rm_pg_t(); - r->pg_num = pg_item.first; - r->rm_osd_num = pg.cur_primary; - r->state = RM_LISTING; - if (pg.cur_state != PG_ACTIVE) - { - std::set all_peers; - for (osd_num_t pg_osd: pg.target_set) - { - if (pg_osd != 0) - { - all_peers.insert(pg_osd); - } - } - for (osd_num_t pg_osd: pg.all_peers) - { - if (pg_osd != 0) - { - all_peers.insert(pg_osd); - } - } - for (auto & hist_item: pg.target_history) - { - for (auto pg_osd: hist_item) - { - if (pg_osd != 0) - { - all_peers.insert(pg_osd); - } - } - } - for (osd_num_t peer_osd: all_peers) - { - r->list_osds.push_back((rm_pg_osd_t){ .pg = r, .osd_num = peer_osd, .sent = false }); - } - } - else - { - r->list_osds.push_back((rm_pg_osd_t){ .pg = r, .osd_num = pg.cur_primary, .sent = false }); - } - r->to_list = r->list_osds.size(); - lists.push_back(r); - } - std::sort(lists.begin(), lists.end(), [](rm_pg_t *a, rm_pg_t *b) - { - return a->rm_osd_num < b->rm_osd_num ? true : false; - }); - pgs_to_list = lists.size(); + pgs_to_list = cli->list_pg_count(lister); + cli->list_inode_next(lister, parallel_osds); started = true; - continue_delete(); - } - - void send_list(rm_pg_osd_t *cur_list) - { - if (cur_list->sent) - { - return; - } - if (cli->msgr.osd_peer_fds.find(cur_list->osd_num) == - cli->msgr.osd_peer_fds.end()) - { - // Initiate connection - cli->msgr.connect_peer(cur_list->osd_num, cli->st_cli.peer_states[cur_list->osd_num]); - return; - } - osd_op_t *op = new osd_op_t(); - op->op_type = OSD_OP_OUT; - op->peer_fd = cli->msgr.osd_peer_fds[cur_list->osd_num]; - op->req = (osd_any_op_t){ - .sec_list = { - .header = { - .magic = SECONDARY_OSD_OP_MAGIC, - .id = cli->msgr.next_subop_id++, - .opcode = OSD_OP_SEC_LIST, - }, - .list_pg = cur_list->pg->pg_num, - .pg_count = (pg_num_t)cli->st_cli.pool_config[pool_id].real_pg_count, - .pg_stripe_size = cli->st_cli.pool_config[pool_id].pg_stripe_size, - .min_inode = inode, - .max_inode = inode, - }, - }; - op->callback = [this, cur_list](osd_op_t *op) - { - cur_list->pg->to_list--; - if (op->reply.hdr.retval < 0) - { - fprintf(stderr, "Failed to get PG %u/%u object list from OSD %lu (retval=%ld), skipping\n", - pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval); - } - else - { - if (op->reply.sec_list.stable_count < op->reply.hdr.retval) - { - // Unstable objects, if present, mean that someone still writes into the inode. Warn the user about it. - printf( - "[PG %u/%u] Inode still has %lu unstable object versions - is it still open? Not a good idea to delete it.\n", - pool_id, cur_list->pg->pg_num, op->reply.hdr.retval - op->reply.sec_list.stable_count - ); - } - if (log_level > 0) - { - printf( - "[PG %u/%u] Got inode object list from OSD %lu: %ld object versions\n", - pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval - ); - } - for (uint64_t i = 0; i < op->reply.hdr.retval; i++) - { - object_id oid = ((obj_ver_id*)op->buf)[i].oid; - oid.stripe = oid.stripe & ~STRIPE_MASK; - cur_list->pg->objects.insert(oid); - } - } - delete op; - if (cur_list->pg->to_list <= 0) - { - cur_list->pg->obj_done = cur_list->pg->obj_prev_done = 0; - cur_list->pg->obj_pos = cur_list->pg->objects.begin(); - cur_list->pg->obj_count = cur_list->pg->objects.size(); - total_count += cur_list->pg->obj_count; - total_prev_pct = 0; - cur_list->pg->state = RM_REMOVING; - pgs_to_list--; - } - continue_delete(); - }; - cli->msgr.outbox_push(op); - cur_list->sent = true; } void send_ops(rm_pg_t *cur_list) @@ -300,11 +178,11 @@ public: .rw = { .header = { .magic = SECONDARY_OSD_OP_MAGIC, - .id = cli->msgr.next_subop_id++, + .id = cli->next_op_id(), .opcode = OSD_OP_DELETE, }, .inode = cur_list->obj_pos->inode, - .offset = (cur_list->obj_pos->stripe & ~STRIPE_MASK), + .offset = cur_list->obj_pos->stripe, .len = 0, }, }; @@ -313,7 +191,8 @@ public: cur_list->in_flight--; if (op->reply.hdr.retval < 0) { - fprintf(stderr, "Failed to remove object from PG %u (OSD %lu) (retval=%ld)\n", + fprintf(stderr, "Failed to remove object %lx:%lx from PG %u (OSD %lu) (retval=%ld)\n", + op->req.rw.inode, op->req.rw.offset, cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval); } delete op; @@ -321,69 +200,33 @@ public: total_done++; continue_delete(); }; - cli->msgr.outbox_push(op); cur_list->obj_pos++; cur_list->in_flight++; - } - if (!cur_list->in_flight && cur_list->obj_pos == cur_list->objects.end()) - { - cur_list->obj_count = 0; - cur_list->obj_done = cur_list->obj_prev_done = 0; - cur_list->state = RM_END; + cli->msgr.outbox_push(op); } } void continue_delete() { - int par_osd = 0; - osd_num_t max_seen_osd = 0; - bool no_del = false; - if (list_first) + if (list_first && !lists_done) { - int i, n = 0; - for (i = 0; i < lists.size(); i++) - { - if (lists[i]->state == RM_LISTING) - { - n++; - } - } - if (n > 0) - { - no_del = true; - } + return; } for (int i = 0; i < lists.size(); i++) { - if (lists[i]->state == RM_END) + if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end()) { delete lists[i]; lists.erase(lists.begin()+i, lists.begin()+i+1); i--; + if (!lists_done) + { + cli->list_inode_next(lister, 1); + } } - else if (lists[i]->rm_osd_num > max_seen_osd) + else { - if (lists[i]->state == RM_LISTING) - { - for (int j = 0; j < lists[i]->list_osds.size(); j++) - { - send_list(&lists[i]->list_osds[j]); - } - } - else if (lists[i]->state == RM_REMOVING) - { - if (no_del) - { - continue; - } - send_ops(lists[i]); - } - par_osd++; - max_seen_osd = lists[i]->rm_osd_num; - if (par_osd >= parallel_osds) - { - break; - } + send_ops(lists[i]); } } if (progress && total_count > 0 && total_done*1000/total_count != total_prev_pct) @@ -391,7 +234,7 @@ public: printf("\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list); total_prev_pct = total_done*1000/total_count; } - if (!lists.size()) + if (lists_done && !lists.size()) { printf("Done, inode %lu in pool %u removed\n", (inode & ((1l << (64-POOL_ID_BITS)) - 1)), pool_id); exit(0); diff --git a/tests/run_3osds.sh b/tests/run_3osds.sh index a23780ff4..e926662d1 100644 --- a/tests/run_3osds.sh +++ b/tests/run_3osds.sh @@ -3,6 +3,7 @@ . `dirname $0`/common.sh OSD_SIZE=${OSD_SIZE:-1024} +PG_COUNT=${PG_COUNT:-1} dd if=/dev/zero of=./testdata/test_osd1.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1)) dd if=/dev/zero of=./testdata/test_osd2.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1)) @@ -25,16 +26,16 @@ if [ -n "$GLOBAL_CONF" ]; then $ETCDCTL put /vitastor/config/global "$GLOBAL_CONF" fi -$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1,"pg_count":1,"failure_domain":"osd"}}' +$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1,"pg_count":'$PG_COUNT',"failure_domain":"osd"}}' sleep 2 -if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(. | length) != 0 and (.[0].items["1"]["1"].osd_set | sort) == ["1","2","3"]'); then - format_error "FAILED: 1 PG NOT CONFIGURED" +if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(. | length) != 0 and ([ .[0].items["1"][] | select((.osd_set | sort) == ["1","2","3"]) ] | length) == '$PG_COUNT); then + format_error "FAILED: $PG_COUNT PG(s) NOT CONFIGURED" fi -if ! ($ETCDCTL get /vitastor/pg/state/1/1 --print-value-only | jq -s -e '(. | length) != 0 and .[0].state == ["active"]'); then - format_error "FAILED: 1 PG NOT UP" +if ! ($ETCDCTL get /vitastor/pg/state/1/ --prefix --print-value-only | jq -s -e '[ .[] | select(.state == ["active"]) ] | length == '$PG_COUNT); then + format_error "FAILED: $PG_COUNT PG(s) NOT UP" fi if ! cmp build/src/block-vitastor.so /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so; then diff --git a/tests/test_rm.sh b/tests/test_rm.sh new file mode 100755 index 000000000..edfe4ccce --- /dev/null +++ b/tests/test_rm.sh @@ -0,0 +1,15 @@ +#!/bin/bash -ex + +PG_COUNT=16 +. `dirname $0`/run_3osds.sh + +LD_PRELOAD=libasan.so.5 \ + fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 \ + -end_fsync=1 -fsync=1 -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -cluster_log_level=10 + +$ETCDCTL get --prefix '/vitastor/pg/state' + +LD_PRELOAD=libasan.so.5 \ + build/src/vitastor-rm --etcd_address $ETCD_URL --pool 1 --inode 1 + +format_green OK