Use new listing methods in rm_inode

nbd-vmsplice
Vitaliy Filippov 2021-07-19 01:42:35 +03:00
parent 7d3d696110
commit a02b02eb04
3 changed files with 66 additions and 207 deletions

View File

@ -19,25 +19,14 @@
const char *exe_name = NULL;
struct rm_pg_t;
struct rm_pg_osd_t
{
rm_pg_t *pg = NULL;
osd_num_t osd_num;
bool sent = false;
};
struct rm_pg_t
{
pg_num_t pg_num;
osd_num_t rm_osd_num;
std::vector<rm_pg_osd_t> list_osds;
int state = 0;
int to_list;
std::set<object_id> objects;
std::set<object_id>::iterator obj_pos;
uint64_t obj_count = 0, obj_done = 0, obj_prev_done = 0;
int state = 0;
int in_flight = 0;
};
@ -47,6 +36,7 @@ protected:
uint64_t inode = 0;
pool_id_t pool_id = 0;
uint64_t iodepth = 0, parallel_osds = 0;
inode_list_t *lister = NULL;
ring_loop_t *ringloop = NULL;
epoll_manager_t *epmgr = NULL;
@ -58,7 +48,7 @@ protected:
uint64_t pgs_to_list = 0;
bool started = false;
bool progress = true;
bool list_first = false;
bool list_first = false, lists_done = false;
int log_level = 0;
public:
@ -137,149 +127,37 @@ public:
void start_delete()
{
if (cli->st_cli.pool_config.find(pool_id) == cli->st_cli.pool_config.end())
lister = cli->list_inode_start(inode, [this](std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)
{
fprintf(stderr, "Pool %u does not exist\n", pool_id);
rm_pg_t *rm = new rm_pg_t({
.pg_num = pg_num,
.rm_osd_num = primary_osd,
.objects = objects,
.obj_count = objects.size(),
.obj_done = 0,
.obj_prev_done = 0,
});
rm->obj_pos = rm->objects.begin();
lists.push_back(rm);
if (list_first)
{
cli->list_inode_next(lister, 1);
}
if (status & INODE_LIST_DONE)
{
lists_done = true;
}
pgs_to_list--;
continue_delete();
});
if (!lister)
{
fprintf(stderr, "Failed to list inode %lx objects\n", inode);
exit(1);
}
auto pool_cfg = cli->st_cli.pool_config[pool_id];
for (auto & pg_item: pool_cfg.pg_config)
{
auto & pg = pg_item.second;
if (pg.pause || !pg.cur_primary || !(pg.cur_state & PG_ACTIVE))
{
fprintf(stderr, "PG %u is inactive, skipping\n", pg_item.first);
continue;
}
rm_pg_t *r = new rm_pg_t();
r->pg_num = pg_item.first;
r->rm_osd_num = pg.cur_primary;
r->state = RM_LISTING;
if (pg.cur_state != PG_ACTIVE)
{
std::set<osd_num_t> all_peers;
for (osd_num_t pg_osd: pg.target_set)
{
if (pg_osd != 0)
{
all_peers.insert(pg_osd);
}
}
for (osd_num_t pg_osd: pg.all_peers)
{
if (pg_osd != 0)
{
all_peers.insert(pg_osd);
}
}
for (auto & hist_item: pg.target_history)
{
for (auto pg_osd: hist_item)
{
if (pg_osd != 0)
{
all_peers.insert(pg_osd);
}
}
}
for (osd_num_t peer_osd: all_peers)
{
r->list_osds.push_back((rm_pg_osd_t){ .pg = r, .osd_num = peer_osd, .sent = false });
}
}
else
{
r->list_osds.push_back((rm_pg_osd_t){ .pg = r, .osd_num = pg.cur_primary, .sent = false });
}
r->to_list = r->list_osds.size();
lists.push_back(r);
}
std::sort(lists.begin(), lists.end(), [](rm_pg_t *a, rm_pg_t *b)
{
return a->rm_osd_num < b->rm_osd_num ? true : false;
});
pgs_to_list = lists.size();
pgs_to_list = cli->list_pg_count(lister);
cli->list_inode_next(lister, parallel_osds);
started = true;
continue_delete();
}
void send_list(rm_pg_osd_t *cur_list)
{
if (cur_list->sent)
{
return;
}
if (cli->msgr.osd_peer_fds.find(cur_list->osd_num) ==
cli->msgr.osd_peer_fds.end())
{
// Initiate connection
cli->msgr.connect_peer(cur_list->osd_num, cli->st_cli.peer_states[cur_list->osd_num]);
return;
}
osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT;
op->peer_fd = cli->msgr.osd_peer_fds[cur_list->osd_num];
op->req = (osd_any_op_t){
.sec_list = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = cli->msgr.next_subop_id++,
.opcode = OSD_OP_SEC_LIST,
},
.list_pg = cur_list->pg->pg_num,
.pg_count = (pg_num_t)cli->st_cli.pool_config[pool_id].real_pg_count,
.pg_stripe_size = cli->st_cli.pool_config[pool_id].pg_stripe_size,
.min_inode = inode,
.max_inode = inode,
},
};
op->callback = [this, cur_list](osd_op_t *op)
{
cur_list->pg->to_list--;
if (op->reply.hdr.retval < 0)
{
fprintf(stderr, "Failed to get PG %u/%u object list from OSD %lu (retval=%ld), skipping\n",
pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval);
}
else
{
if (op->reply.sec_list.stable_count < op->reply.hdr.retval)
{
// Unstable objects, if present, mean that someone still writes into the inode. Warn the user about it.
printf(
"[PG %u/%u] Inode still has %lu unstable object versions - is it still open? Not a good idea to delete it.\n",
pool_id, cur_list->pg->pg_num, op->reply.hdr.retval - op->reply.sec_list.stable_count
);
}
if (log_level > 0)
{
printf(
"[PG %u/%u] Got inode object list from OSD %lu: %ld object versions\n",
pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval
);
}
for (uint64_t i = 0; i < op->reply.hdr.retval; i++)
{
object_id oid = ((obj_ver_id*)op->buf)[i].oid;
oid.stripe = oid.stripe & ~STRIPE_MASK;
cur_list->pg->objects.insert(oid);
}
}
delete op;
if (cur_list->pg->to_list <= 0)
{
cur_list->pg->obj_done = cur_list->pg->obj_prev_done = 0;
cur_list->pg->obj_pos = cur_list->pg->objects.begin();
cur_list->pg->obj_count = cur_list->pg->objects.size();
total_count += cur_list->pg->obj_count;
total_prev_pct = 0;
cur_list->pg->state = RM_REMOVING;
pgs_to_list--;
}
continue_delete();
};
cli->msgr.outbox_push(op);
cur_list->sent = true;
}
void send_ops(rm_pg_t *cur_list)
@ -300,11 +178,11 @@ public:
.rw = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = cli->msgr.next_subop_id++,
.id = cli->next_op_id(),
.opcode = OSD_OP_DELETE,
},
.inode = cur_list->obj_pos->inode,
.offset = (cur_list->obj_pos->stripe & ~STRIPE_MASK),
.offset = cur_list->obj_pos->stripe,
.len = 0,
},
};
@ -313,7 +191,8 @@ public:
cur_list->in_flight--;
if (op->reply.hdr.retval < 0)
{
fprintf(stderr, "Failed to remove object from PG %u (OSD %lu) (retval=%ld)\n",
fprintf(stderr, "Failed to remove object %lx:%lx from PG %u (OSD %lu) (retval=%ld)\n",
op->req.rw.inode, op->req.rw.offset,
cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
}
delete op;
@ -321,69 +200,33 @@ public:
total_done++;
continue_delete();
};
cli->msgr.outbox_push(op);
cur_list->obj_pos++;
cur_list->in_flight++;
}
if (!cur_list->in_flight && cur_list->obj_pos == cur_list->objects.end())
{
cur_list->obj_count = 0;
cur_list->obj_done = cur_list->obj_prev_done = 0;
cur_list->state = RM_END;
cli->msgr.outbox_push(op);
}
}
void continue_delete()
{
int par_osd = 0;
osd_num_t max_seen_osd = 0;
bool no_del = false;
if (list_first)
if (list_first && !lists_done)
{
int i, n = 0;
for (i = 0; i < lists.size(); i++)
{
if (lists[i]->state == RM_LISTING)
{
n++;
}
}
if (n > 0)
{
no_del = true;
}
return;
}
for (int i = 0; i < lists.size(); i++)
{
if (lists[i]->state == RM_END)
if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end())
{
delete lists[i];
lists.erase(lists.begin()+i, lists.begin()+i+1);
i--;
if (!lists_done)
{
cli->list_inode_next(lister, 1);
}
}
else if (lists[i]->rm_osd_num > max_seen_osd)
else
{
if (lists[i]->state == RM_LISTING)
{
for (int j = 0; j < lists[i]->list_osds.size(); j++)
{
send_list(&lists[i]->list_osds[j]);
}
}
else if (lists[i]->state == RM_REMOVING)
{
if (no_del)
{
continue;
}
send_ops(lists[i]);
}
par_osd++;
max_seen_osd = lists[i]->rm_osd_num;
if (par_osd >= parallel_osds)
{
break;
}
send_ops(lists[i]);
}
}
if (progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
@ -391,7 +234,7 @@ public:
printf("\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
total_prev_pct = total_done*1000/total_count;
}
if (!lists.size())
if (lists_done && !lists.size())
{
printf("Done, inode %lu in pool %u removed\n", (inode & ((1l << (64-POOL_ID_BITS)) - 1)), pool_id);
exit(0);

View File

@ -3,6 +3,7 @@
. `dirname $0`/common.sh
OSD_SIZE=${OSD_SIZE:-1024}
PG_COUNT=${PG_COUNT:-1}
dd if=/dev/zero of=./testdata/test_osd1.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
dd if=/dev/zero of=./testdata/test_osd2.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
@ -25,16 +26,16 @@ if [ -n "$GLOBAL_CONF" ]; then
$ETCDCTL put /vitastor/config/global "$GLOBAL_CONF"
fi
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1,"pg_count":1,"failure_domain":"osd"}}'
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1,"pg_count":'$PG_COUNT',"failure_domain":"osd"}}'
sleep 2
if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(. | length) != 0 and (.[0].items["1"]["1"].osd_set | sort) == ["1","2","3"]'); then
format_error "FAILED: 1 PG NOT CONFIGURED"
if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(. | length) != 0 and ([ .[0].items["1"][] | select((.osd_set | sort) == ["1","2","3"]) ] | length) == '$PG_COUNT); then
format_error "FAILED: $PG_COUNT PG(s) NOT CONFIGURED"
fi
if ! ($ETCDCTL get /vitastor/pg/state/1/1 --print-value-only | jq -s -e '(. | length) != 0 and .[0].state == ["active"]'); then
format_error "FAILED: 1 PG NOT UP"
if ! ($ETCDCTL get /vitastor/pg/state/1/ --prefix --print-value-only | jq -s -e '[ .[] | select(.state == ["active"]) ] | length == '$PG_COUNT); then
format_error "FAILED: $PG_COUNT PG(s) NOT UP"
fi
if ! cmp build/src/block-vitastor.so /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so; then

15
tests/test_rm.sh Executable file
View File

@ -0,0 +1,15 @@
#!/bin/bash -ex
PG_COUNT=16
. `dirname $0`/run_3osds.sh
LD_PRELOAD=libasan.so.5 \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 \
-end_fsync=1 -fsync=1 -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -cluster_log_level=10
$ETCDCTL get --prefix '/vitastor/pg/state'
LD_PRELOAD=libasan.so.5 \
build/src/vitastor-rm --etcd_address $ETCD_URL --pool 1 --inode 1
format_green OK