Use new listing methods in rm_inode
parent
7d3d696110
commit
a02b02eb04
241
src/rm_inode.cpp
241
src/rm_inode.cpp
|
@ -19,25 +19,14 @@
|
|||
|
||||
const char *exe_name = NULL;
|
||||
|
||||
struct rm_pg_t;
|
||||
|
||||
struct rm_pg_osd_t
|
||||
{
|
||||
rm_pg_t *pg = NULL;
|
||||
osd_num_t osd_num;
|
||||
bool sent = false;
|
||||
};
|
||||
|
||||
struct rm_pg_t
|
||||
{
|
||||
pg_num_t pg_num;
|
||||
osd_num_t rm_osd_num;
|
||||
std::vector<rm_pg_osd_t> list_osds;
|
||||
int state = 0;
|
||||
int to_list;
|
||||
std::set<object_id> objects;
|
||||
std::set<object_id>::iterator obj_pos;
|
||||
uint64_t obj_count = 0, obj_done = 0, obj_prev_done = 0;
|
||||
int state = 0;
|
||||
int in_flight = 0;
|
||||
};
|
||||
|
||||
|
@ -47,6 +36,7 @@ protected:
|
|||
uint64_t inode = 0;
|
||||
pool_id_t pool_id = 0;
|
||||
uint64_t iodepth = 0, parallel_osds = 0;
|
||||
inode_list_t *lister = NULL;
|
||||
|
||||
ring_loop_t *ringloop = NULL;
|
||||
epoll_manager_t *epmgr = NULL;
|
||||
|
@ -58,7 +48,7 @@ protected:
|
|||
uint64_t pgs_to_list = 0;
|
||||
bool started = false;
|
||||
bool progress = true;
|
||||
bool list_first = false;
|
||||
bool list_first = false, lists_done = false;
|
||||
int log_level = 0;
|
||||
|
||||
public:
|
||||
|
@ -137,149 +127,37 @@ public:
|
|||
|
||||
void start_delete()
|
||||
{
|
||||
if (cli->st_cli.pool_config.find(pool_id) == cli->st_cli.pool_config.end())
|
||||
lister = cli->list_inode_start(inode, [this](std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)
|
||||
{
|
||||
fprintf(stderr, "Pool %u does not exist\n", pool_id);
|
||||
rm_pg_t *rm = new rm_pg_t({
|
||||
.pg_num = pg_num,
|
||||
.rm_osd_num = primary_osd,
|
||||
.objects = objects,
|
||||
.obj_count = objects.size(),
|
||||
.obj_done = 0,
|
||||
.obj_prev_done = 0,
|
||||
});
|
||||
rm->obj_pos = rm->objects.begin();
|
||||
lists.push_back(rm);
|
||||
if (list_first)
|
||||
{
|
||||
cli->list_inode_next(lister, 1);
|
||||
}
|
||||
if (status & INODE_LIST_DONE)
|
||||
{
|
||||
lists_done = true;
|
||||
}
|
||||
pgs_to_list--;
|
||||
continue_delete();
|
||||
});
|
||||
if (!lister)
|
||||
{
|
||||
fprintf(stderr, "Failed to list inode %lx objects\n", inode);
|
||||
exit(1);
|
||||
}
|
||||
auto pool_cfg = cli->st_cli.pool_config[pool_id];
|
||||
for (auto & pg_item: pool_cfg.pg_config)
|
||||
{
|
||||
auto & pg = pg_item.second;
|
||||
if (pg.pause || !pg.cur_primary || !(pg.cur_state & PG_ACTIVE))
|
||||
{
|
||||
fprintf(stderr, "PG %u is inactive, skipping\n", pg_item.first);
|
||||
continue;
|
||||
}
|
||||
rm_pg_t *r = new rm_pg_t();
|
||||
r->pg_num = pg_item.first;
|
||||
r->rm_osd_num = pg.cur_primary;
|
||||
r->state = RM_LISTING;
|
||||
if (pg.cur_state != PG_ACTIVE)
|
||||
{
|
||||
std::set<osd_num_t> all_peers;
|
||||
for (osd_num_t pg_osd: pg.target_set)
|
||||
{
|
||||
if (pg_osd != 0)
|
||||
{
|
||||
all_peers.insert(pg_osd);
|
||||
}
|
||||
}
|
||||
for (osd_num_t pg_osd: pg.all_peers)
|
||||
{
|
||||
if (pg_osd != 0)
|
||||
{
|
||||
all_peers.insert(pg_osd);
|
||||
}
|
||||
}
|
||||
for (auto & hist_item: pg.target_history)
|
||||
{
|
||||
for (auto pg_osd: hist_item)
|
||||
{
|
||||
if (pg_osd != 0)
|
||||
{
|
||||
all_peers.insert(pg_osd);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (osd_num_t peer_osd: all_peers)
|
||||
{
|
||||
r->list_osds.push_back((rm_pg_osd_t){ .pg = r, .osd_num = peer_osd, .sent = false });
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
r->list_osds.push_back((rm_pg_osd_t){ .pg = r, .osd_num = pg.cur_primary, .sent = false });
|
||||
}
|
||||
r->to_list = r->list_osds.size();
|
||||
lists.push_back(r);
|
||||
}
|
||||
std::sort(lists.begin(), lists.end(), [](rm_pg_t *a, rm_pg_t *b)
|
||||
{
|
||||
return a->rm_osd_num < b->rm_osd_num ? true : false;
|
||||
});
|
||||
pgs_to_list = lists.size();
|
||||
pgs_to_list = cli->list_pg_count(lister);
|
||||
cli->list_inode_next(lister, parallel_osds);
|
||||
started = true;
|
||||
continue_delete();
|
||||
}
|
||||
|
||||
void send_list(rm_pg_osd_t *cur_list)
|
||||
{
|
||||
if (cur_list->sent)
|
||||
{
|
||||
return;
|
||||
}
|
||||
if (cli->msgr.osd_peer_fds.find(cur_list->osd_num) ==
|
||||
cli->msgr.osd_peer_fds.end())
|
||||
{
|
||||
// Initiate connection
|
||||
cli->msgr.connect_peer(cur_list->osd_num, cli->st_cli.peer_states[cur_list->osd_num]);
|
||||
return;
|
||||
}
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->peer_fd = cli->msgr.osd_peer_fds[cur_list->osd_num];
|
||||
op->req = (osd_any_op_t){
|
||||
.sec_list = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = cli->msgr.next_subop_id++,
|
||||
.opcode = OSD_OP_SEC_LIST,
|
||||
},
|
||||
.list_pg = cur_list->pg->pg_num,
|
||||
.pg_count = (pg_num_t)cli->st_cli.pool_config[pool_id].real_pg_count,
|
||||
.pg_stripe_size = cli->st_cli.pool_config[pool_id].pg_stripe_size,
|
||||
.min_inode = inode,
|
||||
.max_inode = inode,
|
||||
},
|
||||
};
|
||||
op->callback = [this, cur_list](osd_op_t *op)
|
||||
{
|
||||
cur_list->pg->to_list--;
|
||||
if (op->reply.hdr.retval < 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to get PG %u/%u object list from OSD %lu (retval=%ld), skipping\n",
|
||||
pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (op->reply.sec_list.stable_count < op->reply.hdr.retval)
|
||||
{
|
||||
// Unstable objects, if present, mean that someone still writes into the inode. Warn the user about it.
|
||||
printf(
|
||||
"[PG %u/%u] Inode still has %lu unstable object versions - is it still open? Not a good idea to delete it.\n",
|
||||
pool_id, cur_list->pg->pg_num, op->reply.hdr.retval - op->reply.sec_list.stable_count
|
||||
);
|
||||
}
|
||||
if (log_level > 0)
|
||||
{
|
||||
printf(
|
||||
"[PG %u/%u] Got inode object list from OSD %lu: %ld object versions\n",
|
||||
pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval
|
||||
);
|
||||
}
|
||||
for (uint64_t i = 0; i < op->reply.hdr.retval; i++)
|
||||
{
|
||||
object_id oid = ((obj_ver_id*)op->buf)[i].oid;
|
||||
oid.stripe = oid.stripe & ~STRIPE_MASK;
|
||||
cur_list->pg->objects.insert(oid);
|
||||
}
|
||||
}
|
||||
delete op;
|
||||
if (cur_list->pg->to_list <= 0)
|
||||
{
|
||||
cur_list->pg->obj_done = cur_list->pg->obj_prev_done = 0;
|
||||
cur_list->pg->obj_pos = cur_list->pg->objects.begin();
|
||||
cur_list->pg->obj_count = cur_list->pg->objects.size();
|
||||
total_count += cur_list->pg->obj_count;
|
||||
total_prev_pct = 0;
|
||||
cur_list->pg->state = RM_REMOVING;
|
||||
pgs_to_list--;
|
||||
}
|
||||
continue_delete();
|
||||
};
|
||||
cli->msgr.outbox_push(op);
|
||||
cur_list->sent = true;
|
||||
}
|
||||
|
||||
void send_ops(rm_pg_t *cur_list)
|
||||
|
@ -300,11 +178,11 @@ public:
|
|||
.rw = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = cli->msgr.next_subop_id++,
|
||||
.id = cli->next_op_id(),
|
||||
.opcode = OSD_OP_DELETE,
|
||||
},
|
||||
.inode = cur_list->obj_pos->inode,
|
||||
.offset = (cur_list->obj_pos->stripe & ~STRIPE_MASK),
|
||||
.offset = cur_list->obj_pos->stripe,
|
||||
.len = 0,
|
||||
},
|
||||
};
|
||||
|
@ -313,7 +191,8 @@ public:
|
|||
cur_list->in_flight--;
|
||||
if (op->reply.hdr.retval < 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to remove object from PG %u (OSD %lu) (retval=%ld)\n",
|
||||
fprintf(stderr, "Failed to remove object %lx:%lx from PG %u (OSD %lu) (retval=%ld)\n",
|
||||
op->req.rw.inode, op->req.rw.offset,
|
||||
cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
|
||||
}
|
||||
delete op;
|
||||
|
@ -321,77 +200,41 @@ public:
|
|||
total_done++;
|
||||
continue_delete();
|
||||
};
|
||||
cli->msgr.outbox_push(op);
|
||||
cur_list->obj_pos++;
|
||||
cur_list->in_flight++;
|
||||
}
|
||||
if (!cur_list->in_flight && cur_list->obj_pos == cur_list->objects.end())
|
||||
{
|
||||
cur_list->obj_count = 0;
|
||||
cur_list->obj_done = cur_list->obj_prev_done = 0;
|
||||
cur_list->state = RM_END;
|
||||
cli->msgr.outbox_push(op);
|
||||
}
|
||||
}
|
||||
|
||||
void continue_delete()
|
||||
{
|
||||
int par_osd = 0;
|
||||
osd_num_t max_seen_osd = 0;
|
||||
bool no_del = false;
|
||||
if (list_first)
|
||||
if (list_first && !lists_done)
|
||||
{
|
||||
int i, n = 0;
|
||||
for (i = 0; i < lists.size(); i++)
|
||||
{
|
||||
if (lists[i]->state == RM_LISTING)
|
||||
{
|
||||
n++;
|
||||
}
|
||||
}
|
||||
if (n > 0)
|
||||
{
|
||||
no_del = true;
|
||||
}
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < lists.size(); i++)
|
||||
{
|
||||
if (lists[i]->state == RM_END)
|
||||
if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end())
|
||||
{
|
||||
delete lists[i];
|
||||
lists.erase(lists.begin()+i, lists.begin()+i+1);
|
||||
i--;
|
||||
}
|
||||
else if (lists[i]->rm_osd_num > max_seen_osd)
|
||||
if (!lists_done)
|
||||
{
|
||||
if (lists[i]->state == RM_LISTING)
|
||||
{
|
||||
for (int j = 0; j < lists[i]->list_osds.size(); j++)
|
||||
{
|
||||
send_list(&lists[i]->list_osds[j]);
|
||||
cli->list_inode_next(lister, 1);
|
||||
}
|
||||
}
|
||||
else if (lists[i]->state == RM_REMOVING)
|
||||
else
|
||||
{
|
||||
if (no_del)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
send_ops(lists[i]);
|
||||
}
|
||||
par_osd++;
|
||||
max_seen_osd = lists[i]->rm_osd_num;
|
||||
if (par_osd >= parallel_osds)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
|
||||
{
|
||||
printf("\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
|
||||
total_prev_pct = total_done*1000/total_count;
|
||||
}
|
||||
if (!lists.size())
|
||||
if (lists_done && !lists.size())
|
||||
{
|
||||
printf("Done, inode %lu in pool %u removed\n", (inode & ((1l << (64-POOL_ID_BITS)) - 1)), pool_id);
|
||||
exit(0);
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
. `dirname $0`/common.sh
|
||||
|
||||
OSD_SIZE=${OSD_SIZE:-1024}
|
||||
PG_COUNT=${PG_COUNT:-1}
|
||||
|
||||
dd if=/dev/zero of=./testdata/test_osd1.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
|
||||
dd if=/dev/zero of=./testdata/test_osd2.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
|
||||
|
@ -25,16 +26,16 @@ if [ -n "$GLOBAL_CONF" ]; then
|
|||
$ETCDCTL put /vitastor/config/global "$GLOBAL_CONF"
|
||||
fi
|
||||
|
||||
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1,"pg_count":1,"failure_domain":"osd"}}'
|
||||
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1,"pg_count":'$PG_COUNT',"failure_domain":"osd"}}'
|
||||
|
||||
sleep 2
|
||||
|
||||
if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(. | length) != 0 and (.[0].items["1"]["1"].osd_set | sort) == ["1","2","3"]'); then
|
||||
format_error "FAILED: 1 PG NOT CONFIGURED"
|
||||
if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(. | length) != 0 and ([ .[0].items["1"][] | select((.osd_set | sort) == ["1","2","3"]) ] | length) == '$PG_COUNT); then
|
||||
format_error "FAILED: $PG_COUNT PG(s) NOT CONFIGURED"
|
||||
fi
|
||||
|
||||
if ! ($ETCDCTL get /vitastor/pg/state/1/1 --print-value-only | jq -s -e '(. | length) != 0 and .[0].state == ["active"]'); then
|
||||
format_error "FAILED: 1 PG NOT UP"
|
||||
if ! ($ETCDCTL get /vitastor/pg/state/1/ --prefix --print-value-only | jq -s -e '[ .[] | select(.state == ["active"]) ] | length == '$PG_COUNT); then
|
||||
format_error "FAILED: $PG_COUNT PG(s) NOT UP"
|
||||
fi
|
||||
|
||||
if ! cmp build/src/block-vitastor.so /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so; then
|
||||
|
|
|
@ -0,0 +1,15 @@
|
|||
#!/bin/bash -ex
|
||||
|
||||
PG_COUNT=16
|
||||
. `dirname $0`/run_3osds.sh
|
||||
|
||||
LD_PRELOAD=libasan.so.5 \
|
||||
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 \
|
||||
-end_fsync=1 -fsync=1 -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -cluster_log_level=10
|
||||
|
||||
$ETCDCTL get --prefix '/vitastor/pg/state'
|
||||
|
||||
LD_PRELOAD=libasan.so.5 \
|
||||
build/src/vitastor-rm --etcd_address $ETCD_URL --pool 1 --inode 1
|
||||
|
||||
format_green OK
|
Loading…
Reference in New Issue