forked from vitalif/vitastor
Make rm_inode work with incomplete and degraded objects, allow to wait before deleting objects
parent
ccabbbfbcb
commit
4a17a61d1f
|
@ -399,10 +399,7 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
|
||||||
|
|
||||||
- Object deletion requests may currently lead to 'incomplete' objects if your OSDs crash during
|
- Object deletion requests may currently lead to 'incomplete' objects if your OSDs crash during
|
||||||
deletion because proper handling of object cleanup in a cluster should be "three-phase"
|
deletion because proper handling of object cleanup in a cluster should be "three-phase"
|
||||||
and it's currently not implemented. Inode removal tool currently can't handle unclean
|
and it's currently not implemented. Just to repeat the removal again in this case.
|
||||||
objects, so incomplete objects become undeletable. This will be fixed in near future
|
|
||||||
by allowing the inode removal tool to delete unclean objects. With this problem fixed
|
|
||||||
you'll be able just to repeat the removal again.
|
|
||||||
|
|
||||||
## Implementation Principles
|
## Implementation Principles
|
||||||
|
|
||||||
|
|
|
@ -94,7 +94,7 @@ struct pg_t
|
||||||
std::vector<osd_num_t> cur_set;
|
std::vector<osd_num_t> cur_set;
|
||||||
// same thing in state_dict-like format
|
// same thing in state_dict-like format
|
||||||
pg_osd_set_t cur_loc_set;
|
pg_osd_set_t cur_loc_set;
|
||||||
// moved object map. by default, each object is considered to reside on the cur_set.
|
// moved object map. by default, each object is considered to reside on cur_set.
|
||||||
// this map stores all objects that differ.
|
// this map stores all objects that differ.
|
||||||
// it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
|
// it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
|
||||||
// which is up to ~192 MB per 1 TB in the worst case scenario
|
// which is up to ~192 MB per 1 TB in the worst case scenario
|
||||||
|
|
203
rm_inode.cpp
203
rm_inode.cpp
|
@ -6,26 +6,38 @@
|
||||||
* May be included into a bigger "command-line management interface" in the future
|
* May be included into a bigger "command-line management interface" in the future
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
|
||||||
#include "epoll_manager.h"
|
#include "epoll_manager.h"
|
||||||
#include "cluster_client.h"
|
#include "cluster_client.h"
|
||||||
#include "pg_states.h"
|
#include "pg_states.h"
|
||||||
|
|
||||||
#define RM_NO_LIST 1
|
#define RM_LISTING 1
|
||||||
#define RM_LIST_SENT 2
|
#define RM_REMOVING 2
|
||||||
#define RM_REMOVING 3
|
#define RM_END 3
|
||||||
#define RM_END 4
|
|
||||||
|
|
||||||
const char *exe_name = NULL;
|
const char *exe_name = NULL;
|
||||||
|
|
||||||
|
struct rm_pg_t;
|
||||||
|
|
||||||
struct rm_pg_osd_t
|
struct rm_pg_osd_t
|
||||||
{
|
{
|
||||||
pg_num_t pg_num;
|
rm_pg_t *pg = NULL;
|
||||||
osd_num_t osd_num;
|
osd_num_t osd_num;
|
||||||
|
bool sent = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct rm_pg_t
|
||||||
|
{
|
||||||
|
pg_num_t pg_num;
|
||||||
|
osd_num_t rm_osd_num;
|
||||||
|
std::vector<rm_pg_osd_t> list_osds;
|
||||||
int state = 0;
|
int state = 0;
|
||||||
obj_ver_id *obj_list = NULL;
|
int to_list;
|
||||||
uint64_t obj_count = 0, obj_pos = 0, obj_done = 0, obj_prev_done = 0;
|
std::set<object_id> objects;
|
||||||
|
std::set<object_id>::iterator obj_pos;
|
||||||
|
uint64_t obj_count = 0, obj_done = 0, obj_prev_done = 0;
|
||||||
int in_flight = 0;
|
int in_flight = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -41,11 +53,12 @@ protected:
|
||||||
cluster_client_t *cli = NULL;
|
cluster_client_t *cli = NULL;
|
||||||
ring_consumer_t consumer;
|
ring_consumer_t consumer;
|
||||||
|
|
||||||
std::vector<rm_pg_osd_t*> lists;
|
std::vector<rm_pg_t*> lists;
|
||||||
uint64_t total_count = 0, total_done = 0, total_prev_pct = 0;
|
uint64_t total_count = 0, total_done = 0, total_prev_pct = 0;
|
||||||
uint64_t pgs_to_list = 0;
|
uint64_t pgs_to_list = 0;
|
||||||
bool started = false;
|
bool started = false;
|
||||||
bool progress = true;
|
bool progress = true;
|
||||||
|
bool list_first = false;
|
||||||
int log_level = 0;
|
int log_level = 0;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
@ -62,7 +75,7 @@ public:
|
||||||
else if (args[i][0] == '-' && args[i][1] == '-')
|
else if (args[i][0] == '-' && args[i][1] == '-')
|
||||||
{
|
{
|
||||||
const char *opt = args[i]+2;
|
const char *opt = args[i]+2;
|
||||||
cfg[opt] = !strcmp(opt, "json") || i == narg-1 ? "1" : args[++i];
|
cfg[opt] = !strcmp(opt, "json") || !strcmp(opt, "wait-list") || i == narg-1 ? "1" : args[++i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return cfg;
|
return cfg;
|
||||||
|
@ -74,7 +87,7 @@ public:
|
||||||
"Vitastor inode removal tool\n"
|
"Vitastor inode removal tool\n"
|
||||||
"(c) Vitaliy Filippov, 2020 (VNPL-1.0)\n\n"
|
"(c) Vitaliy Filippov, 2020 (VNPL-1.0)\n\n"
|
||||||
"USAGE:\n"
|
"USAGE:\n"
|
||||||
" %s --etcd_address <etcd_address> --pool <pool> --inode <inode>\n",
|
" %s --etcd_address <etcd_address> --pool <pool> --inode <inode> [--wait-list]\n",
|
||||||
exe_name
|
exe_name
|
||||||
);
|
);
|
||||||
exit(0);
|
exit(0);
|
||||||
|
@ -105,6 +118,7 @@ public:
|
||||||
parallel_osds = 4;
|
parallel_osds = 4;
|
||||||
log_level = cfg["log_level"].int64_value();
|
log_level = cfg["log_level"].int64_value();
|
||||||
progress = cfg["progress"].uint64_value() ? true : false;
|
progress = cfg["progress"].uint64_value() ? true : false;
|
||||||
|
list_first = cfg["wait-list"].uint64_value() ? true : false;
|
||||||
// Create client
|
// Create client
|
||||||
ringloop = new ring_loop_t(512);
|
ringloop = new ring_loop_t(512);
|
||||||
epmgr = new epoll_manager_t(ringloop);
|
epmgr = new epoll_manager_t(ringloop);
|
||||||
|
@ -137,21 +151,57 @@ public:
|
||||||
for (auto & pg_item: pool_cfg.pg_config)
|
for (auto & pg_item: pool_cfg.pg_config)
|
||||||
{
|
{
|
||||||
auto & pg = pg_item.second;
|
auto & pg = pg_item.second;
|
||||||
if (pg.pause || !pg.cur_primary || pg.cur_state != PG_ACTIVE)
|
if (pg.pause || !pg.cur_primary || !(pg.cur_state & PG_ACTIVE))
|
||||||
{
|
{
|
||||||
// FIXME Support deletion in non-clean active PGs by introducing a "primary-list" command
|
fprintf(stderr, "PG %u is inactive, skipping\n", pg_item.first);
|
||||||
fprintf(stderr, "PG %u is not active+clean, skipping\n", pg_item.first);
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
rm_pg_osd_t *r = new rm_pg_osd_t();
|
rm_pg_t *r = new rm_pg_t();
|
||||||
r->pg_num = pg_item.first;
|
r->pg_num = pg_item.first;
|
||||||
r->osd_num = pg.cur_primary;
|
r->rm_osd_num = pg.cur_primary;
|
||||||
r->state = RM_NO_LIST;
|
r->state = RM_LISTING;
|
||||||
|
if (pg.cur_state != PG_ACTIVE)
|
||||||
|
{
|
||||||
|
std::set<osd_num_t> all_peers;
|
||||||
|
for (osd_num_t pg_osd: pg.target_set)
|
||||||
|
{
|
||||||
|
if (pg_osd != 0)
|
||||||
|
{
|
||||||
|
all_peers.insert(pg_osd);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (osd_num_t pg_osd: pg.all_peers)
|
||||||
|
{
|
||||||
|
if (pg_osd != 0)
|
||||||
|
{
|
||||||
|
all_peers.insert(pg_osd);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto & hist_item: pg.target_history)
|
||||||
|
{
|
||||||
|
for (auto pg_osd: hist_item)
|
||||||
|
{
|
||||||
|
if (pg_osd != 0)
|
||||||
|
{
|
||||||
|
all_peers.insert(pg_osd);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (osd_num_t peer_osd: all_peers)
|
||||||
|
{
|
||||||
|
r->list_osds.push_back((rm_pg_osd_t){ .pg = r, .osd_num = peer_osd, .sent = false });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
r->list_osds.push_back((rm_pg_osd_t){ .pg = r, .osd_num = pg.cur_primary, .sent = false });
|
||||||
|
}
|
||||||
|
r->to_list = r->list_osds.size();
|
||||||
lists.push_back(r);
|
lists.push_back(r);
|
||||||
}
|
}
|
||||||
std::sort(lists.begin(), lists.end(), [](rm_pg_osd_t *a, rm_pg_osd_t *b)
|
std::sort(lists.begin(), lists.end(), [](rm_pg_t *a, rm_pg_t *b)
|
||||||
{
|
{
|
||||||
return a->osd_num < b->osd_num ? true : false;
|
return a->rm_osd_num < b->rm_osd_num ? true : false;
|
||||||
});
|
});
|
||||||
pgs_to_list = lists.size();
|
pgs_to_list = lists.size();
|
||||||
started = true;
|
started = true;
|
||||||
|
@ -160,6 +210,10 @@ public:
|
||||||
|
|
||||||
void send_list(rm_pg_osd_t *cur_list)
|
void send_list(rm_pg_osd_t *cur_list)
|
||||||
{
|
{
|
||||||
|
if (cur_list->sent)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (cli->msgr.osd_peer_fds.find(cur_list->osd_num) ==
|
if (cli->msgr.osd_peer_fds.find(cur_list->osd_num) ==
|
||||||
cli->msgr.osd_peer_fds.end())
|
cli->msgr.osd_peer_fds.end())
|
||||||
{
|
{
|
||||||
|
@ -177,7 +231,7 @@ public:
|
||||||
.id = cli->msgr.next_subop_id++,
|
.id = cli->msgr.next_subop_id++,
|
||||||
.opcode = OSD_OP_SEC_LIST,
|
.opcode = OSD_OP_SEC_LIST,
|
||||||
},
|
},
|
||||||
.list_pg = cur_list->pg_num,
|
.list_pg = cur_list->pg->pg_num,
|
||||||
.pg_count = (pg_num_t)cli->st_cli.pool_config[pool_id].real_pg_count,
|
.pg_count = (pg_num_t)cli->st_cli.pool_config[pool_id].real_pg_count,
|
||||||
.pg_stripe_size = cli->st_cli.pool_config[pool_id].pg_stripe_size,
|
.pg_stripe_size = cli->st_cli.pool_config[pool_id].pg_stripe_size,
|
||||||
.min_inode = inode,
|
.min_inode = inode,
|
||||||
|
@ -186,53 +240,67 @@ public:
|
||||||
};
|
};
|
||||||
op->callback = [this, cur_list](osd_op_t *op)
|
op->callback = [this, cur_list](osd_op_t *op)
|
||||||
{
|
{
|
||||||
pgs_to_list--;
|
cur_list->pg->to_list--;
|
||||||
if (op->reply.hdr.retval < 0)
|
if (op->reply.hdr.retval < 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to get object list from OSD %lu (retval=%ld), skipping the PG\n",
|
fprintf(stderr, "Failed to get PG %u/%u object list from OSD %lu (retval=%ld), skipping\n",
|
||||||
cur_list->osd_num, op->reply.hdr.retval);
|
pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval);
|
||||||
cli->msgr.stop_client(cur_list->osd_num);
|
|
||||||
delete op;
|
|
||||||
cur_list->state = RM_END;
|
|
||||||
continue_delete();
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
if (log_level > 0)
|
else
|
||||||
{
|
{
|
||||||
printf(
|
if (op->reply.sec_list.stable_count < op->reply.hdr.retval)
|
||||||
"[PG %u/%u] Got inode object list from OSD %lu: %ld object versions\n",
|
{
|
||||||
pool_id, cur_list->pg_num, cur_list->osd_num, op->reply.hdr.retval
|
// Unstable objects, if present, mean that someone still writes into the inode. Warn the user about it.
|
||||||
);
|
printf(
|
||||||
|
"[PG %u/%u] Inode still has %lu unstable object versions - is it still open? Not a good idea to delete it.\n",
|
||||||
|
pool_id, cur_list->pg->pg_num, op->reply.hdr.retval - op->reply.sec_list.stable_count
|
||||||
|
);
|
||||||
|
}
|
||||||
|
if (log_level > 0)
|
||||||
|
{
|
||||||
|
printf(
|
||||||
|
"[PG %u/%u] Got inode object list from OSD %lu: %ld object versions\n",
|
||||||
|
pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval
|
||||||
|
);
|
||||||
|
}
|
||||||
|
for (uint64_t i = 0; i < op->reply.hdr.retval; i++)
|
||||||
|
{
|
||||||
|
object_id oid = ((obj_ver_id*)op->buf)[i].oid;
|
||||||
|
oid.stripe = oid.stripe & ~STRIPE_MASK;
|
||||||
|
cur_list->pg->objects.insert(oid);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
cur_list->obj_list = (obj_ver_id*)op->buf;
|
|
||||||
cur_list->obj_count = (uint64_t)op->reply.hdr.retval;
|
|
||||||
cur_list->obj_done = cur_list->obj_prev_done = cur_list->obj_pos = 0;
|
|
||||||
total_count += cur_list->obj_count;
|
|
||||||
total_prev_pct = 0;
|
|
||||||
// set op->buf to NULL so it doesn't get freed
|
|
||||||
op->buf = NULL;
|
|
||||||
delete op;
|
delete op;
|
||||||
cur_list->state = RM_REMOVING;
|
if (cur_list->pg->to_list <= 0)
|
||||||
|
{
|
||||||
|
cur_list->pg->obj_done = cur_list->pg->obj_prev_done = 0;
|
||||||
|
cur_list->pg->obj_pos = cur_list->pg->objects.begin();
|
||||||
|
cur_list->pg->obj_count = cur_list->pg->objects.size();
|
||||||
|
total_count += cur_list->pg->obj_count;
|
||||||
|
total_prev_pct = 0;
|
||||||
|
cur_list->pg->state = RM_REMOVING;
|
||||||
|
pgs_to_list--;
|
||||||
|
}
|
||||||
continue_delete();
|
continue_delete();
|
||||||
};
|
};
|
||||||
cur_list->state = RM_LIST_SENT;
|
|
||||||
cli->msgr.outbox_push(op);
|
cli->msgr.outbox_push(op);
|
||||||
|
cur_list->sent = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void send_ops(rm_pg_osd_t *cur_list)
|
void send_ops(rm_pg_t *cur_list)
|
||||||
{
|
{
|
||||||
if (cli->msgr.osd_peer_fds.find(cur_list->osd_num) ==
|
if (cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
|
||||||
cli->msgr.osd_peer_fds.end())
|
cli->msgr.osd_peer_fds.end())
|
||||||
{
|
{
|
||||||
// Initiate connection
|
// Initiate connection
|
||||||
cli->msgr.connect_peer(cur_list->osd_num, cli->st_cli.peer_states[cur_list->osd_num]);
|
cli->msgr.connect_peer(cur_list->rm_osd_num, cli->st_cli.peer_states[cur_list->rm_osd_num]);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
while (cur_list->in_flight < iodepth && cur_list->obj_pos < cur_list->obj_count)
|
while (cur_list->in_flight < iodepth && cur_list->obj_pos != cur_list->objects.end())
|
||||||
{
|
{
|
||||||
osd_op_t *op = new osd_op_t();
|
osd_op_t *op = new osd_op_t();
|
||||||
op->op_type = OSD_OP_OUT;
|
op->op_type = OSD_OP_OUT;
|
||||||
op->peer_fd = cli->msgr.osd_peer_fds[cur_list->osd_num];
|
op->peer_fd = cli->msgr.osd_peer_fds[cur_list->rm_osd_num];
|
||||||
op->req = (osd_any_op_t){
|
op->req = (osd_any_op_t){
|
||||||
.rw = {
|
.rw = {
|
||||||
.header = {
|
.header = {
|
||||||
|
@ -240,8 +308,8 @@ public:
|
||||||
.id = cli->msgr.next_subop_id++,
|
.id = cli->msgr.next_subop_id++,
|
||||||
.opcode = OSD_OP_DELETE,
|
.opcode = OSD_OP_DELETE,
|
||||||
},
|
},
|
||||||
.inode = cur_list->obj_list[cur_list->obj_pos].oid.inode,
|
.inode = cur_list->obj_pos->inode,
|
||||||
.offset = (cur_list->obj_list[cur_list->obj_pos].oid.stripe & ~STRIPE_MASK),
|
.offset = (cur_list->obj_pos->stripe & ~STRIPE_MASK),
|
||||||
.len = 0,
|
.len = 0,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
@ -251,7 +319,7 @@ public:
|
||||||
if (op->reply.hdr.retval < 0)
|
if (op->reply.hdr.retval < 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to remove object from PG %u (OSD %lu) (retval=%ld)\n",
|
fprintf(stderr, "Failed to remove object from PG %u (OSD %lu) (retval=%ld)\n",
|
||||||
cur_list->pg_num, cur_list->osd_num, op->reply.hdr.retval);
|
cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
|
||||||
}
|
}
|
||||||
delete op;
|
delete op;
|
||||||
cur_list->obj_done++;
|
cur_list->obj_done++;
|
||||||
|
@ -262,12 +330,10 @@ public:
|
||||||
cur_list->obj_pos++;
|
cur_list->obj_pos++;
|
||||||
cur_list->in_flight++;
|
cur_list->in_flight++;
|
||||||
}
|
}
|
||||||
if (!cur_list->in_flight && cur_list->obj_pos >= cur_list->obj_count)
|
if (!cur_list->in_flight && cur_list->obj_pos == cur_list->objects.end())
|
||||||
{
|
{
|
||||||
free(cur_list->obj_list);
|
|
||||||
cur_list->obj_list = NULL;
|
|
||||||
cur_list->obj_count = 0;
|
cur_list->obj_count = 0;
|
||||||
cur_list->obj_done = cur_list->obj_prev_done = cur_list->obj_pos = 0;
|
cur_list->obj_done = cur_list->obj_prev_done = 0;
|
||||||
cur_list->state = RM_END;
|
cur_list->state = RM_END;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -276,6 +342,22 @@ public:
|
||||||
{
|
{
|
||||||
int par_osd = 0;
|
int par_osd = 0;
|
||||||
osd_num_t max_seen_osd = 0;
|
osd_num_t max_seen_osd = 0;
|
||||||
|
bool no_del = false;
|
||||||
|
if (list_first)
|
||||||
|
{
|
||||||
|
int i, n = 0;
|
||||||
|
for (i = 0; i < lists.size(); i++)
|
||||||
|
{
|
||||||
|
if (lists[i]->state == RM_LISTING)
|
||||||
|
{
|
||||||
|
n++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (n > 0)
|
||||||
|
{
|
||||||
|
no_del = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
for (int i = 0; i < lists.size(); i++)
|
for (int i = 0; i < lists.size(); i++)
|
||||||
{
|
{
|
||||||
if (lists[i]->state == RM_END)
|
if (lists[i]->state == RM_END)
|
||||||
|
@ -284,18 +366,25 @@ public:
|
||||||
lists.erase(lists.begin()+i, lists.begin()+i+1);
|
lists.erase(lists.begin()+i, lists.begin()+i+1);
|
||||||
i--;
|
i--;
|
||||||
}
|
}
|
||||||
else if (lists[i]->osd_num > max_seen_osd)
|
else if (lists[i]->rm_osd_num > max_seen_osd)
|
||||||
{
|
{
|
||||||
if (lists[i]->state == RM_NO_LIST)
|
if (lists[i]->state == RM_LISTING)
|
||||||
{
|
{
|
||||||
send_list(lists[i]);
|
for (int j = 0; j < lists[i]->list_osds.size(); j++)
|
||||||
|
{
|
||||||
|
send_list(&lists[i]->list_osds[j]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if (lists[i]->state == RM_REMOVING)
|
else if (lists[i]->state == RM_REMOVING)
|
||||||
{
|
{
|
||||||
|
if (no_del)
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
send_ops(lists[i]);
|
send_ops(lists[i]);
|
||||||
}
|
}
|
||||||
par_osd++;
|
par_osd++;
|
||||||
max_seen_osd = lists[i]->osd_num;
|
max_seen_osd = lists[i]->rm_osd_num;
|
||||||
if (par_osd >= parallel_osds)
|
if (par_osd >= parallel_osds)
|
||||||
{
|
{
|
||||||
break;
|
break;
|
||||||
|
|
Loading…
Reference in New Issue