Compare commits

..

2 Commits

23 changed files with 649 additions and 136 deletions

View File

@ -10,18 +10,25 @@ function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_p
if (!new_pg_history[new_pg]) if (!new_pg_history[new_pg])
{ {
new_pg_history[new_pg] = { new_pg_history[new_pg] = {
osd_sets: {}, osd_set_epochs: {},
all_peers: {}, all_peers: {},
epoch: 0, epoch: 0,
}; };
} }
const nh = new_pg_history[new_pg], oh = prev_pg_history[old_pg]; const nh = new_pg_history[new_pg], oh = prev_pg_history[old_pg];
nh.osd_sets[prev_pgs[old_pg].join(' ')] = prev_pgs[old_pg]; nh.osd_set_epochs[prev_pgs[old_pg].join(' ')] = { osd_set: prev_pgs[old_pg] };
if (oh && oh.osd_sets && oh.osd_sets.length) if (oh && oh.osd_sets && oh.osd_sets.length)
{ {
for (const pg of oh.osd_sets) for (const pg of oh.osd_sets)
{ {
nh.osd_sets[pg.join(' ')] = pg; nh.osd_set_epochs[pg.join(' ')] = { osd_set: pg };
}
}
if (oh && oh.osd_set_epochs && oh.osd_set_epochs.length)
{
for (const pg of oh.osd_set_epochs)
{
nh.osd_set_epochs[pg.osd_set.join(' ')] = { osd_set: pg.osd_set };
} }
} }
if (oh && oh.all_peers && oh.all_peers.length) if (oh && oh.all_peers && oh.all_peers.length)
@ -39,7 +46,7 @@ function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_p
function finish_pg_history(merged_history) function finish_pg_history(merged_history)
{ {
merged_history.osd_sets = Object.values(merged_history.osd_sets); merged_history.osd_set_epochs = Object.values(merged_history.osd_set_epochs);
merged_history.all_peers = Object.values(merged_history.all_peers); merged_history.all_peers = Object.values(merged_history.all_peers);
} }

View File

@ -276,7 +276,12 @@ const etcd_tree = {
history: { history: {
/* <pool_id>: { /* <pool_id>: {
<pg_id>: { <pg_id>: {
osd_sets: osd_num_t[][], osd_set_epochs: {
osd_set: osd_num_t[],
min_epoch: uint64_t,
max_epoch: uint64_t,
}[],
osd_sets: osd_num_t[][], // outdated
all_peers: osd_num_t[], all_peers: osd_num_t[],
epoch: uint64_t, epoch: uint64_t,
}, },
@ -947,18 +952,6 @@ class Mon
osd_set, osd_set,
primary: this.pick_primary(pool_id, osd_set, up_osds, aff_osds), primary: this.pick_primary(pool_id, osd_set, up_osds, aff_osds),
}; };
if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' ') &&
prev_pgs[i].filter(osd_num => osd_num).length > 0)
{
pg_history[i] = pg_history[i] || {};
pg_history[i].osd_sets = pg_history[i].osd_sets || [];
pg_history[i].osd_sets.push(prev_pgs[i]);
}
if (pg_history[i] && pg_history[i].osd_sets)
{
pg_history[i].osd_sets = Object.values(pg_history[i].osd_sets
.reduce((a, c) => { a[c.join(' ')] = c; return a; }, {}));
}
}); });
for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++) for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++)
{ {

View File

@ -95,7 +95,7 @@ inode_list_t* cluster_client_t::list_inode_start(inode_t inode,
} }
for (auto & hist_item: pg.target_history) for (auto & hist_item: pg.target_history)
{ {
for (auto pg_osd: hist_item) for (auto pg_osd: hist_item.osd_set)
{ {
if (pg_osd != 0) if (pg_osd != 0)
{ {
@ -105,11 +105,14 @@ inode_list_t* cluster_client_t::list_inode_start(inode_t inode,
} }
for (osd_num_t peer_osd: all_peers) for (osd_num_t peer_osd: all_peers)
{ {
r->list_osds.push_back((inode_list_osd_t){ if (st_cli.peer_states.find(peer_osd) != st_cli.peer_states.end())
.pg = r, {
.osd_num = peer_osd, r->list_osds.push_back((inode_list_osd_t){
.sent = false, .pg = r,
}); .osd_num = peer_osd,
.sent = false,
});
}
} }
} }
else else

View File

@ -845,7 +845,27 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
{ {
history_set.push_back(pg_osd.uint64_value()); history_set.push_back(pg_osd.uint64_value());
} }
pg_cfg.target_history.push_back(history_set); pg_cfg.target_history.push_back((pg_history_set_t){ .osd_set = history_set });
}
// Newer format with epochs
for (auto hist_item: value["osd_set_epochs"].array_items())
{
pg_history_set_t history_set;
history_set.min_epoch = hist_item["min_epoch"].uint64_value();
history_set.max_epoch = hist_item["max_epoch"].uint64_value();
if (history_set.max_epoch < history_set.min_epoch)
{
history_set.max_epoch = 0;
history_set.min_epoch = 0;
}
for (auto pg_osd: hist_item["osd_set"].array_items())
{
history_set.osd_set.push_back(pg_osd.uint64_value());
}
if (history_set.max_epoch || history_set.osd_set.size())
{
pg_cfg.target_history.push_back(history_set);
}
} }
// Include these additional OSDs when peering the PG // Include these additional OSDs when peering the PG
for (auto pg_osd: value["all_peers"].array_items()) for (auto pg_osd: value["all_peers"].array_items())

View File

@ -26,7 +26,7 @@ struct pg_config_t
bool exists; bool exists;
osd_num_t primary; osd_num_t primary;
std::vector<osd_num_t> target_set; std::vector<osd_num_t> target_set;
std::vector<std::vector<osd_num_t>> target_history; std::vector<pg_history_set_t> target_history;
std::vector<osd_num_t> all_peers; std::vector<osd_num_t> all_peers;
bool pause; bool pause;
osd_num_t cur_primary; osd_num_t cur_primary;

View File

@ -36,6 +36,7 @@ struct sec_data
/* The list of completed io_u structs. */ /* The list of completed io_u structs. */
std::vector<io_u*> completed; std::vector<io_u*> completed;
uint64_t inflight = 0; uint64_t inflight = 0;
int mirror_fd = -1;
bool trace = false; bool trace = false;
}; };
@ -46,6 +47,7 @@ struct sec_options
char *etcd_host = NULL; char *etcd_host = NULL;
char *etcd_prefix = NULL; char *etcd_prefix = NULL;
char *image = NULL; char *image = NULL;
char *mirror_file = NULL;
uint64_t pool = 0; uint64_t pool = 0;
uint64_t inode = 0; uint64_t inode = 0;
int cluster_log = 0; int cluster_log = 0;
@ -132,6 +134,15 @@ static struct fio_option options[] = {
.category = FIO_OPT_C_ENGINE, .category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME, .group = FIO_OPT_G_FILENAME,
}, },
{
.name = "mirror_file",
.lname = "File name to mirror writes to",
.type = FIO_OPT_STR_STORE,
.off1 = offsetof(struct sec_options, mirror_file),
.help = "File name to mirror writes to (for debug purpose)",
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{ {
.name = "use_rdma", .name = "use_rdma",
.lname = "Use RDMA", .lname = "Use RDMA",
@ -212,6 +223,16 @@ static int sec_setup(struct thread_data *td)
td->o.open_files++; td->o.open_files++;
} }
if (o->mirror_file)
{
bsd->mirror_fd = open(o->mirror_file, O_CREAT|O_RDWR, 0666);
if (bsd->mirror_fd < 0)
{
td_verror(td, errno, "open mirror file");
return 1;
}
}
if (!o->image) if (!o->image)
{ {
if (!(o->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1))) if (!(o->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)))
@ -265,6 +286,10 @@ static void sec_cleanup(struct thread_data *td)
sec_data *bsd = (sec_data*)td->io_ops_data; sec_data *bsd = (sec_data*)td->io_ops_data;
if (bsd) if (bsd)
{ {
if (bsd->mirror_fd >= 0)
{
close(bsd->mirror_fd);
}
if (bsd->watch) if (bsd->watch)
{ {
vitastor_c_close_watch(bsd->cli, bsd->watch); vitastor_c_close_watch(bsd->cli, bsd->watch);
@ -325,6 +350,24 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
bsd->last_sync = false; bsd->last_sync = false;
break; break;
case DDIR_WRITE: case DDIR_WRITE:
if (opt->mirror_file)
{
size_t done = 0;
while (done < io->xfer_buflen)
{
ssize_t r = pwrite(bsd->mirror_fd, io->xfer_buf+done, io->xfer_buflen-done, io->offset+done);
if (r < 0 && errno != EAGAIN)
{
fprintf(stderr, "Error writing mirror file: %s\n", strerror(errno));
io->error = errno;
return FIO_Q_COMPLETED;
}
if (r > 0)
{
done += r;
}
}
}
if (opt->image && vitastor_c_inode_get_readonly(bsd->watch)) if (opt->image && vitastor_c_inode_get_readonly(bsd->watch))
{ {
io->error = EROFS; io->error = EROFS;

View File

@ -333,7 +333,10 @@ void osd_t::on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num)
{ {
oid = op.first; oid = op.first;
first = false; first = false;
continue_primary_write(op.second); if (op.second->req.hdr.opcode == OSD_OP_DELETE)
continue_primary_del(op.second);
else
continue_primary_write(op.second);
} }
} }
} }
@ -608,7 +611,7 @@ void osd_t::apply_pg_config()
} }
for (auto & hist_item: pg_cfg.target_history) for (auto & hist_item: pg_cfg.target_history)
{ {
for (auto pg_osd: hist_item) for (auto pg_osd: hist_item.osd_set)
{ {
if (pg_osd != 0) if (pg_osd != 0)
{ {
@ -799,11 +802,40 @@ void osd_t::report_pg_states()
// Prevent race conditions (for the case when the monitor is updating this key at the same time) // Prevent race conditions (for the case when the monitor is updating this key at the same time)
pg.history_changed = false; pg.history_changed = false;
std::string history_key = base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pool_id)+"/"+std::to_string(pg.pg_num)); std::string history_key = base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pool_id)+"/"+std::to_string(pg.pg_num));
json11::Json::array target_history;
for (auto & pgh: pg.target_history)
{
target_history.push_back(json11::Json::object {
{ "osd_set", pgh.osd_set },
{ "min_epoch", pgh.min_epoch },
{ "max_epoch", pgh.max_epoch },
});
}
std::vector<osd_num_t> all_peers;
for (auto peer_osd: pg.all_peers)
{
bool found = false;
for (auto target_peer: pg.target_set)
{
if (target_peer == peer_osd)
{
found = true;
break;
}
}
if (!found)
{
all_peers.push_back(peer_osd);
}
}
json11::Json::object history_value = { json11::Json::object history_value = {
{ "epoch", pg.epoch }, { "epoch", pg.epoch },
{ "all_peers", pg.all_peers }, { "osd_set_epochs", target_history },
{ "osd_sets", pg.target_history },
}; };
if (all_peers.size())
{
history_value["all_peers"] = all_peers;
}
checks.push_back(json11::Json::object { checks.push_back(json11::Json::object {
{ "target", "MOD" }, { "target", "MOD" },
{ "key", history_key }, { "key", history_key },

View File

@ -268,6 +268,25 @@ bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
void osd_t::submit_recovery_op(osd_recovery_op_t *op) void osd_t::submit_recovery_op(osd_recovery_op_t *op)
{ {
// Check if the object is deleted
bool is_deleted = false;
pool_id_t pool_id = INODE_POOL(op->oid.inode);
auto pool_cfg_it = st_cli.pool_config.find(pool_id);
if (pool_cfg_it != st_cli.pool_config.end())
{
pg_num_t pg_num = (op->oid.stripe/pool_cfg_it->second.pg_stripe_size) % pg_counts[pool_id] + 1; // like map_to_pg()
auto pg_it = pgs.find({ .pool_id = pool_id, .pg_num = pg_num });
if (pg_it != pgs.end())
{
pg_osd_set_state_t *object_state;
get_object_osd_set(pg_it->second, op->oid, pg_it->second.cur_set.data(), &object_state);
if (object_state && (object_state->state & OBJ_DELETED))
{
// Object is deleted, but not from all OSDs - delete remaining copies
is_deleted = true;
}
}
}
op->osd_op = new osd_op_t(); op->osd_op = new osd_op_t();
op->osd_op->op_type = OSD_OP_OUT; op->osd_op->op_type = OSD_OP_OUT;
op->osd_op->req = (osd_any_op_t){ op->osd_op->req = (osd_any_op_t){
@ -275,7 +294,7 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
.header = { .header = {
.magic = SECONDARY_OSD_OP_MAGIC, .magic = SECONDARY_OSD_OP_MAGIC,
.id = 1, .id = 1,
.opcode = OSD_OP_WRITE, .opcode = (uint64_t)(is_deleted ? OSD_OP_DELETE : OSD_OP_WRITE),
}, },
.inode = op->oid.inode, .inode = op->oid.inode,
.offset = op->oid.stripe, .offset = op->oid.stripe,

View File

@ -3,6 +3,8 @@
#pragma once #pragma once
#include <vector>
#define POOL_SCHEME_REPLICATED 1 #define POOL_SCHEME_REPLICATED 1
#define POOL_SCHEME_XOR 2 #define POOL_SCHEME_XOR 2
#define POOL_SCHEME_JERASURE 3 #define POOL_SCHEME_JERASURE 3
@ -28,3 +30,9 @@ inline bool operator < (const pool_pg_num_t & a, const pool_pg_num_t & b)
{ {
return a.pool_id < b.pool_id || a.pool_id == b.pool_id && a.pg_num < b.pg_num; return a.pool_id < b.pool_id || a.pool_id == b.pool_id && a.pg_num < b.pg_num;
} }
struct pg_history_set_t
{
std::vector<osd_num_t> osd_set;
uint64_t min_epoch, max_epoch;
};

View File

@ -228,7 +228,7 @@ void osd_t::start_pg_peering(pg_t & pg)
for (auto & history_set: pg.target_history) for (auto & history_set: pg.target_history)
{ {
bool found = true; bool found = true;
for (auto history_osd: history_set) for (auto history_osd: history_set.osd_set)
{ {
if (history_osd != 0) if (history_osd != 0)
{ {
@ -539,47 +539,114 @@ void osd_t::finish_stop_pg(pg_t & pg)
report_pg_state(pg); report_pg_state(pg);
} }
static int count_nonzero_osds(const std::vector<osd_num_t> & v)
{
int n = 0;
for (auto & osd_num: v)
{
if (osd_num != 0)
{
n++;
}
}
return n;
}
void osd_t::report_pg_state(pg_t & pg) void osd_t::report_pg_state(pg_t & pg)
{ {
pg.print_state(); pg.print_state();
this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }); this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
if (pg.state == PG_ACTIVE && (pg.target_history.size() > 0 || pg.all_peers.size() > pg.target_set.size())) if ((pg.state == PG_ACTIVE || pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD)) &&
(pg.target_history.size() != 1 ||
pg.target_history[0].osd_set != pg.target_set ||
pg.target_history[0].min_epoch != 0 ||
pg.target_history[0].max_epoch != pg.epoch ||
pg.all_peers.size() > count_nonzero_osds(pg.target_set)))
{ {
// Clear history of active+clean PGs // Clear history of active+clean PGs
pg.history_changed = true; pg.history_changed = true;
pg.target_history.clear(); pg.target_history.clear();
pg.all_peers = pg.target_set; pg.target_history.push_back((pg_history_set_t){
pg.cur_peers = pg.target_set; .osd_set = pg.cur_set,
} .min_epoch = 0,
else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD)) .max_epoch = pg.epoch,
{ });
// Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers if (pg.state == PG_ACTIVE)
pg.history_changed = true;
pg.target_history.clear();
std::set<osd_num_t> dead_peers;
for (auto pg_osd: pg.all_peers)
{ {
dead_peers.insert(pg_osd); pg.all_peers.clear();
} for (auto pg_osd: pg.target_set)
for (auto pg_osd: pg.cur_peers)
{
dead_peers.erase(pg_osd);
}
for (auto pg_osd: pg.target_set)
{
if (pg_osd)
{ {
dead_peers.insert(pg_osd); if (pg_osd)
pg.all_peers.push_back(pg_osd);
} }
} }
pg.all_peers.clear(); else
pg.all_peers.insert(pg.all_peers.begin(), dead_peers.begin(), dead_peers.end()); {
// Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers
std::set<osd_num_t> dead_peers(pg.all_peers.begin(), pg.all_peers.end());
for (auto pg_osd: pg.cur_peers)
{
dead_peers.erase(pg_osd);
}
for (auto pg_osd: pg.target_set)
{
if (pg_osd)
dead_peers.insert(pg_osd);
}
pg.all_peers.clear();
pg.all_peers.insert(pg.all_peers.begin(), dead_peers.begin(), dead_peers.end());
}
pg.cur_peers.clear(); pg.cur_peers.clear();
for (auto pg_osd: pg.target_set) for (auto pg_osd: pg.target_set)
{ {
if (pg_osd) if (pg_osd)
{
pg.cur_peers.push_back(pg_osd); pg.cur_peers.push_back(pg_osd);
}
}
if (pg.history_changed)
{
bool epoch_already_reported = false;
int max_epoch_pos = -1;
for (int i = pg.target_history.size()-1; i >= 0; i--)
{
if (pg.target_history[i].min_epoch > pg.epoch)
{
printf("[PG %u/%u] Invalid PG history: there is an entry with min_epoch (%lu) > current epoch (%lu)\n",
pg.pool_id, pg.pg_num, pg.target_history[i].min_epoch, pg.epoch);
force_stop(1);
return;
}
if (max_epoch_pos < 0 || pg.target_history[i].max_epoch > pg.target_history[max_epoch_pos].max_epoch)
{
max_epoch_pos = i;
}
if (pg.target_history[i].min_epoch <= pg.epoch &&
pg.target_history[i].max_epoch >= pg.epoch)
{
if (pg.target_history[i].osd_set != pg.cur_set)
{
printf("[PG %u/%u] Invalid target_history: epoch %lu has another OSD set already registered\n", pg.pool_id, pg.pg_num, pg.epoch);
force_stop(1);
return;
}
// Already reported
epoch_already_reported = true;
break;
}
}
if (!epoch_already_reported)
{
if (max_epoch_pos >= 0 && pg.target_history[max_epoch_pos].osd_set == pg.cur_set)
{
pg.target_history[max_epoch_pos].max_epoch = pg.epoch;
}
else
{
pg.target_history.push_back((pg_history_set_t){
.osd_set = pg.cur_set,
.min_epoch = pg.epoch,
.max_epoch = pg.epoch,
});
} }
} }
} }

View File

@ -52,6 +52,7 @@ struct pg_obj_state_check_t
void walk(); void walk();
void start_object(); void start_object();
void recheck_version_osd_set();
void handle_version(); void handle_version();
void finish_object(); void finish_object();
}; };
@ -84,27 +85,19 @@ void pg_obj_state_check_t::walk()
pg->state = PG_INCOMPLETE | PG_HAS_INVALID; pg->state = PG_INCOMPLETE | PG_HAS_INVALID;
return; return;
} }
// Activate PG
if (pg->pg_cursize < pg->pg_size) if (pg->pg_cursize < pg->pg_size)
{ {
// Report PG history and activate // History will be reported on first write
pg->state |= PG_DEGRADED | PG_PEERED; pg->state |= PG_DEGRADED | PG_PEERED;
std::vector<osd_num_t> history_set;
for (auto peer_osd: pg->cur_set)
{
if (peer_osd != 0)
{
history_set.push_back(peer_osd);
}
}
pg->target_history.push_back(history_set);
pg->history_changed = true;
} }
else else
{ {
// Just activate
pg->state |= PG_ACTIVE; pg->state |= PG_ACTIVE;
// Clear history
pg->history_changed = true;
} }
if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size()) if (pg->cur_peers.size() < pg->all_peers.size())
{ {
pg->state |= PG_LEFT_ON_DEAD; pg->state |= PG_LEFT_ON_DEAD;
} }
@ -121,13 +114,82 @@ void pg_obj_state_check_t::start_object()
n_unstable = n_invalid = 0; n_unstable = n_invalid = 0;
} }
void pg_obj_state_check_t::recheck_version_osd_set()
{
uint64_t epoch = (last_ver >> (64-PG_EPOCH_BITS));
if (!pg->epoch_sizes_differ && n_copies >= pg->pg_size)
{
// Enough copies
return;
}
auto epoch_it = pg->target_by_epoch.lower_bound(epoch);
if (epoch_it == pg->target_by_epoch.end() || epoch_it->second.min_epoch > epoch)
{
// Epoch info not found
return;
}
if (pg->epoch_sizes_differ && n_copies >= epoch_it->second.osd_set.size())
{
// For the (unlikely) case of PG size change - enough copies
return;
}
// Recheck version against the OSD set corresponding to epoch if it's known
if (epoch_it != pg->target_by_epoch.end() && epoch_it->second.min_epoch <= epoch)
{
for (int j = 0; j < epoch_it->second.osd_set.size(); j++)
{
osd_num_t cur_osd = epoch_it->second.osd_set[j];
bool found = false;
for (int i = ver_start; i < ver_end; i++)
{
if (cur_osd == list[i].osd_num)
{
found = true;
break;
}
}
if (!found)
{
// Check if a newer version is present on the same OSD and masks the older one
// It happens for overwritten replicas in the following case:
// Version 1 is present on OSD 1,2,3
// Client tries to write Version 2
// OSD 3 succeeds to write Version 2, others don't. OSD 3 crashes, then starts again
// OSD 1 sees: version 1 on OSD 1,2 and version 2 on OSD 3
// (version 1 on OSD 3 is already masked/removed)
// Version 1 is not present on a full set, but it must not be removed
if (replicated)
{
for (int i = obj_start; i < ver_start; i++)
{
if (cur_osd == list[i].osd_num)
{
found = true;
break;
}
}
}
if (!found)
{
// Object is missing from one of the OSDs of that set.
// This means it's deleted or moved and we can safely drop this version.
target_ver = 0;
break;
}
}
}
}
}
void pg_obj_state_check_t::handle_version() void pg_obj_state_check_t::handle_version()
{ {
if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_data_size)) if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_data_size))
{ {
// Version is either stable or recoverable // Version is either stable or recoverable
target_ver = last_ver;
ver_end = list_pos; ver_end = list_pos;
target_ver = last_ver;
// Skip versions that are not present on any of OSDs for the corresponding PG epoch
recheck_version_osd_set();
} }
if (!target_ver) if (!target_ver)
{ {
@ -191,6 +253,8 @@ void pg_obj_state_check_t::finish_object()
// Version is either stable or recoverable // Version is either stable or recoverable
target_ver = last_ver; target_ver = last_ver;
ver_end = list_pos; ver_end = list_pos;
// Skip versions that are not present on any of OSDs for the corresponding PG epoch
recheck_version_osd_set();
} }
obj_end = list_pos; obj_end = list_pos;
// Remember the decision // Remember the decision
@ -244,11 +308,23 @@ void pg_obj_state_check_t::finish_object()
} }
} }
} }
if (!target_ver) if (!target_ver && (n_unstable >= obj_end-obj_start))
{ {
return; return;
} }
if (!replicated && n_roles < pg->pg_data_size) if (!target_ver)
{
// Object is present, but should not be :) i.e. it's a deleted object that reappeared
if (log_level > 1)
{
printf("Object is deleted: %lx:%lx version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
}
state = OBJ_DELETED;
pg->state = pg->state | PG_HAS_MISPLACED;
// To record all versions as outdated:
ver_end = obj_start;
}
else if (!replicated && n_roles < pg->pg_data_size)
{ {
if (log_level > 1) if (log_level > 1)
{ {
@ -276,7 +352,7 @@ void pg_obj_state_check_t::finish_object()
pg->state = pg->state | PG_HAS_MISPLACED; pg->state = pg->state | PG_HAS_MISPLACED;
} }
if (log_level > 1 && (state & (OBJ_INCOMPLETE | OBJ_DEGRADED)) || if (log_level > 1 && (state & (OBJ_INCOMPLETE | OBJ_DEGRADED)) ||
log_level > 2 && (state & OBJ_MISPLACED)) log_level > 2 && (state & (OBJ_MISPLACED | OBJ_DELETED)))
{ {
for (int i = obj_start; i < obj_end; i++) for (int i = obj_start; i < obj_end; i++)
{ {
@ -285,9 +361,9 @@ void pg_obj_state_check_t::finish_object()
} }
} }
pg->total_count++; pg->total_count++;
if (state != 0 || ver_end < obj_end) osd_set.clear();
if (target_ver != 0 && (state != 0 || ver_end < obj_end))
{ {
osd_set.clear();
for (int i = ver_start; i < ver_end; i++) for (int i = ver_start; i < ver_end; i++)
{ {
osd_set.push_back((pg_obj_loc_t){ osd_set.push_back((pg_obj_loc_t){
@ -310,7 +386,8 @@ void pg_obj_state_check_t::finish_object()
break; break;
} }
} }
if (j >= osd_set.size() && pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num) if (j >= osd_set.size() && ((state & OBJ_DELETED) ||
pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num))
{ {
osd_set.push_back((pg_obj_loc_t){ osd_set.push_back((pg_obj_loc_t){
.role = (list[i].oid.stripe & STRIPE_MASK), .role = (list[i].oid.stripe & STRIPE_MASK),
@ -325,7 +402,11 @@ void pg_obj_state_check_t::finish_object()
} }
} }
} }
if (target_ver < max_ver) if (state & OBJ_DELETED)
{
pg->ver_override[oid] = max_ver;
}
else if (target_ver < max_ver)
{ {
pg->ver_override[oid] = target_ver; pg->ver_override[oid] = target_ver;
} }
@ -379,6 +460,7 @@ void pg_obj_state_check_t::finish_object()
} }
else else
{ {
assert(it->second.state == state);
it->second.object_count++; it->second.object_count++;
} }
if (state & OBJ_INCOMPLETE) if (state & OBJ_INCOMPLETE)
@ -399,6 +481,34 @@ void pg_obj_state_check_t::finish_object()
// FIXME: Write at least some tests for this function // FIXME: Write at least some tests for this function
void pg_t::calc_object_states(int log_level) void pg_t::calc_object_states(int log_level)
{ {
// Calculate intersections of target_history with cur_peers
for (auto & history_item: target_history)
{
if (history_item.max_epoch)
{
pg_history_set_t & set_copy = target_by_epoch[history_item.max_epoch];
set_copy.min_epoch = history_item.min_epoch;
set_copy.max_epoch = history_item.max_epoch;
for (int i = 0; i < history_item.osd_set.size(); i++)
{
if (history_item.osd_set[i] != 0)
{
for (int j = 0; j < cur_set.size(); j++)
{
if (cur_set[j] == history_item.osd_set[i])
{
set_copy.osd_set.push_back(history_item.osd_set[i]);
break;
}
}
}
}
if (set_copy.osd_set.size() != pg_size)
{
epoch_sizes_differ = true;
}
}
}
// Copy all object lists into one array // Copy all object lists into one array
pg_obj_state_check_t st; pg_obj_state_check_t st;
st.log_level = log_level; st.log_level = log_level;
@ -435,10 +545,18 @@ void pg_t::calc_object_states(int log_level)
std::sort(st.list.begin(), st.list.end()); std::sort(st.list.begin(), st.list.end());
// Walk over it and check object states // Walk over it and check object states
st.walk(); st.walk();
if (this->state & (PG_DEGRADED|PG_LEFT_ON_DEAD)) target_by_epoch.clear();
if (this->state != PG_ACTIVE)
{ {
assert(epoch != (((uint64_t)1 << PG_EPOCH_BITS)-1)); assert(epoch != (((uint64_t)1 << PG_EPOCH_BITS)-1));
epoch++; epoch++;
for (auto & pgh: target_history)
{
if (epoch <= pgh.max_epoch)
{
epoch = pgh.max_epoch+1;
}
}
} }
} }

View File

@ -89,7 +89,9 @@ struct pg_t
// epoch number - should increase with each non-clean activation of the PG // epoch number - should increase with each non-clean activation of the PG
uint64_t epoch = 0, reported_epoch = 0; uint64_t epoch = 0, reported_epoch = 0;
// target history and all potential peers // target history and all potential peers
std::vector<std::vector<osd_num_t>> target_history; std::vector<pg_history_set_t> target_history;
std::map<uint64_t, pg_history_set_t> target_by_epoch;
bool epoch_sizes_differ = false;
std::vector<osd_num_t> all_peers; std::vector<osd_num_t> all_peers;
bool history_changed = false; bool history_changed = false;
// peer list from the last peering event // peer list from the last peering event

View File

@ -199,6 +199,21 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
{ {
// PG may be degraded or have misplaced objects // PG may be degraded or have misplaced objects
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state); op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
if (op_data->object_state && (op_data->object_state->state & OBJ_DELETED))
{
// Object is deleted, just return zeroes
cur_op->reply.rw.version = 0;
cur_op->reply.rw.bitmap_len = op_data->pg_data_size * clean_entry_bitmap_size;
uint64_t zero_len = cur_op->reply.rw.bitmap_len + cur_op->req.rw.len;
while (zero_len >= 0)
{
uint64_t cur_zero_len = zero_buffer_size > zero_len ? zero_len : zero_buffer_size;
cur_op->iov.push_back(zero_buffer, cur_zero_len);
zero_len -= cur_zero_len;
}
finish_op(cur_op, cur_op->req.rw.len);
return;
}
} }
if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED) if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
{ {
@ -290,7 +305,7 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object
report_pg_state(pg); report_pg_state(pg);
} }
} }
else if (object_state->state & OBJ_MISPLACED) else if (object_state->state & (OBJ_MISPLACED | OBJ_DELETED))
{ {
this->misplaced_objects--; this->misplaced_objects--;
pg.misplaced_objects.erase(oid); pg.misplaced_objects.erase(oid);
@ -329,12 +344,6 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
else if (op_data->st == 4) goto resume_4; else if (op_data->st == 4) goto resume_4;
else if (op_data->st == 5) goto resume_5; else if (op_data->st == 5) goto resume_5;
assert(op_data->st == 0); assert(op_data->st == 0);
// Delete is forbidden even in active PGs if they're also degraded or have previous dead OSDs
if (pg.state & (PG_DEGRADED | PG_LEFT_ON_DEAD))
{
finish_op(cur_op, -EBUSY);
return;
}
if (!check_write_queue(cur_op, pg)) if (!check_write_queue(cur_op, pg))
{ {
return; return;
@ -342,11 +351,18 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
resume_1: resume_1:
// Determine which OSDs contain this object and delete it // Determine which OSDs contain this object and delete it
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state); op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
// Submit 1 read to determine the actual version number if (op_data->object_state && (op_data->object_state->state & OBJ_DELETED))
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op); {
op_data->fact_ver = pg.ver_override[op_data->oid];
}
else
{
// Submit 1 read to determine the actual version number
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
resume_2: resume_2:
op_data->st = 2; op_data->st = 2;
return; return;
}
resume_3: resume_3:
if (op_data->errors > 0) if (op_data->errors > 0)
{ {

View File

@ -133,6 +133,12 @@ int osd_t::collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitm
uint64_t target_version = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX; uint64_t target_version = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
pg_osd_set_state_t *object_state; pg_osd_set_state_t *object_state;
uint64_t* cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state); uint64_t* cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
if (object_state && (object_state->state & OBJ_DELETED))
{
// Object is deleted, zero out the bitmap
memset((uint8_t*)op_data->snapshot_bitmaps + chain_num*clean_entry_bitmap_size, 0, clean_entry_bitmap_size);
continue;
}
if (pg.scheme == POOL_SCHEME_REPLICATED) if (pg.scheme == POOL_SCHEME_REPLICATED)
{ {
osd_num_t read_target = 0; osd_num_t read_target = 0;

View File

@ -116,17 +116,19 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, const ui
if (osd_set[role] != 0 && (wr || !rep && stripes[role].read_end != 0)) if (osd_set[role] != 0 && (wr || !rep && stripes[role].read_end != 0))
n_subops++; n_subops++;
} }
if (!n_subops && (submit_type == SUBMIT_RMW_READ || rep)) if (!n_subops && (submit_type == SUBMIT_RMW_READ || rep) && zero_read >= 0)
n_subops = 1; n_subops = 1;
else else
zero_read = -1; zero_read = -1;
osd_op_t *subops = new osd_op_t[n_subops];
op_data->fact_ver = 0; op_data->fact_ver = 0;
op_data->done = op_data->errors = 0; op_data->done = op_data->errors = 0;
op_data->n_subops = n_subops; op_data->n_subops = n_subops;
op_data->subops = subops; if (n_subops > 0)
int sent = submit_primary_subop_batch(submit_type, op_data->oid.inode, op_version, op_data->stripes, osd_set, cur_op, 0, zero_read); {
assert(sent == n_subops); op_data->subops = new osd_op_t[n_subops];
int sent = submit_primary_subop_batch(submit_type, op_data->oid.inode, op_version, op_data->stripes, osd_set, cur_op, 0, zero_read);
assert(sent == n_subops);
}
} }
int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t op_version, int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t op_version,

View File

@ -154,10 +154,9 @@ resume_3:
if (pg.epoch > pg.reported_epoch) if (pg.epoch > pg.reported_epoch)
{ {
// Report newer epoch before writing // Report newer epoch before writing
// FIXME: We may report only one PG state here... // FIXME: We don't have to report all changed PG states here
this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
pg.history_changed = true; pg.history_changed = true;
report_pg_states(); report_pg_state(pg);
resume_10: resume_10:
if (pg.epoch > pg.reported_epoch) if (pg.epoch > pg.reported_epoch)
{ {

View File

@ -4,26 +4,8 @@
#include <stdexcept> #include <stdexcept>
#include <string.h> #include <string.h>
#include <assert.h> #include <assert.h>
extern "C" { #include <jerasure/reed_sol.h>
#if defined __has_include #include <jerasure.h>
# if __has_include (<jerasure.h>)
# include <jerasure.h>
# endif
// In Gentoo hasn't jerasure directory with header files.
# if __has_include (<reed_sol.h>)
# include <reed_sol.h>
# elif __has_include (<jerasure/reed_sol.h>)
# include <jerasure/reed_sol.h>
# endif
#else
// default path to headers in Debian
# include <jerasure/reed_sol.h>
# include <jerasure.h>
#endif
}
#include <map> #include <map>
#include "allocator.h" #include "allocator.h"
#include "xor.h" #include "xor.h"

View File

@ -34,6 +34,7 @@
#define OBJ_DEGRADED 0x02 #define OBJ_DEGRADED 0x02
#define OBJ_INCOMPLETE 0x04 #define OBJ_INCOMPLETE 0x04
#define OBJ_MISPLACED 0x08 #define OBJ_MISPLACED 0x08
#define OBJ_DELETED 0x10
#define OBJ_NEEDS_STABLE 0x10000 #define OBJ_NEEDS_STABLE 0x10000
#define OBJ_NEEDS_ROLLBACK 0x20000 #define OBJ_NEEDS_ROLLBACK 0x20000

View File

@ -18,7 +18,7 @@ done
cd mon cd mon
npm install npm install
cd .. cd ..
node mon/mon-main.js --etcd_url $ETCD_URL --etcd_prefix "/vitastor" &>./testdata/mon.log & node mon/mon-main.js --etcd_url $ETCD_URL --etcd_prefix "/vitastor" --verbose 1 &>./testdata/mon.log &
MON_PID=$! MON_PID=$!
if [ -n "$GLOBAL_CONF" ]; then if [ -n "$GLOBAL_CONF" ]; then
@ -31,15 +31,31 @@ else
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"xor","pg_size":'$PG_SIZE',"pg_minsize":'$PG_MINSIZE',"parity_chunks":1,"pg_count":'$PG_COUNT',"failure_domain":"osd"}}' $ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"xor","pg_size":'$PG_SIZE',"pg_minsize":'$PG_MINSIZE',"parity_chunks":1,"pg_count":'$PG_COUNT',"failure_domain":"osd"}}'
fi fi
sleep 3 wait_up()
{
local sec=$1
local i=0
local configured=0
while [[ $i -lt $sec ]]; do
if $ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(. | length) != 0 and ([ .[0].items["1"][] |
select(((.osd_set | select(. != 0) | sort | unique) | length) == '$PG_SIZE') ] | length) == '$PG_COUNT; then
configured=1
if $ETCDCTL get /vitastor/pg/state/1/ --prefix --print-value-only | jq -s -e '[ .[] | select(.state == ["active"]) ] | length == '$PG_COUNT; then
break
fi
fi
sleep 1
i=$((i+1))
if [ $i -eq $sec ]; then
if [[ $configured -ne 0 ]]; then
format_error "FAILED: $PG_COUNT PG(s) NOT CONFIGURED"
fi
format_error "FAILED: $PG_COUNT PG(s) NOT UP"
fi
done
}
if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(. | length) != 0 and ([ .[0].items["1"][] | select(((.osd_set | select(. != 0) | sort | unique) | length) == '$PG_SIZE') ] | length) == '$PG_COUNT); then wait_up 60
format_error "FAILED: $PG_COUNT PG(s) NOT CONFIGURED"
fi
if ! ($ETCDCTL get /vitastor/pg/state/1/ --prefix --print-value-only | jq -s -e '[ .[] | select(.state == ["active"]) ] | length == '$PG_COUNT); then
format_error "FAILED: $PG_COUNT PG(s) NOT UP"
fi
if ! cmp build/src/block-vitastor.so /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so; then if ! cmp build/src/block-vitastor.so /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so; then
sudo rm -f /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so sudo rm -f /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so

View File

@ -4,19 +4,25 @@
if [ "$IMMEDIATE_COMMIT" != "" ]; then if [ "$IMMEDIATE_COMMIT" != "" ]; then
NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 1" NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 1"
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":5,"immediate_commit":"all"}' $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1,"immediate_commit":"all"}'
else else
NO_SAME="--journal_sector_buffer_count 1024 --log_level 1" NO_SAME="--journal_sector_buffer_count 1024 --log_level 1"
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":5}' $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1}'
fi fi
start_osd()
{
local i=$1
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
eval OSD${i}_PID=$!
}
OSD_SIZE=1024 OSD_SIZE=1024
OSD_COUNT=7 OSD_COUNT=7
OSD_ARGS= OSD_ARGS=
for i in $(seq 1 $OSD_COUNT); do for i in $(seq 1 $OSD_COUNT); do
dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1)) dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log & start_osd $i
eval OSD${i}_PID=$!
done done
cd mon cd mon
@ -26,11 +32,11 @@ node mon/mon-main.js --etcd_url $ETCD_URL --etcd_prefix "/vitastor" --verbose 1
MON_PID=$! MON_PID=$!
if [ "$EC" != "" ]; then if [ "$EC" != "" ]; then
POOLCFG='"scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1'
PG_SIZE=3 PG_SIZE=3
POOLCFG='"scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1'
else else
POOLCFG='"scheme":"replicated","pg_size":2,"pg_minsize":2' PG_SIZE=${PG_SIZE:-2}
PG_SIZE=2 POOLCFG='"scheme":"replicated","pg_size":'$PG_SIZE',"pg_minsize":2'
fi fi
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":32,"failure_domain":"osd"}}' $ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":32,"failure_domain":"osd"}}'
@ -59,10 +65,10 @@ wait_finish_rebalance()
while [[ $i -lt $sec ]]; do while [[ $i -lt $sec ]]; do
($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == 32') && \ ($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == 32') && \
break break
if [ $i -eq 60 ]; then
format_error "Rebalance couldn't finish in $sec seconds"
fi
sleep 1 sleep 1
i=$((i+1)) i=$((i+1))
if [ $i -eq $sec ]; then
format_error "Rebalance couldn't finish in $sec seconds"
fi
done done
} }

View File

@ -12,6 +12,8 @@ EC=1 ./test_change_pg_count.sh
./test_change_pg_size.sh ./test_change_pg_size.sh
./test_degraded_delete.sh
./test_etcd_fail.sh ./test_etcd_fail.sh
./test_failure_domain.sh ./test_failure_domain.sh

119
tests/test_degraded_delete.sh Executable file
View File

@ -0,0 +1,119 @@
#!/bin/bash -ex
# Run 3 OSDs
. `dirname $0`/run_3osds.sh
# Write inodes 1 and 2
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=1M -direct=1 -iodepth=4 \
-rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -runtime=10
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=1M -direct=1 -iodepth=4 \
-rw=write -etcd=$ETCD_URL -pool=1 -inode=2 -size=128M -runtime=10
# Stop OSD 1
kill -INT $OSD1_PID
sleep 2
# Remove inode 2
build/src/vitastor-cli rm-data --etcd_address $ETCD_URL --pool 1 --inode 2
# Run 3 more OSDs and move PG to 4,5,6
for i in $(seq 4 6); do
dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
eval OSD${i}_PID=$!
done
$ETCDCTL put /vitastor/config/osd/1 '{"reweight":0}'
$ETCDCTL put /vitastor/config/osd/2 '{"reweight":0}'
$ETCDCTL put /vitastor/config/osd/3 '{"reweight":0}'
# Wait for rebalance to finish
wait_finish_rebalance()
{
local sec=$1
local st=$2
local i=0
while [[ $i -lt $sec ]]; do
if $ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e \
'([ .[] | select(.state == ['$st'] and (.peers | contains([1]) | not) and (.peers | contains([2,3]) | not)) ] | length) == '$PG_COUNT; then
break
fi
sleep 1
i=$((i+1))
if [ $i -eq $sec ]; then
format_error "Rebalance couldn't finish in $sec seconds"
fi
done
}
wait_finish_rebalance 60 '"active","left_on_dead"'
# Stop OSD 2,3
kill -INT $OSD2_PID
kill -INT $OSD3_PID
sleep 2
# Verify that PGs are still active
if ! ($ETCDCTL get /vitastor/pg/state/1/ --prefix --print-value-only | jq -s -e '[ .[] | select(.state == ["active","left_on_dead"]) ] | length == '$PG_COUNT); then
format_error "FAILED: $PG_COUNT PG(s) NOT UP"
fi
# Start OSD 1
build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd1.bin 2>/dev/null) &>./testdata/osd1.log &
OSD1_PID=$!
# Verify that inode 2 is removed and inode 1 is in place
wait_repeer_1()
{
local sec=$1
local i=0
while [[ $i -lt $sec ]]; do
if grep -q 'Repeer because of OSD 1' testdata/osd4.log testdata/osd5.log testdata/osd6.log; then
break
fi
sleep 1
i=$((i+1))
if [ $i -eq $sec ]; then
format_error "OSD 4/5/6 do not peer with older OSD 1"
fi
done
}
wait_repeer_1 15
wait_finish_rebalance 15 '"active"'
if ! ($ETCDCTL get /vitastor/pg/stats/1/1 --print-value-only | jq -s -e '.[0].object_count == 512'); then
format_error "FAILED: PG SHOULD CONTAIN EXACTLY 128 MB OF DATA, BUT IT DOESN'T"
fi
qemu-img convert -S 4096 -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=1:size=4096" \
-O raw ./testdata/inode1.bin
qemu-img convert -S 4096 -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=2:size="$((128*1024*1024)) \
-O raw ./testdata/inode2.bin
if (dd if=/dev/zero bs=4096 count=1 | diff - ./testdata/inode1.bin); then
format_error "FAILED: INODE 1 SEEMS LOST"
fi
if ! (dd if=/dev/zero bs=1M count=128 | diff - ./testdata/inode2.bin); then
format_error "FAILED: INODE 2 SEEMS RESTORED"
fi
format_green OK

52
tests/test_heal.sh Executable file
View File

@ -0,0 +1,52 @@
#!/bin/bash -ex
# Kill OSDs while writing
PG_SIZE=3
. `dirname $0`/run_7osds.sh
IMG_SIZE=960
$ETCDCTL put /vitastor/config/inode/1/1 '{"name":"testimg","size":'$((IMG_SIZE*1024*1024))'}'
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -fsync=1 -rw=write \
-mirror_file=./testdata/mirror.bin -etcd=$ETCD_URL -image=testimg -cluster_log_level=10
kill_osds()
{
sleep 5
kill -9 $OSD1_PID
$ETCDCTL del /vitastor/osd/state/1
for i in 2 3 4 5 6 7; do
sleep 15
echo Killing OSD $i and starting OSD $((i-1))
p=OSD${i}_PID
kill -9 ${!p}
$ETCDCTL del /vitastor/osd/state/$i
start_osd $((i-1))
sleep 15
done
sleep 5
start_osd 7
sleep 5
}
kill_osds &
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=16 -fsync=256 -rw=randwrite \
-mirror_file=./testdata/mirror.bin -etcd=$ETCD_URL -image=testimg -loops=10 -runtime=120 2>/dev/null
qemu-img convert -S 4096 -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:image=testimg" \
-O raw ./testdata/read.bin
diff ./testdata/read.bin ./testdata/mirror.bin
format_green OK