diff --git a/src/osd_cluster.cpp b/src/osd_cluster.cpp index 52a17861..4ff3ac52 100644 --- a/src/osd_cluster.cpp +++ b/src/osd_cluster.cpp @@ -885,7 +885,6 @@ void osd_t::report_pg_states() if (pg.history_changed) { // Prevent race conditions (for the case when the monitor is updating this key at the same time) - // FIXME: target_history updates may be lost on PG re-peering pg.history_changed = false; std::string history_key = base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pool_id)+"/"+std::to_string(pg.pg_num)); json11::Json::object history_value = { diff --git a/src/osd_peering_pg.cpp b/src/osd_peering_pg.cpp index 8f3ba184..bf79fe50 100644 --- a/src/osd_peering_pg.cpp +++ b/src/osd_peering_pg.cpp @@ -86,21 +86,9 @@ void pg_obj_state_check_t::walk() } if (pg->pg_cursize < pg->pg_size) { - // Report PG history and activate + // Activate as degraded + // Current OSD set will be added into target_history on first write pg->state |= PG_DEGRADED | PG_PEERED; - std::vector history_set; - for (auto peer_osd: pg->cur_set) - { - if (peer_osd != 0) - history_set.push_back(peer_osd); - } - std::sort(history_set.begin(), history_set.end()); - auto it = std::lower_bound(pg->target_history.begin(), pg->target_history.end(), history_set); - if (it == pg->target_history.end() || *it != history_set) - { - pg->target_history.insert(it, history_set); - pg->history_changed = true; - } } else { @@ -438,7 +426,7 @@ void pg_t::calc_object_states(int log_level) std::sort(st.list.begin(), st.list.end()); // Walk over it and check object states st.walk(); - if (this->state & (PG_DEGRADED|PG_LEFT_ON_DEAD)) + if (this->state != PG_ACTIVE) { assert(epoch != (((uint64_t)1 << PG_EPOCH_BITS)-1)); epoch++; diff --git a/src/osd_primary_write.cpp b/src/osd_primary_write.cpp index 1dcb9c49..60870458 100644 --- a/src/osd_primary_write.cpp +++ b/src/osd_primary_write.cpp @@ -155,9 +155,21 @@ resume_3: if (pg.epoch > pg.reported_epoch) { // Report newer epoch before writing - // FIXME: We may report only one PG state here... + // FIXME: We don't have to report all changed PG states here this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }); - pg.history_changed = true; + if (pg.state != PG_ACTIVE) + { + // Check that current OSD set is in history and/or add it there + std::vector history_set; + for (auto peer_osd: pg.cur_set) + if (peer_osd != 0) + history_set.push_back(peer_osd); + std::sort(history_set.begin(), history_set.end()); + auto it = std::lower_bound(pg.target_history.begin(), pg.target_history.end(), history_set); + if (it == pg.target_history.end() || *it != history_set) + pg.target_history.insert(it, history_set); + pg.history_changed = true; + } report_pg_states(); resume_10: if (pg.epoch > pg.reported_epoch) @@ -166,6 +178,12 @@ resume_10: return; } } + // Recheck PG state after reporting history - maybe it's already stopping/restarting + if (pg.state & (PG_STOPPING|PG_REPEERING)) + { + pg_cancel_write_queue(pg, cur_op, op_data->oid, -EPIPE); + return; + } submit_primary_subops(SUBMIT_WRITE, op_data->target_ver, pg.cur_set.data(), cur_op); resume_4: op_data->st = 4;