forked from vitalif/vitastor
Fix infinite looping in continue_recovery_op() when pg_cancel_write_queue() is called
parent
9abf3c17c9
commit
738ad5af79
|
@ -258,6 +258,10 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
|
||||||
.len = 0,
|
.len = 0,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
if (log_level > 2)
|
||||||
|
{
|
||||||
|
printf("Submitting recovery operation for %lx:%lx\n", op->oid.inode, op->oid.stripe);
|
||||||
|
}
|
||||||
op->osd_op->callback = [this, op](osd_op_t *osd_op)
|
op->osd_op->callback = [this, op](osd_op_t *osd_op)
|
||||||
{
|
{
|
||||||
// Don't sync the write, it will be synced by our regular sync coroutine
|
// Don't sync the write, it will be synced by our regular sync coroutine
|
||||||
|
@ -267,6 +271,11 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
|
||||||
if (osd_op->reply.hdr.retval == -EPIPE)
|
if (osd_op->reply.hdr.retval == -EPIPE)
|
||||||
{
|
{
|
||||||
// PG is stopped or one of the OSDs is gone, error is harmless
|
// PG is stopped or one of the OSDs is gone, error is harmless
|
||||||
|
printf(
|
||||||
|
"Recovery operation failed with object %lx:%lx (PG %u/%u)\n",
|
||||||
|
op->oid.inode, op->oid.stripe, INODE_POOL(op->oid.inode),
|
||||||
|
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
|
|
@ -552,24 +552,28 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
||||||
void osd_t::pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval)
|
void osd_t::pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval)
|
||||||
{
|
{
|
||||||
auto st_it = pg.write_queue.find(oid), it = st_it;
|
auto st_it = pg.write_queue.find(oid), it = st_it;
|
||||||
finish_op(first_op, retval);
|
if (it == pg.write_queue.end() || it->second != first_op)
|
||||||
if (it != pg.write_queue.end() && it->second == first_op)
|
|
||||||
{
|
|
||||||
it++;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
{
|
||||||
// Write queue doesn't match the first operation.
|
// Write queue doesn't match the first operation.
|
||||||
// first_op is a leftover operation from the previous peering of the same PG.
|
// first_op is a leftover operation from the previous peering of the same PG.
|
||||||
|
finish_op(first_op, retval);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
while (it != pg.write_queue.end() && it->first == oid)
|
std::vector<osd_op_t*> cancel_ops;
|
||||||
|
while (it != pg.write_queue.end())
|
||||||
{
|
{
|
||||||
finish_op(it->second, retval);
|
cancel_ops.push_back(it->second);
|
||||||
it++;
|
it++;
|
||||||
}
|
}
|
||||||
if (st_it != it)
|
if (st_it != it)
|
||||||
{
|
{
|
||||||
|
// First erase them and then run finish_op() for the sake of reentrancy
|
||||||
|
// Calling finish_op() on a live iterator previously triggered a bug where some
|
||||||
|
// of the OSDs were looping infinitely if you stopped all of them with kill -INT during recovery
|
||||||
pg.write_queue.erase(st_it, it);
|
pg.write_queue.erase(st_it, it);
|
||||||
|
for (auto op: cancel_ops)
|
||||||
|
{
|
||||||
|
finish_op(op, retval);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue