Fix two potential read/write ordering problems (even though not yet seen in tests)
- Write operations could be 'stabilized' and previous versions could be purged from OSDs before the removal of version_override and following reads could potentially hit different version in EC pools - Object was marked clean after completing the delete during recovery, so reads could in theory hit a deleted version and return nothingrel-0.5
parent
98b54ca948
commit
05db1308aa
|
@ -198,6 +198,7 @@ class osd_t
|
|||
void continue_primary_del(osd_op_t *cur_op);
|
||||
bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
|
||||
void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
|
||||
void free_object_state(pg_t & pg, pg_osd_set_state_t **object_state);
|
||||
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
|
||||
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
|
||||
void handle_primary_bs_subop(osd_op_t *subop);
|
||||
|
|
|
@ -18,7 +18,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
|||
// Our EC scheme stores data in fixed chunks equal to (K*block size)
|
||||
// K = (pg_size-parity_chunks) in case of EC/XOR, or 1 for replicated pools
|
||||
pool_id_t pool_id = INODE_POOL(cur_op->req.rw.inode);
|
||||
// FIXME: We have to access pool config here, so make sure that it doesn't change while its PGs are active...
|
||||
// Note: We read pool config here, so we must NOT change it when PGs are active
|
||||
auto pool_cfg_it = st_cli.pool_config.find(pool_id);
|
||||
if (pool_cfg_it == st_cli.pool_config.end())
|
||||
{
|
||||
|
@ -269,8 +269,6 @@ resume_3:
|
|||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||
return;
|
||||
}
|
||||
// Save version override for parallel reads
|
||||
pg.ver_override[op_data->oid] = op_data->fact_ver;
|
||||
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
// Only (possibly) copy new data from the request into the recovery buffer
|
||||
|
@ -289,6 +287,9 @@ resume_3:
|
|||
}
|
||||
else
|
||||
{
|
||||
// For EC/XOR pools, save version override to make it impossible
|
||||
// for parallel reads to read different versions of data and parity
|
||||
pg.ver_override[op_data->oid] = op_data->fact_ver;
|
||||
// Recover missing stripes, calculate parity
|
||||
if (pg.scheme == POOL_SCHEME_XOR)
|
||||
{
|
||||
|
@ -332,8 +333,22 @@ resume_4:
|
|||
op_data->st = 4;
|
||||
return;
|
||||
resume_5:
|
||||
if (op_data->scheme != POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
// Remove version override just after the write, but before stabilizing
|
||||
pg.ver_override.erase(op_data->oid);
|
||||
}
|
||||
if (op_data->object_state)
|
||||
{
|
||||
// We must forget the unclean state of the object before deleting it
|
||||
// so the next reads don't accidentally read a deleted version
|
||||
// And it should be done at the same time as the removal of the version override
|
||||
remove_object_from_state(op_data->oid, op_data->object_state, pg);
|
||||
pg.clean_count++;
|
||||
}
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
free_object_state(pg, &op_data->object_state);
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||
return;
|
||||
}
|
||||
|
@ -342,6 +357,7 @@ resume_7:
|
|||
if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
|
||||
{
|
||||
// FIXME: Check for immediate_commit == IMMEDIATE_SMALL
|
||||
free_object_state(pg, &op_data->object_state);
|
||||
return;
|
||||
}
|
||||
if (op_data->fact_ver == 1)
|
||||
|
@ -390,10 +406,12 @@ resume_7:
|
|||
copies_to_delete_after_sync_count++;
|
||||
}
|
||||
}
|
||||
free_object_state(pg, &op_data->object_state);
|
||||
}
|
||||
else
|
||||
{
|
||||
submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
|
||||
free_object_state(pg, &op_data->object_state);
|
||||
if (op_data->n_subops > 0)
|
||||
{
|
||||
resume_8:
|
||||
|
@ -407,14 +425,9 @@ resume_9:
|
|||
}
|
||||
}
|
||||
}
|
||||
// Clear object state
|
||||
remove_object_from_state(op_data->oid, op_data->object_state, pg);
|
||||
pg.clean_count++;
|
||||
}
|
||||
cur_op->reply.hdr.retval = cur_op->req.rw.len;
|
||||
continue_others:
|
||||
// Remove version override
|
||||
pg.ver_override.erase(op_data->oid);
|
||||
object_id oid = op_data->oid;
|
||||
// Remove the operation from queue before calling finish_op so it doesn't see the completed operation in queue
|
||||
auto next_it = pg.write_queue.find(oid);
|
||||
|
@ -818,10 +831,14 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object
|
|||
{
|
||||
throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
|
||||
}
|
||||
object_state->object_count--;
|
||||
if (!object_state->object_count)
|
||||
}
|
||||
|
||||
void osd_t::free_object_state(pg_t & pg, pg_osd_set_state_t **object_state)
|
||||
{
|
||||
if (*object_state && !(--(*object_state)->object_count))
|
||||
{
|
||||
pg.state_dict.erase(object_state->osd_set);
|
||||
pg.state_dict.erase((*object_state)->osd_set);
|
||||
*object_state = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -887,6 +904,7 @@ resume_5:
|
|||
else
|
||||
{
|
||||
remove_object_from_state(op_data->oid, op_data->object_state, pg);
|
||||
free_object_state(pg, &op_data->object_state);
|
||||
}
|
||||
pg.total_count--;
|
||||
object_id oid = op_data->oid;
|
||||
|
|
|
@ -2,6 +2,14 @@
|
|||
|
||||
. `dirname $0`/common.sh
|
||||
|
||||
if [ "$EC" != "" ]; then
|
||||
POOLCFG='"scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1'
|
||||
NOBJ=512
|
||||
else
|
||||
POOLCFG='"scheme":"replicated","pg_size":2,"pg_minsize":2'
|
||||
NOBJ=1024
|
||||
fi
|
||||
|
||||
dd if=/dev/zero of=./testdata/test_osd1.bin bs=1024 count=1 seek=$((1024*1024-1))
|
||||
dd if=/dev/zero of=./testdata/test_osd2.bin bs=1024 count=1 seek=$((1024*1024-1))
|
||||
dd if=/dev/zero of=./testdata/test_osd3.bin bs=1024 count=1 seek=$((1024*1024-1))
|
||||
|
@ -28,7 +36,7 @@ cd ..
|
|||
node mon/mon-main.js --etcd_url http://$ETCD_URL --etcd_prefix "/vitastor" --verbose 1 &>./testdata/mon.log &
|
||||
MON_PID=$!
|
||||
|
||||
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":2,"pg_count":16,"failure_domain":"osd"}}'
|
||||
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":16,"failure_domain":"osd"}}'
|
||||
|
||||
sleep 2
|
||||
|
||||
|
@ -52,7 +60,7 @@ try_change()
|
|||
echo --- Change PG count to $n --- >>testdata/osd$i.log
|
||||
done
|
||||
|
||||
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":2,"pg_count":'$n',"failure_domain":"osd"}}'
|
||||
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":'$n',"failure_domain":"osd"}}'
|
||||
|
||||
for i in {1..10}; do
|
||||
($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | select(. > 0)) | length == 2) | length) == '$n) && \
|
||||
|
@ -82,8 +90,8 @@ try_change()
|
|||
|
||||
# Check that no objects are lost !
|
||||
nobj=`$ETCDCTL get --prefix '/vitastor/pg/stats' --print-value-only | jq -s '[ .[].object_count ] | reduce .[] as $num (0; .+$num)'`
|
||||
if [ "$nobj" -ne 1024 ]; then
|
||||
format_error "Data lost after changing PG count to $n: 1024 objects expected, but got $nobj"
|
||||
if [ "$nobj" -ne $NOBJ ]; then
|
||||
format_error "Data lost after changing PG count to $n: $NOBJ objects expected, but got $nobj"
|
||||
fi
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue