Compare commits

...

4 Commits

Author SHA1 Message Date
Vitaliy Filippov 6366972fe8 Warn about full and almost full OSDs in status
Test / test_splitbrain (push) Successful in 18s Details
Test / test_snapshot_chain (push) Successful in 2m23s Details
Test / test_snapshot_chain_ec (push) Successful in 2m53s Details
Test / test_rebalance_verify_imm (push) Successful in 3m21s Details
Test / test_rebalance_verify (push) Successful in 3m46s Details
Test / test_switch_primary (push) Successful in 33s Details
Test / test_write (push) Successful in 54s Details
Test / test_write_xor (push) Successful in 48s Details
Test / test_write_no_same (push) Successful in 14s Details
Test / test_rebalance_verify_ec (push) Successful in 4m38s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 4m22s Details
Test / test_heal_pg_size_2 (push) Successful in 3m34s Details
Test / test_heal_ec (push) Successful in 3m38s Details
Test / test_heal_csum_32k_dmj (push) Successful in 5m44s Details
Test / test_heal_csum_32k_dj (push) Successful in 5m51s Details
Test / test_heal_csum_32k (push) Successful in 6m45s Details
Test / test_heal_csum_4k_dmj (push) Successful in 6m34s Details
Test / test_enospc (push) Successful in 1m47s Details
Test / test_enospc_xor (push) Successful in 2m41s Details
Test / test_enospc_imm (push) Successful in 1m31s Details
Test / test_heal_csum_4k_dj (push) Successful in 6m39s Details
Test / test_heal_csum_4k (push) Successful in 6m15s Details
Test / test_scrub_zero_osd_2 (push) Successful in 32s Details
Test / test_scrub (push) Successful in 35s Details
Test / test_scrub_xor (push) Successful in 26s Details
Test / test_enospc_imm_xor (push) Successful in 1m13s Details
Test / test_nfs (push) Successful in 24s Details
Test / test_scrub_ec (push) Successful in 33s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 34s Details
Test / test_scrub_pg_size_3 (push) Successful in 42s Details
2024-04-07 19:39:51 +03:00
Vitaliy Filippov 2b863fb715 Add ENOSPC handling tests 2024-04-07 19:39:33 +03:00
Vitaliy Filippov 3bf4dd5abd Fix client op retry timeout - do not retry immediately 2024-04-07 19:08:36 +03:00
Vitaliy Filippov 3b84dcaedd Handle ENOSPC during write - rollback partial EC writes, remember partial replica writes
Test / test_rm (push) Successful in 14s Details
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m59s Details
Test / test_snapshot_down (push) Successful in 28s Details
Test / test_snapshot_down_ec (push) Successful in 30s Details
Test / test_splitbrain (push) Successful in 27s Details
Test / test_snapshot_chain (push) Successful in 2m41s Details
Test / test_snapshot_chain_ec (push) Successful in 3m12s Details
Test / test_rebalance_verify_imm (push) Successful in 3m33s Details
Test / test_rebalance_verify (push) Successful in 4m24s Details
Test / test_switch_primary (push) Successful in 34s Details
Test / test_write (push) Successful in 53s Details
Test / test_write_xor (push) Successful in 51s Details
Test / test_write_no_same (push) Successful in 11s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 3m11s Details
Test / test_rebalance_verify_ec (push) Successful in 6m3s Details
Test / test_heal_pg_size_2 (push) Successful in 4m57s Details
Test / test_heal_ec (push) Successful in 4m52s Details
Test / test_heal_csum_32k_dmj (push) Successful in 4m37s Details
Test / test_heal_csum_32k_dj (push) Successful in 6m55s Details
Test / test_heal_csum_32k (push) Successful in 6m42s Details
Test / test_heal_csum_4k_dj (push) Successful in 6m41s Details
Test / test_heal_csum_4k_dmj (push) Successful in 6m45s Details
Test / test_scrub_zero_osd_2 (push) Successful in 44s Details
Test / test_scrub (push) Successful in 48s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m6s Details
Test / test_scrub_pg_size_3 (push) Successful in 1m30s Details
Test / test_scrub_ec (push) Successful in 51s Details
Test / test_nfs (push) Successful in 39s Details
Test / test_heal_csum_4k (push) Successful in 5m22s Details
Test / test_scrub_xor (push) Successful in 18s Details
2024-04-07 18:02:05 +03:00
15 changed files with 448 additions and 60 deletions

View File

@ -748,6 +748,78 @@ jobs:
echo ""
done
test_enospc:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_enospc.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_enospc_xor:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: SCHEME=xor /root/vitastor/tests/test_enospc.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_enospc_imm:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: IMMEDIATE_COMMIT=1 /root/vitastor/tests/test_enospc.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_enospc_imm_xor:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: IMMEDIATE_COMMIT=1 SCHEME=xor /root/vitastor/tests/test_enospc.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_scrub:
runs-on: ubuntu-latest
needs: build

View File

@ -11,6 +11,7 @@ affect their interaction with the cluster.
- [client_retry_interval](#client_retry_interval)
- [client_eio_retry_interval](#client_eio_retry_interval)
- [client_retry_enospc](#client_retry_enospc)
- [client_max_dirty_bytes](#client_max_dirty_bytes)
- [client_max_dirty_ops](#client_max_dirty_ops)
- [client_enable_writeback](#client_enable_writeback)
@ -20,6 +21,7 @@ affect their interaction with the cluster.
- [nbd_timeout](#nbd_timeout)
- [nbd_max_devices](#nbd_max_devices)
- [nbd_max_part](#nbd_max_part)
- [osd_nearfull_ratio](#osd_nearfull_ratio)
## client_retry_interval
@ -41,6 +43,15 @@ Retry time for I/O requests failed due to data corruption or unfinished
EC object deletions (has_incomplete PG state). 0 disables such retries
and clients are not blocked and just get EIO error code instead.
## client_retry_enospc
- Type: boolean
- Default: true
- Can be changed online: yes
Retry writes on out of space errors to wait until some space is freed on
OSDs.
## client_max_dirty_bytes
- Type: integer
@ -157,3 +168,18 @@ Maximum number of NBD devices in the system. This value is passed as
Maximum number of partitions per NBD device. This value is passed as
`max_part` parameter for the nbd kernel module when vitastor-nbd autoloads it.
Note that (nbds_max)*(1+max_part) usually can't exceed 256.
## osd_nearfull_ratio
- Type: number
- Default: 0.95
- Can be changed online: yes
Ratio of used space on OSD to treat it as "almost full" in vitastor-cli status output.
Remember that some client writes may hang or complete with an error if even
just one OSD becomes 100 % full!
However, unlike in Ceph, 100 % full Vitastor OSDs don't crash (in Ceph they're
unable to start at all), so you'll be able to recover from "out of space" errors
without destroying and recreating OSDs.

View File

@ -11,6 +11,7 @@
- [client_retry_interval](#client_retry_interval)
- [client_eio_retry_interval](#client_eio_retry_interval)
- [client_retry_enospc](#client_retry_enospc)
- [client_max_dirty_bytes](#client_max_dirty_bytes)
- [client_max_dirty_ops](#client_max_dirty_ops)
- [client_enable_writeback](#client_enable_writeback)
@ -20,6 +21,7 @@
- [nbd_timeout](#nbd_timeout)
- [nbd_max_devices](#nbd_max_devices)
- [nbd_max_part](#nbd_max_part)
- [osd_nearfull_ratio](#osd_nearfull_ratio)
## client_retry_interval
@ -42,6 +44,15 @@
0 отключает повторы таких запросов и клиенты не блокируются, а вместо
этого просто получают код ошибки EIO.
## client_retry_enospc
- Тип: булево (да/нет)
- Значение по умолчанию: true
- Можно менять на лету: да
Повторять запросы записи, завершившиеся с ошибками нехватки места, т.е.
ожидать, пока на OSD не освободится место.
## client_max_dirty_bytes
- Тип: целое число
@ -158,3 +169,20 @@
Максимальное число разделов на одном NBD-устройстве. Данное значение передаётся
модулю ядра nbd как параметр `max_part`, когда его загружает vitastor-nbd.
Имейте в виду, что (nbds_max)*(1+max_part) обычно не может превышать 256.
## osd_nearfull_ratio
- Тип: число
- Значение по умолчанию: 0.95
- Можно менять на лету: да
Доля занятого места на OSD, начиная с которой он считается "почти заполненным" в
выводе vitastor-cli status.
Помните, что часть клиентских запросов может зависнуть или завершиться с ошибкой,
если на 100 % заполнится хотя бы 1 OSD!
Однако, в отличие от Ceph, заполненные на 100 % OSD Vitastor не падают (в Ceph
заполненные на 100% OSD вообще не могут стартовать), так что вы сможете
восстановить работу кластера после ошибок отсутствия свободного места
без уничтожения и пересоздания OSD.

View File

@ -22,6 +22,16 @@
или незавершённых удалений EC-объектов (состояния PG has_incomplete).
0 отключает повторы таких запросов и клиенты не блокируются, а вместо
этого просто получают код ошибки EIO.
- name: client_retry_enospc
type: bool
default: true
online: true
info: |
Retry writes on out of space errors to wait until some space is freed on
OSDs.
info_ru: |
Повторять запросы записи, завершившиеся с ошибками нехватки места, т.е.
ожидать, пока на OSD не освободится место.
- name: client_max_dirty_bytes
type: int
default: 33554432
@ -190,3 +200,27 @@
Максимальное число разделов на одном NBD-устройстве. Данное значение передаётся
модулю ядра nbd как параметр `max_part`, когда его загружает vitastor-nbd.
Имейте в виду, что (nbds_max)*(1+max_part) обычно не может превышать 256.
- name: osd_nearfull_ratio
type: float
default: 0.95
online: true
info: |
Ratio of used space on OSD to treat it as "almost full" in vitastor-cli status output.
Remember that some client writes may hang or complete with an error if even
just one OSD becomes 100 % full!
However, unlike in Ceph, 100 % full Vitastor OSDs don't crash (in Ceph they're
unable to start at all), so you'll be able to recover from "out of space" errors
without destroying and recreating OSDs.
info_ru: |
Доля занятого места на OSD, начиная с которой он считается "почти заполненным" в
выводе vitastor-cli status.
Помните, что часть клиентских запросов может зависнуть или завершиться с ошибкой,
если на 100 % заполнится хотя бы 1 OSD!
Однако, в отличие от Ceph, заполненные на 100 % OSD Vitastor не падают (в Ceph
заполненные на 100% OSD вообще не могут стартовать), так что вы сможете
восстановить работу кластера после ошибок отсутствия свободного места
без уничтожения и пересоздания OSD.

View File

@ -91,6 +91,8 @@ const etcd_tree = {
client_max_writeback_iodepth: 256,
client_retry_interval: 50, // ms. min: 10
client_eio_retry_interval: 1000, // ms
client_retry_enospc: true,
osd_nearfull_ratio: 0.95,
// client and osd - configurable online
log_level: 0,
peer_connect_interval: 5, // seconds. min: 1

View File

@ -110,6 +110,12 @@ resume_2:
}
}
int mon_count = 0;
int osds_full = 0, osds_nearfull = 0;
double osd_nearfull_ratio = parent->cli->config["osd_nearfull_ratio"].number_value();
if (!osd_nearfull_ratio)
{
osd_nearfull_ratio = 0.95;
}
std::string mon_master;
for (int i = 0; i < mon_members.size(); i++)
{
@ -139,8 +145,18 @@ resume_2:
continue;
}
osd_count++;
total_raw += kv.value["size"].uint64_value();
free_raw += kv.value["free"].uint64_value();
auto osd_size = kv.value["size"].uint64_value();
auto osd_free = kv.value["free"].uint64_value();
total_raw += osd_size;
free_raw += osd_free;
if (!osd_free)
{
osds_full++;
}
else if (osd_free < (uint64_t)(osd_size*(1-osd_nearfull_ratio)))
{
osds_nearfull++;
}
auto peer_it = parent->cli->st_cli.peer_states.find(stat_osd_num);
if (peer_it != parent->cli->st_cli.peer_states.end())
{
@ -281,11 +297,27 @@ resume_2:
else if (no_scrub)
recovery_io += " scrub: "+str_repeat(" ", io_indent+1)+"disabled\n";
}
std::string warning_str;
if (osds_full)
{
warning_str += " "+std::to_string(osds_full)+
(osds_full > 1 ? " osds are full\n" : " osd is full\n");
}
if (osds_nearfull)
{
warning_str += " "+std::to_string(osds_nearfull)+
(osds_nearfull > 1 ? " osds are almost full\n" : " osd is almost full\n");
}
if (warning_str != "")
{
warning_str = "\n warning:\n"+warning_str;
}
printf(
" cluster:\n"
" etcd: %d / %zd up, %s database size\n"
" mon: %d up%s\n"
" osd: %d / %d up\n"
"%s"
" \n"
" data:\n"
" raw: %s used, %s / %s available%s\n"
@ -298,7 +330,7 @@ resume_2:
"%s",
etcd_alive, etcd_states.size(), format_size(etcd_db_size).c_str(),
mon_count, mon_master == "" ? "" : (", master "+mon_master).c_str(),
osd_up, osd_count,
osd_up, osd_count, warning_str.c_str(),
format_size(total_raw-free_raw).c_str(),
format_size(free_raw-free_down_raw).c_str(),
format_size(total_raw-down_raw).c_str(),

View File

@ -25,7 +25,8 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
if (msgr.osd_peer_fds.find(peer_osd) != msgr.osd_peer_fds.end())
{
// peer_osd just connected
continue_ops();
// retry operations waiting for connection immediately
continue_ops(client_retry_interval);
continue_lists();
continue_raw_ops(peer_osd);
}
@ -397,6 +398,8 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co
client_eio_retry_interval = 10;
}
}
// client_retry_enospc
client_retry_enospc = config["client_retry_enospc"].is_null() ? true : config["client_retry_enospc"].bool_value();
// log_level
log_level = config["log_level"].uint64_value();
msgr.parse_config(config);
@ -817,7 +820,7 @@ resume_2:
return 1;
}
else if (op->retval != 0 && !(op->flags & OP_FLUSH_BUFFER) &&
op->retval != -EPIPE && (op->retval != -EIO || !client_eio_retry_interval) && op->retval != -ENOSPC)
op->retval != -EPIPE && (op->retval != -EIO || !client_eio_retry_interval) && (op->retval != -ENOSPC || !client_retry_enospc))
{
// Fatal error (neither -EPIPE, -EIO nor -ENOSPC)
erase_op(op);
@ -1209,7 +1212,7 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
// Set op->retry_after to retry operation after a short pause (not immediately)
if (!op->retry_after)
{
op->retry_after = op->retval == -EIO ? client_eio_retry_interval : client_retry_interval;
op->retry_after = op->retval != -EPIPE ? client_eio_retry_interval : client_retry_interval;
}
reset_retry_timer(op->retry_after);
if (stop_fd >= 0)
@ -1217,7 +1220,7 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
msgr.stop_client(stop_fd);
}
op->inflight_count--;
if (op->inflight_count == 0)
if (op->inflight_count == 0 && !op->retry_after)
{
if (op->opcode == OSD_OP_SYNC)
continue_sync(op);
@ -1242,7 +1245,7 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
{
op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
}
if (op->inflight_count == 0)
if (op->inflight_count == 0 && !op->retry_after)
{
if (op->opcode == OSD_OP_SYNC)
continue_sync(op);

View File

@ -94,6 +94,7 @@ class cluster_client_t
int log_level = 0;
int client_retry_interval = 50; // ms
int client_eio_retry_interval = 1000; // ms
bool client_retry_enospc = true;
int retry_timeout_id = 0;
int retry_timeout_duration = 0;

View File

@ -301,8 +301,12 @@ class osd_t
pg_osd_set_state_t* add_object_to_set(pg_t & pg, const object_id oid, const pg_osd_set_t & osd_set,
uint64_t old_pg_state, int log_at_level);
void remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t &pg, bool report = true);
pg_osd_set_state_t *mark_object(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, bool ref,
std::function<int(pg_osd_set_t & new_set)> calc_set);
pg_osd_set_state_t *mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
osd_rmw_stripe_t *stripes, bool ref, bool inconsistent);
pg_osd_set_state_t *mark_partial_write(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
osd_rmw_stripe_t *stripes, bool ref);
void deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref);
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
@ -317,6 +321,7 @@ class osd_t
void submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_delete, int chunks_to_delete_count);
int submit_primary_sync_subops(osd_op_t *cur_op);
void submit_primary_stab_subops(osd_op_t *cur_op);
void submit_primary_rollback_subops(osd_op_t *cur_op, const uint64_t* osd_set);
uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state);

View File

@ -299,8 +299,8 @@ resume_2:
finish_op(cur_op, cur_op->req.rw.len);
}
pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
osd_rmw_stripe_t *stripes, bool ref, bool inconsistent)
pg_osd_set_state_t *osd_t::mark_object(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, bool ref,
std::function<int(pg_osd_set_t & new_set)> calc_set)
{
pg_osd_set_state_t *object_state = NULL;
get_object_osd_set(pg, oid, &object_state);
@ -315,58 +315,22 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os
}
return object_state;
}
pg_osd_set_t corrupted_set;
pg_osd_set_t new_set;
if (object_state)
{
corrupted_set = object_state->osd_set;
new_set = object_state->osd_set;
}
else
{
for (int i = 0; i < pg.cur_set.size(); i++)
{
corrupted_set.push_back((pg_obj_loc_t){
new_set.push_back((pg_obj_loc_t){
.role = (pg.scheme == POOL_SCHEME_REPLICATED ? 0 : (uint64_t)i),
.osd_num = pg.cur_set[i],
});
}
}
// Mark object chunk(s) as corrupted
int changes = 0;
for (auto chunk_it = corrupted_set.begin(); chunk_it != corrupted_set.end(); )
{
auto & chunk = *chunk_it;
if (stripes[chunk.role].osd_num == chunk.osd_num)
{
if (stripes[chunk.role].not_exists)
{
changes++;
corrupted_set.erase(chunk_it, chunk_it+1);
continue;
}
if (stripes[chunk.role].read_error && chunk.loc_bad != LOC_CORRUPTED)
{
changes++;
chunk.loc_bad = LOC_CORRUPTED;
}
else if (stripes[chunk.role].read_end > 0 && !stripes[chunk.role].missing &&
(chunk.loc_bad & LOC_CORRUPTED))
{
changes++;
chunk.loc_bad &= ~LOC_CORRUPTED;
}
}
if (inconsistent && !chunk.loc_bad)
{
changes++;
chunk.loc_bad |= LOC_INCONSISTENT;
}
else if (!inconsistent && (chunk.loc_bad & LOC_INCONSISTENT))
{
changes++;
chunk.loc_bad &= ~LOC_INCONSISTENT;
}
chunk_it++;
}
int changes = calc_set(new_set);
if (!changes)
{
// No chunks newly marked as corrupted - object is already marked or moved
@ -379,7 +343,7 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os
deref_object_state(pg, &object_state, ref);
}
// Insert object into the new state and retry
object_state = add_object_to_set(pg, oid, corrupted_set, old_pg_state, 2);
object_state = add_object_to_set(pg, oid, new_set, old_pg_state, 2);
if (ref)
{
object_state->ref_count++;
@ -387,6 +351,76 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os
return object_state;
}
pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
osd_rmw_stripe_t *stripes, bool ref, bool inconsistent)
{
return mark_object(pg, oid, prev_object_state, ref, [stripes, inconsistent](pg_osd_set_t & new_set)
{
// Mark object chunk(s) as corrupted
int changes = 0;
for (auto chunk_it = new_set.begin(); chunk_it != new_set.end(); )
{
auto & chunk = *chunk_it;
if (stripes[chunk.role].osd_num == chunk.osd_num)
{
if (stripes[chunk.role].not_exists)
{
changes++;
new_set.erase(chunk_it, chunk_it+1);
continue;
}
if (stripes[chunk.role].read_error && chunk.loc_bad != LOC_CORRUPTED)
{
changes++;
chunk.loc_bad = LOC_CORRUPTED;
}
else if (stripes[chunk.role].read_end > 0 && !stripes[chunk.role].missing &&
(chunk.loc_bad & LOC_CORRUPTED))
{
changes++;
chunk.loc_bad &= ~LOC_CORRUPTED;
}
}
if (inconsistent && !chunk.loc_bad)
{
changes++;
chunk.loc_bad |= LOC_INCONSISTENT;
}
else if (!inconsistent && (chunk.loc_bad & LOC_INCONSISTENT))
{
changes++;
chunk.loc_bad &= ~LOC_INCONSISTENT;
}
chunk_it++;
}
return changes;
});
}
// Mark the object as partially updated (probably due to a ENOSPC)
pg_osd_set_state_t *osd_t::mark_partial_write(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
osd_rmw_stripe_t *stripes, bool ref)
{
return mark_object(pg, oid, prev_object_state, ref, [stripes](pg_osd_set_t & new_set)
{
// Mark object chunk(s) as outdated
int changes = 0;
for (auto chunk_it = new_set.begin(); chunk_it != new_set.end(); )
{
auto & chunk = *chunk_it;
if (stripes[chunk.role].osd_num == chunk.osd_num &&
stripes[chunk.role].read_error &&
chunk.loc_bad != LOC_OUTDATED)
{
changes++;
chunk.loc_bad = LOC_OUTDATED;
}
chunk_it++;
}
return changes;
});
}
pg_osd_set_state_t* osd_t::add_object_to_set(pg_t & pg, const object_id oid, const pg_osd_set_t & osd_set,
uint64_t old_pg_state, int log_at_level)
{

View File

@ -25,7 +25,7 @@ struct osd_primary_op_data_t
uint64_t target_ver;
uint64_t orig_ver = 0, fact_ver = 0;
uint64_t scheme = 0;
int n_subops = 0, done = 0, errors = 0, errcode = 0;
int n_subops = 0, done = 0, errors = 0, drops = 0, errcode = 0;
int degraded = 0, pg_size, pg_data_size;
osd_rmw_stripe_t *stripes;
osd_op_t *subops = NULL;

View File

@ -133,7 +133,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, const ui
zero_read = -1;
osd_op_t *subops = new osd_op_t[n_subops];
op_data->fact_ver = 0;
op_data->done = op_data->errors = op_data->errcode = 0;
op_data->done = op_data->errors = op_data->drops = op_data->errcode = 0;
op_data->n_subops = n_subops;
op_data->subops = subops;
int sent = submit_primary_subop_batch(submit_type, op_data->oid.inode, op_version, op_data->stripes, osd_set, cur_op, 0, zero_read);
@ -363,6 +363,13 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
memset(((osd_rmw_stripe_t*)subop->rmw_buf)->read_buf, 0, expected);
((osd_rmw_stripe_t*)subop->rmw_buf)->not_exists = true;
}
if (opcode == OSD_OP_SEC_READ && (retval == -EIO || retval == -EDOM) ||
opcode == OSD_OP_SEC_WRITE && retval != expected)
{
// We'll retry reads from other replica(s) on EIO/EDOM and mark object as corrupted
// And we'll mark write as failed
((osd_rmw_stripe_t*)subop->rmw_buf)->read_error = true;
}
if (retval == expected && (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE))
{
uint64_t version = subop->reply.sec_rw.version;
@ -404,14 +411,10 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
osd_op_names[opcode], subop->peer_fd, retval, expected
);
}
if (opcode == OSD_OP_SEC_READ && (retval == -EIO || retval == -EDOM))
{
// We'll retry reads from other replica(s) on EIO/EDOM and mark object as corrupted
((osd_rmw_stripe_t*)subop->rmw_buf)->read_error = true;
}
subop->rmw_buf = NULL;
// Error priority: ENOSPC and others > EIO > EDOM > EPIPE
// Error priority: ENOSPC > others > EIO > EDOM > EPIPE
if (op_data->errcode == 0 ||
retval == -ENOSPC && op_data->errcode != -ENOSPC ||
retval == -EIO && (op_data->errcode == -EDOM || op_data->errcode == -EPIPE) ||
retval == -EDOM && (op_data->errcode == -EPIPE) ||
retval != -EIO && retval != -EDOM && retval != -EPIPE)
@ -424,6 +427,7 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
(retval != -EIO || opcode != OSD_OP_SEC_READ))
{
// Drop connection on unexpected errors
op_data->drops++;
msgr.stop_client(subop->peer_fd);
}
}
@ -705,6 +709,96 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
}
}
void osd_t::submit_primary_rollback_subops(osd_op_t *cur_op, const uint64_t* osd_set)
{
osd_primary_op_data_t *op_data = cur_op->op_data;
osd_rmw_stripe_t *stripes = op_data->stripes;
assert(op_data->scheme != POOL_SCHEME_REPLICATED);
// Allocate subops
int n_subops = 0;
for (int role = 0; role < op_data->pg_size; role++)
{
if (osd_set[role] != 0 && !stripes[role].read_error &&
msgr.osd_peer_fds.find(osd_set[role]) != msgr.osd_peer_fds.end())
{
n_subops++;
}
}
op_data->n_subops = n_subops;
op_data->done = op_data->errors = 0;
if (!op_data->n_subops)
{
return;
}
op_data->subops = new osd_op_t[n_subops];
op_data->unstable_writes = new obj_ver_id[n_subops];
int i = 0;
for (int role = 0; role < op_data->pg_size; role++)
{
if (osd_set[role] != 0 && !stripes[role].read_error &&
msgr.osd_peer_fds.find(osd_set[role]) != msgr.osd_peer_fds.end())
{
osd_op_t *subop = &op_data->subops[i];
op_data->unstable_writes[i] = (obj_ver_id){
.oid = {
.inode = op_data->oid.inode,
.stripe = op_data->oid.stripe | role,
},
.version = op_data->target_ver-1,
};
if (osd_set[role] == this->osd_num)
{
clock_gettime(CLOCK_REALTIME, &subop->tv_begin);
subop->op_type = (uint64_t)cur_op;
subop->bs_op = new blockstore_op_t((blockstore_op_t){
.opcode = BS_OP_ROLLBACK,
.callback = [subop, this](blockstore_op_t *bs_subop)
{
handle_primary_bs_subop(subop);
},
{
.len = 1,
},
.buf = (void*)(op_data->unstable_writes + i),
});
#ifdef OSD_DEBUG
printf(
"Submit rollback to local: %jx:%jx v%ju\n",
op_data->oid.inode, op_data->oid.stripe | role, op_data->target_ver-1
);
#endif
bs->enqueue_op(subop->bs_op);
}
else
{
subop->op_type = OSD_OP_OUT;
subop->req = (osd_any_op_t){ .sec_stab = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = msgr.next_subop_id++,
.opcode = OSD_OP_SEC_ROLLBACK,
},
.len = sizeof(obj_ver_id),
} };
subop->iov.push_back(op_data->unstable_writes + i, sizeof(obj_ver_id));
subop->callback = [cur_op, this](osd_op_t *subop)
{
handle_primary_subop(subop, cur_op);
};
#ifdef OSD_DEBUG
printf(
"Submit rollback to osd %ju: %jx:%jx v%ju\n", osd_set[role],
op_data->oid.inode, op_data->oid.stripe | role, op_data->target_ver-1
);
#endif
subop->peer_fd = msgr.osd_peer_fds.at(osd_set[role]);
msgr.outbox_push(subop);
}
i++;
}
}
}
void osd_t::pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval)
{
auto st_it = pg.write_queue.find(oid), it = st_it;

View File

@ -49,6 +49,8 @@ void osd_t::continue_primary_write(osd_op_t *cur_op)
else if (op_data->st == 8) goto resume_8;
else if (op_data->st == 9) goto resume_9;
else if (op_data->st == 10) goto resume_10;
else if (op_data->st == 11) goto resume_11;
else if (op_data->st == 12) goto resume_12;
assert(op_data->st == 0);
if (!check_write_queue(cur_op, pg))
{
@ -259,11 +261,31 @@ resume_5:
}
if (op_data->errors > 0)
{
// FIXME: Handle ENOSPC. If one of the subops fail with ENOSPC here,
// Handle ENOSPC/EDOM/ERANGE/EIO. If some subops fail, but others succeed,
// next writes to the same object will also fail because they'll try
// to overwrite the same version number which will result in EEXIST.
// To fix it, we should mark the object as degraded for replicas,
// and rollback successful part updates in case of EC.
if (op_data->done > 0 && !op_data->drops)
{
if (op_data->scheme != POOL_SCHEME_REPLICATED)
{
submit_primary_rollback_subops(cur_op, op_data->prev_set);
resume_11:
op_data->st = 11;
return;
resume_12:
// Ignore ROLLBACK errors - submit_primary_subops will drop the connection if it fails
delete[] op_data->unstable_writes;
op_data->unstable_writes = NULL;
}
else
{
mark_partial_write(pg, op_data->oid, op_data->object_state, op_data->stripes, true);
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
return;
}
}
deref_object_state(pg, &op_data->object_state, true);
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
return;

View File

@ -62,6 +62,11 @@ TEST_NAME=csum_4k_dmj OSD_ARGS="--data_csum_type crc32c --inmemory_metadata fal
TEST_NAME=csum_4k_dj OSD_ARGS="--data_csum_type crc32c --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh
TEST_NAME=csum_4k OSD_ARGS="--data_csum_type crc32c" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh
./test_enospc.sh
SCHEME=xor ./test_enospc.sh
IMMEDIATE_COMMIT=1 ./test_enospc.sh
IMMEDIATE_COMMIT=1 SCHEME=xor ./test_enospc.sh
./test_scrub.sh
ZERO_OSD=2 ./test_scrub.sh
SCHEME=xor ./test_scrub.sh

30
tests/test_enospc.sh Executable file
View File

@ -0,0 +1,30 @@
#!/bin/bash
# Test ENOSPC (out of space) handling: overfill a small 3-OSD cluster and
# expect writes to fail with ENOSPC instead of hanging (client-side retries
# are disabled below), then verify that smaller writes still succeed — i.e.
# the partially completed writes were rolled back / marked correctly.
#
# Shell options are set here rather than in the shebang so they also take
# effect when the script is invoked as `bash test_enospc.sh`.
set -ex

OSD_SIZE=200
# Disable client ENOSPC retries so the error propagates to fio instead of
# the client blocking and waiting for space to be freed.
GLOBAL_CONFIG=',"client_retry_enospc":false'

# Brings up etcd + 3 OSDs and defines format_error / format_green helpers.
. "$(dirname "$0")"/run_3osds.sh

export LD_PRELOAD="build/src/libfio_vitastor.so"

# Should fail with ENOSPC: a 500M image can't fit into the cluster
if fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=1M -direct=1 -iodepth=4 \
    -rw=write -etcd="$ETCD_URL" -pool=1 -inode=1 -size=500M -cluster_log_level=10; then
    format_error "Should get ENOSPC, but didn't"
fi

# Should fail with ENOSPC too (the idea is to try to overwrite first objects to check their rollback)
if fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=1M -direct=1 -iodepth=32 \
    -rw=write -etcd="$ETCD_URL" -pool=1 -inode=1 -size=500M -cluster_log_level=10; then
    format_error "Should get ENOSPC, but didn't"
fi

# Should complete OK: 4k random writes over the first 100M fit into the cluster
if ! fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=4 \
    -rw=randwrite -etcd="$ETCD_URL" -pool=1 -inode=1 -size=100M -cluster_log_level=10 -number_ios=4096; then
    format_error "Should do random writes over ENOSPC correctly, but got an error"
fi

export -n LD_PRELOAD
format_green OK