Compare commits

..

33 Commits

Author SHA1 Message Date
Vitaliy Filippov c64695b0e2 Fix eviction when random_pos selects the end
Test / test_move_reappear (push) Has been cancelled Details
Test / test_rm (push) Has been cancelled Details
Test / test_snapshot_chain (push) Has been cancelled Details
Test / build (push) Has been cancelled Details
Test / test_snapshot_chain_ec (push) Has been cancelled Details
Test / test_snapshot_down (push) Has been cancelled Details
Test / test_snapshot_down_ec (push) Has been cancelled Details
Test / test_splitbrain (push) Has been cancelled Details
Test / test_rebalance_verify (push) Has been cancelled Details
Test / test_rebalance_verify_imm (push) Has been cancelled Details
Test / test_rebalance_verify_ec (push) Has been cancelled Details
Test / test_rebalance_verify_ec_imm (push) Has been cancelled Details
Test / test_switch_primary (push) Has been cancelled Details
Test / test_write (push) Has been cancelled Details
Test / test_write_xor (push) Has been cancelled Details
Test / test_write_no_same (push) Has been cancelled Details
Test / test_heal_pg_size_2 (push) Has been cancelled Details
Test / test_heal_ec (push) Has been cancelled Details
Test / test_heal_csum_32k_dmj (push) Has been cancelled Details
Test / test_heal_csum_32k_dj (push) Has been cancelled Details
Test / test_heal_csum_32k (push) Has been cancelled Details
Test / test_heal_csum_4k_dmj (push) Has been cancelled Details
Test / test_heal_csum_4k_dj (push) Has been cancelled Details
Test / test_heal_csum_4k (push) Has been cancelled Details
Test / test_scrub (push) Has been cancelled Details
Test / test_scrub_zero_osd_2 (push) Has been cancelled Details
Test / test_scrub_xor (push) Has been cancelled Details
Test / test_scrub_pg_size_3 (push) Has been cancelled Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Has been cancelled Details
Test / test_scrub_ec (push) Has been cancelled Details
2024-01-28 14:59:56 +03:00
Vitaliy Filippov a1cdb4f556 Implement min/max list_count to make listings during performance test reasonable 2024-01-28 14:59:56 +03:00
Vitaliy Filippov ca2aa8da6a Fix and improve parallel allocation
- Do not try to allocate more DB blocks in an inode block until it's "confirmed" and "locked" by the first write
- Do not recheck for new zero DB blocks on first write into an inode block - a CAS failure means someone else is already writing into it
- Throw new allocation blocks away regardless of whether the known_version is 0 on a CAS failure
2024-01-28 14:59:56 +03:00
Vitaliy Filippov 1afcbef016 Implement key_prefix for K/V stress test 2024-01-28 14:59:56 +03:00
Vitaliy Filippov 52db381a82 More fixes
- do not overwrite a block with older version if known version is newer
  (read may start before update and end after update)
- invalidated block versions can't be remembered and trusted
- right boundary for split blocks is right_half when diving down, not key_lt
- restart update also when block is "invalidated", not just on version mismatch
- copy callback in listings to avoid closure destruction bugs too
2024-01-28 14:59:56 +03:00
Vitaliy Filippov 06b730b22b Add logging and one more assert 2024-01-28 14:59:56 +03:00
Vitaliy Filippov 4ddd0003c1 Make get_block() wait for updating when unrelated block is found along the path 2024-01-28 14:59:56 +03:00
Vitaliy Filippov 78f101a0e8 Fix a race condition where changed blocks were parsed over existing cached blocks, producing a mix of old and new data 2024-01-28 14:59:56 +03:00
Vitaliy Filippov 502b5e791f Simplify code by removing an unneeded "optimisation" 2024-01-28 14:59:56 +03:00
Vitaliy Filippov 808d8cddc9 Add kv_log_level, print warnings on level 1, trace ops on level 10 2024-01-28 14:59:56 +03:00
Vitaliy Filippov b517949d1e Fix duplicate keys in listings on parallel updates -- do not rewind key "iterator position" 2024-01-28 14:59:56 +03:00
Vitaliy Filippov 818b5198c8 Implement key suffix to avoid collisions of multiple test workers 2024-01-28 14:59:56 +03:00
Vitaliy Filippov b15a48ac4f Do not complain on empty first block 2024-01-28 14:59:56 +03:00
Vitaliy Filippov 3e6bb1c1fc Add JSON output for stress-tester 2024-01-28 14:59:56 +03:00
Vitaliy Filippov c489c0b545 Print total stats 2024-01-28 14:59:56 +03:00
Vitaliy Filippov cffbb6b1f7 Do not send more than op_count operations (fix segfault on finish) 2024-01-28 14:59:56 +03:00
Vitaliy Filippov a188daea0e Add some more resiliency to serialize() 2024-01-28 14:59:56 +03:00
Vitaliy Filippov 3f09ed22ef Invalidate blocks being updated too 2024-01-28 14:59:56 +03:00
Vitaliy Filippov 094f75b221 Change new block allocation method: make each writer choose multiple empty PG blocks and place blocks in them 2024-01-28 14:59:56 +03:00
Vitaliy Filippov 774f2addd8 Remove blocks from cache on unsuccessful updates 2024-01-28 14:59:56 +03:00
Vitaliy Filippov e479bbb99f Allow tracking multiple updates per block (it should never happen though) 2024-01-28 14:59:56 +03:00
Vitaliy Filippov ba7dd05be9 Do not call stop_updating after failed write_new_block and after clear_block (both delete the item) 2024-01-28 14:59:56 +03:00
Vitaliy Filippov 98a71f4862 Track versions of parent blocks and recheck if changed during update 2024-01-28 14:59:56 +03:00
Vitaliy Filippov 6af84ffec1 Fix resume_split condition (key_lt can also be "") 2024-01-28 14:59:56 +03:00
Vitaliy Filippov db96d19965 Experiment: transform offsets for better sharding 2024-01-28 14:59:56 +03:00
Vitaliy Filippov 88d8905c0b More post-stress-test fixes
- Prevent _split types of new blocks
- Stop updating new blocks only after the whole update, otherwise pointers
  may become invalid
- Use recheck_none for updates initially
- Use UINT64_MAX as initial block version when postponing ops, otherwise the
  check fails when the block is initially empty. This for example leads to
  writing both leaf items & block pointers (which is incorrect) into the root
  block when starting stress-test with --parallelism 32
- Fix -EINTR comparison
2024-01-28 14:59:56 +03:00
Vitaliy Filippov 4d64ea3be3 Print operation statistics 2024-01-28 14:59:56 +03:00
Vitaliy Filippov 4163e4f81e K/V fixes after stress-test :-)
- track block versions correctly - per inode block (128kb) instead of tree block (4kb)
- prevent multiple parallel CAS writes of the same inode block
- add logging for EILSEQ which means invalid data in the tree
- fix get_block updated flag which was true for blocks already in cache and was leading to infinite loops on "unrelated block" errors
- apply changes to blocks in cache only after successful writes (using "virtual changes")
- do not replace cached block with an older version from disk
- recheck "unrelated blocks" (read/update collisions) until data stops changing
- track tree path correctly - do not treat split block as parent of its right half
- correctly move blocks when finding new empty place on disk
- restart updates from the beginning when one of blocks is changed by a parallel update
- fix delete, which was using the SET opcode and setting the key to an empty value instead of removing it
- prevent changing the same key more than 1 time in parallel
- fix listing verification
- resume continue_updates in update_find (required because it uses continue_update itself)
- add allow_old_cached parameter to get()
2024-01-28 14:59:56 +03:00
Vitaliy Filippov ec19561664 Implement K/V DB stress tester 2024-01-28 14:59:56 +03:00
Vitaliy Filippov 9d34b71cb4 Evict blocks based on memory limit & block usage 2024-01-28 14:59:56 +03:00
Vitaliy Filippov 3ccf9b53e5 Track blocks per level 2024-01-28 14:59:56 +03:00
Vitaliy Filippov cc510e8712 Track block level 2024-01-28 14:59:56 +03:00
Vitaliy Filippov cd98575ed4 Experimental B-Tree Vitastor embedded K/V database implementation! 2024-01-28 14:59:56 +03:00
7 changed files with 13 additions and 69 deletions

View File

@ -1,7 +1,7 @@
- name: etcd_mon_ttl
type: sec
min: 5
default: 1
min: 10
default: 30
info: Monitor etcd lease refresh interval in seconds
info_ru: Интервал обновления etcd резервации (lease) монитором
- name: etcd_mon_timeout

View File

@ -55,7 +55,7 @@ const etcd_tree = {
// etcd connection - configurable online
etcd_address: "10.0.115.10:2379/v3",
// mon
etcd_mon_ttl: 5, // min: 1
etcd_mon_ttl: 30, // min: 10
etcd_mon_timeout: 1000, // ms. min: 0
etcd_mon_retries: 5, // min: 0
mon_change_timeout: 1000, // ms. min: 100
@ -480,10 +480,10 @@ class Mon
check_config()
{
this.config.etcd_mon_ttl = Number(this.config.etcd_mon_ttl) || 5;
if (this.config.etcd_mon_ttl < 1)
this.config.etcd_mon_ttl = Number(this.config.etcd_mon_ttl) || 30;
if (this.config.etcd_mon_ttl < 10)
{
this.config.etcd_mon_ttl = 1;
this.config.etcd_mon_ttl = 10;
}
this.config.etcd_mon_timeout = Number(this.config.etcd_mon_timeout) || 0;
if (this.config.etcd_mon_timeout <= 0)
@ -794,7 +794,7 @@ class Mon
{
this.failconnect('Lease expired');
}
}, this.config.etcd_mon_ttl*1000);
}, this.config.etcd_mon_timeout);
if (!this.signals_set)
{
process.on('SIGINT', this.on_stop_cb);

View File

@ -19,7 +19,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
if (config["autosync_writes"] != "")
if (config.find("autosync_writes") != config.end())
{
autosync_writes = strtoull(config["autosync_writes"].c_str(), NULL, 10);
}

View File

@ -17,7 +17,6 @@ struct rm_pg_t
uint64_t obj_count = 0, obj_done = 0;
int state = 0;
int in_flight = 0;
bool synced = false;
};
struct rm_inode_t
@ -49,7 +48,6 @@ struct rm_inode_t
.objects = objects,
.obj_count = objects.size(),
.obj_done = 0,
.synced = parent->cli->get_immediate_commit(inode),
});
if (min_offset == 0)
{
@ -153,37 +151,6 @@ struct rm_inode_t
}
cur_list->obj_pos++;
}
if (cur_list->in_flight == 0 && cur_list->obj_pos == cur_list->objects.end() &&
!cur_list->synced)
{
osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT;
op->peer_fd = parent->cli->msgr.osd_peer_fds.at(cur_list->rm_osd_num);
op->req = (osd_any_op_t){
.sync = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = parent->cli->next_op_id(),
.opcode = OSD_OP_SYNC,
},
},
};
op->callback = [this, cur_list](osd_op_t *op)
{
cur_list->in_flight--;
cur_list->synced = true;
if (op->reply.hdr.retval < 0)
{
fprintf(stderr, "Failed to sync OSD %lu (retval=%ld)\n",
cur_list->rm_osd_num, op->reply.hdr.retval);
error_count++;
}
delete op;
continue_delete();
};
cur_list->in_flight++;
parent->cli->msgr.outbox_push(op);
}
}
void continue_delete()
@ -194,8 +161,7 @@ struct rm_inode_t
}
for (int i = 0; i < lists.size(); i++)
{
if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end() &&
lists[i]->synced)
if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end())
{
delete lists[i];
lists.erase(lists.begin()+i, lists.begin()+i+1);
@ -221,7 +187,7 @@ struct rm_inode_t
{
fprintf(stderr, "\n");
}
if (parent->progress && (total_done < total_count || inactive_osds.size() > 0 || error_count > 0))
if (parent->progress && (total_done < total_count || inactive_osds.size() > 0))
{
fprintf(
stderr, "Warning: Pool:%u,ID:%lu inode data may not have been fully removed.\n"

View File

@ -22,7 +22,7 @@ static blockstore_config_t json_to_bs(const json11::Json::object & config)
{
if (kv.second.is_string())
bs[kv.first] = kv.second.string_value();
else if (!kv.second.is_null())
else
bs[kv.first] = kv.second.dump();
}
return bs;
@ -194,8 +194,7 @@ void osd_t::parse_config(bool init)
if (autosync_interval > MAX_AUTOSYNC_INTERVAL)
autosync_interval = DEFAULT_AUTOSYNC_INTERVAL;
}
if (config["autosync_writes"].is_number() ||
config["autosync_writes"].string_value() != "")
if (!config["autosync_writes"].is_null())
{
// Allow to set it to 0
autosync_writes = config["autosync_writes"].uint64_value();

View File

@ -262,8 +262,7 @@ void osd_t::report_statistics()
for (auto st_it = inode_stats.begin(); st_it != inode_stats.end(); )
{
auto & kv = *st_it;
auto spc_it = bs_inode_space.find(kv.first);
if (spc_it == bs_inode_space.end() || !spc_it->second) // prevent autovivification
if (!bs_inode_space[kv.first])
{
// Is it an empty inode?
if (!tv_now.tv_sec)

View File

@ -706,26 +706,6 @@ resume_5:
remove_object_from_state(op_data->oid, &op_data->object_state, pg);
deref_object_state(pg, &op_data->object_state, true);
}
// Mark PG and OSDs as dirty
for (auto & chunk: (op_data->object_state ? op_data->object_state->osd_set : pg.cur_loc_set))
{
this->dirty_osds.insert(chunk.osd_num);
}
for (auto cl_it = msgr.clients.find(cur_op->peer_fd); cl_it != msgr.clients.end(); )
{
cl_it->second->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
break;
}
dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
if (immediate_commit == IMMEDIATE_NONE)
{
unstable_write_count++;
if (unstable_write_count >= autosync_writes)
{
unstable_write_count = 0;
autosync();
}
}
pg.total_count--;
cur_op->reply.hdr.retval = 0;
continue_others: