diff --git a/mon/mon.js b/mon/mon.js
index 963d5435..3d8aed55 100644
--- a/mon/mon.js
+++ b/mon/mon.js
@@ -87,6 +87,7 @@ const etcd_tree = {
             bind_address: "0.0.0.0",
             bind_port: 0,
             autosync_interval: 5,
+            autosync_writes: 128,
             client_queue_depth: 128, // unused
             recovery_queue_depth: 4,
             recovery_sync_batch: 16,
diff --git a/src/blockstore.cpp b/src/blockstore.cpp
index ff85f6c5..e014eb7a 100644
--- a/src/blockstore.cpp
+++ b/src/blockstore.cpp
@@ -68,6 +68,11 @@ uint64_t blockstore_t::get_free_block_count()
     return impl->get_free_block_count();
 }
 
+uint64_t blockstore_t::get_journal_size()
+{
+    return impl->get_journal_size(); // thin forwarder to the implementation object
+}
+
 uint32_t blockstore_t::get_bitmap_granularity()
 {
     return impl->get_bitmap_granularity();
diff --git a/src/blockstore.h b/src/blockstore.h
index 91933e61..46930543 100644
--- a/src/blockstore.h
+++ b/src/blockstore.h
@@ -194,5 +194,7 @@ public:
     uint64_t get_block_count();
     uint64_t get_free_block_count();
 
+    uint64_t get_journal_size();
+
     uint32_t get_bitmap_granularity();
 };
diff --git a/src/blockstore_impl.h b/src/blockstore_impl.h
index 4179010d..51ac5b0d 100644
--- a/src/blockstore_impl.h
+++ b/src/blockstore_impl.h
@@ -368,4 +368,5 @@ public:
     inline uint64_t get_block_count() { return block_count; }
     inline uint64_t get_free_block_count() { return data_alloc->get_free_count(); }
     inline uint32_t get_bitmap_granularity() { return disk_alignment; }
+    inline uint64_t get_journal_size() { return journal.len; }
 };
diff --git a/src/osd.cpp b/src/osd.cpp
index 296d6801..239f29c2 100644
--- a/src/osd.cpp
+++ b/src/osd.cpp
@@ -45,6 +45,12 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
     // FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config
     auto bs_cfg = json_to_bs(this->config);
     this->bs = new blockstore_t(bs_cfg, ringloop, tfd);
+    {
+        // Autosync based on the number of unstable writes to prevent stalls due to insufficient journal space
+        uint64_t max_autosync = bs->get_journal_size() / bs->get_block_size() / 2; // journal size / block size, halved
+        if (autosync_writes > max_autosync)
+            autosync_writes = max_autosync;
+    }
 
     this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
     {
@@ -123,6 +129,11 @@ void osd_t::parse_config(const json11::Json & config)
         if (autosync_interval > MAX_AUTOSYNC_INTERVAL)
             autosync_interval = DEFAULT_AUTOSYNC_INTERVAL;
     }
+    if (!config["autosync_writes"].is_null())
+    {
+        // 0 is allowed: the write path's `unstable_write_count >= autosync_writes` check is then always true
+        autosync_writes = config["autosync_writes"].uint64_value();
+    }
     if (!config["client_queue_depth"].is_null())
     {
         client_queue_depth = config["client_queue_depth"].uint64_value();
diff --git a/src/osd.h b/src/osd.h
index c8c5f44b..e3071e8e 100644
--- a/src/osd.h
+++ b/src/osd.h
@@ -35,6 +35,7 @@
 
 #define MAX_AUTOSYNC_INTERVAL 3600
 #define DEFAULT_AUTOSYNC_INTERVAL 5
+#define DEFAULT_AUTOSYNC_WRITES 128
 #define MAX_RECOVERY_QUEUE 2048
 #define DEFAULT_RECOVERY_QUEUE 4
 #define DEFAULT_RECOVERY_BATCH 16
@@ -108,7 +109,8 @@ class osd_t
     int print_stats_interval = 3;
     int slow_log_interval = 10;
     int immediate_commit = IMMEDIATE_NONE;
-    int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // sync every 5 seconds
+    int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // "emergency" sync every 5 seconds
+    uint64_t autosync_writes = DEFAULT_AUTOSYNC_WRITES; // autosync() threshold; unsigned because it is only compared with uint64_t values
     int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
     int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
     int log_level = 0;
@@ -140,6 +142,7 @@ class osd_t
     osd_op_t *autosync_op = NULL;
 
     // Unstable writes
+    uint64_t unstable_write_count = 0; // incremented on the lazy write path, reset once it reaches autosync_writes
     std::map unstable_writes;
     std::deque syncs_in_progress;
 
diff --git a/src/osd_primary_subops.cpp b/src/osd_primary_subops.cpp
index 9b8eff08..56bd6f0f 100644
--- a/src/osd_primary_subops.cpp
+++ b/src/osd_primary_subops.cpp
@@ -5,8 +5,6 @@
 
 void osd_t::autosync()
 {
-    // FIXME Autosync based on the number of unstable writes to prevent
-    // "journal_sector_buffer_count is too low for this batch" errors
     if (immediate_commit != IMMEDIATE_ALL && !autosync_op)
     {
         autosync_op = new osd_op_t();
diff --git a/src/osd_primary_write.cpp b/src/osd_primary_write.cpp
index ad111d4e..ee8d9c77 100644
--- a/src/osd_primary_write.cpp
+++ b/src/osd_primary_write.cpp
@@ -274,6 +274,11 @@ continue_others:
     }
     // finish_op would invalidate next_it if it cleared pg.write_queue, but it doesn't do that :)
     finish_op(cur_op, cur_op->reply.hdr.retval);
+    if (unstable_write_count >= autosync_writes)
+    {
+        unstable_write_count = 0; // reset before autosync(), which does nothing while autosync_op is set
+        autosync();
+    }
     if (next_op)
     {
         // Continue next write to the same object
@@ -353,6 +358,7 @@ resume_7:
     else
     {
     lazy:
+        unstable_write_count++;
         if (op_data->scheme != POOL_SCHEME_REPLICATED)
         {
             // Remember version as unstable for EC/XOR
diff --git a/tests/test_write.sh b/tests/test_write.sh
index b41b9f57..6c561497 100755
--- a/tests/test_write.sh
+++ b/tests/test_write.sh
@@ -5,6 +5,12 @@
 #LD_PRELOAD=libasan.so.5 \
 #    fio -thread -name=test -ioengine=build/src/libfio_vitastor_sec.so -bs=4k -fsync=128 `$ETCDCTL get /vitastor/osd/state/1 --print-value-only | jq -r '"-host="+.addresses[0]+" -port="+(.port|tostring)'` -rw=write -size=32M
 
+# Random writes without immediate_commit were stalling OSDs
+
+LD_PRELOAD=libasan.so.5 \
+    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=124k -direct=1 -numjobs=16 -iodepth=4 \
+        -rw=randwrite -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -runtime=10
+
 # A lot of parallel syncs was crashing the primary OSD at some point
 
 LD_PRELOAD=libasan.so.5 \