Add up_wait_retry_interval to config and fix it so it actually works

2020-09-05 22:05:21 +03:00 · 2020-09-05 22:05:21 +03:00 · 73e26dbbea
parent 44973e7f27
commit 73e26dbbea
3 changed files with 41 additions and 17 deletions
--- a/cluster_client.cpp
+++ b/cluster_client.cpp
@ -101,16 +101,22 @@ void cluster_client_t::stop()
    }
 }
-void cluster_client_t::continue_ops()
+void cluster_client_t::continue_ops(bool up_retry)
 {
    // Walks cur_ops and resumes operations by calling continue_rw() on them.
    // Operations flagged with up_wait are resumed only when up_retry is true
    // (their flag is cleared first); otherwise they are left untouched.
    // Any pending retry timer is cancelled before the walk.
    if (retry_timeout_id)
    {
        tfd->clear_timer(retry_timeout_id);
        retry_timeout_id = 0;
    }
    for (auto op_it = cur_ops.begin(); op_it != cur_ops.end(); )
    {
        // The iterator is post-incremented in the call expression, so it has
        // already moved on by the time continue_rw() runs.
        // NOTE(review): presumably because continue_rw() may remove the
        // operation from cur_ops - confirm against continue_rw().
-        continue_rw(*op_it++);
+        if ((*op_it)->up_wait)
        {
            if (up_retry)
            {
                // Retry pass: clear the wait flag and resume the operation
                (*op_it)->up_wait = false;
                continue_rw(*op_it++);
            }
            else
                // Not a retry pass: skip the waiting operation
                op_it++;
        }
        else
            continue_rw(*op_it++);
    }
 }
@ -173,6 +179,15 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & config)
    {
        client_dirty_limit = DEFAULT_CLIENT_DIRTY_LIMIT;
    }
    up_wait_retry_interval = config["up_wait_retry_interval"].uint64_value();
    if (!up_wait_retry_interval)
    {
        up_wait_retry_interval = 500;
    }
    else if (up_wait_retry_interval < 50)
    {
        up_wait_retry_interval = 50;
    }
    msgr.peer_connect_interval = config["peer_connect_interval"].uint64_value();
    if (!msgr.peer_connect_interval)
    {
@ -696,9 +711,17 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
            part->osd_num, part->op.reply.hdr.retval, expected
        );
        msgr.stop_client(part->op.peer_fd);
-        if (part->op.reply.hdr.retval && !retry_timeout_id)
+        if (part->op.reply.hdr.retval == -EPIPE)
        {
-            retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int) { retry_timeout_id = 0; continue_ops(); });
+            op->up_wait = true;
            if (!retry_timeout_id)
            {
                retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
                {
                    retry_timeout_id = 0;
                    continue_ops(true);
                });
            }
        }
        if (!op->retval || op->retval == -EPIPE)
        {
--- a/cluster_client.h
+++ b/cluster_client.h
@ -40,6 +40,7 @@ protected:
    cluster_op_t *orig_op = NULL;
    bool is_internal = false;
    bool needs_reslice = false;
    bool up_wait = false;
    int sent_count = 0, done_count = 0;
    std::vector<cluster_op_part_t> parts;
    friend class cluster_client_t;
@ -59,7 +60,6 @@ class cluster_client_t
    // FIXME: Implement inmemory_commit mode. Note that it requires returning overlapping reads from memory.
    uint64_t client_dirty_limit = 0;
    int log_level;
    // FIXME: Put up_wait_retry_interval into config and fix it so it could actually work
    int up_wait_retry_interval = 500; // ms
    uint64_t op_id = 1;
@ -85,7 +85,7 @@ public:
    void stop();
 protected:
-    void continue_ops();
+    void continue_ops(bool up_retry = false);
    void on_load_config_hook(json11::Json::object & config);
    void on_load_pgs_hook(bool success);
    void on_change_hook(json11::Json::object & changes);
--- a/lp/mon.js
+++ b/lp/mon.js
@ -30,11 +30,11 @@ class Mon
            /* global: {
                // mon
                etcd_mon_ttl: 30, // min: 10
-                etcd_mon_timeout: 1000, // min: 0
+                etcd_mon_timeout: 1000, // ms. min: 0
                etcd_mon_retries: 5, // min: 0
-                mon_change_timeout: 1000, // min: 100
+                mon_change_timeout: 1000, // ms. min: 100
-                mon_stats_timeout: 1000, // min: 100
+                mon_stats_timeout: 1000, // ms. min: 100
-                osd_out_time: 1800, // min: 0
+                osd_out_time: 1800, // seconds. min: 0
                placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
                // client and osd
                use_sync_send_recv: false,
@ -45,8 +45,9 @@ class Mon
                pg_stripe_size: 4194304,
                immediate_commit: false, // 'all' or 'small'
                client_dirty_limit: 33554432,
-                peer_connect_interval: 5,
+                peer_connect_interval: 5, // seconds. min: 1
-                peer_connect_timeout: 5,
+                peer_connect_timeout: 5, // seconds. min: 1
                up_wait_retry_interval: 500, // ms. min: 50
                // osd
                etcd_report_interval: 30, // min: 10
                run_primary: true,