From 6bf1f539a6b38e8a596f0fcdbf3c8452fddad8b8 Mon Sep 17 00:00:00 2001
From: Vitaliy Filippov <vitalif@yourcmc.ru>
Date: Wed, 28 Feb 2024 00:51:13 +0300
Subject: [PATCH] Add EIO retry timeout and allow to disable these retries,
 rename up_wait_retry_interval to client_retry_interval

---
 docs/config/client.en.md    | 22 +++++++++
 docs/config/client.ru.md    | 23 ++++++++++
 docs/config/network.en.md   | 12 -----
 docs/config/network.ru.md   | 14 ------
 docs/config/src/client.yml  | 24 ++++++++++
 docs/config/src/network.yml | 15 -------
 mon/mon.js                  |  3 +-
 src/cluster_client.cpp      | 90 +++++++++++++++++++++++--------------
 src/cluster_client.h        |  9 ++--
 9 files changed, 134 insertions(+), 78 deletions(-)

diff --git a/docs/config/client.en.md b/docs/config/client.en.md
index 0cd2420e..149f48ba 100644
--- a/docs/config/client.en.md
+++ b/docs/config/client.en.md
@@ -9,6 +9,8 @@
 These parameters apply only to Vitastor clients (QEMU, fio, NBD and so on) and
 affect their interaction with the cluster.
 
+- [client_retry_interval](#client_retry_interval)
+- [client_eio_retry_interval](#client_eio_retry_interval)
 - [client_max_dirty_bytes](#client_max_dirty_bytes)
 - [client_max_dirty_ops](#client_max_dirty_ops)
 - [client_enable_writeback](#client_enable_writeback)
@@ -19,6 +21,26 @@ affect their interaction with the cluster.
 - [nbd_max_devices](#nbd_max_devices)
 - [nbd_max_part](#nbd_max_part)
 
+## client_retry_interval
+
+- Type: milliseconds
+- Default: 50
+- Minimum: 10
+- Can be changed online: yes
+
+Retry interval for I/O requests that failed due to inactive PGs or network
+connectivity errors.
+
+## client_eio_retry_interval
+
+- Type: milliseconds
+- Default: 1000
+- Can be changed online: yes
+
+Retry interval for I/O requests that failed due to data corruption or unfinished
+EC object deletions (has_incomplete PG state). 0 disables such retries, so
+clients are not blocked and simply get an EIO error code instead.
+
 ## client_max_dirty_bytes
 
 - Type: integer
diff --git a/docs/config/client.ru.md b/docs/config/client.ru.md
index bc96e537..e714fe31 100644
--- a/docs/config/client.ru.md
+++ b/docs/config/client.ru.md
@@ -9,6 +9,8 @@
 Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD и т.п.) и
 затрагивают логику их работы с кластером.
 
+- [client_retry_interval](#client_retry_interval)
+- [client_eio_retry_interval](#client_eio_retry_interval)
 - [client_max_dirty_bytes](#client_max_dirty_bytes)
 - [client_max_dirty_ops](#client_max_dirty_ops)
 - [client_enable_writeback](#client_enable_writeback)
@@ -19,6 +21,27 @@
 - [nbd_max_devices](#nbd_max_devices)
 - [nbd_max_part](#nbd_max_part)
 
+## client_retry_interval
+
+- Тип: миллисекунды
+- Значение по умолчанию: 50
+- Минимальное значение: 10
+- Можно менять на лету: да
+
+Время повтора запросов ввода-вывода, неудачных из-за неактивных PG или
+ошибок сети.
+
+## client_eio_retry_interval
+
+- Тип: миллисекунды
+- Значение по умолчанию: 1000
+- Можно менять на лету: да
+
+Время повтора запросов ввода-вывода, неудачных из-за повреждения данных
+или незавершённых удалений EC-объектов (состояния PG has_incomplete).
+0 отключает повторы таких запросов и клиенты не блокируются, а вместо
+этого просто получают код ошибки EIO.
+
 ## client_max_dirty_bytes
 
 - Тип: целое число
diff --git a/docs/config/network.en.md b/docs/config/network.en.md
index a28464ee..85ffd1a5 100644
--- a/docs/config/network.en.md
+++ b/docs/config/network.en.md
@@ -25,7 +25,6 @@ between clients, OSDs and etcd.
 - [peer_connect_timeout](#peer_connect_timeout)
 - [osd_idle_timeout](#osd_idle_timeout)
 - [osd_ping_timeout](#osd_ping_timeout)
-- [up_wait_retry_interval](#up_wait_retry_interval)
 - [max_etcd_attempts](#max_etcd_attempts)
 - [etcd_quick_timeout](#etcd_quick_timeout)
 - [etcd_slow_timeout](#etcd_slow_timeout)
@@ -212,17 +211,6 @@ Maximum time to wait for OSD keepalive responses. If an OSD doesn't respond
 within this time, the connection to it is dropped and a reconnection attempt
 is scheduled.
 
-## up_wait_retry_interval
-
-- Type: milliseconds
-- Default: 50
-- Minimum: 10
-- Can be changed online: yes
-
-OSDs respond to clients with a special error code when they receive I/O
-requests for a PG that's not synchronized and started. This parameter sets
-the time for the clients to wait before re-attempting such I/O requests.
-
 ## max_etcd_attempts
 
 - Type: integer
diff --git a/docs/config/network.ru.md b/docs/config/network.ru.md
index 1d3ceaa0..f97d7c9f 100644
--- a/docs/config/network.ru.md
+++ b/docs/config/network.ru.md
@@ -25,7 +25,6 @@
 - [peer_connect_timeout](#peer_connect_timeout)
 - [osd_idle_timeout](#osd_idle_timeout)
 - [osd_ping_timeout](#osd_ping_timeout)
-- [up_wait_retry_interval](#up_wait_retry_interval)
 - [max_etcd_attempts](#max_etcd_attempts)
 - [etcd_quick_timeout](#etcd_quick_timeout)
 - [etcd_slow_timeout](#etcd_slow_timeout)
@@ -221,19 +220,6 @@ OSD в любом случае согласовывают реальное зн
 Если OSD не отвечает за это время, соединение отключается и производится
 повторная попытка соединения.
 
-## up_wait_retry_interval
-
-- Тип: миллисекунды
-- Значение по умолчанию: 50
-- Минимальное значение: 10
-- Можно менять на лету: да
-
-Когда OSD получают от клиентов запросы ввода-вывода, относящиеся к не
-поднятым на данный момент на них PG, либо к PG в процессе синхронизации,
-они отвечают клиентам специальным кодом ошибки, означающим, что клиент
-должен некоторое время подождать перед повторением запроса. Именно это время
-ожидания задаёт данный параметр.
-
 ## max_etcd_attempts
 
 - Тип: целое число
diff --git a/docs/config/src/client.yml b/docs/config/src/client.yml
index 3bebd783..7ca579cb 100644
--- a/docs/config/src/client.yml
+++ b/docs/config/src/client.yml
@@ -1,3 +1,27 @@
+- name: client_retry_interval
+  type: ms
+  min: 10
+  default: 50
+  online: true
+  info: |
+    Retry interval for I/O requests that failed due to inactive PGs or network
+    connectivity errors.
+  info_ru: |
+    Время повтора запросов ввода-вывода, неудачных из-за неактивных PG или
+    ошибок сети.
+- name: client_eio_retry_interval
+  type: ms
+  default: 1000
+  online: true
+  info: |
+    Retry interval for I/O requests that failed due to data corruption or unfinished
+    EC object deletions (has_incomplete PG state). 0 disables such retries, so
+    clients are not blocked and simply get an EIO error code instead.
+  info_ru: |
+    Время повтора запросов ввода-вывода, неудачных из-за повреждения данных
+    или незавершённых удалений EC-объектов (состояния PG has_incomplete).
+    0 отключает повторы таких запросов и клиенты не блокируются, а вместо
+    этого просто получают код ошибки EIO.
 - name: client_max_dirty_bytes
   type: int
   default: 33554432
diff --git a/docs/config/src/network.yml b/docs/config/src/network.yml
index 5bd2c808..ea0c7438 100644
--- a/docs/config/src/network.yml
+++ b/docs/config/src/network.yml
@@ -243,21 +243,6 @@
     Максимальное время ожидания ответа на запрос проверки состояния соединения.
     Если OSD не отвечает за это время, соединение отключается и производится
     повторная попытка соединения.
-- name: up_wait_retry_interval
-  type: ms
-  min: 10
-  default: 50
-  online: true
-  info: |
-    OSDs respond to clients with a special error code when they receive I/O
-    requests for a PG that's not synchronized and started. This parameter sets
-    the time for the clients to wait before re-attempting such I/O requests.
-  info_ru: |
-    Когда OSD получают от клиентов запросы ввода-вывода, относящиеся к не
-    поднятым на данный момент на них PG, либо к PG в процессе синхронизации,
-    они отвечают клиентам специальным кодом ошибки, означающим, что клиент
-    должен некоторое время подождать перед повторением запроса. Именно это время
-    ожидания задаёт данный параметр.
 - name: max_etcd_attempts
   type: int
   default: 5
diff --git a/mon/mon.js b/mon/mon.js
index 6e8864c9..060d5c4a 100644
--- a/mon/mon.js
+++ b/mon/mon.js
@@ -86,13 +86,14 @@ const etcd_tree = {
             client_max_buffered_bytes: 33554432,
             client_max_buffered_ops: 1024,
             client_max_writeback_iodepth: 256,
+            client_retry_interval: 50, // ms. min: 10
+            client_eio_retry_interval: 1000, // ms
             // client and osd - configurable online
             log_level: 0,
             peer_connect_interval: 5, // seconds. min: 1
             peer_connect_timeout: 5, // seconds. min: 1
             osd_idle_timeout: 5, // seconds. min: 1
             osd_ping_timeout: 5, // seconds. min: 1
-            up_wait_retry_interval: 50, // ms. min: 10
             max_etcd_attempts: 5,
             etcd_quick_timeout: 1000, // ms
             etcd_slow_timeout: 5000, // ms
diff --git a/src/cluster_client.cpp b/src/cluster_client.cpp
index 1a72874b..8e679f4a 100644
--- a/src/cluster_client.cpp
+++ b/src/cluster_client.cpp
@@ -265,7 +265,7 @@ void cluster_client_t::erase_op(cluster_op_t *op)
     }
 }
 
-void cluster_client_t::continue_ops(bool up_retry)
+void cluster_client_t::continue_ops(int time_passed)
 {
     if (!pgs_loaded)
     {
@@ -277,22 +277,27 @@ void cluster_client_t::continue_ops(bool up_retry)
         // Attempt to reenter the function
         return;
     }
+    int reset_duration = 0;
 restart:
     continuing_ops = 1;
     for (auto op = op_queue_head; op; )
     {
         cluster_op_t *next_op = op->next;
-        if (!op->up_wait || up_retry)
+        if (op->retry_after && time_passed)
         {
-            op->up_wait = false;
-            if (!op->prev_wait)
+            op->retry_after = op->retry_after > time_passed ? op->retry_after-time_passed : 0;
+            if (op->retry_after && (!reset_duration || op->retry_after < reset_duration))
             {
-                if (op->opcode == OSD_OP_SYNC)
-                    continue_sync(op);
-                else
-                    continue_rw(op);
+                reset_duration = op->retry_after;
             }
         }
+        if (!op->retry_after && !op->prev_wait)
+        {
+            if (op->opcode == OSD_OP_SYNC)
+                continue_sync(op);
+            else
+                continue_rw(op);
+        }
         op = next_op;
         if (continuing_ops == 2)
         {
@@ -300,6 +305,27 @@ restart:
         }
     }
     continuing_ops = 0;
+    reset_retry_timer(reset_duration);
+}
+
+void cluster_client_t::reset_retry_timer(int new_duration) // (re)arm the shared retry timer; new_duration is in ms
+{
+    if ((retry_timeout_duration && retry_timeout_duration <= new_duration) || !new_duration)
+    {
+        return; // nothing to schedule (0), or the pending timer's duration is already <= the requested one
+    }
+    if (retry_timeout_id)
+    {
+        tfd->clear_timer(retry_timeout_id); // drop the longer pending timer, it is replaced below
+    }
+    retry_timeout_duration = new_duration;
+    retry_timeout_id = tfd->set_timer(retry_timeout_duration, false, [this](int)
+    {
+        int time_passed = retry_timeout_duration; // save before clearing: continue_ops() may re-arm the timer
+        retry_timeout_id = 0;
+        retry_timeout_duration = 0;
+        continue_ops(time_passed);
+    });
 }
 
 void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_config)
@@ -349,15 +375,25 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co
     {
         client_max_writeback_iodepth = DEFAULT_CLIENT_MAX_WRITEBACK_IODEPTH;
     }
-    // up_wait_retry_interval
-    up_wait_retry_interval = config["up_wait_retry_interval"].uint64_value();
-    if (!up_wait_retry_interval)
+    // client_retry_interval
+    client_retry_interval = config["client_retry_interval"].uint64_value();
+    if (!client_retry_interval)
     {
-        up_wait_retry_interval = 50;
+        client_retry_interval = 50;
     }
-    else if (up_wait_retry_interval < 10)
+    else if (client_retry_interval < 10)
     {
-        up_wait_retry_interval = 10;
+        client_retry_interval = 10;
+    }
+    // client_eio_retry_interval
+    client_eio_retry_interval = 1000;
+    if (!config["client_eio_retry_interval"].is_null())
+    {
+        client_eio_retry_interval = config["client_eio_retry_interval"].uint64_value();
+        if (client_eio_retry_interval && client_eio_retry_interval < 10)
+        {
+            client_eio_retry_interval = 10;
+        }
     }
     // log_level
     log_level = config["log_level"].uint64_value();
@@ -716,15 +752,8 @@ resume_1:
                 // We'll need to retry again
                 if (op->parts[i].flags & PART_RETRY)
                 {
-                    op->up_wait = true;
-                    if (!retry_timeout_id)
-                    {
-                        retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
-                        {
-                            retry_timeout_id = 0;
-                            continue_ops(true);
-                        });
-                    }
+                    op->retry_after = client_retry_interval;
+                    reset_retry_timer(client_retry_interval);
                 }
                 op->state = 1;
             }
@@ -780,10 +809,9 @@ resume_2:
         return 1;
     }
     else if (op->retval != 0 && !(op->flags & OP_FLUSH_BUFFER) &&
-        op->retval != -EPIPE && op->retval != -EIO && op->retval != -ENOSPC)
+        op->retval != -EPIPE && (op->retval != -EIO || !client_eio_retry_interval) && op->retval != -ENOSPC)
     {
         // Fatal error (neither -EPIPE, -EIO nor -ENOSPC)
-        // FIXME: Add a parameter to allow to not wait for EIOs (incomplete or corrupted objects) to heal
         erase_op(op);
         return 1;
     }
@@ -1171,16 +1199,12 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
         // All next things like timer, continue_sync/rw and stop_client may affect the operation again
         // So do all these things after modifying operation state, otherwise we may hit reenterability bugs
         // FIXME postpone such things to set_immediate here to avoid bugs
-        // Mark op->up_wait = true to retry operation after a short pause (not immediately)
-        op->up_wait = true;
-        if (!retry_timeout_id)
+        // Set op->retry_after to retry operation after a short pause (not immediately)
+        if (!op->retry_after)
         {
-            retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
-            {
-                retry_timeout_id = 0;
-                continue_ops(true);
-            });
+            op->retry_after = op->retval == -EIO ? client_eio_retry_interval : client_retry_interval;
         }
+        reset_retry_timer(op->retry_after);
         if (op->inflight_count == 0)
         {
             if (op->opcode == OSD_OP_SYNC)
diff --git a/src/cluster_client.h b/src/cluster_client.h
index 140f7a32..89ca4bfc 100644
--- a/src/cluster_client.h
+++ b/src/cluster_client.h
@@ -59,7 +59,7 @@ protected:
     void *buf = NULL;
     cluster_op_t *orig_op = NULL;
     bool needs_reslice = false;
-    bool up_wait = false;
+    int retry_after = 0;
     int inflight_count = 0, done_count = 0;
     std::vector<cluster_op_part_t> parts;
     void *part_bitmaps = NULL;
@@ -92,9 +92,11 @@ class cluster_client_t
     uint64_t client_max_writeback_iodepth = 0;
 
     int log_level = 0;
-    int up_wait_retry_interval = 500; // ms
+    int client_retry_interval = 50; // ms
+    int client_eio_retry_interval = 1000; // ms
 
     int retry_timeout_id = 0;
+    int retry_timeout_duration = 0;
     std::vector<cluster_op_t*> offline_ops;
     cluster_op_t *op_queue_head = NULL, *op_queue_tail = NULL;
     writeback_cache_t *wb = NULL;
@@ -131,7 +133,7 @@ public:
 
     bool get_immediate_commit(uint64_t inode);
 
-    void continue_ops(bool up_retry = false);
+    void continue_ops(int time_passed = 0);
     inode_list_t *list_inode_start(inode_t inode,
         std::function<void(inode_list_t* lst, std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)> callback);
     int list_pg_count(inode_list_t *lst);
@@ -152,6 +154,7 @@ protected:
     int continue_rw(cluster_op_t *op);
     bool check_rw(cluster_op_t *op);
     void slice_rw(cluster_op_t *op);
+    void reset_retry_timer(int new_duration);
     bool try_send(cluster_op_t *op, int i);
     int continue_sync(cluster_op_t *op);
     void send_sync(cluster_op_t *op, cluster_op_part_t *part);