Track recovery op latencies and refactor recovery stats into a structure

2023-12-09 15:36:00 +03:00 · 2023-12-09 15:36:00 +03:00 · d84dee7098
parent dcc76eee15
commit d84dee7098
5 changed files with 44 additions and 30 deletions
--- a/src/osd.cpp
+++ b/src/osd.cpp
@ -421,14 +421,6 @@ void osd_t::exec_op(osd_op_t *cur_op)
    }
 }

-void osd_t::reset_stats()
-{
-    msgr.stats = {};
-    prev_stats = {};
-    memset(recovery_stat_count, 0, sizeof(recovery_stat_count));
-    memset(recovery_stat_bytes, 0, sizeof(recovery_stat_bytes));
-}
-
 void osd_t::print_stats()
 {
    for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
@ -466,19 +458,19 @@ void osd_t::print_stats()
    }
    for (int i = 0; i < 2; i++)
    {
-        if (recovery_stat_count[0][i] != recovery_stat_count[1][i])
+        if (recovery_stat[i].count > recovery_print_prev[i].count)
        {
-            uint64_t bw = (recovery_stat_bytes[0][i] - recovery_stat_bytes[1][i]) / print_stats_interval;
+            uint64_t bw = (recovery_stat[i].bytes - recovery_print_prev[i].bytes) / print_stats_interval;
            printf(
-                "[OSD %lu] %s recovery: %.1f op/s, B/W: %.2f %s\n", osd_num, recovery_stat_names[i],
-                (recovery_stat_count[0][i] - recovery_stat_count[1][i]) * 1.0 / print_stats_interval,
+                "[OSD %lu] %s recovery: %.1f op/s, B/W: %.2f %s, avg lat %ld us\n", osd_num, recovery_stat_names[i],
+                (recovery_stat[i].count - recovery_print_prev[i].count) * 1.0 / print_stats_interval,
                (bw > 1024*1024*1024 ? bw/1024.0/1024/1024 : (bw > 1024*1024 ? bw/1024.0/1024 : bw/1024.0)),
-                (bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s"))
+                (bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s")),
+                (recovery_stat[i].usec - recovery_print_prev[i].usec) / (recovery_stat[i].count - recovery_print_prev[i].count)
            );
-            recovery_stat_count[1][i] = recovery_stat_count[0][i];
-            recovery_stat_bytes[1][i] = recovery_stat_bytes[0][i];
        }
    }
+    memcpy(recovery_print_prev, recovery_stat, sizeof(recovery_stat));
    if (corrupted_objects > 0)
    {
        printf("[OSD %lu] %lu object(s) corrupted\n", osd_num, corrupted_objects);
--- a/src/osd.h
+++ b/src/osd.h
@ -87,6 +87,11 @@ struct osd_chain_read_t

 struct osd_rmw_stripe_t;

+struct recovery_stat_t // Cumulative counters for one kind of recovery operation
+{
+    uint64_t count, usec, bytes; // completed ops, summed op latency in microseconds, summed bytes written
+};
+
 class osd_t
 {
    // config
@ -189,8 +194,8 @@ class osd_t
    std::map<uint64_t, inode_stats_t> inode_stats;
    std::map<uint64_t, timespec> vanishing_inodes;
    const char* recovery_stat_names[2] = { "degraded", "misplaced" };
-    uint64_t recovery_stat_count[2][2] = {};
-    uint64_t recovery_stat_bytes[2][2] = {};
+    recovery_stat_t recovery_stat[2] = {}; // running totals, indexed like recovery_stat_names; zero-initialized
+    recovery_stat_t recovery_print_prev[2] = {}; // copy of recovery_stat taken at the end of the previous print_stats()

    // cluster connection
    void parse_config(bool init);
@ -209,7 +214,6 @@ class osd_t
    void renew_lease(bool reload);
    void print_stats();
    void print_slow();
-    void reset_stats();
    json11::Json get_statistics();
    void report_statistics();
    void report_pg_state(pg_t & pg);
--- a/src/osd_cluster.cpp
+++ b/src/osd_cluster.cpp
@ -213,12 +213,14 @@ json11::Json osd_t::get_statistics()
    st["subop_stats"] = subop_stats;
    st["recovery_stats"] = json11::Json::object {
        { recovery_stat_names[0], json11::Json::object {
-            { "count", recovery_stat_count[0][0] },
-            { "bytes", recovery_stat_bytes[0][0] },
+            { "count", recovery_stat[0].count },
+            { "bytes", recovery_stat[0].bytes },
+            { "usec", recovery_stat[0].usec },
        } },
        { recovery_stat_names[1], json11::Json::object {
-            { "count", recovery_stat_count[0][1] },
-            { "bytes", recovery_stat_bytes[0][1] },
+            { "count", recovery_stat[1].count },
+            { "bytes", recovery_stat[1].bytes },
+            { "usec", recovery_stat[1].usec },
        } },
    };
    return st;
--- a/src/osd_primary_subops.cpp
+++ b/src/osd_primary_subops.cpp
@ -3,13 +3,15 @@

 #include "osd_primary.h"

+#define SELF_FD (-1) // peer_fd value set on ops the OSD creates for itself (e.g. autosync)
+
 void osd_t::autosync()
 {
    if (immediate_commit != IMMEDIATE_ALL && !autosync_op)
    {
        autosync_op = new osd_op_t();
        autosync_op->op_type = OSD_OP_IN;
-        autosync_op->peer_fd = -1;
+        autosync_op->peer_fd = SELF_FD;
        autosync_op->req = (osd_any_op_t){
            .sync = {
                .header = {
@ -85,9 +87,13 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
    cur_op->reply.hdr.id = cur_op->req.hdr.id;
    cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
    cur_op->reply.hdr.retval = retval;
-    if (cur_op->peer_fd == -1)
+    if (cur_op->peer_fd == SELF_FD)
    {
-        msgr.measure_exec(cur_op);
+        // Do not include internal primary writes (recovery/rebalance) into client op statistics
+        if (cur_op->req.hdr.opcode != OSD_OP_WRITE)
+        {
+            msgr.measure_exec(cur_op);
+        }
        // Copy lambda to be unaffected by `delete op`
        std::function<void(osd_op_t*)>(cur_op->callback)(cur_op);
    }
--- a/src/osd_primary_write.cpp
+++ b/src/osd_primary_write.cpp
@ -292,16 +292,26 @@ resume_7:
    {
        {
            int recovery_type = op_data->object_state->state & (OBJ_DEGRADED|OBJ_INCOMPLETE) ? 0 : 1;
-            recovery_stat_count[0][recovery_type]++;
-            if (!recovery_stat_count[0][recovery_type])
+            recovery_stat[recovery_type].count++;
+            if (!recovery_stat[recovery_type].count) // wrapped
            {
-                recovery_stat_count[0][recovery_type]++;
-                recovery_stat_bytes[0][recovery_type] = 0;
+                memset(&recovery_print_prev[recovery_type], 0, sizeof(recovery_print_prev[recovery_type]));
+                memset(&recovery_stat[recovery_type], 0, sizeof(recovery_stat[recovery_type]));
+                recovery_stat[recovery_type].count++;
            }
            for (int role = 0; role < (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size); role++)
            {
-                recovery_stat_bytes[0][recovery_type] += op_data->stripes[role].write_end - op_data->stripes[role].write_start;
+                recovery_stat[recovery_type].bytes += op_data->stripes[role].write_end - op_data->stripes[role].write_start;
            }
+            if (!cur_op->tv_end.tv_sec)
+            {
+                clock_gettime(CLOCK_REALTIME, &cur_op->tv_end);
+            }
+            uint64_t usec = (
+                (cur_op->tv_end.tv_sec - cur_op->tv_begin.tv_sec)*1000000 +
+                (cur_op->tv_end.tv_nsec - cur_op->tv_begin.tv_nsec)/1000
+            );
+            recovery_stat[recovery_type].usec += usec;
        }
        // Any kind of a non-clean object can have extra chunks, because we don't record objects
        // as degraded & misplaced or incomplete & misplaced at the same time. So try to remove extra chunks