Try to catch "data lost during self-heal"

32M journal by default in simple-offsets
Do not reserve extra space for big_writes during sync - sync itself is needed to commit and clear them
2024-02-21 19:24:36 +03:00 · 2024-02-21 15:25:02 +03:00 · 2024-02-21 13:00:14 +03:00 · 2024-02-21 01:32:06 +03:00 · 2024-02-20 19:41:48 +03:00 · 2024-02-20 19:40:56 +03:00
60 changed files with 416 additions and 796 deletions
--- a/.gitea/workflows/test.yml
+++ b/.gitea/workflows/test.yml
@@ -64,546 +64,6 @@ jobs:
    # leak sanitizer sometimes crashes
    - run: cd /root/vitastor/build && ASAN_OPTIONS=detect_leaks=0 make -j16 test

-  test_add_osd:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 10
-      run: /root/vitastor/tests/test_add_osd.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_cas:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: /root/vitastor/tests/test_cas.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_change_pg_count:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: /root/vitastor/tests/test_change_pg_count.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_change_pg_count_ec:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: SCHEME=ec /root/vitastor/tests/test_change_pg_count.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_change_pg_size:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: /root/vitastor/tests/test_change_pg_size.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_create_nomaxid:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: /root/vitastor/tests/test_create_nomaxid.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_etcd_fail:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 10
-      run: /root/vitastor/tests/test_etcd_fail.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_interrupted_rebalance:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 10
-      run: /root/vitastor/tests/test_interrupted_rebalance.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_interrupted_rebalance_imm:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 10
-      run: IMMEDIATE_COMMIT=1 /root/vitastor/tests/test_interrupted_rebalance.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_interrupted_rebalance_ec:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 10
-      run: SCHEME=ec /root/vitastor/tests/test_interrupted_rebalance.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_interrupted_rebalance_ec_imm:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 10
-      run: SCHEME=ec IMMEDIATE_COMMIT=1 /root/vitastor/tests/test_interrupted_rebalance.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_failure_domain:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: /root/vitastor/tests/test_failure_domain.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_snapshot:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: /root/vitastor/tests/test_snapshot.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_snapshot_ec:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: SCHEME=ec /root/vitastor/tests/test_snapshot.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_minsize_1:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: /root/vitastor/tests/test_minsize_1.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_move_reappear:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: /root/vitastor/tests/test_move_reappear.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_rm:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: /root/vitastor/tests/test_rm.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_snapshot_chain:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: /root/vitastor/tests/test_snapshot_chain.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_snapshot_chain_ec:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: SCHEME=ec /root/vitastor/tests/test_snapshot_chain.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_snapshot_down:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: /root/vitastor/tests/test_snapshot_down.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_snapshot_down_ec:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: SCHEME=ec /root/vitastor/tests/test_snapshot_down.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_splitbrain:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: /root/vitastor/tests/test_splitbrain.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_rebalance_verify:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 10
-      run: /root/vitastor/tests/test_rebalance_verify.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_rebalance_verify_imm:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 10
-      run: IMMEDIATE_COMMIT=1 /root/vitastor/tests/test_rebalance_verify.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_rebalance_verify_ec:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 10
-      run: SCHEME=ec /root/vitastor/tests/test_rebalance_verify.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_rebalance_verify_ec_imm:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 10
-      run: SCHEME=ec IMMEDIATE_COMMIT=1 /root/vitastor/tests/test_rebalance_verify.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_switch_primary:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: /root/vitastor/tests/test_switch_primary.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_write:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: /root/vitastor/tests/test_write.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_write_xor:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: SCHEME=xor /root/vitastor/tests/test_write.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_write_no_same:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: /root/vitastor/tests/test_write_no_same.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
  test_heal_pg_size_2:
    runs-on: ubuntu-latest
    needs: build
@@ -611,7 +71,7 @@ jobs:
    steps:
    - name: Run test
      id: test
-      timeout-minutes: 10
+      timeout-minutes: 1000
      run: PG_SIZE=2 /root/vitastor/tests/test_heal.sh
    - name: Print logs
      if: always() && steps.test.outcome == 'failure'
@@ -629,7 +89,7 @@ jobs:
    steps:
    - name: Run test
      id: test
-      timeout-minutes: 10
+      timeout-minutes: 1000
      run: SCHEME=ec /root/vitastor/tests/test_heal.sh
    - name: Print logs
      if: always() && steps.test.outcome == 'failure'
@@ -647,7 +107,7 @@ jobs:
    steps:
    - name: Run test
      id: test
-      timeout-minutes: 10
+      timeout-minutes: 1000
      run: TEST_NAME=csum_32k_dmj OSD_ARGS="--data_csum_type crc32c --csum_block_size 32k --inmemory_metadata false --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS /root/vitastor/tests/test_heal.sh
    - name: Print logs
      if: always() && steps.test.outcome == 'failure'
@@ -665,7 +125,7 @@ jobs:
    steps:
    - name: Run test
      id: test
-      timeout-minutes: 10
+      timeout-minutes: 1000
      run: TEST_NAME=csum_32k_dj  OSD_ARGS="--data_csum_type crc32c --csum_block_size 32k --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS /root/vitastor/tests/test_heal.sh
    - name: Print logs
      if: always() && steps.test.outcome == 'failure'
@@ -683,7 +143,7 @@ jobs:
    steps:
    - name: Run test
      id: test
-      timeout-minutes: 10
+      timeout-minutes: 1000
      run: TEST_NAME=csum_32k     OSD_ARGS="--data_csum_type crc32c --csum_block_size 32k" OFFSET_ARGS=$OSD_ARGS /root/vitastor/tests/test_heal.sh
    - name: Print logs
      if: always() && steps.test.outcome == 'failure'
@@ -701,7 +161,7 @@ jobs:
    steps:
    - name: Run test
      id: test
-      timeout-minutes: 10
+      timeout-minutes: 1000
      run: TEST_NAME=csum_4k_dmj  OSD_ARGS="--data_csum_type crc32c --inmemory_metadata false --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS /root/vitastor/tests/test_heal.sh
    - name: Print logs
      if: always() && steps.test.outcome == 'failure'
@@ -719,7 +179,7 @@ jobs:
    steps:
    - name: Run test
      id: test
-      timeout-minutes: 10
+      timeout-minutes: 1000
      run: TEST_NAME=csum_4k_dj   OSD_ARGS="--data_csum_type crc32c --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS /root/vitastor/tests/test_heal.sh
    - name: Print logs
      if: always() && steps.test.outcome == 'failure'
@@ -737,7 +197,7 @@ jobs:
    steps:
    - name: Run test
      id: test
-      timeout-minutes: 10
+      timeout-minutes: 1000
      run: TEST_NAME=csum_4k      OSD_ARGS="--data_csum_type crc32c" OFFSET_ARGS=$OSD_ARGS /root/vitastor/tests/test_heal.sh
    - name: Print logs
      if: always() && steps.test.outcome == 'failure'
@@ -747,112 +207,3 @@ jobs:
          cat $i
          echo ""
        done
-
-  test_scrub:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: /root/vitastor/tests/test_scrub.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_scrub_zero_osd_2:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: ZERO_OSD=2 /root/vitastor/tests/test_scrub.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_scrub_xor:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: SCHEME=xor /root/vitastor/tests/test_scrub.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_scrub_pg_size_3:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: PG_SIZE=3 /root/vitastor/tests/test_scrub.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: PG_SIZE=6 PG_MINSIZE=4 OSD_COUNT=6 SCHEME=ec /root/vitastor/tests/test_scrub.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_scrub_ec:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: SCHEME=ec /root/vitastor/tests/test_scrub.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
--- a/.gitea/workflows/tests-to-yaml.pl
+++ b/.gitea/workflows/tests-to-yaml.pl
@@ -39,6 +39,10 @@ for my $line (<>)
                $test_name .= '_'.lc($1).'_'.$2;
            }
        }
+        if ($test_name eq 'test_snapshot_chain_ec')
+        {
+            $timeout = 6;
+        }
        $line =~ s!\./test_!/root/vitastor/tests/test_!;
        # Gitea CI doesn't support artifacts yet, lol
        #- name: Upload results
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)

 project(vitastor)

-set(VERSION "1.4.2")
+set(VERSION "1.4.6")

 add_subdirectory(src)
--- a/csi/Makefile
+++ b/csi/Makefile
@@ -1,4 +1,4 @@
-VERSION ?= v1.4.2
+VERSION ?= v1.4.6

 all: build push

--- a/csi/deploy/004-csi-nodeplugin.yaml
+++ b/csi/deploy/004-csi-nodeplugin.yaml
@@ -49,7 +49,7 @@ spec:
            capabilities:
              add: ["SYS_ADMIN"]
            allowPrivilegeEscalation: true
-          image: vitalif/vitastor-csi:v1.4.2
+          image: vitalif/vitastor-csi:v1.4.6
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
--- a/csi/deploy/007-csi-provisioner.yaml
+++ b/csi/deploy/007-csi-provisioner.yaml
@@ -121,7 +121,7 @@ spec:
            privileged: true
            capabilities:
              add: ["SYS_ADMIN"]
-          image: vitalif/vitastor-csi:v1.4.2
+          image: vitalif/vitastor-csi:v1.4.6
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
--- a/csi/src/config.go
+++ b/csi/src/config.go
@@ -5,7 +5,7 @@ package vitastor

 const (
    vitastorCSIDriverName    = "csi.vitastor.io"
-    vitastorCSIDriverVersion = "1.4.2"
+    vitastorCSIDriverVersion = "1.4.6"
 )

 // Config struct fills the parameters of request or user input
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,4 +1,4 @@
-vitastor (1.4.2-1) unstable; urgency=medium
+vitastor (1.4.6-1) unstable; urgency=medium

  * Bugfixes

--- a/debian/vitastor.Dockerfile
+++ b/debian/vitastor.Dockerfile
@@ -35,8 +35,8 @@ RUN set -e -x; \
    mkdir -p /root/packages/vitastor-$REL; \
    rm -rf /root/packages/vitastor-$REL/*; \
    cd /root/packages/vitastor-$REL; \
-    cp -r /root/vitastor vitastor-1.4.2; \
-    cd vitastor-1.4.2; \
+    cp -r /root/vitastor vitastor-1.4.6; \
+    cd vitastor-1.4.6; \
    ln -s /root/fio-build/fio-*/ ./fio; \
    FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -49,8 +49,8 @@ RUN set -e -x; \
    rm -rf a b; \
    echo "dep:fio=$FIO" > debian/fio_version; \
    cd /root/packages/vitastor-$REL; \
-    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.4.2.orig.tar.xz vitastor-1.4.2; \
-    cd vitastor-1.4.2; \
+    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.4.6.orig.tar.xz vitastor-1.4.6; \
+    cd vitastor-1.4.6; \
    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
--- a/docs/config/monitor.en.md
+++ b/docs/config/monitor.en.md
@@ -19,8 +19,8 @@ These parameters only apply to Monitors.
 ## etcd_mon_ttl

 - Type: seconds
- Default: 30
- Minimum: 10
+- Default: 5
+- Minimum: 1

 Monitor etcd lease refresh interval in seconds

--- a/docs/config/monitor.ru.md
+++ b/docs/config/monitor.ru.md
@@ -19,8 +19,8 @@
 ## etcd_mon_ttl

 - Тип: секунды
- Значение по умолчанию: 30
- Минимальное значение: 10
+- Значение по умолчанию: 5
+- Минимальное значение: 1

 Интервал обновления etcd резервации (lease) монитором

--- a/docs/config/network.en.md
+++ b/docs/config/network.en.md
@@ -215,8 +215,8 @@ is scheduled.
 ## up_wait_retry_interval

 - Type: milliseconds
- Default: 500
- Minimum: 50
+- Default: 50
+- Minimum: 10
 - Can be changed online: yes

 OSDs respond to clients with a special error code when they receive I/O
--- a/docs/config/network.ru.md
+++ b/docs/config/network.ru.md
@@ -224,8 +224,8 @@ OSD в любом случае согласовывают реальное зн
 ## up_wait_retry_interval

 - Тип: миллисекунды
- Значение по умолчанию: 500
- Минимальное значение: 50
+- Значение по умолчанию: 50
+- Минимальное значение: 10
 - Можно менять на лету: да

 Когда OSD получают от клиентов запросы ввода-вывода, относящиеся к не
--- a/docs/config/osd.en.md
+++ b/docs/config/osd.en.md
@@ -59,6 +59,7 @@ them, even without restarting by updating configuration in etcd.
 - [recovery_tune_client_util_high](#recovery_tune_client_util_high)
 - [recovery_tune_agg_interval](#recovery_tune_agg_interval)
 - [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
+- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)

 ## etcd_report_interval

@@ -604,5 +605,14 @@ is usually fine.
 - Default: 10
 - Can be changed online: yes

-Minimum possible value for auto-tuned recovery_sleep_us. Values lower
-than this value are changed to 0.
+Minimum possible value for auto-tuned recovery_sleep_us. Lower values
+are changed to 0.
+
+## recovery_tune_sleep_cutoff_us
+
+- Type: microseconds
+- Default: 10000000
+- Can be changed online: yes
+
+Maximum possible value for auto-tuned recovery_sleep_us. Higher values
+are treated as outliers and ignored in aggregation.
--- a/docs/config/osd.ru.md
+++ b/docs/config/osd.ru.md
@@ -60,6 +60,7 @@
 - [recovery_tune_client_util_high](#recovery_tune_client_util_high)
 - [recovery_tune_agg_interval](#recovery_tune_agg_interval)
 - [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
+- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)

 ## etcd_report_interval

@@ -634,4 +635,14 @@ EC (кодов коррекции ошибок) с более, чем 1 диск
 - Можно менять на лету: да

 Минимальное возможное значение авто-подстроенного recovery_sleep_us.
-Значения ниже данного заменяются на 0.
+Меньшие значения заменяются на 0.
+
+## recovery_tune_sleep_cutoff_us
+
+- Тип: микросекунды
+- Значение по умолчанию: 10000000
+- Можно менять на лету: да
+
+Максимальное возможное значение авто-подстроенного recovery_sleep_us.
+Большие значения считаются случайными выбросами и игнорируются в
+усреднении.
--- a/docs/config/pool.en.md
+++ b/docs/config/pool.en.md
@@ -154,6 +154,9 @@ That is, if it becomes impossible to place PG data on at least (pg_minsize)
 OSDs, PG is deactivated for both read and write. So you know that a fresh
 write always goes to at least (pg_minsize) OSDs (disks).

+In other words, pg_size minus pg_minsize is the number of disk failures the pool can
+tolerate without a temporary outage (lasting [osd_out_time](monitor.en.md#osd_out_time)).
+
 FIXME: pg_minsize behaviour may be changed in the future to only make PGs
 read-only instead of deactivating them.

--- a/docs/config/pool.ru.md
+++ b/docs/config/pool.ru.md
@@ -157,6 +157,10 @@
 OSD, PG деактивируется на чтение и запись. Иными словами, всегда известно,
 что новые блоки данных всегда записываются как минимум на pg_minsize дисков.

+По сути, разница pg_size и pg_minsize задаёт число отказов дисков, которые пул
+может пережить без временной (на [osd_out_time](monitor.ru.md#osd_out_time))
+остановки обслуживания.
+
 FIXME: Поведение pg_minsize может быть изменено в будущем с полной деактивации
 PG на перевод их в режим только для чтения.

--- a/docs/config/src/osd.yml
+++ b/docs/config/src/osd.yml
@@ -731,8 +731,19 @@
  default: 10
  online: true
  info: |
-    Minimum possible value for auto-tuned recovery_sleep_us. Values lower
-    than this value are changed to 0.
+    Minimum possible value for auto-tuned recovery_sleep_us. Lower values
+    are changed to 0.
  info_ru: |
    Минимальное возможное значение авто-подстроенного recovery_sleep_us.
-    Значения ниже данного заменяются на 0.
+    Меньшие значения заменяются на 0.
+- name: recovery_tune_sleep_cutoff_us
+  type: us
+  default: 10000000
+  online: true
+  info: |
+    Maximum possible value for auto-tuned recovery_sleep_us. Higher values
+    are treated as outliers and ignored in aggregation.
+  info_ru: |
+    Максимальное возможное значение авто-подстроенного recovery_sleep_us.
+    Большие значения считаются случайными выбросами и игнорируются в
+    усреднении.
--- a/docs/usage/disk.en.md
+++ b/docs/usage/disk.en.md
@@ -261,7 +261,7 @@ Options (see also [Cluster-Wide Disk Layout Parameters](../config/layout-cluster
 ```
 --object_size 128k       Set blockstore block size
 --bitmap_granularity 4k  Set bitmap granularity
--journal_size 16M       Set journal size
+--journal_size 32M       Set journal size
 --data_csum_type none    Set data checksum type (crc32c or none)
 --csum_block_size 4k     Set data checksum block size
 --device_block_size 4k   Set device block size
--- a/docs/usage/disk.ru.md
+++ b/docs/usage/disk.ru.md
@@ -267,7 +267,7 @@ OSD отключены fsync-и.
 ```
 --object_size 128k       Размер блока хранилища
 --bitmap_granularity 4k  Гранулярность битовых карт
--journal_size 16M       Размер журнала
+--journal_size 32M       Размер журнала
 --data_csum_type none    Задать тип контрольных сумм (crc32c или none)
 --csum_block_size 4k     Задать размер блока расчёта контрольных сумм
 --device_block_size 4k   Размер блока устройства
--- a/mon/mon.js
+++ b/mon/mon.js
@@ -675,7 +675,12 @@ class Mon
                {
                    this.parse_kv(e.kv);
                    const key = e.kv.key.substr(this.etcd_prefix.length);
-                    if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/' || key.substr(0, 16) == '/osd/inodestats/')
+                    if (key.substr(0, 11) == '/osd/state/')
+                    {
+                        stats_changed = true;
+                        changed = true;
+                    }
+                    else if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/' || key.substr(0, 16) == '/osd/inodestats/')
                    {
                        stats_changed = true;
                    }
@@ -1635,9 +1640,13 @@ class Mon
        }
        const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
        // Sum derived values instead of deriving summed
-        for (const osd in this.state.osd.stats)
+        for (const osd in this.state.osd.state)
        {
            const derived = this.prev_stats.osd_diff[osd];
+            if (!this.state.osd.state[osd] || !derived)
+            {
+                continue;
+            }
            for (const type in sum_diff)
            {
                for (const op in derived[type]||{})
@@ -1738,9 +1747,13 @@ class Mon
            const used = this.state.pool.stats[pool_id].used_raw_tb;
            this.state.pool.stats[pool_id].used_raw_tb = Number(used)/1024/1024/1024/1024;
        }
-        for (const osd_num in this.state.osd.inodestats)
+        for (const osd_num in this.state.osd.state)
        {
            const ist = this.state.osd.inodestats[osd_num];
+            if (!ist || !this.state.osd.state[osd_num])
+            {
+                continue;
+            }
            for (const pool_id in ist)
            {
                inode_stats[pool_id] = inode_stats[pool_id] || {};
@@ -1756,9 +1769,14 @@ class Mon
                }
            }
        }
-        for (const osd in this.prev_stats.osd_diff)
+        for (const osd in this.state.osd.state)
        {
-            for (const pool_id in this.prev_stats.osd_diff[osd].inode_stats)
+            const osd_diff = this.prev_stats.osd_diff[osd];
+            if (!osd_diff || !this.state.osd.state[osd])
+            {
+                continue;
+            }
+            for (const pool_id in osd_diff.inode_stats)
            {
                for (const inode_num in this.prev_stats.osd_diff[osd].inode_stats[pool_id])
                {
--- a/mon/package.json
+++ b/mon/package.json
@@ -1,6 +1,6 @@
 {
  "name": "vitastor-mon",
-  "version": "1.4.2",
+  "version": "1.4.6",
  "description": "Vitastor SDS monitor service",
  "main": "mon-main.js",
  "scripts": {
--- a/patches/cinder-vitastor.py
+++ b/patches/cinder-vitastor.py
@@ -50,7 +50,7 @@ from cinder.volume import configuration
 from cinder.volume import driver
 from cinder.volume import volume_utils

-VERSION = '1.4.2'
+VERSION = '1.4.6'

 LOG = logging.getLogger(__name__)

--- a/rpm/build-tarball.sh
+++ b/rpm/build-tarball.sh
@@ -24,4 +24,4 @@ rm fio
 mv fio-copy fio
 FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
 perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
-tar --transform 's#^#vitastor-1.4.2/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.4.2$(rpm --eval '%dist').tar.gz *
+tar --transform 's#^#vitastor-1.4.6/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.4.6$(rpm --eval '%dist').tar.gz *
--- a/rpm/vitastor-el7.Dockerfile
+++ b/rpm/vitastor-el7.Dockerfile
@@ -36,7 +36,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-1.4.2.el7.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-1.4.6.el7.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el7.spec
+++ b/rpm/vitastor-el7.spec
@@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        1.4.2
+Version:        1.4.6
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-1.4.2.el7.tar.gz
+Source0:        vitastor-1.4.6.el7.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
--- a/rpm/vitastor-el8.Dockerfile
+++ b/rpm/vitastor-el8.Dockerfile
@@ -35,7 +35,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-1.4.2.el8.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-1.4.6.el8.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el8.spec
+++ b/rpm/vitastor-el8.spec
@@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        1.4.2
+Version:        1.4.6
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-1.4.2.el8.tar.gz
+Source0:        vitastor-1.4.6.el8.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
--- a/rpm/vitastor-el9.Dockerfile
+++ b/rpm/vitastor-el9.Dockerfile
@@ -18,7 +18,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-1.4.2.el9.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-1.4.6.el9.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el9.spec
+++ b/rpm/vitastor-el9.spec
@@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        1.4.2
+Version:        1.4.6
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-1.4.2.el9.tar.gz
+Source0:        vitastor-1.4.6.el9.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
 	set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
 endif()

-add_definitions(-DVERSION="1.4.2")
+add_definitions(-DVERSION="1.4.6")
 add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
 add_link_options(-fno-omit-frame-pointer)
 if (${WITH_ASAN})
--- a/src/blockstore_flush.cpp
+++ b/src/blockstore_flush.cpp
@@ -19,7 +19,6 @@ journal_flusher_t::journal_flusher_t(blockstore_impl_t *bs)
    syncing_flushers = 0;
    // FIXME: allow to configure flusher_start_threshold and journal_trim_interval
    flusher_start_threshold = bs->dsk.journal_block_size / sizeof(journal_entry_stable);
-    journal_trim_interval = 512;
    journal_trim_counter = bs->journal.flush_journal ? 1 : 0;
    trim_wanted = bs->journal.flush_journal ? 1 : 0;
    journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign_or_die(MEM_ALIGNMENT, bs->dsk.journal_block_size);
@@ -184,8 +183,7 @@ void journal_flusher_t::mark_trim_possible()
    if (trim_wanted > 0)
    {
        dequeuing = true;
-        if (!journal_trim_counter)
-            journal_trim_counter = journal_trim_interval;
+        journal_trim_counter = 0;
        bs->ringloop->wakeup();
    }
 }
@@ -366,9 +364,10 @@ resume_0:
        !flusher->flush_queue.size() || !flusher->dequeuing)
    {
 stop_flusher:
-        if (flusher->trim_wanted > 0 && flusher->journal_trim_counter > 0)
+        if (flusher->trim_wanted > 0 && cur.oid.inode != 0)
        {
            // Attempt forced trim
+            cur.oid = {};
            flusher->active_flushers++;
            goto trim_journal;
        }
@@ -416,6 +415,7 @@ stop_flusher:
                flusher->sync_to_repeat.erase(cur.oid);
                if (!flusher->try_find_other(dirty_end, cur))
                {
+                    cur.oid = {};
                    goto stop_flusher;
                }
            }
@@ -584,7 +584,8 @@ resume_2:
        flusher->sync_to_repeat.erase(repeat_it);
    trim_journal:
        // Clear unused part of the journal every <journal_trim_interval> flushes
-        if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval) || flusher->trim_wanted > 0)
+        if (bs->journal_trim_interval && !((++flusher->journal_trim_counter) % bs->journal_trim_interval) ||
+            flusher->trim_wanted > 0)
        {
    resume_26:
    resume_27:
@@ -1346,7 +1347,6 @@ bool journal_flusher_co::trim_journal(int wait_base)
    else if (wait_state == wait_base+2) goto resume_2;
    else if (wait_state == wait_base+3) goto resume_3;
    else if (wait_state == wait_base+4) goto resume_4;
-    flusher->journal_trim_counter = 0;
    new_trim_pos = bs->journal.get_trim_pos();
    if (new_trim_pos != bs->journal.used_start)
    {
@@ -1419,6 +1419,7 @@ bool journal_flusher_co::trim_journal(int wait_base)
                exit(0);
            }
        }
+        flusher->journal_trim_counter = 0;
        flusher->trimming = false;
    }
    return true;
--- a/src/blockstore_flush.h
+++ b/src/blockstore_flush.h
@@ -107,7 +107,7 @@ class journal_flusher_t
    blockstore_impl_t *bs;
    friend class journal_flusher_co;

-    int journal_trim_counter, journal_trim_interval;
+    int journal_trim_counter;
    bool trimming;
    void* journal_superblock;

--- a/src/blockstore_impl.cpp
+++ b/src/blockstore_impl.cpp
@@ -195,6 +195,10 @@ void blockstore_impl_t::loop()
                    // ring is full, stop submission
                    break;
                }
+                else if (PRIV(op)->wait_for == WAIT_JOURNAL)
+                {
+                    PRIV(op)->wait_detail2 = (unstable_writes.size()+unstable_unsynced);
+                }
            }
        }
        if (op_idx != new_idx)
@@ -273,7 +277,8 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
    }
    else if (PRIV(op)->wait_for == WAIT_JOURNAL)
    {
-        if (journal.used_start == PRIV(op)->wait_detail && !unstable_count_changed)
+        if (journal.used_start == PRIV(op)->wait_detail &&
+            (unstable_writes.size()+unstable_unsynced) == PRIV(op)->wait_detail2)
        {
            // do not submit
 #ifdef BLOCKSTORE_DEBUG
@@ -281,7 +286,6 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
 #endif
            return;
        }
-        unstable_count_changed = false;
        flusher->release_trim();
        PRIV(op)->wait_for = 0;
    }
@@ -353,7 +357,6 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
                    };
                }
                unstable_writes.clear();
-                unstable_count_changed = true;
                op->callback = [old_callback](blockstore_op_t *op)
                {
                    obj_ver_id *vers = (obj_ver_id*)op->buf;
--- a/src/blockstore_impl.h
+++ b/src/blockstore_impl.h
@@ -202,7 +202,7 @@ struct blockstore_op_private_t
 {
    // Wait status
    int wait_for;
-    uint64_t wait_detail;
+    uint64_t wait_detail, wait_detail2;
    int pending_ops;
    int op_state;

@@ -253,6 +253,7 @@ class blockstore_impl_t
    bool inmemory_meta = false;
    // Maximum and minimum flusher count
    unsigned max_flusher_count, min_flusher_count;
+    unsigned journal_trim_interval;
    // Maximum queue depth
    unsigned max_write_iodepth = 128;
    // Enable small (journaled) write throttling, useful for the SSD+HDD case
@@ -276,7 +277,6 @@ class blockstore_impl_t
    std::vector<blockstore_op_t*> submit_queue;
    std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
    int unsynced_big_write_count = 0, unstable_unsynced = 0;
-    bool unstable_count_changed = false;
    int unsynced_queued_ops = 0;
    allocator *data_alloc = NULL;
    uint64_t used_blocks = 0;
--- a/src/blockstore_journal.cpp
+++ b/src/blockstore_journal.cpp
@@ -103,7 +103,7 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
    if (data_after > 0)
    {
        next_pos = next_pos + data_after;
-        if (next_pos > bs->journal.len)
+        if (next_pos >= bs->journal.len)
        {
            if (right_dir)
                next_pos = bs->journal.block_size + data_after;
@@ -146,7 +146,7 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
        journal.in_sector_pos = 0;
        auto next_next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
        // double check that next_free doesn't cross used_start from the left
-        assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
+        assert(journal.next_free >= journal.used_start && next_next_free >= journal.next_free || next_next_free < journal.used_start);
        journal.next_free = next_next_free;
        memset(journal.inmemory
            ? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
--- a/src/blockstore_open.cpp
+++ b/src/blockstore_open.cpp
@@ -13,6 +13,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
        max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
    }
    min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
+    journal_trim_interval = strtoull(config["journal_trim_interval"].c_str(), NULL, 10);
    max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
    throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
    throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
@@ -31,6 +32,10 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
    {
        min_flusher_count = 1;
    }
+    if (!journal_trim_interval)
+    {
+        journal_trim_interval = 512;
+    }
    if (!max_write_iodepth)
    {
        max_write_iodepth = 128;
--- a/src/blockstore_read.cpp
+++ b/src/blockstore_read.cpp
@@ -505,7 +505,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
        for (auto & rv: PRIV(read_op)->read_vec)
        {
            if (rv.journal_sector)
-                journal.used_sectors[rv.journal_sector-1]++;
+                journal.used_sectors.at(rv.journal_sector-1)++;
        }
    }
    read_op->retval = 0;
@@ -966,7 +966,7 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
            {
                if (rv.journal_sector)
                {
-                    auto used = --journal.used_sectors[rv.journal_sector-1];
+                    auto used = --journal.used_sectors.at(rv.journal_sector-1);
                    if (used == 0)
                    {
                        journal.used_sectors.erase(rv.journal_sector-1);
--- a/src/blockstore_rollback.cpp
+++ b/src/blockstore_rollback.cpp
@@ -162,7 +162,6 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
                    unstable_writes.erase(unstab_it);
                else
                    unstab_it->second = max_unstable;
-                unstable_count_changed = true;
            }
        }
    }
@@ -216,7 +215,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
 #endif
            data_alloc->set(dirty_it->second.location >> dsk.block_order, false);
        }
-        auto used = --journal.used_sectors[dirty_it->second.journal_sector];
+        auto used = --journal.used_sectors.at(dirty_it->second.journal_sector);
 #ifdef BLOCKSTORE_DEBUG
        printf(
            "remove usage of journal offset %08lx by %lx:%lx v%lu (%lu refs)\n", dirty_it->second.journal_sector,
@@ -226,6 +225,11 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
        if (used == 0)
        {
            journal.used_sectors.erase(dirty_it->second.journal_sector);
+            if (dirty_it->second.journal_sector == journal.sector_info[journal.cur_sector].offset)
+            {
+                // Mark current sector as "full" to select the new one
+                journal.in_sector_pos = dsk.journal_block_size;
+            }
            flusher->mark_trim_possible();
        }
        free_dirty_dyn_data(dirty_it->second);
--- a/src/blockstore_stable.cpp
+++ b/src/blockstore_stable.cpp
@@ -307,35 +307,49 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
                return STAB_SPLIT_DONE;
            }
        }
-        else if (IS_IN_FLIGHT(dirty_it->second.state))
-        {
-            // Object write is still in progress. Wait until the write request completes
-            return STAB_SPLIT_WAIT;
-        }
-        else if (!IS_SYNCED(dirty_it->second.state))
-        {
-            // Object not synced yet - sync it
-            // In previous versions we returned EBUSY here and required
-            // the caller (OSD) to issue a global sync first. But a global sync
-            // waits for all writes in the queue including inflight writes. And
-            // inflight writes may themselves be blocked by unstable writes being
-            // still present in the journal and not flushed away from it.
-            // So we must sync specific objects here.
-            //
-            // Even more, we have to process "stabilize" request in parts. That is,
-            // we must stabilize all objects which are already synced. Otherwise
-            // they may block objects which are NOT synced yet.
-            return STAB_SPLIT_SYNC;
-        }
        else if (IS_STABLE(dirty_it->second.state))
        {
            // Already stable
            return STAB_SPLIT_DONE;
        }
-        else
+        while (true)
        {
-            return STAB_SPLIT_TODO;
+            if (IS_IN_FLIGHT(dirty_it->second.state))
+            {
+                // Object write is still in progress. Wait until the write request completes
+                return STAB_SPLIT_WAIT;
+            }
+            else if (!IS_SYNCED(dirty_it->second.state))
+            {
+                // Object not synced yet - sync it
+                // In previous versions we returned EBUSY here and required
+                // the caller (OSD) to issue a global sync first. But a global sync
+                // waits for all writes in the queue including inflight writes. And
+                // inflight writes may themselves be blocked by unstable writes being
+                // still present in the journal and not flushed away from it.
+                // So we must sync specific objects here.
+                //
+                // Even more, we have to process "stabilize" request in parts. That is,
+                // we must stabilize all objects which are already synced. Otherwise
+                // they may block objects which are NOT synced yet.
+                return STAB_SPLIT_SYNC;
+            }
+            else if (IS_STABLE(dirty_it->second.state))
+            {
+                break;
+            }
+            // Check previous versions too
+            if (dirty_it == dirty_db.begin())
+            {
+                break;
+            }
+            dirty_it--;
+            if (dirty_it->first.oid != ov.oid)
+            {
+                break;
+            }
        }
+        return STAB_SPLIT_TODO;
    });
    if (r != 1)
    {
@@ -537,6 +551,5 @@ void blockstore_impl_t::mark_stable(obj_ver_id v, bool forget_dirty)
        unstab_it->second <= v.version)
    {
        unstable_writes.erase(unstab_it);
-        unstable_count_changed = true;
    }
 }
--- a/src/blockstore_sync.cpp
+++ b/src/blockstore_sync.cpp
@@ -92,8 +92,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            }
        }
        else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
-            sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
-            (unstable_writes.size()+unstable_unsynced)*journal.block_size))
+            sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, 0))
        {
            return 0;
        }
@@ -116,7 +115,10 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
                journal, (dirty_entry.state & BS_ST_INSTANT) ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
                sizeof(journal_entry_big_write) + dyn_size
            );
-            dirty_entry.journal_sector = journal.sector_info[journal.cur_sector].offset;
+            auto jsec = dirty_entry.journal_sector = journal.sector_info[journal.cur_sector].offset;
+            assert(journal.next_free >= journal.used_start
+                ? (jsec >= journal.used_start && jsec < journal.next_free)
+                : (jsec >= journal.used_start || jsec < journal.next_free));
            journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
 #ifdef BLOCKSTORE_DEBUG
            printf(
--- a/src/blockstore_write.cpp
+++ b/src/blockstore_write.cpp
@@ -320,7 +320,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        blockstore_journal_check_t space_check(this);
        if (!space_check.check_available(op, unsynced_big_write_count + 1,
            sizeof(journal_entry_big_write) + dsk.clean_dyn_size,
-            (unstable_writes.size()+unstable_unsynced)*journal.block_size))
+            (unstable_writes.size()+unstable_unsynced+((dirty_it->second.state & BS_ST_INSTANT) ? 0 : 1))*journal.block_size))
        {
            return 0;
        }
@@ -412,7 +412,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
                sizeof(journal_entry_big_write) + dsk.clean_dyn_size, 0)
            || !space_check.check_available(op, 1,
                sizeof(journal_entry_small_write) + dyn_size,
-                op->len + (unstable_writes.size()+unstable_unsynced)*journal.block_size))
+                op->len + (unstable_writes.size()+unstable_unsynced+((dirty_it->second.state & BS_ST_INSTANT) ? 0 : 1))*journal.block_size))
        {
            return 0;
        }
@@ -436,7 +436,19 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
            journal, op->opcode == BS_OP_WRITE_STABLE ? JE_SMALL_WRITE_INSTANT : JE_SMALL_WRITE,
            sizeof(journal_entry_small_write) + dyn_size
        );
-        dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
+        auto jsec = dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
+        if (!(journal.next_free >= journal.used_start
+            ? (jsec >= journal.used_start && jsec < journal.next_free)
+            : (jsec >= journal.used_start || jsec < journal.next_free)))
+        {
+            printf(
+                "BUG: journal offset %08lx is used by %lx:%lx v%lu (%lu refs) BUT used_start=%lx next_free=%lx\n",
+                dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
+                journal.used_sectors[journal.sector_info[journal.cur_sector].offset],
+                journal.used_start, journal.next_free
+            );
+            abort();
+        }
        journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
 #ifdef BLOCKSTORE_DEBUG
        printf(
@@ -463,7 +475,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
            }
        }
        // double check that next_free doesn't cross used_start from the left
-        assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
+        assert(journal.next_free >= journal.used_start && next_next_free >= journal.next_free || next_next_free < journal.used_start);
        journal.next_free = next_next_free;
        je->oid = op->oid;
        je->version = op->version;
@@ -505,7 +517,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        if (next_next_free >= journal.len)
            next_next_free = dsk.journal_block_size;
        // double check that next_free doesn't cross used_start from the left
-        assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
+        assert(journal.next_free >= journal.used_start && next_next_free >= journal.next_free || next_next_free < journal.used_start);
        journal.next_free = next_next_free;
        if (!(dirty_it->second.state & BS_ST_INSTANT))
        {
@@ -549,7 +561,7 @@ resume_2:
        uint64_t dyn_size = dsk.dirty_dyn_size(op->offset, op->len);
        blockstore_journal_check_t space_check(this);
        if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
-            (unstable_writes.size()+unstable_unsynced)*journal.block_size))
+            (unstable_writes.size()+unstable_unsynced+((dirty_it->second.state & BS_ST_INSTANT) ? 0 : 1))*journal.block_size))
        {
            return 0;
        }
@@ -558,7 +570,19 @@ resume_2:
            journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
            sizeof(journal_entry_big_write) + dyn_size
        );
-        dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
+        auto jsec = dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
+        if (!(journal.next_free >= journal.used_start
+            ? (jsec >= journal.used_start && jsec < journal.next_free)
+            : (jsec >= journal.used_start || jsec < journal.next_free)))
+        {
+            printf(
+                "BUG: journal offset %08lx is used by %lx:%lx v%lu (%lu refs) BUT used_start=%lx next_free=%lx\n",
+                dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
+                journal.used_sectors[journal.sector_info[journal.cur_sector].offset],
+                journal.used_start, journal.next_free
+            );
+            abort();
+        }
        journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
 #ifdef BLOCKSTORE_DEBUG
        printf(
--- a/src/cli_status.cpp
+++ b/src/cli_status.cpp
@@ -106,7 +106,7 @@ resume_2:
            if (etcd_states[i]["error"].is_null())
            {
                etcd_alive++;
-                etcd_db_size = etcd_states[i]["dbSizeInUse"].uint64_value();
+                etcd_db_size = etcd_states[i]["dbSize"].uint64_value();
            }
        }
        int mon_count = 0;
--- a/src/disk_simple_offsets.cpp
+++ b/src/disk_simple_offsets.cpp
@@ -47,7 +47,7 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
    if (!bitmap_granularity)
        bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
    if (!journal_size)
-        journal_size = 16*1024*1024;
+        journal_size = 32*1024*1024;
    if (!device_block_size)
        device_block_size = 4096;
    if (!data_csum_type)
--- a/src/disk_tool.cpp
+++ b/src/disk_tool.cpp
@@ -167,7 +167,7 @@ static const char *help_text =
    "  Calculate offsets for old simple&stupid (no superblock) OSD deployment. Options:\n"
    "    --object_size 128k       Set blockstore block size\n"
    "    --bitmap_granularity 4k  Set bitmap granularity\n"
-    "    --journal_size 16M       Set journal size\n"
+    "    --journal_size 32M       Set journal size\n"
    "    --data_csum_type none    Set data checksum type (crc32c or none)\n"
    "    --csum_block_size 4k     Set data checksum block size\n"
    "    --device_block_size 4k   Set device block size\n"
--- a/src/disk_tool_prepare.cpp
+++ b/src/disk_tool_prepare.cpp
@@ -8,6 +8,7 @@
 int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_hdd)
 {
    static const char *allow_additional_params[] = {
+        "autosync_writes",
        "data_io",
        "meta_io",
        "journal_io",
@@ -99,12 +100,9 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
        options["disable_journal_fsync"] = options["disable_data_fsync"];
    }
    // Calculate offsets if the same device is used for two or more of data, meta, and journal
-    if (options["journal_size"] == "")
+    if (options["journal_size"] == "" && (options["journal_device"] == "" || options["journal_device"] == options["data_device"]))
    {
-        if (options["journal_device"] == "")
-            options["journal_size"] = is_hdd ? "128M" : "32M";
-        else if (is_hdd)
-            options["journal_size"] = DEFAULT_HYBRID_JOURNAL;
+        options["journal_size"] = is_hdd || !json_is_true(options["disable_data_fsync"]) ? "128M" : "32M";
    }
    bool is_hybrid = is_hdd && options["journal_device"] != "" && options["journal_device"] != options["data_device"];
    if (is_hdd)
@@ -114,6 +112,15 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
        if (is_hybrid && options["throttle_small_writes"] == "")
            options["throttle_small_writes"] = "1";
    }
+    else if (!json_is_true(options["disable_data_fsync"]))
+    {
+        if (options.find("min_flusher_count") == options.end())
+            options["min_flusher_count"] = "32";
+        if (options.find("max_flusher_count") == options.end())
+            options["max_flusher_count"] = "256";
+        if (options.find("autosync_writes") == options.end())
+            options["autosync_writes"] = "512";
+    }
    json11::Json::object sb;
    blockstore_disk_t dsk;
    try
@@ -616,6 +623,7 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
        options.erase("disable_meta_fsync");
        options.erase("disable_journal_fsync");
    }
+    auto journal_size = options["journal_size"];
    for (auto & dev: devinfo)
    {
        if (!hybrid || dev.is_hdd)
@@ -633,11 +641,13 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
                    {
                        return 1;
                    }
+                    options.erase("journal_size");
                }
                // Treat all disks as SSDs if not in the hybrid mode
                prepare_one(options, dev.is_hdd ? 1 : 0);
                if (hybrid)
                {
+                    options["journal_size"] = journal_size;
                    options.erase("journal_device");
                    options.erase("meta_device");
                }
--- a/src/messenger.cpp
+++ b/src/messenger.cpp
@@ -45,11 +45,12 @@ void osd_messenger_t::init()
 #endif
    keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
    {
-        std::vector<int> to_stop;
-        std::vector<osd_op_t*> to_ping;
-        for (auto cl_it = clients.begin(); cl_it != clients.end(); cl_it++)
+        auto cl_it = clients.begin();
+        while (cl_it != clients.end())
        {
            auto cl = cl_it->second;
+            cl_it++;
+            auto peer_fd = cl->peer_fd;
            if (!cl->osd_num || cl->peer_state != PEER_CONNECTED && cl->peer_state != PEER_RDMA)
            {
                // Do not run keepalive on regular clients
@@ -62,7 +63,9 @@ void osd_messenger_t::init()
                {
                    // Ping timed out, stop the client
                    fprintf(stderr, "Ping timed out for OSD %lu (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
-                    to_stop.push_back(cl->peer_fd);
+                    stop_client(peer_fd, true);
+                    // Restart iterator because it may be invalidated
+                    cl_it = clients.upper_bound(peer_fd);
                }
            }
            else if (cl->idle_time_remaining > 0)
@@ -100,9 +103,11 @@ void osd_messenger_t::init()
                            stop_client(fail_fd, true);
                        }
                    };
-                    to_ping.push_back(op);
                    cl->ping_time_remaining = osd_ping_timeout;
                    cl->idle_time_remaining = osd_idle_timeout;
+                    outbox_push(op);
+                    // Restart iterator because it may be invalidated
+                    cl_it = clients.upper_bound(peer_fd);
                }
            }
            else
@@ -110,15 +115,6 @@ void osd_messenger_t::init()
                cl->idle_time_remaining = osd_idle_timeout;
            }
        }
-        // Don't stop clients while a 'clients' iterator is still active
-        for (int peer_fd: to_stop)
-        {
-            stop_client(peer_fd, true);
-        }
-        for (auto op: to_ping)
-        {
-            outbox_push(op);
-        }
    });
 }

--- a/src/osd.cpp
+++ b/src/osd.cpp
@@ -233,6 +233,8 @@ void osd_t::parse_config(bool init)
        ? 10 : config["recovery_tune_agg_interval"].uint64_value();
    recovery_tune_sleep_min_us = config["recovery_tune_sleep_min_us"].is_null()
        ? 10 : config["recovery_tune_sleep_min_us"].uint64_value();
+    recovery_tune_sleep_cutoff_us = config["recovery_tune_sleep_cutoff_us"].is_null()
+        ? 10000000 : config["recovery_tune_sleep_cutoff_us"].uint64_value();
    recovery_pg_switch = config["recovery_pg_switch"].uint64_value();
    if (recovery_pg_switch < 1)
        recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
--- a/src/osd.h
+++ b/src/osd.h
@@ -125,6 +125,7 @@ class osd_t
    int recovery_tune_interval = 1;
    int recovery_tune_agg_interval = 10;
    int recovery_tune_sleep_min_us = 10;
+    int recovery_tune_sleep_cutoff_us = 10000000;
    int recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
    int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
    int inode_vanish_time = 60;
@@ -282,6 +283,7 @@ class osd_t
    void exec_sync_stab_all(osd_op_t *cur_op);
    void exec_show_config(osd_op_t *cur_op);
    void exec_secondary(osd_op_t *cur_op);
+    void exec_secondary_real(osd_op_t *cur_op);
    void secondary_op_callback(osd_op_t *cur_op);

    // primary ops
--- a/src/osd_cluster.cpp
+++ b/src/osd_cluster.cpp
@@ -843,7 +843,13 @@ void osd_t::report_pg_states()
                    pg_state_exists = true;
                    if (pg.state == PG_OFFLINE && pg_it->second.cur_primary != this->osd_num)
                    {
-                        // Nothing to check or report, PG is already taken over by another OSD
+                        // Nothing to report, PG is already taken over by another OSD
+                        checks.push_back(json11::Json::object {
+                            { "target", "MOD" },
+                            { "key", state_key_base64 },
+                            { "result", "LESS" },
+                            { "mod_revision", st_cli.etcd_watch_revision+1 },
+                        });
                        continue;
                    }
                }
@@ -851,11 +857,6 @@ void osd_t::report_pg_states()
        }
        if (!pg_state_exists)
        {
-            if (pg.state == PG_OFFLINE)
-            {
-                // Nothing to check or report, PG is already stopped
-                continue;
-            }
            // Check that the PG key does not exist
            // Failed check indicates an unsuccessful PG lock attempt in this case
            checks.push_back(json11::Json::object {
--- a/src/osd_flush.cpp
+++ b/src/osd_flush.cpp
@@ -422,6 +422,10 @@ void osd_t::tune_recovery()
    rtune_avg_lat = total_recovery_usec/recovery_count;
    uint64_t target_lat = rtune_avg_lat * rtune_avg_lat/1000000.0 * recovery_count/recovery_tune_interval / rtune_target_util;
    auto sleep_us = target_lat > rtune_avg_lat+recovery_tune_sleep_min_us ? target_lat-rtune_avg_lat : 0;
+    if (sleep_us > recovery_tune_sleep_cutoff_us)
+    {
+        return;
+    }
    if (recovery_target_sleep_items.size() != recovery_tune_agg_interval)
    {
        recovery_target_sleep_items.resize(recovery_tune_agg_interval);
@@ -438,7 +442,7 @@ void osd_t::tune_recovery()
    if (recovery_target_sleep_count < recovery_tune_agg_interval)
        recovery_target_sleep_count++;
    recovery_target_sleep_us = recovery_target_sleep_total / recovery_target_sleep_count;
-    if (log_level > 4)
+    if (log_level > 1)
    {
        printf(
            "[OSD %lu] auto-tune: client util: %.2f, recovery util: %.2f, lat: %lu us -> target util %.2f, delay %lu us\n",
--- a/src/osd_peering.cpp
+++ b/src/osd_peering.cpp
@@ -222,6 +222,9 @@ void osd_t::start_pg_peering(pg_t & pg)
    }
    if (pg.pg_cursize < pg.pg_minsize)
    {
+        // FIXME: Incomplete EC PGs may currently easily lead to write hangs ("slow ops" in OSD logs)
+        // because such PGs don't flush unstable entries on secondary OSDs so they can't remove these
+        // entries from their journals...
        pg.state = PG_INCOMPLETE;
        report_pg_state(pg);
        return;
--- a/src/osd_rmw.cpp
+++ b/src/osd_rmw.cpp
@@ -861,15 +861,15 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
 static void calc_rmw_parity_copy_parity(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
    uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t start, uint32_t end)
 {
-    if (write_osd_set != read_osd_set)
+    if (write_osd_set != read_osd_set && end != 0)
    {
        for (int role = pg_minsize; role < pg_size; role++)
        {
-            if (write_osd_set[role] != read_osd_set[role] && (start != 0 || end != chunk_size))
+            if (write_osd_set[role] != read_osd_set[role] && write_osd_set[role] != 0 && (start != 0 || end != chunk_size))
            {
                // Copy new parity into the read buffer to write it back
                memcpy(
-                    (uint8_t*)stripes[role].read_buf + start,
+                    (uint8_t*)stripes[role].read_buf + start - stripes[role].read_start,
                    stripes[role].write_buf,
                    end - start
                );
--- a/src/osd_rmw_test.cpp
+++ b/src/osd_rmw_test.cpp
@@ -30,6 +30,7 @@ void test16();
 void test_recover_22_d2();
 void test_ec43_error_bruteforce();
 void test_recover_53_d5();
+void test_recover_22();

 int main(int narg, char *args[])
 {
@@ -70,6 +71,8 @@ int main(int narg, char *args[])
    test_ec43_error_bruteforce();
    // Test 19
    test_recover_53_d5();
+    // Test 20
+    test_recover_22();
    // End
    printf("all ok\n");
    return 0;
@@ -1244,3 +1247,108 @@ void test_recover_53_d5()
    // Done
    use_ec(8, 5, false);
 }
+
+// Test 20: EC 2+2 partial overwrite where the write OSD set differs from the read set.
+// Data role 0 is read from OSD 1 but written to OSD 5, roles 1 and 3 have no OSD (0)
+// in the write set, and parity role 2 stays on OSD 3. The test runs split -> calc_rmw ->
+// encode for a 4 KiB write at offset 120 KiB, then reconstructs role 1 from role 0 and
+// the parity contents expected after that write.
+void test_recover_22()
+{
+    // Bitmap size in bytes: one bit per 4 KiB block of a 128 KiB chunk
+    const int bmp = 128*1024 / 4096 / 8;
+    use_ec(4, 2, true);
+    osd_num_t osd_set[4] = { 1, 2, 3, 4 };
+    osd_num_t write_osd_set[4] = { 5, 0, 3, 0 };
+    osd_rmw_stripe_t stripes[4] = {};
+    unsigned bitmaps[4] = { 0 };
+    // Split a 4 KiB write at offset 120 KiB: only data role 0 is requested
+    void *write_buf = (uint8_t*)malloc_or_die(4096);
+    set_pattern(write_buf, 4096, PATTERN0);
+    split_stripes(2, 128*1024, 120*1024, 4096, stripes);
+    assert(stripes[0].req_start == 120*1024 && stripes[0].req_end == 124*1024);
+    assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
+    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
+    assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
+    // Plan the read-modify-write with osd_set as the read set and write_osd_set as the write set
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 2, write_osd_set, 128*1024, bmp);
+    for (int i = 0; i < 4; i++)
+        stripes[i].bmp_buf = bitmaps+i;
+    assert(rmw_buf);
+    // Expected reads: all of role 0 (it is rewritten in full below) and 4 KiB of role 1
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
+    assert(stripes[1].read_start == 120*1024 && stripes[1].read_end == 124*1024);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
+    assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
+    // Expected writes at this stage: 4 KiB of new data to role 0 and the matching 4 KiB of parity role 2
+    assert(stripes[0].write_start == 120*1024 && stripes[0].write_end == 124*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
+    assert(stripes[2].write_start == 120*1024 && stripes[2].write_end == 124*1024);
+    assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
+    // Expected rmw_buf layout: [4 KiB parity write][128 KiB role 0 read][4 KiB role 1 read]
+    assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
+    assert(stripes[1].read_buf == (uint8_t*)rmw_buf+132*1024);
+    assert(stripes[2].read_buf == NULL);
+    assert(stripes[3].read_buf == NULL);
+    assert(stripes[0].write_buf == write_buf);
+    assert(stripes[1].write_buf == NULL);
+    assert(stripes[2].write_buf == (uint8_t*)rmw_buf);
+    assert(stripes[3].write_buf == NULL);
+    // Encode: fill the read buffers, compute parity, then check that role 0 became a full-chunk write
+    set_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
+    set_pattern(stripes[1].read_buf, 4*1024, PATTERN2);
+    memset(stripes[0].bmp_buf, 0xff, bmp);
+    memset(stripes[1].bmp_buf, 0xff, bmp);
+    calc_rmw_parity_ec(stripes, 4, 2, osd_set, write_osd_set, 128*1024, bmp);
+    assert(*(uint32_t*)stripes[2].bmp_buf == 0);
+    assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
+    assert(stripes[2].write_start == 120*1024 && stripes[2].write_end == 124*1024);
+    assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
+    assert(stripes[0].write_buf == stripes[0].read_buf);
+    assert(stripes[1].write_buf == NULL);
+    assert(stripes[2].write_buf == (uint8_t*)rmw_buf);
+    assert(stripes[3].write_buf == NULL);
+    check_pattern(stripes[2].write_buf, 4*1024, PATTERN0^PATTERN2);
+    // Decode and verify: read both data chunks with role 1 missing, so parity role 2 must be read too
+    memset(stripes, 0, sizeof(stripes));
+    split_stripes(2, 128*1024, 0, 256*1024, stripes);
+    assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024);
+    assert(stripes[1].req_start == 0 && stripes[1].req_end == 128*1024);
+    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
+    assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
+    for (int role = 0; role < 4; role++)
+    {
+        stripes[role].read_start = stripes[role].req_start;
+        stripes[role].read_end = stripes[role].req_end;
+    }
+    assert(extend_missing_stripes(stripes, write_osd_set, 2, 4) == 0);
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
+    assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
+    assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
+    void *read_buf = alloc_read_buffer(stripes, 4, 0);
+    for (int i = 0; i < 4; i++)
+        stripes[i].bmp_buf = bitmaps+i;
+    assert(read_buf);
+    assert(stripes[0].read_buf == read_buf);
+    assert(stripes[1].read_buf == (uint8_t*)read_buf+128*1024);
+    assert(stripes[2].read_buf == (uint8_t*)read_buf+2*128*1024);
+    set_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
+    set_pattern((uint8_t*)stripes[0].read_buf+120*1024, 4*1024, PATTERN0);
+    set_pattern(stripes[2].read_buf, 128*1024, PATTERN1^PATTERN2);
+    set_pattern((uint8_t*)stripes[2].read_buf+120*1024, 4*1024, PATTERN0^PATTERN2);
+    memset(stripes[0].bmp_buf, 0xff, bmp);
+    memset(stripes[2].bmp_buf, 0, bmp);
+    bitmaps[1] = 0;
+    bitmaps[3] = 0;
+    reconstruct_stripes_ec(stripes, 4, 2, bmp);
+    assert(bitmaps[0] == 0xFFFFFFFF);
+    assert(*(uint32_t*)stripes[1].bmp_buf == 0xFFFFFFFF);
+    check_pattern(stripes[1].read_buf, 128*1024, PATTERN2);
+    free(read_buf);
+    // Done
+    free(rmw_buf);
+    free(write_buf);
+    use_ec(4, 2, false);
+}
--- a/src/osd_secondary.cpp
+++ b/src/osd_secondary.cpp
@@ -42,8 +42,10 @@ void osd_t::secondary_op_callback(osd_op_t *op)
    int retval = op->bs_op->retval;
    delete op->bs_op;
    op->bs_op = NULL;
-    if (op->is_recovery_related() && recovery_target_sleep_us)
+    if (op->is_recovery_related() && recovery_target_sleep_us &&
+        op->req.hdr.opcode == OSD_OP_SEC_STABILIZE)
    {
+        // Apply pause AFTER commit. Do not apply pause to SYNC at all
        if (!op->tv_end.tv_sec)
        {
            clock_gettime(CLOCK_REALTIME, &op->tv_end);
@@ -59,7 +61,28 @@ void osd_t::secondary_op_callback(osd_op_t *op)
    }
 }

-void osd_t::exec_secondary(osd_op_t *cur_op)
+// Entry point for secondary operations. Recovery-related operations other than
+// STABILIZE and SYNC are postponed by recovery_target_sleep_us when that delay is
+// non-zero; every other operation is handed to exec_secondary_real() right away.
+void osd_t::exec_secondary(osd_op_t *op)
+{
+    const bool throttled = op->is_recovery_related() && recovery_target_sleep_us &&
+        op->req.hdr.opcode != OSD_OP_SEC_STABILIZE && op->req.hdr.opcode != OSD_OP_SEC_SYNC;
+    if (!throttled)
+    {
+        exec_secondary_real(op);
+        return;
+    }
+    // Apply pause BEFORE write/delete
+    tfd->set_timer_us(recovery_target_sleep_us, false, [this, op](int)
+    {
+        // Reset tv_begin to the current time once the pause is over
+        clock_gettime(CLOCK_REALTIME, &op->tv_begin);
+        exec_secondary_real(op);
+    });
+}
+
+void osd_t::exec_secondary_real(osd_op_t *cur_op)
 {
    if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
    {
--- a/src/vitastor.pc.in
+++ b/src/vitastor.pc.in
@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@

 Name: Vitastor
 Description: Vitastor client library
-Version: 1.4.2
+Version: 1.4.6
 Libs: -L${libdir} -lvitastor_client
 Cflags: -I${includedir}

--- a/tests/run_3osds.sh
+++ b/tests/run_3osds.sh
@@ -22,7 +22,7 @@ if [ "$IMMEDIATE_COMMIT" != "" ]; then
    NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 10 --etcd_stats_interval 5"
    $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"recovery_tune_util_low":1,"immediate_commit":"all","client_enable_writeback":true,"client_max_writeback_iodepth":32'$GLOBAL_CONFIG'}'
 else
-    NO_SAME="--journal_sector_buffer_count 1024 --log_level 10 --etcd_stats_interval 5"
+    NO_SAME="--journal_sector_buffer_count 1024 --log_level 10 --etcd_stats_interval 5 --min_flusher_count 16"
    $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"recovery_tune_util_low":1,"client_enable_writeback":true,"client_max_writeback_iodepth":32'$GLOBAL_CONFIG'}'
 fi

--- a/tests/test_heal.sh
+++ b/tests/test_heal.sh
@@ -30,14 +30,16 @@ kill_osds()
    kill -9 $OSD1_PID
    $ETCDCTL del /vitastor/osd/state/1

-    for i in $(seq 2 $OSD_COUNT); do
+    for kill_osd in $(seq 2 $OSD_COUNT); do
        sleep 15
-        echo Killing OSD $i and starting OSD $((i-1))
-        p=OSD${i}_PID
+        # Wait for all PGs to clear has_degraded - all data will be at least in 2 copies
+        wait_condition 600 "$ETCDCTL get /vitastor/pg/state/1/ --prefix --print-value-only |\
+            jq -s -e '[ .[] | select(.state | contains(["'"'"active"'"'"])) | select(.state | contains(["'"'"has_degraded"'"'"]) | not) ] | length == '$PG_COUNT"
+        echo Killing OSD $kill_osd and starting OSD $((kill_osd-1))
+        p=OSD${kill_osd}_PID
        kill -9 ${!p}
-        $ETCDCTL del /vitastor/osd/state/$i
-        start_osd $((i-1))
-        sleep 15
+        $ETCDCTL del /vitastor/osd/state/$kill_osd
+        start_osd $((kill_osd-1))
    done

    sleep 5
@@ -58,6 +60,11 @@ qemu-img convert -S 4096 -p \
    -O raw ./testdata/read.bin

 if ! diff -q ./testdata/read.bin ./testdata/mirror.bin; then
+    # Debugging aid: set HEAL_HOLD_ON_DATA_LOSS to keep the test environment alive for
+    # inspection. Off by default so that automated runs report the error instead of timing out.
+    if [ -n "${HEAL_HOLD_ON_DATA_LOSS:-}" ]; then
+        sleep 100000
+    fi
    format_error Data lost during self-heal
 fi

--- a/tests/test_scrub.sh
+++ b/tests/test_scrub.sh
@@ -44,7 +44,7 @@ wait_condition 10 "$ETCDCTL"$' get --print-value-only /vitastor/config/pgs | jq
 $ETCDCTL put /vitastor/pg/history/1/1 `$ETCDCTL get --print-value-only /vitastor/pg/history/1/1 | jq -s -c '(.[0] // {}) + {"next_scrub":1}'`

 # Wait for scrub to finish
-wait_condition 60 "$ETCDCTL get --prefix /vitastor/pg/history/ --print-value-only | jq -s -e '([ .[] | select(.next_scrub == 0 or .next_scrub == null) ] | length) == $PG_COUNT'" Scrubbing
+wait_condition 300 "$ETCDCTL get --prefix /vitastor/pg/history/ --print-value-only | jq -s -e '([ .[] | select(.next_scrub == 0 or .next_scrub == null) ] | length) == $PG_COUNT'" Scrubbing

 if [[ ($SCHEME = replicated && $PG_SIZE < 3) || ($SCHEME != replicated && $((PG_SIZE-PG_DATA_SIZE)) < 2) ]]; then
    # Check that objects are marked as inconsistent if 2 replicas or EC/XOR 2+1
@@ -56,7 +56,7 @@ if [[ ($SCHEME = replicated && $PG_SIZE < 3) || ($SCHEME != replicated && $((PG_
        build/src/vitastor-cli fix --etcd_address $ETCD_URL --bad_osds $ZERO_OSD
 elif [[ ($SCHEME = replicated && $PG_SIZE > 2) || ($SCHEME != replicated && $((PG_SIZE-PG_DATA_SIZE)) > 1) ]]; then
    # Check that everything heals
-    wait_finish_rebalance 60
+    wait_finish_rebalance 300

    build/src/vitastor-cli describe --etcd_address $ETCD_URL --json | jq -e '. | length == 0'
 fi
--- a/tests/test_write.sh
+++ b/tests/test_write.sh
@@ -6,21 +6,37 @@ check_qemu
 #LD_PRELOAD=libasan.so.5 \
 #    fio -thread -name=test -ioengine=build/src/libfio_vitastor_sec.so -bs=4k -fsync=128 `$ETCDCTL get /vitastor/osd/state/1 --print-value-only | jq -r '"-host="+.addresses[0]+" -port="+(.port|tostring)'` -rw=write -size=32M

+# Small sequential writes were causing various bugs at different moments
+
+echo Small sequential writes
+
+LD_PRELOAD="build/src/libfio_vitastor.so" \
+    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -numjobs=1 -iodepth=16 \
+        -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -runtime=10
+
 # Random writes without immediate_commit were stalling OSDs

+echo 68k random writes
+
 LD_PRELOAD="build/src/libfio_vitastor.so" \
    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=68k -direct=1 -numjobs=16 -iodepth=4 \
        -rw=randwrite -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -runtime=10

 # A lot of parallel syncs was crashing the primary OSD at some point

+echo T64Q1 writes with fsync
+
 LD_PRELOAD="build/src/libfio_vitastor.so" \
    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -numjobs=64 -iodepth=1 -fsync=1 \
        -rw=randwrite -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -number_ios=100

+echo Linear write
+
 LD_PRELOAD="build/src/libfio_vitastor.so" \
    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -fsync=1 -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -cluster_log_level=10

+echo T1Q1 writes with fsync=32
+
 LD_PRELOAD="build/src/libfio_vitastor.so" \
    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=1 -fsync=32 -buffer_pattern=0xdeadface \
        -rw=randwrite -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -number_ios=1024