Compare commits

...

10 Commits

Author SHA1 Message Date
c777a0041a Release 1.4.4
All checks were successful
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m23s
Test / test_move_reappear (push) Successful in 21s
Test / test_rm (push) Successful in 16s
Test / test_snapshot_down (push) Successful in 30s
Test / test_snapshot_down_ec (push) Successful in 30s
Test / test_splitbrain (push) Successful in 25s
Test / test_snapshot_chain (push) Successful in 2m18s
Test / test_snapshot_chain_ec (push) Successful in 3m13s
Test / test_rebalance_verify_imm (push) Successful in 3m8s
Test / test_rebalance_verify (push) Successful in 3m41s
Test / test_switch_primary (push) Successful in 36s
Test / test_write (push) Successful in 40s
Test / test_write_no_same (push) Successful in 16s
Test / test_write_xor (push) Successful in 39s
Test / test_rebalance_verify_ec (push) Successful in 4m56s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m21s
Test / test_heal_pg_size_2 (push) Successful in 4m15s
Test / test_heal_ec (push) Successful in 5m1s
Test / test_heal_csum_32k_dj (push) Successful in 5m32s
Test / test_heal_csum_32k (push) Successful in 5m38s
Test / test_heal_csum_4k_dmj (push) Successful in 5m43s
Test / test_scrub (push) Successful in 1m31s
Test / test_scrub_zero_osd_2 (push) Successful in 1m17s
Test / test_heal_csum_4k_dj (push) Successful in 5m57s
Test / test_scrub_xor (push) Successful in 30s
Test / test_scrub_pg_size_3 (push) Successful in 1m7s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 41s
Test / test_scrub_ec (push) Successful in 24s
Test / test_heal_csum_32k_dmj (push) Successful in 3m56s
Test / test_heal_csum_4k (push) Successful in 3m16s
A couple of fixes for EC pools

- Fix a segfault possible on partial EC overwrite in 1234 -> 5030 rebalance scenario
- Fix two problems leading to EC pools stalling on rebalance & parallel sudden stops
  of OSDs, for example during a sudden poweroff of a host:
  - Recovery auto-tuning (1.4.0 feature) could apply too large delays and stall
    the EC journal - fixed by limiting delays with a new recovery_tune_sleep_cutoff_us
    parameter (10 seconds by default) and applying recovery pauses before write
    operations, not after them, to not occupy space in the journal for long time
  - Dynamic journal space reservation (1.3.0 feature) wasn't accounting new writes
    when checking the limit so OSDs could still fill the journal fully and stall -
    fixed by including new writes into the limit
- Print etcd dbSize instead of dbSizeInUse in status
2024-02-11 16:23:08 +03:00
2947ea93e8 Raise test_snapshot_chain_ec timeout to 6 minutes 2024-02-11 16:13:52 +03:00
978bdc128a Apply recovery pause before writes, after commits, and do not apply it to syncs to not block EC pools from functioning 2024-02-11 16:13:52 +03:00
bb2f395f1e Add cutoff threshold for recovery auto-tuning 2024-02-11 16:13:52 +03:00
b127da40f7 Add a FIXME about incomplete PGs 2024-02-11 13:42:51 +03:00
ca34a6047a Fix dynamic journal space reservation: include the new write itself, too 2024-02-11 13:42:51 +03:00
38ba76e893 Fix flusher sometimes being unable to trim journal when the flush queue is empty 2024-02-11 13:42:51 +03:00
1e3c4edea0 Print etcd dbSize instead of dbSizeInUse in status 2024-02-11 13:42:51 +03:00
e7ac855b07 Fix that EC segfault (1234 -> 5030 partial overwrite) 2024-02-11 13:42:51 +03:00
c53357ac45 Add a test for EC segfault with partial overwrite in 1234 -> 5030 rebalance scenario 2024-02-11 13:42:51 +03:00
38 changed files with 221 additions and 56 deletions

View File

@@ -395,7 +395,7 @@ jobs:
steps:
- name: Run test
id: test
timeout-minutes: 3
timeout-minutes: 6
run: SCHEME=ec /root/vitastor/tests/test_snapshot_chain.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'

View File

@@ -39,6 +39,10 @@ for my $line (<>)
$test_name .= '_'.lc($1).'_'.$2;
}
}
if ($test_name eq 'test_snapshot_chain_ec')
{
$timeout = 6;
}
$line =~ s!\./test_!/root/vitastor/tests/test_!;
# Gitea CI doesn't support artifacts yet, lol
#- name: Upload results

View File

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
project(vitastor)
set(VERSION "1.4.3")
set(VERSION "1.4.4")
add_subdirectory(src)

View File

@@ -1,4 +1,4 @@
VERSION ?= v1.4.3
VERSION ?= v1.4.4
all: build push

View File

@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v1.4.3
image: vitalif/vitastor-csi:v1.4.4
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -121,7 +121,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v1.4.3
image: vitalif/vitastor-csi:v1.4.4
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "1.4.3"
vitastorCSIDriverVersion = "1.4.4"
)
// Config struct fills the parameters of request or user input

2
debian/changelog vendored
View File

@@ -1,4 +1,4 @@
vitastor (1.4.3-1) unstable; urgency=medium
vitastor (1.4.4-1) unstable; urgency=medium
* Bugfixes

View File

@@ -35,8 +35,8 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-1.4.3; \
cd vitastor-1.4.3; \
cp -r /root/vitastor vitastor-1.4.4; \
cd vitastor-1.4.4; \
ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -49,8 +49,8 @@ RUN set -e -x; \
rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.4.3.orig.tar.xz vitastor-1.4.3; \
cd vitastor-1.4.3; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.4.4.orig.tar.xz vitastor-1.4.4; \
cd vitastor-1.4.4; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

View File

@@ -19,8 +19,8 @@ These parameters only apply to Monitors.
## etcd_mon_ttl
- Type: seconds
- Default: 30
- Minimum: 10
- Default: 1
- Minimum: 5
Monitor etcd lease refresh interval in seconds

View File

@@ -19,8 +19,8 @@
## etcd_mon_ttl
- Тип: секунды
- Значение по умолчанию: 30
- Минимальное значение: 10
- Значение по умолчанию: 1
- Минимальное значение: 5
Интервал обновления etcd резервации (lease) монитором

View File

@@ -215,8 +215,8 @@ is scheduled.
## up_wait_retry_interval
- Type: milliseconds
- Default: 500
- Minimum: 50
- Default: 50
- Minimum: 10
- Can be changed online: yes
OSDs respond to clients with a special error code when they receive I/O

View File

@@ -224,8 +224,8 @@ OSD в любом случае согласовывают реальное зн
## up_wait_retry_interval
- Тип: миллисекунды
- Значение по умолчанию: 500
- Минимальное значение: 50
- Значение по умолчанию: 50
- Минимальное значение: 10
- Можно менять на лету: да
Когда OSD получают от клиентов запросы ввода-вывода, относящиеся к не

View File

@@ -59,6 +59,7 @@ them, even without restarting by updating configuration in etcd.
- [recovery_tune_client_util_high](#recovery_tune_client_util_high)
- [recovery_tune_agg_interval](#recovery_tune_agg_interval)
- [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)
## etcd_report_interval
@@ -604,5 +605,14 @@ is usually fine.
- Default: 10
- Can be changed online: yes
Minimum possible value for auto-tuned recovery_sleep_us. Values lower
than this value are changed to 0.
Minimum possible value for auto-tuned recovery_sleep_us. Lower values
are changed to 0.
## recovery_tune_sleep_cutoff_us
- Type: microseconds
- Default: 10000000
- Can be changed online: yes
Maximum possible value for auto-tuned recovery_sleep_us. Higher values
are treated as outliers and ignored in aggregation.

View File

@@ -60,6 +60,7 @@
- [recovery_tune_client_util_high](#recovery_tune_client_util_high)
- [recovery_tune_agg_interval](#recovery_tune_agg_interval)
- [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)
## etcd_report_interval
@@ -634,4 +635,14 @@ EC (кодов коррекции ошибок) с более, чем 1 диск
- Можно менять на лету: да
Минимальное возможное значение авто-подстроенного recovery_sleep_us.
Значения ниже данного заменяются на 0.
Меньшие значения заменяются на 0.
## recovery_tune_sleep_cutoff_us
- Тип: микросекунды
- Значение по умолчанию: 10000000
- Можно менять на лету: да
Максимальное возможное значение авто-подстроенного recovery_sleep_us.
Большие значения считаются случайными выбросами и игнорируются в
усреднении.

View File

@@ -731,8 +731,19 @@
default: 10
online: true
info: |
Minimum possible value for auto-tuned recovery_sleep_us. Values lower
than this value are changed to 0.
Minimum possible value for auto-tuned recovery_sleep_us. Lower values
are changed to 0.
info_ru: |
Минимальное возможное значение авто-подстроенного recovery_sleep_us.
Значения ниже данного заменяются на 0.
Меньшие значения заменяются на 0.
- name: recovery_tune_sleep_cutoff_us
type: us
default: 10000000
online: true
info: |
Maximum possible value for auto-tuned recovery_sleep_us. Higher values
are treated as outliers and ignored in aggregation.
info_ru: |
Максимальное возможное значение авто-подстроенного recovery_sleep_us.
Большие значения считаются случайными выбросами и игнорируются в
усреднении.

View File

@@ -1,6 +1,6 @@
{
"name": "vitastor-mon",
"version": "1.4.3",
"version": "1.4.4",
"description": "Vitastor SDS monitor service",
"main": "mon-main.js",
"scripts": {

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VERSION = '1.4.3'
VERSION = '1.4.4'
LOG = logging.getLogger(__name__)

View File

@@ -24,4 +24,4 @@ rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-1.4.3/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.4.3$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-1.4.4/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.4.4$(rpm --eval '%dist').tar.gz *

View File

@@ -36,7 +36,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-1.4.3.el7.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.4.4.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 1.4.3
Version: 1.4.4
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.4.3.el7.tar.gz
Source0: vitastor-1.4.4.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-1.4.3.el8.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.4.4.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 1.4.3
Version: 1.4.4
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.4.3.el8.tar.gz
Source0: vitastor-1.4.4.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -18,7 +18,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-1.4.3.el9.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.4.4.el9.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 1.4.3
Version: 1.4.4
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.4.3.el9.tar.gz
Source0: vitastor-1.4.4.el9.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVERSION="1.4.3")
add_definitions(-DVERSION="1.4.4")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
add_link_options(-fno-omit-frame-pointer)
if (${WITH_ASAN})

View File

@@ -184,8 +184,7 @@ void journal_flusher_t::mark_trim_possible()
if (trim_wanted > 0)
{
dequeuing = true;
if (!journal_trim_counter)
journal_trim_counter = journal_trim_interval;
journal_trim_counter = 0;
bs->ringloop->wakeup();
}
}
@@ -366,7 +365,7 @@ resume_0:
!flusher->flush_queue.size() || !flusher->dequeuing)
{
stop_flusher:
if (flusher->trim_wanted > 0 && flusher->journal_trim_counter > 0)
if (flusher->trim_wanted > 0 && !flusher->journal_trim_counter)
{
// Attempt forced trim
flusher->active_flushers++;
@@ -1346,7 +1345,6 @@ bool journal_flusher_co::trim_journal(int wait_base)
else if (wait_state == wait_base+2) goto resume_2;
else if (wait_state == wait_base+3) goto resume_3;
else if (wait_state == wait_base+4) goto resume_4;
flusher->journal_trim_counter = 0;
new_trim_pos = bs->journal.get_trim_pos();
if (new_trim_pos != bs->journal.used_start)
{
@@ -1419,6 +1417,7 @@ bool journal_flusher_co::trim_journal(int wait_base)
exit(0);
}
}
flusher->journal_trim_counter = 0;
flusher->trimming = false;
}
return true;

View File

@@ -76,6 +76,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
// 2nd step: Data device is synced, prepare & write journal entries
// Check space in the journal and journal memory buffers
blockstore_journal_check_t space_check(this);
auto reservation = (unstable_writes.size()+unstable_unsynced+PRIV(op)->sync_big_writes.size())*journal.block_size;
if (dsk.csum_block_size)
{
// More complex check because all journal entries have different lengths
@@ -85,15 +86,14 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
left--;
auto & dirty_entry = dirty_db.at(sbw);
uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size, 0))
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size, left ? 0 : reservation))
{
return 0;
}
}
}
else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, reservation))
{
return 0;
}

View File

@@ -320,7 +320,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, unsynced_big_write_count + 1,
sizeof(journal_entry_big_write) + dsk.clean_dyn_size,
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
(unstable_writes.size()+unstable_unsynced+((dirty_it->second.state & BS_ST_INSTANT) ? 0 : 1))*journal.block_size))
{
return 0;
}
@@ -412,7 +412,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
sizeof(journal_entry_big_write) + dsk.clean_dyn_size, 0)
|| !space_check.check_available(op, 1,
sizeof(journal_entry_small_write) + dyn_size,
op->len + (unstable_writes.size()+unstable_unsynced)*journal.block_size))
op->len + (unstable_writes.size()+unstable_unsynced+((dirty_it->second.state & BS_ST_INSTANT) ? 0 : 1))*journal.block_size))
{
return 0;
}
@@ -549,7 +549,7 @@ resume_2:
uint64_t dyn_size = dsk.dirty_dyn_size(op->offset, op->len);
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
(unstable_writes.size()+unstable_unsynced+((dirty_it->second.state & BS_ST_INSTANT) ? 0 : 1))*journal.block_size))
{
return 0;
}

View File

@@ -106,7 +106,7 @@ resume_2:
if (etcd_states[i]["error"].is_null())
{
etcd_alive++;
etcd_db_size = etcd_states[i]["dbSizeInUse"].uint64_value();
etcd_db_size = etcd_states[i]["dbSize"].uint64_value();
}
}
int mon_count = 0;

View File

@@ -233,6 +233,8 @@ void osd_t::parse_config(bool init)
? 10 : config["recovery_tune_agg_interval"].uint64_value();
recovery_tune_sleep_min_us = config["recovery_tune_sleep_min_us"].is_null()
? 10 : config["recovery_tune_sleep_min_us"].uint64_value();
recovery_tune_sleep_cutoff_us = config["recovery_tune_sleep_cutoff_us"].is_null()
? 10000000 : config["recovery_tune_sleep_cutoff_us"].uint64_value();
recovery_pg_switch = config["recovery_pg_switch"].uint64_value();
if (recovery_pg_switch < 1)
recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;

View File

@@ -125,6 +125,7 @@ class osd_t
int recovery_tune_interval = 1;
int recovery_tune_agg_interval = 10;
int recovery_tune_sleep_min_us = 10;
int recovery_tune_sleep_cutoff_us = 10000000;
int recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
int inode_vanish_time = 60;
@@ -282,6 +283,7 @@ class osd_t
void exec_sync_stab_all(osd_op_t *cur_op);
void exec_show_config(osd_op_t *cur_op);
void exec_secondary(osd_op_t *cur_op);
void exec_secondary_real(osd_op_t *cur_op);
void secondary_op_callback(osd_op_t *cur_op);
// primary ops

View File

@@ -422,6 +422,10 @@ void osd_t::tune_recovery()
rtune_avg_lat = total_recovery_usec/recovery_count;
uint64_t target_lat = rtune_avg_lat * rtune_avg_lat/1000000.0 * recovery_count/recovery_tune_interval / rtune_target_util;
auto sleep_us = target_lat > rtune_avg_lat+recovery_tune_sleep_min_us ? target_lat-rtune_avg_lat : 0;
if (sleep_us > recovery_tune_sleep_cutoff_us)
{
return;
}
if (recovery_target_sleep_items.size() != recovery_tune_agg_interval)
{
recovery_target_sleep_items.resize(recovery_tune_agg_interval);
@@ -438,7 +442,7 @@ void osd_t::tune_recovery()
if (recovery_target_sleep_count < recovery_tune_agg_interval)
recovery_target_sleep_count++;
recovery_target_sleep_us = recovery_target_sleep_total / recovery_target_sleep_count;
if (log_level > 4)
if (log_level > 1)
{
printf(
"[OSD %lu] auto-tune: client util: %.2f, recovery util: %.2f, lat: %lu us -> target util %.2f, delay %lu us\n",

View File

@@ -222,6 +222,9 @@ void osd_t::start_pg_peering(pg_t & pg)
}
if (pg.pg_cursize < pg.pg_minsize)
{
// FIXME: Incomplete EC PGs may currently easily lead to write hangs ("slow ops" in OSD logs)
// because such PGs don't flush unstable entries on secondary OSDs so they can't remove these
// entries from their journals...
pg.state = PG_INCOMPLETE;
report_pg_state(pg);
return;

View File

@@ -861,15 +861,15 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
static void calc_rmw_parity_copy_parity(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t start, uint32_t end)
{
if (write_osd_set != read_osd_set)
if (write_osd_set != read_osd_set && end != 0)
{
for (int role = pg_minsize; role < pg_size; role++)
{
if (write_osd_set[role] != read_osd_set[role] && (start != 0 || end != chunk_size))
if (write_osd_set[role] != read_osd_set[role] && write_osd_set[role] != 0 && (start != 0 || end != chunk_size))
{
// Copy new parity into the read buffer to write it back
memcpy(
(uint8_t*)stripes[role].read_buf + start,
(uint8_t*)stripes[role].read_buf + start - stripes[role].read_start,
stripes[role].write_buf,
end - start
);

View File

@@ -30,6 +30,7 @@ void test16();
void test_recover_22_d2();
void test_ec43_error_bruteforce();
void test_recover_53_d5();
void test_recover_22();
int main(int narg, char *args[])
{
@@ -70,6 +71,8 @@ int main(int narg, char *args[])
test_ec43_error_bruteforce();
// Test 19
test_recover_53_d5();
// Test 20
test_recover_22();
// End
printf("all ok\n");
return 0;
@@ -1244,3 +1247,99 @@ void test_recover_53_d5()
// Done
use_ec(8, 5, false);
}
void test_recover_22()
{
const int bmp = 128*1024 / 4096 / 8;
use_ec(4, 2, true);
osd_num_t osd_set[4] = { 1, 2, 3, 4 };
osd_num_t write_osd_set[4] = { 5, 0, 3, 0 };
osd_rmw_stripe_t stripes[4] = {};
unsigned bitmaps[4] = { 0 };
// split
void *write_buf = (uint8_t*)malloc_or_die(4096);
set_pattern(write_buf, 4096, PATTERN0);
split_stripes(2, 128*1024, 120*1024, 4096, stripes);
assert(stripes[0].req_start == 120*1024 && stripes[0].req_end == 124*1024);
assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
// calc_rmw
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 2, write_osd_set, 128*1024, bmp);
for (int i = 0; i < 4; i++)
stripes[i].bmp_buf = bitmaps+i;
assert(rmw_buf);
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
assert(stripes[1].read_start == 120*1024 && stripes[1].read_end == 124*1024);
assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
assert(stripes[0].write_start == 120*1024 && stripes[0].write_end == 124*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
assert(stripes[2].write_start == 120*1024 && stripes[2].write_end == 124*1024);
assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
assert(stripes[1].read_buf == (uint8_t*)rmw_buf+132*1024);
assert(stripes[2].read_buf == NULL);
assert(stripes[3].read_buf == NULL);
assert(stripes[0].write_buf == write_buf);
assert(stripes[1].write_buf == NULL);
assert(stripes[2].write_buf == (uint8_t*)rmw_buf);
assert(stripes[3].write_buf == NULL);
// encode
set_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
set_pattern(stripes[1].read_buf, 4*1024, PATTERN2);
memset(stripes[0].bmp_buf, 0xff, bmp);
memset(stripes[1].bmp_buf, 0xff, bmp);
calc_rmw_parity_ec(stripes, 4, 2, osd_set, write_osd_set, 128*1024, bmp);
assert(*(uint32_t*)stripes[2].bmp_buf == 0);
assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
assert(stripes[2].write_start == 120*1024 && stripes[2].write_end == 124*1024);
assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
assert(stripes[0].write_buf == stripes[0].read_buf);
assert(stripes[1].write_buf == NULL);
assert(stripes[2].write_buf == (uint8_t*)rmw_buf);
assert(stripes[3].write_buf == NULL);
check_pattern(stripes[2].write_buf, 4*1024, PATTERN0^PATTERN2);
// decode and verify
memset(stripes, 0, sizeof(stripes));
split_stripes(2, 128*1024, 0, 256*1024, stripes);
assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024);
assert(stripes[1].req_start == 0 && stripes[1].req_end == 128*1024);
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
for (int role = 0; role < 4; role++)
{
stripes[role].read_start = stripes[role].req_start;
stripes[role].read_end = stripes[role].req_end;
}
assert(extend_missing_stripes(stripes, write_osd_set, 2, 4) == 0);
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
void *read_buf = alloc_read_buffer(stripes, 4, 0);
for (int i = 0; i < 4; i++)
stripes[i].bmp_buf = bitmaps+i;
assert(read_buf);
assert(stripes[0].read_buf == read_buf);
assert(stripes[1].read_buf == (uint8_t*)read_buf+128*1024);
assert(stripes[2].read_buf == (uint8_t*)read_buf+2*128*1024);
set_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
set_pattern(stripes[0].read_buf+120*1024, 4*1024, PATTERN0);
set_pattern(stripes[2].read_buf, 128*1024, PATTERN1^PATTERN2);
set_pattern(stripes[2].read_buf+120*1024, 4*1024, PATTERN0^PATTERN2);
memset(stripes[0].bmp_buf, 0xff, bmp);
memset(stripes[2].bmp_buf, 0, bmp);
bitmaps[1] = 0;
bitmaps[3] = 0;
reconstruct_stripes_ec(stripes, 4, 2, bmp);
assert(bitmaps[0] == 0xFFFFFFFF);
assert(*(uint32_t*)stripes[1].bmp_buf == 0xFFFFFFFF);
check_pattern(stripes[1].read_buf, 128*1024, PATTERN2);
free(read_buf);
// Done
free(rmw_buf);
free(write_buf);
use_ec(4, 2, false);
}

View File

@@ -42,8 +42,10 @@ void osd_t::secondary_op_callback(osd_op_t *op)
int retval = op->bs_op->retval;
delete op->bs_op;
op->bs_op = NULL;
if (op->is_recovery_related() && recovery_target_sleep_us)
if (op->is_recovery_related() && recovery_target_sleep_us &&
op->req.hdr.opcode == OSD_OP_SEC_STABILIZE)
{
// Apply pause AFTER commit. Do not apply pause to SYNC at all
if (!op->tv_end.tv_sec)
{
clock_gettime(CLOCK_REALTIME, &op->tv_end);
@@ -59,7 +61,25 @@ void osd_t::secondary_op_callback(osd_op_t *op)
}
}
void osd_t::exec_secondary(osd_op_t *cur_op)
void osd_t::exec_secondary(osd_op_t *op)
{
if (op->is_recovery_related() && recovery_target_sleep_us &&
op->req.hdr.opcode != OSD_OP_SEC_STABILIZE && op->req.hdr.opcode != OSD_OP_SEC_SYNC)
{
// Apply pause BEFORE write/delete
tfd->set_timer_us(recovery_target_sleep_us, false, [this, op](int timer_id)
{
clock_gettime(CLOCK_REALTIME, &op->tv_begin);
exec_secondary_real(op);
});
}
else
{
exec_secondary_real(op);
}
}
void osd_t::exec_secondary_real(osd_op_t *cur_op)
{
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
{

View File

@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: Vitastor
Description: Vitastor client library
Version: 1.4.3
Version: 1.4.4
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}