forked from vitalif/vitastor
Compare commits
42 Commits
Author | SHA1 | Date |
---|---|---|
Vitaliy Filippov | ec90fe6ec1 | |
Vitaliy Filippov | 18c72f4835 | |
Vitaliy Filippov | 59fbcef734 | |
Vitaliy Filippov | 40b7c21fb1 | |
Vitaliy Filippov | efb3678606 | |
Vitaliy Filippov | 462650134e | |
Vitaliy Filippov | 8d87e32175 | |
Vitaliy Filippov | b0b2e7df3c | |
Vitaliy Filippov | 97efb9e299 | |
Vitaliy Filippov | f6d705383a | |
Vitaliy Filippov | 68567c0e1f | |
Vitaliy Filippov | 04b00003e9 | |
Vitaliy Filippov | 307c1731c1 | |
Vitaliy Filippov | 75a6a556b5 | |
Vitaliy Filippov | a48e2bbf18 | |
Vitaliy Filippov | 688821665a | |
Vitaliy Filippov | 3e162d95a0 | |
Vitaliy Filippov | 829381b335 | |
Vitaliy Filippov | 54f2353f24 | |
Vitaliy Filippov | e47f6fba60 | |
Vitaliy Filippov | 883bf84a16 | |
Vitaliy Filippov | 52097c4856 | |
Vitaliy Filippov | e1355cbc74 | |
Vitaliy Filippov | 8f8b90be7a | |
Vitaliy Filippov | ad9f619370 | |
Vitaliy Filippov | f4769ba7c7 | |
Vitaliy Filippov | 843b7052d2 | |
Vitaliy Filippov | df99e232ee | |
Vitaliy Filippov | 3a40fa4127 | |
Vitaliy Filippov | 4095bcc558 | |
Vitaliy Filippov | 564d64e271 | |
Vitaliy Filippov | cf54741c95 | |
Vitaliy Filippov | 18a5fafa2a | |
Vitaliy Filippov | 06f4978085 | |
Vitaliy Filippov | 7ebf1588c5 | |
Vitaliy Filippov | b0ad1e1e6d | |
Vitaliy Filippov | 0949f08407 | |
Vitaliy Filippov | 04a1f18fa5 | |
Vitaliy Filippov | cf9a641d66 | |
Vitaliy Filippov | 05db1308aa | |
Vitaliy Filippov | 98b54ca948 | |
Vitaliy Filippov | 23225c5e62 |
|
@ -1,4 +1,4 @@
|
|||
vitastor (0.5.10-1) unstable; urgency=medium
|
||||
vitastor (0.5.13-1) unstable; urgency=medium
|
||||
|
||||
* Bugfixes
|
||||
|
||||
|
|
|
@ -40,10 +40,10 @@ RUN set -e -x; \
|
|||
mkdir -p /root/packages/vitastor-$REL; \
|
||||
rm -rf /root/packages/vitastor-$REL/*; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
cp -r /root/vitastor vitastor-0.5.10; \
|
||||
ln -s /root/packages/qemu-$REL/qemu-*/ vitastor-0.5.10/qemu; \
|
||||
ln -s /root/fio-build/fio-*/ vitastor-0.5.10/fio; \
|
||||
cd vitastor-0.5.10; \
|
||||
cp -r /root/vitastor vitastor-0.5.13; \
|
||||
ln -s /root/packages/qemu-$REL/qemu-*/ vitastor-0.5.13/qemu; \
|
||||
ln -s /root/fio-build/fio-*/ vitastor-0.5.13/fio; \
|
||||
cd vitastor-0.5.13; \
|
||||
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
QEMU=$(head -n1 qemu/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
sh copy-qemu-includes.sh; \
|
||||
|
@ -59,8 +59,8 @@ RUN set -e -x; \
|
|||
echo "dep:fio=$FIO" > debian/substvars; \
|
||||
echo "dep:qemu=$QEMU" >> debian/substvars; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.5.10.orig.tar.xz vitastor-0.5.10; \
|
||||
cd vitastor-0.5.10; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.5.13.orig.tar.xz vitastor-0.5.13; \
|
||||
cd vitastor-0.5.13; \
|
||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||
|
|
|
@ -104,6 +104,17 @@ async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize =
|
|||
return res;
|
||||
}
|
||||
|
||||
// Shuffle `array` in place (Fisher-Yates): walk i from the last index down to 1
// and swap element i with the element at a random index j in [0, i].
// Nothing is returned; the array passed by the caller is mutated.
function shuffle(array)
|
||||
{
|
||||
for (let i = array.length - 1, j, x; i > 0; i--)
|
||||
{
|
||||
// j is a random integer with 0 <= j <= i
|
||||
j = Math.floor(Math.random() * (i + 1));
|
||||
// swap array[i] and array[j] through the temporary x
|
||||
x = array[i];
|
||||
array[i] = array[j];
|
||||
array[j] = x;
|
||||
}
|
||||
}
|
||||
|
||||
function make_int_pgs(weights, pg_count)
|
||||
{
|
||||
const total_weight = Object.values(weights).reduce((a, c) => Number(a) + Number(c), 0);
|
||||
|
@ -120,6 +131,7 @@ function make_int_pgs(weights, pg_count)
|
|||
weight_left -= weights[pg_name];
|
||||
pg_left -= n;
|
||||
}
|
||||
shuffle(int_pgs);
|
||||
return int_pgs;
|
||||
}
|
||||
|
||||
|
|
|
@ -53,7 +53,6 @@ ExecStart=/usr/bin/vitastor-osd \\
|
|||
--osd_num $OSD_NUM \\
|
||||
--disable_data_fsync 1 \\
|
||||
--immediate_commit all \\
|
||||
--flusher_count 256 \\
|
||||
--disk_alignment 4096 --journal_block_size 4096 --meta_block_size 4096 \\
|
||||
--journal_no_same_sector_overwrites true \\
|
||||
--journal_sector_buffer_count 1024 \\
|
||||
|
|
|
@ -32,7 +32,8 @@ ExecStart=/usr/local/bin/etcd -name etcd$ETCD_NUM --data-dir /var/lib/etcd$ETCD_
|
|||
--advertise-client-urls http://$IP:2379 --listen-client-urls http://$IP:2379 \\
|
||||
--initial-advertise-peer-urls http://$IP:2380 --listen-peer-urls http://$IP:2380 \\
|
||||
--initial-cluster-token vitastor-etcd-1 --initial-cluster $ETCD_HOSTS \\
|
||||
--initial-cluster-state new --max-txn-ops=100000 --auto-compaction-retention=10 --auto-compaction-mode=revision
|
||||
--initial-cluster-state new --max-txn-ops=100000 --max-request-bytes=104857600 \\
|
||||
--auto-compaction-retention=10 --auto-compaction-mode=revision
|
||||
WorkingDirectory=/var/lib/etcd$ETCD_NUM.etcd
|
||||
ExecStartPre=+chown -R etcd /var/lib/etcd$ETCD_NUM.etcd
|
||||
User=etcd
|
||||
|
|
50
mon/mon.js
50
mon/mon.js
|
@ -92,7 +92,8 @@ const etcd_tree = {
|
|||
disable_device_lock,
|
||||
// blockstore - configurable
|
||||
max_write_iodepth,
|
||||
flusher_count,
|
||||
min_flusher_count: 1,
|
||||
max_flusher_count: 256,
|
||||
inmemory_metadata,
|
||||
inmemory_journal,
|
||||
journal_sector_buffer_count,
|
||||
|
@ -182,7 +183,7 @@ const etcd_tree = {
|
|||
/* <pool_id>: {
|
||||
<pg_id>: {
|
||||
primary: osd_num_t,
|
||||
state: ("starting"|"peering"|"incomplete"|"active"|"stopping"|"offline"|
|
||||
state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
|
||||
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
|
||||
"has_invalid"|"left_on_dead")[],
|
||||
}
|
||||
|
@ -541,7 +542,7 @@ class Mon
|
|||
for (const osd_num of this.all_osds().sort((a, b) => a - b))
|
||||
{
|
||||
const stat = this.state.osd.stats[osd_num];
|
||||
if (stat.size && (this.state.osd.state[osd_num] || Number(stat.time) >= down_time))
|
||||
if (stat && stat.size && (this.state.osd.state[osd_num] || Number(stat.time) >= down_time))
|
||||
{
|
||||
// Numeric IDs are reserved for OSDs
|
||||
const osd_cfg = this.state.config.osd[osd_num];
|
||||
|
@ -692,6 +693,11 @@ class Mon
|
|||
pg_history[i].osd_sets = pg_history[i].osd_sets || [];
|
||||
pg_history[i].osd_sets.push(prev_pgs[i]);
|
||||
}
|
||||
if (pg_history[i] && pg_history[i].osd_sets)
|
||||
{
|
||||
pg_history[i].osd_sets = Object.values(pg_history[i].osd_sets
|
||||
.reduce((a, c) => { a[c.join(' ')] = c; return a; }, {}));
|
||||
}
|
||||
});
|
||||
for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++)
|
||||
{
|
||||
|
@ -842,7 +848,7 @@ class Mon
|
|||
{
|
||||
// Take configuration and state, check it against the stored configuration hash
|
||||
// Recalculate PGs and save them to etcd if the configuration is changed
|
||||
// FIXME: Also do not change anything if the distribution is good enough and no PGs are degraded
|
||||
// FIXME: Do not change anything if the distribution is good and random enough and no PGs are degraded
|
||||
const { up_osds, levels, osd_tree } = this.get_osd_tree();
|
||||
const tree_cfg = {
|
||||
osd_tree,
|
||||
|
@ -901,7 +907,14 @@ class Mon
|
|||
prev_pgs[pg-1] = this.state.history.last_clean_pgs.items[pool_id][pg].osd_set;
|
||||
}
|
||||
prev_pgs = JSON.parse(JSON.stringify(prev_pgs.length ? prev_pgs : real_prev_pgs));
|
||||
const old_pg_count = prev_pgs.length;
|
||||
const old_pg_count = real_prev_pgs.length;
|
||||
const optimize_cfg = {
|
||||
osd_tree: pool_tree,
|
||||
pg_count: pool_cfg.pg_count,
|
||||
pg_size: pool_cfg.pg_size,
|
||||
pg_minsize: pool_cfg.pg_minsize,
|
||||
max_combinations: pool_cfg.max_osd_combinations,
|
||||
};
|
||||
let optimize_result;
|
||||
if (old_pg_count > 0)
|
||||
{
|
||||
|
@ -928,24 +941,23 @@ class Mon
|
|||
pg.pop();
|
||||
}
|
||||
}
|
||||
optimize_result = await LPOptimizer.optimize_change({
|
||||
prev_pgs,
|
||||
osd_tree: pool_tree,
|
||||
pg_size: pool_cfg.pg_size,
|
||||
pg_minsize: pool_cfg.pg_minsize,
|
||||
max_combinations: pool_cfg.max_osd_combinations,
|
||||
});
|
||||
if (!this.state.config.pgs.hash)
|
||||
{
|
||||
// Re-shuffle PGs
|
||||
optimize_result = await LPOptimizer.optimize_initial(optimize_cfg);
|
||||
}
|
||||
else
|
||||
{
|
||||
optimize_result = await LPOptimizer.optimize_initial({
|
||||
osd_tree: pool_tree,
|
||||
pg_count: pool_cfg.pg_count,
|
||||
pg_size: pool_cfg.pg_size,
|
||||
pg_minsize: pool_cfg.pg_minsize,
|
||||
max_combinations: pool_cfg.max_osd_combinations,
|
||||
optimize_result = await LPOptimizer.optimize_change({
|
||||
prev_pgs,
|
||||
...optimize_cfg,
|
||||
});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
optimize_result = await LPOptimizer.optimize_initial(optimize_cfg);
|
||||
}
|
||||
if (old_pg_count != optimize_result.int_pgs.length)
|
||||
{
|
||||
console.log(
|
||||
|
@ -1072,7 +1084,7 @@ class Mon
|
|||
const op_stats = {}, subop_stats = {}, recovery_stats = {};
|
||||
for (const osd in this.state.osd.stats)
|
||||
{
|
||||
const st = this.state.osd.stats[osd];
|
||||
const st = this.state.osd.stats[osd]||{};
|
||||
for (const op in st.op_stats||{})
|
||||
{
|
||||
op_stats[op] = op_stats[op] || { count: 0n, usec: 0n, bytes: 0n };
|
||||
|
|
|
@ -48,4 +48,4 @@ FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Ve
|
|||
QEMU=`rpm -qi qemu qemu-kvm | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||
perl -i -pe 's/(Requires:\s*qemu(?:-kvm)?)([^\n]+)?/$1 = '$QEMU'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||
tar --transform 's#^#vitastor-0.5.10/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.5.10$(rpm --eval '%dist').tar.gz *
|
||||
tar --transform 's#^#vitastor-0.5.13/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.5.13$(rpm --eval '%dist').tar.gz *
|
||||
|
|
|
@ -37,7 +37,7 @@ ADD . /root/vitastor
|
|||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.5.10.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.5.13.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
Name: vitastor
|
||||
Version: 0.5.10
|
||||
Version: 0.5.13
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.5.10.el7.tar.gz
|
||||
Source0: vitastor-0.5.13.el7.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
|
|
@ -35,7 +35,7 @@ ADD . /root/vitastor
|
|||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.5.10.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.5.13.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
Name: vitastor
|
||||
Version: 0.5.10
|
||||
Version: 0.5.13
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.5.10.el8.tar.gz
|
||||
Source0: vitastor-0.5.13.el8.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
|
|
@ -14,7 +14,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
|||
endif()
|
||||
|
||||
add_definitions(-DVERSION="0.6-dev")
|
||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith)
|
||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -I ${CMAKE_SOURCE_DIR}/src)
|
||||
if (${WITH_ASAN})
|
||||
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
||||
add_link_options(-fsanitize=address -fno-omit-frame-pointer)
|
||||
|
@ -66,7 +66,8 @@ target_link_libraries(fio_vitastor_blk
|
|||
# vitastor-osd
|
||||
add_executable(vitastor-osd
|
||||
osd_main.cpp osd.cpp osd_secondary.cpp msgr_receive.cpp msgr_send.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp
|
||||
osd_primary.cpp osd_primary_subops.cpp etcd_state_client.cpp messenger.cpp osd_cluster.cpp http_client.cpp osd_ops.cpp pg_states.cpp
|
||||
osd_primary.cpp osd_primary_sync.cpp osd_primary_write.cpp osd_primary_subops.cpp
|
||||
etcd_state_client.cpp messenger.cpp msgr_stop.cpp msgr_op.cpp osd_cluster.cpp http_client.cpp osd_ops.cpp pg_states.cpp
|
||||
osd_rmw.cpp base64.cpp timerfd_manager.cpp epoll_manager.cpp ../json11/json11.cpp
|
||||
)
|
||||
target_link_libraries(vitastor-osd
|
||||
|
@ -86,7 +87,7 @@ target_link_libraries(fio_vitastor_sec
|
|||
# libvitastor_client.so
|
||||
add_library(vitastor_client SHARED
|
||||
cluster_client.cpp epoll_manager.cpp etcd_state_client.cpp
|
||||
messenger.cpp msgr_send.cpp msgr_receive.cpp ringloop.cpp ../json11/json11.cpp
|
||||
messenger.cpp msgr_stop.cpp msgr_op.cpp msgr_send.cpp msgr_receive.cpp ringloop.cpp ../json11/json11.cpp
|
||||
http_client.cpp osd_ops.cpp pg_states.cpp timerfd_manager.cpp base64.cpp
|
||||
)
|
||||
target_link_libraries(vitastor_client
|
||||
|
@ -161,7 +162,8 @@ target_link_libraries(osd_rmw_test Jerasure tcmalloc_minimal)
|
|||
|
||||
# stub_uring_osd
|
||||
add_executable(stub_uring_osd
|
||||
stub_uring_osd.cpp epoll_manager.cpp messenger.cpp msgr_send.cpp msgr_receive.cpp ringloop.cpp timerfd_manager.cpp ../json11/json11.cpp
|
||||
stub_uring_osd.cpp epoll_manager.cpp messenger.cpp msgr_stop.cpp msgr_op.cpp
|
||||
msgr_send.cpp msgr_receive.cpp ringloop.cpp timerfd_manager.cpp ../json11/json11.cpp
|
||||
)
|
||||
target_link_libraries(stub_uring_osd
|
||||
${LIBURING_LIBRARIES}
|
||||
|
@ -175,6 +177,15 @@ target_link_libraries(osd_peering_pg_test tcmalloc_minimal)
|
|||
# test_allocator
|
||||
add_executable(test_allocator test_allocator.cpp allocator.cpp)
|
||||
|
||||
# test_cluster_client
|
||||
add_executable(test_cluster_client
|
||||
test_cluster_client.cpp
|
||||
pg_states.cpp osd_ops.cpp cluster_client.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
|
||||
etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp
|
||||
)
|
||||
target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
|
||||
target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mock)
|
||||
|
||||
## test_blockstore, test_shit
|
||||
#add_executable(test_blockstore test_blockstore.cpp timerfd_interval.cpp)
|
||||
#target_link_libraries(test_blockstore blockstore)
|
||||
|
|
|
@ -37,6 +37,21 @@ allocator::~allocator()
|
|||
delete[] mask;
|
||||
}
|
||||
|
||||
// Return the bit stored for `addr`, or false when `addr` is out of range (addr >= size).
bool allocator::get(uint64_t addr)
|
||||
{
|
||||
if (addr >= size)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
uint64_t p2 = 1, offset = 0;
|
||||
// Accumulate offset = 1 + 64 + 64^2 + ... for as long as p2*64 < size.
// NOTE(review): presumably this skips the upper levels of a multi-level bitmap
// so that `offset` points at the level holding one bit per address - confirm
// against the constructor and set().
while (p2 * 64 < size)
|
||||
{
|
||||
offset += p2;
|
||||
p2 = p2 * 64;
|
||||
}
|
||||
// Each mask word holds 64 bits: pick word addr/64 after `offset`, then bit addr%64.
return ((mask[offset + addr/64] >> (addr % 64)) & 1);
|
||||
}
|
||||
|
||||
void allocator::set(uint64_t addr, bool value)
|
||||
{
|
||||
if (addr >= size)
|
||||
|
|
|
@ -16,6 +16,7 @@ class allocator
|
|||
public:
|
||||
allocator(uint64_t blocks);
|
||||
~allocator();
|
||||
bool get(uint64_t addr);
|
||||
void set(uint64_t addr, bool value);
|
||||
uint64_t find_free();
|
||||
uint64_t get_free_count();
|
||||
|
|
|
@ -58,7 +58,7 @@ uint64_t blockstore_t::get_free_block_count()
|
|||
return impl->get_free_block_count();
|
||||
}
|
||||
|
||||
uint32_t blockstore_t::get_disk_alignment()
|
||||
uint32_t blockstore_t::get_bitmap_granularity()
|
||||
{
|
||||
return impl->get_disk_alignment();
|
||||
return impl->get_bitmap_granularity();
|
||||
}
|
||||
|
|
|
@ -183,5 +183,5 @@ public:
|
|||
uint64_t get_block_count();
|
||||
uint64_t get_free_block_count();
|
||||
|
||||
uint32_t get_disk_alignment();
|
||||
uint32_t get_bitmap_granularity();
|
||||
};
|
||||
|
|
|
@ -3,12 +3,13 @@
|
|||
|
||||
#include "blockstore_impl.h"
|
||||
|
||||
journal_flusher_t::journal_flusher_t(int flusher_count, blockstore_impl_t *bs)
|
||||
journal_flusher_t::journal_flusher_t(blockstore_impl_t *bs)
|
||||
{
|
||||
this->bs = bs;
|
||||
this->flusher_count = flusher_count;
|
||||
this->cur_flusher_count = 1;
|
||||
this->target_flusher_count = 1;
|
||||
this->max_flusher_count = bs->max_flusher_count;
|
||||
this->min_flusher_count = bs->min_flusher_count;
|
||||
this->cur_flusher_count = bs->min_flusher_count;
|
||||
this->target_flusher_count = bs->min_flusher_count;
|
||||
dequeuing = false;
|
||||
trimming = false;
|
||||
active_flushers = 0;
|
||||
|
@ -19,8 +20,8 @@ journal_flusher_t::journal_flusher_t(int flusher_count, blockstore_impl_t *bs)
|
|||
journal_trim_counter = 0;
|
||||
trim_wanted = 0;
|
||||
journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign_or_die(MEM_ALIGNMENT, bs->journal_block_size);
|
||||
co = new journal_flusher_co[flusher_count];
|
||||
for (int i = 0; i < flusher_count; i++)
|
||||
co = new journal_flusher_co[max_flusher_count];
|
||||
for (int i = 0; i < max_flusher_count; i++)
|
||||
{
|
||||
co[i].bs = bs;
|
||||
co[i].flusher = this;
|
||||
|
@ -71,10 +72,10 @@ bool journal_flusher_t::is_active()
|
|||
void journal_flusher_t::loop()
|
||||
{
|
||||
target_flusher_count = bs->write_iodepth*2;
|
||||
if (target_flusher_count <= 0)
|
||||
target_flusher_count = 1;
|
||||
else if (target_flusher_count > flusher_count)
|
||||
target_flusher_count = flusher_count;
|
||||
if (target_flusher_count < min_flusher_count)
|
||||
target_flusher_count = min_flusher_count;
|
||||
else if (target_flusher_count > max_flusher_count)
|
||||
target_flusher_count = max_flusher_count;
|
||||
if (target_flusher_count > cur_flusher_count)
|
||||
cur_flusher_count = target_flusher_count;
|
||||
else if (target_flusher_count < cur_flusher_count)
|
||||
|
@ -237,7 +238,8 @@ bool journal_flusher_co::loop()
|
|||
else if (wait_state == 21)
|
||||
goto resume_21;
|
||||
resume_0:
|
||||
if (!flusher->flush_queue.size() || !flusher->dequeuing)
|
||||
if (flusher->flush_queue.size() < flusher->min_flusher_count && !flusher->trim_wanted ||
|
||||
!flusher->flush_queue.size() || !flusher->dequeuing)
|
||||
{
|
||||
stop_flusher:
|
||||
if (flusher->trim_wanted > 0 && flusher->journal_trim_counter > 0)
|
||||
|
@ -482,6 +484,14 @@ resume_1:
|
|||
}
|
||||
if (has_delete)
|
||||
{
|
||||
clean_disk_entry *new_entry = (clean_disk_entry*)(meta_new.buf + meta_new.pos*bs->clean_entry_size);
|
||||
if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid)
|
||||
{
|
||||
printf("Fatal error (metadata corruption or bug): tried to delete metadata entry %lu (%lx:%lx) while deleting %lx:%lx\n",
|
||||
clean_loc >> bs->block_order, new_entry->oid.inode, new_entry->oid.stripe, cur.oid.inode, cur.oid.stripe);
|
||||
exit(1);
|
||||
}
|
||||
// zero out new metadata entry
|
||||
memset(meta_new.buf + meta_new.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
|
||||
}
|
||||
else
|
||||
|
@ -646,7 +656,7 @@ bool journal_flusher_co::scan_dirty(int wait_base)
|
|||
{
|
||||
char err[1024];
|
||||
snprintf(
|
||||
err, 1024, "BUG: Unexpected dirty_entry %lx:%lx v%lu unstable state during flush: %d",
|
||||
err, 1024, "BUG: Unexpected dirty_entry %lx:%lx v%lu unstable state during flush: 0x%x",
|
||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, dirty_it->second.state
|
||||
);
|
||||
throw std::runtime_error(err);
|
||||
|
@ -775,7 +785,10 @@ void journal_flusher_co::update_clean_db()
|
|||
if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc)
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Free block %lu (new location is %lu)\n", old_clean_loc >> bs->block_order, clean_loc >> bs->block_order);
|
||||
printf("Free block %lu from %lx:%lx v%lu (new location is %lu)\n",
|
||||
old_clean_loc >> bs->block_order,
|
||||
cur.oid.inode, cur.oid.stripe, cur.version,
|
||||
clean_loc >> bs->block_order);
|
||||
#endif
|
||||
bs->data_alloc->set(old_clean_loc >> bs->block_order, false);
|
||||
}
|
||||
|
@ -783,6 +796,11 @@ void journal_flusher_co::update_clean_db()
|
|||
{
|
||||
auto clean_it = bs->clean_db.find(cur.oid);
|
||||
bs->clean_db.erase(clean_it);
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Free block %lu from %lx:%lx v%lu (delete)\n",
|
||||
clean_loc >> bs->block_order,
|
||||
cur.oid.inode, cur.oid.stripe, cur.version);
|
||||
#endif
|
||||
bs->data_alloc->set(clean_loc >> bs->block_order, false);
|
||||
clean_loc = UINT64_MAX;
|
||||
}
|
||||
|
@ -804,7 +822,7 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
|
|||
goto resume_1;
|
||||
else if (wait_state == wait_base+2)
|
||||
goto resume_2;
|
||||
if (!(fsync_meta ? bs->disable_meta_fsync : bs->disable_journal_fsync))
|
||||
if (!(fsync_meta ? bs->disable_meta_fsync : bs->disable_data_fsync))
|
||||
{
|
||||
cur_sync = flusher->syncs.end();
|
||||
while (cur_sync != flusher->syncs.begin())
|
||||
|
|
|
@ -80,7 +80,7 @@ class journal_flusher_t
|
|||
{
|
||||
int trim_wanted = 0;
|
||||
bool dequeuing;
|
||||
int flusher_count, cur_flusher_count, target_flusher_count;
|
||||
int min_flusher_count, max_flusher_count, cur_flusher_count, target_flusher_count;
|
||||
int flusher_start_threshold;
|
||||
journal_flusher_co *co;
|
||||
blockstore_impl_t *bs;
|
||||
|
@ -99,7 +99,7 @@ class journal_flusher_t
|
|||
std::deque<object_id> flush_queue;
|
||||
std::map<object_id, uint64_t> flush_versions;
|
||||
public:
|
||||
journal_flusher_t(int flusher_count, blockstore_impl_t *bs);
|
||||
journal_flusher_t(blockstore_impl_t *bs);
|
||||
~journal_flusher_t();
|
||||
void loop();
|
||||
bool is_active();
|
||||
|
|
|
@ -10,9 +10,9 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
|
|||
ring_consumer.loop = [this]() { loop(); };
|
||||
ringloop->register_consumer(&ring_consumer);
|
||||
initialized = 0;
|
||||
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, block_size);
|
||||
data_fd = meta_fd = journal.fd = -1;
|
||||
parse_config(config);
|
||||
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, block_size);
|
||||
try
|
||||
{
|
||||
open_data();
|
||||
|
@ -31,7 +31,7 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
|
|||
close(journal.fd);
|
||||
throw;
|
||||
}
|
||||
flusher = new journal_flusher_t(flusher_count, this);
|
||||
flusher = new journal_flusher_t(this);
|
||||
}
|
||||
|
||||
blockstore_impl_t::~blockstore_impl_t()
|
||||
|
|
|
@ -197,8 +197,8 @@ class blockstore_impl_t
|
|||
// Suitable only for server SSDs with capacitors, requires disabled data and journal fsyncs
|
||||
int immediate_commit = IMMEDIATE_NONE;
|
||||
bool inmemory_meta = false;
|
||||
// Maximum flusher count
|
||||
unsigned flusher_count;
|
||||
// Maximum and minimum flusher count
|
||||
unsigned max_flusher_count, min_flusher_count;
|
||||
// Maximum queue depth
|
||||
unsigned max_write_iodepth = 128;
|
||||
/******* END OF OPTIONS *******/
|
||||
|
@ -210,6 +210,7 @@ class blockstore_impl_t
|
|||
blockstore_dirty_db_t dirty_db;
|
||||
std::vector<blockstore_op_t*> submit_queue;
|
||||
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
|
||||
int unsynced_big_write_count = 0;
|
||||
allocator *data_alloc = NULL;
|
||||
uint8_t *zero_object;
|
||||
|
||||
|
@ -283,7 +284,7 @@ class blockstore_impl_t
|
|||
// Stabilize
|
||||
int dequeue_stable(blockstore_op_t *op);
|
||||
int continue_stable(blockstore_op_t *op);
|
||||
void mark_stable(const obj_ver_id & ov);
|
||||
void mark_stable(const obj_ver_id & ov, bool forget_dirty = false);
|
||||
void handle_stable_event(ring_data_t *data, blockstore_op_t *op);
|
||||
void stabilize_object(object_id oid, uint64_t max_ver);
|
||||
|
||||
|
@ -326,5 +327,5 @@ public:
|
|||
inline uint32_t get_block_size() { return block_size; }
|
||||
inline uint64_t get_block_count() { return block_count; }
|
||||
inline uint64_t get_free_block_count() { return data_alloc->get_free_count(); }
|
||||
inline uint32_t get_disk_alignment() { return disk_alignment; }
|
||||
inline uint32_t get_bitmap_granularity() { return disk_alignment; }
|
||||
};
|
||||
|
|
|
@ -111,7 +111,10 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
|
|||
{
|
||||
// free the previous block
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Free block %lu (new location is %lu)\n", clean_it->second.location >> block_order, done_cnt+i);
|
||||
printf("Free block %lu from %lx:%lx v%lu (new location is %lu)\n",
|
||||
clean_it->second.location >> block_order,
|
||||
clean_it->first.inode, clean_it->first.stripe, clean_it->second.version,
|
||||
done_cnt+i);
|
||||
#endif
|
||||
bs->data_alloc->set(clean_it->second.location >> block_order, false);
|
||||
}
|
||||
|
@ -399,6 +402,18 @@ resume_1:
|
|||
}
|
||||
}
|
||||
}
|
||||
for (auto ov: double_allocs)
|
||||
{
|
||||
auto dirty_it = bs->dirty_db.find(ov);
|
||||
if (dirty_it != bs->dirty_db.end() &&
|
||||
IS_BIG_WRITE(dirty_it->second.state) &&
|
||||
dirty_it->second.location == UINT64_MAX)
|
||||
{
|
||||
printf("Fatal error (bug): %lx:%lx v%lu big_write journal_entry was allocated over another object\n",
|
||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
bs->flusher->mark_trim_possible();
|
||||
bs->journal.dirty_start = bs->journal.next_free;
|
||||
printf(
|
||||
|
@ -549,7 +564,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
|||
unstab = unstab < ov.version ? ov.version : unstab;
|
||||
if (je->type == JE_SMALL_WRITE_INSTANT)
|
||||
{
|
||||
bs->mark_stable(ov);
|
||||
bs->mark_stable(ov, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -579,32 +594,10 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
|||
// its data and metadata are already flushed.
|
||||
// We don't know if newer versions are flushed, but
|
||||
// the previous delete definitely is.
|
||||
// So we flush previous dirty entries, but retain the clean one.
|
||||
// So we forget previous dirty entries, but retain the clean one.
|
||||
// This feature is required for writes happening shortly
|
||||
// after deletes.
|
||||
auto dirty_end = dirty_it;
|
||||
dirty_end++;
|
||||
while (1)
|
||||
{
|
||||
if (dirty_it == bs->dirty_db.begin())
|
||||
{
|
||||
break;
|
||||
}
|
||||
dirty_it--;
|
||||
if (dirty_it->first.oid != je->big_write.oid)
|
||||
{
|
||||
dirty_it++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
auto clean_it = bs->clean_db.find(je->big_write.oid);
|
||||
bs->erase_dirty(
|
||||
dirty_it, dirty_end,
|
||||
clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX
|
||||
);
|
||||
// Remove it from the flusher's queue, too
|
||||
// Otherwise it may end up referring to a small unstable write after reading the rest of the journal
|
||||
bs->flusher->remove_flush(je->big_write.oid);
|
||||
erase_dirty_object(dirty_it);
|
||||
}
|
||||
}
|
||||
auto clean_it = bs->clean_db.find(je->big_write.oid);
|
||||
|
@ -616,18 +609,33 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
|||
.oid = je->big_write.oid,
|
||||
.version = je->big_write.version,
|
||||
};
|
||||
bs->dirty_db.emplace(ov, (dirty_entry){
|
||||
auto dirty_it = bs->dirty_db.emplace(ov, (dirty_entry){
|
||||
.state = (BS_ST_BIG_WRITE | BS_ST_SYNCED),
|
||||
.flags = 0,
|
||||
.location = je->big_write.location,
|
||||
.offset = je->big_write.offset,
|
||||
.len = je->big_write.len,
|
||||
.journal_sector = proc_pos,
|
||||
});
|
||||
}).first;
|
||||
if (bs->data_alloc->get(je->big_write.location >> bs->block_order))
|
||||
{
|
||||
// This is probably a big_write that's already flushed and freed, but it may
|
||||
// also indicate a bug. So we remember such entries and recheck them afterwards.
|
||||
// If it's not a bug they won't be present after reading the whole journal.
|
||||
dirty_it->second.location = UINT64_MAX;
|
||||
double_allocs.push_back(ov);
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Allocate block %lu\n", je->big_write.location >> bs->block_order);
|
||||
printf(
|
||||
"Allocate block (journal) %lu: %lx:%lx v%lu\n",
|
||||
je->big_write.location >> bs->block_order,
|
||||
ov.oid.inode, ov.oid.stripe, ov.version
|
||||
);
|
||||
#endif
|
||||
bs->data_alloc->set(je->big_write.location >> bs->block_order, true);
|
||||
}
|
||||
bs->journal.used_sectors[proc_pos]++;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
|
@ -639,7 +647,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
|||
unstab = unstab < ov.version ? ov.version : unstab;
|
||||
if (je->type == JE_BIG_WRITE_INSTANT)
|
||||
{
|
||||
bs->mark_stable(ov);
|
||||
bs->mark_stable(ov, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -653,7 +661,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
|||
.oid = je->stable.oid,
|
||||
.version = je->stable.version,
|
||||
};
|
||||
bs->mark_stable(ov);
|
||||
bs->mark_stable(ov, true);
|
||||
}
|
||||
else if (je->type == JE_ROLLBACK)
|
||||
{
|
||||
|
@ -672,9 +680,26 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
|||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("je_delete oid=%lx:%lx ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
|
||||
#endif
|
||||
bool dirty_exists = false;
|
||||
auto dirty_it = bs->dirty_db.upper_bound((obj_ver_id){
|
||||
.oid = je->del.oid,
|
||||
.version = UINT64_MAX,
|
||||
});
|
||||
if (dirty_it != bs->dirty_db.begin())
|
||||
{
|
||||
dirty_it--;
|
||||
dirty_exists = dirty_it->first.oid == je->del.oid;
|
||||
}
|
||||
auto clean_it = bs->clean_db.find(je->del.oid);
|
||||
if (clean_it == bs->clean_db.end() ||
|
||||
clean_it->second.version < je->del.version)
|
||||
bool clean_exists = (clean_it != bs->clean_db.end() &&
|
||||
clean_it->second.version < je->del.version);
|
||||
if (!clean_exists && dirty_exists)
|
||||
{
|
||||
// Clean entry doesn't exist. This means that the delete is already flushed.
|
||||
// So we must not flush this object anymore.
|
||||
erase_dirty_object(dirty_it);
|
||||
}
|
||||
else if (clean_exists || dirty_exists)
|
||||
{
|
||||
// oid, version
|
||||
obj_ver_id ov = {
|
||||
|
@ -692,8 +717,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
|||
bs->journal.used_sectors[proc_pos]++;
|
||||
// Deletions are treated as immediately stable, because
|
||||
// "2-phase commit" (write->stabilize) isn't sufficient for them anyway
|
||||
bs->mark_stable(ov);
|
||||
bs->mark_stable(ov, true);
|
||||
}
|
||||
// Ignore delete if neither preceding dirty entries nor the clean one are present
|
||||
}
|
||||
started = true;
|
||||
pos += je->size;
|
||||
|
@ -704,3 +730,30 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
|||
bs->journal.next_free = next_free;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Erase from bs->dirty_db the entry at `dirty_it` together with all directly
// preceding entries that have the same oid, then drop that oid from the
// flusher's queue.
// NOTE(review): relies on entries of one oid being adjacent in dirty_db
// (ordered by oid, then version) - confirm against the container's comparator.
void blockstore_init_journal::erase_dirty_object(blockstore_dirty_db_t::iterator dirty_it)
|
||||
{
|
||||
auto oid = dirty_it->first.oid;
|
||||
// dirty_end is the exclusive end of the range: one past the entry passed in
auto dirty_end = dirty_it;
|
||||
dirty_end++;
|
||||
// Walk backwards until dirty_it points at the first entry with this oid
while (1)
|
||||
{
|
||||
if (dirty_it == bs->dirty_db.begin())
|
||||
{
|
||||
break;
|
||||
}
|
||||
dirty_it--;
|
||||
if (dirty_it->first.oid != oid)
|
||||
{
|
||||
// Stepped onto a different object: move forward again and stop
dirty_it++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Pass the location of the clean entry, if one exists, to erase_dirty();
// UINT64_MAX is passed when there is no clean entry for this oid
auto clean_it = bs->clean_db.find(oid);
|
||||
uint64_t clean_loc = clean_it != bs->clean_db.end()
|
||||
? clean_it->second.location : UINT64_MAX;
|
||||
bs->erase_dirty(dirty_it, dirty_end, clean_loc);
|
||||
// Remove it from the flusher's queue, too
|
||||
// Otherwise it may end up referring to a small unstable write after reading the rest of the journal
|
||||
bs->flusher->remove_flush(oid);
|
||||
}
|
||||
|
|
|
@ -36,6 +36,7 @@ class blockstore_init_journal
|
|||
bool started = false;
|
||||
uint64_t next_free;
|
||||
std::vector<bs_init_journal_done> done;
|
||||
std::vector<obj_ver_id> double_allocs;
|
||||
uint64_t journal_pos = 0;
|
||||
uint64_t continue_pos = 0;
|
||||
void *init_write_buf = NULL;
|
||||
|
@ -48,6 +49,7 @@ class blockstore_init_journal
|
|||
std::function<void(ring_data_t*)> simple_callback;
|
||||
int handle_journal_part(void *buf, uint64_t done_pos, uint64_t len);
|
||||
void handle_event(ring_data_t *data);
|
||||
void erase_dirty_object(blockstore_dirty_db_t::iterator dirty_it);
|
||||
public:
|
||||
blockstore_init_journal(blockstore_impl_t* bs);
|
||||
int loop();
|
||||
|
|
|
@ -69,7 +69,10 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
|
|||
journal_block_size = strtoull(config["journal_block_size"].c_str(), NULL, 10);
|
||||
meta_block_size = strtoull(config["meta_block_size"].c_str(), NULL, 10);
|
||||
bitmap_granularity = strtoull(config["bitmap_granularity"].c_str(), NULL, 10);
|
||||
flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
|
||||
max_flusher_count = strtoull(config["max_flusher_count"].c_str(), NULL, 10);
|
||||
if (!max_flusher_count)
|
||||
max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
|
||||
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
|
||||
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
|
||||
// Validate
|
||||
if (!block_size)
|
||||
|
@ -80,9 +83,13 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
|
|||
{
|
||||
throw std::runtime_error("Bad block size");
|
||||
}
|
||||
if (!flusher_count)
|
||||
if (!max_flusher_count)
|
||||
{
|
||||
flusher_count = 32;
|
||||
max_flusher_count = 256;
|
||||
}
|
||||
if (!min_flusher_count)
|
||||
{
|
||||
min_flusher_count = 1;
|
||||
}
|
||||
if (!max_write_iodepth)
|
||||
{
|
||||
|
|
|
@ -163,10 +163,7 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
|
|||
auto rm_start = it;
|
||||
auto rm_end = it;
|
||||
it--;
|
||||
while (it->first.oid == ov.oid &&
|
||||
it->first.version > ov.version &&
|
||||
!IS_IN_FLIGHT(it->second.state) &&
|
||||
!IS_STABLE(it->second.state))
|
||||
while (1)
|
||||
{
|
||||
if (it->first.oid != ov.oid)
|
||||
break;
|
||||
|
@ -176,7 +173,7 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
|
|||
max_unstable = it->first.version;
|
||||
break;
|
||||
}
|
||||
else if (IS_STABLE(it->second.state))
|
||||
else if (IS_IN_FLIGHT(it->second.state) || IS_STABLE(it->second.state))
|
||||
break;
|
||||
// Remove entry
|
||||
rm_start = it;
|
||||
|
@ -187,7 +184,6 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
|
|||
if (rm_start != rm_end)
|
||||
{
|
||||
erase_dirty(rm_start, rm_end, UINT64_MAX);
|
||||
}
|
||||
auto unstab_it = unstable_writes.find(ov.oid);
|
||||
if (unstab_it != unstable_writes.end())
|
||||
{
|
||||
|
@ -197,6 +193,7 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
|
|||
unstab_it->second = max_unstable;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t *op)
|
||||
|
@ -251,10 +248,12 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
|
|||
}
|
||||
while (1)
|
||||
{
|
||||
if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc)
|
||||
if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc &&
|
||||
dirty_it->second.location != UINT64_MAX)
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Free block %lu\n", dirty_it->second.location >> block_order);
|
||||
printf("Free block %lu from %lx:%lx v%lu\n", dirty_it->second.location >> block_order,
|
||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
|
||||
#endif
|
||||
data_alloc->set(dirty_it->second.location >> block_order, false);
|
||||
}
|
||||
|
|
|
@ -168,6 +168,9 @@ resume_5:
|
|||
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
||||
{
|
||||
// Mark all dirty_db entries up to op->version as stable
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Stabilize %lx:%lx v%lu\n", v->oid.inode, v->oid.stripe, v->version);
|
||||
#endif
|
||||
mark_stable(*v);
|
||||
}
|
||||
// Acknowledge op
|
||||
|
@ -176,22 +179,39 @@ resume_5:
|
|||
return 2;
|
||||
}
|
||||
|
||||
void blockstore_impl_t::mark_stable(const obj_ver_id & v)
|
||||
void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
|
||||
{
|
||||
auto dirty_it = dirty_db.find(v);
|
||||
if (dirty_it != dirty_db.end())
|
||||
{
|
||||
while (1)
|
||||
{
|
||||
bool was_stable = IS_STABLE(dirty_it->second.state);
|
||||
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_SYNCED)
|
||||
{
|
||||
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_STABLE;
|
||||
}
|
||||
else if (IS_STABLE(dirty_it->second.state))
|
||||
if (forget_dirty && (IS_BIG_WRITE(dirty_it->second.state) ||
|
||||
IS_DELETE(dirty_it->second.state)))
|
||||
{
|
||||
// Big write overrides all previous dirty entries
|
||||
auto erase_end = dirty_it;
|
||||
while (dirty_it != dirty_db.begin())
|
||||
{
|
||||
dirty_it--;
|
||||
if (dirty_it->first.oid != v.oid)
|
||||
{
|
||||
dirty_it++;
|
||||
break;
|
||||
}
|
||||
if (dirty_it == dirty_db.begin())
|
||||
}
|
||||
auto clean_it = clean_db.find(v.oid);
|
||||
uint64_t clean_loc = clean_it != clean_db.end()
|
||||
? clean_it->second.location : UINT64_MAX;
|
||||
erase_dirty(dirty_it, erase_end, clean_loc);
|
||||
break;
|
||||
}
|
||||
if (was_stable || dirty_it == dirty_db.begin())
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -24,6 +24,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
|
|||
if (PRIV(op)->op_state == 0)
|
||||
{
|
||||
stop_sync_submitted = false;
|
||||
unsynced_big_write_count -= unsynced_big_writes.size();
|
||||
PRIV(op)->sync_big_writes.swap(unsynced_big_writes);
|
||||
PRIV(op)->sync_small_writes.swap(unsynced_small_writes);
|
||||
PRIV(op)->sync_small_checked = 0;
|
||||
|
|
|
@ -201,7 +201,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
|||
if ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
|
||||
{
|
||||
blockstore_journal_check_t space_check(this);
|
||||
if (!space_check.check_available(op, unsynced_big_writes.size() + 1, sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION))
|
||||
if (!space_check.check_available(op, unsynced_big_write_count + 1, sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
@ -224,7 +224,10 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
|||
dirty_it->second.location = loc << block_order;
|
||||
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Allocate block %lu\n", loc);
|
||||
printf(
|
||||
"Allocate block %lu for %lx:%lx v%lu\n",
|
||||
loc, op->oid.inode, op->oid.stripe, op->version
|
||||
);
|
||||
#endif
|
||||
data_alloc->set(loc, true);
|
||||
uint64_t stripe_offset = (op->offset % bitmap_granularity);
|
||||
|
@ -250,11 +253,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
|||
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
|
||||
if (immediate_commit != IMMEDIATE_ALL)
|
||||
{
|
||||
// Remember big write as unsynced
|
||||
unsynced_big_writes.push_back((obj_ver_id){
|
||||
.oid = op->oid,
|
||||
.version = op->version,
|
||||
});
|
||||
// Increase the counter, but don't save into unsynced_writes yet (can't sync until the write is finished)
|
||||
unsynced_big_write_count++;
|
||||
PRIV(op)->op_state = 3;
|
||||
}
|
||||
else
|
||||
|
@ -267,7 +267,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
|||
// Small (journaled) write
|
||||
// First check if the journal has sufficient space
|
||||
blockstore_journal_check_t space_check(this);
|
||||
if (unsynced_big_writes.size() && !space_check.check_available(op, unsynced_big_writes.size(), sizeof(journal_entry_big_write), 0)
|
||||
if (unsynced_big_write_count && !space_check.check_available(op, unsynced_big_write_count, sizeof(journal_entry_big_write), 0)
|
||||
|| !space_check.check_available(op, 1, sizeof(journal_entry_small_write), op->len + JOURNAL_STABILIZE_RESERVATION))
|
||||
{
|
||||
return 0;
|
||||
|
@ -359,14 +359,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
|||
{
|
||||
journal.next_free = journal_block_size;
|
||||
}
|
||||
if (immediate_commit == IMMEDIATE_NONE)
|
||||
{
|
||||
// Remember small write as unsynced
|
||||
unsynced_small_writes.push_back((obj_ver_id){
|
||||
.oid = op->oid,
|
||||
.version = op->version,
|
||||
});
|
||||
}
|
||||
if (!PRIV(op)->pending_ops)
|
||||
{
|
||||
PRIV(op)->op_state = 4;
|
||||
|
@ -431,7 +423,7 @@ resume_2:
|
|||
resume_4:
|
||||
// Switch object state
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Ack write %lx:%lx v%lu = state %x\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
|
||||
printf("Ack write %lx:%lx v%lu = state 0x%x\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
|
||||
#endif
|
||||
bool imm = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
|
||||
? (immediate_commit == IMMEDIATE_ALL)
|
||||
|
@ -445,11 +437,31 @@ resume_4:
|
|||
| (imm ? BS_ST_SYNCED : BS_ST_WRITTEN);
|
||||
if (imm && ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT)))
|
||||
{
|
||||
// Deletions are treated as immediately stable
|
||||
// Deletions and 'instant' operations are treated as immediately stable
|
||||
mark_stable(dirty_it->first);
|
||||
}
|
||||
if (immediate_commit == IMMEDIATE_ALL)
|
||||
if (!imm)
|
||||
{
|
||||
if ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
|
||||
{
|
||||
// Remember big write as unsynced
|
||||
unsynced_big_writes.push_back((obj_ver_id){
|
||||
.oid = op->oid,
|
||||
.version = op->version,
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
// Remember small write as unsynced
|
||||
unsynced_small_writes.push_back((obj_ver_id){
|
||||
.oid = op->oid,
|
||||
.version = op->version,
|
||||
});
|
||||
}
|
||||
}
|
||||
if (imm && (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
|
||||
{
|
||||
// Unblock small writes
|
||||
dirty_it++;
|
||||
while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid)
|
||||
{
|
||||
|
@ -583,14 +595,6 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
|
|||
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
|
||||
PRIV(op)->pending_ops++;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Remember delete as unsynced
|
||||
unsynced_small_writes.push_back((obj_ver_id){
|
||||
.oid = op->oid,
|
||||
.version = op->version,
|
||||
});
|
||||
}
|
||||
if (!PRIV(op)->pending_ops)
|
||||
{
|
||||
PRIV(op)->op_state = 4;
|
||||
|
|
|
@ -2,8 +2,17 @@
|
|||
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||
|
||||
#include <stdexcept>
|
||||
#include <assert.h>
|
||||
#include "cluster_client.h"
|
||||
|
||||
#define PART_SENT 1
|
||||
#define PART_DONE 2
|
||||
#define PART_ERROR 4
|
||||
#define CACHE_DIRTY 1
|
||||
#define CACHE_FLUSHING 2
|
||||
#define CACHE_REPEATING 3
|
||||
#define OP_FLUSH_BUFFER 2
|
||||
|
||||
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
|
||||
{
|
||||
this->ringloop = ringloop;
|
||||
|
@ -20,39 +29,17 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
|||
// peer_osd just connected
|
||||
continue_ops();
|
||||
}
|
||||
else if (unsynced_writes.size())
|
||||
else if (dirty_buffers.size())
|
||||
{
|
||||
// peer_osd just dropped connection
|
||||
for (auto op: syncing_writes)
|
||||
// determine WHICH dirty_buffers are now obsolete and repeat them
|
||||
for (auto & wr: dirty_buffers)
|
||||
{
|
||||
for (auto & part: op->parts)
|
||||
if (affects_osd(wr.first.inode, wr.first.stripe, wr.second.len, peer_osd) &&
|
||||
wr.second.state != CACHE_REPEATING)
|
||||
{
|
||||
if (part.osd_num == peer_osd && part.done)
|
||||
{
|
||||
// repeat this operation
|
||||
part.osd_num = 0;
|
||||
part.done = false;
|
||||
assert(!part.sent);
|
||||
op->done_count--;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto op: unsynced_writes)
|
||||
{
|
||||
for (auto & part: op->parts)
|
||||
{
|
||||
if (part.osd_num == peer_osd && part.done)
|
||||
{
|
||||
// repeat this operation
|
||||
part.osd_num = 0;
|
||||
part.done = false;
|
||||
assert(!part.sent);
|
||||
op->done_count--;
|
||||
}
|
||||
}
|
||||
if (op->done_count < op->parts.size())
|
||||
{
|
||||
cur_ops.insert(op);
|
||||
// FIXME: Flush in larger parts
|
||||
flush_buffer(wr.first, &wr.second);
|
||||
}
|
||||
}
|
||||
continue_ops();
|
||||
|
@ -90,37 +77,87 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
|||
|
||||
// Destructor: frees the data of all cached write buffers and detaches
// the client from the ring loop (when one was given).
cluster_client_t::~cluster_client_t()
{
    // Each dirty_buffers entry owns a separately allocated data buffer
    for (auto buf_it = dirty_buffers.begin(); buf_it != dirty_buffers.end(); buf_it++)
    {
        free(buf_it->second.buf);
    }
    dirty_buffers.clear();
    if (!ringloop)
    {
        return;
    }
    ringloop->unregister_consumer(&consumer);
}
|
||||
|
||||
void cluster_client_t::stop()
|
||||
{
|
||||
while (msgr.clients.size() > 0)
|
||||
{
|
||||
msgr.stop_client(msgr.clients.begin()->first);
|
||||
}
|
||||
}
|
||||
|
||||
void cluster_client_t::continue_ops(bool up_retry)
|
||||
{
|
||||
for (auto op_it = cur_ops.begin(); op_it != cur_ops.end(); )
|
||||
if (!pgs_loaded)
|
||||
{
|
||||
if ((*op_it)->up_wait)
|
||||
// We're offline
|
||||
return;
|
||||
}
|
||||
if (continuing_ops)
|
||||
{
|
||||
if (up_retry)
|
||||
// Attempt to reenter the function
|
||||
continuing_ops = 2;
|
||||
return;
|
||||
}
|
||||
restart:
|
||||
continuing_ops = 1;
|
||||
op_queue_pos = 0;
|
||||
bool has_flushes = false, has_writes = false;
|
||||
while (op_queue_pos < op_queue.size())
|
||||
{
|
||||
(*op_it)->up_wait = false;
|
||||
continue_rw(*op_it++);
|
||||
auto op = op_queue[op_queue_pos];
|
||||
bool rm = false, is_flush = op->flags & OP_FLUSH_BUFFER;
|
||||
auto opcode = op->opcode;
|
||||
if (!op->up_wait || up_retry)
|
||||
{
|
||||
op->up_wait = false;
|
||||
if (opcode == OSD_OP_READ || opcode == OSD_OP_WRITE)
|
||||
{
|
||||
if (is_flush || !has_flushes)
|
||||
{
|
||||
// Regular writes can't proceed before buffer flushes
|
||||
rm = continue_rw(op);
|
||||
}
|
||||
}
|
||||
else if (opcode == OSD_OP_SYNC)
|
||||
{
|
||||
if (!has_writes)
|
||||
{
|
||||
// SYNC can't proceed before previous writes
|
||||
rm = continue_sync(op);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (opcode == OSD_OP_WRITE)
|
||||
{
|
||||
has_writes = has_writes || !rm;
|
||||
if (is_flush)
|
||||
{
|
||||
has_flushes = has_writes || !rm;
|
||||
}
|
||||
}
|
||||
else if (opcode == OSD_OP_SYNC)
|
||||
{
|
||||
// Postpone writes until previous SYNC completes
|
||||
// ...so dirty_writes can't contain anything newer than SYNC
|
||||
has_flushes = has_writes || !rm;
|
||||
}
|
||||
if (rm)
|
||||
{
|
||||
op_queue.erase(op_queue.begin()+op_queue_pos, op_queue.begin()+op_queue_pos+1);
|
||||
}
|
||||
else
|
||||
op_it++;
|
||||
{
|
||||
op_queue_pos++;
|
||||
}
|
||||
else
|
||||
continue_rw(*op_it++);
|
||||
if (continuing_ops == 2)
|
||||
{
|
||||
goto restart;
|
||||
}
|
||||
}
|
||||
continuing_ops = 0;
|
||||
}
|
||||
|
||||
static uint32_t is_power_of_two(uint64_t value)
|
||||
|
@ -141,16 +178,11 @@ static uint32_t is_power_of_two(uint64_t value)
|
|||
void cluster_client_t::on_load_config_hook(json11::Json::object & config)
|
||||
{
|
||||
bs_block_size = config["block_size"].uint64_value();
|
||||
bs_disk_alignment = config["disk_alignment"].uint64_value();
|
||||
bs_bitmap_granularity = config["bitmap_granularity"].uint64_value();
|
||||
if (!bs_block_size)
|
||||
{
|
||||
bs_block_size = DEFAULT_BLOCK_SIZE;
|
||||
}
|
||||
if (!bs_disk_alignment)
|
||||
{
|
||||
bs_disk_alignment = DEFAULT_DISK_ALIGNMENT;
|
||||
}
|
||||
if (!bs_bitmap_granularity)
|
||||
{
|
||||
bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
|
||||
|
@ -165,13 +197,26 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & config)
|
|||
// Cluster-wide immediate_commit mode
|
||||
immediate_commit = true;
|
||||
}
|
||||
if (config.find("client_max_dirty_bytes") != config.end())
|
||||
{
|
||||
client_max_dirty_bytes = config["client_max_dirty_bytes"].uint64_value();
|
||||
}
|
||||
else if (config.find("client_dirty_limit") != config.end())
|
||||
{
|
||||
client_dirty_limit = config["client_dirty_limit"].uint64_value();
|
||||
// Old name
|
||||
client_max_dirty_bytes = config["client_dirty_limit"].uint64_value();
|
||||
}
|
||||
if (!client_dirty_limit)
|
||||
if (config.find("client_max_dirty_ops") != config.end())
|
||||
{
|
||||
client_dirty_limit = DEFAULT_CLIENT_DIRTY_LIMIT;
|
||||
client_max_dirty_ops = config["client_max_dirty_ops"].uint64_value();
|
||||
}
|
||||
if (!client_max_dirty_bytes)
|
||||
{
|
||||
client_max_dirty_bytes = DEFAULT_CLIENT_MAX_DIRTY_BYTES;
|
||||
}
|
||||
if (!client_max_dirty_ops)
|
||||
{
|
||||
client_max_dirty_ops = DEFAULT_CLIENT_MAX_DIRTY_OPS;
|
||||
}
|
||||
up_wait_retry_interval = config["up_wait_retry_interval"].uint64_value();
|
||||
if (!up_wait_retry_interval)
|
||||
|
@ -215,23 +260,10 @@ void cluster_client_t::on_change_hook(json11::Json::object & changes)
|
|||
{
|
||||
// At this point, all pool operations should have been suspended
|
||||
// And now they have to be resliced!
|
||||
for (auto op: cur_ops)
|
||||
for (auto op: op_queue)
|
||||
{
|
||||
if (INODE_POOL(op->inode) == pool_item.first)
|
||||
{
|
||||
op->needs_reslice = true;
|
||||
}
|
||||
}
|
||||
for (auto op: unsynced_writes)
|
||||
{
|
||||
if (INODE_POOL(op->inode) == pool_item.first)
|
||||
{
|
||||
op->needs_reslice = true;
|
||||
}
|
||||
}
|
||||
for (auto op: syncing_writes)
|
||||
{
|
||||
if (INODE_POOL(op->inode) == pool_item.first)
|
||||
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ) &&
|
||||
INODE_POOL(op->inode) == pool_item.first)
|
||||
{
|
||||
op->needs_reslice = true;
|
||||
}
|
||||
|
@ -250,6 +282,11 @@ void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd)
|
|||
}
|
||||
}
|
||||
|
||||
// Reports whether the client is ready to process operations: returns the
// current value of pgs_loaded (other code here treats !pgs_loaded as "offline").
bool cluster_client_t::is_ready()
|
||||
{
|
||||
return pgs_loaded;
|
||||
}
|
||||
|
||||
void cluster_client_t::on_ready(std::function<void(void)> fn)
|
||||
{
|
||||
if (pgs_loaded)
|
||||
|
@ -265,21 +302,15 @@ void cluster_client_t::on_ready(std::function<void(void)> fn)
|
|||
/**
|
||||
* How writes are synced when immediate_commit is false
|
||||
*
|
||||
* 1) accept up to <client_dirty_limit> write operations for execution,
|
||||
* queue all subsequent writes into <next_writes>
|
||||
* 2) accept exactly one SYNC, queue all subsequent SYNCs into <next_writes>, too
|
||||
* 3) "continue" all accepted writes
|
||||
*
|
||||
* "Continue" WRITE:
|
||||
* 1) if the operation is not a copy yet - copy it (required for replay)
|
||||
* 2) if the operation is not sliced yet - slice it
|
||||
* 3) if the operation doesn't require reslice - try to connect & send all remaining parts
|
||||
* 4) if any of them fail due to disconnected peers or PGs not up, repeat after reconnecting or small timeout
|
||||
* 5) if any of them fail due to other errors, fail the operation and forget it from the current "unsynced batch"
|
||||
* 6) if PG count changes before all parts are done, wait for all in-progress parts to finish,
|
||||
* 1) if the operation is not sliced yet - slice it
|
||||
* 2) if the operation doesn't require reslice - try to connect & send all remaining parts
|
||||
* 3) if any of them fail due to disconnected peers or PGs not up, repeat after reconnecting or small timeout
|
||||
* 4) if any of them fail due to other errors, fail the operation and forget it from the current "unsynced batch"
|
||||
* 5) if PG count changes before all parts are done, wait for all in-progress parts to finish,
|
||||
* throw all results away, reslice and resubmit op
|
||||
* 7) when all parts are done, try to "continue" the current SYNC
|
||||
* 8) if the operation succeeds, but then some OSDs drop their connections, repeat
|
||||
* 6) when all parts are done, try to "continue" the current SYNC
|
||||
* 7) if the operation succeeds, but then some OSDs drop their connections, repeat
|
||||
* parts from the current "unsynced batch" previously sent to those OSDs in any order
|
||||
*
|
||||
* "Continue" current SYNC:
|
||||
|
@ -289,181 +320,277 @@ void cluster_client_t::on_ready(std::function<void(void)> fn)
|
|||
* 4) if any of them fail due to disconnected peers, repeat SYNC after repeating all writes
|
||||
* 5) if any of them fail due to other errors, fail the SYNC operation
|
||||
*/
|
||||
|
||||
void cluster_client_t::execute(cluster_op_t *op)
|
||||
{
|
||||
if (!pgs_loaded)
|
||||
{
|
||||
// We're offline
|
||||
offline_ops.push_back(op);
|
||||
return;
|
||||
}
|
||||
op->retval = 0;
|
||||
if (op->opcode != OSD_OP_SYNC && op->opcode != OSD_OP_READ && op->opcode != OSD_OP_WRITE ||
|
||||
(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_WRITE) && (!op->inode || !op->len ||
|
||||
op->offset % bs_disk_alignment || op->len % bs_disk_alignment))
|
||||
if (op->opcode != OSD_OP_SYNC && op->opcode != OSD_OP_READ && op->opcode != OSD_OP_WRITE)
|
||||
{
|
||||
op->retval = -EINVAL;
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
return;
|
||||
}
|
||||
if (op->opcode == OSD_OP_SYNC)
|
||||
{
|
||||
execute_sync(op);
|
||||
return;
|
||||
}
|
||||
op->retval = 0;
|
||||
if (op->opcode == OSD_OP_WRITE && !immediate_commit)
|
||||
{
|
||||
if (next_writes.size() > 0)
|
||||
{
|
||||
assert(cur_sync);
|
||||
next_writes.push_back(op);
|
||||
return;
|
||||
}
|
||||
if (queued_bytes >= client_dirty_limit)
|
||||
if (dirty_bytes >= client_max_dirty_bytes || dirty_ops >= client_max_dirty_ops)
|
||||
{
|
||||
// Push an extra SYNC operation to flush previous writes
|
||||
next_writes.push_back(op);
|
||||
cluster_op_t *sync_op = new cluster_op_t;
|
||||
sync_op->is_internal = true;
|
||||
sync_op->opcode = OSD_OP_SYNC;
|
||||
sync_op->callback = [](cluster_op_t* sync_op) {};
|
||||
execute_sync(sync_op);
|
||||
return;
|
||||
sync_op->callback = [](cluster_op_t* sync_op)
|
||||
{
|
||||
delete sync_op;
|
||||
};
|
||||
op_queue.push_back(sync_op);
|
||||
dirty_bytes = 0;
|
||||
dirty_ops = 0;
|
||||
}
|
||||
queued_bytes += op->len;
|
||||
dirty_bytes += op->len;
|
||||
dirty_ops++;
|
||||
}
|
||||
cur_ops.insert(op);
|
||||
continue_rw(op);
|
||||
else if (op->opcode == OSD_OP_SYNC)
|
||||
{
|
||||
dirty_bytes = 0;
|
||||
dirty_ops = 0;
|
||||
}
|
||||
op_queue.push_back(op);
|
||||
continue_ops();
|
||||
}
|
||||
|
||||
void cluster_client_t::continue_rw(cluster_op_t *op)
|
||||
// Save the data of write <op> into <dirty_buffers> so it can be replayed when
// one of PGs goes out of sync (primary OSD drops our connection in this case).
//
// The byte range [op->offset, op->offset + op->len) of op->inode is copied from
// op->iov. Parts of the range already covered by existing buffers of the same
// inode are overwritten in place; gaps get newly allocated buffers.
// Every touched buffer is marked CACHE_DIRTY.
//
// op:            write operation; inode, offset, len and iov are read
// dirty_buffers: buffer cache keyed by (inode, stripe = start offset)
void cluster_client_t::copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers)
{
    auto dirty_it = dirty_buffers.lower_bound((object_id){
        .inode = op->inode,
        .stripe = op->offset,
    });
    // Step back over preceding buffers of the same inode which still overlap op->offset
    while (dirty_it != dirty_buffers.begin())
    {
        dirty_it--;
        if (dirty_it->first.inode != op->inode ||
            (dirty_it->first.stripe + dirty_it->second.len) <= op->offset)
        {
            dirty_it++;
            break;
        }
    }
    uint64_t pos = op->offset, len = op->len, iov_idx = 0, iov_pos = 0;
    // Also stop when the iovec list runs out: an operation whose iovecs are
    // shorter than op->len would otherwise keep this loop spinning forever
    while (len > 0 && iov_idx < op->iov.count)
    {
        uint64_t new_len = 0;
        if (dirty_it == dirty_buffers.end() || dirty_it->first.inode != op->inode)
        {
            // No more buffers of this inode ahead - the whole remainder is a gap.
            // A buffer of ANOTHER inode must never be used as the gap boundary:
            // its stripe is unrelated to <pos>, and when it happened to be equal
            // to <pos> the data was copied into the other inode's buffer
            new_len = len;
        }
        else if (dirty_it->first.stripe > pos)
        {
            // Gap before the next buffer of the same inode
            new_len = dirty_it->first.stripe - pos;
            if (new_len > len)
            {
                new_len = len;
            }
        }
        if (new_len > 0)
        {
            dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
                .inode = op->inode,
                .stripe = pos,
            }, (cluster_buffer_t){
                .buf = malloc_or_die(new_len),
                .len = new_len,
            });
        }
        // FIXME: Split big buffers into smaller ones on overwrites. But this will require refcounting
        dirty_it->second.state = CACHE_DIRTY;
        // Number of bytes of the write which fall into the current buffer
        uint64_t cur_len = (dirty_it->first.stripe + dirty_it->second.len - pos);
        if (cur_len > len)
        {
            cur_len = len;
        }
        while (cur_len > 0 && iov_idx < op->iov.count)
        {
            // Remaining bytes of the current iovec; 64-bit to avoid truncation
            uint64_t iov_len = (op->iov.buf[iov_idx].iov_len - iov_pos);
            uint64_t copy_len = (iov_len <= cur_len ? iov_len : cur_len);
            // Cast to byte pointers: arithmetic on void* is a compiler extension
            memcpy((uint8_t*)dirty_it->second.buf + (pos - dirty_it->first.stripe),
                (uint8_t*)op->iov.buf[iov_idx].iov_base + iov_pos, copy_len);
            pos += copy_len;
            len -= copy_len;
            cur_len -= copy_len;
            if (copy_len == iov_len)
            {
                // Current iovec is fully consumed, switch to the next one
                iov_pos = 0;
                iov_idx++;
            }
            else
            {
                // Buffer is full, continue from the middle of the same iovec
                iov_pos += copy_len;
            }
        }
        dirty_it++;
    }
}
|
||||
|
||||
void cluster_client_t::flush_buffer(const object_id & oid, cluster_buffer_t *wr)
|
||||
{
|
||||
wr->state = CACHE_REPEATING;
|
||||
cluster_op_t *op = new cluster_op_t;
|
||||
op->flags = OP_FLUSH_BUFFER;
|
||||
op->opcode = OSD_OP_WRITE;
|
||||
op->inode = oid.inode;
|
||||
op->offset = oid.stripe;
|
||||
op->len = wr->len;
|
||||
op->iov.push_back(wr->buf, wr->len);
|
||||
op->callback = [wr](cluster_op_t* op)
|
||||
{
|
||||
if (wr->state == CACHE_REPEATING)
|
||||
{
|
||||
wr->state = CACHE_DIRTY;
|
||||
}
|
||||
delete op;
|
||||
};
|
||||
op_queue.insert(op_queue.begin(), op);
|
||||
if (continuing_ops)
|
||||
{
|
||||
continuing_ops = 2;
|
||||
op_queue_pos++;
|
||||
}
|
||||
}
|
||||
|
||||
int cluster_client_t::continue_rw(cluster_op_t *op)
|
||||
{
|
||||
if (op->state == 0)
|
||||
goto resume_0;
|
||||
else if (op->state == 1)
|
||||
goto resume_1;
|
||||
else if (op->state == 2)
|
||||
goto resume_2;
|
||||
else if (op->state == 3)
|
||||
goto resume_3;
|
||||
resume_0:
|
||||
if (!op->len || op->offset % bs_bitmap_granularity || op->len % bs_bitmap_granularity)
|
||||
{
|
||||
op->retval = -EINVAL;
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
return 1;
|
||||
}
|
||||
{
|
||||
pool_id_t pool_id = INODE_POOL(op->inode);
|
||||
if (!pool_id)
|
||||
{
|
||||
op->retval = -EINVAL;
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
return;
|
||||
return 1;
|
||||
}
|
||||
if (st_cli.pool_config.find(pool_id) == st_cli.pool_config.end() ||
|
||||
st_cli.pool_config[pool_id].real_pg_count == 0)
|
||||
{
|
||||
// Postpone operations to unknown pools
|
||||
return;
|
||||
return 0;
|
||||
}
|
||||
if (op->opcode == OSD_OP_WRITE && !immediate_commit && !op->is_internal)
|
||||
{
|
||||
// Save operation for replay when PG goes out of sync
|
||||
// (primary OSD drops our connection in this case)
|
||||
cluster_op_t *op_copy = new cluster_op_t();
|
||||
op_copy->is_internal = true;
|
||||
op_copy->orig_op = op;
|
||||
op_copy->opcode = op->opcode;
|
||||
op_copy->inode = op->inode;
|
||||
op_copy->offset = op->offset;
|
||||
op_copy->len = op->len;
|
||||
op_copy->buf = malloc_or_die(op->len);
|
||||
op_copy->iov.push_back(op_copy->buf, op->len);
|
||||
op_copy->callback = [](cluster_op_t* op_copy)
|
||||
{
|
||||
if (op_copy->orig_op)
|
||||
{
|
||||
// Acknowledge write and forget the original pointer
|
||||
op_copy->orig_op->retval = op_copy->retval;
|
||||
std::function<void(cluster_op_t*)>(op_copy->orig_op->callback)(op_copy->orig_op);
|
||||
op_copy->orig_op = NULL;
|
||||
}
|
||||
};
|
||||
void *cur_buf = op_copy->buf;
|
||||
for (int i = 0; i < op->iov.count; i++)
|
||||
if (op->opcode == OSD_OP_WRITE)
|
||||
{
|
||||
memcpy(cur_buf, op->iov.buf[i].iov_base, op->iov.buf[i].iov_len);
|
||||
cur_buf += op->iov.buf[i].iov_len;
|
||||
}
|
||||
unsynced_writes.push_back(op_copy);
|
||||
cur_ops.erase(op);
|
||||
cur_ops.insert(op_copy);
|
||||
op = op_copy;
|
||||
}
|
||||
if (!op->parts.size())
|
||||
if (!immediate_commit && !(op->flags & OP_FLUSH_BUFFER))
|
||||
{
|
||||
copy_write(op, dirty_buffers);
|
||||
}
|
||||
}
|
||||
resume_1:
|
||||
// Slice the operation into parts
|
||||
slice_rw(op);
|
||||
}
|
||||
if (!op->needs_reslice)
|
||||
{
|
||||
op->needs_reslice = false;
|
||||
resume_2:
|
||||
// Send unsent parts, if they're not subject to change
|
||||
for (auto & op_part: op->parts)
|
||||
op->state = 3;
|
||||
if (op->needs_reslice)
|
||||
{
|
||||
if (!op_part.sent && !op_part.done)
|
||||
for (int i = 0; i < op->parts.size(); i++)
|
||||
{
|
||||
try_send(op, &op_part);
|
||||
if (!(op->parts[i].flags & PART_SENT) && op->retval)
|
||||
{
|
||||
op->retval = -EPIPE;
|
||||
}
|
||||
}
|
||||
goto resume_3;
|
||||
}
|
||||
for (int i = 0; i < op->parts.size(); i++)
|
||||
{
|
||||
if (!(op->parts[i].flags & PART_SENT))
|
||||
{
|
||||
if (!try_send(op, i))
|
||||
{
|
||||
// We'll need to retry again
|
||||
op->up_wait = true;
|
||||
if (!retry_timeout_id)
|
||||
{
|
||||
retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
|
||||
{
|
||||
retry_timeout_id = 0;
|
||||
continue_ops(true);
|
||||
});
|
||||
}
|
||||
op->state = 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!op->sent_count)
|
||||
if (op->state == 2)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
resume_3:
|
||||
if (op->inflight_count > 0)
|
||||
{
|
||||
op->state = 3;
|
||||
return 0;
|
||||
}
|
||||
if (op->done_count >= op->parts.size())
|
||||
{
|
||||
// Finished successfully
|
||||
// Even if the PG count has changed in the meantime, we treat it as success
|
||||
// because if some operations were invalid for the new PG count we'd get errors
|
||||
cur_ops.erase(op);
|
||||
op->retval = op->len;
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
continue_sync();
|
||||
return;
|
||||
return 1;
|
||||
}
|
||||
else if (op->retval != 0 && op->retval != -EPIPE)
|
||||
{
|
||||
// Fatal error (not -EPIPE)
|
||||
cur_ops.erase(op);
|
||||
if (!immediate_commit && op->opcode == OSD_OP_WRITE)
|
||||
{
|
||||
for (int i = 0; i < unsynced_writes.size(); i++)
|
||||
{
|
||||
if (unsynced_writes[i] == op)
|
||||
{
|
||||
unsynced_writes.erase(unsynced_writes.begin()+i, unsynced_writes.begin()+i+1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
bool del = op->is_internal;
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
if (del)
|
||||
{
|
||||
if (op->buf)
|
||||
free(op->buf);
|
||||
delete op;
|
||||
}
|
||||
continue_sync();
|
||||
return;
|
||||
return 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
// -EPIPE or no error - clear the error
|
||||
// -EPIPE - clear the error and retry
|
||||
op->retval = 0;
|
||||
if (op->needs_reslice)
|
||||
{
|
||||
op->parts.clear();
|
||||
op->done_count = 0;
|
||||
op->needs_reslice = false;
|
||||
continue_rw(op);
|
||||
}
|
||||
goto resume_1;
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = 0; i < op->parts.size(); i++)
|
||||
{
|
||||
op->parts[i].flags = 0;
|
||||
}
|
||||
goto resume_2;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void cluster_client_t::slice_rw(cluster_op_t *op)
|
||||
{
|
||||
// Slice the request into individual object stripe requests
|
||||
// Primary OSDs still operate individual stripes, but their size is multiplied by PG minsize in case of EC
|
||||
auto & pool_cfg = st_cli.pool_config[INODE_POOL(op->inode)];
|
||||
uint64_t pg_block_size = bs_block_size * (
|
||||
pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
|
||||
);
|
||||
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->inode));
|
||||
uint32_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
|
||||
uint64_t pg_block_size = bs_block_size * pg_data_size;
|
||||
uint64_t first_stripe = (op->offset / pg_block_size) * pg_block_size;
|
||||
uint64_t last_stripe = ((op->offset + op->len + pg_block_size - 1) / pg_block_size - 1) * pg_block_size;
|
||||
op->retval = 0;
|
||||
|
@ -482,8 +609,7 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
|
|||
.offset = begin,
|
||||
.len = (uint32_t)(end - begin),
|
||||
.pg_num = pg_num,
|
||||
.sent = false,
|
||||
.done = false,
|
||||
.flags = 0,
|
||||
};
|
||||
int left = end-begin;
|
||||
while (left > 0 && iov_idx < op->iov.count)
|
||||
|
@ -507,8 +633,28 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
|
|||
}
|
||||
}
|
||||
|
||||
bool cluster_client_t::try_send(cluster_op_t *op, cluster_op_part_t *part)
|
||||
// Check whether the byte range [offset, offset+len) of <inode> touches at least one
// PG whose current primary is <osd>.
//
// The range is walked in steps of pg_block_size (bs_block_size multiplied by the number
// of data chunks for non-replicated pools) and every step is mapped to a PG number
// like map_to_pg() does. Steps that map to a PG absent from pg_config are ignored.
//
// Returns true as soon as one matching PG is found, false otherwise (including len == 0).
// Throws std::out_of_range (from std::map::at) if the pool of <inode> is not configured.
bool cluster_client_t::affects_osd(uint64_t inode, uint64_t offset, uint64_t len, osd_num_t osd)
{
    if (!len)
    {
        // An empty range touches no data. Without this guard offset == 0 && len == 0
        // makes the last_stripe computation below underflow (0 - 1 in uint64_t),
        // and the `stripe <= last_stripe` loop then never terminates.
        return false;
    }
    auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(inode));
    uint32_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
    uint64_t pg_block_size = bs_block_size * pg_data_size;
    // Start offsets of the first and the last block covered by the range
    uint64_t first_stripe = (offset / pg_block_size) * pg_block_size;
    uint64_t last_stripe = ((offset + len + pg_block_size - 1) / pg_block_size - 1) * pg_block_size;
    for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
    {
        pg_num_t pg_num = (stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1; // like map_to_pg()
        auto pg_it = pool_cfg.pg_config.find(pg_num);
        if (pg_it != pool_cfg.pg_config.end() && pg_it->second.cur_primary == osd)
        {
            return true;
        }
    }
    return false;
}
|
||||
|
||||
bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||
{
|
||||
auto part = &op->parts[i];
|
||||
auto & pool_cfg = st_cli.pool_config[INODE_POOL(op->inode)];
|
||||
auto pg_it = pool_cfg.pg_config.find(part->pg_num);
|
||||
if (pg_it != pool_cfg.pg_config.end() &&
|
||||
|
@ -520,8 +666,8 @@ bool cluster_client_t::try_send(cluster_op_t *op, cluster_op_part_t *part)
|
|||
{
|
||||
int peer_fd = peer_it->second;
|
||||
part->osd_num = primary_osd;
|
||||
part->sent = true;
|
||||
op->sent_count++;
|
||||
part->flags |= PART_SENT;
|
||||
op->inflight_count++;
|
||||
part->op = (osd_op_t){
|
||||
.op_type = OSD_OP_OUT,
|
||||
.peer_fd = peer_fd,
|
||||
|
@ -552,137 +698,99 @@ bool cluster_client_t::try_send(cluster_op_t *op, cluster_op_part_t *part)
|
|||
return false;
|
||||
}
|
||||
|
||||
void cluster_client_t::execute_sync(cluster_op_t *op)
|
||||
int cluster_client_t::continue_sync(cluster_op_t *op)
|
||||
{
|
||||
if (immediate_commit)
|
||||
if (op->state == 1)
|
||||
goto resume_1;
|
||||
if (immediate_commit || !dirty_osds.size())
|
||||
{
|
||||
// Syncs are not required in the immediate_commit mode
|
||||
// Sync is not required in the immediate_commit mode or if there are no dirty_osds
|
||||
op->retval = 0;
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
}
|
||||
else if (cur_sync != NULL)
|
||||
{
|
||||
next_writes.push_back(op);
|
||||
}
|
||||
else
|
||||
{
|
||||
cur_sync = op;
|
||||
continue_sync();
|
||||
}
|
||||
}
|
||||
|
||||
void cluster_client_t::continue_sync()
|
||||
{
|
||||
if (!cur_sync || cur_sync->parts.size() > 0)
|
||||
{
|
||||
// Already submitted
|
||||
return;
|
||||
}
|
||||
cur_sync->retval = 0;
|
||||
std::set<osd_num_t> sync_osds;
|
||||
for (auto prev_op: unsynced_writes)
|
||||
{
|
||||
if (prev_op->done_count < prev_op->parts.size())
|
||||
{
|
||||
// Writes not finished yet
|
||||
return;
|
||||
}
|
||||
for (auto & part: prev_op->parts)
|
||||
{
|
||||
if (part.osd_num)
|
||||
{
|
||||
sync_osds.insert(part.osd_num);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!sync_osds.size())
|
||||
{
|
||||
// No dirty writes
|
||||
finish_sync();
|
||||
return;
|
||||
return 1;
|
||||
}
|
||||
// Check that all OSD connections are still alive
|
||||
for (auto sync_osd: sync_osds)
|
||||
for (auto sync_osd: dirty_osds)
|
||||
{
|
||||
auto peer_it = msgr.osd_peer_fds.find(sync_osd);
|
||||
if (peer_it == msgr.osd_peer_fds.end())
|
||||
{
|
||||
// SYNC is pointless to send to a non connected OSD
|
||||
return;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
syncing_writes.swap(unsynced_writes);
|
||||
// Post sync to affected OSDs
|
||||
cur_sync->parts.resize(sync_osds.size());
|
||||
int i = 0;
|
||||
for (auto sync_osd: sync_osds)
|
||||
for (auto & prev_op: dirty_buffers)
|
||||
{
|
||||
cur_sync->parts[i] = {
|
||||
.parent = cur_sync,
|
||||
if (prev_op.second.state == CACHE_DIRTY)
|
||||
{
|
||||
prev_op.second.state = CACHE_FLUSHING;
|
||||
}
|
||||
}
|
||||
op->parts.resize(dirty_osds.size());
|
||||
op->retval = 0;
|
||||
{
|
||||
int i = 0;
|
||||
for (auto sync_osd: dirty_osds)
|
||||
{
|
||||
op->parts[i] = {
|
||||
.parent = op,
|
||||
.osd_num = sync_osd,
|
||||
.sent = false,
|
||||
.done = false,
|
||||
.flags = 0,
|
||||
};
|
||||
send_sync(cur_sync, &cur_sync->parts[i]);
|
||||
send_sync(op, &op->parts[i]);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
void cluster_client_t::finish_sync()
|
||||
{
|
||||
int retval = cur_sync->retval;
|
||||
if (retval != 0)
|
||||
}
|
||||
dirty_osds.clear();
|
||||
resume_1:
|
||||
if (op->inflight_count > 0)
|
||||
{
|
||||
for (auto op: syncing_writes)
|
||||
op->state = 1;
|
||||
return 0;
|
||||
}
|
||||
if (op->retval != 0)
|
||||
{
|
||||
if (op->done_count < op->parts.size())
|
||||
for (auto uw_it = dirty_buffers.begin(); uw_it != dirty_buffers.end(); uw_it++)
|
||||
{
|
||||
cur_ops.insert(op);
|
||||
if (uw_it->second.state == CACHE_FLUSHING)
|
||||
{
|
||||
uw_it->second.state = CACHE_DIRTY;
|
||||
}
|
||||
}
|
||||
unsynced_writes.insert(unsynced_writes.begin(), syncing_writes.begin(), syncing_writes.end());
|
||||
syncing_writes.clear();
|
||||
}
|
||||
if (retval == -EPIPE)
|
||||
if (op->retval == -EPIPE)
|
||||
{
|
||||
// Retry later
|
||||
cur_sync->parts.clear();
|
||||
cur_sync->retval = 0;
|
||||
cur_sync->sent_count = 0;
|
||||
cur_sync->done_count = 0;
|
||||
return;
|
||||
op->parts.clear();
|
||||
op->retval = 0;
|
||||
op->inflight_count = 0;
|
||||
op->done_count = 0;
|
||||
op->state = 0;
|
||||
return 0;
|
||||
}
|
||||
std::function<void(cluster_op_t*)>(cur_sync->callback)(cur_sync);
|
||||
if (!retval)
|
||||
}
|
||||
else
|
||||
{
|
||||
for (auto op: syncing_writes)
|
||||
for (auto uw_it = dirty_buffers.begin(); uw_it != dirty_buffers.end(); )
|
||||
{
|
||||
assert(op->sent_count == 0);
|
||||
if (op->is_internal)
|
||||
if (uw_it->second.state == CACHE_FLUSHING)
|
||||
{
|
||||
if (op->buf)
|
||||
free(op->buf);
|
||||
delete op;
|
||||
free(uw_it->second.buf);
|
||||
dirty_buffers.erase(uw_it++);
|
||||
}
|
||||
else
|
||||
uw_it++;
|
||||
}
|
||||
}
|
||||
syncing_writes.clear();
|
||||
}
|
||||
cur_sync = NULL;
|
||||
queued_bytes = 0;
|
||||
std::vector<cluster_op_t*> next_wr_copy;
|
||||
next_wr_copy.swap(next_writes);
|
||||
for (auto next_op: next_wr_copy)
|
||||
{
|
||||
execute(next_op);
|
||||
}
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
return 1;
|
||||
}
|
||||
|
||||
void cluster_client_t::send_sync(cluster_op_t *op, cluster_op_part_t *part)
|
||||
{
|
||||
auto peer_it = msgr.osd_peer_fds.find(part->osd_num);
|
||||
assert(peer_it != msgr.osd_peer_fds.end());
|
||||
part->sent = true;
|
||||
op->sent_count++;
|
||||
part->flags |= PART_SENT;
|
||||
op->inflight_count++;
|
||||
part->op = (osd_op_t){
|
||||
.op_type = OSD_OP_OUT,
|
||||
.peer_fd = peer_it->second,
|
||||
|
@ -704,19 +812,18 @@ void cluster_client_t::send_sync(cluster_op_t *op, cluster_op_part_t *part)
|
|||
void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
||||
{
|
||||
cluster_op_t *op = part->parent;
|
||||
part->sent = false;
|
||||
op->sent_count--;
|
||||
op->inflight_count--;
|
||||
int expected = part->op.req.hdr.opcode == OSD_OP_SYNC ? 0 : part->op.req.rw.len;
|
||||
if (part->op.reply.hdr.retval != expected)
|
||||
{
|
||||
// Operation failed, retry
|
||||
printf(
|
||||
"Operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
|
||||
part->osd_num, part->op.reply.hdr.retval, expected
|
||||
"%s operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
|
||||
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
|
||||
);
|
||||
msgr.stop_client(part->op.peer_fd);
|
||||
if (part->op.reply.hdr.retval == -EPIPE)
|
||||
{
|
||||
// Mark op->up_wait = true before stopping the client
|
||||
op->up_wait = true;
|
||||
if (!retry_timeout_id)
|
||||
{
|
||||
|
@ -732,23 +839,18 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
|||
// Don't overwrite other errors with -EPIPE
|
||||
op->retval = part->op.reply.hdr.retval;
|
||||
}
|
||||
msgr.stop_client(part->op.peer_fd);
|
||||
part->flags |= PART_ERROR;
|
||||
}
|
||||
else
|
||||
{
|
||||
// OK
|
||||
part->done = true;
|
||||
dirty_osds.insert(part->osd_num);
|
||||
part->flags |= PART_DONE;
|
||||
op->done_count++;
|
||||
}
|
||||
if (op->sent_count == 0)
|
||||
if (op->inflight_count == 0)
|
||||
{
|
||||
if (op->opcode == OSD_OP_SYNC)
|
||||
{
|
||||
assert(op == cur_sync);
|
||||
finish_sync();
|
||||
}
|
||||
else if (!op->up_wait)
|
||||
{
|
||||
continue_rw(op);
|
||||
}
|
||||
continue_ops();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -10,7 +10,8 @@
|
|||
#define MAX_BLOCK_SIZE 128*1024*1024
|
||||
#define DEFAULT_DISK_ALIGNMENT 4096
|
||||
#define DEFAULT_BITMAP_GRANULARITY 4096
|
||||
#define DEFAULT_CLIENT_DIRTY_LIMIT 32*1024*1024
|
||||
#define DEFAULT_CLIENT_MAX_DIRTY_BYTES 32*1024*1024
|
||||
#define DEFAULT_CLIENT_MAX_DIRTY_OPS 1024
|
||||
|
||||
struct cluster_op_t;
|
||||
|
||||
|
@ -22,8 +23,7 @@ struct cluster_op_part_t
|
|||
pg_num_t pg_num;
|
||||
osd_num_t osd_num;
|
||||
osd_op_buf_list_t iov;
|
||||
bool sent;
|
||||
bool done;
|
||||
unsigned flags;
|
||||
osd_op_t op;
|
||||
};
|
||||
|
||||
|
@ -37,47 +37,53 @@ struct cluster_op_t
|
|||
osd_op_buf_list_t iov;
|
||||
std::function<void(cluster_op_t*)> callback;
|
||||
protected:
|
||||
int flags = 0;
|
||||
int state = 0;
|
||||
void *buf = NULL;
|
||||
cluster_op_t *orig_op = NULL;
|
||||
bool is_internal = false;
|
||||
bool needs_reslice = false;
|
||||
bool up_wait = false;
|
||||
int sent_count = 0, done_count = 0;
|
||||
int inflight_count = 0, done_count = 0;
|
||||
std::vector<cluster_op_part_t> parts;
|
||||
friend class cluster_client_t;
|
||||
};
|
||||
|
||||
// Value type of cluster_client_t::dirty_buffers (a map keyed by object_id).
struct cluster_buffer_t
|
||||
{
|
||||
// Data pointer; the client releases it with free() when it drops a flushed entry,
// so it is presumably malloc-allocated - TODO confirm against copy_write()
void *buf;
|
||||
// Length of the data in buf - presumably in bytes, TODO confirm
uint64_t len;
|
||||
// Buffer state; CACHE_DIRTY and CACHE_FLUSHING are the values used by continue_sync()
int state;
|
||||
};
|
||||
|
||||
// FIXME: Split into public and private interfaces
|
||||
class cluster_client_t
|
||||
{
|
||||
timerfd_manager_t *tfd;
|
||||
ring_loop_t *ringloop;
|
||||
|
||||
uint64_t bs_block_size = 0;
|
||||
uint64_t bs_disk_alignment = 0;
|
||||
uint64_t bs_bitmap_granularity = 0;
|
||||
std::map<pool_id_t, uint64_t> pg_counts;
|
||||
bool immediate_commit = false;
|
||||
// FIXME: Implement inmemory_commit mode. Note that it requires to return overlapping reads from memory.
|
||||
uint64_t client_dirty_limit = 0;
|
||||
uint64_t client_max_dirty_bytes = 0;
|
||||
uint64_t client_max_dirty_ops = 0;
|
||||
int log_level;
|
||||
int up_wait_retry_interval = 500; // ms
|
||||
|
||||
uint64_t op_id = 1;
|
||||
ring_consumer_t consumer;
|
||||
// operations currently in progress
|
||||
std::set<cluster_op_t*> cur_ops;
|
||||
int retry_timeout_id = 0;
|
||||
// unsynced operations are copied in memory to allow replay when cluster isn't in the immediate_commit mode
|
||||
// unsynced_writes are replayed in any order (because only the SYNC operation guarantees ordering)
|
||||
std::vector<cluster_op_t*> unsynced_writes;
|
||||
std::vector<cluster_op_t*> syncing_writes;
|
||||
cluster_op_t* cur_sync = NULL;
|
||||
std::vector<cluster_op_t*> next_writes;
|
||||
uint64_t op_id = 1;
|
||||
std::vector<cluster_op_t*> offline_ops;
|
||||
uint64_t queued_bytes = 0;
|
||||
std::vector<cluster_op_t*> op_queue;
|
||||
std::map<object_id, cluster_buffer_t> dirty_buffers;
|
||||
std::set<osd_num_t> dirty_osds;
|
||||
uint64_t dirty_bytes = 0, dirty_ops = 0;
|
||||
|
||||
bool pgs_loaded = false;
|
||||
ring_consumer_t consumer;
|
||||
std::vector<std::function<void(void)>> on_ready_hooks;
|
||||
int continuing_ops = 0;
|
||||
int op_queue_pos = 0;
|
||||
|
||||
public:
|
||||
etcd_state_client_t st_cli;
|
||||
|
@ -87,21 +93,22 @@ public:
|
|||
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
|
||||
~cluster_client_t();
|
||||
void execute(cluster_op_t *op);
|
||||
bool is_ready();
|
||||
void on_ready(std::function<void(void)> fn);
|
||||
void stop();
|
||||
|
||||
protected:
|
||||
static void copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers);
|
||||
void continue_ops(bool up_retry = false);
|
||||
protected:
|
||||
bool affects_osd(uint64_t inode, uint64_t offset, uint64_t len, osd_num_t osd);
|
||||
void flush_buffer(const object_id & oid, cluster_buffer_t *wr);
|
||||
void on_load_config_hook(json11::Json::object & config);
|
||||
void on_load_pgs_hook(bool success);
|
||||
void on_change_hook(json11::Json::object & changes);
|
||||
void on_change_osd_state_hook(uint64_t peer_osd);
|
||||
void continue_rw(cluster_op_t *op);
|
||||
int continue_rw(cluster_op_t *op);
|
||||
void slice_rw(cluster_op_t *op);
|
||||
bool try_send(cluster_op_t *op, cluster_op_part_t *part);
|
||||
void execute_sync(cluster_op_t *op);
|
||||
void continue_sync();
|
||||
void finish_sync();
|
||||
bool try_send(cluster_op_t *op, int i);
|
||||
int continue_sync(cluster_op_t *op);
|
||||
void send_sync(cluster_op_t *op, cluster_op_part_t *part);
|
||||
void handle_op_part(cluster_op_part_t *part);
|
||||
};
|
||||
|
|
|
@ -4,19 +4,24 @@
|
|||
#include "osd_ops.h"
|
||||
#include "pg_states.h"
|
||||
#include "etcd_state_client.h"
|
||||
#ifndef __MOCK__
|
||||
#include "http_client.h"
|
||||
#include "base64.h"
|
||||
#endif
|
||||
|
||||
// Destructor: marks the watcher state with -1 and closes the etcd watch websocket if one is open.
etcd_state_client_t::~etcd_state_client_t()
|
||||
{
|
||||
// NOTE(review): -1 looks like a "being destroyed" marker for code that checks
// etcd_watches_initialised - confirm against the watcher callbacks
etcd_watches_initialised = -1;
|
||||
// The websocket is only touched in non-mock builds (http_client.h is not included under __MOCK__)
#ifndef __MOCK__
|
||||
if (etcd_watch_ws)
|
||||
{
|
||||
etcd_watch_ws->close();
|
||||
// Reset the pointer so the closed websocket is not referenced again
etcd_watch_ws = NULL;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifndef __MOCK__
|
||||
json_kv_t etcd_state_client_t::parse_etcd_kv(const json11::Json & kv_json)
|
||||
{
|
||||
json_kv_t kv;
|
||||
|
@ -323,6 +328,26 @@ void etcd_state_client_t::load_pgs()
|
|||
start_etcd_watcher();
|
||||
});
|
||||
}
|
||||
#else
|
||||
// Stub from the #else branch (presumably the __MOCK__ build): accepts the configuration and ignores it.
void etcd_state_client_t::parse_config(json11::Json & config)
|
||||
{
|
||||
}
|
||||
|
||||
// Stub from the #else branch (presumably the __MOCK__ build): nothing is loaded here,
// on_load_config_hook is simply invoked with an empty configuration object.
void etcd_state_client_t::load_global_config()
|
||||
{
|
||||
json11::Json::object global_config;
|
||||
on_load_config_hook(global_config);
|
||||
}
|
||||
|
||||
// Stub from the #else branch (presumably the __MOCK__ build): does nothing.
void etcd_state_client_t::load_pgs()
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
// Convenience overload: unpacks a json_kv_t and forwards its key and value
// to parse_state(const std::string &, const json11::Json &).
void etcd_state_client_t::parse_state(const json_kv_t & kv)
|
||||
{
|
||||
parse_state(kv.key, kv.value);
|
||||
}
|
||||
|
||||
void etcd_state_client_t::parse_state(const std::string & key, const json11::Json & value)
|
||||
{
|
||||
|
@ -336,8 +361,10 @@ void etcd_state_client_t::parse_state(const std::string & key, const json11::Jso
|
|||
{
|
||||
pool_config_t pc;
|
||||
// ID
|
||||
pool_id_t pool_id = stoull_full(pool_item.first);
|
||||
if (!pool_id || pool_id >= POOL_ID_MAX)
|
||||
pool_id_t pool_id;
|
||||
char null_byte = 0;
|
||||
sscanf(pool_item.first.c_str(), "%u%c", &pool_id, &null_byte);
|
||||
if (!pool_id || pool_id >= POOL_ID_MAX || null_byte != 0)
|
||||
{
|
||||
printf("Pool ID %s is invalid (must be a number less than 0x%x), skipping pool\n", pool_item.first.c_str(), POOL_ID_MAX);
|
||||
continue;
|
||||
|
@ -449,16 +476,19 @@ void etcd_state_client_t::parse_state(const std::string & key, const json11::Jso
|
|||
}
|
||||
for (auto & pool_item: value["items"].object_items())
|
||||
{
|
||||
pool_id_t pool_id = stoull_full(pool_item.first);
|
||||
if (!pool_id || pool_id >= POOL_ID_MAX)
|
||||
pool_id_t pool_id;
|
||||
char null_byte = 0;
|
||||
sscanf(pool_item.first.c_str(), "%u%c", &pool_id, &null_byte);
|
||||
if (!pool_id || pool_id >= POOL_ID_MAX || null_byte != 0)
|
||||
{
|
||||
printf("Pool ID %s is invalid in PG configuration (must be a number less than 0x%x), skipping pool\n", pool_item.first.c_str(), POOL_ID_MAX);
|
||||
continue;
|
||||
}
|
||||
for (auto & pg_item: pool_item.second.object_items())
|
||||
{
|
||||
pg_num_t pg_num = stoull_full(pg_item.first);
|
||||
if (!pg_num)
|
||||
pg_num_t pg_num = 0;
|
||||
sscanf(pg_item.first.c_str(), "%u%c", &pg_num, &null_byte);
|
||||
if (!pg_num || null_byte != 0)
|
||||
{
|
||||
printf("Bad key in pool %u PG configuration: %s (must be a number), skipped\n", pool_id, pg_item.first.c_str());
|
||||
continue;
|
||||
|
|
|
@ -3,8 +3,8 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include "json11/json11.hpp"
|
||||
#include "osd_id.h"
|
||||
#include "http_client.h"
|
||||
#include "timerfd_manager.h"
|
||||
|
||||
#define ETCD_CONFIG_WATCH_ID 1
|
||||
|
@ -52,9 +52,13 @@ struct pool_config_t
|
|||
std::map<pg_num_t, pg_config_t> pg_config;
|
||||
};
|
||||
|
||||
struct websocket_t;
|
||||
|
||||
struct etcd_state_client_t
|
||||
{
|
||||
protected:
|
||||
websocket_t *etcd_watch_ws = NULL;
|
||||
uint64_t bs_block_size = DEFAULT_BLOCK_SIZE;
|
||||
void add_etcd_url(std::string);
|
||||
public:
|
||||
std::vector<std::string> etcd_addresses;
|
||||
|
@ -64,8 +68,6 @@ public:
|
|||
|
||||
int etcd_watches_initialised = 0;
|
||||
uint64_t etcd_watch_revision = 0;
|
||||
websocket_t *etcd_watch_ws = NULL;
|
||||
uint64_t bs_block_size = 0;
|
||||
std::map<pool_id_t, pool_config_t> pool_config;
|
||||
std::map<osd_num_t, json11::Json> peer_states;
|
||||
|
||||
|
@ -82,6 +84,7 @@ public:
|
|||
void start_etcd_watcher();
|
||||
void load_global_config();
|
||||
void load_pgs();
|
||||
void parse_state(const json_kv_t & kv);
|
||||
void parse_state(const std::string & key, const json11::Json & value);
|
||||
void parse_config(json11::Json & config);
|
||||
~etcd_state_client_t();
|
||||
|
|
|
@ -10,30 +10,16 @@
|
|||
|
||||
#include "messenger.h"
|
||||
|
||||
// Destroys the operation and releases both buffers it owns (rmw_buf and buf).
// bs_op and op_data must already be NULL at this point - this is asserted.
osd_op_t::~osd_op_t()
{
    assert(!bs_op);
    assert(!op_data);
    // free(NULL) is a no-op, so the pointers don't have to be checked first.
    // Note: reusing osd_op_t WILL currently lead to memory leaks,
    // so it is never reused and its buffers are freed every time
    free(rmw_buf);
    free(buf);
}
|
||||
|
||||
void osd_messenger_t::init()
|
||||
{
|
||||
keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
|
||||
{
|
||||
for (auto cl_it = clients.begin(); cl_it != clients.end();)
|
||||
std::vector<int> to_stop;
|
||||
std::vector<osd_op_t*> to_ping;
|
||||
for (auto cl_it = clients.begin(); cl_it != clients.end(); cl_it++)
|
||||
{
|
||||
auto cl = (cl_it++)->second;
|
||||
if (!cl->osd_num)
|
||||
auto cl = cl_it->second;
|
||||
if (!cl->osd_num || cl->peer_state != PEER_CONNECTED)
|
||||
{
|
||||
// Do not run keepalive on regular clients
|
||||
continue;
|
||||
|
@ -44,7 +30,8 @@ void osd_messenger_t::init()
|
|||
if (!cl->ping_time_remaining)
|
||||
{
|
||||
// Ping timed out, stop the client
|
||||
stop_client(cl->peer_fd, true);
|
||||
printf("Ping timed out for OSD %lu (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
|
||||
to_stop.push_back(cl->peer_fd);
|
||||
}
|
||||
}
|
||||
else if (cl->idle_time_remaining > 0)
|
||||
|
@ -70,10 +57,11 @@ void osd_messenger_t::init()
|
|||
delete op;
|
||||
if (fail_fd >= 0)
|
||||
{
|
||||
printf("Ping failed for OSD %lu (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
|
||||
stop_client(fail_fd, true);
|
||||
}
|
||||
};
|
||||
outbox_push(op);
|
||||
to_ping.push_back(op);
|
||||
cl->ping_time_remaining = osd_ping_timeout;
|
||||
cl->idle_time_remaining = osd_idle_timeout;
|
||||
}
|
||||
|
@ -83,6 +71,15 @@ void osd_messenger_t::init()
|
|||
cl->idle_time_remaining = osd_idle_timeout;
|
||||
}
|
||||
}
|
||||
// Don't stop clients while a 'clients' iterator is still active
|
||||
for (int peer_fd: to_stop)
|
||||
{
|
||||
stop_client(peer_fd, true);
|
||||
}
|
||||
for (auto op: to_ping)
|
||||
{
|
||||
outbox_push(op);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -141,17 +138,14 @@ void osd_messenger_t::connect_peer(uint64_t peer_osd, json11::Json peer_state)
|
|||
wanted_peers[peer_osd].port = (int)peer_state["port"].int64_value();
|
||||
}
|
||||
wanted_peers[peer_osd].address_changed = true;
|
||||
if (!wanted_peers[peer_osd].connecting &&
|
||||
(time(NULL) - wanted_peers[peer_osd].last_connect_attempt) >= peer_connect_interval)
|
||||
{
|
||||
try_connect_peer(peer_osd);
|
||||
}
|
||||
}
|
||||
|
||||
void osd_messenger_t::try_connect_peer(uint64_t peer_osd)
|
||||
{
|
||||
auto wp_it = wanted_peers.find(peer_osd);
|
||||
if (wp_it == wanted_peers.end())
|
||||
if (wp_it == wanted_peers.end() || wp_it->second.connecting ||
|
||||
(time(NULL) - wp_it->second.last_connect_attempt) < peer_connect_interval)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
@ -197,10 +191,22 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
|
|||
on_connect_peer(peer_osd, -errno);
|
||||
return;
|
||||
}
|
||||
int timeout_id = -1;
|
||||
clients[peer_fd] = new osd_client_t();
|
||||
clients[peer_fd]->peer_addr = addr;
|
||||
clients[peer_fd]->peer_port = peer_port;
|
||||
clients[peer_fd]->peer_fd = peer_fd;
|
||||
clients[peer_fd]->peer_state = PEER_CONNECTING;
|
||||
clients[peer_fd]->connect_timeout_id = -1;
|
||||
clients[peer_fd]->osd_num = peer_osd;
|
||||
clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
|
||||
tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
// Either OUT (connected) or HUP
|
||||
handle_connect_epoll(peer_fd);
|
||||
});
|
||||
if (peer_connect_timeout > 0)
|
||||
{
|
||||
timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
|
||||
clients[peer_fd]->connect_timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
|
||||
{
|
||||
osd_num_t peer_osd = clients.at(peer_fd)->osd_num;
|
||||
stop_client(peer_fd, true);
|
||||
|
@ -208,20 +214,6 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
|
|||
return;
|
||||
});
|
||||
}
|
||||
clients[peer_fd] = new osd_client_t((osd_client_t){
|
||||
.peer_addr = addr,
|
||||
.peer_port = peer_port,
|
||||
.peer_fd = peer_fd,
|
||||
.peer_state = PEER_CONNECTING,
|
||||
.connect_timeout_id = timeout_id,
|
||||
.osd_num = peer_osd,
|
||||
.in_buf = malloc_or_die(receive_buffer_size),
|
||||
});
|
||||
tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
// Either OUT (connected) or HUP
|
||||
handle_connect_epoll(peer_fd);
|
||||
});
|
||||
}
|
||||
|
||||
void osd_messenger_t::handle_connect_epoll(int peer_fd)
|
||||
|
@ -373,123 +365,6 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
|||
outbox_push(op);
|
||||
}
|
||||
|
||||
// Cancel every operation already sent through the client <cl> (see cancel_op()),
// then forget both the sent operations and everything still queued in its outbox.
void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
{
    for (auto sent_it = cl->sent_ops.begin(); sent_it != cl->sent_ops.end(); sent_it++)
    {
        cancel_op(sent_it->second);
    }
    cl->sent_ops.clear();
    cl->outbox.clear();
}
|
||||
|
||||
// Cancel a single operation.
// Operations of any type other than OSD_OP_OUT are simply destroyed. OSD_OP_OUT
// operations get a synthetic reply with retval = -EPIPE and their callback is invoked.
void osd_messenger_t::cancel_op(osd_op_t *op)
{
    if (op->op_type != OSD_OP_OUT)
    {
        // This function is only called in stop_client(), so it's fine to destroy the operation
        delete op;
        return;
    }
    // Fabricate a reply header that mirrors the request and reports -EPIPE
    auto & reply_hdr = op->reply.hdr;
    reply_hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
    reply_hdr.id = op->req.hdr.id;
    reply_hdr.opcode = op->req.hdr.opcode;
    reply_hdr.retval = -EPIPE;
    // Call a copy of the callback so the call is unaffected by `delete op` inside it
    std::function<void(osd_op_t*)> completion = op->callback;
    completion(op);
}
|
||||
|
||||
void osd_messenger_t::stop_client(int peer_fd, bool force)
|
||||
{
|
||||
assert(peer_fd != 0);
|
||||
auto it = clients.find(peer_fd);
|
||||
if (it == clients.end())
|
||||
{
|
||||
return;
|
||||
}
|
||||
uint64_t repeer_osd = 0;
|
||||
osd_client_t *cl = it->second;
|
||||
if (cl->peer_state == PEER_CONNECTED)
|
||||
{
|
||||
if (cl->osd_num)
|
||||
{
|
||||
// Reload configuration from etcd when the connection is dropped
|
||||
if (log_level > 0)
|
||||
printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl->osd_num);
|
||||
repeer_osd = cl->osd_num;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (log_level > 0)
|
||||
printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
|
||||
}
|
||||
}
|
||||
else if (!force)
|
||||
{
|
||||
return;
|
||||
}
|
||||
cl->peer_state = PEER_STOPPED;
|
||||
clients.erase(it);
|
||||
tfd->set_fd_handler(peer_fd, false, NULL);
|
||||
if (cl->connect_timeout_id >= 0)
|
||||
{
|
||||
tfd->clear_timer(cl->connect_timeout_id);
|
||||
cl->connect_timeout_id = -1;
|
||||
}
|
||||
if (cl->osd_num)
|
||||
{
|
||||
osd_peer_fds.erase(cl->osd_num);
|
||||
}
|
||||
if (cl->read_op)
|
||||
{
|
||||
if (cl->read_op->callback)
|
||||
{
|
||||
cancel_op(cl->read_op);
|
||||
}
|
||||
else
|
||||
{
|
||||
delete cl->read_op;
|
||||
}
|
||||
cl->read_op = NULL;
|
||||
}
|
||||
for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
|
||||
{
|
||||
if (*rit == peer_fd)
|
||||
{
|
||||
read_ready_clients.erase(rit);
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++)
|
||||
{
|
||||
if (*wit == peer_fd)
|
||||
{
|
||||
write_ready_clients.erase(wit);
|
||||
break;
|
||||
}
|
||||
}
|
||||
free(cl->in_buf);
|
||||
cl->in_buf = NULL;
|
||||
close(peer_fd);
|
||||
if (repeer_osd)
|
||||
{
|
||||
// First repeer PGs as canceling OSD ops may push new operations
|
||||
// and we need correct PG states when we do that
|
||||
repeer_pgs(repeer_osd);
|
||||
}
|
||||
if (cl->osd_num)
|
||||
{
|
||||
// Cancel outbound operations
|
||||
cancel_osd_ops(cl);
|
||||
}
|
||||
if (cl->refs <= 0)
|
||||
{
|
||||
delete cl;
|
||||
}
|
||||
}
|
||||
|
||||
void osd_messenger_t::accept_connections(int listen_fd)
|
||||
{
|
||||
// Accept new connections
|
||||
|
@ -505,13 +380,12 @@ void osd_messenger_t::accept_connections(int listen_fd)
|
|||
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
|
||||
int one = 1;
|
||||
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
||||
clients[peer_fd] = new osd_client_t((osd_client_t){
|
||||
.peer_addr = addr,
|
||||
.peer_port = ntohs(addr.sin_port),
|
||||
.peer_fd = peer_fd,
|
||||
.peer_state = PEER_CONNECTED,
|
||||
.in_buf = malloc_or_die(receive_buffer_size),
|
||||
});
|
||||
clients[peer_fd] = new osd_client_t();
|
||||
clients[peer_fd]->peer_addr = addr;
|
||||
clients[peer_fd]->peer_port = ntohs(addr.sin_port);
|
||||
clients[peer_fd]->peer_fd = peer_fd;
|
||||
clients[peer_fd]->peer_state = PEER_CONNECTED;
|
||||
clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
|
||||
// Add FD to epoll
|
||||
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
|
|
190
src/messenger.h
190
src/messenger.h
|
@ -14,19 +14,15 @@
|
|||
|
||||
#include "malloc_or_die.h"
|
||||
#include "json11/json11.hpp"
|
||||
#include "osd_ops.h"
|
||||
#include "msgr_op.h"
|
||||
#include "timerfd_manager.h"
|
||||
#include "ringloop.h"
|
||||
|
||||
#define OSD_OP_IN 0
|
||||
#define OSD_OP_OUT 1
|
||||
#include <ringloop.h>
|
||||
|
||||
#define CL_READ_HDR 1
|
||||
#define CL_READ_DATA 2
|
||||
#define CL_READ_REPLY_DATA 3
|
||||
#define CL_WRITE_READY 1
|
||||
#define CL_WRITE_REPLY 2
|
||||
#define OSD_OP_INLINE_BUF_COUNT 16
|
||||
|
||||
#define PEER_CONNECTING 1
|
||||
#define PEER_CONNECTED 2
|
||||
|
@ -36,160 +32,6 @@
|
|||
#define DEFAULT_PEER_CONNECT_TIMEOUT 5
|
||||
#define DEFAULT_OSD_PING_TIMEOUT 5
|
||||
|
||||
// Kind of a vector with small-list-optimisation: a growable list of iovec entries.
//
// Up to OSD_OP_INLINE_BUF_COUNT entries are stored inside the object itself (inline_buf),
// longer lists are moved to a heap array. Entries [done, count) are the ones not yet
// consumed by eat(). The memory the iovecs point to is NOT owned by the list.
// Running out of memory terminates the process with exit(1).
struct osd_op_buf_list_t
{
    // count = number of entries, alloc = capacity of buf, done = number of fully consumed entries
    int count = 0, alloc = OSD_OP_INLINE_BUF_COUNT, done = 0;
    // Points either to inline_buf or to a heap array of <alloc> entries
    iovec *buf = NULL;
    iovec inline_buf[OSD_OP_INLINE_BUF_COUNT];

    inline osd_op_buf_list_t()
    {
        buf = inline_buf;
    }

    // Copies all entries of <other> (including already consumed ones); `done` starts at 0
    inline osd_op_buf_list_t(const osd_op_buf_list_t & other)
    {
        buf = inline_buf;
        append(other);
    }

    inline osd_op_buf_list_t & operator = (const osd_op_buf_list_t & other)
    {
        // Self-assignment must be a no-op: reset() followed by append(*this) would empty the list
        if (this != &other)
        {
            reset();
            append(other);
        }
        return *this;
    }

    inline ~osd_op_buf_list_t()
    {
        if (buf && buf != inline_buf)
        {
            free(buf);
        }
    }

    // Forget all entries; the allocated capacity is kept
    inline void reset()
    {
        count = 0;
        done = 0;
    }

    // First entry which is not fully consumed yet
    inline iovec* get_iovec()
    {
        return buf + done;
    }

    // Number of entries which are not fully consumed yet
    inline int get_size()
    {
        return count - done;
    }

    // Make sure there is room for at least <needed> entries.
    // The capacity is rounded up to a multiple of 16 entries.
    inline void reserve(int needed)
    {
        if (needed <= alloc)
        {
            return;
        }
        int new_alloc = ((needed + 15) / 16) * 16;
        iovec *new_buf;
        if (buf == inline_buf)
        {
            // First spill to the heap: inline entries have to be copied manually
            new_buf = (iovec*)malloc(sizeof(iovec) * new_alloc);
            if (new_buf)
            {
                memcpy(new_buf, inline_buf, sizeof(iovec) * count);
            }
        }
        else
        {
            new_buf = (iovec*)realloc(buf, sizeof(iovec) * new_alloc);
        }
        if (!new_buf)
        {
            printf("Failed to allocate %zu bytes\n", sizeof(iovec) * new_alloc);
            exit(1);
        }
        buf = new_buf;
        alloc = new_alloc;
    }

    // Append all entries of <other> (including already consumed ones) to this list
    inline void append(const osd_op_buf_list_t & other)
    {
        // Remember the source length first: when a list is appended to itself,
        // other.count grows together with count and the loop would never stop
        const int n = other.count;
        reserve(count + n);
        // other.buf is read only after reserve() so that a self-append sees the new array
        const iovec *src = other.buf;
        for (int i = 0; i < n; i++)
        {
            buf[count++] = src[i];
        }
    }

    // Append one entry pointing to <len> bytes at <nbuf>
    inline void push_back(void *nbuf, size_t len)
    {
        reserve(count + 1);
        iovec & slot = buf[count++];
        slot.iov_base = nbuf;
        slot.iov_len = len;
    }

    // Consume <result> bytes from the front of the not-yet-consumed entries:
    // fully consumed entries are skipped by advancing `done`, a partially consumed
    // entry is shrunk in place. Values <= 0 are ignored.
    inline void eat(int result)
    {
        while (result > 0 && done < count)
        {
            iovec & iov = buf[done];
            if (iov.iov_len <= (size_t)result)
            {
                result -= (int)iov.iov_len;
                done++;
            }
            else
            {
                iov.iov_len -= result;
                // Arithmetic on void* is a compiler extension, so go through char*
                iov.iov_base = (char*)iov.iov_base + result;
                break;
            }
        }
    }
};
|
||||
|
||||
struct blockstore_op_t;
|
||||
|
||||
struct osd_primary_op_data_t;
|
||||
|
||||
// A single OSD operation: request, reply, payload buffers and completion callback.
struct osd_op_t
|
||||
{
|
||||
// NOTE(review): presumably the time the operation was started (used for statistics) - confirm
timespec tv_begin;
|
||||
// OSD_OP_IN (default) or OSD_OP_OUT. cancel_op() completes OSD_OP_OUT operations
// with -EPIPE through the callback and deletes all others
uint64_t op_type = OSD_OP_IN;
|
||||
// File descriptor of the peer connection (the key used in osd_messenger_t::clients)
int peer_fd;
|
||||
// Request and reply; both expose a common header as .hdr (id, opcode, retval...)
osd_any_op_t req;
|
||||
osd_any_reply_t reply;
|
||||
// Must be NULL when the operation is destroyed (asserted in the destructor)
blockstore_op_t *bs_op = NULL;
|
||||
// Released with free() in the destructor
void *buf = NULL;
|
||||
// Released with free() in the destructor
void *rmw_buf = NULL;
|
||||
// Must be NULL when the operation is destroyed (asserted in the destructor)
osd_primary_op_data_t* op_data = NULL;
|
||||
// Completion callback; it receives the operation itself
std::function<void(osd_op_t*)> callback;
|
||||
|
||||
// NOTE(review): presumably the list of data buffers sent or received with the operation - confirm
osd_op_buf_list_t iov;
|
||||
|
||||
~osd_op_t();
|
||||
};
|
||||
|
||||
struct osd_client_t
|
||||
{
|
||||
int refs = 0;
|
||||
|
@ -228,6 +70,12 @@ struct osd_client_t
|
|||
int write_state = 0;
|
||||
std::vector<iovec> send_list, next_send_list;
|
||||
std::vector<osd_op_t*> outbox, next_outbox;
|
||||
|
||||
~osd_client_t()
|
||||
{
|
||||
free(in_buf);
|
||||
in_buf = NULL;
|
||||
}
|
||||
};
|
||||
|
||||
struct osd_wanted_peer_t
|
||||
|
@ -252,12 +100,9 @@ struct osd_op_stats_t
|
|||
|
||||
struct osd_messenger_t
|
||||
{
|
||||
timerfd_manager_t *tfd;
|
||||
ring_loop_t *ringloop;
|
||||
protected:
|
||||
int keepalive_timer_id = -1;
|
||||
|
||||
// osd_num_t is only for logging and asserts
|
||||
osd_num_t osd_num;
|
||||
// FIXME: make receive_buffer_size configurable
|
||||
int receive_buffer_size = 64*1024;
|
||||
int peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
|
||||
|
@ -267,19 +112,22 @@ struct osd_messenger_t
|
|||
int log_level = 0;
|
||||
bool use_sync_send_recv = false;
|
||||
|
||||
std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
|
||||
std::map<uint64_t, int> osd_peer_fds;
|
||||
uint64_t next_subop_id = 1;
|
||||
|
||||
std::map<int, osd_client_t*> clients;
|
||||
std::vector<int> read_ready_clients;
|
||||
std::vector<int> write_ready_clients;
|
||||
std::vector<std::function<void()>> set_immediate;
|
||||
|
||||
public:
|
||||
timerfd_manager_t *tfd;
|
||||
ring_loop_t *ringloop;
|
||||
// osd_num_t is only for logging and asserts
|
||||
osd_num_t osd_num;
|
||||
uint64_t next_subop_id = 1;
|
||||
std::map<int, osd_client_t*> clients;
|
||||
std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
|
||||
std::map<uint64_t, int> osd_peer_fds;
|
||||
// op statistics
|
||||
osd_op_stats_t stats;
|
||||
|
||||
public:
|
||||
void init();
|
||||
void parse_config(const json11::Json & config);
|
||||
void connect_peer(uint64_t osd_num, json11::Json peer_state);
|
||||
|
@ -287,7 +135,6 @@ public:
|
|||
void outbox_push(osd_op_t *cur_op);
|
||||
std::function<void(osd_op_t*)> exec_op;
|
||||
std::function<void(osd_num_t)> repeer_pgs;
|
||||
void handle_peer_epoll(int peer_fd, int epoll_events);
|
||||
void read_requests();
|
||||
void send_replies();
|
||||
void accept_connections(int listen_fd);
|
||||
|
@ -296,6 +143,7 @@ public:
|
|||
protected:
|
||||
void try_connect_peer(uint64_t osd_num);
|
||||
void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
|
||||
void handle_peer_epoll(int peer_fd, int epoll_events);
|
||||
void handle_connect_epoll(int peer_fd);
|
||||
void on_connect_peer(osd_num_t peer_osd, int peer_fd);
|
||||
void check_peer_config(osd_client_t *cl);
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
# Build the test against the mock sources and run it only if compilation succeeded
g++ -D__MOCK__ -fsanitize=address -g -Wno-pointer-arith pg_states.cpp osd_ops.cpp test_cluster_client.cpp cluster_client.cpp msgr_op.cpp msgr_stop.cpp mock/messenger.cpp etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp -I mock -I . -I .. && ./a.out
|
|
@ -0,0 +1,44 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||
|
||||
#include <unistd.h>
|
||||
#include <stdexcept>
|
||||
#include <assert.h>
|
||||
|
||||
#include "messenger.h"
|
||||
|
||||
// No-op: this implementation of init() has an empty body
void osd_messenger_t::init()
|
||||
{
|
||||
}
|
||||
|
||||
// Force-stops the first registered client over and over
// until the client map has become empty
osd_messenger_t::~osd_messenger_t()
{
    while (!clients.empty())
    {
        int first_fd = clients.begin()->first;
        stop_client(first_fd, true);
    }
}
|
||||
|
||||
// Records <cur_op> in the sent_ops map of the client it is addressed to, keyed by request id.
// Throws std::runtime_error when cur_op->peer_fd does not refer to a registered client.
void osd_messenger_t::outbox_push(osd_op_t *cur_op)
{
    // Look the client up without operator[]: indexing would insert a NULL entry
    // for an unknown FD and then dereference it
    auto cl_it = clients.find(cur_op->peer_fd);
    if (cl_it == clients.end() || !cl_it->second)
    {
        throw std::runtime_error("outbox_push: operation refers to an unknown client FD");
    }
    cl_it->second->sent_ops[cur_op->req.hdr.id] = cur_op;
}
|
||||
|
||||
// No-op: <config> is accepted but not read by this implementation
void osd_messenger_t::parse_config(const json11::Json & config)
|
||||
{
|
||||
}
|
||||
|
||||
void osd_messenger_t::connect_peer(uint64_t peer_osd, json11::Json peer_state)
|
||||
{
|
||||
wanted_peers[peer_osd] = (osd_wanted_peer_t){
|
||||
.port = 1,
|
||||
};
|
||||
}
|
||||
|
||||
// No-op: this implementation of read_requests() has an empty body
void osd_messenger_t::read_requests()
|
||||
{
|
||||
}
|
||||
|
||||
// No-op: this implementation of send_replies() has an empty body
void osd_messenger_t::send_replies()
|
||||
{
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
|
||||
// Holds a single callback, `loop`, which takes no arguments and returns nothing
struct ring_consumer_t
|
||||
{
|
||||
std::function<void(void)> loop;
|
||||
};
|
||||
|
||||
class ring_loop_t
|
||||
{
|
||||
public:
|
||||
void register_consumer(ring_consumer_t *consumer)
|
||||
{
|
||||
}
|
||||
void unregister_consumer(ring_consumer_t *consumer)
|
||||
{
|
||||
}
|
||||
void submit()
|
||||
{
|
||||
}
|
||||
};
|
|
@ -0,0 +1,22 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
#include "msgr_op.h"
|
||||
|
||||
// Releases the buffers owned by the operation
osd_op_t::~osd_op_t()
{
    // Neither a blockstore operation nor primary-op data may still be attached
    assert(!bs_op);
    assert(!op_data);
    // free(NULL) does nothing, so the pointers need no explicit checks
    free(rmw_buf);
    // Note: reusing osd_op_t WILL currently lead to memory leaks
    // So we don't reuse it, but free it every time
    free(buf);
}
|
|
@ -0,0 +1,171 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <sys/uio.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "osd_ops.h"
|
||||
|
||||
#define OSD_OP_IN 0
|
||||
#define OSD_OP_OUT 1
|
||||
|
||||
#define OSD_OP_INLINE_BUF_COUNT 16

// Kind of a vector with small-list-optimisation:
// up to OSD_OP_INLINE_BUF_COUNT iovec entries live inside the object itself,
// larger lists are moved to a heap buffer.
// Entries [0, done) have been consumed by eat(), entries [done, count) are pending.
struct osd_op_buf_list_t
{
    int count = 0, alloc = OSD_OP_INLINE_BUF_COUNT, done = 0;
    iovec *buf = NULL;
    iovec inline_buf[OSD_OP_INLINE_BUF_COUNT];

    inline osd_op_buf_list_t()
    {
        buf = inline_buf;
    }

    // Copies every entry of <other> (consumed ones included); <done> starts from 0
    inline osd_op_buf_list_t(const osd_op_buf_list_t & other)
    {
        buf = inline_buf;
        append(other);
    }

    // Replaces the contents with every entry of <other>; <done> starts from 0
    inline osd_op_buf_list_t & operator = (const osd_op_buf_list_t & other)
    {
        // Self-assignment must keep the contents: resetting first would leave nothing to copy
        if (this != &other)
        {
            reset();
            append(other);
        }
        return *this;
    }

    inline ~osd_op_buf_list_t()
    {
        if (buf && buf != inline_buf)
        {
            free(buf);
        }
    }

    // Forget all entries; the storage itself is kept
    inline void reset()
    {
        count = 0;
        done = 0;
    }

    // Pointer to the first entry that is not consumed yet
    inline iovec* get_iovec()
    {
        return buf + done;
    }

    // Number of entries that are not consumed yet
    inline int get_size()
    {
        return count - done;
    }

    // Enlarge the storage to <new_alloc> entries, moving from the inline array to the heap
    // when needed. Prints a message and exits with code 1 if memory cannot be allocated.
    inline void grow(int new_alloc)
    {
        iovec *new_buf;
        if (buf == inline_buf)
        {
            new_buf = (iovec*)malloc(sizeof(iovec) * new_alloc);
            if (new_buf)
            {
                memcpy(new_buf, inline_buf, sizeof(iovec) * count);
            }
        }
        else
        {
            new_buf = (iovec*)realloc(buf, sizeof(iovec) * new_alloc);
        }
        if (!new_buf)
        {
            printf("Failed to allocate %zu bytes\n", sizeof(iovec) * new_alloc);
            exit(1);
        }
        buf = new_buf;
        alloc = new_alloc;
    }

    // Add every entry of <other> to the end of this list
    inline void append(const osd_op_buf_list_t & other)
    {
        // Take the entry count up front: when <other> is this very list,
        // other.count changes while entries are being added
        const int add = other.count;
        if (count + add > alloc)
        {
            grow(((count + add + 15) / 16) * 16);
        }
        // other.buf is read only after the possible reallocation, which keeps self-append valid
        for (int i = 0; i < add; i++)
        {
            buf[count++] = other.buf[i];
        }
    }

    // Add one buffer (<nbuf>, <len>) to the end of the list
    inline void push_back(void *nbuf, size_t len)
    {
        if (count >= alloc)
        {
            // Grow in whole steps of 16 entries
            grow(alloc < 16 ? 16 : (alloc + 16));
        }
        buf[count].iov_base = nbuf;
        buf[count].iov_len = len;
        count++;
    }

    // Mark <result> bytes as consumed, starting from the first pending entry.
    // Fully covered entries are skipped by advancing <done>; a partially covered entry
    // is shortened in place. A non-positive <result> changes nothing.
    inline void eat(int result)
    {
        while (result > 0 && done < count)
        {
            iovec & iov = buf[done];
            // result is positive here, so the conversion to size_t is lossless
            if (iov.iov_len <= (size_t)result)
            {
                result -= (int)iov.iov_len;
                done++;
            }
            else
            {
                iov.iov_len -= result;
                // Arithmetic on void* is a compiler extension, so step through a char pointer
                iov.iov_base = (char*)iov.iov_base + result;
                break;
            }
        }
    }
};
|
||||
|
||||
struct blockstore_op_t;
|
||||
|
||||
struct osd_primary_op_data_t;
|
||||
|
||||
struct osd_op_t
|
||||
{
|
||||
timespec tv_begin;
|
||||
uint64_t op_type = OSD_OP_IN;
|
||||
int peer_fd;
|
||||
osd_any_op_t req;
|
||||
osd_any_reply_t reply;
|
||||
blockstore_op_t *bs_op = NULL;
|
||||
void *buf = NULL;
|
||||
void *rmw_buf = NULL;
|
||||
osd_primary_op_data_t* op_data = NULL;
|
||||
std::function<void(osd_op_t*)> callback;
|
||||
|
||||
osd_op_buf_list_t iov;
|
||||
|
||||
~osd_op_t();
|
||||
};
|
|
@ -180,7 +180,7 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
|||
cl->refs--;
|
||||
if (cl->peer_state == PEER_STOPPED)
|
||||
{
|
||||
if (!cl->refs)
|
||||
if (cl->refs <= 0)
|
||||
{
|
||||
delete cl;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,137 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||
|
||||
#include <unistd.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "messenger.h"
|
||||
|
||||
// Cancels every operation listed in the client's sent_ops.
// The operations are first copied into a local list and the client's sent_ops and
// outbox are emptied; only then is cancel_op() called for each of them.
void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
{
    std::vector<osd_op_t*> pending;
    pending.reserve(cl->sent_ops.size());
    for (auto & item: cl->sent_ops)
    {
        pending.push_back(item.second);
    }
    cl->sent_ops.clear();
    cl->outbox.clear();
    for (osd_op_t *op: pending)
    {
        cancel_op(op);
    }
}
|
||||
|
||||
// Cancels one operation.
// An OSD_OP_OUT operation gets a reply header with retval -EPIPE and its callback is invoked;
// any other operation is simply deleted.
void osd_messenger_t::cancel_op(osd_op_t *op)
{
    if (op->op_type != OSD_OP_OUT)
    {
        // This function is only called in stop_client(), so it's fine to destroy the operation
        delete op;
        return;
    }
    auto & hdr = op->reply.hdr;
    hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
    hdr.id = op->req.hdr.id;
    hdr.opcode = op->req.hdr.opcode;
    hdr.retval = -EPIPE;
    // Copy lambda to be unaffected by `delete op`
    std::function<void(osd_op_t*)> on_done = op->callback;
    on_done(op);
}
|
||||
|
||||
void osd_messenger_t::stop_client(int peer_fd, bool force)
|
||||
{
|
||||
assert(peer_fd != 0);
|
||||
auto it = clients.find(peer_fd);
|
||||
if (it == clients.end())
|
||||
{
|
||||
return;
|
||||
}
|
||||
osd_client_t *cl = it->second;
|
||||
if (cl->peer_state == PEER_CONNECTING && !force || cl->peer_state == PEER_STOPPED)
|
||||
{
|
||||
return;
|
||||
}
|
||||
if (log_level > 0)
|
||||
{
|
||||
if (cl->osd_num)
|
||||
{
|
||||
printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl->osd_num);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
|
||||
}
|
||||
}
|
||||
// First set state to STOPPED so another stop_client() call doesn't try to free it again
|
||||
cl->refs++;
|
||||
cl->peer_state = PEER_STOPPED;
|
||||
if (cl->osd_num)
|
||||
{
|
||||
// ...and forget OSD peer
|
||||
osd_peer_fds.erase(cl->osd_num);
|
||||
}
|
||||
#ifndef __MOCK__
|
||||
// Then remove FD from the eventloop so we don't accidentally read something
|
||||
tfd->set_fd_handler(peer_fd, false, NULL);
|
||||
if (cl->connect_timeout_id >= 0)
|
||||
{
|
||||
tfd->clear_timer(cl->connect_timeout_id);
|
||||
cl->connect_timeout_id = -1;
|
||||
}
|
||||
for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
|
||||
{
|
||||
if (*rit == peer_fd)
|
||||
{
|
||||
read_ready_clients.erase(rit);
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++)
|
||||
{
|
||||
if (*wit == peer_fd)
|
||||
{
|
||||
write_ready_clients.erase(wit);
|
||||
break;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (cl->osd_num)
|
||||
{
|
||||
// Then repeer PGs because cancel_op() callbacks can try to perform
|
||||
// some actions and we need correct PG states to not do something silly
|
||||
repeer_pgs(cl->osd_num);
|
||||
}
|
||||
// Then cancel all operations
|
||||
if (cl->read_op)
|
||||
{
|
||||
if (!cl->read_op->callback)
|
||||
{
|
||||
delete cl->read_op;
|
||||
}
|
||||
cl->read_op = NULL;
|
||||
}
|
||||
if (cl->osd_num)
|
||||
{
|
||||
// Cancel outbound operations
|
||||
cancel_osd_ops(cl);
|
||||
}
|
||||
#ifndef __MOCK__
|
||||
// And close the FD only when everything is done
|
||||
// ...because peer_fd number can get reused after close()
|
||||
close(peer_fd);
|
||||
#endif
|
||||
// Find the item again because it can be invalidated at this point
|
||||
it = clients.find(peer_fd);
|
||||
if (it != clients.end())
|
||||
{
|
||||
clients.erase(it);
|
||||
}
|
||||
cl->refs--;
|
||||
if (cl->refs <= 0)
|
||||
{
|
||||
delete cl;
|
||||
}
|
||||
}
|
25
src/osd.cpp
25
src/osd.cpp
|
@ -8,16 +8,20 @@
|
|||
#include <arpa/inet.h>
|
||||
|
||||
#include "osd.h"
|
||||
#include "http_client.h"
|
||||
|
||||
osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop)
|
||||
osd_t::osd_t(blockstore_config_t & config, ring_loop_t *ringloop)
|
||||
{
|
||||
config["entry_attr_size"] = "0";
|
||||
|
||||
this->config = config;
|
||||
this->bs = bs;
|
||||
this->ringloop = ringloop;
|
||||
|
||||
// FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config
|
||||
this->bs = new blockstore_t(config, ringloop);
|
||||
|
||||
this->bs_block_size = bs->get_block_size();
|
||||
// FIXME: use bitmap granularity instead
|
||||
this->bs_disk_alignment = bs->get_disk_alignment();
|
||||
this->bs_bitmap_granularity = bs->get_bitmap_granularity();
|
||||
|
||||
parse_config(config);
|
||||
|
||||
|
@ -49,6 +53,7 @@ osd_t::~osd_t()
|
|||
{
|
||||
ringloop->unregister_consumer(&consumer);
|
||||
delete epmgr;
|
||||
delete bs;
|
||||
close(listen_fd);
|
||||
}
|
||||
|
||||
|
@ -171,7 +176,7 @@ bool osd_t::shutdown()
|
|||
{
|
||||
return false;
|
||||
}
|
||||
return bs->is_safe_to_stop();
|
||||
return !bs || bs->is_safe_to_stop();
|
||||
}
|
||||
|
||||
void osd_t::loop()
|
||||
|
@ -191,6 +196,8 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
|||
delete cur_op;
|
||||
return;
|
||||
}
|
||||
// Clear the reply buffer
|
||||
memset(cur_op->reply.buf, 0, OSD_PACKET_SIZE);
|
||||
inflight_ops++;
|
||||
if (cur_op->req.hdr.magic != SECONDARY_OSD_OP_MAGIC ||
|
||||
cur_op->req.hdr.opcode < OSD_OP_MIN || cur_op->req.hdr.opcode > OSD_OP_MAX ||
|
||||
|
@ -198,14 +205,14 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
|||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) &&
|
||||
(cur_op->req.sec_rw.len > OSD_RW_MAX ||
|
||||
cur_op->req.sec_rw.len % bs_disk_alignment ||
|
||||
cur_op->req.sec_rw.offset % bs_disk_alignment)) ||
|
||||
cur_op->req.sec_rw.len % bs_bitmap_granularity ||
|
||||
cur_op->req.sec_rw.offset % bs_bitmap_granularity)) ||
|
||||
((cur_op->req.hdr.opcode == OSD_OP_READ ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_WRITE ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_DELETE) &&
|
||||
(cur_op->req.rw.len > OSD_RW_MAX ||
|
||||
cur_op->req.rw.len % bs_disk_alignment ||
|
||||
cur_op->req.rw.offset % bs_disk_alignment)))
|
||||
cur_op->req.rw.len % bs_bitmap_granularity ||
|
||||
cur_op->req.rw.offset % bs_bitmap_granularity)))
|
||||
{
|
||||
// Bad command
|
||||
finish_op(cur_op, -EINVAL);
|
||||
|
|
|
@ -115,7 +115,7 @@ class osd_t
|
|||
bool stopping = false;
|
||||
int inflight_ops = 0;
|
||||
blockstore_t *bs;
|
||||
uint32_t bs_block_size, bs_disk_alignment;
|
||||
uint32_t bs_block_size, bs_bitmap_granularity;
|
||||
ring_loop_t *ringloop;
|
||||
timerfd_manager_t *tfd = NULL;
|
||||
epoll_manager_t *epmgr = NULL;
|
||||
|
@ -198,6 +198,7 @@ class osd_t
|
|||
void continue_primary_del(osd_op_t *cur_op);
|
||||
bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
|
||||
void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
|
||||
void free_object_state(pg_t & pg, pg_osd_set_state_t **object_state);
|
||||
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
|
||||
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
|
||||
void handle_primary_bs_subop(osd_op_t *subop);
|
||||
|
@ -206,9 +207,11 @@ class osd_t
|
|||
void submit_primary_subops(int submit_type, uint64_t op_version, int pg_size, const uint64_t* osd_set, osd_op_t *cur_op);
|
||||
void submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, uint64_t set_size, pg_osd_set_t & loc_set);
|
||||
void submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_delete, int chunks_to_delete_count);
|
||||
void submit_primary_sync_subops(osd_op_t *cur_op);
|
||||
int submit_primary_sync_subops(osd_op_t *cur_op);
|
||||
void submit_primary_stab_subops(osd_op_t *cur_op);
|
||||
|
||||
uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state);
|
||||
|
||||
inline pg_num_t map_to_pg(object_id oid, uint64_t pg_stripe_size)
|
||||
{
|
||||
uint64_t pg_count = pg_counts[INODE_POOL(oid.inode)];
|
||||
|
@ -218,7 +221,7 @@ class osd_t
|
|||
}
|
||||
|
||||
public:
|
||||
osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop);
|
||||
osd_t(blockstore_config_t & config, ring_loop_t *ringloop);
|
||||
~osd_t();
|
||||
void force_stop(int exitcode);
|
||||
bool shutdown();
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
#include "osd.h"
|
||||
#include "base64.h"
|
||||
#include "etcd_state_client.h"
|
||||
#include "http_client.h"
|
||||
#include "osd_rmw.h"
|
||||
|
||||
// Startup sequence:
|
||||
|
@ -557,7 +558,7 @@ void osd_t::apply_pg_config()
|
|||
}
|
||||
if (currently_taken)
|
||||
{
|
||||
if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING))
|
||||
if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING))
|
||||
{
|
||||
if (pg_it->second.target_set == pg_cfg.target_set)
|
||||
{
|
||||
|
|
|
@ -149,10 +149,14 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
|
|||
{
|
||||
continue_primary_write(op);
|
||||
}
|
||||
if (pg.inflight == 0 && (pg.state & PG_STOPPING))
|
||||
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
|
||||
{
|
||||
finish_stop_pg(pg);
|
||||
}
|
||||
else if ((pg.state & PG_REPEERING) && pg.inflight == 0 && !pg.flush_batch)
|
||||
{
|
||||
start_pg_peering(pg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -231,7 +235,8 @@ bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
|
|||
{
|
||||
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
|
||||
{
|
||||
if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_MISPLACED)) == (PG_ACTIVE | PG_HAS_MISPLACED))
|
||||
// Don't try to "recover" misplaced objects if "recovery" would make them degraded
|
||||
if ((pg_it->second.state & (PG_ACTIVE | PG_DEGRADED | PG_HAS_MISPLACED)) == (PG_ACTIVE | PG_HAS_MISPLACED))
|
||||
{
|
||||
for (auto obj_it = pg_it->second.misplaced_objects.begin(); obj_it != pg_it->second.misplaced_objects.end(); obj_it++)
|
||||
{
|
||||
|
|
|
@ -41,16 +41,13 @@ int main(int narg, char *args[])
|
|||
signal(SIGINT, handle_sigint);
|
||||
signal(SIGTERM, handle_sigint);
|
||||
ring_loop_t *ringloop = new ring_loop_t(512);
|
||||
// FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config
|
||||
blockstore_t *bs = new blockstore_t(config, ringloop);
|
||||
osd = new osd_t(config, bs, ringloop);
|
||||
osd = new osd_t(config, ringloop);
|
||||
while (1)
|
||||
{
|
||||
ringloop->loop();
|
||||
ringloop->wait();
|
||||
}
|
||||
delete osd;
|
||||
delete bs;
|
||||
delete ringloop;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -77,10 +77,11 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
|
|||
// Re-peer affected PGs
|
||||
for (auto & p: pgs)
|
||||
{
|
||||
auto & pg = p.second;
|
||||
bool repeer = false;
|
||||
if (p.second.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
|
||||
if (pg.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
|
||||
{
|
||||
for (osd_num_t pg_osd: p.second.all_peers)
|
||||
for (osd_num_t pg_osd: pg.all_peers)
|
||||
{
|
||||
if (pg_osd == peer_osd)
|
||||
{
|
||||
|
@ -91,8 +92,17 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
|
|||
if (repeer)
|
||||
{
|
||||
// Repeer this pg
|
||||
printf("[PG %u/%u] Repeer because of OSD %lu\n", p.second.pool_id, p.second.pg_num, peer_osd);
|
||||
start_pg_peering(p.second);
|
||||
printf("[PG %u/%u] Repeer because of OSD %lu\n", pg.pool_id, pg.pg_num, peer_osd);
|
||||
if (!(pg.state & (PG_ACTIVE | PG_REPEERING)) || pg.inflight == 0 && !pg.flush_batch)
|
||||
{
|
||||
start_pg_peering(pg);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Stop accepting new operations, wait for current ones to finish or fail
|
||||
pg.state = pg.state & ~PG_ACTIVE | PG_REPEERING;
|
||||
report_pg_state(pg);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -334,9 +344,10 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
|
|||
{
|
||||
// FIXME: Mark peer as failed and don't reconnect immediately after dropping the connection
|
||||
printf("Failed to sync OSD %lu: %ld (%s), disconnecting peer\n", role_osd, op->reply.hdr.retval, strerror(-op->reply.hdr.retval));
|
||||
int fail_fd = op->peer_fd;
|
||||
ps->list_ops.erase(role_osd);
|
||||
c_cli.stop_client(op->peer_fd);
|
||||
delete op;
|
||||
c_cli.stop_client(fail_fd);
|
||||
return;
|
||||
}
|
||||
delete op;
|
||||
|
@ -413,9 +424,10 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
|||
if (op->reply.hdr.retval < 0)
|
||||
{
|
||||
printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, op->reply.hdr.retval);
|
||||
int fail_fd = op->peer_fd;
|
||||
ps->list_ops.erase(role_osd);
|
||||
c_cli.stop_client(op->peer_fd);
|
||||
delete op;
|
||||
c_cli.stop_client(fail_fd);
|
||||
return;
|
||||
}
|
||||
printf(
|
||||
|
@ -484,15 +496,13 @@ bool osd_t::stop_pg(pg_t & pg)
|
|||
{
|
||||
return false;
|
||||
}
|
||||
if (!(pg.state & PG_ACTIVE))
|
||||
if (!(pg.state & (PG_ACTIVE | PG_REPEERING)))
|
||||
{
|
||||
finish_stop_pg(pg);
|
||||
return true;
|
||||
}
|
||||
pg.state = pg.state & ~PG_ACTIVE | PG_STOPPING;
|
||||
if (pg.inflight == 0 && !pg.flush_batch &&
|
||||
// We must either forget all PG's unstable writes or wait for it to become clean
|
||||
dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) == dirty_pgs.end())
|
||||
pg.state = pg.state & ~PG_ACTIVE & ~PG_REPEERING | PG_STOPPING;
|
||||
if (pg.inflight == 0 && !pg.flush_batch)
|
||||
{
|
||||
finish_stop_pg(pg);
|
||||
}
|
||||
|
|
|
@ -430,12 +430,13 @@ void pg_t::calc_object_states(int log_level)
|
|||
void pg_t::print_state()
|
||||
{
|
||||
printf(
|
||||
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
|
||||
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
|
||||
(state & PG_STARTING) ? "starting" : "",
|
||||
(state & PG_OFFLINE) ? "offline" : "",
|
||||
(state & PG_PEERING) ? "peering" : "",
|
||||
(state & PG_INCOMPLETE) ? "incomplete" : "",
|
||||
(state & PG_ACTIVE) ? "active" : "",
|
||||
(state & PG_REPEERING) ? "repeering" : "",
|
||||
(state & PG_STOPPING) ? "stopping" : "",
|
||||
(state & PG_DEGRADED) ? " + degraded" : "",
|
||||
(state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "",
|
||||
|
|
|
@ -18,7 +18,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
|||
// Our EC scheme stores data in fixed chunks equal to (K*block size)
|
||||
// K = (pg_size-parity_chunks) in case of EC/XOR, or 1 for replicated pools
|
||||
pool_id_t pool_id = INODE_POOL(cur_op->req.rw.inode);
|
||||
// FIXME: We have to access pool config here, so make sure that it doesn't change while its PGs are active...
|
||||
// Note: We read pool config here, so we must NOT change it when PGs are active
|
||||
auto pool_cfg_it = st_cli.pool_config.find(pool_id);
|
||||
if (pool_cfg_it == st_cli.pool_config.end())
|
||||
{
|
||||
|
@ -44,8 +44,8 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
|||
return false;
|
||||
}
|
||||
if ((cur_op->req.rw.offset + cur_op->req.rw.len) > (oid.stripe + pg_block_size) ||
|
||||
(cur_op->req.rw.offset % bs_disk_alignment) != 0 ||
|
||||
(cur_op->req.rw.len % bs_disk_alignment) != 0)
|
||||
(cur_op->req.rw.offset % bs_bitmap_granularity) != 0 ||
|
||||
(cur_op->req.rw.len % bs_bitmap_granularity) != 0)
|
||||
{
|
||||
finish_op(cur_op, -EINVAL);
|
||||
return false;
|
||||
|
@ -64,7 +64,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
|||
return true;
|
||||
}
|
||||
|
||||
static uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state)
|
||||
uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state)
|
||||
{
|
||||
if (!(pg.state & (PG_HAS_INCOMPLETE | PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
|
||||
{
|
||||
|
@ -177,609 +177,6 @@ resume_2:
|
|||
finish_op(cur_op, cur_op->req.rw.len);
|
||||
}
|
||||
|
||||
// Puts <cur_op> into the PG's write queue and tells whether it may proceed right away.
// Returns false when flush actions are pending for the object, or when another write to
// the same object is already queued (in that case op_data->st is set to 1);
// returns true otherwise. The operation is queued in all three cases.
bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
{
    osd_primary_op_data_t *op_data = cur_op->op_data;
    // Check if actions are pending for this object
    auto act_it = pg.flush_actions.lower_bound((obj_piece_id_t){
        .oid = op_data->oid,
        .osd_num = 0,
    });
    const bool flush_pending = act_it != pg.flush_actions.end() &&
        act_it->first.oid.inode == op_data->oid.inode &&
        (act_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe;
    // Check if there are other write requests to the same object;
    // this has to be decided before the current operation is queued
    const bool other_writes = !flush_pending &&
        pg.write_queue.find(op_data->oid) != pg.write_queue.end();
    if (other_writes)
    {
        op_data->st = 1;
    }
    pg.write_queue.emplace(op_data->oid, cur_op);
    return !flush_pending && !other_writes;
}
|
||||
|
||||
void osd_t::continue_primary_write(osd_op_t *cur_op)
|
||||
{
|
||||
if (!cur_op->op_data && !prepare_primary_rw(cur_op))
|
||||
{
|
||||
return;
|
||||
}
|
||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
|
||||
if (op_data->st == 1) goto resume_1;
|
||||
else if (op_data->st == 2) goto resume_2;
|
||||
else if (op_data->st == 3) goto resume_3;
|
||||
else if (op_data->st == 4) goto resume_4;
|
||||
else if (op_data->st == 5) goto resume_5;
|
||||
else if (op_data->st == 6) goto resume_6;
|
||||
else if (op_data->st == 7) goto resume_7;
|
||||
else if (op_data->st == 8) goto resume_8;
|
||||
else if (op_data->st == 9) goto resume_9;
|
||||
else if (op_data->st == 10) goto resume_10;
|
||||
assert(op_data->st == 0);
|
||||
if (!check_write_queue(cur_op, pg))
|
||||
{
|
||||
return;
|
||||
}
|
||||
resume_1:
|
||||
// Determine blocks to read and write
|
||||
// Missing chunks are allowed to be overwritten even in incomplete objects
|
||||
// FIXME: Allow to do small writes to the old (degraded/misplaced) OSD set for lower performance impact
|
||||
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
|
||||
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
// Simplified algorithm
|
||||
op_data->stripes[0].write_start = op_data->stripes[0].req_start;
|
||||
op_data->stripes[0].write_end = op_data->stripes[0].req_end;
|
||||
op_data->stripes[0].write_buf = cur_op->buf;
|
||||
if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
|
||||
op_data->stripes[0].write_end != bs_block_size))
|
||||
{
|
||||
// Object is degraded/misplaced and will be moved to <write_osd_set>
|
||||
op_data->stripes[0].read_start = 0;
|
||||
op_data->stripes[0].read_end = bs_block_size;
|
||||
cur_op->rmw_buf = op_data->stripes[0].read_buf = memalign_or_die(MEM_ALIGNMENT, bs_block_size);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set,
|
||||
pg.pg_size, op_data->pg_data_size, pg.pg_cursize, pg.cur_set.data(), bs_block_size);
|
||||
if (!cur_op->rmw_buf)
|
||||
{
|
||||
// Refuse partial overwrite of an incomplete object
|
||||
cur_op->reply.hdr.retval = -EINVAL;
|
||||
goto continue_others;
|
||||
}
|
||||
}
|
||||
// Read required blocks
|
||||
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, pg.pg_size, op_data->prev_set, cur_op);
|
||||
resume_2:
|
||||
op_data->st = 2;
|
||||
return;
|
||||
resume_3:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||
return;
|
||||
}
|
||||
// Save version override for parallel reads
|
||||
pg.ver_override[op_data->oid] = op_data->fact_ver;
|
||||
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
// Only (possibly) copy new data from the request into the recovery buffer
|
||||
if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
|
||||
op_data->stripes[0].write_end != bs_block_size))
|
||||
{
|
||||
memcpy(
|
||||
op_data->stripes[0].read_buf + op_data->stripes[0].req_start,
|
||||
op_data->stripes[0].write_buf,
|
||||
op_data->stripes[0].req_end - op_data->stripes[0].req_start
|
||||
);
|
||||
op_data->stripes[0].write_buf = op_data->stripes[0].read_buf;
|
||||
op_data->stripes[0].write_start = 0;
|
||||
op_data->stripes[0].write_end = bs_block_size;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Recover missing stripes, calculate parity
|
||||
if (pg.scheme == POOL_SCHEME_XOR)
|
||||
{
|
||||
calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
|
||||
}
|
||||
else if (pg.scheme == POOL_SCHEME_JERASURE)
|
||||
{
|
||||
calc_rmw_parity_jerasure(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
|
||||
}
|
||||
}
|
||||
// Send writes
|
||||
if ((op_data->fact_ver >> (64-PG_EPOCH_BITS)) < pg.epoch)
|
||||
{
|
||||
op_data->target_ver = ((uint64_t)pg.epoch << (64-PG_EPOCH_BITS)) | 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ((op_data->fact_ver & (1ul<<(64-PG_EPOCH_BITS) - 1)) == (1ul<<(64-PG_EPOCH_BITS) - 1))
|
||||
{
|
||||
assert(pg.epoch != ((1ul << PG_EPOCH_BITS)-1));
|
||||
pg.epoch++;
|
||||
}
|
||||
op_data->target_ver = op_data->fact_ver + 1;
|
||||
}
|
||||
if (pg.epoch > pg.reported_epoch)
|
||||
{
|
||||
// Report newer epoch before writing
|
||||
// FIXME: We may report only one PG state here...
|
||||
this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
|
||||
pg.history_changed = true;
|
||||
report_pg_states();
|
||||
resume_10:
|
||||
if (pg.epoch > pg.reported_epoch)
|
||||
{
|
||||
op_data->st = 10;
|
||||
return;
|
||||
}
|
||||
}
|
||||
submit_primary_subops(SUBMIT_WRITE, op_data->target_ver, pg.pg_size, pg.cur_set.data(), cur_op);
|
||||
resume_4:
|
||||
op_data->st = 4;
|
||||
return;
|
||||
resume_5:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||
return;
|
||||
}
|
||||
resume_6:
|
||||
resume_7:
|
||||
if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
|
||||
{
|
||||
// FIXME: Check for immediate_commit == IMMEDIATE_SMALL
|
||||
return;
|
||||
}
|
||||
if (op_data->fact_ver == 1)
|
||||
{
|
||||
// Object is created
|
||||
pg.clean_count++;
|
||||
pg.total_count++;
|
||||
}
|
||||
if (op_data->object_state)
|
||||
{
|
||||
{
|
||||
int recovery_type = op_data->object_state->state & (OBJ_DEGRADED|OBJ_INCOMPLETE) ? 0 : 1;
|
||||
recovery_stat_count[0][recovery_type]++;
|
||||
if (!recovery_stat_count[0][recovery_type])
|
||||
{
|
||||
recovery_stat_count[0][recovery_type]++;
|
||||
recovery_stat_bytes[0][recovery_type] = 0;
|
||||
}
|
||||
for (int role = 0; role < (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size); role++)
|
||||
{
|
||||
recovery_stat_bytes[0][recovery_type] += op_data->stripes[role].write_end - op_data->stripes[role].write_start;
|
||||
}
|
||||
}
|
||||
// Any kind of a non-clean object can have extra chunks, because we don't record objects
|
||||
// as degraded & misplaced or incomplete & misplaced at the same time. So try to remove extra chunks
|
||||
if (immediate_commit != IMMEDIATE_ALL)
|
||||
{
|
||||
// We can't remove extra chunks yet if fsyncs are explicit, because
|
||||
// new copies may not be committed to stable storage yet
|
||||
// We can only remove extra chunks after a successful SYNC for this PG
|
||||
for (auto & chunk: op_data->object_state->osd_set)
|
||||
{
|
||||
// Check is the same as in submit_primary_del_subops()
|
||||
if (op_data->scheme == POOL_SCHEME_REPLICATED
|
||||
? !contains_osd(pg.cur_set.data(), pg.pg_size, chunk.osd_num)
|
||||
: (chunk.osd_num != pg.cur_set[chunk.role]))
|
||||
{
|
||||
pg.copies_to_delete_after_sync.push_back((obj_ver_osd_t){
|
||||
.osd_num = chunk.osd_num,
|
||||
.oid = {
|
||||
.inode = op_data->oid.inode,
|
||||
.stripe = op_data->oid.stripe | (op_data->scheme == POOL_SCHEME_REPLICATED ? 0 : chunk.role),
|
||||
},
|
||||
.version = op_data->fact_ver,
|
||||
});
|
||||
copies_to_delete_after_sync_count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
|
||||
if (op_data->n_subops > 0)
|
||||
{
|
||||
resume_8:
|
||||
op_data->st = 8;
|
||||
return;
|
||||
resume_9:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Clear object state
|
||||
remove_object_from_state(op_data->oid, op_data->object_state, pg);
|
||||
pg.clean_count++;
|
||||
}
|
||||
cur_op->reply.hdr.retval = cur_op->req.rw.len;
|
||||
continue_others:
|
||||
// Remove version override
|
||||
pg.ver_override.erase(op_data->oid);
|
||||
object_id oid = op_data->oid;
|
||||
// Remove the operation from queue before calling finish_op so it doesn't see the completed operation in queue
|
||||
auto next_it = pg.write_queue.find(oid);
|
||||
if (next_it != pg.write_queue.end() && next_it->second == cur_op)
|
||||
{
|
||||
pg.write_queue.erase(next_it++);
|
||||
}
|
||||
// finish_op would invalidate next_it if it cleared pg.write_queue, but it doesn't do that :)
|
||||
finish_op(cur_op, cur_op->reply.hdr.retval);
|
||||
// Continue other write operations to the same object
|
||||
if (next_it != pg.write_queue.end() && next_it->first == oid)
|
||||
{
|
||||
osd_op_t *next_op = next_it->second;
|
||||
continue_primary_write(next_op);
|
||||
}
|
||||
}
|
||||
|
||||
// Record a finished write as unstable, or stabilize it right away.
//
// With immediate_commit == IMMEDIATE_ALL, non-replicated pools get STABILIZE subops
// submitted immediately for every chunk of <loc_set>; the function then suspends the
// operation and must be re-entered to collect the result. In all other modes the write
// is only remembered in dirty_osds / unstable_writes / dirty_pgs for a later SYNC.
//
// This is a resumable helper of the caller's state machine:
//   op_data->st == base_state   -> re-entered while STABILIZE subops are still pending
//   op_data->st == base_state+1 -> re-entered to check the STABILIZE result
// (presumably the subop completion handler advances op_data->st - not visible here).
//
// Returns true when the caller may go on, false when the operation was suspended
// or when it was cancelled via pg_cancel_write_queue() because of subop errors.
bool osd_t::remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
{
    osd_primary_op_data_t *op_data = cur_op->op_data;
    if (op_data->st == base_state)
    {
        goto resume_6;
    }
    else if (op_data->st == base_state+1)
    {
        goto resume_7;
    }
    // FIXME: Check for immediate_commit == IMMEDIATE_SMALL
    if (immediate_commit == IMMEDIATE_ALL)
    {
        if (op_data->scheme != POOL_SCHEME_REPLICATED)
        {
            // Send STABILIZE ops immediately
            op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
            op_data->unstable_writes = new obj_ver_id[loc_set.size()];
            {
                // One object version per chunk, so every OSD gets a range of length 1
                int last_start = 0;
                for (auto & chunk: loc_set)
                {
                    op_data->unstable_writes[last_start] = (obj_ver_id){
                        .oid = {
                            .inode = op_data->oid.inode,
                            .stripe = op_data->oid.stripe | chunk.role,
                        },
                        .version = op_data->fact_ver,
                    };
                    op_data->unstable_write_osds->push_back((unstable_osd_num_t){
                        .osd_num = chunk.osd_num,
                        .start = last_start,
                        .len = 1,
                    });
                    last_start++;
                }
            }
            submit_primary_stab_subops(cur_op);
resume_6:
            // Suspend in the caller-provided state instead of a hard-coded number,
            // so the dispatch at the top of the function matches for any base_state
            op_data->st = base_state;
            return false;
resume_7:
            // FIXME: Free those in the destructor?
            delete op_data->unstable_write_osds;
            delete[] op_data->unstable_writes;
            op_data->unstable_writes = NULL;
            op_data->unstable_write_osds = NULL;
            if (op_data->errors > 0)
            {
                pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
                return false;
            }
        }
    }
    else
    {
        if (op_data->scheme != POOL_SCHEME_REPLICATED)
        {
            // Remember version as unstable for EC/XOR
            for (auto & chunk: loc_set)
            {
                this->dirty_osds.insert(chunk.osd_num);
                this->unstable_writes[(osd_object_id_t){
                    .osd_num = chunk.osd_num,
                    .oid = {
                        .inode = op_data->oid.inode,
                        .stripe = op_data->oid.stripe | chunk.role,
                    },
                }] = op_data->fact_ver;
            }
        }
        else
        {
            // Only remember to sync OSDs for replicated pools
            for (auto & chunk: loc_set)
            {
                this->dirty_osds.insert(chunk.osd_num);
            }
        }
        // Remember PG as dirty to drop the connection when PG goes offline
        // (this is required because of the "lazy sync")
        auto cl_it = c_cli.clients.find(cur_op->peer_fd);
        if (cl_it != c_cli.clients.end())
        {
            cl_it->second->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
        }
        dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
    }
    return true;
}
|
||||
|
||||
// Save and clear unstable_writes -> SYNC all -> STABLE all
|
||||
void osd_t::continue_primary_sync(osd_op_t *cur_op)
|
||||
{
|
||||
if (!cur_op->op_data)
|
||||
{
|
||||
cur_op->op_data = (osd_primary_op_data_t*)calloc_or_die(1, sizeof(osd_primary_op_data_t));
|
||||
}
|
||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||
if (op_data->st == 1) goto resume_1;
|
||||
else if (op_data->st == 2) goto resume_2;
|
||||
else if (op_data->st == 3) goto resume_3;
|
||||
else if (op_data->st == 4) goto resume_4;
|
||||
else if (op_data->st == 5) goto resume_5;
|
||||
else if (op_data->st == 6) goto resume_6;
|
||||
else if (op_data->st == 7) goto resume_7;
|
||||
else if (op_data->st == 8) goto resume_8;
|
||||
assert(op_data->st == 0);
|
||||
if (syncs_in_progress.size() > 0)
|
||||
{
|
||||
// Wait for previous syncs, if any
|
||||
// FIXME: We may try to execute the current one in parallel, like in Blockstore, but I'm not sure if it matters at all
|
||||
syncs_in_progress.push_back(cur_op);
|
||||
op_data->st = 1;
|
||||
resume_1:
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
syncs_in_progress.push_back(cur_op);
|
||||
}
|
||||
resume_2:
|
||||
if (dirty_osds.size() == 0)
|
||||
{
|
||||
// Nothing to sync
|
||||
goto finish;
|
||||
}
|
||||
// Save and clear unstable_writes
|
||||
// In theory it is possible to do in on a per-client basis, but this seems to be an unnecessary complication
|
||||
// It would be cool not to copy these here at all, but someone has to deduplicate them by object IDs anyway
|
||||
if (unstable_writes.size() > 0)
|
||||
{
|
||||
op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
|
||||
op_data->unstable_writes = new obj_ver_id[this->unstable_writes.size()];
|
||||
osd_num_t last_osd = 0;
|
||||
int last_start = 0, last_end = 0;
|
||||
for (auto it = this->unstable_writes.begin(); it != this->unstable_writes.end(); it++)
|
||||
{
|
||||
if (last_osd != it->first.osd_num)
|
||||
{
|
||||
if (last_osd != 0)
|
||||
{
|
||||
op_data->unstable_write_osds->push_back((unstable_osd_num_t){
|
||||
.osd_num = last_osd,
|
||||
.start = last_start,
|
||||
.len = last_end - last_start,
|
||||
});
|
||||
}
|
||||
last_osd = it->first.osd_num;
|
||||
last_start = last_end;
|
||||
}
|
||||
op_data->unstable_writes[last_end] = (obj_ver_id){
|
||||
.oid = it->first.oid,
|
||||
.version = it->second,
|
||||
};
|
||||
last_end++;
|
||||
}
|
||||
if (last_osd != 0)
|
||||
{
|
||||
op_data->unstable_write_osds->push_back((unstable_osd_num_t){
|
||||
.osd_num = last_osd,
|
||||
.start = last_start,
|
||||
.len = last_end - last_start,
|
||||
});
|
||||
}
|
||||
this->unstable_writes.clear();
|
||||
}
|
||||
{
|
||||
void *dirty_buf = malloc_or_die(
|
||||
sizeof(pool_pg_num_t)*dirty_pgs.size() +
|
||||
sizeof(osd_num_t)*dirty_osds.size() +
|
||||
sizeof(obj_ver_osd_t)*this->copies_to_delete_after_sync_count
|
||||
);
|
||||
op_data->dirty_pgs = (pool_pg_num_t*)dirty_buf;
|
||||
op_data->dirty_osds = (osd_num_t*)(dirty_buf + sizeof(pool_pg_num_t)*dirty_pgs.size());
|
||||
op_data->dirty_pg_count = dirty_pgs.size();
|
||||
op_data->dirty_osd_count = dirty_osds.size();
|
||||
if (this->copies_to_delete_after_sync_count)
|
||||
{
|
||||
op_data->copies_to_delete_count = 0;
|
||||
op_data->copies_to_delete = (obj_ver_osd_t*)(op_data->dirty_osds + op_data->dirty_osd_count);
|
||||
for (auto dirty_pg_num: dirty_pgs)
|
||||
{
|
||||
auto & pg = pgs.at(dirty_pg_num);
|
||||
assert(pg.copies_to_delete_after_sync.size() <= this->copies_to_delete_after_sync_count);
|
||||
memcpy(
|
||||
op_data->copies_to_delete + op_data->copies_to_delete_count,
|
||||
pg.copies_to_delete_after_sync.data(),
|
||||
sizeof(obj_ver_osd_t)*pg.copies_to_delete_after_sync.size()
|
||||
);
|
||||
op_data->copies_to_delete_count += pg.copies_to_delete_after_sync.size();
|
||||
this->copies_to_delete_after_sync_count -= pg.copies_to_delete_after_sync.size();
|
||||
pg.copies_to_delete_after_sync.clear();
|
||||
}
|
||||
assert(this->copies_to_delete_after_sync_count == 0);
|
||||
}
|
||||
int dpg = 0;
|
||||
for (auto dirty_pg_num: dirty_pgs)
|
||||
{
|
||||
pgs.at(dirty_pg_num).inflight++;
|
||||
op_data->dirty_pgs[dpg++] = dirty_pg_num;
|
||||
}
|
||||
dirty_pgs.clear();
|
||||
dpg = 0;
|
||||
for (auto osd_num: dirty_osds)
|
||||
{
|
||||
op_data->dirty_osds[dpg++] = osd_num;
|
||||
}
|
||||
dirty_osds.clear();
|
||||
}
|
||||
if (immediate_commit != IMMEDIATE_ALL)
|
||||
{
|
||||
// SYNC
|
||||
submit_primary_sync_subops(cur_op);
|
||||
resume_3:
|
||||
op_data->st = 3;
|
||||
return;
|
||||
resume_4:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
goto resume_6;
|
||||
}
|
||||
}
|
||||
if (op_data->unstable_writes)
|
||||
{
|
||||
// Stabilize version sets, if any
|
||||
submit_primary_stab_subops(cur_op);
|
||||
resume_5:
|
||||
op_data->st = 5;
|
||||
return;
|
||||
}
|
||||
resume_6:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
// Return PGs and OSDs back into their dirty sets
|
||||
for (int i = 0; i < op_data->dirty_pg_count; i++)
|
||||
{
|
||||
dirty_pgs.insert(op_data->dirty_pgs[i]);
|
||||
}
|
||||
for (int i = 0; i < op_data->dirty_osd_count; i++)
|
||||
{
|
||||
dirty_osds.insert(op_data->dirty_osds[i]);
|
||||
}
|
||||
if (op_data->unstable_writes)
|
||||
{
|
||||
// Return objects back into the unstable write set
|
||||
for (auto unstable_osd: *(op_data->unstable_write_osds))
|
||||
{
|
||||
for (int i = 0; i < unstable_osd.len; i++)
|
||||
{
|
||||
// Except those from peered PGs
|
||||
auto & w = op_data->unstable_writes[i];
|
||||
pool_pg_num_t wpg = {
|
||||
.pool_id = INODE_POOL(w.oid.inode),
|
||||
.pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
|
||||
};
|
||||
if (pgs.at(wpg).state & PG_ACTIVE)
|
||||
{
|
||||
uint64_t & dest = this->unstable_writes[(osd_object_id_t){
|
||||
.osd_num = unstable_osd.osd_num,
|
||||
.oid = w.oid,
|
||||
}];
|
||||
dest = dest < w.version ? w.version : dest;
|
||||
dirty_pgs.insert(wpg);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (op_data->copies_to_delete)
|
||||
{
|
||||
// Return 'copies to delete' back into respective PGs
|
||||
for (int i = 0; i < op_data->copies_to_delete_count; i++)
|
||||
{
|
||||
auto & w = op_data->copies_to_delete[i];
|
||||
auto & pg = pgs.at((pool_pg_num_t){
|
||||
.pool_id = INODE_POOL(w.oid.inode),
|
||||
.pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
|
||||
});
|
||||
if (pg.state & PG_ACTIVE)
|
||||
{
|
||||
pg.copies_to_delete_after_sync.push_back(w);
|
||||
copies_to_delete_after_sync_count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (op_data->copies_to_delete)
|
||||
{
|
||||
// Actually delete copies which we wanted to delete
|
||||
submit_primary_del_batch(cur_op, op_data->copies_to_delete, op_data->copies_to_delete_count);
|
||||
resume_7:
|
||||
op_data->st = 7;
|
||||
return;
|
||||
resume_8:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
goto resume_6;
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < op_data->dirty_pg_count; i++)
|
||||
{
|
||||
auto & pg = pgs.at(op_data->dirty_pgs[i]);
|
||||
pg.inflight--;
|
||||
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch &&
|
||||
// We must either forget all PG's unstable writes or wait for it to become clean
|
||||
dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) == dirty_pgs.end())
|
||||
{
|
||||
finish_stop_pg(pg);
|
||||
}
|
||||
}
|
||||
// FIXME: Free those in the destructor?
|
||||
free(op_data->dirty_pgs);
|
||||
op_data->dirty_pgs = NULL;
|
||||
op_data->dirty_osds = NULL;
|
||||
if (op_data->unstable_writes)
|
||||
{
|
||||
delete op_data->unstable_write_osds;
|
||||
delete[] op_data->unstable_writes;
|
||||
op_data->unstable_writes = NULL;
|
||||
op_data->unstable_write_osds = NULL;
|
||||
}
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||
}
|
||||
else
|
||||
{
|
||||
finish:
|
||||
if (cur_op->peer_fd)
|
||||
{
|
||||
auto it = c_cli.clients.find(cur_op->peer_fd);
|
||||
if (it != c_cli.clients.end())
|
||||
it->second->dirty_pgs.clear();
|
||||
}
|
||||
finish_op(cur_op, 0);
|
||||
}
|
||||
assert(syncs_in_progress.front() == cur_op);
|
||||
syncs_in_progress.pop_front();
|
||||
if (syncs_in_progress.size() > 0)
|
||||
{
|
||||
cur_op = syncs_in_progress.front();
|
||||
op_data = cur_op->op_data;
|
||||
op_data->st++;
|
||||
goto resume_2;
|
||||
}
|
||||
}
|
||||
|
||||
// Decrement pg_osd_set_state_t's object_count and change PG state accordingly
|
||||
void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t & pg)
|
||||
{
|
||||
|
@ -818,10 +215,14 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object
|
|||
{
|
||||
throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
|
||||
}
|
||||
object_state->object_count--;
|
||||
if (!object_state->object_count)
|
||||
}
|
||||
|
||||
// Drop one object reference from *object_state. When the object count reaches zero,
// the state entry is erased from pg.state_dict and the caller's pointer is reset to NULL.
// Does nothing if *object_state is already NULL.
void osd_t::free_object_state(pg_t & pg, pg_osd_set_state_t **object_state)
{
    if (*object_state && !(--(*object_state)->object_count))
    {
        // <object_state> is a pointer to a pointer, so it has to be dereferenced first;
        // the stale `object_state->osd_set` variant of this line is removed
        pg.state_dict.erase((*object_state)->osd_set);
        *object_state = NULL;
    }
}
|
||||
|
||||
|
@ -887,22 +288,21 @@ resume_5:
|
|||
else
|
||||
{
|
||||
remove_object_from_state(op_data->oid, op_data->object_state, pg);
|
||||
free_object_state(pg, &op_data->object_state);
|
||||
}
|
||||
pg.total_count--;
|
||||
object_id oid = op_data->oid;
|
||||
osd_op_t *next_op = NULL;
|
||||
auto next_it = pg.write_queue.find(op_data->oid);
|
||||
if (next_it != pg.write_queue.end() && next_it->second == cur_op)
|
||||
{
|
||||
pg.write_queue.erase(next_it++);
|
||||
if (next_it != pg.write_queue.end() && next_it->first == op_data->oid)
|
||||
next_op = next_it->second;
|
||||
}
|
||||
finish_op(cur_op, cur_op->req.rw.len);
|
||||
// Continue other write operations to the same object
|
||||
auto next_it = pg.write_queue.find(oid);
|
||||
auto this_it = next_it;
|
||||
if (this_it != pg.write_queue.end() && this_it->second == cur_op)
|
||||
if (next_op)
|
||||
{
|
||||
next_it++;
|
||||
pg.write_queue.erase(this_it);
|
||||
if (next_it != pg.write_queue.end() &&
|
||||
next_it->first == oid)
|
||||
{
|
||||
osd_op_t *next_op = next_it->second;
|
||||
// Continue next write to the same object
|
||||
continue_primary_write(next_op);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -43,12 +43,14 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
|
|||
auto & pg = pgs.at({ .pool_id = INODE_POOL(cur_op->op_data->oid.inode), .pg_num = cur_op->op_data->pg_num });
|
||||
pg.inflight--;
|
||||
assert(pg.inflight >= 0);
|
||||
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch &&
|
||||
// We must either forget all PG's unstable writes or wait for it to become clean
|
||||
dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) == dirty_pgs.end())
|
||||
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
|
||||
{
|
||||
finish_stop_pg(pg);
|
||||
}
|
||||
else if ((pg.state & PG_REPEERING) && pg.inflight == 0 && !pg.flush_batch)
|
||||
{
|
||||
start_pg_peering(pg);
|
||||
}
|
||||
}
|
||||
assert(!cur_op->op_data->subops);
|
||||
assert(!cur_op->op_data->unstable_write_osds);
|
||||
|
@ -194,14 +196,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s
|
|||
}
|
||||
subops[i].callback = [cur_op, this](osd_op_t *subop)
|
||||
{
|
||||
int fail_fd = subop->req.hdr.opcode == OSD_OP_SEC_WRITE &&
|
||||
subop->reply.hdr.retval != subop->req.sec_rw.len ? subop->peer_fd : -1;
|
||||
handle_primary_subop(subop, cur_op);
|
||||
if (fail_fd >= 0)
|
||||
{
|
||||
// write operation failed, drop the connection
|
||||
c_cli.stop_client(fail_fd);
|
||||
}
|
||||
};
|
||||
c_cli.outbox_push(&subops[i]);
|
||||
}
|
||||
|
@ -247,6 +242,7 @@ void osd_t::handle_primary_bs_subop(osd_op_t *subop)
|
|||
}
|
||||
delete bs_op;
|
||||
subop->bs_op = NULL;
|
||||
subop->peer_fd = -1;
|
||||
handle_primary_subop(subop, cur_op);
|
||||
}
|
||||
|
||||
|
@ -288,6 +284,11 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
|||
op_data->epipe++;
|
||||
}
|
||||
op_data->errors++;
|
||||
if (subop->peer_fd >= 0)
|
||||
{
|
||||
// Drop connection on any error
|
||||
c_cli.stop_client(subop->peer_fd);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -427,7 +428,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
|
|||
{
|
||||
subops[i].op_type = OSD_OP_OUT;
|
||||
subops[i].peer_fd = c_cli.osd_peer_fds.at(chunk.osd_num);
|
||||
subops[i].req.sec_del = {
|
||||
subops[i].req = (osd_any_op_t){ .sec_del = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = c_cli.next_subop_id++,
|
||||
|
@ -435,23 +436,17 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
|
|||
},
|
||||
.oid = chunk.oid,
|
||||
.version = chunk.version,
|
||||
};
|
||||
} };
|
||||
subops[i].callback = [cur_op, this](osd_op_t *subop)
|
||||
{
|
||||
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
|
||||
handle_primary_subop(subop, cur_op);
|
||||
if (fail_fd >= 0)
|
||||
{
|
||||
// delete operation failed, drop the connection
|
||||
c_cli.stop_client(fail_fd);
|
||||
}
|
||||
};
|
||||
c_cli.outbox_push(&subops[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
|
||||
int osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
|
||||
{
|
||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||
int n_osds = op_data->dirty_osd_count;
|
||||
|
@ -459,6 +454,7 @@ void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
|
|||
op_data->done = op_data->errors = 0;
|
||||
op_data->n_subops = n_osds;
|
||||
op_data->subops = subops;
|
||||
std::map<uint64_t, int>::iterator peer_it;
|
||||
for (int i = 0; i < n_osds; i++)
|
||||
{
|
||||
osd_num_t sync_osd = op_data->dirty_osds[i];
|
||||
|
@ -475,30 +471,35 @@ void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
|
|||
});
|
||||
bs->enqueue_op(subops[i].bs_op);
|
||||
}
|
||||
else
|
||||
else if ((peer_it = c_cli.osd_peer_fds.find(sync_osd)) != c_cli.osd_peer_fds.end())
|
||||
{
|
||||
subops[i].op_type = OSD_OP_OUT;
|
||||
subops[i].peer_fd = c_cli.osd_peer_fds.at(sync_osd);
|
||||
subops[i].req.sec_sync = {
|
||||
subops[i].peer_fd = peer_it->second;
|
||||
subops[i].req = (osd_any_op_t){ .sec_sync = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = c_cli.next_subop_id++,
|
||||
.opcode = OSD_OP_SEC_SYNC,
|
||||
},
|
||||
};
|
||||
} };
|
||||
subops[i].callback = [cur_op, this](osd_op_t *subop)
|
||||
{
|
||||
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
|
||||
handle_primary_subop(subop, cur_op);
|
||||
if (fail_fd >= 0)
|
||||
{
|
||||
// sync operation failed, drop the connection
|
||||
c_cli.stop_client(fail_fd);
|
||||
}
|
||||
};
|
||||
c_cli.outbox_push(&subops[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
op_data->done++;
|
||||
}
|
||||
}
|
||||
if (op_data->done >= op_data->n_subops)
|
||||
{
|
||||
delete[] op_data->subops;
|
||||
op_data->subops = NULL;
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
||||
|
@ -531,24 +532,18 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
|||
{
|
||||
subops[i].op_type = OSD_OP_OUT;
|
||||
subops[i].peer_fd = c_cli.osd_peer_fds.at(stab_osd.osd_num);
|
||||
subops[i].req.sec_stab = {
|
||||
subops[i].req = (osd_any_op_t){ .sec_stab = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = c_cli.next_subop_id++,
|
||||
.opcode = OSD_OP_SEC_STABILIZE,
|
||||
},
|
||||
.len = (uint64_t)(stab_osd.len * sizeof(obj_ver_id)),
|
||||
};
|
||||
} };
|
||||
subops[i].iov.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id));
|
||||
subops[i].callback = [cur_op, this](osd_op_t *subop)
|
||||
{
|
||||
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
|
||||
handle_primary_subop(subop, cur_op);
|
||||
if (fail_fd >= 0)
|
||||
{
|
||||
// sync operation failed, drop the connection
|
||||
c_cli.stop_client(fail_fd);
|
||||
}
|
||||
};
|
||||
c_cli.outbox_push(&subops[i]);
|
||||
}
|
||||
|
@ -566,7 +561,7 @@ void osd_t::pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid,
|
|||
return;
|
||||
}
|
||||
std::vector<osd_op_t*> cancel_ops;
|
||||
while (it != pg.write_queue.end())
|
||||
while (it != pg.write_queue.end() && it->first == oid)
|
||||
{
|
||||
cancel_ops.push_back(it->second);
|
||||
it++;
|
||||
|
|
|
@ -0,0 +1,265 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "osd_primary.h"
|
||||
|
||||
// Save and clear unstable_writes -> SYNC all -> STABLE all
|
||||
void osd_t::continue_primary_sync(osd_op_t *cur_op)
|
||||
{
|
||||
if (!cur_op->op_data)
|
||||
{
|
||||
cur_op->op_data = (osd_primary_op_data_t*)calloc_or_die(1, sizeof(osd_primary_op_data_t));
|
||||
}
|
||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||
if (op_data->st == 1) goto resume_1;
|
||||
else if (op_data->st == 2) goto resume_2;
|
||||
else if (op_data->st == 3) goto resume_3;
|
||||
else if (op_data->st == 4) goto resume_4;
|
||||
else if (op_data->st == 5) goto resume_5;
|
||||
else if (op_data->st == 6) goto resume_6;
|
||||
else if (op_data->st == 7) goto resume_7;
|
||||
else if (op_data->st == 8) goto resume_8;
|
||||
assert(op_data->st == 0);
|
||||
if (syncs_in_progress.size() > 0)
|
||||
{
|
||||
// Wait for previous syncs, if any
|
||||
// FIXME: We may try to execute the current one in parallel, like in Blockstore, but I'm not sure if it matters at all
|
||||
syncs_in_progress.push_back(cur_op);
|
||||
op_data->st = 1;
|
||||
resume_1:
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
syncs_in_progress.push_back(cur_op);
|
||||
}
|
||||
resume_2:
|
||||
if (dirty_osds.size() == 0)
|
||||
{
|
||||
// Nothing to sync
|
||||
goto finish;
|
||||
}
|
||||
// Save and clear unstable_writes
|
||||
// In theory it is possible to do in on a per-client basis, but this seems to be an unnecessary complication
|
||||
// It would be cool not to copy these here at all, but someone has to deduplicate them by object IDs anyway
|
||||
if (unstable_writes.size() > 0)
|
||||
{
|
||||
op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
|
||||
op_data->unstable_writes = new obj_ver_id[this->unstable_writes.size()];
|
||||
osd_num_t last_osd = 0;
|
||||
int last_start = 0, last_end = 0;
|
||||
for (auto it = this->unstable_writes.begin(); it != this->unstable_writes.end(); it++)
|
||||
{
|
||||
if (last_osd != it->first.osd_num)
|
||||
{
|
||||
if (last_osd != 0)
|
||||
{
|
||||
op_data->unstable_write_osds->push_back((unstable_osd_num_t){
|
||||
.osd_num = last_osd,
|
||||
.start = last_start,
|
||||
.len = last_end - last_start,
|
||||
});
|
||||
}
|
||||
last_osd = it->first.osd_num;
|
||||
last_start = last_end;
|
||||
}
|
||||
op_data->unstable_writes[last_end] = (obj_ver_id){
|
||||
.oid = it->first.oid,
|
||||
.version = it->second,
|
||||
};
|
||||
last_end++;
|
||||
}
|
||||
if (last_osd != 0)
|
||||
{
|
||||
op_data->unstable_write_osds->push_back((unstable_osd_num_t){
|
||||
.osd_num = last_osd,
|
||||
.start = last_start,
|
||||
.len = last_end - last_start,
|
||||
});
|
||||
}
|
||||
this->unstable_writes.clear();
|
||||
}
|
||||
{
|
||||
void *dirty_buf = malloc_or_die(
|
||||
sizeof(pool_pg_num_t)*dirty_pgs.size() +
|
||||
sizeof(osd_num_t)*dirty_osds.size() +
|
||||
sizeof(obj_ver_osd_t)*this->copies_to_delete_after_sync_count
|
||||
);
|
||||
op_data->dirty_pgs = (pool_pg_num_t*)dirty_buf;
|
||||
op_data->dirty_osds = (osd_num_t*)(dirty_buf + sizeof(pool_pg_num_t)*dirty_pgs.size());
|
||||
op_data->dirty_pg_count = dirty_pgs.size();
|
||||
op_data->dirty_osd_count = dirty_osds.size();
|
||||
if (this->copies_to_delete_after_sync_count)
|
||||
{
|
||||
op_data->copies_to_delete_count = 0;
|
||||
op_data->copies_to_delete = (obj_ver_osd_t*)(op_data->dirty_osds + op_data->dirty_osd_count);
|
||||
for (auto dirty_pg_num: dirty_pgs)
|
||||
{
|
||||
auto & pg = pgs.at(dirty_pg_num);
|
||||
assert(pg.copies_to_delete_after_sync.size() <= this->copies_to_delete_after_sync_count);
|
||||
memcpy(
|
||||
op_data->copies_to_delete + op_data->copies_to_delete_count,
|
||||
pg.copies_to_delete_after_sync.data(),
|
||||
sizeof(obj_ver_osd_t)*pg.copies_to_delete_after_sync.size()
|
||||
);
|
||||
op_data->copies_to_delete_count += pg.copies_to_delete_after_sync.size();
|
||||
this->copies_to_delete_after_sync_count -= pg.copies_to_delete_after_sync.size();
|
||||
pg.copies_to_delete_after_sync.clear();
|
||||
}
|
||||
assert(this->copies_to_delete_after_sync_count == 0);
|
||||
}
|
||||
int dpg = 0;
|
||||
for (auto dirty_pg_num: dirty_pgs)
|
||||
{
|
||||
pgs.at(dirty_pg_num).inflight++;
|
||||
op_data->dirty_pgs[dpg++] = dirty_pg_num;
|
||||
}
|
||||
dirty_pgs.clear();
|
||||
dpg = 0;
|
||||
for (auto osd_num: dirty_osds)
|
||||
{
|
||||
op_data->dirty_osds[dpg++] = osd_num;
|
||||
}
|
||||
dirty_osds.clear();
|
||||
}
|
||||
if (immediate_commit != IMMEDIATE_ALL)
|
||||
{
|
||||
// SYNC
|
||||
if (!submit_primary_sync_subops(cur_op))
|
||||
{
|
||||
goto resume_4;
|
||||
}
|
||||
resume_3:
|
||||
op_data->st = 3;
|
||||
return;
|
||||
resume_4:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
goto resume_6;
|
||||
}
|
||||
}
|
||||
if (op_data->unstable_writes)
|
||||
{
|
||||
// Stabilize version sets, if any
|
||||
submit_primary_stab_subops(cur_op);
|
||||
resume_5:
|
||||
op_data->st = 5;
|
||||
return;
|
||||
}
|
||||
resume_6:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
// Return PGs and OSDs back into their dirty sets
|
||||
for (int i = 0; i < op_data->dirty_pg_count; i++)
|
||||
{
|
||||
dirty_pgs.insert(op_data->dirty_pgs[i]);
|
||||
}
|
||||
for (int i = 0; i < op_data->dirty_osd_count; i++)
|
||||
{
|
||||
dirty_osds.insert(op_data->dirty_osds[i]);
|
||||
}
|
||||
if (op_data->unstable_writes)
|
||||
{
|
||||
// Return objects back into the unstable write set
|
||||
for (auto unstable_osd: *(op_data->unstable_write_osds))
|
||||
{
|
||||
for (int i = 0; i < unstable_osd.len; i++)
|
||||
{
|
||||
// Except those from peered PGs
|
||||
auto & w = op_data->unstable_writes[i];
|
||||
pool_pg_num_t wpg = {
|
||||
.pool_id = INODE_POOL(w.oid.inode),
|
||||
.pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
|
||||
};
|
||||
if (pgs.at(wpg).state & PG_ACTIVE)
|
||||
{
|
||||
uint64_t & dest = this->unstable_writes[(osd_object_id_t){
|
||||
.osd_num = unstable_osd.osd_num,
|
||||
.oid = w.oid,
|
||||
}];
|
||||
dest = dest < w.version ? w.version : dest;
|
||||
dirty_pgs.insert(wpg);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (op_data->copies_to_delete)
|
||||
{
|
||||
// Return 'copies to delete' back into respective PGs
|
||||
for (int i = 0; i < op_data->copies_to_delete_count; i++)
|
||||
{
|
||||
auto & w = op_data->copies_to_delete[i];
|
||||
auto & pg = pgs.at((pool_pg_num_t){
|
||||
.pool_id = INODE_POOL(w.oid.inode),
|
||||
.pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
|
||||
});
|
||||
if (pg.state & PG_ACTIVE)
|
||||
{
|
||||
pg.copies_to_delete_after_sync.push_back(w);
|
||||
copies_to_delete_after_sync_count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (op_data->copies_to_delete)
|
||||
{
|
||||
// Actually delete copies which we wanted to delete
|
||||
submit_primary_del_batch(cur_op, op_data->copies_to_delete, op_data->copies_to_delete_count);
|
||||
resume_7:
|
||||
op_data->st = 7;
|
||||
return;
|
||||
resume_8:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
goto resume_6;
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < op_data->dirty_pg_count; i++)
|
||||
{
|
||||
auto & pg = pgs.at(op_data->dirty_pgs[i]);
|
||||
pg.inflight--;
|
||||
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
|
||||
{
|
||||
finish_stop_pg(pg);
|
||||
}
|
||||
else if ((pg.state & PG_REPEERING) && pg.inflight == 0 && !pg.flush_batch)
|
||||
{
|
||||
start_pg_peering(pg);
|
||||
}
|
||||
}
|
||||
// FIXME: Free those in the destructor?
|
||||
free(op_data->dirty_pgs);
|
||||
op_data->dirty_pgs = NULL;
|
||||
op_data->dirty_osds = NULL;
|
||||
if (op_data->unstable_writes)
|
||||
{
|
||||
delete op_data->unstable_write_osds;
|
||||
delete[] op_data->unstable_writes;
|
||||
op_data->unstable_writes = NULL;
|
||||
op_data->unstable_write_osds = NULL;
|
||||
}
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||
}
|
||||
else
|
||||
{
|
||||
finish:
|
||||
if (cur_op->peer_fd)
|
||||
{
|
||||
auto it = c_cli.clients.find(cur_op->peer_fd);
|
||||
if (it != c_cli.clients.end())
|
||||
it->second->dirty_pgs.clear();
|
||||
}
|
||||
finish_op(cur_op, 0);
|
||||
}
|
||||
assert(syncs_in_progress.front() == cur_op);
|
||||
syncs_in_progress.pop_front();
|
||||
if (syncs_in_progress.size() > 0)
|
||||
{
|
||||
cur_op = syncs_in_progress.front();
|
||||
op_data = cur_op->op_data;
|
||||
op_data->st++;
|
||||
goto resume_2;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,378 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "osd_primary.h"
|
||||
#include "allocator.h"
|
||||
|
||||
// Put <cur_op> into the write queue of its PG and tell whether it may start right now.
// Returns false if the operation has to wait: either flush actions are pending for the
// same object, or another write to the same object is already queued (in the latter
// case the operation is also switched to state 1). Returns true otherwise.
bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
{
    osd_primary_op_data_t *op_data = cur_op->op_data;
    // Look for a pending flush action on any piece of this object
    auto flush_it = pg.flush_actions.lower_bound((obj_piece_id_t){
        .oid = op_data->oid,
        .osd_num = 0,
    });
    const bool flush_pending =
        flush_it != pg.flush_actions.end() &&
        flush_it->first.oid.inode == op_data->oid.inode &&
        (flush_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe;
    bool must_wait = flush_pending;
    // Other writes to the same object only matter when no flush is pending.
    // The lookup has to happen before our own entry is added to the queue.
    if (!flush_pending && pg.write_queue.find(op_data->oid) != pg.write_queue.end())
    {
        op_data->st = 1;
        must_wait = true;
    }
    // The operation is queued in every case
    pg.write_queue.emplace(op_data->oid, cur_op);
    return !must_wait;
}
|
||||
|
||||
// Resumable handler of a primary write operation.
//
// The function is a hand-written state machine: op_data->st holds the last
// suspension point and every (re-)entry jumps to the matching resume_N label.
// Odd/even label pairs (2/3, 4/5, 8/9) bracket asynchronous sub-operations:
// the function stores the even state and returns, and is expected to be called
// again with the next state once the sub-operations are done (the code that
// advances st lives outside of this function - NOTE(review): confirm).
//
// Stages: queue check -> read-modify-write read -> parity calculation ->
// write -> unstable write bookkeeping / stabilize -> removal of extra chunks
// -> reply and continuation of the next queued write to the same object.
void osd_t::continue_primary_write(osd_op_t *cur_op)
{
    if (!cur_op->op_data && !prepare_primary_rw(cur_op))
    {
        return;
    }
    osd_primary_op_data_t *op_data = cur_op->op_data;
    auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
    if (op_data->st == 1)      goto resume_1;
    else if (op_data->st == 2) goto resume_2;
    else if (op_data->st == 3) goto resume_3;
    else if (op_data->st == 4) goto resume_4;
    else if (op_data->st == 5) goto resume_5;
    else if (op_data->st == 6) goto resume_6;
    else if (op_data->st == 7) goto resume_7;
    else if (op_data->st == 8) goto resume_8;
    else if (op_data->st == 9) goto resume_9;
    else if (op_data->st == 10) goto resume_10;
    assert(op_data->st == 0);
    if (!check_write_queue(cur_op, pg))
    {
        // Queued behind a flush or another write to the same object
        return;
    }
resume_1:
    // Determine blocks to read and write
    // Missing chunks are allowed to be overwritten even in incomplete objects
    // FIXME: Allow to do small writes to the old (degraded/misplaced) OSD set for lower performance impact
    op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
    if (op_data->scheme == POOL_SCHEME_REPLICATED)
    {
        // Simplified algorithm
        op_data->stripes[0].write_start = op_data->stripes[0].req_start;
        op_data->stripes[0].write_end = op_data->stripes[0].req_end;
        op_data->stripes[0].write_buf = cur_op->buf;
        if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
            op_data->stripes[0].write_end != bs_block_size))
        {
            // Object is degraded/misplaced and will be moved to <write_osd_set>
            op_data->stripes[0].read_start = 0;
            op_data->stripes[0].read_end = bs_block_size;
            cur_op->rmw_buf = op_data->stripes[0].read_buf = memalign_or_die(MEM_ALIGNMENT, bs_block_size);
        }
    }
    else
    {
        cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set,
            pg.pg_size, op_data->pg_data_size, pg.pg_cursize, pg.cur_set.data(), bs_block_size);
        if (!cur_op->rmw_buf)
        {
            // Refuse partial overwrite of an incomplete object
            cur_op->reply.hdr.retval = -EINVAL;
            goto continue_others;
        }
    }
    // Read required blocks
    submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, pg.pg_size, op_data->prev_set, cur_op);
resume_2:
    op_data->st = 2;
    return;
resume_3:
    if (op_data->errors > 0)
    {
        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
        return;
    }
    if (op_data->scheme == POOL_SCHEME_REPLICATED)
    {
        // Only (possibly) copy new data from the request into the recovery buffer
        if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
            op_data->stripes[0].write_end != bs_block_size))
        {
            memcpy(
                op_data->stripes[0].read_buf + op_data->stripes[0].req_start,
                op_data->stripes[0].write_buf,
                op_data->stripes[0].req_end - op_data->stripes[0].req_start
            );
            op_data->stripes[0].write_buf = op_data->stripes[0].read_buf;
            op_data->stripes[0].write_start = 0;
            op_data->stripes[0].write_end = bs_block_size;
        }
    }
    else
    {
        // For EC/XOR pools, save version override to make it impossible
        // for parallel reads to read different versions of data and parity
        pg.ver_override[op_data->oid] = op_data->fact_ver;
        // Recover missing stripes, calculate parity
        if (pg.scheme == POOL_SCHEME_XOR)
        {
            calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
        }
        else if (pg.scheme == POOL_SCHEME_JERASURE)
        {
            calc_rmw_parity_jerasure(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
        }
    }
    // Send writes
    // The upper PG_EPOCH_BITS bits of a version hold the PG epoch, the rest is a counter
    if ((op_data->fact_ver >> (64-PG_EPOCH_BITS)) < pg.epoch)
    {
        // Object was last written in an older epoch: start the counter from 1 in the current one
        op_data->target_ver = ((uint64_t)pg.epoch << (64-PG_EPOCH_BITS)) | 1;
    }
    else
    {
        // Mask of the counter part of the version. The shift must be parenthesized:
        // "1 << n - 1" parses as "1 << (n-1)", which is a single bit, not a mask.
        // This declaration has to stay inside this block so that the resume_N
        // gotos above do not jump over its initialization.
        const uint64_t ver_counter_mask = (((uint64_t)1 << (64-PG_EPOCH_BITS)) - 1);
        if ((op_data->fact_ver & ver_counter_mask) == ver_counter_mask)
        {
            // Counter is exhausted, the increment below carries into the epoch bits
            assert(pg.epoch != ((1ul << PG_EPOCH_BITS)-1));
            pg.epoch++;
        }
        op_data->target_ver = op_data->fact_ver + 1;
    }
    if (pg.epoch > pg.reported_epoch)
    {
        // Report newer epoch before writing
        // FIXME: We may report only one PG state here...
        this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
        pg.history_changed = true;
        report_pg_states();
resume_10:
        // Stay suspended in state 10 until the new epoch is reported
        if (pg.epoch > pg.reported_epoch)
        {
            op_data->st = 10;
            return;
        }
    }
    submit_primary_subops(SUBMIT_WRITE, op_data->target_ver, pg.pg_size, pg.cur_set.data(), cur_op);
resume_4:
    op_data->st = 4;
    return;
resume_5:
    if (op_data->scheme != POOL_SCHEME_REPLICATED)
    {
        // Remove version override just after the write, but before stabilizing
        pg.ver_override.erase(op_data->oid);
    }
    if (op_data->errors > 0)
    {
        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
        return;
    }
    if (op_data->object_state)
    {
        // We must forget the unclean state of the object before deleting it
        // so the next reads don't accidentally read a deleted version
        // And it should be done at the same time as the removal of the version override
        remove_object_from_state(op_data->oid, op_data->object_state, pg);
        pg.clean_count++;
    }
resume_6:
resume_7:
    // States 6 and 7 are handled inside remember_unstable_write()
    if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
    {
        return;
    }
    if (op_data->fact_ver == 1)
    {
        // Object is created
        pg.clean_count++;
        pg.total_count++;
    }
    if (op_data->object_state)
    {
        // The extra braces keep <recovery_type> out of scope at resume_8/resume_9
        {
            int recovery_type = op_data->object_state->state & (OBJ_DEGRADED|OBJ_INCOMPLETE) ? 0 : 1;
            recovery_stat_count[0][recovery_type]++;
            if (!recovery_stat_count[0][recovery_type])
            {
                // Counter wrapped around to zero: restart both statistics
                recovery_stat_count[0][recovery_type]++;
                recovery_stat_bytes[0][recovery_type] = 0;
            }
            for (int role = 0; role < (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size); role++)
            {
                recovery_stat_bytes[0][recovery_type] += op_data->stripes[role].write_end - op_data->stripes[role].write_start;
            }
        }
        // Any kind of a non-clean object can have extra chunks, because we don't record objects
        // as degraded & misplaced or incomplete & misplaced at the same time. So try to remove extra chunks
        if (immediate_commit != IMMEDIATE_ALL)
        {
            // We can't remove extra chunks yet if fsyncs are explicit, because
            // new copies may not be committed to stable storage yet
            // We can only remove extra chunks after a successful SYNC for this PG
            for (auto & chunk: op_data->object_state->osd_set)
            {
                // Check is the same as in submit_primary_del_subops()
                if (op_data->scheme == POOL_SCHEME_REPLICATED
                    ? !contains_osd(pg.cur_set.data(), pg.pg_size, chunk.osd_num)
                    : (chunk.osd_num != pg.cur_set[chunk.role]))
                {
                    pg.copies_to_delete_after_sync.push_back((obj_ver_osd_t){
                        .osd_num = chunk.osd_num,
                        .oid = {
                            .inode = op_data->oid.inode,
                            .stripe = op_data->oid.stripe | (op_data->scheme == POOL_SCHEME_REPLICATED ? 0 : chunk.role),
                        },
                        .version = op_data->fact_ver,
                    });
                    copies_to_delete_after_sync_count++;
                }
            }
            free_object_state(pg, &op_data->object_state);
        }
        else
        {
            submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
            free_object_state(pg, &op_data->object_state);
            if (op_data->n_subops > 0)
            {
resume_8:
                op_data->st = 8;
                return;
resume_9:
                if (op_data->errors > 0)
                {
                    pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
                    return;
                }
            }
        }
    }
    cur_op->reply.hdr.retval = cur_op->req.rw.len;
continue_others:
    osd_op_t *next_op = NULL;
    auto next_it = pg.write_queue.find(op_data->oid);
    // Remove the operation from queue before calling finish_op so it doesn't see the completed operation in queue
    if (next_it != pg.write_queue.end() && next_it->second == cur_op)
    {
        pg.write_queue.erase(next_it++);
        if (next_it != pg.write_queue.end() && next_it->first == op_data->oid)
            next_op = next_it->second;
    }
    // finish_op would invalidate next_it if it cleared pg.write_queue, but it doesn't do that :)
    // Pass the stored result: it is the request length on success, but -EINVAL
    // when we got here through the "refuse partial overwrite" branch
    finish_op(cur_op, cur_op->reply.hdr.retval);
    if (next_op)
    {
        // Continue next write to the same object
        continue_primary_write(next_op);
    }
}
|
||||
|
||||
// Post-write bookkeeping of a possibly not-yet-stable write.
//
// Depending on <immediate_commit>:
// - IMMEDIATE_ALL: for non-replicated pools, STABILIZE sub-operations are sent
//   right away for every chunk in <loc_set>;
// - IMMEDIATE_SMALL: same, unless at least one stripe is written as a whole
//   block, in which case the write is treated like the lazy case;
// - otherwise ("lazy"): the written versions (EC/XOR) and the involved OSDs
//   are remembered so they can be handled by a later SYNC.
//
// The function is resumable. <base_state> is the caller's state number reserved
// for "waiting for STABILIZE"; <base_state>+1 means "STABILIZE finished".
// Returns false when the caller must return and wait (or when the operation was
// cancelled because of an error), true when the caller may proceed.
bool osd_t::remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
{
    osd_primary_op_data_t *op_data = cur_op->op_data;
    if (op_data->st == base_state)
    {
        goto resume_6;
    }
    else if (op_data->st == base_state+1)
    {
        goto resume_7;
    }
    if (immediate_commit == IMMEDIATE_ALL)
    {
immediate:
        if (op_data->scheme != POOL_SCHEME_REPLICATED)
        {
            // Send STABILIZE ops immediately
            op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
            op_data->unstable_writes = new obj_ver_id[loc_set.size()];
            // The extra braces keep <last_start> out of scope at resume_6/resume_7
            {
                int last_start = 0;
                for (auto & chunk: loc_set)
                {
                    op_data->unstable_writes[last_start] = (obj_ver_id){
                        .oid = {
                            .inode = op_data->oid.inode,
                            .stripe = op_data->oid.stripe | chunk.role,
                        },
                        .version = op_data->fact_ver,
                    };
                    op_data->unstable_write_osds->push_back((unstable_osd_num_t){
                        .osd_num = chunk.osd_num,
                        .start = last_start,
                        .len = 1,
                    });
                    last_start++;
                }
            }
            submit_primary_stab_subops(cur_op);
resume_6:
            // Suspend in the caller-provided state. The dispatch at the top
            // compares against base_state, so a hard-coded number here would
            // only work for callers that pass that exact number.
            op_data->st = base_state;
            return false;
resume_7:
            // FIXME: Free those in the destructor?
            delete op_data->unstable_write_osds;
            delete[] op_data->unstable_writes;
            op_data->unstable_writes = NULL;
            op_data->unstable_write_osds = NULL;
            if (op_data->errors > 0)
            {
                pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
                return false;
            }
        }
    }
    else if (immediate_commit == IMMEDIATE_SMALL)
    {
        int stripe_count = (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : op_data->pg_size);
        for (int role = 0; role < stripe_count; role++)
        {
            if (op_data->stripes[role].write_start == 0 &&
                op_data->stripes[role].write_end == bs_block_size)
            {
                // Big write. Treat write as unsynced
                goto lazy;
            }
        }
        goto immediate;
    }
    else
    {
lazy:
        if (op_data->scheme != POOL_SCHEME_REPLICATED)
        {
            // Remember version as unstable for EC/XOR
            for (auto & chunk: loc_set)
            {
                this->dirty_osds.insert(chunk.osd_num);
                this->unstable_writes[(osd_object_id_t){
                    .osd_num = chunk.osd_num,
                    .oid = {
                        .inode = op_data->oid.inode,
                        .stripe = op_data->oid.stripe | chunk.role,
                    },
                }] = op_data->fact_ver;
            }
        }
        else
        {
            // Only remember to sync OSDs for replicated pools
            for (auto & chunk: loc_set)
            {
                this->dirty_osds.insert(chunk.osd_num);
            }
        }
        // Remember PG as dirty to drop the connection when PG goes offline
        // (this is required because of the "lazy sync")
        auto cl_it = c_cli.clients.find(cur_op->peer_fd);
        if (cl_it != c_cli.clients.end())
        {
            cl_it->second->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
        }
        dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
    }
    return true;
}
|
|
@ -3,13 +3,14 @@
|
|||
|
||||
#include "pg_states.h"
|
||||
|
||||
const int pg_state_bit_count = 14;
|
||||
const int pg_state_bit_count = 15;
|
||||
|
||||
const int pg_state_bits[14] = {
|
||||
const int pg_state_bits[15] = {
|
||||
PG_STARTING,
|
||||
PG_PEERING,
|
||||
PG_INCOMPLETE,
|
||||
PG_ACTIVE,
|
||||
PG_REPEERING,
|
||||
PG_STOPPING,
|
||||
PG_OFFLINE,
|
||||
PG_DEGRADED,
|
||||
|
@ -21,11 +22,12 @@ const int pg_state_bits[14] = {
|
|||
PG_LEFT_ON_DEAD,
|
||||
};
|
||||
|
||||
const char *pg_state_names[14] = {
|
||||
const char *pg_state_names[15] = {
|
||||
"starting",
|
||||
"peering",
|
||||
"incomplete",
|
||||
"active",
|
||||
"repeering",
|
||||
"stopping",
|
||||
"offline",
|
||||
"degraded",
|
||||
|
|
|
@ -10,16 +10,17 @@
|
|||
#define PG_PEERING (1<<1)
|
||||
#define PG_INCOMPLETE (1<<2)
|
||||
#define PG_ACTIVE (1<<3)
|
||||
#define PG_STOPPING (1<<4)
|
||||
#define PG_OFFLINE (1<<5)
|
||||
#define PG_REPEERING (1<<4)
|
||||
#define PG_STOPPING (1<<5)
|
||||
#define PG_OFFLINE (1<<6)
|
||||
// Plus any of these:
|
||||
#define PG_DEGRADED (1<<6)
|
||||
#define PG_HAS_INCOMPLETE (1<<7)
|
||||
#define PG_HAS_DEGRADED (1<<8)
|
||||
#define PG_HAS_MISPLACED (1<<9)
|
||||
#define PG_HAS_UNCLEAN (1<<10)
|
||||
#define PG_HAS_INVALID (1<<11)
|
||||
#define PG_LEFT_ON_DEAD (1<<12)
|
||||
#define PG_DEGRADED (1<<7)
|
||||
#define PG_HAS_INCOMPLETE (1<<8)
|
||||
#define PG_HAS_DEGRADED (1<<9)
|
||||
#define PG_HAS_MISPLACED (1<<10)
|
||||
#define PG_HAS_UNCLEAN (1<<11)
|
||||
#define PG_HAS_INVALID (1<<12)
|
||||
#define PG_LEFT_ON_DEAD (1<<13)
|
||||
|
||||
// Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
|
||||
// 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size
|
||||
|
|
|
@ -47,7 +47,6 @@ public:
|
|||
|
||||
~QemuProxy()
|
||||
{
|
||||
cli->stop();
|
||||
delete cli;
|
||||
delete tfd;
|
||||
}
|
||||
|
|
|
@ -20,7 +20,15 @@ void alloc_all(int size)
|
|||
{
|
||||
printf("incorrect block allocated: expected %d, got %lu\n", i, x);
|
||||
}
|
||||
if (a->get(x))
|
||||
{
|
||||
printf("not free before set at %d\n", i);
|
||||
}
|
||||
a->set(x, true);
|
||||
if (!a->get(x))
|
||||
{
|
||||
printf("free after set at %d\n", i);
|
||||
}
|
||||
}
|
||||
uint64_t x = a->find_free();
|
||||
if (x != UINT64_MAX)
|
||||
|
|
|
@ -0,0 +1,407 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include "cluster_client.h"
|
||||
|
||||
// Feeds the client's state tracker a minimal fake cluster configuration:
// pool 1 ("hddpool", replicated, pg_size 2) with a single PG 1/1 placed on
// OSDs 1 and 2, primary OSD 1, reported as "active".
void configure_single_pg_pool(cluster_client_t *cli)
{
    auto & st_cli = cli->st_cli;
    st_cli.on_load_pgs_hook(true);
    // Pool configuration
    json11::Json::object pool_cfg = {
        { "name", "hddpool" },
        { "scheme", "replicated" },
        { "pg_size", 2 },
        { "pg_minsize", 1 },
        { "pg_count", 1 },
        { "failure_domain", "osd" },
    };
    st_cli.parse_state((json_kv_t){
        .key = "/config/pools",
        .value = json11::Json::object {
            { "1", pool_cfg }
        },
    });
    // PG placement
    json11::Json::object pg_cfg = {
        { "osd_set", json11::Json::array { 1, 2 } },
        { "primary", 1 },
    };
    json11::Json::object pool_pgs = {
        { "1", pg_cfg }
    };
    st_cli.parse_state((json_kv_t){
        .key = "/config/pgs",
        .value = json11::Json::object {
            { "items", json11::Json::object {
                { "1", pool_pgs }
            } }
        },
    });
    // PG state
    json11::Json::object pg_state = {
        { "peers", json11::Json::array { 1, 2 } },
        { "primary", 1 },
        { "state", json11::Json::array { "active" } },
    };
    st_cli.parse_state((json_kv_t){
        .key = "/pg/state/1/1",
        .value = pg_state,
    });
    // Notify the client about the (empty) set of changes
    json11::Json::object changes;
    st_cli.on_change_hook(changes);
}
|
||||
|
||||
// Posts a write of <len> bytes filled with <c> at <offset> of the test inode.
// Returns a heap-allocated status cell, initially -1 ("must not complete yet").
// The completion callback asserts that the cell is no longer -1, then stores
// 1 when the operation returned its full length and 0 otherwise, frees the
// buffer and the operation, and finally invokes <cb> if one was given.
int *test_write(cluster_client_t *cli, uint64_t offset, uint64_t len, uint8_t c, std::function<void()> cb = NULL)
{
    printf("Post write %lx+%lx\n", offset, len);
    int *status = new int(-1);
    cluster_op_t *write_op = new cluster_op_t();
    write_op->opcode = OSD_OP_WRITE;
    write_op->inode = 0x1000000000001;
    write_op->offset = offset;
    write_op->len = len;
    write_op->iov.push_back(malloc_or_die(len), len);
    memset(write_op->iov.buf[0].iov_base, c, len);
    write_op->callback = [status, cb](cluster_op_t *done_op)
    {
        if (*status == -1)
        {
            printf("Error: Not allowed to complete yet\n");
        }
        assert(*status != -1);
        if (done_op->retval == done_op->len)
            *status = 1;
        else
            *status = 0;
        free(done_op->iov.buf[0].iov_base);
        printf("Done write %lx+%lx r=%d\n", done_op->offset, done_op->len, done_op->retval);
        delete done_op;
        if (cb != NULL)
        {
            cb();
        }
    };
    cli->execute(write_op);
    return status;
}
|
||||
|
||||
// Posts a SYNC operation.
// Returns a heap-allocated status cell, initially -1 ("must not complete yet").
// The completion callback asserts that the cell is no longer -1, stores 1 when
// the sync returned 0 and 0 otherwise, and deletes the operation.
int *test_sync(cluster_client_t *cli)
{
    printf("Post sync\n");
    int *status = new int(-1);
    cluster_op_t *sync_op = new cluster_op_t();
    sync_op->opcode = OSD_OP_SYNC;
    sync_op->callback = [status](cluster_op_t *done_op)
    {
        if (*status == -1)
        {
            printf("Error: Not allowed to complete yet\n");
        }
        assert(*status != -1);
        if (done_op->retval == 0)
            *status = 1;
        else
            *status = 0;
        printf("Done sync r=%d\n", done_op->retval);
        delete done_op;
    };
    cli->execute(sync_op);
    return status;
}
|
||||
|
||||
// Marks the status cell returned by test_write()/test_sync() as "allowed to
// complete" (-2). Completion callbacks assert that the cell is not -1, so this
// is how the test verifies that an operation doesn't complete earlier than expected.
void can_complete(int *r)
{
    r[0] = -2;
}
|
||||
|
||||
// Asserts that the operation behind the status cell has completed
// successfully (cell == 1) and releases the cell.
void check_completed(int *r)
{
    assert(r[0] == 1);
    delete r;
}
|
||||
|
||||
// Fakes an established connection to OSD <osd_num>: picks a peer fd one above
// the largest registered one (or 10 when there are no clients), registers a
// client object in PEER_CONNECTED state for it and lets the messenger
// re-evaluate the PGs of that OSD.
void pretend_connected(cluster_client_t *cli, osd_num_t osd_num)
{
    printf("OSD %lu connected\n", osd_num);
    auto & msgr = cli->msgr;
    int peer_fd = 10;
    if (msgr.clients.size())
    {
        peer_fd = std::prev(msgr.clients.end())->first+1;
    }
    msgr.osd_peer_fds[osd_num] = peer_fd;
    msgr.clients[peer_fd] = new osd_client_t();
    auto & fake_client = msgr.clients[peer_fd];
    fake_client->osd_num = osd_num;
    fake_client->peer_state = PEER_CONNECTED;
    msgr.wanted_peers.erase(osd_num);
    msgr.repeer_pgs(osd_num);
}
|
||||
|
||||
// Fakes a dropped connection: stops the messenger client registered for
// OSD <osd_num>. The lookup uses .at(), so the OSD has to be connected.
void pretend_disconnected(cluster_client_t *cli, osd_num_t osd_num)
{
    printf("OSD %lu disconnected\n", osd_num);
    auto & msgr = cli->msgr;
    msgr.stop_client(msgr.osd_peer_fds.at(osd_num));
}
|
||||
|
||||
// Fails the test if the messenger still has a peer fd registered for OSD <osd_num>.
void check_disconnected(cluster_client_t *cli, osd_num_t osd_num)
{
    auto & peer_fds = cli->msgr.osd_peer_fds;
    if (peer_fds.find(osd_num) == peer_fds.end())
    {
        return;
    }
    printf("OSD %lu not disconnected as it ought to be\n", osd_num);
    assert(0);
}
|
||||
|
||||
// Fails the test unless exactly <ops> operations are currently recorded
// as sent to OSD <osd_num>.
void check_op_count(cluster_client_t *cli, osd_num_t osd_num, int ops)
{
    int peer_fd = cli->msgr.osd_peer_fds.at(osd_num);
    int real_ops = cli->msgr.clients[peer_fd]->sent_ops.size();
    if (real_ops == ops)
    {
        return;
    }
    printf("error: %d ops expected, but %d queued\n", ops, real_ops);
    assert(0);
}
|
||||
|
||||
// Searches the operations sent to OSD <osd_num> for one with the given opcode.
// For anything but OSD_OP_SYNC the operation must also target the test inode
// and match <offset> and <len> exactly. Returns NULL when nothing matches.
osd_op_t *find_op(cluster_client_t *cli, osd_num_t osd_num, uint64_t opcode, uint64_t offset, uint64_t len)
{
    int peer_fd = cli->msgr.osd_peer_fds.at(osd_num);
    auto & sent_ops = cli->msgr.clients[peer_fd]->sent_ops;
    for (auto sent_it = sent_ops.begin(); sent_it != sent_ops.end(); sent_it++)
    {
        auto candidate = sent_it->second;
        if (candidate->req.hdr.opcode != opcode)
        {
            continue;
        }
        if (opcode == OSD_OP_SYNC)
        {
            // Syncs have no range to compare
            return candidate;
        }
        if (candidate->req.rw.inode == 0x1000000000001 &&
            candidate->req.rw.offset == offset &&
            candidate->req.rw.len == len)
        {
            return candidate;
        }
    }
    return NULL;
}
|
||||
|
||||
// Fakes a reply for a sent sub-operation: removes it from the peer's sent_ops,
// fills the reply header and runs the operation's callback.
// A negative <retval> is passed through as is; otherwise the reply carries 0
// for SYNC and the requested length for other opcodes.
void pretend_op_completed(cluster_client_t *cli, osd_op_t *op, int64_t retval)
{
    assert(op);
    const char *op_name = "read";
    if (op->req.hdr.opcode == OSD_OP_SYNC)
        op_name = "sync";
    else if (op->req.hdr.opcode == OSD_OP_WRITE)
        op_name = "write";
    printf("Pretend completed %s %lx+%x\n", op_name, op->req.rw.offset, op->req.rw.len);
    uint64_t op_id = op->req.hdr.id;
    int peer_fd = op->peer_fd;
    cli->msgr.clients[peer_fd]->sent_ops.erase(op_id);
    auto & reply_hdr = op->reply.hdr;
    reply_hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
    reply_hdr.id = op->req.hdr.id;
    reply_hdr.opcode = op->req.hdr.opcode;
    reply_hdr.retval = retval < 0 ? retval : (op->req.hdr.opcode == OSD_OP_SYNC ? 0 : op->req.rw.len);
    // Copy lambda to be unaffected by `delete op`
    std::function<void(osd_op_t*)> callback_copy(op->callback);
    callback_copy(op);
}
|
||||
|
||||
void test1()
|
||||
{
|
||||
json11::Json config;
|
||||
timerfd_manager_t *tfd = new timerfd_manager_t([](int fd, bool wr, std::function<void(int, int)> callback){});
|
||||
cluster_client_t *cli = new cluster_client_t(NULL, tfd, config);
|
||||
|
||||
int *r1 = test_write(cli, 0, 4096, 0x55);
|
||||
configure_single_pg_pool(cli);
|
||||
pretend_connected(cli, 1);
|
||||
cli->continue_ops(true);
|
||||
can_complete(r1);
|
||||
check_op_count(cli, 1, 1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0);
|
||||
check_completed(r1);
|
||||
pretend_disconnected(cli, 1);
|
||||
int *r2 = test_sync(cli);
|
||||
pretend_connected(cli, 1);
|
||||
check_op_count(cli, 1, 0);
|
||||
cli->continue_ops(true);
|
||||
check_op_count(cli, 1, 1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0);
|
||||
check_op_count(cli, 1, 1);
|
||||
can_complete(r2);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_SYNC, 0, 0), 0);
|
||||
check_completed(r2);
|
||||
// Check that the client doesn't repeat operations once more
|
||||
pretend_disconnected(cli, 1);
|
||||
pretend_connected(cli, 1);
|
||||
check_op_count(cli, 1, 0);
|
||||
|
||||
// Case:
|
||||
// Write(1) -> Complete Write(1) -> Overwrite(2) -> Complete Write(2)
|
||||
// -> Overwrite(3) -> Drop OSD connection -> Reestablish OSD connection
|
||||
// -> Complete All Posted Writes -> Sync -> Complete Sync
|
||||
// The resulting state of the block must be (3) over (2) over (1).
|
||||
// I.e. the part overwritten by (3) must remain as in (3) and so on.
|
||||
|
||||
// More interesting case:
|
||||
// Same, but both Write(2) and Write(3) must consist of two parts:
|
||||
// one from an OSD 2 that drops connection and other from OSD 1 that doesn't.
|
||||
// The idea is that if the whole Write(2) is repeated when OSD 2 drops connection
|
||||
// then it may also overwrite a part in OSD 1 which shouldn't be overwritten.
|
||||
|
||||
// Another interesting case:
|
||||
// A new operation added during replay (would also break with the previous implementation)
|
||||
|
||||
r1 = test_write(cli, 0, 0x10000, 0x56);
|
||||
can_complete(r1);
|
||||
check_op_count(cli, 1, 1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x10000), 0);
|
||||
check_completed(r1);
|
||||
|
||||
r1 = test_write(cli, 0xE000, 0x4000, 0x57);
|
||||
can_complete(r1);
|
||||
check_op_count(cli, 1, 1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0xE000, 0x4000), 0);
|
||||
check_completed(r1);
|
||||
|
||||
r1 = test_write(cli, 0x10000, 0x4000, 0x58);
|
||||
|
||||
pretend_disconnected(cli, 1);
|
||||
pretend_connected(cli, 1);
|
||||
cli->continue_ops(true);
|
||||
|
||||
// Check replay
|
||||
{
|
||||
uint64_t replay_start = UINT64_MAX;
|
||||
uint64_t replay_end = 0;
|
||||
std::vector<osd_op_t*> replay_ops;
|
||||
auto osd_cl = cli->msgr.clients.at(cli->msgr.osd_peer_fds.at(1));
|
||||
for (auto & op_p: osd_cl->sent_ops)
|
||||
{
|
||||
auto op = op_p.second;
|
||||
assert(op->req.hdr.opcode == OSD_OP_WRITE);
|
||||
uint64_t offset = op->req.rw.offset;
|
||||
if (op->req.rw.offset < replay_start)
|
||||
replay_start = op->req.rw.offset;
|
||||
if (op->req.rw.offset+op->req.rw.len > replay_end)
|
||||
replay_end = op->req.rw.offset+op->req.rw.len;
|
||||
for (int buf_idx = 0; buf_idx < op->iov.count; buf_idx++)
|
||||
{
|
||||
for (int i = 0; i < op->iov.buf[buf_idx].iov_len; i++, offset++)
|
||||
{
|
||||
uint8_t c = offset < 0xE000 ? 0x56 : (offset < 0x10000 ? 0x57 : 0x58);
|
||||
if (((uint8_t*)op->iov.buf[buf_idx].iov_base)[i] != c)
|
||||
{
|
||||
printf("Write replay: mismatch at %lu\n", offset-op->req.rw.offset);
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
}
|
||||
fail:
|
||||
assert(offset == op->req.rw.offset+op->req.rw.len);
|
||||
replay_ops.push_back(op);
|
||||
}
|
||||
if (replay_start != 0 || replay_end != 0x14000)
|
||||
{
|
||||
printf("Write replay: range mismatch: %lx-%lx\n", replay_start, replay_end);
|
||||
assert(0);
|
||||
}
|
||||
for (auto op: replay_ops)
|
||||
{
|
||||
pretend_op_completed(cli, op, 0);
|
||||
}
|
||||
}
|
||||
// Check that the following write finally proceeds
|
||||
check_op_count(cli, 1, 1);
|
||||
can_complete(r1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0x10000, 0x4000), 0);
|
||||
check_completed(r1);
|
||||
check_op_count(cli, 1, 0);
|
||||
|
||||
// Check sync
|
||||
r2 = test_sync(cli);
|
||||
can_complete(r2);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_SYNC, 0, 0), 0);
|
||||
check_completed(r2);
|
||||
|
||||
// Check disconnect during write
|
||||
r1 = test_write(cli, 0, 4096, 0x59);
|
||||
check_op_count(cli, 1, 1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), -EPIPE);
|
||||
check_disconnected(cli, 1);
|
||||
pretend_connected(cli, 1);
|
||||
check_op_count(cli, 1, 0);
|
||||
cli->continue_ops(true);
|
||||
check_op_count(cli, 1, 1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
|
||||
check_op_count(cli, 1, 1);
|
||||
can_complete(r1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
|
||||
check_completed(r1);
|
||||
|
||||
// Check disconnect inside operation callback (reenterability)
|
||||
// Probably doesn't happen too often, but possible in theory
|
||||
r1 = test_write(cli, 0, 0x1000, 0x60, [cli]()
|
||||
{
|
||||
pretend_disconnected(cli, 1);
|
||||
});
|
||||
r2 = test_write(cli, 0x1000, 0x1000, 0x61);
|
||||
check_op_count(cli, 1, 2);
|
||||
can_complete(r1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
|
||||
check_completed(r1);
|
||||
check_disconnected(cli, 1);
|
||||
pretend_connected(cli, 1);
|
||||
cli->continue_ops(true);
|
||||
check_op_count(cli, 1, 2);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0x1000, 0x1000), 0);
|
||||
check_op_count(cli, 1, 1);
|
||||
can_complete(r2);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0x1000, 0x1000), 0);
|
||||
check_completed(r2);
|
||||
|
||||
// Free client
|
||||
delete cli;
|
||||
delete tfd;
|
||||
printf("[ok] write replay test\n");
|
||||
}
|
||||
|
||||
void test2()
|
||||
{
|
||||
std::map<object_id, cluster_buffer_t> unsynced_writes;
|
||||
cluster_op_t *op = new cluster_op_t();
|
||||
op->opcode = OSD_OP_WRITE;
|
||||
op->inode = 1;
|
||||
op->offset = 0;
|
||||
op->len = 4096;
|
||||
op->iov.push_back(malloc_or_die(4096*1024), 4096);
|
||||
// 0-4k = 0x55
|
||||
memset(op->iov.buf[0].iov_base, 0x55, op->iov.buf[0].iov_len);
|
||||
cluster_client_t::copy_write(op, unsynced_writes);
|
||||
// 8k-12k = 0x66
|
||||
op->offset = 8192;
|
||||
memset(op->iov.buf[0].iov_base, 0x66, op->iov.buf[0].iov_len);
|
||||
cluster_client_t::copy_write(op, unsynced_writes);
|
||||
// 4k-1M+4k = 0x77
|
||||
op->len = op->iov.buf[0].iov_len = 1048576;
|
||||
op->offset = 4096;
|
||||
memset(op->iov.buf[0].iov_base, 0x77, op->iov.buf[0].iov_len);
|
||||
cluster_client_t::copy_write(op, unsynced_writes);
|
||||
// check it
|
||||
assert(unsynced_writes.size() == 4);
|
||||
auto uit = unsynced_writes.begin();
|
||||
int i;
|
||||
assert(uit->first.inode == 1);
|
||||
assert(uit->first.stripe == 0);
|
||||
assert(uit->second.len == 4096);
|
||||
for (i = 0; i < uit->second.len && ((uint8_t*)uit->second.buf)[i] == 0x55; i++) {}
|
||||
assert(i == uit->second.len);
|
||||
uit++;
|
||||
assert(uit->first.inode == 1);
|
||||
assert(uit->first.stripe == 4096);
|
||||
assert(uit->second.len == 4096);
|
||||
for (i = 0; i < uit->second.len && ((uint8_t*)uit->second.buf)[i] == 0x77; i++) {}
|
||||
assert(i == uit->second.len);
|
||||
uit++;
|
||||
assert(uit->first.inode == 1);
|
||||
assert(uit->first.stripe == 8192);
|
||||
assert(uit->second.len == 4096);
|
||||
for (i = 0; i < uit->second.len && ((uint8_t*)uit->second.buf)[i] == 0x77; i++) {}
|
||||
assert(i == uit->second.len);
|
||||
uit++;
|
||||
assert(uit->first.inode == 1);
|
||||
assert(uit->first.stripe == 12*1024);
|
||||
assert(uit->second.len == 1016*1024);
|
||||
for (i = 0; i < uit->second.len && ((uint8_t*)uit->second.buf)[i] == 0x77; i++) {}
|
||||
assert(i == uit->second.len);
|
||||
uit++;
|
||||
// free memory
|
||||
free(op->iov.buf[0].iov_base);
|
||||
delete op;
|
||||
for (auto p: unsynced_writes)
|
||||
{
|
||||
free(p.second.buf);
|
||||
}
|
||||
printf("[ok] copy_write test\n");
|
||||
}
|
||||
|
||||
int main(int narg, char *args[])
|
||||
{
|
||||
test1();
|
||||
test2();
|
||||
return 0;
|
||||
}
|
|
@ -121,7 +121,7 @@ again:
|
|||
exp.it_value.tv_sec--;
|
||||
exp.it_value.tv_nsec += 1000000000;
|
||||
}
|
||||
if (exp.it_value.tv_sec < 0 || !exp.it_value.tv_sec && !exp.it_value.tv_nsec)
|
||||
if (exp.it_value.tv_sec < 0 || exp.it_value.tv_sec == 0 && exp.it_value.tv_nsec <= 0)
|
||||
{
|
||||
// It already happened
|
||||
trigger_nearest();
|
||||
|
@ -159,6 +159,6 @@ void timerfd_manager_t::trigger_nearest()
|
|||
{
|
||||
timers.erase(timers.begin()+nearest, timers.begin()+nearest+1);
|
||||
}
|
||||
cb(nearest_id);
|
||||
nearest = -1;
|
||||
cb(nearest_id);
|
||||
}
|
||||
|
|
|
@ -2,6 +2,14 @@
|
|||
|
||||
. `dirname $0`/common.sh
|
||||
|
||||
if [ "$EC" != "" ]; then
|
||||
POOLCFG='"scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1'
|
||||
NOBJ=512
|
||||
else
|
||||
POOLCFG='"scheme":"replicated","pg_size":2,"pg_minsize":2'
|
||||
NOBJ=1024
|
||||
fi
|
||||
|
||||
dd if=/dev/zero of=./testdata/test_osd1.bin bs=1024 count=1 seek=$((1024*1024-1))
|
||||
dd if=/dev/zero of=./testdata/test_osd2.bin bs=1024 count=1 seek=$((1024*1024-1))
|
||||
dd if=/dev/zero of=./testdata/test_osd3.bin bs=1024 count=1 seek=$((1024*1024-1))
|
||||
|
@ -28,7 +36,7 @@ cd ..
|
|||
node mon/mon-main.js --etcd_url http://$ETCD_URL --etcd_prefix "/vitastor" --verbose 1 &>./testdata/mon.log &
|
||||
MON_PID=$!
|
||||
|
||||
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":2,"pg_count":16,"failure_domain":"osd"}}'
|
||||
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":16,"failure_domain":"osd"}}'
|
||||
|
||||
sleep 2
|
||||
|
||||
|
@ -52,7 +60,7 @@ try_change()
|
|||
echo --- Change PG count to $n --- >>testdata/osd$i.log
|
||||
done
|
||||
|
||||
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":2,"pg_count":'$n',"failure_domain":"osd"}}'
|
||||
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":'$n',"failure_domain":"osd"}}'
|
||||
|
||||
for i in {1..10}; do
|
||||
($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | select(. > 0)) | length == 2) | length) == '$n) && \
|
||||
|
@ -82,8 +90,8 @@ try_change()
|
|||
|
||||
# Check that no objects are lost !
|
||||
nobj=`$ETCDCTL get --prefix '/vitastor/pg/stats' --print-value-only | jq -s '[ .[].object_count ] | reduce .[] as $num (0; .+$num)'`
|
||||
if [ "$nobj" -ne 1024 ]; then
|
||||
format_error "Data lost after changing PG count to $n: 1024 objects expected, but got $nobj"
|
||||
if [ "$nobj" -ne $NOBJ ]; then
|
||||
format_error "Data lost after changing PG count to $n: $NOBJ objects expected, but got $nobj"
|
||||
fi
|
||||
}
|
||||
|
||||
|
|
|
@ -35,6 +35,18 @@ fi
|
|||
# fio -thread -name=test -ioengine=build/src/libfio_vitastor_sec.so -bs=4k -fsync=128 `$ETCDCTL get /vitastor/osd/state/1 --print-value-only | jq -r '"-host="+.addresses[0]+" -port="+(.port|tostring)'` -rw=write -size=32M
|
||||
|
||||
LD_PRELOAD=libasan.so.5 \
|
||||
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -fsync=1 -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=1G -cluster_log_level=10
|
||||
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -fsync=1 -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -cluster_log_level=10
|
||||
|
||||
LD_PRELOAD=libasan.so.5 \
|
||||
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=1 -fsync=32 -buffer_pattern=0xdeadface \
|
||||
-rw=randwrite -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -number_ios=1024
|
||||
|
||||
qemu-img convert -S 4096 -p \
|
||||
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=1:size=$((128*1024*1024))" \
|
||||
-O raw ./testdata/read.bin
|
||||
|
||||
qemu-img convert -S 4096 -p \
|
||||
-f raw ./testdata/read.bin \
|
||||
-O raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=1:size=$((128*1024*1024))"
|
||||
|
||||
format_green OK
|
||||
|
|
Loading…
Reference in New Issue