forked from vitalif/vitastor
Compare commits
85 Commits
Author | SHA1 | Date | |
---|---|---|---|
59fbcef734 | |||
40b7c21fb1 | |||
efb3678606 | |||
462650134e | |||
8d87e32175 | |||
b0b2e7df3c | |||
97efb9e299 | |||
f6d705383a | |||
68567c0e1f | |||
04b00003e9 | |||
307c1731c1 | |||
75a6a556b5 | |||
a48e2bbf18 | |||
688821665a | |||
3e162d95a0 | |||
829381b335 | |||
54f2353f24 | |||
e47f6fba60 | |||
883bf84a16 | |||
52097c4856 | |||
e1355cbc74 | |||
8f8b90be7a | |||
ad9f619370 | |||
f4769ba7c7 | |||
843b7052d2 | |||
df99e232ee | |||
3a40fa4127 | |||
4095bcc558 | |||
564d64e271 | |||
cf54741c95 | |||
18a5fafa2a | |||
06f4978085 | |||
7ebf1588c5 | |||
b0ad1e1e6d | |||
0949f08407 | |||
04a1f18fa5 | |||
cf9a641d66 | |||
05db1308aa | |||
98b54ca948 | |||
23225c5e62 | |||
7e6e1a5a82 | |||
435045751d | |||
c5fb1d5987 | |||
9f59381bea | |||
9ac7e75178 | |||
88671cf745 | |||
fe1749c427 | |||
ceb9c28de7 | |||
299d7d7c95 | |||
d1526b415f | |||
f49fd53d55 | |||
dd76eda5e5 | |||
87dbd8fa57 | |||
b44f49aab2 | |||
036555638e | |||
af5155fcd9 | |||
0d2efbecc9 | |||
e62e8b6bae | |||
c4ba24c305 | |||
19e47a0279 | |||
bd178ac20f | |||
7006875a24 | |||
ad577c4aac | |||
836635c518 | |||
88a03f4e98 | |||
2a5036669d | |||
2e0c853180 | |||
e91ff2a9ec | |||
086667f568 | |||
73ce20e246 | |||
1be94da437 | |||
80e12358a2 | |||
36c935ace6 | |||
0d8b5e2ef9 | |||
98f1e2c277 | |||
21e7686037 | |||
ab21a1908b | |||
30d1ccd43e | |||
8bdd6d8d78 | |||
09b3e4e789 | |||
07912fd670 | |||
bc742ccf8c | |||
314b20437b | |||
29bac892ad | |||
cf7547faf3 |
18
.gitignore
vendored
Normal file
18
.gitignore
vendored
Normal file
@@ -0,0 +1,18 @@
|
||||
*.o
|
||||
*.so
|
||||
package-lock.json
|
||||
fio
|
||||
qemu
|
||||
osd
|
||||
stub_osd
|
||||
stub_uring_osd
|
||||
stub_bench
|
||||
osd_test
|
||||
osd_peering_pg_test
|
||||
dump_journal
|
||||
nbd_proxy
|
||||
rm_inode
|
||||
test_allocator
|
||||
test_blockstore
|
||||
test_shit
|
||||
osd_rmw_test
|
2
debian/build-vitastor-bullseye.sh
vendored
2
debian/build-vitastor-bullseye.sh
vendored
@@ -1,6 +1,6 @@
|
||||
#!/bin/bash
|
||||
|
||||
sed 's/$REL/bullseye/' < vitastor.Dockerfile > ../Dockerfile
|
||||
sed 's/$REL/bullseye/g' < vitastor.Dockerfile > ../Dockerfile
|
||||
cd ..
|
||||
mkdir -p packages
|
||||
sudo podman build -v `pwd`/packages:/root/packages -f Dockerfile .
|
||||
|
2
debian/build-vitastor-buster.sh
vendored
2
debian/build-vitastor-buster.sh
vendored
@@ -1,6 +1,6 @@
|
||||
#!/bin/bash
|
||||
|
||||
sed 's/$REL/buster/' < vitastor.Dockerfile > ../Dockerfile
|
||||
sed 's/$REL/buster/g' < vitastor.Dockerfile > ../Dockerfile
|
||||
cd ..
|
||||
mkdir -p packages
|
||||
sudo podman build -v `pwd`/packages:/root/packages -f Dockerfile .
|
||||
|
2
debian/changelog
vendored
2
debian/changelog
vendored
@@ -1,4 +1,4 @@
|
||||
vitastor (0.5.6-1) unstable; urgency=medium
|
||||
vitastor (0.5.12-1) unstable; urgency=medium
|
||||
|
||||
* Bugfixes
|
||||
|
||||
|
12
debian/vitastor.Dockerfile
vendored
12
debian/vitastor.Dockerfile
vendored
@@ -40,10 +40,10 @@ RUN set -e -x; \
|
||||
mkdir -p /root/packages/vitastor-$REL; \
|
||||
rm -rf /root/packages/vitastor-$REL/*; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
cp -r /root/vitastor vitastor-0.5.6; \
|
||||
ln -s /root/packages/qemu-$REL/qemu-*/ vitastor-0.5.6/qemu; \
|
||||
ln -s /root/fio-build/fio-*/ vitastor-0.5.6/fio; \
|
||||
cd vitastor-0.5.6; \
|
||||
cp -r /root/vitastor vitastor-0.5.12; \
|
||||
ln -s /root/packages/qemu-$REL/qemu-*/ vitastor-0.5.12/qemu; \
|
||||
ln -s /root/fio-build/fio-*/ vitastor-0.5.12/fio; \
|
||||
cd vitastor-0.5.12; \
|
||||
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
QEMU=$(head -n1 qemu/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
sh copy-qemu-includes.sh; \
|
||||
@@ -59,8 +59,8 @@ RUN set -e -x; \
|
||||
echo "dep:fio=$FIO" > debian/substvars; \
|
||||
echo "dep:qemu=$QEMU" >> debian/substvars; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.5.6.orig.tar.xz vitastor-0.5.6; \
|
||||
cd vitastor-0.5.6; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.5.12.orig.tar.xz vitastor-0.5.12; \
|
||||
cd vitastor-0.5.12; \
|
||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||
|
@@ -1,51 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include <iostream>
|
||||
#include <functional>
|
||||
#include <array>
|
||||
#include <cstdlib> // for malloc() and free()
|
||||
using namespace std;
|
||||
|
||||
// replace operator new and delete to log allocations
|
||||
void* operator new(std::size_t n)
|
||||
{
|
||||
cout << "Allocating " << n << " bytes" << endl;
|
||||
return malloc(n);
|
||||
}
|
||||
|
||||
void operator delete(void* p) throw()
|
||||
{
|
||||
free(p);
|
||||
}
|
||||
|
||||
class test
|
||||
{
|
||||
public:
|
||||
std::string s;
|
||||
void a(std::function<void()> & f, const char *str)
|
||||
{
|
||||
auto l = [this, str]() { cout << str << " ? " << s << " from this\n"; };
|
||||
cout << "Assigning lambda3 of size " << sizeof(l) << endl;
|
||||
f = l;
|
||||
}
|
||||
};
|
||||
|
||||
int main()
|
||||
{
|
||||
std::array<char, 16> arr1;
|
||||
auto lambda1 = [arr1](){};
|
||||
cout << "Assigning lambda1 of size " << sizeof(lambda1) << endl;
|
||||
std::function<void()> f1 = lambda1;
|
||||
|
||||
std::array<char, 17> arr2;
|
||||
auto lambda2 = [arr2](){};
|
||||
cout << "Assigning lambda2 of size " << sizeof(lambda2) << endl;
|
||||
std::function<void()> f2 = lambda2;
|
||||
|
||||
test t;
|
||||
std::function<void()> f3;
|
||||
t.s = "str";
|
||||
t.a(f3, "huyambda");
|
||||
f3();
|
||||
}
|
110
mon/PGUtil.js
110
mon/PGUtil.js
@@ -5,18 +5,55 @@ module.exports = {
|
||||
scale_pg_count,
|
||||
};
|
||||
|
||||
function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_pg)
|
||||
{
|
||||
if (!new_pg_history[new_pg])
|
||||
{
|
||||
new_pg_history[new_pg] = {
|
||||
osd_sets: {},
|
||||
all_peers: {},
|
||||
epoch: 0,
|
||||
};
|
||||
}
|
||||
const nh = new_pg_history[new_pg], oh = prev_pg_history[old_pg];
|
||||
nh.osd_sets[prev_pgs[old_pg].join(' ')] = prev_pgs[old_pg];
|
||||
if (oh && oh.osd_sets && oh.osd_sets.length)
|
||||
{
|
||||
for (const pg of oh.osd_sets)
|
||||
{
|
||||
nh.osd_sets[pg.join(' ')] = pg;
|
||||
}
|
||||
}
|
||||
if (oh && oh.all_peers && oh.all_peers.length)
|
||||
{
|
||||
for (const osd_num of oh.all_peers)
|
||||
{
|
||||
nh.all_peers[osd_num] = Number(osd_num);
|
||||
}
|
||||
}
|
||||
if (oh && oh.epoch)
|
||||
{
|
||||
nh.epoch = nh.epoch < oh.epoch ? oh.epoch : nh.epoch;
|
||||
}
|
||||
}
|
||||
|
||||
function finish_pg_history(merged_history)
|
||||
{
|
||||
merged_history.osd_sets = Object.values(merged_history.osd_sets);
|
||||
merged_history.all_peers = Object.values(merged_history.all_peers);
|
||||
}
|
||||
|
||||
function scale_pg_count(prev_pgs, prev_pg_history, new_pg_history, new_pg_count)
|
||||
{
|
||||
const old_pg_count = prev_pgs.length;
|
||||
// Add all possibly intersecting PGs to the history of new PGs
|
||||
if (!(new_pg_count % old_pg_count))
|
||||
{
|
||||
// New PG count is a multiple of the old PG count
|
||||
const mul = (new_pg_count / old_pg_count);
|
||||
// New PG count is a multiple of old PG count
|
||||
for (let i = 0; i < new_pg_count; i++)
|
||||
{
|
||||
const old_i = Math.floor(new_pg_count / mul);
|
||||
new_pg_history[i] = prev_pg_history[old_i] ? JSON.parse(JSON.stringify(prev_pg_history[old_i])) : undefined;
|
||||
add_pg_history(new_pg_history, i, prev_pgs, prev_pg_history, i % old_pg_count);
|
||||
finish_pg_history(new_pg_history[i]);
|
||||
}
|
||||
}
|
||||
else if (!(old_pg_count % new_pg_count))
|
||||
@@ -25,68 +62,26 @@ function scale_pg_count(prev_pgs, prev_pg_history, new_pg_history, new_pg_count)
|
||||
const mul = (old_pg_count / new_pg_count);
|
||||
for (let i = 0; i < new_pg_count; i++)
|
||||
{
|
||||
new_pg_history[i] = {
|
||||
osd_sets: [],
|
||||
all_peers: [],
|
||||
epoch: 0,
|
||||
};
|
||||
for (let j = 0; j < mul; j++)
|
||||
{
|
||||
new_pg_history[i].osd_sets.push(prev_pgs[i*mul]);
|
||||
const hist = prev_pg_history[1+i*mul+j];
|
||||
if (hist && hist.osd_sets && hist.osd_sets.length)
|
||||
{
|
||||
Array.prototype.push.apply(new_pg_history[i].osd_sets, hist.osd_sets);
|
||||
}
|
||||
if (hist && hist.all_peers && hist.all_peers.length)
|
||||
{
|
||||
Array.prototype.push.apply(new_pg_history[i].all_peers, hist.all_peers);
|
||||
}
|
||||
if (hist && hist.epoch)
|
||||
{
|
||||
new_pg_history[i].epoch = new_pg_history[i].epoch < hist.epoch ? hist.epoch : new_pg_history[i].epoch;
|
||||
}
|
||||
add_pg_history(new_pg_history, i, prev_pgs, prev_pg_history, i+j*new_pg_count);
|
||||
}
|
||||
finish_pg_history(new_pg_history[i]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Any PG may intersect with any PG after non-multiple PG count change
|
||||
// So, merge ALL PGs history
|
||||
let all_sets = {};
|
||||
let all_peers = {};
|
||||
let max_epoch = 0;
|
||||
for (const pg of prev_pgs)
|
||||
let merged_history = {};
|
||||
for (let i = 0; i < old_pg_count; i++)
|
||||
{
|
||||
all_sets[pg.join(' ')] = pg;
|
||||
add_pg_history(merged_history, 1, prev_pgs, prev_pg_history, i);
|
||||
}
|
||||
for (const pg in prev_pg_history)
|
||||
{
|
||||
const hist = prev_pg_history[pg];
|
||||
if (hist && hist.osd_sets)
|
||||
{
|
||||
for (const pg of hist.osd_sets)
|
||||
{
|
||||
all_sets[pg.join(' ')] = pg;
|
||||
}
|
||||
}
|
||||
if (hist && hist.all_peers)
|
||||
{
|
||||
for (const osd_num of hist.all_peers)
|
||||
{
|
||||
all_peers[osd_num] = Number(osd_num);
|
||||
}
|
||||
}
|
||||
if (hist && hist.epoch)
|
||||
{
|
||||
max_epoch = max_epoch < hist.epoch ? hist.epoch : max_epoch;
|
||||
}
|
||||
}
|
||||
all_sets = Object.values(all_sets);
|
||||
all_peers = Object.values(all_peers);
|
||||
finish_pg_history(merged_history[1]);
|
||||
for (let i = 0; i < new_pg_count; i++)
|
||||
{
|
||||
new_pg_history[i] = { osd_sets: all_sets, all_peers, epoch: max_epoch };
|
||||
new_pg_history[i] = { ...merged_history[1] };
|
||||
}
|
||||
}
|
||||
// Mark history keys for removed PGs as removed
|
||||
@@ -94,19 +89,16 @@ function scale_pg_count(prev_pgs, prev_pg_history, new_pg_history, new_pg_count)
|
||||
{
|
||||
new_pg_history[i] = null;
|
||||
}
|
||||
// Just for the lp_solve optimizer - pick a "previous" PG for each "new" one
|
||||
if (old_pg_count < new_pg_count)
|
||||
{
|
||||
for (let i = new_pg_count-1; i >= 0; i--)
|
||||
for (let i = old_pg_count; i < new_pg_count; i++)
|
||||
{
|
||||
prev_pgs[i] = prev_pgs[Math.floor(i/new_pg_count*old_pg_count)];
|
||||
prev_pgs[i] = prev_pgs[i % old_pg_count];
|
||||
}
|
||||
}
|
||||
else if (old_pg_count > new_pg_count)
|
||||
{
|
||||
for (let i = 0; i < new_pg_count; i++)
|
||||
{
|
||||
prev_pgs[i] = prev_pgs[Math.round(i/new_pg_count*old_pg_count)];
|
||||
}
|
||||
prev_pgs.splice(new_pg_count, old_pg_count-new_pg_count);
|
||||
}
|
||||
}
|
||||
|
@@ -104,6 +104,17 @@ async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize =
|
||||
return res;
|
||||
}
|
||||
|
||||
function shuffle(array)
|
||||
{
|
||||
for (let i = array.length - 1, j, x; i > 0; i--)
|
||||
{
|
||||
j = Math.floor(Math.random() * (i + 1));
|
||||
x = array[i];
|
||||
array[i] = array[j];
|
||||
array[j] = x;
|
||||
}
|
||||
}
|
||||
|
||||
function make_int_pgs(weights, pg_count)
|
||||
{
|
||||
const total_weight = Object.values(weights).reduce((a, c) => Number(a) + Number(c), 0);
|
||||
@@ -120,6 +131,7 @@ function make_int_pgs(weights, pg_count)
|
||||
weight_left -= weights[pg_name];
|
||||
pg_left -= n;
|
||||
}
|
||||
shuffle(int_pgs);
|
||||
return int_pgs;
|
||||
}
|
||||
|
||||
|
@@ -53,7 +53,6 @@ ExecStart=/usr/bin/vitastor-osd \\
|
||||
--osd_num $OSD_NUM \\
|
||||
--disable_data_fsync 1 \\
|
||||
--immediate_commit all \\
|
||||
--flusher_count 256 \\
|
||||
--disk_alignment 4096 --journal_block_size 4096 --meta_block_size 4096 \\
|
||||
--journal_no_same_sector_overwrites true \\
|
||||
--journal_sector_buffer_count 1024 \\
|
||||
|
@@ -32,7 +32,8 @@ ExecStart=/usr/local/bin/etcd -name etcd$ETCD_NUM --data-dir /var/lib/etcd$ETCD_
|
||||
--advertise-client-urls http://$IP:2379 --listen-client-urls http://$IP:2379 \\
|
||||
--initial-advertise-peer-urls http://$IP:2380 --listen-peer-urls http://$IP:2380 \\
|
||||
--initial-cluster-token vitastor-etcd-1 --initial-cluster $ETCD_HOSTS \\
|
||||
--initial-cluster-state new --max-txn-ops=100000 --auto-compaction-retention=10 --auto-compaction-mode=revision
|
||||
--initial-cluster-state new --max-txn-ops=100000 --max-request-bytes=104857600 \\
|
||||
--auto-compaction-retention=10 --auto-compaction-mode=revision
|
||||
WorkingDirectory=/var/lib/etcd$ETCD_NUM.etcd
|
||||
ExecStartPre=+chown -R etcd /var/lib/etcd$ETCD_NUM.etcd
|
||||
User=etcd
|
||||
|
264
mon/mon.js
264
mon/mon.js
@@ -10,6 +10,14 @@ const stableStringify = require('./stable-stringify.js');
|
||||
const PGUtil = require('./PGUtil.js');
|
||||
|
||||
// FIXME document all etcd keys and config variables in the form of JSON schema or similar
|
||||
const etcd_nonempty_keys = {
|
||||
'config/global': 1,
|
||||
'config/node_placement': 1,
|
||||
'config/pools': 1,
|
||||
'config/pgs': 1,
|
||||
'history/last_clean_pgs': 1,
|
||||
'stats': 1,
|
||||
};
|
||||
const etcd_allow = new RegExp('^'+[
|
||||
'config/global',
|
||||
'config/node_placement',
|
||||
@@ -22,6 +30,7 @@ const etcd_allow = new RegExp('^'+[
|
||||
'pg/state/[1-9]\\d*/[1-9]\\d*',
|
||||
'pg/stats/[1-9]\\d*/[1-9]\\d*',
|
||||
'pg/history/[1-9]\\d*/[1-9]\\d*',
|
||||
'history/last_clean_pgs',
|
||||
'stats',
|
||||
].join('$|^')+'$');
|
||||
|
||||
@@ -34,7 +43,7 @@ const etcd_tree = {
|
||||
etcd_mon_retries: 5, // min: 0
|
||||
mon_change_timeout: 1000, // ms. min: 100
|
||||
mon_stats_timeout: 1000, // ms. min: 100
|
||||
osd_out_time: 1800, // seconds. min: 0
|
||||
osd_out_time: 600, // seconds. min: 0
|
||||
placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
|
||||
// client and osd
|
||||
use_sync_send_recv: false,
|
||||
@@ -46,6 +55,8 @@ const etcd_tree = {
|
||||
client_dirty_limit: 33554432,
|
||||
peer_connect_interval: 5, // seconds. min: 1
|
||||
peer_connect_timeout: 5, // seconds. min: 1
|
||||
osd_idle_timeout: 5, // seconds. min: 1
|
||||
osd_ping_timeout: 5, // seconds. min: 1
|
||||
up_wait_retry_interval: 500, // ms. min: 50
|
||||
// osd
|
||||
etcd_report_interval: 30, // min: 10
|
||||
@@ -55,8 +66,12 @@ const etcd_tree = {
|
||||
autosync_interval: 5,
|
||||
client_queue_depth: 128, // unused
|
||||
recovery_queue_depth: 4,
|
||||
recovery_sync_batch: 16,
|
||||
readonly: false,
|
||||
no_recovery: false,
|
||||
no_rebalance: false,
|
||||
print_stats_interval: 3,
|
||||
slow_log_interval: 10,
|
||||
// blockstore - fixed in superblock
|
||||
block_size,
|
||||
disk_alignment,
|
||||
@@ -76,7 +91,9 @@ const etcd_tree = {
|
||||
disable_meta_fsync,
|
||||
disable_device_lock,
|
||||
// blockstore - configurable
|
||||
flusher_count,
|
||||
max_write_iodepth,
|
||||
min_flusher_count: 1,
|
||||
max_flusher_count: 256,
|
||||
inmemory_metadata,
|
||||
inmemory_journal,
|
||||
journal_sector_buffer_count,
|
||||
@@ -166,7 +183,7 @@ const etcd_tree = {
|
||||
/* <pool_id>: {
|
||||
<pg_id>: {
|
||||
primary: osd_num_t,
|
||||
state: ("starting"|"peering"|"incomplete"|"active"|"stopping"|"offline"|
|
||||
state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
|
||||
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
|
||||
"has_invalid"|"left_on_dead")[],
|
||||
}
|
||||
@@ -213,6 +230,9 @@ const etcd_tree = {
|
||||
incomplete: uint64_t,
|
||||
}, */
|
||||
},
|
||||
history: {
|
||||
last_clean_pgs: {},
|
||||
},
|
||||
};
|
||||
|
||||
// FIXME Split into several files
|
||||
@@ -291,7 +311,7 @@ class Mon
|
||||
this.config.osd_out_time = Number(this.config.osd_out_time) || 0;
|
||||
if (!this.config.osd_out_time)
|
||||
{
|
||||
this.config.osd_out_time = 30*60; // 30 minutes by default
|
||||
this.config.osd_out_time = 600; // 10 minutes by default
|
||||
}
|
||||
}
|
||||
|
||||
@@ -313,8 +333,14 @@ class Mon
|
||||
ok(false);
|
||||
}, this.config.etcd_mon_timeout);
|
||||
this.ws = new WebSocket(base+'/watch');
|
||||
const fail = () =>
|
||||
{
|
||||
ok(false);
|
||||
};
|
||||
this.ws.on('error', fail);
|
||||
this.ws.on('open', () =>
|
||||
{
|
||||
this.ws.removeListener('error', fail);
|
||||
if (timer_id)
|
||||
clearTimeout(timer_id);
|
||||
ok(true);
|
||||
@@ -359,7 +385,7 @@ class Mon
|
||||
}
|
||||
else
|
||||
{
|
||||
let stats_changed = false, changed = false;
|
||||
let stats_changed = false, changed = false, pg_states_changed = false;
|
||||
if (this.verbose)
|
||||
{
|
||||
console.log('Revision '+data.result.header.revision+' events: ');
|
||||
@@ -373,6 +399,10 @@ class Mon
|
||||
{
|
||||
stats_changed = true;
|
||||
}
|
||||
else if (key.substr(0, 10) == '/pg/state/')
|
||||
{
|
||||
pg_states_changed = true;
|
||||
}
|
||||
else if (key != '/stats')
|
||||
{
|
||||
changed = true;
|
||||
@@ -382,6 +412,10 @@ class Mon
|
||||
console.log(JSON.stringify(e));
|
||||
}
|
||||
}
|
||||
if (pg_states_changed)
|
||||
{
|
||||
this.save_last_clean().catch(console.error);
|
||||
}
|
||||
if (stats_changed)
|
||||
{
|
||||
this.schedule_update_stats();
|
||||
@@ -394,10 +428,46 @@ class Mon
|
||||
});
|
||||
}
|
||||
|
||||
async save_last_clean()
|
||||
{
|
||||
// last_clean_pgs is used to avoid extra data move when observing a series of changes in the cluster
|
||||
for (const pool_id in this.state.config.pools)
|
||||
{
|
||||
const pool_cfg = this.state.config.pools[pool_id];
|
||||
if (!this.validate_pool_cfg(pool_id, pool_cfg, false))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++)
|
||||
{
|
||||
if (!this.state.pg.state[pool_id] ||
|
||||
!this.state.pg.state[pool_id][pg_num] ||
|
||||
!(this.state.pg.state[pool_id][pg_num].state instanceof Array))
|
||||
{
|
||||
// Unclean
|
||||
return;
|
||||
}
|
||||
let st = this.state.pg.state[pool_id][pg_num].state.join(',');
|
||||
if (st != 'active' && st != 'active,left_on_dead' && st != 'left_on_dead,active')
|
||||
{
|
||||
// Unclean
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
this.state.history.last_clean_pgs = JSON.parse(JSON.stringify(this.state.config.pgs));
|
||||
await this.etcd_call('/kv/txn', {
|
||||
success: [ { requestPut: {
|
||||
key: b64(this.etcd_prefix+'/history/last_clean_pgs'),
|
||||
value: b64(JSON.stringify(this.state.history.last_clean_pgs))
|
||||
} } ],
|
||||
}, this.etcd_start_timeout, 0);
|
||||
}
|
||||
|
||||
async get_lease()
|
||||
{
|
||||
const max_ttl = this.config.etcd_mon_ttl + this.config.etcd_mon_timeout/1000*this.config.etcd_mon_retries;
|
||||
const res = await this.etcd_call('/lease/grant', { TTL: max_ttl }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
|
||||
const res = await this.etcd_call('/lease/grant', { TTL: max_ttl }, this.config.etcd_mon_timeout, -1);
|
||||
this.etcd_lease_id = res.ID;
|
||||
setInterval(async () =>
|
||||
{
|
||||
@@ -472,7 +542,7 @@ class Mon
|
||||
for (const osd_num of this.all_osds().sort((a, b) => a - b))
|
||||
{
|
||||
const stat = this.state.osd.stats[osd_num];
|
||||
if (stat.size && (this.state.osd.state[osd_num] || Number(stat.time) >= down_time))
|
||||
if (stat && stat.size && (this.state.osd.state[osd_num] || Number(stat.time) >= down_time))
|
||||
{
|
||||
// Numeric IDs are reserved for OSDs
|
||||
const osd_cfg = this.state.config.osd[osd_num];
|
||||
@@ -573,34 +643,61 @@ class Mon
|
||||
return !has_online;
|
||||
}
|
||||
|
||||
reset_rng()
|
||||
{
|
||||
this.seed = 0x5f020e43;
|
||||
}
|
||||
|
||||
rng()
|
||||
{
|
||||
this.seed ^= this.seed << 13;
|
||||
this.seed ^= this.seed >> 17;
|
||||
this.seed ^= this.seed << 5;
|
||||
return this.seed + 2147483648;
|
||||
}
|
||||
|
||||
pick_primary(pool_id, osd_set, up_osds)
|
||||
{
|
||||
let alive_set;
|
||||
if (this.state.config.pools[pool_id].scheme === 'replicated')
|
||||
alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
|
||||
else
|
||||
{
|
||||
// Prefer data OSDs for EC because they can actually read something without an additional network hop
|
||||
const pg_data_size = (this.state.config.pools[pool_id].pg_size||0) -
|
||||
(this.state.config.pools[pool_id].parity_chunks||0);
|
||||
alive_set = osd_set.slice(0, pg_data_size).filter(osd_num => osd_num && up_osds[osd_num]);
|
||||
if (!alive_set.length)
|
||||
alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
|
||||
}
|
||||
if (!alive_set.length)
|
||||
return 0;
|
||||
return alive_set[this.rng() % alive_set.length];
|
||||
}
|
||||
|
||||
save_new_pgs_txn(request, pool_id, up_osds, prev_pgs, new_pgs, pg_history)
|
||||
{
|
||||
const replicated = new_pgs.length && this.state.config.pools[pool_id].scheme === 'replicated';
|
||||
const pg_minsize = new_pgs.length && this.state.config.pools[pool_id].pg_minsize;
|
||||
const pg_items = {};
|
||||
this.reset_rng();
|
||||
new_pgs.map((osd_set, i) =>
|
||||
{
|
||||
osd_set = osd_set.map(osd_num => osd_num === LPOptimizer.NO_OSD ? 0 : osd_num);
|
||||
let alive_set;
|
||||
if (replicated)
|
||||
alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
|
||||
else
|
||||
{
|
||||
// Prefer data OSDs for EC because they can actually read something without an additional network hop
|
||||
alive_set = osd_set.slice(0, pg_minsize).filter(osd_num => osd_num && up_osds[osd_num]);
|
||||
if (!alive_set.length)
|
||||
alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
|
||||
}
|
||||
pg_items[i+1] = {
|
||||
osd_set,
|
||||
primary: alive_set.length ? alive_set[Math.floor(Math.random()*alive_set.length)] : 0,
|
||||
primary: this.pick_primary(pool_id, osd_set, up_osds),
|
||||
};
|
||||
if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' '))
|
||||
if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' ') &&
|
||||
prev_pgs[i].filter(osd_num => osd_num).length > 0)
|
||||
{
|
||||
pg_history[i] = pg_history[i] || {};
|
||||
pg_history[i].osd_sets = pg_history[i].osd_sets || [];
|
||||
pg_history[i].osd_sets.push(prev_pgs[i]);
|
||||
}
|
||||
if (pg_history[i] && pg_history[i].osd_sets)
|
||||
{
|
||||
pg_history[i].osd_sets = Object.values(pg_history[i].osd_sets
|
||||
.reduce((a, c) => { a[c.join(' ')] = c; return a; }, {}));
|
||||
}
|
||||
});
|
||||
for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++)
|
||||
{
|
||||
@@ -751,7 +848,7 @@ class Mon
|
||||
{
|
||||
// Take configuration and state, check it against the stored configuration hash
|
||||
// Recalculate PGs and save them to etcd if the configuration is changed
|
||||
// FIXME: Also do not change anything if the distribution is good enough and no PGs are degraded
|
||||
// FIXME: Do not change anything if the distribution is good and random enough and no PGs are degraded
|
||||
const { up_osds, levels, osd_tree } = this.get_osd_tree();
|
||||
const tree_cfg = {
|
||||
osd_tree,
|
||||
@@ -791,13 +888,33 @@ class Mon
|
||||
pool_tree = pool_tree ? pool_tree.children : [];
|
||||
pool_tree = LPOptimizer.flatten_tree(pool_tree, levels, pool_cfg.failure_domain, 'osd');
|
||||
this.filter_osds_by_tags(osd_tree, pool_tree, pool_cfg.osd_tags);
|
||||
const prev_pgs = [];
|
||||
for (const pg in ((this.state.config.pgs.items||{})[pool_id]||{})||{})
|
||||
// These are for the purpose of building history.osd_sets
|
||||
const real_prev_pgs = [];
|
||||
let pg_history = [];
|
||||
for (const pg in ((this.state.config.pgs.items||{})[pool_id]||{}))
|
||||
{
|
||||
prev_pgs[pg-1] = this.state.config.pgs.items[pool_id][pg].osd_set;
|
||||
real_prev_pgs[pg-1] = this.state.config.pgs.items[pool_id][pg].osd_set;
|
||||
if (this.state.pg.history[pool_id] &&
|
||||
this.state.pg.history[pool_id][pg])
|
||||
{
|
||||
pg_history[pg-1] = this.state.pg.history[pool_id][pg];
|
||||
}
|
||||
}
|
||||
const pg_history = [];
|
||||
const old_pg_count = prev_pgs.length;
|
||||
// And these are for the purpose of minimizing data movement
|
||||
let prev_pgs = [];
|
||||
for (const pg in ((this.state.history.last_clean_pgs.items||{})[pool_id]||{}))
|
||||
{
|
||||
prev_pgs[pg-1] = this.state.history.last_clean_pgs.items[pool_id][pg].osd_set;
|
||||
}
|
||||
prev_pgs = JSON.parse(JSON.stringify(prev_pgs.length ? prev_pgs : real_prev_pgs));
|
||||
const old_pg_count = real_prev_pgs.length;
|
||||
const optimize_cfg = {
|
||||
osd_tree: pool_tree,
|
||||
pg_count: pool_cfg.pg_count,
|
||||
pg_size: pool_cfg.pg_size,
|
||||
pg_minsize: pool_cfg.pg_minsize,
|
||||
max_combinations: pool_cfg.max_osd_combinations,
|
||||
};
|
||||
let optimize_result;
|
||||
if (old_pg_count > 0)
|
||||
{
|
||||
@@ -809,7 +926,9 @@ class Mon
|
||||
this.schedule_recheck();
|
||||
return;
|
||||
}
|
||||
PGUtil.scale_pg_count(prev_pgs, this.state.pg.history[pool_id]||{}, pg_history, pool_cfg.pg_count);
|
||||
const new_pg_history = [];
|
||||
PGUtil.scale_pg_count(prev_pgs, pg_history, new_pg_history, pool_cfg.pg_count);
|
||||
pg_history = new_pg_history;
|
||||
}
|
||||
for (const pg of prev_pgs)
|
||||
{
|
||||
@@ -822,23 +941,22 @@ class Mon
|
||||
pg.pop();
|
||||
}
|
||||
}
|
||||
optimize_result = await LPOptimizer.optimize_change({
|
||||
prev_pgs,
|
||||
osd_tree: pool_tree,
|
||||
pg_size: pool_cfg.pg_size,
|
||||
pg_minsize: pool_cfg.pg_minsize,
|
||||
max_combinations: pool_cfg.max_osd_combinations,
|
||||
});
|
||||
if (!this.state.config.pgs.hash)
|
||||
{
|
||||
// Re-shuffle PGs
|
||||
optimize_result = await LPOptimizer.optimize_initial(optimize_cfg);
|
||||
}
|
||||
else
|
||||
{
|
||||
optimize_result = await LPOptimizer.optimize_change({
|
||||
prev_pgs,
|
||||
...optimize_cfg,
|
||||
});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
optimize_result = await LPOptimizer.optimize_initial({
|
||||
osd_tree: pool_tree,
|
||||
pg_count: pool_cfg.pg_count,
|
||||
pg_size: pool_cfg.pg_size,
|
||||
pg_minsize: pool_cfg.pg_minsize,
|
||||
max_combinations: pool_cfg.max_osd_combinations,
|
||||
});
|
||||
optimize_result = await LPOptimizer.optimize_initial(optimize_cfg);
|
||||
}
|
||||
if (old_pg_count != optimize_result.int_pgs.length)
|
||||
{
|
||||
@@ -846,16 +964,21 @@ class Mon
|
||||
`PG count for pool ${pool_id} (${pool_cfg.name || 'unnamed'})`+
|
||||
` changed from: ${old_pg_count} to ${optimize_result.int_pgs.length}`
|
||||
);
|
||||
// Drop stats
|
||||
etcd_request.success.push({ requestDeleteRange: {
|
||||
key: b64(this.etcd_prefix+'/pg/stats/'+pool_id+'/'),
|
||||
range_end: b64(this.etcd_prefix+'/pg/stats/'+pool_id+'0'),
|
||||
} });
|
||||
}
|
||||
LPOptimizer.print_change_stats(optimize_result);
|
||||
this.save_new_pgs_txn(etcd_request, pool_id, up_osds, prev_pgs, optimize_result.int_pgs, pg_history);
|
||||
this.save_new_pgs_txn(etcd_request, pool_id, up_osds, real_prev_pgs, optimize_result.int_pgs, pg_history);
|
||||
}
|
||||
this.state.config.pgs.hash = tree_hash;
|
||||
await this.save_pg_config(etcd_request);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Nothing changed, but we still want to check for down OSDs
|
||||
// Nothing changed, but we still want to recheck the distribution of primaries
|
||||
let changed = false;
|
||||
for (const pool_id in this.state.config.pools)
|
||||
{
|
||||
@@ -865,22 +988,13 @@ class Mon
|
||||
continue;
|
||||
}
|
||||
const replicated = pool_cfg.scheme === 'replicated';
|
||||
for (const pg_num in ((this.state.config.pgs.items||{})[pool_id]||{})||{})
|
||||
this.reset_rng();
|
||||
for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++)
|
||||
{
|
||||
const pg_cfg = this.state.config.pgs.items[pool_id][pg_num];
|
||||
if (!Number(pg_cfg.primary) || !up_osds[pg_cfg.primary])
|
||||
if (pg_cfg)
|
||||
{
|
||||
let alive_set;
|
||||
if (replicated)
|
||||
alive_set = pg_cfg.osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
|
||||
else
|
||||
{
|
||||
// Prefer data OSDs for EC because they can actually read something without an additional network hop
|
||||
alive_set = pg_cfg.osd_set.slice(0, pool_cfg.pg_minsize).filter(osd_num => osd_num && up_osds[osd_num]);
|
||||
if (!alive_set.length)
|
||||
alive_set = pg_cfg.osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
|
||||
}
|
||||
const new_primary = alive_set.length ? alive_set[Math.floor(Math.random()*alive_set.length)] : 0;
|
||||
const new_primary = this.pick_primary(pool_id, pg_cfg.osd_set, up_osds);
|
||||
if (pg_cfg.primary != new_primary)
|
||||
{
|
||||
console.log(
|
||||
@@ -970,7 +1084,7 @@ class Mon
|
||||
const op_stats = {}, subop_stats = {}, recovery_stats = {};
|
||||
for (const osd in this.state.osd.stats)
|
||||
{
|
||||
const st = this.state.osd.stats[osd];
|
||||
const st = this.state.osd.stats[osd]||{};
|
||||
for (const op in st.op_stats||{})
|
||||
{
|
||||
op_stats[op] = op_stats[op] || { count: 0n, usec: 0n, bytes: 0n };
|
||||
@@ -1043,11 +1157,14 @@ class Mon
|
||||
for (const pg_num in this.state.pg.stats[pool_id])
|
||||
{
|
||||
const st = this.state.pg.stats[pool_id][pg_num];
|
||||
for (const k in object_counts)
|
||||
if (st)
|
||||
{
|
||||
if (st[k+'_count'])
|
||||
for (const k in object_counts)
|
||||
{
|
||||
object_counts[k] += BigInt(st[k+'_count']);
|
||||
if (st[k+'_count'])
|
||||
{
|
||||
object_counts[k] += BigInt(st[k+'_count']);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1122,16 +1239,20 @@ class Mon
|
||||
console.log('Bad value in etcd: '+kv.key+' = '+kv.value);
|
||||
return;
|
||||
}
|
||||
key = key.split('/');
|
||||
let key_parts = key.split('/');
|
||||
let cur = this.state;
|
||||
for (let i = 0; i < key.length-1; i++)
|
||||
for (let i = 0; i < key_parts.length-1; i++)
|
||||
{
|
||||
cur = (cur[key[i]] = cur[key[i]] || {});
|
||||
cur = (cur[key_parts[i]] = cur[key_parts[i]] || {});
|
||||
}
|
||||
cur[key[key.length-1]] = kv.value;
|
||||
if (key.join('/') === 'config/global')
|
||||
if (etcd_nonempty_keys[key])
|
||||
{
|
||||
// Do not clear these to null
|
||||
kv.value = kv.value || {};
|
||||
}
|
||||
cur[key_parts[key_parts.length-1]] = kv.value;
|
||||
if (key === 'config/global')
|
||||
{
|
||||
this.state.config.global = this.state.config.global || {};
|
||||
this.config = this.state.config.global;
|
||||
this.check_config();
|
||||
for (const osd_num in this.state.osd.stats)
|
||||
@@ -1142,7 +1263,7 @@ class Mon
|
||||
);
|
||||
}
|
||||
}
|
||||
else if (key.join('/') === 'config/pools')
|
||||
else if (key === 'config/pools')
|
||||
{
|
||||
for (const pool_id in this.state.config.pools)
|
||||
{
|
||||
@@ -1151,7 +1272,7 @@ class Mon
|
||||
this.validate_pool_cfg(pool_id, pool_cfg, true);
|
||||
}
|
||||
}
|
||||
else if (key[0] === 'osd' && key[1] === 'stats')
|
||||
else if (key_parts[0] === 'osd' && key_parts[1] === 'stats')
|
||||
{
|
||||
// Recheck PGs <osd_out_time> later
|
||||
this.schedule_next_recheck_at(
|
||||
@@ -1183,6 +1304,11 @@ class Mon
|
||||
console.error('etcd returned error: '+res.json.error);
|
||||
break;
|
||||
}
|
||||
if (this.etcd_urls.length > 1)
|
||||
{
|
||||
// Stick to the same etcd for the rest of calls
|
||||
this.etcd_urls = [ base ];
|
||||
}
|
||||
return res.json;
|
||||
}
|
||||
retry++;
|
||||
|
@@ -48,4 +48,4 @@ FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Ve
|
||||
QEMU=`rpm -qi qemu qemu-kvm | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||
perl -i -pe 's/(Requires:\s*qemu(?:-kvm)?)([^\n]+)?/$1 = '$QEMU'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||
tar --transform 's#^#vitastor-0.5.6/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.5.6$(rpm --eval '%dist').tar.gz *
|
||||
tar --transform 's#^#vitastor-0.5.12/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.5.12$(rpm --eval '%dist').tar.gz *
|
||||
|
@@ -37,7 +37,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.5.6.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.5.12.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.5.6
|
||||
Version: 0.5.12
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.5.6.el7.tar.gz
|
||||
Source0: vitastor-0.5.12.el7.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.5.6.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.5.12.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.5.6
|
||||
Version: 0.5.12
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.5.6.el8.tar.gz
|
||||
Source0: vitastor-0.5.12.el8.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@@ -14,7 +14,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
||||
endif()
|
||||
|
||||
add_definitions(-DVERSION="0.6-dev")
|
||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith)
|
||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -I ${CMAKE_SOURCE_DIR}/src)
|
||||
if (${WITH_ASAN})
|
||||
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
||||
add_link_options(-fsanitize=address -fno-omit-frame-pointer)
|
||||
@@ -66,7 +66,8 @@ target_link_libraries(fio_vitastor_blk
|
||||
# vitastor-osd
|
||||
add_executable(vitastor-osd
|
||||
osd_main.cpp osd.cpp osd_secondary.cpp msgr_receive.cpp msgr_send.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp
|
||||
osd_primary.cpp osd_primary_subops.cpp etcd_state_client.cpp messenger.cpp osd_cluster.cpp http_client.cpp osd_ops.cpp pg_states.cpp
|
||||
osd_primary.cpp osd_primary_sync.cpp osd_primary_write.cpp osd_primary_subops.cpp
|
||||
etcd_state_client.cpp messenger.cpp msgr_stop.cpp msgr_op.cpp osd_cluster.cpp http_client.cpp osd_ops.cpp pg_states.cpp
|
||||
osd_rmw.cpp base64.cpp timerfd_manager.cpp epoll_manager.cpp ../json11/json11.cpp
|
||||
)
|
||||
target_link_libraries(vitastor-osd
|
||||
@@ -86,7 +87,7 @@ target_link_libraries(fio_vitastor_sec
|
||||
# libvitastor_client.so
|
||||
add_library(vitastor_client SHARED
|
||||
cluster_client.cpp epoll_manager.cpp etcd_state_client.cpp
|
||||
messenger.cpp msgr_send.cpp msgr_receive.cpp ringloop.cpp ../json11/json11.cpp
|
||||
messenger.cpp msgr_stop.cpp msgr_op.cpp msgr_send.cpp msgr_receive.cpp ringloop.cpp ../json11/json11.cpp
|
||||
http_client.cpp osd_ops.cpp pg_states.cpp timerfd_manager.cpp base64.cpp
|
||||
)
|
||||
target_link_libraries(vitastor_client
|
||||
@@ -161,7 +162,8 @@ target_link_libraries(osd_rmw_test Jerasure tcmalloc_minimal)
|
||||
|
||||
# stub_uring_osd
|
||||
add_executable(stub_uring_osd
|
||||
stub_uring_osd.cpp epoll_manager.cpp messenger.cpp msgr_send.cpp msgr_receive.cpp ringloop.cpp timerfd_manager.cpp ../json11/json11.cpp
|
||||
stub_uring_osd.cpp epoll_manager.cpp messenger.cpp msgr_stop.cpp msgr_op.cpp
|
||||
msgr_send.cpp msgr_receive.cpp ringloop.cpp timerfd_manager.cpp ../json11/json11.cpp
|
||||
)
|
||||
target_link_libraries(stub_uring_osd
|
||||
${LIBURING_LIBRARIES}
|
||||
@@ -175,6 +177,15 @@ target_link_libraries(osd_peering_pg_test tcmalloc_minimal)
|
||||
# test_allocator
|
||||
add_executable(test_allocator test_allocator.cpp allocator.cpp)
|
||||
|
||||
# test_cluster_client
|
||||
add_executable(test_cluster_client
|
||||
test_cluster_client.cpp
|
||||
pg_states.cpp osd_ops.cpp cluster_client.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
|
||||
etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp
|
||||
)
|
||||
target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
|
||||
target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mock)
|
||||
|
||||
## test_blockstore, test_shit
|
||||
#add_executable(test_blockstore test_blockstore.cpp timerfd_interval.cpp)
|
||||
#target_link_libraries(test_blockstore blockstore)
|
||||
|
@@ -13,19 +13,19 @@ allocator::allocator(uint64_t blocks)
|
||||
{
|
||||
throw std::invalid_argument("blocks");
|
||||
}
|
||||
uint64_t p2 = 1, total = 1;
|
||||
uint64_t p2 = 1;
|
||||
total = 0;
|
||||
while (p2 * 64 < blocks)
|
||||
{
|
||||
p2 = p2 * 64;
|
||||
total += p2;
|
||||
p2 = p2 * 64;
|
||||
}
|
||||
total -= p2;
|
||||
total += (blocks+63) / 64;
|
||||
mask = new uint64_t[2 + total];
|
||||
mask = new uint64_t[total];
|
||||
size = free = blocks;
|
||||
last_one_mask = (blocks % 64) == 0
|
||||
? UINT64_MAX
|
||||
: ~(UINT64_MAX << (64 - blocks % 64));
|
||||
: ((1l << (blocks % 64)) - 1);
|
||||
for (uint64_t i = 0; i < total; i++)
|
||||
{
|
||||
mask[i] = 0;
|
||||
@@ -37,6 +37,21 @@ allocator::~allocator()
|
||||
delete[] mask;
|
||||
}
|
||||
|
||||
bool allocator::get(uint64_t addr)
|
||||
{
|
||||
if (addr >= size)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
uint64_t p2 = 1, offset = 0;
|
||||
while (p2 * 64 < size)
|
||||
{
|
||||
offset += p2;
|
||||
p2 = p2 * 64;
|
||||
}
|
||||
return ((mask[offset + addr/64] >> (addr % 64)) & 1);
|
||||
}
|
||||
|
||||
void allocator::set(uint64_t addr, bool value)
|
||||
{
|
||||
if (addr >= size)
|
||||
@@ -99,6 +114,10 @@ uint64_t allocator::find_free()
|
||||
uint64_t p2 = 1, offset = 0, addr = 0, f, i;
|
||||
while (p2 < size)
|
||||
{
|
||||
if (offset+addr >= total)
|
||||
{
|
||||
return UINT64_MAX;
|
||||
}
|
||||
uint64_t m = mask[offset + addr];
|
||||
for (i = 0, f = 1; i < 64; i++, f <<= 1)
|
||||
{
|
||||
@@ -113,11 +132,6 @@ uint64_t allocator::find_free()
|
||||
return UINT64_MAX;
|
||||
}
|
||||
addr = (addr * 64) | i;
|
||||
if (addr >= size)
|
||||
{
|
||||
// No space
|
||||
return UINT64_MAX;
|
||||
}
|
||||
offset += p2;
|
||||
p2 = p2 * 64;
|
||||
}
|
||||
|
@@ -8,6 +8,7 @@
|
||||
// Hierarchical bitmap allocator
|
||||
class allocator
|
||||
{
|
||||
uint64_t total;
|
||||
uint64_t size;
|
||||
uint64_t free;
|
||||
uint64_t last_one_mask;
|
||||
@@ -15,6 +16,7 @@ class allocator
|
||||
public:
|
||||
allocator(uint64_t blocks);
|
||||
~allocator();
|
||||
bool get(uint64_t addr);
|
||||
void set(uint64_t addr, bool value);
|
||||
uint64_t find_free();
|
||||
uint64_t get_free_count();
|
||||
|
@@ -35,12 +35,7 @@ bool blockstore_t::is_safe_to_stop()
|
||||
|
||||
void blockstore_t::enqueue_op(blockstore_op_t *op)
|
||||
{
|
||||
impl->enqueue_op(op, false);
|
||||
}
|
||||
|
||||
void blockstore_t::enqueue_op_first(blockstore_op_t *op)
|
||||
{
|
||||
impl->enqueue_op(op, true);
|
||||
impl->enqueue_op(op);
|
||||
}
|
||||
|
||||
std::unordered_map<object_id, uint64_t> & blockstore_t::get_unstable_writes()
|
||||
@@ -63,7 +58,7 @@ uint64_t blockstore_t::get_free_block_count()
|
||||
return impl->get_free_block_count();
|
||||
}
|
||||
|
||||
uint32_t blockstore_t::get_disk_alignment()
|
||||
uint32_t blockstore_t::get_bitmap_granularity()
|
||||
{
|
||||
return impl->get_disk_alignment();
|
||||
return impl->get_bitmap_granularity();
|
||||
}
|
||||
|
@@ -175,10 +175,6 @@ public:
|
||||
// Submission
|
||||
void enqueue_op(blockstore_op_t *op);
|
||||
|
||||
// Insert operation into the beginning of the queue
|
||||
// Intended for the OSD syncer "thread" to be able to stabilize something when the journal is full
|
||||
void enqueue_op_first(blockstore_op_t *op);
|
||||
|
||||
// Unstable writes are added here (map of object_id -> version)
|
||||
std::unordered_map<object_id, uint64_t> & get_unstable_writes();
|
||||
|
||||
@@ -187,5 +183,5 @@ public:
|
||||
uint64_t get_block_count();
|
||||
uint64_t get_free_block_count();
|
||||
|
||||
uint32_t get_disk_alignment();
|
||||
uint32_t get_bitmap_granularity();
|
||||
};
|
||||
|
@@ -3,12 +3,13 @@
|
||||
|
||||
#include "blockstore_impl.h"
|
||||
|
||||
journal_flusher_t::journal_flusher_t(int flusher_count, blockstore_impl_t *bs)
|
||||
journal_flusher_t::journal_flusher_t(blockstore_impl_t *bs)
|
||||
{
|
||||
this->bs = bs;
|
||||
this->flusher_count = flusher_count;
|
||||
this->cur_flusher_count = 1;
|
||||
this->target_flusher_count = 1;
|
||||
this->max_flusher_count = bs->max_flusher_count;
|
||||
this->min_flusher_count = bs->min_flusher_count;
|
||||
this->cur_flusher_count = bs->min_flusher_count;
|
||||
this->target_flusher_count = bs->min_flusher_count;
|
||||
dequeuing = false;
|
||||
trimming = false;
|
||||
active_flushers = 0;
|
||||
@@ -19,8 +20,8 @@ journal_flusher_t::journal_flusher_t(int flusher_count, blockstore_impl_t *bs)
|
||||
journal_trim_counter = 0;
|
||||
trim_wanted = 0;
|
||||
journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign_or_die(MEM_ALIGNMENT, bs->journal_block_size);
|
||||
co = new journal_flusher_co[flusher_count];
|
||||
for (int i = 0; i < flusher_count; i++)
|
||||
co = new journal_flusher_co[max_flusher_count];
|
||||
for (int i = 0; i < max_flusher_count; i++)
|
||||
{
|
||||
co[i].bs = bs;
|
||||
co[i].flusher = this;
|
||||
@@ -71,10 +72,10 @@ bool journal_flusher_t::is_active()
|
||||
void journal_flusher_t::loop()
|
||||
{
|
||||
target_flusher_count = bs->write_iodepth*2;
|
||||
if (target_flusher_count <= 0)
|
||||
target_flusher_count = 1;
|
||||
else if (target_flusher_count > flusher_count)
|
||||
target_flusher_count = flusher_count;
|
||||
if (target_flusher_count < min_flusher_count)
|
||||
target_flusher_count = min_flusher_count;
|
||||
else if (target_flusher_count > max_flusher_count)
|
||||
target_flusher_count = max_flusher_count;
|
||||
if (target_flusher_count > cur_flusher_count)
|
||||
cur_flusher_count = target_flusher_count;
|
||||
else if (target_flusher_count < cur_flusher_count)
|
||||
@@ -237,7 +238,8 @@ bool journal_flusher_co::loop()
|
||||
else if (wait_state == 21)
|
||||
goto resume_21;
|
||||
resume_0:
|
||||
if (!flusher->flush_queue.size() || !flusher->dequeuing)
|
||||
if (flusher->flush_queue.size() < flusher->min_flusher_count && !flusher->trim_wanted ||
|
||||
!flusher->flush_queue.size() || !flusher->dequeuing)
|
||||
{
|
||||
stop_flusher:
|
||||
if (flusher->trim_wanted > 0 && flusher->journal_trim_counter > 0)
|
||||
@@ -482,6 +484,14 @@ resume_1:
|
||||
}
|
||||
if (has_delete)
|
||||
{
|
||||
clean_disk_entry *new_entry = (clean_disk_entry*)(meta_new.buf + meta_new.pos*bs->clean_entry_size);
|
||||
if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid)
|
||||
{
|
||||
printf("Fatal error (metadata corruption or bug): tried to delete metadata entry %lu (%lx:%lx) while deleting %lx:%lx\n",
|
||||
clean_loc >> bs->block_order, new_entry->oid.inode, new_entry->oid.stripe, cur.oid.inode, cur.oid.stripe);
|
||||
exit(1);
|
||||
}
|
||||
// zero out new metadata entry
|
||||
memset(meta_new.buf + meta_new.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
|
||||
}
|
||||
else
|
||||
@@ -646,7 +656,7 @@ bool journal_flusher_co::scan_dirty(int wait_base)
|
||||
{
|
||||
char err[1024];
|
||||
snprintf(
|
||||
err, 1024, "BUG: Unexpected dirty_entry %lx:%lx v%lu unstable state during flush: %d",
|
||||
err, 1024, "BUG: Unexpected dirty_entry %lx:%lx v%lu unstable state during flush: 0x%x",
|
||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, dirty_it->second.state
|
||||
);
|
||||
throw std::runtime_error(err);
|
||||
@@ -775,7 +785,10 @@ void journal_flusher_co::update_clean_db()
|
||||
if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc)
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Free block %lu (new location is %lu)\n", old_clean_loc >> bs->block_order, clean_loc >> bs->block_order);
|
||||
printf("Free block %lu from %lx:%lx v%lu (new location is %lu)\n",
|
||||
old_clean_loc >> bs->block_order,
|
||||
cur.oid.inode, cur.oid.stripe, cur.version,
|
||||
clean_loc >> bs->block_order);
|
||||
#endif
|
||||
bs->data_alloc->set(old_clean_loc >> bs->block_order, false);
|
||||
}
|
||||
@@ -783,6 +796,11 @@ void journal_flusher_co::update_clean_db()
|
||||
{
|
||||
auto clean_it = bs->clean_db.find(cur.oid);
|
||||
bs->clean_db.erase(clean_it);
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Free block %lu from %lx:%lx v%lu (delete)\n",
|
||||
clean_loc >> bs->block_order,
|
||||
cur.oid.inode, cur.oid.stripe, cur.version);
|
||||
#endif
|
||||
bs->data_alloc->set(clean_loc >> bs->block_order, false);
|
||||
clean_loc = UINT64_MAX;
|
||||
}
|
||||
@@ -804,7 +822,7 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
|
||||
goto resume_1;
|
||||
else if (wait_state == wait_base+2)
|
||||
goto resume_2;
|
||||
if (!(fsync_meta ? bs->disable_meta_fsync : bs->disable_journal_fsync))
|
||||
if (!(fsync_meta ? bs->disable_meta_fsync : bs->disable_data_fsync))
|
||||
{
|
||||
cur_sync = flusher->syncs.end();
|
||||
while (cur_sync != flusher->syncs.begin())
|
||||
@@ -823,31 +841,34 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
|
||||
sync_found:
|
||||
cur_sync->ready_count++;
|
||||
flusher->syncing_flushers++;
|
||||
if (flusher->syncing_flushers >= flusher->flusher_count || !flusher->flush_queue.size())
|
||||
resume_1:
|
||||
if (!cur_sync->state)
|
||||
{
|
||||
// Sync batch is ready. Do it.
|
||||
await_sqe(0);
|
||||
data->iov = { 0 };
|
||||
data->callback = simple_callback_w;
|
||||
my_uring_prep_fsync(sqe, fsync_meta ? bs->meta_fd : bs->data_fd, IORING_FSYNC_DATASYNC);
|
||||
cur_sync->state = 1;
|
||||
wait_count++;
|
||||
resume_1:
|
||||
if (wait_count > 0)
|
||||
if (flusher->syncing_flushers >= flusher->cur_flusher_count || !flusher->flush_queue.size())
|
||||
{
|
||||
// Sync batch is ready. Do it.
|
||||
await_sqe(0);
|
||||
data->iov = { 0 };
|
||||
data->callback = simple_callback_w;
|
||||
my_uring_prep_fsync(sqe, fsync_meta ? bs->meta_fd : bs->data_fd, IORING_FSYNC_DATASYNC);
|
||||
cur_sync->state = 1;
|
||||
wait_count++;
|
||||
resume_2:
|
||||
if (wait_count > 0)
|
||||
{
|
||||
wait_state = 2;
|
||||
return false;
|
||||
}
|
||||
// Sync completed. All previous coroutines waiting for it must be resumed
|
||||
cur_sync->state = 2;
|
||||
bs->ringloop->wakeup();
|
||||
}
|
||||
else
|
||||
{
|
||||
// Wait until someone else sends and completes a sync.
|
||||
wait_state = 1;
|
||||
return false;
|
||||
}
|
||||
// Sync completed. All previous coroutines waiting for it must be resumed
|
||||
cur_sync->state = 2;
|
||||
bs->ringloop->wakeup();
|
||||
}
|
||||
// Wait until someone else sends and completes a sync.
|
||||
resume_2:
|
||||
if (!cur_sync->state)
|
||||
{
|
||||
wait_state = 2;
|
||||
return false;
|
||||
}
|
||||
flusher->syncing_flushers--;
|
||||
cur_sync->ready_count--;
|
||||
|
@@ -80,7 +80,7 @@ class journal_flusher_t
|
||||
{
|
||||
int trim_wanted = 0;
|
||||
bool dequeuing;
|
||||
int flusher_count, cur_flusher_count, target_flusher_count;
|
||||
int min_flusher_count, max_flusher_count, cur_flusher_count, target_flusher_count;
|
||||
int flusher_start_threshold;
|
||||
journal_flusher_co *co;
|
||||
blockstore_impl_t *bs;
|
||||
@@ -99,7 +99,7 @@ class journal_flusher_t
|
||||
std::deque<object_id> flush_queue;
|
||||
std::map<object_id, uint64_t> flush_versions;
|
||||
public:
|
||||
journal_flusher_t(int flusher_count, blockstore_impl_t *bs);
|
||||
journal_flusher_t(blockstore_impl_t *bs);
|
||||
~journal_flusher_t();
|
||||
void loop();
|
||||
bool is_active();
|
||||
|
@@ -10,9 +10,9 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
|
||||
ring_consumer.loop = [this]() { loop(); };
|
||||
ringloop->register_consumer(&ring_consumer);
|
||||
initialized = 0;
|
||||
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, block_size);
|
||||
data_fd = meta_fd = journal.fd = -1;
|
||||
parse_config(config);
|
||||
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, block_size);
|
||||
try
|
||||
{
|
||||
open_data();
|
||||
@@ -31,7 +31,7 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
|
||||
close(journal.fd);
|
||||
throw;
|
||||
}
|
||||
flusher = new journal_flusher_t(flusher_count, this);
|
||||
flusher = new journal_flusher_t(this);
|
||||
}
|
||||
|
||||
blockstore_impl_t::~blockstore_impl_t()
|
||||
@@ -101,26 +101,14 @@ void blockstore_impl_t::loop()
|
||||
{
|
||||
// try to submit ops
|
||||
unsigned initial_ring_space = ringloop->space_left();
|
||||
// FIXME: rework this "sync polling"
|
||||
auto cur_sync = in_progress_syncs.begin();
|
||||
while (cur_sync != in_progress_syncs.end())
|
||||
// has_writes == 0 - no writes before the current queue item
|
||||
// has_writes == 1 - some writes in progress
|
||||
// has_writes == 2 - tried to submit some writes, but failed
|
||||
int has_writes = 0, op_idx = 0, new_idx = 0;
|
||||
for (; op_idx < submit_queue.size(); op_idx++, new_idx++)
|
||||
{
|
||||
if (continue_sync(*cur_sync) != 2)
|
||||
{
|
||||
// List is unmodified
|
||||
cur_sync++;
|
||||
}
|
||||
else
|
||||
{
|
||||
cur_sync = in_progress_syncs.begin();
|
||||
}
|
||||
}
|
||||
auto cur = submit_queue.begin();
|
||||
int has_writes = 0;
|
||||
while (cur != submit_queue.end())
|
||||
{
|
||||
auto op_ptr = cur;
|
||||
auto op = *(cur++);
|
||||
auto op = submit_queue[op_idx];
|
||||
submit_queue[new_idx] = op;
|
||||
// FIXME: This needs some simplification
|
||||
// Writes should not block reads if the ring is not full and reads don't depend on them
|
||||
// In all other cases we should stop submission
|
||||
@@ -142,10 +130,13 @@ void blockstore_impl_t::loop()
|
||||
}
|
||||
unsigned ring_space = ringloop->space_left();
|
||||
unsigned prev_sqe_pos = ringloop->save();
|
||||
bool dequeue_op = false;
|
||||
// 0 = can't submit
|
||||
// 1 = in progress
|
||||
// 2 = can be removed from queue
|
||||
int wr_st = 0;
|
||||
if (op->opcode == BS_OP_READ)
|
||||
{
|
||||
dequeue_op = dequeue_read(op);
|
||||
wr_st = dequeue_read(op);
|
||||
}
|
||||
else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE)
|
||||
{
|
||||
@@ -154,8 +145,8 @@ void blockstore_impl_t::loop()
|
||||
// Some writes already could not be submitted
|
||||
continue;
|
||||
}
|
||||
dequeue_op = dequeue_write(op);
|
||||
has_writes = dequeue_op ? 1 : 2;
|
||||
wr_st = dequeue_write(op);
|
||||
has_writes = wr_st > 0 ? 1 : 2;
|
||||
}
|
||||
else if (op->opcode == BS_OP_DELETE)
|
||||
{
|
||||
@@ -164,8 +155,8 @@ void blockstore_impl_t::loop()
|
||||
// Some writes already could not be submitted
|
||||
continue;
|
||||
}
|
||||
dequeue_op = dequeue_del(op);
|
||||
has_writes = dequeue_op ? 1 : 2;
|
||||
wr_st = dequeue_del(op);
|
||||
has_writes = wr_st > 0 ? 1 : 2;
|
||||
}
|
||||
else if (op->opcode == BS_OP_SYNC)
|
||||
{
|
||||
@@ -178,29 +169,31 @@ void blockstore_impl_t::loop()
|
||||
// Can't submit SYNC before previous writes
|
||||
continue;
|
||||
}
|
||||
dequeue_op = dequeue_sync(op);
|
||||
wr_st = continue_sync(op, false);
|
||||
if (wr_st != 2)
|
||||
{
|
||||
has_writes = wr_st > 0 ? 1 : 2;
|
||||
}
|
||||
}
|
||||
else if (op->opcode == BS_OP_STABLE)
|
||||
{
|
||||
dequeue_op = dequeue_stable(op);
|
||||
wr_st = dequeue_stable(op);
|
||||
}
|
||||
else if (op->opcode == BS_OP_ROLLBACK)
|
||||
{
|
||||
dequeue_op = dequeue_rollback(op);
|
||||
wr_st = dequeue_rollback(op);
|
||||
}
|
||||
else if (op->opcode == BS_OP_LIST)
|
||||
{
|
||||
// LIST doesn't need to be blocked by previous modifications,
|
||||
// it only needs to include all in-progress writes as they're guaranteed
|
||||
// to be readable and stabilizable/rollbackable by subsequent operations
|
||||
// LIST doesn't need to be blocked by previous modifications
|
||||
process_list(op);
|
||||
dequeue_op = true;
|
||||
wr_st = 2;
|
||||
}
|
||||
if (dequeue_op)
|
||||
if (wr_st == 2)
|
||||
{
|
||||
submit_queue.erase(op_ptr);
|
||||
new_idx--;
|
||||
}
|
||||
else
|
||||
if (wr_st == 0)
|
||||
{
|
||||
ringloop->restore(prev_sqe_pos);
|
||||
if (PRIV(op)->wait_for == WAIT_SQE)
|
||||
@@ -211,6 +204,14 @@ void blockstore_impl_t::loop()
|
||||
}
|
||||
}
|
||||
}
|
||||
if (op_idx != new_idx)
|
||||
{
|
||||
while (op_idx < submit_queue.size())
|
||||
{
|
||||
submit_queue[new_idx++] = submit_queue[op_idx++];
|
||||
}
|
||||
submit_queue.resize(new_idx);
|
||||
}
|
||||
if (!readonly)
|
||||
{
|
||||
flusher->loop();
|
||||
@@ -233,7 +234,7 @@ bool blockstore_impl_t::is_safe_to_stop()
|
||||
{
|
||||
// It's safe to stop blockstore when there are no in-flight operations,
|
||||
// no in-progress syncs and flusher isn't doing anything
|
||||
if (submit_queue.size() > 0 || in_progress_syncs.size() > 0 || !readonly && flusher->is_active())
|
||||
if (submit_queue.size() > 0 || !readonly && flusher->is_active())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
@@ -300,7 +301,7 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
|
||||
}
|
||||
else if (PRIV(op)->wait_for == WAIT_FREE)
|
||||
{
|
||||
if (!data_alloc->get_free_count() && !flusher->is_active())
|
||||
if (!data_alloc->get_free_count() && flusher->is_active())
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Still waiting for free space on the data device\n");
|
||||
@@ -315,7 +316,7 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
|
||||
}
|
||||
}
|
||||
|
||||
void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
|
||||
void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
|
||||
{
|
||||
if (op->opcode < BS_OP_MIN || op->opcode > BS_OP_MAX ||
|
||||
((op->opcode == BS_OP_READ || op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE) && (
|
||||
@@ -323,8 +324,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
|
||||
op->len > block_size-op->offset ||
|
||||
(op->len % disk_alignment)
|
||||
)) ||
|
||||
readonly && op->opcode != BS_OP_READ && op->opcode != BS_OP_LIST ||
|
||||
first && (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE))
|
||||
readonly && op->opcode != BS_OP_READ && op->opcode != BS_OP_LIST)
|
||||
{
|
||||
// Basic verification not passed
|
||||
op->retval = -EINVAL;
|
||||
@@ -374,25 +374,12 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
|
||||
std::function<void (blockstore_op_t*)>(op->callback)(op);
|
||||
return;
|
||||
}
|
||||
if (op->opcode == BS_OP_SYNC && immediate_commit == IMMEDIATE_ALL)
|
||||
{
|
||||
op->retval = 0;
|
||||
std::function<void (blockstore_op_t*)>(op->callback)(op);
|
||||
return;
|
||||
}
|
||||
// Call constructor without allocating memory. We'll call destructor before returning op back
|
||||
new ((void*)op->private_data) blockstore_op_private_t;
|
||||
PRIV(op)->wait_for = 0;
|
||||
PRIV(op)->op_state = 0;
|
||||
PRIV(op)->pending_ops = 0;
|
||||
if (!first)
|
||||
{
|
||||
submit_queue.push_back(op);
|
||||
}
|
||||
else
|
||||
{
|
||||
submit_queue.push_front(op);
|
||||
}
|
||||
submit_queue.push_back(op);
|
||||
ringloop->wakeup();
|
||||
}
|
||||
|
||||
|
@@ -160,8 +160,6 @@ struct blockstore_op_private_t
|
||||
// Sync
|
||||
std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
|
||||
int sync_small_checked, sync_big_checked;
|
||||
std::list<blockstore_op_t*>::iterator in_progress_ptr;
|
||||
int prev_sync_count;
|
||||
};
|
||||
|
||||
// https://github.com/algorithm-ninja/cpp-btree
|
||||
@@ -199,8 +197,8 @@ class blockstore_impl_t
|
||||
// Suitable only for server SSDs with capacitors, requires disabled data and journal fsyncs
|
||||
int immediate_commit = IMMEDIATE_NONE;
|
||||
bool inmemory_meta = false;
|
||||
// Maximum flusher count
|
||||
unsigned flusher_count;
|
||||
// Maximum and minimum flusher count
|
||||
unsigned max_flusher_count, min_flusher_count;
|
||||
// Maximum queue depth
|
||||
unsigned max_write_iodepth = 128;
|
||||
/******* END OF OPTIONS *******/
|
||||
@@ -210,9 +208,9 @@ class blockstore_impl_t
|
||||
blockstore_clean_db_t clean_db;
|
||||
uint8_t *clean_bitmap = NULL;
|
||||
blockstore_dirty_db_t dirty_db;
|
||||
std::list<blockstore_op_t*> submit_queue; // FIXME: funny thing is that vector is better here
|
||||
std::vector<blockstore_op_t*> submit_queue;
|
||||
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
|
||||
std::list<blockstore_op_t*> in_progress_syncs; // ...and probably here, too
|
||||
int unsynced_big_write_count = 0;
|
||||
allocator *data_alloc = NULL;
|
||||
uint8_t *zero_object;
|
||||
|
||||
@@ -271,6 +269,7 @@ class blockstore_impl_t
|
||||
|
||||
// Write
|
||||
bool enqueue_write(blockstore_op_t *op);
|
||||
void cancel_all_writes(blockstore_op_t *op, blockstore_dirty_db_t::iterator dirty_it, int retval);
|
||||
int dequeue_write(blockstore_op_t *op);
|
||||
int dequeue_del(blockstore_op_t *op);
|
||||
int continue_write(blockstore_op_t *op);
|
||||
@@ -278,16 +277,14 @@ class blockstore_impl_t
|
||||
void handle_write_event(ring_data_t *data, blockstore_op_t *op);
|
||||
|
||||
// Sync
|
||||
int dequeue_sync(blockstore_op_t *op);
|
||||
int continue_sync(blockstore_op_t *op, bool queue_has_in_progress_sync);
|
||||
void handle_sync_event(ring_data_t *data, blockstore_op_t *op);
|
||||
int continue_sync(blockstore_op_t *op);
|
||||
void ack_one_sync(blockstore_op_t *op);
|
||||
int ack_sync(blockstore_op_t *op);
|
||||
void ack_sync(blockstore_op_t *op);
|
||||
|
||||
// Stabilize
|
||||
int dequeue_stable(blockstore_op_t *op);
|
||||
int continue_stable(blockstore_op_t *op);
|
||||
void mark_stable(const obj_ver_id & ov);
|
||||
void mark_stable(const obj_ver_id & ov, bool forget_dirty = false);
|
||||
void handle_stable_event(ring_data_t *data, blockstore_op_t *op);
|
||||
void stabilize_object(object_id oid, uint64_t max_ver);
|
||||
|
||||
@@ -322,7 +319,7 @@ public:
|
||||
bool is_stalled();
|
||||
|
||||
// Submission
|
||||
void enqueue_op(blockstore_op_t *op, bool first = false);
|
||||
void enqueue_op(blockstore_op_t *op);
|
||||
|
||||
// Unstable writes are added here (map of object_id -> version)
|
||||
std::unordered_map<object_id, uint64_t> unstable_writes;
|
||||
@@ -330,5 +327,5 @@ public:
|
||||
inline uint32_t get_block_size() { return block_size; }
|
||||
inline uint64_t get_block_count() { return block_count; }
|
||||
inline uint64_t get_free_block_count() { return data_alloc->get_free_count(); }
|
||||
inline uint32_t get_disk_alignment() { return disk_alignment; }
|
||||
inline uint32_t get_bitmap_granularity() { return disk_alignment; }
|
||||
};
|
||||
|
@@ -111,7 +111,10 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
|
||||
{
|
||||
// free the previous block
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Free block %lu (new location is %lu)\n", clean_it->second.location >> block_order, done_cnt+i);
|
||||
printf("Free block %lu from %lx:%lx v%lu (new location is %lu)\n",
|
||||
clean_it->second.location >> block_order,
|
||||
clean_it->first.inode, clean_it->first.stripe, clean_it->second.version,
|
||||
done_cnt+i);
|
||||
#endif
|
||||
bs->data_alloc->set(clean_it->second.location >> block_order, false);
|
||||
}
|
||||
@@ -399,6 +402,18 @@ resume_1:
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto ov: double_allocs)
|
||||
{
|
||||
auto dirty_it = bs->dirty_db.find(ov);
|
||||
if (dirty_it != bs->dirty_db.end() &&
|
||||
IS_BIG_WRITE(dirty_it->second.state) &&
|
||||
dirty_it->second.location == UINT64_MAX)
|
||||
{
|
||||
printf("Fatal error (bug): %lx:%lx v%lu big_write journal_entry was allocated over another object\n",
|
||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
bs->flusher->mark_trim_possible();
|
||||
bs->journal.dirty_start = bs->journal.next_free;
|
||||
printf(
|
||||
@@ -549,7 +564,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
unstab = unstab < ov.version ? ov.version : unstab;
|
||||
if (je->type == JE_SMALL_WRITE_INSTANT)
|
||||
{
|
||||
bs->mark_stable(ov);
|
||||
bs->mark_stable(ov, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -579,32 +594,10 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
// its data and metadata are already flushed.
|
||||
// We don't know if newer versions are flushed, but
|
||||
// the previous delete definitely is.
|
||||
// So we flush previous dirty entries, but retain the clean one.
|
||||
// So we forget previous dirty entries, but retain the clean one.
|
||||
// This feature is required for writes happening shortly
|
||||
// after deletes.
|
||||
auto dirty_end = dirty_it;
|
||||
dirty_end++;
|
||||
while (1)
|
||||
{
|
||||
if (dirty_it == bs->dirty_db.begin())
|
||||
{
|
||||
break;
|
||||
}
|
||||
dirty_it--;
|
||||
if (dirty_it->first.oid != je->big_write.oid)
|
||||
{
|
||||
dirty_it++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
auto clean_it = bs->clean_db.find(je->big_write.oid);
|
||||
bs->erase_dirty(
|
||||
dirty_it, dirty_end,
|
||||
clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX
|
||||
);
|
||||
// Remove it from the flusher's queue, too
|
||||
// Otherwise it may end up referring to a small unstable write after reading the rest of the journal
|
||||
bs->flusher->remove_flush(je->big_write.oid);
|
||||
erase_dirty_object(dirty_it);
|
||||
}
|
||||
}
|
||||
auto clean_it = bs->clean_db.find(je->big_write.oid);
|
||||
@@ -616,18 +609,33 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
.oid = je->big_write.oid,
|
||||
.version = je->big_write.version,
|
||||
};
|
||||
bs->dirty_db.emplace(ov, (dirty_entry){
|
||||
auto dirty_it = bs->dirty_db.emplace(ov, (dirty_entry){
|
||||
.state = (BS_ST_BIG_WRITE | BS_ST_SYNCED),
|
||||
.flags = 0,
|
||||
.location = je->big_write.location,
|
||||
.offset = je->big_write.offset,
|
||||
.len = je->big_write.len,
|
||||
.journal_sector = proc_pos,
|
||||
});
|
||||
}).first;
|
||||
if (bs->data_alloc->get(je->big_write.location >> bs->block_order))
|
||||
{
|
||||
// This is probably a big_write that's already flushed and freed, but it may
|
||||
// also indicate a bug. So we remember such entries and recheck them afterwards.
|
||||
// If it's not a bug they won't be present after reading the whole journal.
|
||||
dirty_it->second.location = UINT64_MAX;
|
||||
double_allocs.push_back(ov);
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Allocate block %lu\n", je->big_write.location >> bs->block_order);
|
||||
printf(
|
||||
"Allocate block (journal) %lu: %lx:%lx v%lu\n",
|
||||
je->big_write.location >> bs->block_order,
|
||||
ov.oid.inode, ov.oid.stripe, ov.version
|
||||
);
|
||||
#endif
|
||||
bs->data_alloc->set(je->big_write.location >> bs->block_order, true);
|
||||
bs->data_alloc->set(je->big_write.location >> bs->block_order, true);
|
||||
}
|
||||
bs->journal.used_sectors[proc_pos]++;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
@@ -639,7 +647,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
unstab = unstab < ov.version ? ov.version : unstab;
|
||||
if (je->type == JE_BIG_WRITE_INSTANT)
|
||||
{
|
||||
bs->mark_stable(ov);
|
||||
bs->mark_stable(ov, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -653,7 +661,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
.oid = je->stable.oid,
|
||||
.version = je->stable.version,
|
||||
};
|
||||
bs->mark_stable(ov);
|
||||
bs->mark_stable(ov, true);
|
||||
}
|
||||
else if (je->type == JE_ROLLBACK)
|
||||
{
|
||||
@@ -672,9 +680,26 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("je_delete oid=%lx:%lx ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
|
||||
#endif
|
||||
bool dirty_exists = false;
|
||||
auto dirty_it = bs->dirty_db.upper_bound((obj_ver_id){
|
||||
.oid = je->del.oid,
|
||||
.version = UINT64_MAX,
|
||||
});
|
||||
if (dirty_it != bs->dirty_db.begin())
|
||||
{
|
||||
dirty_it--;
|
||||
dirty_exists = dirty_it->first.oid == je->del.oid;
|
||||
}
|
||||
auto clean_it = bs->clean_db.find(je->del.oid);
|
||||
if (clean_it == bs->clean_db.end() ||
|
||||
clean_it->second.version < je->del.version)
|
||||
bool clean_exists = (clean_it != bs->clean_db.end() &&
|
||||
clean_it->second.version < je->del.version);
|
||||
if (!clean_exists && dirty_exists)
|
||||
{
|
||||
// Clean entry doesn't exist. This means that the delete is already flushed.
|
||||
// So we must not flush this object anymore.
|
||||
erase_dirty_object(dirty_it);
|
||||
}
|
||||
else if (clean_exists || dirty_exists)
|
||||
{
|
||||
// oid, version
|
||||
obj_ver_id ov = {
|
||||
@@ -692,8 +717,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
bs->journal.used_sectors[proc_pos]++;
|
||||
// Deletions are treated as immediately stable, because
|
||||
// "2-phase commit" (write->stabilize) isn't sufficient for them anyway
|
||||
bs->mark_stable(ov);
|
||||
bs->mark_stable(ov, true);
|
||||
}
|
||||
// Ignore delete if neither preceding dirty entries nor the clean one are present
|
||||
}
|
||||
started = true;
|
||||
pos += je->size;
|
||||
@@ -704,3 +730,30 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
bs->journal.next_free = next_free;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Erase the dirty_db entries of a single object while replaying the journal.
// `dirty_it` must point at an existing dirty_db entry. The erased range starts at the
// first dirty_db entry with the same oid and ends right after the entry `dirty_it`
// points at (the given entry is included). The object is then removed from the
// flusher's queue.
void blockstore_init_journal::erase_dirty_object(blockstore_dirty_db_t::iterator dirty_it)
|
||||
{
|
||||
auto oid = dirty_it->first.oid;
|
||||
// dirty_end = one past the given entry, i.e. the exclusive end of the range handed to erase_dirty()
auto dirty_end = dirty_it;
|
||||
dirty_end++;
|
||||
// Walk backwards until the beginning of dirty_db or until an entry of another object is found
while (1)
|
||||
{
|
||||
if (dirty_it == bs->dirty_db.begin())
|
||||
{
|
||||
break;
|
||||
}
|
||||
dirty_it--;
|
||||
if (dirty_it->first.oid != oid)
|
||||
{
|
||||
// Stepped onto a different object: go back to the first entry of our object
dirty_it++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Pass the location of the clean version (or UINT64_MAX when there is no clean entry) to erase_dirty()
auto clean_it = bs->clean_db.find(oid);
|
||||
uint64_t clean_loc = clean_it != bs->clean_db.end()
|
||||
? clean_it->second.location : UINT64_MAX;
|
||||
bs->erase_dirty(dirty_it, dirty_end, clean_loc);
|
||||
// Remove it from the flusher's queue, too
|
||||
// Otherwise it may end up referring to a small unstable write after reading the rest of the journal
|
||||
bs->flusher->remove_flush(oid);
|
||||
}
|
||||
|
@@ -36,6 +36,7 @@ class blockstore_init_journal
|
||||
bool started = false;
|
||||
uint64_t next_free;
|
||||
std::vector<bs_init_journal_done> done;
|
||||
std::vector<obj_ver_id> double_allocs;
|
||||
uint64_t journal_pos = 0;
|
||||
uint64_t continue_pos = 0;
|
||||
void *init_write_buf = NULL;
|
||||
@@ -48,6 +49,7 @@ class blockstore_init_journal
|
||||
std::function<void(ring_data_t*)> simple_callback;
|
||||
int handle_journal_part(void *buf, uint64_t done_pos, uint64_t len);
|
||||
void handle_event(ring_data_t *data);
|
||||
void erase_dirty_object(blockstore_dirty_db_t::iterator dirty_it);
|
||||
public:
|
||||
blockstore_init_journal(blockstore_impl_t* bs);
|
||||
int loop();
|
||||
|
@@ -69,7 +69,10 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
|
||||
journal_block_size = strtoull(config["journal_block_size"].c_str(), NULL, 10);
|
||||
meta_block_size = strtoull(config["meta_block_size"].c_str(), NULL, 10);
|
||||
bitmap_granularity = strtoull(config["bitmap_granularity"].c_str(), NULL, 10);
|
||||
flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
|
||||
max_flusher_count = strtoull(config["max_flusher_count"].c_str(), NULL, 10);
|
||||
if (!max_flusher_count)
|
||||
max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
|
||||
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
|
||||
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
|
||||
// Validate
|
||||
if (!block_size)
|
||||
@@ -80,9 +83,13 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
|
||||
{
|
||||
throw std::runtime_error("Bad block size");
|
||||
}
|
||||
if (!flusher_count)
|
||||
if (!max_flusher_count)
|
||||
{
|
||||
flusher_count = 32;
|
||||
max_flusher_count = 256;
|
||||
}
|
||||
if (!min_flusher_count)
|
||||
{
|
||||
min_flusher_count = 1;
|
||||
}
|
||||
if (!max_write_iodepth)
|
||||
{
|
||||
|
@@ -112,7 +112,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||
read_op->version = 0;
|
||||
read_op->retval = read_op->len;
|
||||
FINISH_OP(read_op);
|
||||
return 1;
|
||||
return 2;
|
||||
}
|
||||
uint64_t fulfilled = 0;
|
||||
PRIV(read_op)->pending_ops = 0;
|
||||
@@ -191,8 +191,8 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||
if (bmp_end > bmp_start)
|
||||
{
|
||||
// fill with zeroes
|
||||
fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
|
||||
bmp_end * bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0);
|
||||
assert(fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
|
||||
bmp_end * bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0));
|
||||
}
|
||||
bmp_start = bmp_end;
|
||||
while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
|
||||
@@ -218,7 +218,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||
else if (fulfilled < read_op->len)
|
||||
{
|
||||
// fill remaining parts with zeroes
|
||||
fulfill_read(read_op, fulfilled, 0, block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0);
|
||||
assert(fulfill_read(read_op, fulfilled, 0, block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0));
|
||||
}
|
||||
assert(fulfilled == read_op->len);
|
||||
read_op->version = result_version;
|
||||
@@ -232,10 +232,10 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||
}
|
||||
read_op->retval = read_op->len;
|
||||
FINISH_OP(read_op);
|
||||
return 1;
|
||||
return 2;
|
||||
}
|
||||
read_op->retval = 0;
|
||||
return 1;
|
||||
return 2;
|
||||
}
|
||||
|
||||
void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op)
|
||||
|
@@ -50,7 +50,7 @@ skip_ov:
|
||||
{
|
||||
op->retval = -EBUSY;
|
||||
FINISH_OP(op);
|
||||
return 1;
|
||||
return 2;
|
||||
}
|
||||
if (dirty_it == dirty_db.begin())
|
||||
{
|
||||
@@ -66,7 +66,7 @@ skip_ov:
|
||||
// Already rolled back
|
||||
op->retval = 0;
|
||||
FINISH_OP(op);
|
||||
return 1;
|
||||
return 2;
|
||||
}
|
||||
// Check journal space
|
||||
blockstore_journal_check_t space_check(this);
|
||||
@@ -126,11 +126,8 @@ resume_2:
|
||||
resume_3:
|
||||
if (!disable_journal_fsync)
|
||||
{
|
||||
io_uring_sqe *sqe = get_sqe();
|
||||
if (!sqe)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
io_uring_sqe *sqe;
|
||||
BS_SUBMIT_GET_SQE_DECL(sqe);
|
||||
ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
||||
my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
|
||||
data->iov = { 0 };
|
||||
@@ -151,7 +148,7 @@ resume_5:
|
||||
// Acknowledge op
|
||||
op->retval = 0;
|
||||
FINISH_OP(op);
|
||||
return 1;
|
||||
return 2;
|
||||
}
|
||||
|
||||
void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
|
||||
@@ -166,10 +163,7 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
|
||||
auto rm_start = it;
|
||||
auto rm_end = it;
|
||||
it--;
|
||||
while (it->first.oid == ov.oid &&
|
||||
it->first.version > ov.version &&
|
||||
!IS_IN_FLIGHT(it->second.state) &&
|
||||
!IS_STABLE(it->second.state))
|
||||
while (1)
|
||||
{
|
||||
if (it->first.oid != ov.oid)
|
||||
break;
|
||||
@@ -179,7 +173,7 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
|
||||
max_unstable = it->first.version;
|
||||
break;
|
||||
}
|
||||
else if (IS_STABLE(it->second.state))
|
||||
else if (IS_IN_FLIGHT(it->second.state) || IS_STABLE(it->second.state))
|
||||
break;
|
||||
// Remove entry
|
||||
rm_start = it;
|
||||
@@ -190,14 +184,14 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
|
||||
if (rm_start != rm_end)
|
||||
{
|
||||
erase_dirty(rm_start, rm_end, UINT64_MAX);
|
||||
}
|
||||
auto unstab_it = unstable_writes.find(ov.oid);
|
||||
if (unstab_it != unstable_writes.end())
|
||||
{
|
||||
if (max_unstable == 0)
|
||||
unstable_writes.erase(unstab_it);
|
||||
else
|
||||
unstab_it->second = max_unstable;
|
||||
auto unstab_it = unstable_writes.find(ov.oid);
|
||||
if (unstab_it != unstable_writes.end())
|
||||
{
|
||||
if (max_unstable == 0)
|
||||
unstable_writes.erase(unstab_it);
|
||||
else
|
||||
unstab_it->second = max_unstable;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -216,10 +210,7 @@ void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t
|
||||
if (PRIV(op)->pending_ops == 0)
|
||||
{
|
||||
PRIV(op)->op_state++;
|
||||
if (!continue_rollback(op))
|
||||
{
|
||||
submit_queue.push_front(op);
|
||||
}
|
||||
ringloop->wakeup();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -257,10 +248,12 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
|
||||
}
|
||||
while (1)
|
||||
{
|
||||
if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc)
|
||||
if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc &&
|
||||
dirty_it->second.location != UINT64_MAX)
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Free block %lu\n", dirty_it->second.location >> block_order);
|
||||
printf("Free block %lu from %lx:%lx v%lu\n", dirty_it->second.location >> block_order,
|
||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
|
||||
#endif
|
||||
data_alloc->set(dirty_it->second.location >> block_order, false);
|
||||
}
|
||||
|
@@ -60,7 +60,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
||||
// No such object version
|
||||
op->retval = -ENOENT;
|
||||
FINISH_OP(op);
|
||||
return 1;
|
||||
return 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -77,7 +77,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
||||
// Object not synced yet. Caller must sync it first
|
||||
op->retval = -EBUSY;
|
||||
FINISH_OP(op);
|
||||
return 1;
|
||||
return 2;
|
||||
}
|
||||
else if (!IS_STABLE(dirty_it->second.state))
|
||||
{
|
||||
@@ -89,7 +89,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
||||
// Already stable
|
||||
op->retval = 0;
|
||||
FINISH_OP(op);
|
||||
return 1;
|
||||
return 2;
|
||||
}
|
||||
// Check journal space
|
||||
blockstore_journal_check_t space_check(this);
|
||||
@@ -150,11 +150,8 @@ resume_2:
|
||||
resume_3:
|
||||
if (!disable_journal_fsync)
|
||||
{
|
||||
io_uring_sqe *sqe = get_sqe();
|
||||
if (!sqe)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
io_uring_sqe *sqe;
|
||||
BS_SUBMIT_GET_SQE_DECL(sqe);
|
||||
ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
||||
my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
|
||||
data->iov = { 0 };
|
||||
@@ -171,30 +168,50 @@ resume_5:
|
||||
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
||||
{
|
||||
// Mark all dirty_db entries up to op->version as stable
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Stabilize %lx:%lx v%lu\n", v->oid.inode, v->oid.stripe, v->version);
|
||||
#endif
|
||||
mark_stable(*v);
|
||||
}
|
||||
// Acknowledge op
|
||||
op->retval = 0;
|
||||
FINISH_OP(op);
|
||||
return 1;
|
||||
return 2;
|
||||
}
|
||||
|
||||
void blockstore_impl_t::mark_stable(const obj_ver_id & v)
|
||||
void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
|
||||
{
|
||||
auto dirty_it = dirty_db.find(v);
|
||||
if (dirty_it != dirty_db.end())
|
||||
{
|
||||
while (1)
|
||||
{
|
||||
bool was_stable = IS_STABLE(dirty_it->second.state);
|
||||
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_SYNCED)
|
||||
{
|
||||
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_STABLE;
|
||||
}
|
||||
else if (IS_STABLE(dirty_it->second.state))
|
||||
if (forget_dirty && (IS_BIG_WRITE(dirty_it->second.state) ||
|
||||
IS_DELETE(dirty_it->second.state)))
|
||||
{
|
||||
// Big write overrides all previous dirty entries
|
||||
auto erase_end = dirty_it;
|
||||
while (dirty_it != dirty_db.begin())
|
||||
{
|
||||
dirty_it--;
|
||||
if (dirty_it->first.oid != v.oid)
|
||||
{
|
||||
dirty_it++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
auto clean_it = clean_db.find(v.oid);
|
||||
uint64_t clean_loc = clean_it != clean_db.end()
|
||||
? clean_it->second.location : UINT64_MAX;
|
||||
erase_dirty(dirty_it, erase_end, clean_loc);
|
||||
break;
|
||||
}
|
||||
if (dirty_it == dirty_db.begin())
|
||||
if (was_stable || dirty_it == dirty_db.begin())
|
||||
{
|
||||
break;
|
||||
}
|
||||
@@ -228,9 +245,6 @@ void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *
|
||||
if (PRIV(op)->pending_ops == 0)
|
||||
{
|
||||
PRIV(op)->op_state++;
|
||||
if (!continue_stable(op))
|
||||
{
|
||||
submit_queue.push_front(op);
|
||||
}
|
||||
ringloop->wakeup();
|
||||
}
|
||||
}
|
||||
|
@@ -12,11 +12,19 @@
|
||||
#define SYNC_JOURNAL_SYNC_SENT 7
|
||||
#define SYNC_DONE 8
|
||||
|
||||
int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
|
||||
int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_progress_sync)
|
||||
{
|
||||
if (immediate_commit == IMMEDIATE_ALL)
|
||||
{
|
||||
// We can return immediately because sync is only dequeued after all previous writes
|
||||
op->retval = 0;
|
||||
FINISH_OP(op);
|
||||
return 2;
|
||||
}
|
||||
if (PRIV(op)->op_state == 0)
|
||||
{
|
||||
stop_sync_submitted = false;
|
||||
unsynced_big_write_count -= unsynced_big_writes.size();
|
||||
PRIV(op)->sync_big_writes.swap(unsynced_big_writes);
|
||||
PRIV(op)->sync_small_writes.swap(unsynced_small_writes);
|
||||
PRIV(op)->sync_small_checked = 0;
|
||||
@@ -29,34 +37,15 @@ int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
|
||||
PRIV(op)->op_state = SYNC_HAS_SMALL;
|
||||
else
|
||||
PRIV(op)->op_state = SYNC_DONE;
|
||||
// Always add sync to in_progress_syncs because we clear unsynced_big_writes and unsynced_small_writes
|
||||
PRIV(op)->prev_sync_count = in_progress_syncs.size();
|
||||
PRIV(op)->in_progress_ptr = in_progress_syncs.insert(in_progress_syncs.end(), op);
|
||||
}
|
||||
continue_sync(op);
|
||||
// Always dequeue because we always add syncs to in_progress_syncs
|
||||
return 1;
|
||||
}
|
||||
|
||||
int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
||||
{
|
||||
auto cb = [this, op](ring_data_t *data) { handle_sync_event(data, op); };
|
||||
if (PRIV(op)->op_state == SYNC_HAS_SMALL)
|
||||
{
|
||||
// No big writes, just fsync the journal
|
||||
for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
|
||||
{
|
||||
if (IS_IN_FLIGHT(dirty_db[PRIV(op)->sync_small_writes[PRIV(op)->sync_small_checked]].state))
|
||||
{
|
||||
// Wait for small inflight writes to complete
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
if (journal.sector_info[journal.cur_sector].dirty)
|
||||
{
|
||||
// Write out the last journal sector if it happens to be dirty
|
||||
BS_SUBMIT_GET_ONLY_SQE(sqe);
|
||||
prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
|
||||
prepare_journal_sector_write(journal, journal.cur_sector, sqe, [this, op](ring_data_t *data) { handle_sync_event(data, op); });
|
||||
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
|
||||
PRIV(op)->pending_ops = 1;
|
||||
PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
|
||||
@@ -69,21 +58,13 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
||||
}
|
||||
if (PRIV(op)->op_state == SYNC_HAS_BIG)
|
||||
{
|
||||
for (; PRIV(op)->sync_big_checked < PRIV(op)->sync_big_writes.size(); PRIV(op)->sync_big_checked++)
|
||||
{
|
||||
if (IS_IN_FLIGHT(dirty_db[PRIV(op)->sync_big_writes[PRIV(op)->sync_big_checked]].state))
|
||||
{
|
||||
// Wait for big inflight writes to complete
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
// 1st step: fsync data
|
||||
if (!disable_data_fsync)
|
||||
{
|
||||
BS_SUBMIT_GET_SQE(sqe, data);
|
||||
my_uring_prep_fsync(sqe, data_fd, IORING_FSYNC_DATASYNC);
|
||||
data->iov = { 0 };
|
||||
data->callback = cb;
|
||||
data->callback = [this, op](ring_data_t *data) { handle_sync_event(data, op); };
|
||||
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
|
||||
PRIV(op)->pending_ops = 1;
|
||||
PRIV(op)->op_state = SYNC_DATA_SYNC_SENT;
|
||||
@@ -96,14 +77,6 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
||||
}
|
||||
if (PRIV(op)->op_state == SYNC_DATA_SYNC_DONE)
|
||||
{
|
||||
for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
|
||||
{
|
||||
if (IS_IN_FLIGHT(dirty_db[PRIV(op)->sync_small_writes[PRIV(op)->sync_small_checked]].state))
|
||||
{
|
||||
// Wait for small inflight writes to complete
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
// 2nd step: Data device is synced, prepare & write journal entries
|
||||
// Check space in the journal and journal memory buffers
|
||||
blockstore_journal_check_t space_check(this);
|
||||
@@ -127,7 +100,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
||||
{
|
||||
if (cur_sector == -1)
|
||||
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
|
||||
prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
|
||||
prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], [this, op](ring_data_t *data) { handle_sync_event(data, op); });
|
||||
cur_sector = journal.cur_sector;
|
||||
}
|
||||
journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
|
||||
@@ -152,7 +125,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
||||
journal.crc32_last = je->crc32;
|
||||
it++;
|
||||
}
|
||||
prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
|
||||
prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], [this, op](ring_data_t *data) { handle_sync_event(data, op); });
|
||||
assert(s == space_check.sectors_to_write);
|
||||
if (cur_sector == -1)
|
||||
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
|
||||
@@ -168,7 +141,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
||||
BS_SUBMIT_GET_SQE(sqe, data);
|
||||
my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
|
||||
data->iov = { 0 };
|
||||
data->callback = cb;
|
||||
data->callback = [this, op](ring_data_t *data) { handle_sync_event(data, op); };
|
||||
PRIV(op)->pending_ops = 1;
|
||||
PRIV(op)->op_state = SYNC_JOURNAL_SYNC_SENT;
|
||||
return 1;
|
||||
@@ -178,9 +151,10 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
||||
PRIV(op)->op_state = SYNC_DONE;
|
||||
}
|
||||
}
|
||||
if (PRIV(op)->op_state == SYNC_DONE)
|
||||
if (PRIV(op)->op_state == SYNC_DONE && !queue_has_in_progress_sync)
|
||||
{
|
||||
return ack_sync(op);
|
||||
ack_sync(op);
|
||||
return 2;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
@@ -212,42 +186,16 @@ void blockstore_impl_t::handle_sync_event(ring_data_t *data, blockstore_op_t *op
|
||||
else if (PRIV(op)->op_state == SYNC_JOURNAL_SYNC_SENT)
|
||||
{
|
||||
PRIV(op)->op_state = SYNC_DONE;
|
||||
ack_sync(op);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("BUG: unexpected sync op state");
|
||||
}
|
||||
ringloop->wakeup();
|
||||
}
|
||||
}
|
||||
|
||||
int blockstore_impl_t::ack_sync(blockstore_op_t *op)
|
||||
{
|
||||
if (PRIV(op)->op_state == SYNC_DONE && PRIV(op)->prev_sync_count == 0)
|
||||
{
|
||||
// Remove dependency of subsequent syncs
|
||||
auto it = PRIV(op)->in_progress_ptr;
|
||||
int done_syncs = 1;
|
||||
++it;
|
||||
// Acknowledge sync
|
||||
ack_one_sync(op);
|
||||
while (it != in_progress_syncs.end())
|
||||
{
|
||||
auto & next_sync = *it++;
|
||||
PRIV(next_sync)->prev_sync_count -= done_syncs;
|
||||
if (PRIV(next_sync)->prev_sync_count == 0 && PRIV(next_sync)->op_state == SYNC_DONE)
|
||||
{
|
||||
done_syncs++;
|
||||
// Acknowledge next_sync
|
||||
ack_one_sync(next_sync);
|
||||
}
|
||||
}
|
||||
return 2;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void blockstore_impl_t::ack_one_sync(blockstore_op_t *op)
|
||||
void blockstore_impl_t::ack_sync(blockstore_op_t *op)
|
||||
{
|
||||
// Handle states
|
||||
for (auto it = PRIV(op)->sync_big_writes.begin(); it != PRIV(op)->sync_big_writes.end(); it++)
|
||||
@@ -295,7 +243,6 @@ void blockstore_impl_t::ack_one_sync(blockstore_op_t *op)
|
||||
}
|
||||
}
|
||||
}
|
||||
in_progress_syncs.erase(PRIV(op)->in_progress_ptr);
|
||||
op->retval = 0;
|
||||
FINISH_OP(op);
|
||||
}
|
||||
|
@@ -124,6 +124,29 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||
return true;
|
||||
}
|
||||
|
||||
// Fail a write operation and cancel all queued writes to the same object that follow it.
// op       - the operation being failed; it is finished here with `retval`
// dirty_it - dirty_db position to start erasing from; entries are erased forward
//            while they belong to op->oid
// retval   - result code stored into `op` and into every cancelled operation
void blockstore_impl_t::cancel_all_writes(blockstore_op_t *op, blockstore_dirty_db_t::iterator dirty_it, int retval)
|
||||
{
|
||||
// Drop dirty_db entries of this object starting from dirty_it
while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid)
|
||||
{
|
||||
// Post-increment: the iterator is advanced before the erased element is invalidated
dirty_db.erase(dirty_it++);
|
||||
}
|
||||
// `found` becomes true once `op` itself is met in submit_queue, so only operations
// positioned after it are marked
bool found = false;
|
||||
for (auto other_op: submit_queue)
|
||||
{
|
||||
if (!found && other_op == op)
|
||||
found = true;
|
||||
else if (found && other_op->oid == op->oid &&
|
||||
(other_op->opcode == BS_OP_WRITE || other_op->opcode == BS_OP_WRITE_STABLE))
|
||||
{
|
||||
// Mark operations to cancel them
|
||||
// (dequeue_write() checks real_version == UINT64_MAX and finishes such operations)
PRIV(other_op)->real_version = UINT64_MAX;
|
||||
other_op->retval = retval;
|
||||
}
|
||||
}
|
||||
op->retval = retval;
|
||||
FINISH_OP(op);
|
||||
}
|
||||
|
||||
// First step of the write algorithm: dequeue operation and submit initial write(s)
|
||||
int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
{
|
||||
@@ -143,6 +166,12 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
}
|
||||
if (PRIV(op)->real_version != 0)
|
||||
{
|
||||
if (PRIV(op)->real_version == UINT64_MAX)
|
||||
{
|
||||
// This is the flag value used to cancel operations
|
||||
FINISH_OP(op);
|
||||
return 2;
|
||||
}
|
||||
// Restore original low version number for unblocked operations
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Restoring %lx:%lx version: v%lu -> v%lu\n", op->oid.inode, op->oid.stripe, op->version, PRIV(op)->real_version);
|
||||
@@ -152,11 +181,9 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
if (prev_it->first.oid == op->oid && prev_it->first.version >= PRIV(op)->real_version)
|
||||
{
|
||||
// Original version is still invalid
|
||||
// FIXME Oops. Successive small writes will currently break in an unexpected way. Fix it
|
||||
dirty_db.erase(dirty_it);
|
||||
op->retval = -EEXIST;
|
||||
FINISH_OP(op);
|
||||
return 1;
|
||||
// All subsequent writes to the same object must be canceled too
|
||||
cancel_all_writes(op, dirty_it, -EEXIST);
|
||||
return 2;
|
||||
}
|
||||
op->version = PRIV(op)->real_version;
|
||||
PRIV(op)->real_version = 0;
|
||||
@@ -174,7 +201,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
if ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
|
||||
{
|
||||
blockstore_journal_check_t space_check(this);
|
||||
if (!space_check.check_available(op, unsynced_big_writes.size() + 1, sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION))
|
||||
if (!space_check.check_available(op, unsynced_big_write_count + 1, sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
@@ -189,18 +216,18 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
PRIV(op)->wait_for = WAIT_FREE;
|
||||
return 0;
|
||||
}
|
||||
// FIXME Oops. Successive small writes will currently break in an unexpected way. Fix it
|
||||
dirty_db.erase(dirty_it);
|
||||
op->retval = -ENOSPC;
|
||||
FINISH_OP(op);
|
||||
return 1;
|
||||
cancel_all_writes(op, dirty_it, -ENOSPC);
|
||||
return 2;
|
||||
}
|
||||
write_iodepth++;
|
||||
BS_SUBMIT_GET_SQE(sqe, data);
|
||||
dirty_it->second.location = loc << block_order;
|
||||
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Allocate block %lu\n", loc);
|
||||
printf(
|
||||
"Allocate block %lu for %lx:%lx v%lu\n",
|
||||
loc, op->oid.inode, op->oid.stripe, op->version
|
||||
);
|
||||
#endif
|
||||
data_alloc->set(loc, true);
|
||||
uint64_t stripe_offset = (op->offset % bitmap_granularity);
|
||||
@@ -226,11 +253,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
|
||||
if (immediate_commit != IMMEDIATE_ALL)
|
||||
{
|
||||
// Remember big write as unsynced
|
||||
unsynced_big_writes.push_back((obj_ver_id){
|
||||
.oid = op->oid,
|
||||
.version = op->version,
|
||||
});
|
||||
// Increase the counter, but don't save into unsynced_writes yet (can't sync until the write is finished)
|
||||
unsynced_big_write_count++;
|
||||
PRIV(op)->op_state = 3;
|
||||
}
|
||||
else
|
||||
@@ -243,7 +267,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
// Small (journaled) write
|
||||
// First check if the journal has sufficient space
|
||||
blockstore_journal_check_t space_check(this);
|
||||
if (unsynced_big_writes.size() && !space_check.check_available(op, unsynced_big_writes.size(), sizeof(journal_entry_big_write), 0)
|
||||
if (unsynced_big_write_count && !space_check.check_available(op, unsynced_big_write_count, sizeof(journal_entry_big_write), 0)
|
||||
|| !space_check.check_available(op, 1, sizeof(journal_entry_small_write), op->len + JOURNAL_STABILIZE_RESERVATION))
|
||||
{
|
||||
return 0;
|
||||
@@ -335,18 +359,10 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
{
|
||||
journal.next_free = journal_block_size;
|
||||
}
|
||||
if (immediate_commit == IMMEDIATE_NONE)
|
||||
{
|
||||
// Remember small write as unsynced
|
||||
unsynced_small_writes.push_back((obj_ver_id){
|
||||
.oid = op->oid,
|
||||
.version = op->version,
|
||||
});
|
||||
}
|
||||
if (!PRIV(op)->pending_ops)
|
||||
{
|
||||
PRIV(op)->op_state = 4;
|
||||
continue_write(op);
|
||||
return continue_write(op);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -360,24 +376,24 @@ int blockstore_impl_t::continue_write(blockstore_op_t *op)
|
||||
{
|
||||
io_uring_sqe *sqe = NULL;
|
||||
journal_entry_big_write *je;
|
||||
int op_state = PRIV(op)->op_state;
|
||||
if (op_state != 2 && op_state != 4)
|
||||
{
|
||||
// In progress
|
||||
return 1;
|
||||
}
|
||||
auto dirty_it = dirty_db.find((obj_ver_id){
|
||||
.oid = op->oid,
|
||||
.version = op->version,
|
||||
});
|
||||
assert(dirty_it != dirty_db.end());
|
||||
if (PRIV(op)->op_state == 2)
|
||||
if (op_state == 2)
|
||||
goto resume_2;
|
||||
else if (PRIV(op)->op_state == 4)
|
||||
else if (op_state == 4)
|
||||
goto resume_4;
|
||||
else
|
||||
return 1;
|
||||
resume_2:
|
||||
// Only for the immediate_commit mode: prepare and submit big_write journal entry
|
||||
sqe = get_sqe();
|
||||
if (!sqe)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
BS_SUBMIT_GET_SQE_DECL(sqe);
|
||||
je = (journal_entry_big_write*)prefill_single_journal_entry(
|
||||
journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
|
||||
sizeof(journal_entry_big_write)
|
||||
@@ -407,7 +423,7 @@ resume_2:
|
||||
resume_4:
|
||||
// Switch object state
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Ack write %lx:%lx v%lu = state %x\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
|
||||
printf("Ack write %lx:%lx v%lu = state 0x%x\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
|
||||
#endif
|
||||
bool imm = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
|
||||
? (immediate_commit == IMMEDIATE_ALL)
|
||||
@@ -421,11 +437,31 @@ resume_4:
|
||||
| (imm ? BS_ST_SYNCED : BS_ST_WRITTEN);
|
||||
if (imm && ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT)))
|
||||
{
|
||||
// Deletions are treated as immediately stable
|
||||
// Deletions and 'instant' operations are treated as immediately stable
|
||||
mark_stable(dirty_it->first);
|
||||
}
|
||||
if (immediate_commit == IMMEDIATE_ALL)
|
||||
if (!imm)
|
||||
{
|
||||
if ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
|
||||
{
|
||||
// Remember big write as unsynced
|
||||
unsynced_big_writes.push_back((obj_ver_id){
|
||||
.oid = op->oid,
|
||||
.version = op->version,
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
// Remember small write as unsynced
|
||||
unsynced_small_writes.push_back((obj_ver_id){
|
||||
.oid = op->oid,
|
||||
.version = op->version,
|
||||
});
|
||||
}
|
||||
}
|
||||
if (imm && (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
|
||||
{
|
||||
// Unblock small writes
|
||||
dirty_it++;
|
||||
while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid)
|
||||
{
|
||||
@@ -440,7 +476,7 @@ resume_4:
|
||||
op->retval = op->len;
|
||||
write_iodepth--;
|
||||
FINISH_OP(op);
|
||||
return 1;
|
||||
return 2;
|
||||
}
|
||||
|
||||
void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *op)
|
||||
@@ -459,10 +495,7 @@ void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *o
|
||||
{
|
||||
release_journal_sectors(op);
|
||||
PRIV(op)->op_state++;
|
||||
if (!continue_write(op))
|
||||
{
|
||||
submit_queue.push_front(op);
|
||||
}
|
||||
ringloop->wakeup();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -500,6 +533,10 @@ void blockstore_impl_t::release_journal_sectors(blockstore_op_t *op)
|
||||
|
||||
int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
|
||||
{
|
||||
if (PRIV(op)->op_state)
|
||||
{
|
||||
return continue_write(op);
|
||||
}
|
||||
auto dirty_it = dirty_db.find((obj_ver_id){
|
||||
.oid = op->oid,
|
||||
.version = op->version,
|
||||
@@ -558,18 +595,10 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
|
||||
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
|
||||
PRIV(op)->pending_ops++;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Remember delete as unsynced
|
||||
unsynced_small_writes.push_back((obj_ver_id){
|
||||
.oid = op->oid,
|
||||
.version = op->version,
|
||||
});
|
||||
}
|
||||
if (!PRIV(op)->pending_ops)
|
||||
{
|
||||
PRIV(op)->op_state = 4;
|
||||
continue_write(op);
|
||||
return continue_write(op);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -10,7 +10,8 @@
|
||||
#define MAX_BLOCK_SIZE 128*1024*1024
|
||||
#define DEFAULT_DISK_ALIGNMENT 4096
|
||||
#define DEFAULT_BITMAP_GRANULARITY 4096
|
||||
#define DEFAULT_CLIENT_DIRTY_LIMIT 32*1024*1024
|
||||
#define DEFAULT_CLIENT_MAX_DIRTY_BYTES 32*1024*1024
|
||||
#define DEFAULT_CLIENT_MAX_DIRTY_OPS 1024
|
||||
|
||||
struct cluster_op_t;
|
||||
|
||||
@@ -22,8 +23,7 @@ struct cluster_op_part_t
|
||||
pg_num_t pg_num;
|
||||
osd_num_t osd_num;
|
||||
osd_op_buf_list_t iov;
|
||||
bool sent;
|
||||
bool done;
|
||||
unsigned flags;
|
||||
osd_op_t op;
|
||||
};
|
||||
|
||||
@@ -37,70 +37,77 @@ struct cluster_op_t
|
||||
osd_op_buf_list_t iov;
|
||||
std::function<void(cluster_op_t*)> callback;
|
||||
protected:
|
||||
int flags = 0;
|
||||
int state = 0;
|
||||
void *buf = NULL;
|
||||
cluster_op_t *orig_op = NULL;
|
||||
bool is_internal = false;
|
||||
bool needs_reslice = false;
|
||||
bool up_wait = false;
|
||||
int sent_count = 0, done_count = 0;
|
||||
int inflight_count = 0, done_count = 0;
|
||||
std::vector<cluster_op_part_t> parts;
|
||||
friend class cluster_client_t;
|
||||
};
|
||||
|
||||
struct cluster_buffer_t
|
||||
{
|
||||
void *buf;
|
||||
uint64_t len;
|
||||
int state;
|
||||
};
|
||||
|
||||
// FIXME: Split into public and private interfaces
|
||||
class cluster_client_t
|
||||
{
|
||||
timerfd_manager_t *tfd;
|
||||
ring_loop_t *ringloop;
|
||||
|
||||
uint64_t bs_block_size = 0;
|
||||
uint64_t bs_disk_alignment = 0;
|
||||
uint64_t bs_bitmap_granularity = 0;
|
||||
std::map<pool_id_t, uint64_t> pg_counts;
|
||||
bool immediate_commit = false;
|
||||
// FIXME: Implement inmemory_commit mode. Note that it requires to return overlapping reads from memory.
|
||||
uint64_t client_dirty_limit = 0;
|
||||
uint64_t client_max_dirty_bytes = 0;
|
||||
uint64_t client_max_dirty_ops = 0;
|
||||
int log_level;
|
||||
int up_wait_retry_interval = 500; // ms
|
||||
|
||||
uint64_t op_id = 1;
|
||||
ring_consumer_t consumer;
|
||||
// operations currently in progress
|
||||
std::set<cluster_op_t*> cur_ops;
|
||||
int retry_timeout_id = 0;
|
||||
// unsynced operations are copied in memory to allow replay when cluster isn't in the immediate_commit mode
|
||||
// unsynced_writes are replayed in any order (because only the SYNC operation guarantees ordering)
|
||||
std::vector<cluster_op_t*> unsynced_writes;
|
||||
std::vector<cluster_op_t*> syncing_writes;
|
||||
cluster_op_t* cur_sync = NULL;
|
||||
std::vector<cluster_op_t*> next_writes;
|
||||
uint64_t op_id = 1;
|
||||
std::vector<cluster_op_t*> offline_ops;
|
||||
uint64_t queued_bytes = 0;
|
||||
std::deque<cluster_op_t*> op_queue;
|
||||
std::map<object_id, cluster_buffer_t> dirty_buffers;
|
||||
std::set<osd_num_t> dirty_osds;
|
||||
uint64_t dirty_bytes = 0, dirty_ops = 0;
|
||||
|
||||
bool pgs_loaded = false;
|
||||
ring_consumer_t consumer;
|
||||
std::vector<std::function<void(void)>> on_ready_hooks;
|
||||
int continuing_ops = 0;
|
||||
|
||||
public:
|
||||
etcd_state_client_t st_cli;
|
||||
osd_messenger_t msgr;
|
||||
json11::Json config;
|
||||
|
||||
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
|
||||
~cluster_client_t();
|
||||
void execute(cluster_op_t *op);
|
||||
bool is_ready();
|
||||
void on_ready(std::function<void(void)> fn);
|
||||
void stop();
|
||||
|
||||
protected:
|
||||
static void copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers);
|
||||
void continue_ops(bool up_retry = false);
|
||||
protected:
|
||||
bool affects_osd(uint64_t inode, uint64_t offset, uint64_t len, osd_num_t osd);
|
||||
void flush_buffer(const object_id & oid, cluster_buffer_t & wr);
|
||||
void on_load_config_hook(json11::Json::object & config);
|
||||
void on_load_pgs_hook(bool success);
|
||||
void on_change_hook(json11::Json::object & changes);
|
||||
void on_change_osd_state_hook(uint64_t peer_osd);
|
||||
void continue_rw(cluster_op_t *op);
|
||||
int continue_rw(cluster_op_t *op);
|
||||
void slice_rw(cluster_op_t *op);
|
||||
bool try_send(cluster_op_t *op, cluster_op_part_t *part);
|
||||
void execute_sync(cluster_op_t *op);
|
||||
void continue_sync();
|
||||
void finish_sync();
|
||||
bool try_send(cluster_op_t *op, int i);
|
||||
int continue_sync(cluster_op_t *op);
|
||||
void send_sync(cluster_op_t *op, cluster_op_part_t *part);
|
||||
void handle_op_part(cluster_op_part_t *part);
|
||||
};
|
||||
|
@@ -4,9 +4,24 @@
|
||||
#include "osd_ops.h"
|
||||
#include "pg_states.h"
|
||||
#include "etcd_state_client.h"
|
||||
#ifndef __MOCK__
|
||||
#include "http_client.h"
|
||||
#include "base64.h"
|
||||
#endif
|
||||
|
||||
// Shut the state client down: flag the watcher as stopped and close the etcd watch websocket.
etcd_state_client_t::~etcd_state_client_t()
{
    // -1 is not positive, so the `etcd_watches_initialised > 0` branch of
    // start_etcd_watcher() (immediate reconnect) is not taken any more
    etcd_watches_initialised = -1;
#ifndef __MOCK__
    // The mock build has no HTTP/websocket client, so there is nothing to close there
    if (etcd_watch_ws != NULL)
    {
        etcd_watch_ws->close();
        etcd_watch_ws = NULL;
    }
#endif
}
|
||||
|
||||
#ifndef __MOCK__
|
||||
json_kv_t etcd_state_client_t::parse_etcd_kv(const json11::Json & kv_json)
|
||||
{
|
||||
json_kv_t kv;
|
||||
@@ -46,6 +61,23 @@ void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int t
|
||||
http_request_json(tfd, etcd_address, req, timeout, callback);
|
||||
}
|
||||
|
||||
void etcd_state_client_t::add_etcd_url(std::string addr)
|
||||
{
|
||||
if (addr.length() > 0)
|
||||
{
|
||||
if (strtolower(addr.substr(0, 7)) == "http://")
|
||||
addr = addr.substr(7);
|
||||
else if (strtolower(addr.substr(0, 8)) == "https://")
|
||||
{
|
||||
printf("HTTPS is unsupported for etcd. Either use plain HTTP or setup a local proxy for etcd interaction\n");
|
||||
exit(1);
|
||||
}
|
||||
if (addr.find('/') < 0)
|
||||
addr += "/v3";
|
||||
this->etcd_addresses.push_back(addr);
|
||||
}
|
||||
}
|
||||
|
||||
void etcd_state_client_t::parse_config(json11::Json & config)
|
||||
{
|
||||
this->etcd_addresses.clear();
|
||||
@@ -55,13 +87,7 @@ void etcd_state_client_t::parse_config(json11::Json & config)
|
||||
while (1)
|
||||
{
|
||||
int pos = ea.find(',');
|
||||
std::string addr = pos >= 0 ? ea.substr(0, pos) : ea;
|
||||
if (addr.length() > 0)
|
||||
{
|
||||
if (addr.find('/') < 0)
|
||||
addr += "/v3";
|
||||
this->etcd_addresses.push_back(addr);
|
||||
}
|
||||
add_etcd_url(pos >= 0 ? ea.substr(0, pos) : ea);
|
||||
if (pos >= 0)
|
||||
ea = ea.substr(pos+1);
|
||||
else
|
||||
@@ -72,13 +98,7 @@ void etcd_state_client_t::parse_config(json11::Json & config)
|
||||
{
|
||||
for (auto & ea: config["etcd_address"].array_items())
|
||||
{
|
||||
std::string addr = ea.string_value();
|
||||
if (addr != "")
|
||||
{
|
||||
if (addr.find('/') < 0)
|
||||
addr += "/v3";
|
||||
this->etcd_addresses.push_back(addr);
|
||||
}
|
||||
add_etcd_url(ea.string_value());
|
||||
}
|
||||
}
|
||||
this->etcd_prefix = config["etcd_prefix"].string_value();
|
||||
@@ -160,7 +180,7 @@ void etcd_state_client_t::start_etcd_watcher()
|
||||
start_etcd_watcher();
|
||||
});
|
||||
}
|
||||
else
|
||||
else if (etcd_watches_initialised > 0)
|
||||
{
|
||||
// Connection was live, retry immediately
|
||||
start_etcd_watcher();
|
||||
@@ -308,6 +328,26 @@ void etcd_state_client_t::load_pgs()
|
||||
start_etcd_watcher();
|
||||
});
|
||||
}
|
||||
#else
|
||||
void etcd_state_client_t::parse_config(json11::Json & config)
|
||||
{
|
||||
}
|
||||
|
||||
void etcd_state_client_t::load_global_config()
|
||||
{
|
||||
json11::Json::object global_config;
|
||||
on_load_config_hook(global_config);
|
||||
}
|
||||
|
||||
void etcd_state_client_t::load_pgs()
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
void etcd_state_client_t::parse_state(const json_kv_t & kv)
|
||||
{
|
||||
parse_state(kv.key, kv.value);
|
||||
}
|
||||
|
||||
void etcd_state_client_t::parse_state(const std::string & key, const json11::Json & value)
|
||||
{
|
||||
@@ -321,8 +361,10 @@ void etcd_state_client_t::parse_state(const std::string & key, const json11::Jso
|
||||
{
|
||||
pool_config_t pc;
|
||||
// ID
|
||||
pool_id_t pool_id = stoull_full(pool_item.first);
|
||||
if (!pool_id || pool_id >= POOL_ID_MAX)
|
||||
pool_id_t pool_id;
|
||||
char null_byte = 0;
|
||||
sscanf(pool_item.first.c_str(), "%u%c", &pool_id, &null_byte);
|
||||
if (!pool_id || pool_id >= POOL_ID_MAX || null_byte != 0)
|
||||
{
|
||||
printf("Pool ID %s is invalid (must be a number less than 0x%x), skipping pool\n", pool_item.first.c_str(), POOL_ID_MAX);
|
||||
continue;
|
||||
@@ -434,16 +476,19 @@ void etcd_state_client_t::parse_state(const std::string & key, const json11::Jso
|
||||
}
|
||||
for (auto & pool_item: value["items"].object_items())
|
||||
{
|
||||
pool_id_t pool_id = stoull_full(pool_item.first);
|
||||
if (!pool_id || pool_id >= POOL_ID_MAX)
|
||||
pool_id_t pool_id;
|
||||
char null_byte = 0;
|
||||
sscanf(pool_item.first.c_str(), "%u%c", &pool_id, &null_byte);
|
||||
if (!pool_id || pool_id >= POOL_ID_MAX || null_byte != 0)
|
||||
{
|
||||
printf("Pool ID %s is invalid in PG configuration (must be a number less than 0x%x), skipping pool\n", pool_item.first.c_str(), POOL_ID_MAX);
|
||||
continue;
|
||||
}
|
||||
for (auto & pg_item: pool_item.second.object_items())
|
||||
{
|
||||
pg_num_t pg_num = stoull_full(pg_item.first);
|
||||
if (!pg_num)
|
||||
pg_num_t pg_num = 0;
|
||||
sscanf(pg_item.first.c_str(), "%u%c", &pg_num, &null_byte);
|
||||
if (!pg_num || null_byte != 0)
|
||||
{
|
||||
printf("Bad key in pool %u PG configuration: %s (must be a number), skipped\n", pool_id, pg_item.first.c_str());
|
||||
continue;
|
||||
|
@@ -3,8 +3,8 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "json11/json11.hpp"
|
||||
#include "osd_id.h"
|
||||
#include "http_client.h"
|
||||
#include "timerfd_manager.h"
|
||||
|
||||
#define ETCD_CONFIG_WATCH_ID 1
|
||||
@@ -52,8 +52,15 @@ struct pool_config_t
|
||||
std::map<pg_num_t, pg_config_t> pg_config;
|
||||
};
|
||||
|
||||
struct websocket_t;
|
||||
|
||||
struct etcd_state_client_t
|
||||
{
|
||||
protected:
|
||||
websocket_t *etcd_watch_ws = NULL;
|
||||
uint64_t bs_block_size = DEFAULT_BLOCK_SIZE;
|
||||
void add_etcd_url(std::string);
|
||||
public:
|
||||
std::vector<std::string> etcd_addresses;
|
||||
std::string etcd_prefix;
|
||||
int log_level = 0;
|
||||
@@ -61,8 +68,6 @@ struct etcd_state_client_t
|
||||
|
||||
int etcd_watches_initialised = 0;
|
||||
uint64_t etcd_watch_revision = 0;
|
||||
websocket_t *etcd_watch_ws = NULL;
|
||||
uint64_t bs_block_size = 0;
|
||||
std::map<pool_id_t, pool_config_t> pool_config;
|
||||
std::map<osd_num_t, json11::Json> peer_states;
|
||||
|
||||
@@ -79,6 +84,8 @@ struct etcd_state_client_t
|
||||
void start_etcd_watcher();
|
||||
void load_global_config();
|
||||
void load_pgs();
|
||||
void parse_state(const json_kv_t & kv);
|
||||
void parse_state(const std::string & key, const json11::Json & value);
|
||||
void parse_config(json11::Json & config);
|
||||
~etcd_state_client_t();
|
||||
};
|
||||
|
@@ -117,8 +117,15 @@ static struct fio_option options[] = {
|
||||
|
||||
static int sec_setup(struct thread_data *td)
|
||||
{
|
||||
sec_options *o = (sec_options*)td->eo;
|
||||
sec_data *bsd;
|
||||
|
||||
if (!o->etcd_host)
|
||||
{
|
||||
td_verror(td, EINVAL, "etcd address is missing");
|
||||
return 1;
|
||||
}
|
||||
|
||||
bsd = new sec_data;
|
||||
if (!bsd)
|
||||
{
|
||||
|
@@ -22,7 +22,6 @@
|
||||
#define READ_BUFFER_SIZE 9000
|
||||
|
||||
static int extract_port(std::string & host);
|
||||
static std::string strtolower(const std::string & in);
|
||||
static std::string trim(const std::string & in);
|
||||
static std::string ws_format_frame(int type, uint64_t size);
|
||||
static bool ws_parse_frame(std::string & buf, int & type, std::string & res);
|
||||
@@ -673,7 +672,7 @@ static int extract_port(std::string & host)
|
||||
return port;
|
||||
}
|
||||
|
||||
static std::string strtolower(const std::string & in)
|
||||
std::string strtolower(const std::string & in)
|
||||
{
|
||||
std::string s = in;
|
||||
for (int i = 0; i < s.length(); i++)
|
||||
|
@@ -49,6 +49,8 @@ std::vector<std::string> getifaddr_list(bool include_v6 = false);
|
||||
|
||||
uint64_t stoull_full(const std::string & str, int base = 10);
|
||||
|
||||
std::string strtolower(const std::string & in);
|
||||
|
||||
void http_request(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
|
||||
const http_options_t & options, std::function<void(const http_response_t *response)> callback);
|
||||
|
||||
|
@@ -10,30 +10,119 @@
|
||||
|
||||
#include "messenger.h"
|
||||
|
||||
osd_op_t::~osd_op_t()
|
||||
void osd_messenger_t::init()
|
||||
{
|
||||
assert(!bs_op);
|
||||
assert(!op_data);
|
||||
if (rmw_buf)
|
||||
keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
|
||||
{
|
||||
free(rmw_buf);
|
||||
}
|
||||
if (buf)
|
||||
{
|
||||
// Note: reusing osd_op_t WILL currently lead to memory leaks
|
||||
// So we don't reuse it, but free it every time
|
||||
free(buf);
|
||||
}
|
||||
std::vector<int> to_stop;
|
||||
std::vector<osd_op_t*> to_ping;
|
||||
for (auto cl_it = clients.begin(); cl_it != clients.end(); cl_it++)
|
||||
{
|
||||
auto cl = cl_it->second;
|
||||
if (!cl->osd_num || cl->peer_state != PEER_CONNECTED)
|
||||
{
|
||||
// Do not run keepalive on regular clients
|
||||
continue;
|
||||
}
|
||||
if (cl->ping_time_remaining > 0)
|
||||
{
|
||||
cl->ping_time_remaining--;
|
||||
if (!cl->ping_time_remaining)
|
||||
{
|
||||
// Ping timed out, stop the client
|
||||
printf("Ping timed out for OSD %lu (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
|
||||
to_stop.push_back(cl->peer_fd);
|
||||
}
|
||||
}
|
||||
else if (cl->idle_time_remaining > 0)
|
||||
{
|
||||
cl->idle_time_remaining--;
|
||||
if (!cl->idle_time_remaining)
|
||||
{
|
||||
// Connection is idle for <osd_idle_time>, send ping
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->peer_fd = cl->peer_fd;
|
||||
op->req = (osd_any_op_t){
|
||||
.hdr = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = this->next_subop_id++,
|
||||
.opcode = OSD_OP_PING,
|
||||
},
|
||||
};
|
||||
op->callback = [this, cl](osd_op_t *op)
|
||||
{
|
||||
int fail_fd = (op->reply.hdr.retval != 0 ? op->peer_fd : -1);
|
||||
cl->ping_time_remaining = 0;
|
||||
delete op;
|
||||
if (fail_fd >= 0)
|
||||
{
|
||||
printf("Ping failed for OSD %lu (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
|
||||
stop_client(fail_fd, true);
|
||||
}
|
||||
};
|
||||
to_ping.push_back(op);
|
||||
cl->ping_time_remaining = osd_ping_timeout;
|
||||
cl->idle_time_remaining = osd_idle_timeout;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
cl->idle_time_remaining = osd_idle_timeout;
|
||||
}
|
||||
}
|
||||
// Don't stop clients while a 'clients' iterator is still active
|
||||
for (int peer_fd: to_stop)
|
||||
{
|
||||
stop_client(peer_fd, true);
|
||||
}
|
||||
for (auto op: to_ping)
|
||||
{
|
||||
outbox_push(op);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Cancel the keepalive timer (when one is registered) and force-stop every remaining client.
osd_messenger_t::~osd_messenger_t()
{
    if (keepalive_timer_id >= 0)
    {
        tfd->clear_timer(keepalive_timer_id);
        keepalive_timer_id = -1;
    }
    // Always take the current first entry: the loop relies on stop_client()
    // removing the stopped client from `clients`
    while (!clients.empty())
    {
        stop_client(clients.begin()->first, true);
    }
}
|
||||
|
||||
// Load messenger settings from the JSON configuration.
// Connect/idle/ping settings that evaluate to zero are replaced by their DEFAULT_* constants.
void osd_messenger_t::parse_config(const json11::Json & config)
{
    // Reads a setting into an int (the settings are stored as ints) and
    // substitutes the default when the stored value would be zero
    auto int_or_default = [&config](const char *key, int default_value)
    {
        int parsed = config[key].uint64_value();
        return parsed ? parsed : default_value;
    };
    // The flag is accepted both as a JSON boolean and as a non-zero number
    const json11::Json & sync_flag = config["use_sync_send_recv"];
    this->use_sync_send_recv = sync_flag.bool_value() || sync_flag.uint64_value();
    this->peer_connect_interval = int_or_default("peer_connect_interval", DEFAULT_PEER_CONNECT_INTERVAL);
    this->peer_connect_timeout = int_or_default("peer_connect_timeout", DEFAULT_PEER_CONNECT_TIMEOUT);
    // Note: the idle timeout uses the same default constant as the ping timeout
    this->osd_idle_timeout = int_or_default("osd_idle_timeout", DEFAULT_OSD_PING_TIMEOUT);
    this->osd_ping_timeout = int_or_default("osd_ping_timeout", DEFAULT_OSD_PING_TIMEOUT);
    this->log_level = config["log_level"].uint64_value();
}
|
||||
|
||||
void osd_messenger_t::connect_peer(uint64_t peer_osd, json11::Json peer_state)
|
||||
{
|
||||
if (wanted_peers.find(peer_osd) == wanted_peers.end())
|
||||
@@ -49,17 +138,14 @@ void osd_messenger_t::connect_peer(uint64_t peer_osd, json11::Json peer_state)
|
||||
wanted_peers[peer_osd].port = (int)peer_state["port"].int64_value();
|
||||
}
|
||||
wanted_peers[peer_osd].address_changed = true;
|
||||
if (!wanted_peers[peer_osd].connecting &&
|
||||
(time(NULL) - wanted_peers[peer_osd].last_connect_attempt) >= peer_connect_interval)
|
||||
{
|
||||
try_connect_peer(peer_osd);
|
||||
}
|
||||
try_connect_peer(peer_osd);
|
||||
}
|
||||
|
||||
void osd_messenger_t::try_connect_peer(uint64_t peer_osd)
|
||||
{
|
||||
auto wp_it = wanted_peers.find(peer_osd);
|
||||
if (wp_it == wanted_peers.end())
|
||||
if (wp_it == wanted_peers.end() || wp_it->second.connecting ||
|
||||
(time(NULL) - wp_it->second.last_connect_attempt) < peer_connect_interval)
|
||||
{
|
||||
return;
|
||||
}
|
||||
@@ -105,10 +191,22 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
|
||||
on_connect_peer(peer_osd, -errno);
|
||||
return;
|
||||
}
|
||||
int timeout_id = -1;
|
||||
clients[peer_fd] = new osd_client_t();
|
||||
clients[peer_fd]->peer_addr = addr;
|
||||
clients[peer_fd]->peer_port = peer_port;
|
||||
clients[peer_fd]->peer_fd = peer_fd;
|
||||
clients[peer_fd]->peer_state = PEER_CONNECTING;
|
||||
clients[peer_fd]->connect_timeout_id = -1;
|
||||
clients[peer_fd]->osd_num = peer_osd;
|
||||
clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
|
||||
tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
// Either OUT (connected) or HUP
|
||||
handle_connect_epoll(peer_fd);
|
||||
});
|
||||
if (peer_connect_timeout > 0)
|
||||
{
|
||||
timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
|
||||
clients[peer_fd]->connect_timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
|
||||
{
|
||||
osd_num_t peer_osd = clients.at(peer_fd)->osd_num;
|
||||
stop_client(peer_fd, true);
|
||||
@@ -116,20 +214,6 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
|
||||
return;
|
||||
});
|
||||
}
|
||||
clients[peer_fd] = new osd_client_t((osd_client_t){
|
||||
.peer_addr = addr,
|
||||
.peer_port = peer_port,
|
||||
.peer_fd = peer_fd,
|
||||
.peer_state = PEER_CONNECTING,
|
||||
.connect_timeout_id = timeout_id,
|
||||
.osd_num = peer_osd,
|
||||
.in_buf = malloc_or_die(receive_buffer_size),
|
||||
});
|
||||
tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
// Either OUT (connected) or HUP
|
||||
handle_connect_epoll(peer_fd);
|
||||
});
|
||||
}
|
||||
|
||||
void osd_messenger_t::handle_connect_epoll(int peer_fd)
|
||||
@@ -281,123 +365,6 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
||||
outbox_push(op);
|
||||
}
|
||||
|
||||
// Cancel every operation already sent to the client, then forget its sent and queued operations.
void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
{
    for (auto sent_it = cl->sent_ops.begin(); sent_it != cl->sent_ops.end(); sent_it++)
    {
        cancel_op(sent_it->second);
    }
    cl->sent_ops.clear();
    // Operations still waiting in the outbox are only dropped from the list here
    cl->outbox.clear();
}
|
||||
|
||||
// Cancel a single operation.
// Outbound operations are completed through their callback with retval = -EPIPE;
// any other operation is simply destroyed.
void osd_messenger_t::cancel_op(osd_op_t *op)
{
    if (op->op_type != OSD_OP_OUT)
    {
        // This function is only called in stop_client(), so it's fine to destroy the operation
        delete op;
        return;
    }
    // Fabricate a reply header that mirrors the request and reports a broken pipe
    op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
    op->reply.hdr.id = op->req.hdr.id;
    op->reply.hdr.opcode = op->req.hdr.opcode;
    op->reply.hdr.retval = -EPIPE;
    // Call a copy of the callback so that it stays valid even if the callback does `delete op`
    std::function<void(osd_op_t*)> completion = op->callback;
    completion(op);
}
|
||||
|
||||
void osd_messenger_t::stop_client(int peer_fd, bool force)
|
||||
{
|
||||
assert(peer_fd != 0);
|
||||
auto it = clients.find(peer_fd);
|
||||
if (it == clients.end())
|
||||
{
|
||||
return;
|
||||
}
|
||||
uint64_t repeer_osd = 0;
|
||||
osd_client_t *cl = it->second;
|
||||
if (cl->peer_state == PEER_CONNECTED)
|
||||
{
|
||||
if (cl->osd_num)
|
||||
{
|
||||
// Reload configuration from etcd when the connection is dropped
|
||||
if (log_level > 0)
|
||||
printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl->osd_num);
|
||||
repeer_osd = cl->osd_num;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (log_level > 0)
|
||||
printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
|
||||
}
|
||||
}
|
||||
else if (!force)
|
||||
{
|
||||
return;
|
||||
}
|
||||
cl->peer_state = PEER_STOPPED;
|
||||
clients.erase(it);
|
||||
tfd->set_fd_handler(peer_fd, false, NULL);
|
||||
if (cl->connect_timeout_id >= 0)
|
||||
{
|
||||
tfd->clear_timer(cl->connect_timeout_id);
|
||||
cl->connect_timeout_id = -1;
|
||||
}
|
||||
if (cl->osd_num)
|
||||
{
|
||||
osd_peer_fds.erase(cl->osd_num);
|
||||
}
|
||||
if (cl->read_op)
|
||||
{
|
||||
if (cl->read_op->callback)
|
||||
{
|
||||
cancel_op(cl->read_op);
|
||||
}
|
||||
else
|
||||
{
|
||||
delete cl->read_op;
|
||||
}
|
||||
cl->read_op = NULL;
|
||||
}
|
||||
for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
|
||||
{
|
||||
if (*rit == peer_fd)
|
||||
{
|
||||
read_ready_clients.erase(rit);
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++)
|
||||
{
|
||||
if (*wit == peer_fd)
|
||||
{
|
||||
write_ready_clients.erase(wit);
|
||||
break;
|
||||
}
|
||||
}
|
||||
free(cl->in_buf);
|
||||
cl->in_buf = NULL;
|
||||
close(peer_fd);
|
||||
if (repeer_osd)
|
||||
{
|
||||
// First repeer PGs as canceling OSD ops may push new operations
|
||||
// and we need correct PG states when we do that
|
||||
repeer_pgs(repeer_osd);
|
||||
}
|
||||
if (cl->osd_num)
|
||||
{
|
||||
// Cancel outbound operations
|
||||
cancel_osd_ops(cl);
|
||||
}
|
||||
if (cl->refs <= 0)
|
||||
{
|
||||
delete cl;
|
||||
}
|
||||
}
|
||||
|
||||
void osd_messenger_t::accept_connections(int listen_fd)
|
||||
{
|
||||
// Accept new connections
|
||||
@@ -413,13 +380,12 @@ void osd_messenger_t::accept_connections(int listen_fd)
|
||||
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
|
||||
int one = 1;
|
||||
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
||||
clients[peer_fd] = new osd_client_t((osd_client_t){
|
||||
.peer_addr = addr,
|
||||
.peer_port = ntohs(addr.sin_port),
|
||||
.peer_fd = peer_fd,
|
||||
.peer_state = PEER_CONNECTED,
|
||||
.in_buf = malloc_or_die(receive_buffer_size),
|
||||
});
|
||||
clients[peer_fd] = new osd_client_t();
|
||||
clients[peer_fd]->peer_addr = addr;
|
||||
clients[peer_fd]->peer_port = ntohs(addr.sin_port);
|
||||
clients[peer_fd]->peer_fd = peer_fd;
|
||||
clients[peer_fd]->peer_state = PEER_CONNECTED;
|
||||
clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
|
||||
// Add FD to epoll
|
||||
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
|
198
src/messenger.h
198
src/messenger.h
@@ -14,19 +14,15 @@
|
||||
|
||||
#include "malloc_or_die.h"
|
||||
#include "json11/json11.hpp"
|
||||
#include "osd_ops.h"
|
||||
#include "msgr_op.h"
|
||||
#include "timerfd_manager.h"
|
||||
#include "ringloop.h"
|
||||
|
||||
#define OSD_OP_IN 0
|
||||
#define OSD_OP_OUT 1
|
||||
#include <ringloop.h>
|
||||
|
||||
#define CL_READ_HDR 1
|
||||
#define CL_READ_DATA 2
|
||||
#define CL_READ_REPLY_DATA 3
|
||||
#define CL_WRITE_READY 1
|
||||
#define CL_WRITE_REPLY 2
|
||||
#define OSD_OP_INLINE_BUF_COUNT 16
|
||||
|
||||
#define PEER_CONNECTING 1
|
||||
#define PEER_CONNECTED 2
|
||||
@@ -34,160 +30,7 @@
|
||||
|
||||
#define DEFAULT_PEER_CONNECT_INTERVAL 5
|
||||
#define DEFAULT_PEER_CONNECT_TIMEOUT 5
|
||||
|
||||
// Kind of a vector with small-list-optimisation
|
||||
// A growable list of iovec entries with small-list optimisation.
// Up to OSD_OP_INLINE_BUF_COUNT entries live in `inline_buf`; larger lists move to a
// malloc()ed array. `done` counts the entries that were already fully consumed by eat().
struct osd_op_buf_list_t
{
    int count = 0, alloc = OSD_OP_INLINE_BUF_COUNT, done = 0;
    iovec *buf = NULL;
    iovec inline_buf[OSD_OP_INLINE_BUF_COUNT];

    inline osd_op_buf_list_t()
    {
        buf = inline_buf;
    }

    // Copies all entries of `other`; the copy starts with done == 0
    inline osd_op_buf_list_t(const osd_op_buf_list_t & other)
    {
        buf = inline_buf;
        append(other);
    }

    inline osd_op_buf_list_t & operator = (const osd_op_buf_list_t & other)
    {
        // Self-assignment must be a no-op: reset() followed by append(*this) would empty the list
        if (this != &other)
        {
            reset();
            append(other);
        }
        return *this;
    }

    inline ~osd_op_buf_list_t()
    {
        if (buf && buf != inline_buf)
        {
            free(buf);
        }
    }

    // Forget all entries; the allocated capacity is kept
    inline void reset()
    {
        count = 0;
        done = 0;
    }

    // First entry that is not fully consumed yet
    inline iovec* get_iovec()
    {
        return buf + done;
    }

    // Number of entries that are not fully consumed yet
    inline int get_size()
    {
        return count - done;
    }

    // Append all entries of `other` (including already consumed ones)
    inline void append(const osd_op_buf_list_t & other)
    {
        // Take the count first so that appending a list to itself terminates
        const int extra = other.count;
        if (count+extra > alloc)
        {
            // Round the capacity up to a multiple of 16
            grow_to(((count+extra+15)/16)*16);
        }
        // other.buf is re-read on every iteration: it may have moved if &other == this
        for (int i = 0; i < extra; i++)
        {
            buf[count++] = other.buf[i];
        }
    }

    // Append one buffer of `len` bytes
    inline void push_back(void *nbuf, size_t len)
    {
        if (count >= alloc)
        {
            // Same growth steps as before: the first move off the inline buffer adds one slot,
            // subsequent reallocations add 16
            grow_to(buf == inline_buf
                ? ((alloc/16)*16 + 1)
                : (alloc < 16 ? 16 : (alloc+16)));
        }
        buf[count].iov_base = nbuf;
        buf[count].iov_len = len;
        count++;
    }

    // Mark `result` bytes as consumed: fully consumed entries are skipped by advancing `done`,
    // a partially consumed entry is shortened in place
    inline void eat(int result)
    {
        while (result > 0 && done < count)
        {
            iovec & iov = buf[done];
            if (iov.iov_len <= (size_t)result)
            {
                result -= iov.iov_len;
                done++;
            }
            else
            {
                iov.iov_len -= result;
                // Arithmetic on void* is a compiler extension, so advance through char*
                iov.iov_base = (char*)iov.iov_base + result;
                break;
            }
        }
    }

    // Change the capacity to `new_alloc` entries, moving off the inline buffer if needed.
    // Terminates the process if memory can't be allocated.
    inline void grow_to(int new_alloc)
    {
        iovec *new_buf;
        if (buf == inline_buf)
        {
            new_buf = (iovec*)malloc(sizeof(iovec) * new_alloc);
            if (new_buf)
            {
                memcpy(new_buf, inline_buf, sizeof(iovec) * count);
            }
        }
        else
        {
            new_buf = (iovec*)realloc(buf, sizeof(iovec) * new_alloc);
        }
        if (!new_buf)
        {
            printf("Failed to allocate %lu bytes\n", (unsigned long)(sizeof(iovec) * new_alloc));
            exit(1);
        }
        buf = new_buf;
        alloc = new_alloc;
    }
};
|
||||
|
||||
struct blockstore_op_t;
|
||||
|
||||
struct osd_primary_op_data_t;
|
||||
|
||||
struct osd_op_t
|
||||
{
|
||||
timespec tv_begin;
|
||||
uint64_t op_type = OSD_OP_IN;
|
||||
int peer_fd;
|
||||
osd_any_op_t req;
|
||||
osd_any_reply_t reply;
|
||||
blockstore_op_t *bs_op = NULL;
|
||||
void *buf = NULL;
|
||||
void *rmw_buf = NULL;
|
||||
osd_primary_op_data_t* op_data = NULL;
|
||||
std::function<void(osd_op_t*)> callback;
|
||||
|
||||
osd_op_buf_list_t iov;
|
||||
|
||||
~osd_op_t();
|
||||
};
|
||||
#define DEFAULT_OSD_PING_TIMEOUT 5
|
||||
|
||||
struct osd_client_t
|
||||
{
|
||||
@@ -198,6 +41,8 @@ struct osd_client_t
|
||||
int peer_fd;
|
||||
int peer_state;
|
||||
int connect_timeout_id = -1;
|
||||
int ping_time_remaining = 0;
|
||||
int idle_time_remaining = 0;
|
||||
osd_num_t osd_num = 0;
|
||||
|
||||
void *in_buf = NULL;
|
||||
@@ -225,6 +70,12 @@ struct osd_client_t
|
||||
int write_state = 0;
|
||||
std::vector<iovec> send_list, next_send_list;
|
||||
std::vector<osd_op_t*> outbox, next_outbox;
|
||||
|
||||
~osd_client_t()
|
||||
{
|
||||
free(in_buf);
|
||||
in_buf = NULL;
|
||||
}
|
||||
};
|
||||
|
||||
struct osd_wanted_peer_t
|
||||
@@ -249,37 +100,41 @@ struct osd_op_stats_t
|
||||
|
||||
struct osd_messenger_t
|
||||
{
|
||||
timerfd_manager_t *tfd;
|
||||
ring_loop_t *ringloop;
|
||||
protected:
|
||||
int keepalive_timer_id = -1;
|
||||
|
||||
// osd_num_t is only for logging and asserts
|
||||
osd_num_t osd_num;
|
||||
// FIXME: make receive_buffer_size configurable
|
||||
int receive_buffer_size = 64*1024;
|
||||
int peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
|
||||
int peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
|
||||
int osd_idle_timeout = DEFAULT_OSD_PING_TIMEOUT;
|
||||
int osd_ping_timeout = DEFAULT_OSD_PING_TIMEOUT;
|
||||
int log_level = 0;
|
||||
bool use_sync_send_recv = false;
|
||||
|
||||
std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
|
||||
std::map<uint64_t, int> osd_peer_fds;
|
||||
uint64_t next_subop_id = 1;
|
||||
|
||||
std::map<int, osd_client_t*> clients;
|
||||
std::vector<int> read_ready_clients;
|
||||
std::vector<int> write_ready_clients;
|
||||
std::vector<std::function<void()>> set_immediate;
|
||||
|
||||
public:
|
||||
timerfd_manager_t *tfd;
|
||||
ring_loop_t *ringloop;
|
||||
// osd_num_t is only for logging and asserts
|
||||
osd_num_t osd_num;
|
||||
uint64_t next_subop_id = 1;
|
||||
std::map<int, osd_client_t*> clients;
|
||||
std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
|
||||
std::map<uint64_t, int> osd_peer_fds;
|
||||
// op statistics
|
||||
osd_op_stats_t stats;
|
||||
|
||||
public:
|
||||
void init();
|
||||
void parse_config(const json11::Json & config);
|
||||
void connect_peer(uint64_t osd_num, json11::Json peer_state);
|
||||
void stop_client(int peer_fd, bool force = false);
|
||||
void outbox_push(osd_op_t *cur_op);
|
||||
std::function<void(osd_op_t*)> exec_op;
|
||||
std::function<void(osd_num_t)> repeer_pgs;
|
||||
void handle_peer_epoll(int peer_fd, int epoll_events);
|
||||
void read_requests();
|
||||
void send_replies();
|
||||
void accept_connections(int listen_fd);
|
||||
@@ -288,6 +143,7 @@ public:
|
||||
protected:
|
||||
void try_connect_peer(uint64_t osd_num);
|
||||
void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
|
||||
void handle_peer_epoll(int peer_fd, int epoll_events);
|
||||
void handle_connect_epoll(int peer_fd);
|
||||
void on_connect_peer(osd_num_t peer_osd, int peer_fd);
|
||||
void check_peer_config(osd_client_t *cl);
|
||||
|
1
src/mock/build.sh
Normal file
1
src/mock/build.sh
Normal file
@@ -0,0 +1 @@
|
||||
g++ -D__MOCK__ -fsanitize=address -g -Wno-pointer-arith pg_states.cpp osd_ops.cpp test_cluster_client.cpp cluster_client.cpp msgr_op.cpp msgr_stop.cpp mock/messenger.cpp etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp -I mock -I . -I ..; ./a.out
|
44
src/mock/messenger.cpp
Normal file
44
src/mock/messenger.cpp
Normal file
@@ -0,0 +1,44 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||
|
||||
#include <unistd.h>
|
||||
#include <stdexcept>
|
||||
#include <assert.h>
|
||||
|
||||
#include "messenger.h"
|
||||
|
||||
void osd_messenger_t::init()
|
||||
{
|
||||
}
|
||||
|
||||
// Mock messenger teardown: force-stop clients until none are left.
osd_messenger_t::~osd_messenger_t()
{
    // Always take the current first entry: the loop relies on stop_client()
    // removing the stopped client from `clients`
    while (!clients.empty())
    {
        stop_client(clients.begin()->first, true);
    }
}
|
||||
|
||||
// Mock send: nothing goes over the network, the operation is only recorded
// in the target client's sent_ops under its request id.
void osd_messenger_t::outbox_push(osd_op_t *cur_op)
{
    // at() instead of operator[]: for an unknown peer_fd operator[] would insert a NULL
    // client into the map and dereference it; at() throws std::out_of_range instead
    clients.at(cur_op->peer_fd)->sent_ops[cur_op->req.hdr.id] = cur_op;
}
|
||||
|
||||
void osd_messenger_t::parse_config(const json11::Json & config)
|
||||
{
|
||||
}
|
||||
|
||||
void osd_messenger_t::connect_peer(uint64_t peer_osd, json11::Json peer_state)
|
||||
{
|
||||
wanted_peers[peer_osd] = (osd_wanted_peer_t){
|
||||
.port = 1,
|
||||
};
|
||||
}
|
||||
|
||||
void osd_messenger_t::read_requests()
|
||||
{
|
||||
}
|
||||
|
||||
void osd_messenger_t::send_replies()
|
||||
{
|
||||
}
|
25
src/mock/ringloop.h
Normal file
25
src/mock/ringloop.h
Normal file
@@ -0,0 +1,25 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
|
||||
// Mock consumer descriptor: only carries the loop callback.
struct ring_consumer_t
{
    std::function<void()> loop;
};

// Mock ring loop: every method is a no-op, consumers are never stored
// and their callbacks are never invoked by this class.
class ring_loop_t
{
public:
    void register_consumer(ring_consumer_t *) {}

    void unregister_consumer(ring_consumer_t *) {}

    void submit() {}
};
|
22
src/msgr_op.cpp
Normal file
22
src/msgr_op.cpp
Normal file
@@ -0,0 +1,22 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
#include "msgr_op.h"
|
||||
|
||||
// Destroys the operation and releases the buffers it owns.
osd_op_t::~osd_op_t()
{
    // The blockstore sub-operation and the primary op data must already be
    // detached (NULL) by the time the operation is destroyed
    assert(!bs_op);
    assert(!op_data);
    // free(NULL) is a no-op, so the pointers need no explicit checks
    free(rmw_buf);
    // Note: reusing osd_op_t WILL currently lead to memory leaks
    // So we don't reuse it, but free it every time
    free(buf);
}
|
171
src/msgr_op.h
Normal file
171
src/msgr_op.h
Normal file
@@ -0,0 +1,171 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <sys/uio.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "osd_ops.h"
|
||||
|
||||
#define OSD_OP_IN 0
|
||||
#define OSD_OP_OUT 1
|
||||
|
||||
#define OSD_OP_INLINE_BUF_COUNT 16

// Kind of a vector with small-list-optimisation.
//
// Holds a list of iovec entries. Up to OSD_OP_INLINE_BUF_COUNT entries live
// in `inline_buf`; beyond that the list moves to a malloc()ed array.
//   count - number of entries stored
//   alloc - capacity of `buf`, in entries
//   done  - number of leading entries already fully consumed by eat()
// Note: copying a list copies all `count` entries; the `done` cursor of the
// source is not carried over (the copy starts with done = 0).
struct osd_op_buf_list_t
{
    int count = 0, alloc = OSD_OP_INLINE_BUF_COUNT, done = 0;
    iovec *buf = NULL;
    iovec inline_buf[OSD_OP_INLINE_BUF_COUNT];

    inline osd_op_buf_list_t()
    {
        buf = inline_buf;
    }

    inline osd_op_buf_list_t(const osd_op_buf_list_t & other)
    {
        buf = inline_buf;
        append(other);
    }

    inline osd_op_buf_list_t & operator = (const osd_op_buf_list_t & other)
    {
        // Guard against self-assignment: reset() followed by append(*this)
        // would silently drop every entry
        if (this != &other)
        {
            reset();
            append(other);
        }
        return *this;
    }

    inline ~osd_op_buf_list_t()
    {
        if (buf && buf != inline_buf)
        {
            free(buf);
        }
    }

    // Forget all entries; allocated capacity is kept
    inline void reset()
    {
        count = 0;
        done = 0;
    }

    // First entry that is not fully consumed yet
    inline iovec* get_iovec()
    {
        return buf + done;
    }

    // Number of entries that are not fully consumed yet
    inline int get_size()
    {
        return count - done;
    }

    // Make sure `buf` can hold at least <needed> entries.
    // Capacity is rounded up to a multiple of 16. Prints a message and
    // terminates the process with exit(1) if the allocation fails.
    inline void reserve(int needed)
    {
        if (needed <= alloc)
        {
            return;
        }
        int new_alloc = ((needed + 15) / 16) * 16;
        iovec *new_buf;
        if (buf == inline_buf)
        {
            // First spill out of inline storage: existing entries must be copied by hand
            new_buf = (iovec*)malloc(sizeof(iovec) * new_alloc);
            if (new_buf)
            {
                memcpy(new_buf, inline_buf, sizeof(iovec) * count);
            }
        }
        else
        {
            new_buf = (iovec*)realloc(buf, sizeof(iovec) * new_alloc);
        }
        if (!new_buf)
        {
            printf("Failed to allocate %zu bytes\n", sizeof(iovec) * new_alloc);
            exit(1);
        }
        buf = new_buf;
        alloc = new_alloc;
    }

    // Append all entries of <other> (which may be this very list)
    inline void append(const osd_op_buf_list_t & other)
    {
        // Snapshot the source size: when other is *this, other.count grows
        // with every copied entry and the loop would never terminate
        const int extra = other.count;
        reserve(count + extra);
        for (int i = 0; i < extra; i++)
        {
            buf[count++] = other.buf[i];
        }
    }

    // Append one buffer of <len> bytes starting at <nbuf>
    inline void push_back(void *nbuf, size_t len)
    {
        reserve(count + 1);
        buf[count].iov_base = nbuf;
        buf[count].iov_len = len;
        count++;
    }

    // Mark <result> bytes as consumed: fully consumed entries are skipped by
    // advancing `done`, a partially consumed entry is shrunk in place.
    // Non-positive values are ignored.
    inline void eat(int result)
    {
        while (result > 0 && done < count)
        {
            iovec & iov = buf[done];
            if (iov.iov_len <= (size_t)result)
            {
                result -= (int)iov.iov_len;
                done++;
            }
            else
            {
                iov.iov_len -= (size_t)result;
                // Cast to char*: arithmetic on void* is a compiler extension
                iov.iov_base = (char*)iov.iov_base + result;
                break;
            }
        }
    }
};
|
||||
|
||||
struct blockstore_op_t;
|
||||
|
||||
struct osd_primary_op_data_t;
|
||||
|
||||
struct osd_op_t
|
||||
{
|
||||
timespec tv_begin;
|
||||
uint64_t op_type = OSD_OP_IN;
|
||||
int peer_fd;
|
||||
osd_any_op_t req;
|
||||
osd_any_reply_t reply;
|
||||
blockstore_op_t *bs_op = NULL;
|
||||
void *buf = NULL;
|
||||
void *rmw_buf = NULL;
|
||||
osd_primary_op_data_t* op_data = NULL;
|
||||
std::function<void(osd_op_t*)> callback;
|
||||
|
||||
osd_op_buf_list_t iov;
|
||||
|
||||
~osd_op_t();
|
||||
};
|
@@ -180,7 +180,7 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
||||
cl->refs--;
|
||||
if (cl->peer_state == PEER_STOPPED)
|
||||
{
|
||||
if (!cl->refs)
|
||||
if (cl->refs <= 0)
|
||||
{
|
||||
delete cl;
|
||||
}
|
||||
|
137
src/msgr_stop.cpp
Normal file
137
src/msgr_stop.cpp
Normal file
@@ -0,0 +1,137 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||
|
||||
#include <unistd.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "messenger.h"
|
||||
|
||||
// Cancels every operation recorded in the client's sent_ops and drops its outbox.
void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
{
    // Take a snapshot first: sent_ops is cleared before any operation is
    // cancelled, so cancel_op() never runs while the map is being iterated
    std::vector<osd_op_t*> pending;
    pending.reserve(cl->sent_ops.size());
    for (auto & sent: cl->sent_ops)
    {
        pending.push_back(sent.second);
    }
    cl->sent_ops.clear();
    cl->outbox.clear();
    for (size_t idx = 0; idx < pending.size(); idx++)
    {
        cancel_op(pending[idx]);
    }
}
|
||||
|
||||
// Cancels one operation.
// Outbound operations (OSD_OP_OUT) are completed through their callback with
// a synthesized reply whose retval is -EPIPE; all other operations are deleted.
void osd_messenger_t::cancel_op(osd_op_t *op)
{
    if (op->op_type != OSD_OP_OUT)
    {
        // This function is only called in stop_client(), so it's fine to destroy the operation
        delete op;
        return;
    }
    // Build a reply that mirrors the request header
    op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
    op->reply.hdr.id = op->req.hdr.id;
    op->reply.hdr.opcode = op->req.hdr.opcode;
    op->reply.hdr.retval = -EPIPE;
    // Copy lambda to be unaffected by `delete op`
    std::function<void(osd_op_t*)> on_cancel = op->callback;
    on_cancel(op);
}
|
||||
|
||||
void osd_messenger_t::stop_client(int peer_fd, bool force)
|
||||
{
|
||||
assert(peer_fd != 0);
|
||||
auto it = clients.find(peer_fd);
|
||||
if (it == clients.end())
|
||||
{
|
||||
return;
|
||||
}
|
||||
osd_client_t *cl = it->second;
|
||||
if (cl->peer_state == PEER_CONNECTING && !force || cl->peer_state == PEER_STOPPED)
|
||||
{
|
||||
return;
|
||||
}
|
||||
if (log_level > 0)
|
||||
{
|
||||
if (cl->osd_num)
|
||||
{
|
||||
printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl->osd_num);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
|
||||
}
|
||||
}
|
||||
// First set state to STOPPED so another stop_client() call doesn't try to free it again
|
||||
cl->refs++;
|
||||
cl->peer_state = PEER_STOPPED;
|
||||
if (cl->osd_num)
|
||||
{
|
||||
// ...and forget OSD peer
|
||||
osd_peer_fds.erase(cl->osd_num);
|
||||
}
|
||||
#ifndef __MOCK__
|
||||
// Then remove FD from the eventloop so we don't accidentally read something
|
||||
tfd->set_fd_handler(peer_fd, false, NULL);
|
||||
if (cl->connect_timeout_id >= 0)
|
||||
{
|
||||
tfd->clear_timer(cl->connect_timeout_id);
|
||||
cl->connect_timeout_id = -1;
|
||||
}
|
||||
for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
|
||||
{
|
||||
if (*rit == peer_fd)
|
||||
{
|
||||
read_ready_clients.erase(rit);
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++)
|
||||
{
|
||||
if (*wit == peer_fd)
|
||||
{
|
||||
write_ready_clients.erase(wit);
|
||||
break;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (cl->osd_num)
|
||||
{
|
||||
// Then repeer PGs because cancel_op() callbacks can try to perform
|
||||
// some actions and we need correct PG states to not do something silly
|
||||
repeer_pgs(cl->osd_num);
|
||||
}
|
||||
// Then cancel all operations
|
||||
if (cl->read_op)
|
||||
{
|
||||
if (!cl->read_op->callback)
|
||||
{
|
||||
delete cl->read_op;
|
||||
}
|
||||
cl->read_op = NULL;
|
||||
}
|
||||
if (cl->osd_num)
|
||||
{
|
||||
// Cancel outbound operations
|
||||
cancel_osd_ops(cl);
|
||||
}
|
||||
#ifndef __MOCK__
|
||||
// And close the FD only when everything is done
|
||||
// ...because peer_fd number can get reused after close()
|
||||
close(peer_fd);
|
||||
#endif
|
||||
// Find the item again because it can be invalidated at this point
|
||||
it = clients.find(peer_fd);
|
||||
if (it != clients.end())
|
||||
{
|
||||
clients.erase(it);
|
||||
}
|
||||
cl->refs--;
|
||||
if (cl->refs <= 0)
|
||||
{
|
||||
delete cl;
|
||||
}
|
||||
}
|
53
src/osd.cpp
53
src/osd.cpp
@@ -8,16 +8,20 @@
|
||||
#include <arpa/inet.h>
|
||||
|
||||
#include "osd.h"
|
||||
#include "http_client.h"
|
||||
|
||||
osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop)
|
||||
osd_t::osd_t(blockstore_config_t & config, ring_loop_t *ringloop)
|
||||
{
|
||||
config["entry_attr_size"] = "0";
|
||||
|
||||
this->config = config;
|
||||
this->bs = bs;
|
||||
this->ringloop = ringloop;
|
||||
|
||||
// FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config
|
||||
this->bs = new blockstore_t(config, ringloop);
|
||||
|
||||
this->bs_block_size = bs->get_block_size();
|
||||
// FIXME: use bitmap granularity instead
|
||||
this->bs_disk_alignment = bs->get_disk_alignment();
|
||||
this->bs_bitmap_granularity = bs->get_bitmap_granularity();
|
||||
|
||||
parse_config(config);
|
||||
|
||||
@@ -37,6 +41,7 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo
|
||||
c_cli.ringloop = this->ringloop;
|
||||
c_cli.exec_op = [this](osd_op_t *op) { exec_op(op); };
|
||||
c_cli.repeer_pgs = [this](osd_num_t peer_osd) { repeer_pgs(peer_osd); };
|
||||
c_cli.init();
|
||||
|
||||
init_cluster();
|
||||
|
||||
@@ -48,6 +53,7 @@ osd_t::~osd_t()
|
||||
{
|
||||
ringloop->unregister_consumer(&consumer);
|
||||
delete epmgr;
|
||||
delete bs;
|
||||
close(listen_fd);
|
||||
}
|
||||
|
||||
@@ -55,6 +61,7 @@ void osd_t::parse_config(blockstore_config_t & config)
|
||||
{
|
||||
if (config.find("log_level") == config.end())
|
||||
config["log_level"] = "1";
|
||||
log_level = strtoull(config["log_level"].c_str(), NULL, 10);
|
||||
// Initial startup configuration
|
||||
json11::Json json_config = json11::Json(config);
|
||||
st_cli.parse_config(json_config);
|
||||
@@ -66,6 +73,8 @@ void osd_t::parse_config(blockstore_config_t & config)
|
||||
throw std::runtime_error("osd_num is required in the configuration");
|
||||
c_cli.osd_num = osd_num;
|
||||
run_primary = config["run_primary"] != "false" && config["run_primary"] != "0" && config["run_primary"] != "no";
|
||||
no_rebalance = config["no_rebalance"] == "true" || config["no_rebalance"] == "1" || config["no_rebalance"] == "yes";
|
||||
no_recovery = config["no_recovery"] == "true" || config["no_recovery"] == "1" || config["no_recovery"] == "yes";
|
||||
// Cluster configuration
|
||||
bind_address = config["bind_address"];
|
||||
if (bind_address == "")
|
||||
@@ -92,6 +101,9 @@ void osd_t::parse_config(blockstore_config_t & config)
|
||||
recovery_queue_depth = strtoull(config["recovery_queue_depth"].c_str(), NULL, 10);
|
||||
if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
|
||||
recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
|
||||
recovery_sync_batch = strtoull(config["recovery_sync_batch"].c_str(), NULL, 10);
|
||||
if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
|
||||
recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
||||
if (config["readonly"] == "true" || config["readonly"] == "1" || config["readonly"] == "yes")
|
||||
readonly = true;
|
||||
print_stats_interval = strtoull(config["print_stats_interval"].c_str(), NULL, 10);
|
||||
@@ -100,14 +112,7 @@ void osd_t::parse_config(blockstore_config_t & config)
|
||||
slow_log_interval = strtoull(config["slow_log_interval"].c_str(), NULL, 10);
|
||||
if (!slow_log_interval)
|
||||
slow_log_interval = 10;
|
||||
c_cli.peer_connect_interval = strtoull(config["peer_connect_interval"].c_str(), NULL, 10);
|
||||
if (!c_cli.peer_connect_interval)
|
||||
c_cli.peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
|
||||
c_cli.peer_connect_timeout = strtoull(config["peer_connect_timeout"].c_str(), NULL, 10);
|
||||
if (!c_cli.peer_connect_timeout)
|
||||
c_cli.peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
|
||||
log_level = strtoull(config["log_level"].c_str(), NULL, 10);
|
||||
c_cli.log_level = log_level;
|
||||
c_cli.parse_config(json_config);
|
||||
}
|
||||
|
||||
void osd_t::bind_socket()
|
||||
@@ -171,7 +176,7 @@ bool osd_t::shutdown()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
return bs->is_safe_to_stop();
|
||||
return !bs || bs->is_safe_to_stop();
|
||||
}
|
||||
|
||||
void osd_t::loop()
|
||||
@@ -191,6 +196,8 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
||||
delete cur_op;
|
||||
return;
|
||||
}
|
||||
// Clear the reply buffer
|
||||
memset(cur_op->reply.buf, 0, OSD_PACKET_SIZE);
|
||||
inflight_ops++;
|
||||
if (cur_op->req.hdr.magic != SECONDARY_OSD_OP_MAGIC ||
|
||||
cur_op->req.hdr.opcode < OSD_OP_MIN || cur_op->req.hdr.opcode > OSD_OP_MAX ||
|
||||
@@ -198,19 +205,25 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) &&
|
||||
(cur_op->req.sec_rw.len > OSD_RW_MAX ||
|
||||
cur_op->req.sec_rw.len % bs_disk_alignment ||
|
||||
cur_op->req.sec_rw.offset % bs_disk_alignment)) ||
|
||||
cur_op->req.sec_rw.len % bs_bitmap_granularity ||
|
||||
cur_op->req.sec_rw.offset % bs_bitmap_granularity)) ||
|
||||
((cur_op->req.hdr.opcode == OSD_OP_READ ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_WRITE ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_DELETE) &&
|
||||
(cur_op->req.rw.len > OSD_RW_MAX ||
|
||||
cur_op->req.rw.len % bs_disk_alignment ||
|
||||
cur_op->req.rw.offset % bs_disk_alignment)))
|
||||
cur_op->req.rw.len % bs_bitmap_granularity ||
|
||||
cur_op->req.rw.offset % bs_bitmap_granularity)))
|
||||
{
|
||||
// Bad command
|
||||
finish_op(cur_op, -EINVAL);
|
||||
return;
|
||||
}
|
||||
if (cur_op->req.hdr.opcode == OSD_OP_PING)
|
||||
{
|
||||
// Pong
|
||||
finish_op(cur_op, 0);
|
||||
return;
|
||||
}
|
||||
if (readonly &&
|
||||
cur_op->req.hdr.opcode != OSD_OP_SEC_READ &&
|
||||
cur_op->req.hdr.opcode != OSD_OP_SEC_LIST &&
|
||||
@@ -261,9 +274,9 @@ void osd_t::reset_stats()
|
||||
|
||||
void osd_t::print_stats()
|
||||
{
|
||||
for (int i = 0; i <= OSD_OP_MAX; i++)
|
||||
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
|
||||
{
|
||||
if (c_cli.stats.op_stat_count[i] != prev_stats.op_stat_count[i])
|
||||
if (c_cli.stats.op_stat_count[i] != prev_stats.op_stat_count[i] && i != OSD_OP_PING)
|
||||
{
|
||||
uint64_t avg = (c_cli.stats.op_stat_sum[i] - prev_stats.op_stat_sum[i])/(c_cli.stats.op_stat_count[i] - prev_stats.op_stat_count[i]);
|
||||
uint64_t bw = (c_cli.stats.op_stat_bytes[i] - prev_stats.op_stat_bytes[i]) / print_stats_interval;
|
||||
@@ -284,7 +297,7 @@ void osd_t::print_stats()
|
||||
prev_stats.op_stat_bytes[i] = c_cli.stats.op_stat_bytes[i];
|
||||
}
|
||||
}
|
||||
for (int i = 0; i <= OSD_OP_MAX; i++)
|
||||
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
|
||||
{
|
||||
if (c_cli.stats.subop_stat_count[i] != prev_stats.subop_stat_count[i])
|
||||
{
|
||||
|
17
src/osd.h
17
src/osd.h
@@ -37,6 +37,7 @@
|
||||
#define DEFAULT_AUTOSYNC_INTERVAL 5
|
||||
#define MAX_RECOVERY_QUEUE 2048
|
||||
#define DEFAULT_RECOVERY_QUEUE 4
|
||||
#define DEFAULT_RECOVERY_BATCH 16
|
||||
|
||||
//#define OSD_STUB
|
||||
|
||||
@@ -64,6 +65,8 @@ class osd_t
|
||||
bool readonly = false;
|
||||
osd_num_t osd_num = 1; // OSD numbers start with 1
|
||||
bool run_primary = false;
|
||||
bool no_rebalance = false;
|
||||
bool no_recovery = false;
|
||||
std::string bind_address;
|
||||
int bind_port, listen_backlog;
|
||||
// FIXME: Implement client queue depth limit
|
||||
@@ -74,6 +77,7 @@ class osd_t
|
||||
int immediate_commit = IMMEDIATE_NONE;
|
||||
int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // sync every 5 seconds
|
||||
int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
|
||||
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
||||
int log_level = 0;
|
||||
|
||||
// cluster state
|
||||
@@ -95,9 +99,11 @@ class osd_t
|
||||
std::map<pool_pg_num_t, pg_t> pgs;
|
||||
std::set<pool_pg_num_t> dirty_pgs;
|
||||
std::set<osd_num_t> dirty_osds;
|
||||
int copies_to_delete_after_sync_count = 0;
|
||||
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
|
||||
int peering_state = 0;
|
||||
std::map<object_id, osd_recovery_op_t> recovery_ops;
|
||||
int recovery_done = 0;
|
||||
osd_op_t *autosync_op = NULL;
|
||||
|
||||
// Unstable writes
|
||||
@@ -109,7 +115,7 @@ class osd_t
|
||||
bool stopping = false;
|
||||
int inflight_ops = 0;
|
||||
blockstore_t *bs;
|
||||
uint32_t bs_block_size, bs_disk_alignment;
|
||||
uint32_t bs_block_size, bs_bitmap_granularity;
|
||||
ring_loop_t *ringloop;
|
||||
timerfd_manager_t *tfd = NULL;
|
||||
epoll_manager_t *epmgr = NULL;
|
||||
@@ -160,6 +166,7 @@ class osd_t
|
||||
void submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
|
||||
void discard_list_subop(osd_op_t *list_op);
|
||||
bool stop_pg(pg_t & pg);
|
||||
void reset_pg(pg_t & pg);
|
||||
void finish_stop_pg(pg_t & pg);
|
||||
|
||||
// flushing, recovery and backfill
|
||||
@@ -191,6 +198,7 @@ class osd_t
|
||||
void continue_primary_del(osd_op_t *cur_op);
|
||||
bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
|
||||
void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
|
||||
void free_object_state(pg_t & pg, pg_osd_set_state_t **object_state);
|
||||
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
|
||||
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
|
||||
void handle_primary_bs_subop(osd_op_t *subop);
|
||||
@@ -198,9 +206,12 @@ class osd_t
|
||||
void pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval);
|
||||
void submit_primary_subops(int submit_type, uint64_t op_version, int pg_size, const uint64_t* osd_set, osd_op_t *cur_op);
|
||||
void submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, uint64_t set_size, pg_osd_set_t & loc_set);
|
||||
void submit_primary_sync_subops(osd_op_t *cur_op);
|
||||
void submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_delete, int chunks_to_delete_count);
|
||||
int submit_primary_sync_subops(osd_op_t *cur_op);
|
||||
void submit_primary_stab_subops(osd_op_t *cur_op);
|
||||
|
||||
uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state);
|
||||
|
||||
inline pg_num_t map_to_pg(object_id oid, uint64_t pg_stripe_size)
|
||||
{
|
||||
uint64_t pg_count = pg_counts[INODE_POOL(oid.inode)];
|
||||
@@ -210,7 +221,7 @@ class osd_t
|
||||
}
|
||||
|
||||
public:
|
||||
osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop);
|
||||
osd_t(blockstore_config_t & config, ring_loop_t *ringloop);
|
||||
~osd_t();
|
||||
void force_stop(int exitcode);
|
||||
bool shutdown();
|
||||
|
@@ -4,6 +4,7 @@
|
||||
#include "osd.h"
|
||||
#include "base64.h"
|
||||
#include "etcd_state_client.h"
|
||||
#include "http_client.h"
|
||||
#include "osd_rmw.h"
|
||||
|
||||
// Startup sequence:
|
||||
@@ -37,7 +38,7 @@ void osd_t::init_cluster()
|
||||
.pg_cursize = 0,
|
||||
.pg_size = 3,
|
||||
.pg_minsize = 2,
|
||||
.parity_chunks = 1,
|
||||
.pg_data_size = 2,
|
||||
.pool_id = 1,
|
||||
.pg_num = 1,
|
||||
.target_set = { 1, 2, 3 },
|
||||
@@ -557,7 +558,7 @@ void osd_t::apply_pg_config()
|
||||
}
|
||||
if (currently_taken)
|
||||
{
|
||||
if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING))
|
||||
if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING))
|
||||
{
|
||||
if (pg_it->second.target_set == pg_cfg.target_set)
|
||||
{
|
||||
@@ -606,7 +607,8 @@ void osd_t::apply_pg_config()
|
||||
.pg_cursize = 0,
|
||||
.pg_size = pool_item.second.pg_size,
|
||||
.pg_minsize = pool_item.second.pg_minsize,
|
||||
.parity_chunks = pool_item.second.parity_chunks,
|
||||
.pg_data_size = pg.scheme == POOL_SCHEME_REPLICATED
|
||||
? 1 : pool_item.second.pg_size - pool_item.second.parity_chunks,
|
||||
.pool_id = pool_id,
|
||||
.pg_num = pg_num,
|
||||
.reported_epoch = pg_cfg.epoch,
|
||||
@@ -616,7 +618,7 @@ void osd_t::apply_pg_config()
|
||||
};
|
||||
if (pg.scheme == POOL_SCHEME_JERASURE)
|
||||
{
|
||||
use_jerasure(pg.pg_size, pg.pg_size-pg.parity_chunks, true);
|
||||
use_jerasure(pg.pg_size, pg.pg_data_size, true);
|
||||
}
|
||||
this->pg_state_dirty.insert({ .pool_id = pool_id, .pg_num = pg_num });
|
||||
pg.print_state();
|
||||
@@ -664,7 +666,21 @@ void osd_t::report_pg_states()
|
||||
auto & pg = pg_it->second;
|
||||
reporting_pgs.push_back({ *it, pg.history_changed });
|
||||
std::string state_key_base64 = base64_encode(st_cli.etcd_prefix+"/pg/state/"+std::to_string(pg.pool_id)+"/"+std::to_string(pg.pg_num));
|
||||
if (pg.state == PG_STARTING)
|
||||
bool pg_state_exists = false;
|
||||
if (pg.state != PG_STARTING)
|
||||
{
|
||||
auto pool_it = st_cli.pool_config.find(pg.pool_id);
|
||||
if (pool_it != st_cli.pool_config.end())
|
||||
{
|
||||
auto pg_it = pool_it->second.pg_config.find(pg.pg_num);
|
||||
if (pg_it != pool_it->second.pg_config.end() &&
|
||||
pg_it->second.cur_state != 0)
|
||||
{
|
||||
pg_state_exists = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!pg_state_exists)
|
||||
{
|
||||
// Check that the PG key does not exist
|
||||
// Failed check indicates an unsuccessful PG lock attempt in this case
|
||||
@@ -676,9 +692,7 @@ void osd_t::report_pg_states()
|
||||
}
|
||||
else
|
||||
{
|
||||
// Check that the key is ours
|
||||
// Failed check indicates success for OFFLINE pgs (PG lock is already deleted)
|
||||
// and an unexpected race condition for started pgs (PG lock is held by someone else)
|
||||
// Check that the key is ours if it already exists
|
||||
checks.push_back(json11::Json::object {
|
||||
{ "target", "LEASE" },
|
||||
{ "lease", etcd_lease_id },
|
||||
@@ -800,17 +814,16 @@ void osd_t::report_pg_states()
|
||||
for (auto pp: reporting_pgs)
|
||||
{
|
||||
auto pg_it = this->pgs.find(pp.first);
|
||||
if (pg_it != this->pgs.end())
|
||||
if (pg_it != this->pgs.end() &&
|
||||
pg_it->second.state == PG_OFFLINE &&
|
||||
pg_state_dirty.find(pp.first) == pg_state_dirty.end())
|
||||
{
|
||||
if (pg_it->second.state == PG_OFFLINE)
|
||||
// Forget offline PGs after reporting their state
|
||||
if (pg_it->second.scheme == POOL_SCHEME_JERASURE)
|
||||
{
|
||||
// Remove offline PGs after reporting their state
|
||||
if (pg_it->second.scheme == POOL_SCHEME_JERASURE)
|
||||
{
|
||||
use_jerasure(pg_it->second.pg_size, pg_it->second.pg_size-pg_it->second.parity_chunks, false);
|
||||
}
|
||||
this->pgs.erase(pg_it);
|
||||
use_jerasure(pg_it->second.pg_size, pg_it->second.pg_data_size, false);
|
||||
}
|
||||
this->pgs.erase(pg_it);
|
||||
}
|
||||
}
|
||||
// Push other PG state updates, if any
|
||||
|
@@ -149,10 +149,14 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
|
||||
{
|
||||
continue_primary_write(op);
|
||||
}
|
||||
if (pg.inflight == 0 && (pg.state & PG_STOPPING))
|
||||
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
|
||||
{
|
||||
finish_stop_pg(pg);
|
||||
}
|
||||
else if ((pg.state & PG_REPEERING) && pg.inflight == 0 && !pg.flush_batch)
|
||||
{
|
||||
start_pg_peering(pg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -209,32 +213,39 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
||||
|
||||
bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
|
||||
{
|
||||
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
|
||||
if (!no_recovery)
|
||||
{
|
||||
if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_DEGRADED)) == (PG_ACTIVE | PG_HAS_DEGRADED))
|
||||
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
|
||||
{
|
||||
for (auto obj_it = pg_it->second.degraded_objects.begin(); obj_it != pg_it->second.degraded_objects.end(); obj_it++)
|
||||
if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_DEGRADED)) == (PG_ACTIVE | PG_HAS_DEGRADED))
|
||||
{
|
||||
if (recovery_ops.find(obj_it->first) == recovery_ops.end())
|
||||
for (auto obj_it = pg_it->second.degraded_objects.begin(); obj_it != pg_it->second.degraded_objects.end(); obj_it++)
|
||||
{
|
||||
op.degraded = true;
|
||||
op.oid = obj_it->first;
|
||||
return true;
|
||||
if (recovery_ops.find(obj_it->first) == recovery_ops.end())
|
||||
{
|
||||
op.degraded = true;
|
||||
op.oid = obj_it->first;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
|
||||
if (!no_rebalance)
|
||||
{
|
||||
if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_MISPLACED)) == (PG_ACTIVE | PG_HAS_MISPLACED))
|
||||
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
|
||||
{
|
||||
for (auto obj_it = pg_it->second.misplaced_objects.begin(); obj_it != pg_it->second.misplaced_objects.end(); obj_it++)
|
||||
// Don't try to "recover" misplaced objects if "recovery" would make them degraded
|
||||
if ((pg_it->second.state & (PG_ACTIVE | PG_DEGRADED | PG_HAS_MISPLACED)) == (PG_ACTIVE | PG_HAS_MISPLACED))
|
||||
{
|
||||
if (recovery_ops.find(obj_it->first) == recovery_ops.end())
|
||||
for (auto obj_it = pg_it->second.misplaced_objects.begin(); obj_it != pg_it->second.misplaced_objects.end(); obj_it++)
|
||||
{
|
||||
op.degraded = false;
|
||||
op.oid = obj_it->first;
|
||||
return true;
|
||||
if (recovery_ops.find(obj_it->first) == recovery_ops.end())
|
||||
{
|
||||
op.degraded = false;
|
||||
op.oid = obj_it->first;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -264,7 +275,6 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
|
||||
}
|
||||
op->osd_op->callback = [this, op](osd_op_t *osd_op)
|
||||
{
|
||||
// Don't sync the write, it will be synced by our regular sync coroutine
|
||||
if (osd_op->reply.hdr.retval < 0)
|
||||
{
|
||||
// Error recovering object
|
||||
@@ -286,6 +296,17 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
|
||||
op->osd_op = NULL;
|
||||
recovery_ops.erase(op->oid);
|
||||
delete osd_op;
|
||||
if (immediate_commit != IMMEDIATE_ALL)
|
||||
{
|
||||
recovery_done++;
|
||||
if (recovery_done >= recovery_sync_batch)
|
||||
{
|
||||
// Force sync every <recovery_sync_batch> operations
|
||||
// This is required not to pile up an excessive amount of delete operations
|
||||
autosync();
|
||||
recovery_done = 0;
|
||||
}
|
||||
}
|
||||
continue_recovery();
|
||||
};
|
||||
exec_op(op->osd_op);
|
||||
|
@@ -41,16 +41,13 @@ int main(int narg, char *args[])
|
||||
signal(SIGINT, handle_sigint);
|
||||
signal(SIGTERM, handle_sigint);
|
||||
ring_loop_t *ringloop = new ring_loop_t(512);
|
||||
// FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config
|
||||
blockstore_t *bs = new blockstore_t(config, ringloop);
|
||||
osd = new osd_t(config, bs, ringloop);
|
||||
osd = new osd_t(config, ringloop);
|
||||
while (1)
|
||||
{
|
||||
ringloop->loop();
|
||||
ringloop->wait();
|
||||
}
|
||||
delete osd;
|
||||
delete bs;
|
||||
delete ringloop;
|
||||
return 0;
|
||||
}
|
||||
|
@@ -19,4 +19,5 @@ const char* osd_op_names[] = {
|
||||
"primary_write",
|
||||
"primary_sync",
|
||||
"primary_delete",
|
||||
"ping",
|
||||
};
|
||||
|
@@ -27,7 +27,8 @@
|
||||
#define OSD_OP_WRITE 12
|
||||
#define OSD_OP_SYNC 13
|
||||
#define OSD_OP_DELETE 14
|
||||
#define OSD_OP_MAX 14
|
||||
#define OSD_OP_PING 15
|
||||
#define OSD_OP_MAX 15
|
||||
// Alignment & limit for read/write operations
|
||||
#ifndef MEM_ALIGNMENT
|
||||
#define MEM_ALIGNMENT 512
|
||||
|
@@ -77,10 +77,11 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
|
||||
// Re-peer affected PGs
|
||||
for (auto & p: pgs)
|
||||
{
|
||||
auto & pg = p.second;
|
||||
bool repeer = false;
|
||||
if (p.second.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
|
||||
if (pg.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
|
||||
{
|
||||
for (osd_num_t pg_osd: p.second.all_peers)
|
||||
for (osd_num_t pg_osd: pg.all_peers)
|
||||
{
|
||||
if (pg_osd == peer_osd)
|
||||
{
|
||||
@@ -91,22 +92,29 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
|
||||
if (repeer)
|
||||
{
|
||||
// Repeer this pg
|
||||
printf("[PG %u/%u] Repeer because of OSD %lu\n", p.second.pool_id, p.second.pg_num, peer_osd);
|
||||
start_pg_peering(p.second);
|
||||
printf("[PG %u/%u] Repeer because of OSD %lu\n", pg.pool_id, pg.pg_num, peer_osd);
|
||||
if (!(pg.state & (PG_ACTIVE | PG_REPEERING)) || pg.inflight == 0 && !pg.flush_batch)
|
||||
{
|
||||
start_pg_peering(pg);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Stop accepting new operations, wait for current ones to finish or fail
|
||||
pg.state = pg.state & ~PG_ACTIVE | PG_REPEERING;
|
||||
report_pg_state(pg);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Repeer on each connect/disconnect peer event
|
||||
void osd_t::start_pg_peering(pg_t & pg)
|
||||
// Reset PG state (when peering or stopping)
|
||||
void osd_t::reset_pg(pg_t & pg)
|
||||
{
|
||||
pg.state = PG_PEERING;
|
||||
this->peering_state |= OSD_PEERING_PGS;
|
||||
report_pg_state(pg);
|
||||
// Reset PG state
|
||||
pg.cur_peers.clear();
|
||||
pg.state_dict.clear();
|
||||
copies_to_delete_after_sync_count -= pg.copies_to_delete_after_sync.size();
|
||||
pg.copies_to_delete_after_sync.clear();
|
||||
incomplete_objects -= pg.incomplete_objects.size();
|
||||
misplaced_objects -= pg.misplaced_objects.size();
|
||||
degraded_objects -= pg.degraded_objects.size();
|
||||
@@ -135,6 +143,15 @@ void osd_t::start_pg_peering(pg_t & pg)
|
||||
it++;
|
||||
}
|
||||
dirty_pgs.erase({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
|
||||
}
|
||||
|
||||
// Repeer on each connect/disconnect peer event
|
||||
void osd_t::start_pg_peering(pg_t & pg)
|
||||
{
|
||||
pg.state = PG_PEERING;
|
||||
this->peering_state |= OSD_PEERING_PGS;
|
||||
reset_pg(pg);
|
||||
report_pg_state(pg);
|
||||
// Drop connections of clients who have this PG in dirty_pgs
|
||||
if (immediate_commit != IMMEDIATE_ALL)
|
||||
{
|
||||
@@ -175,13 +192,18 @@ void osd_t::start_pg_peering(pg_t & pg)
|
||||
// (PG history is kept up to the latest active+clean state)
|
||||
for (auto & history_set: pg.target_history)
|
||||
{
|
||||
bool found = false;
|
||||
bool found = true;
|
||||
for (auto history_osd: history_set)
|
||||
{
|
||||
if (history_osd != 0 && c_cli.osd_peer_fds.find(history_osd) != c_cli.osd_peer_fds.end())
|
||||
if (history_osd != 0)
|
||||
{
|
||||
found = true;
|
||||
break;
|
||||
found = false;
|
||||
if (history_osd == this->osd_num ||
|
||||
c_cli.osd_peer_fds.find(history_osd) != c_cli.osd_peer_fds.end())
|
||||
{
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!found)
|
||||
@@ -322,9 +344,10 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
|
||||
{
|
||||
// FIXME: Mark peer as failed and don't reconnect immediately after dropping the connection
|
||||
printf("Failed to sync OSD %lu: %ld (%s), disconnecting peer\n", role_osd, op->reply.hdr.retval, strerror(-op->reply.hdr.retval));
|
||||
int fail_fd = op->peer_fd;
|
||||
ps->list_ops.erase(role_osd);
|
||||
c_cli.stop_client(op->peer_fd);
|
||||
delete op;
|
||||
c_cli.stop_client(fail_fd);
|
||||
return;
|
||||
}
|
||||
delete op;
|
||||
@@ -401,9 +424,10 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
||||
if (op->reply.hdr.retval < 0)
|
||||
{
|
||||
printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, op->reply.hdr.retval);
|
||||
int fail_fd = op->peer_fd;
|
||||
ps->list_ops.erase(role_osd);
|
||||
c_cli.stop_client(op->peer_fd);
|
||||
delete op;
|
||||
c_cli.stop_client(fail_fd);
|
||||
return;
|
||||
}
|
||||
printf(
|
||||
@@ -454,11 +478,11 @@ bool osd_t::stop_pg(pg_t & pg)
|
||||
if (pg.peering_state)
|
||||
{
|
||||
// Stop peering
|
||||
for (auto it = pg.peering_state->list_ops.begin(); it != pg.peering_state->list_ops.end();)
|
||||
for (auto it = pg.peering_state->list_ops.begin(); it != pg.peering_state->list_ops.end(); it++)
|
||||
{
|
||||
discard_list_subop(it->second);
|
||||
}
|
||||
for (auto it = pg.peering_state->list_results.begin(); it != pg.peering_state->list_results.end();)
|
||||
for (auto it = pg.peering_state->list_results.begin(); it != pg.peering_state->list_results.end(); it++)
|
||||
{
|
||||
if (it->second.buf)
|
||||
{
|
||||
@@ -468,14 +492,17 @@ bool osd_t::stop_pg(pg_t & pg)
|
||||
delete pg.peering_state;
|
||||
pg.peering_state = NULL;
|
||||
}
|
||||
if (!(pg.state & PG_ACTIVE))
|
||||
if (pg.state & (PG_STOPPING | PG_OFFLINE))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
pg.state = pg.state & ~PG_ACTIVE | PG_STOPPING;
|
||||
if (pg.inflight == 0 && !pg.flush_batch &&
|
||||
// We must either forget all PG's unstable writes or wait for it to become clean
|
||||
dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) == dirty_pgs.end())
|
||||
if (!(pg.state & (PG_ACTIVE | PG_REPEERING)))
|
||||
{
|
||||
finish_stop_pg(pg);
|
||||
return true;
|
||||
}
|
||||
pg.state = pg.state & ~PG_ACTIVE & ~PG_REPEERING | PG_STOPPING;
|
||||
if (pg.inflight == 0 && !pg.flush_batch)
|
||||
{
|
||||
finish_stop_pg(pg);
|
||||
}
|
||||
@@ -489,6 +516,7 @@ bool osd_t::stop_pg(pg_t & pg)
|
||||
void osd_t::finish_stop_pg(pg_t & pg)
|
||||
{
|
||||
pg.state = PG_OFFLINE;
|
||||
reset_pg(pg);
|
||||
report_pg_state(pg);
|
||||
}
|
||||
|
||||
|
@@ -108,7 +108,7 @@ void pg_obj_state_check_t::start_object()
|
||||
|
||||
void pg_obj_state_check_t::handle_version()
|
||||
{
|
||||
if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_minsize))
|
||||
if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_data_size))
|
||||
{
|
||||
// Version is either stable or recoverable
|
||||
target_ver = last_ver;
|
||||
@@ -171,7 +171,7 @@ void pg_obj_state_check_t::handle_version()
|
||||
|
||||
void pg_obj_state_check_t::finish_object()
|
||||
{
|
||||
if (!target_ver && (n_stable > 0 || n_roles >= pg->pg_minsize))
|
||||
if (!target_ver && (n_stable > 0 || n_roles >= pg->pg_data_size))
|
||||
{
|
||||
// Version is either stable or recoverable
|
||||
target_ver = last_ver;
|
||||
@@ -233,7 +233,7 @@ void pg_obj_state_check_t::finish_object()
|
||||
{
|
||||
return;
|
||||
}
|
||||
if (!replicated && n_roles < pg->pg_minsize)
|
||||
if (!replicated && n_roles < pg->pg_data_size)
|
||||
{
|
||||
if (log_level > 1)
|
||||
{
|
||||
@@ -430,12 +430,13 @@ void pg_t::calc_object_states(int log_level)
|
||||
void pg_t::print_state()
|
||||
{
|
||||
printf(
|
||||
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
|
||||
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
|
||||
(state & PG_STARTING) ? "starting" : "",
|
||||
(state & PG_OFFLINE) ? "offline" : "",
|
||||
(state & PG_PEERING) ? "peering" : "",
|
||||
(state & PG_INCOMPLETE) ? "incomplete" : "",
|
||||
(state & PG_ACTIVE) ? "active" : "",
|
||||
(state & PG_REPEERING) ? "repeering" : "",
|
||||
(state & PG_STOPPING) ? "stopping" : "",
|
||||
(state & PG_DEGRADED) ? " + degraded" : "",
|
||||
(state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "",
|
||||
|
@@ -56,6 +56,13 @@ struct obj_piece_id_t
|
||||
uint64_t osd_num;
|
||||
};
|
||||
|
||||
struct obj_ver_osd_t
|
||||
{
|
||||
uint64_t osd_num;
|
||||
object_id oid;
|
||||
uint64_t version;
|
||||
};
|
||||
|
||||
struct flush_action_t
|
||||
{
|
||||
bool rollback = false, make_stable = false;
|
||||
@@ -75,7 +82,7 @@ struct pg_t
|
||||
{
|
||||
int state = 0;
|
||||
uint64_t scheme = 0;
|
||||
uint64_t pg_cursize = 0, pg_size = 0, pg_minsize = 0, parity_chunks = 0;
|
||||
uint64_t pg_cursize = 0, pg_size = 0, pg_minsize = 0, pg_data_size = 0;
|
||||
pool_id_t pool_id = 0;
|
||||
pg_num_t pg_num = 0;
|
||||
uint64_t clean_count = 0, total_count = 0;
|
||||
@@ -101,6 +108,7 @@ struct pg_t
|
||||
std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict;
|
||||
btree::btree_map<object_id, pg_osd_set_state_t*> incomplete_objects, misplaced_objects, degraded_objects;
|
||||
std::map<obj_piece_id_t, flush_action_t> flush_actions;
|
||||
std::vector<obj_ver_osd_t> copies_to_delete_after_sync;
|
||||
btree::btree_map<object_id, uint64_t> ver_override;
|
||||
pg_peering_state_t *peering_state = NULL;
|
||||
pg_flush_batch_t *flush_batch = NULL;
|
||||
|
@@ -18,7 +18,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
||||
// Our EC scheme stores data in fixed chunks equal to (K*block size)
|
||||
// K = (pg_size-parity_chunks) in case of EC/XOR, or 1 for replicated pools
|
||||
pool_id_t pool_id = INODE_POOL(cur_op->req.rw.inode);
|
||||
// FIXME: We have to access pool config here, so make sure that it doesn't change while its PGs are active...
|
||||
// Note: We read pool config here, so we must NOT change it when PGs are active
|
||||
auto pool_cfg_it = st_cli.pool_config.find(pool_id);
|
||||
if (pool_cfg_it == st_cli.pool_config.end())
|
||||
{
|
||||
@@ -44,8 +44,8 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
||||
return false;
|
||||
}
|
||||
if ((cur_op->req.rw.offset + cur_op->req.rw.len) > (oid.stripe + pg_block_size) ||
|
||||
(cur_op->req.rw.offset % bs_disk_alignment) != 0 ||
|
||||
(cur_op->req.rw.len % bs_disk_alignment) != 0)
|
||||
(cur_op->req.rw.offset % bs_bitmap_granularity) != 0 ||
|
||||
(cur_op->req.rw.len % bs_bitmap_granularity) != 0)
|
||||
{
|
||||
finish_op(cur_op, -EINVAL);
|
||||
return false;
|
||||
@@ -64,7 +64,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
||||
return true;
|
||||
}
|
||||
|
||||
static uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state)
|
||||
uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state)
|
||||
{
|
||||
if (!(pg.state & (PG_HAS_INCOMPLETE | PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
|
||||
{
|
||||
@@ -177,527 +177,6 @@ resume_2:
|
||||
finish_op(cur_op, cur_op->req.rw.len);
|
||||
}
|
||||
|
||||
bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
|
||||
{
|
||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||
// Check if actions are pending for this object
|
||||
auto act_it = pg.flush_actions.lower_bound((obj_piece_id_t){
|
||||
.oid = op_data->oid,
|
||||
.osd_num = 0,
|
||||
});
|
||||
if (act_it != pg.flush_actions.end() &&
|
||||
act_it->first.oid.inode == op_data->oid.inode &&
|
||||
(act_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe)
|
||||
{
|
||||
pg.write_queue.emplace(op_data->oid, cur_op);
|
||||
return false;
|
||||
}
|
||||
// Check if there are other write requests to the same object
|
||||
auto vo_it = pg.write_queue.find(op_data->oid);
|
||||
if (vo_it != pg.write_queue.end())
|
||||
{
|
||||
op_data->st = 1;
|
||||
pg.write_queue.emplace(op_data->oid, cur_op);
|
||||
return false;
|
||||
}
|
||||
pg.write_queue.emplace(op_data->oid, cur_op);
|
||||
return true;
|
||||
}
|
||||
|
||||
void osd_t::continue_primary_write(osd_op_t *cur_op)
|
||||
{
|
||||
if (!cur_op->op_data && !prepare_primary_rw(cur_op))
|
||||
{
|
||||
return;
|
||||
}
|
||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
|
||||
if (op_data->st == 1) goto resume_1;
|
||||
else if (op_data->st == 2) goto resume_2;
|
||||
else if (op_data->st == 3) goto resume_3;
|
||||
else if (op_data->st == 4) goto resume_4;
|
||||
else if (op_data->st == 5) goto resume_5;
|
||||
else if (op_data->st == 6) goto resume_6;
|
||||
else if (op_data->st == 7) goto resume_7;
|
||||
else if (op_data->st == 8) goto resume_8;
|
||||
else if (op_data->st == 9) goto resume_9;
|
||||
else if (op_data->st == 10) goto resume_10;
|
||||
assert(op_data->st == 0);
|
||||
if (!check_write_queue(cur_op, pg))
|
||||
{
|
||||
return;
|
||||
}
|
||||
resume_1:
|
||||
// Determine blocks to read and write
|
||||
// Missing chunks are allowed to be overwritten even in incomplete objects
|
||||
// FIXME: Allow to do small writes to the old (degraded/misplaced) OSD set for lower performance impact
|
||||
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
|
||||
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
// Simplified algorithm
|
||||
op_data->stripes[0].write_start = op_data->stripes[0].req_start;
|
||||
op_data->stripes[0].write_end = op_data->stripes[0].req_end;
|
||||
op_data->stripes[0].write_buf = cur_op->buf;
|
||||
if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
|
||||
op_data->stripes[0].write_end != bs_block_size))
|
||||
{
|
||||
// Object is degraded/misplaced and will be moved to <write_osd_set>
|
||||
op_data->stripes[0].read_start = 0;
|
||||
op_data->stripes[0].read_end = bs_block_size;
|
||||
cur_op->rmw_buf = op_data->stripes[0].read_buf = memalign_or_die(MEM_ALIGNMENT, bs_block_size);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set,
|
||||
pg.pg_size, op_data->pg_data_size, pg.pg_cursize, pg.cur_set.data(), bs_block_size);
|
||||
if (!cur_op->rmw_buf)
|
||||
{
|
||||
// Refuse partial overwrite of an incomplete object
|
||||
cur_op->reply.hdr.retval = -EINVAL;
|
||||
goto continue_others;
|
||||
}
|
||||
}
|
||||
// Read required blocks
|
||||
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, pg.pg_size, op_data->prev_set, cur_op);
|
||||
resume_2:
|
||||
op_data->st = 2;
|
||||
return;
|
||||
resume_3:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||
return;
|
||||
}
|
||||
// Save version override for parallel reads
|
||||
pg.ver_override[op_data->oid] = op_data->fact_ver;
|
||||
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
// Only (possibly) copy new data from the request into the recovery buffer
|
||||
if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
|
||||
op_data->stripes[0].write_end != bs_block_size))
|
||||
{
|
||||
memcpy(
|
||||
op_data->stripes[0].read_buf + op_data->stripes[0].req_start,
|
||||
op_data->stripes[0].write_buf,
|
||||
op_data->stripes[0].req_end - op_data->stripes[0].req_start
|
||||
);
|
||||
op_data->stripes[0].write_buf = op_data->stripes[0].read_buf;
|
||||
op_data->stripes[0].write_start = 0;
|
||||
op_data->stripes[0].write_end = bs_block_size;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Recover missing stripes, calculate parity
|
||||
if (pg.scheme == POOL_SCHEME_XOR)
|
||||
{
|
||||
calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
|
||||
}
|
||||
else if (pg.scheme == POOL_SCHEME_JERASURE)
|
||||
{
|
||||
calc_rmw_parity_jerasure(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
|
||||
}
|
||||
}
|
||||
// Send writes
|
||||
if ((op_data->fact_ver >> (64-PG_EPOCH_BITS)) < pg.epoch)
|
||||
{
|
||||
op_data->target_ver = ((uint64_t)pg.epoch << (64-PG_EPOCH_BITS)) | 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ((op_data->fact_ver & (1ul<<(64-PG_EPOCH_BITS) - 1)) == (1ul<<(64-PG_EPOCH_BITS) - 1))
|
||||
{
|
||||
assert(pg.epoch != ((1ul << PG_EPOCH_BITS)-1));
|
||||
pg.epoch++;
|
||||
}
|
||||
op_data->target_ver = op_data->fact_ver + 1;
|
||||
}
|
||||
if (pg.epoch > pg.reported_epoch)
|
||||
{
|
||||
// Report newer epoch before writing
|
||||
// FIXME: We may report only one PG state here...
|
||||
this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
|
||||
pg.history_changed = true;
|
||||
report_pg_states();
|
||||
resume_10:
|
||||
if (pg.epoch > pg.reported_epoch)
|
||||
{
|
||||
op_data->st = 10;
|
||||
return;
|
||||
}
|
||||
}
|
||||
submit_primary_subops(SUBMIT_WRITE, op_data->target_ver, pg.pg_size, pg.cur_set.data(), cur_op);
|
||||
resume_4:
|
||||
op_data->st = 4;
|
||||
return;
|
||||
resume_5:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||
return;
|
||||
}
|
||||
resume_6:
|
||||
resume_7:
|
||||
if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
|
||||
{
|
||||
// FIXME: Check for immediate_commit == IMMEDIATE_SMALL
|
||||
return;
|
||||
}
|
||||
if (op_data->fact_ver == 1)
|
||||
{
|
||||
// Object is created
|
||||
pg.clean_count++;
|
||||
pg.total_count++;
|
||||
}
|
||||
if (op_data->object_state)
|
||||
{
|
||||
{
|
||||
int recovery_type = op_data->object_state->state & (OBJ_DEGRADED|OBJ_INCOMPLETE) ? 0 : 1;
|
||||
recovery_stat_count[0][recovery_type]++;
|
||||
if (!recovery_stat_count[0][recovery_type])
|
||||
{
|
||||
recovery_stat_count[0][recovery_type]++;
|
||||
recovery_stat_bytes[0][recovery_type] = 0;
|
||||
}
|
||||
for (int role = 0; role < (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size); role++)
|
||||
{
|
||||
recovery_stat_bytes[0][recovery_type] += op_data->stripes[role].write_end - op_data->stripes[role].write_start;
|
||||
}
|
||||
}
|
||||
if (op_data->object_state->state & OBJ_MISPLACED)
|
||||
{
|
||||
// Remove extra chunks
|
||||
submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
|
||||
if (op_data->n_subops > 0)
|
||||
{
|
||||
resume_8:
|
||||
op_data->st = 8;
|
||||
return;
|
||||
resume_9:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Clear object state
|
||||
remove_object_from_state(op_data->oid, op_data->object_state, pg);
|
||||
pg.clean_count++;
|
||||
}
|
||||
cur_op->reply.hdr.retval = cur_op->req.rw.len;
|
||||
continue_others:
|
||||
// Remove version override
|
||||
pg.ver_override.erase(op_data->oid);
|
||||
object_id oid = op_data->oid;
|
||||
finish_op(cur_op, cur_op->reply.hdr.retval);
|
||||
// Continue other write operations to the same object
|
||||
auto next_it = pg.write_queue.find(oid);
|
||||
auto this_it = next_it;
|
||||
if (this_it != pg.write_queue.end() && this_it->second == cur_op)
|
||||
{
|
||||
next_it++;
|
||||
pg.write_queue.erase(this_it);
|
||||
if (next_it != pg.write_queue.end() && next_it->first == oid)
|
||||
{
|
||||
osd_op_t *next_op = next_it->second;
|
||||
continue_primary_write(next_op);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool osd_t::remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
|
||||
{
|
||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||
if (op_data->st == base_state)
|
||||
{
|
||||
goto resume_6;
|
||||
}
|
||||
else if (op_data->st == base_state+1)
|
||||
{
|
||||
goto resume_7;
|
||||
}
|
||||
// FIXME: Check for immediate_commit == IMMEDIATE_SMALL
|
||||
if (immediate_commit == IMMEDIATE_ALL)
|
||||
{
|
||||
if (op_data->scheme != POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
// Send STABILIZE ops immediately
|
||||
op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
|
||||
op_data->unstable_writes = new obj_ver_id[loc_set.size()];
|
||||
{
|
||||
int last_start = 0;
|
||||
for (auto & chunk: loc_set)
|
||||
{
|
||||
op_data->unstable_writes[last_start] = (obj_ver_id){
|
||||
.oid = {
|
||||
.inode = op_data->oid.inode,
|
||||
.stripe = op_data->oid.stripe | chunk.role,
|
||||
},
|
||||
.version = op_data->fact_ver,
|
||||
};
|
||||
op_data->unstable_write_osds->push_back((unstable_osd_num_t){
|
||||
.osd_num = chunk.osd_num,
|
||||
.start = last_start,
|
||||
.len = 1,
|
||||
});
|
||||
last_start++;
|
||||
}
|
||||
}
|
||||
submit_primary_stab_subops(cur_op);
|
||||
resume_6:
|
||||
op_data->st = 6;
|
||||
return false;
|
||||
resume_7:
|
||||
// FIXME: Free those in the destructor?
|
||||
delete op_data->unstable_write_osds;
|
||||
delete[] op_data->unstable_writes;
|
||||
op_data->unstable_writes = NULL;
|
||||
op_data->unstable_write_osds = NULL;
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (op_data->scheme != POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
// Remember version as unstable for EC/XOR
|
||||
for (auto & chunk: loc_set)
|
||||
{
|
||||
this->dirty_osds.insert(chunk.osd_num);
|
||||
this->unstable_writes[(osd_object_id_t){
|
||||
.osd_num = chunk.osd_num,
|
||||
.oid = {
|
||||
.inode = op_data->oid.inode,
|
||||
.stripe = op_data->oid.stripe | chunk.role,
|
||||
},
|
||||
}] = op_data->fact_ver;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Only remember to sync OSDs for replicated pools
|
||||
for (auto & chunk: loc_set)
|
||||
{
|
||||
this->dirty_osds.insert(chunk.osd_num);
|
||||
}
|
||||
}
|
||||
// Remember PG as dirty to drop the connection when PG goes offline
|
||||
// (this is required because of the "lazy sync")
|
||||
auto cl_it = c_cli.clients.find(cur_op->peer_fd);
|
||||
if (cl_it != c_cli.clients.end())
|
||||
{
|
||||
cl_it->second->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
|
||||
}
|
||||
dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Save and clear unstable_writes -> SYNC all -> STABLE all
|
||||
void osd_t::continue_primary_sync(osd_op_t *cur_op)
|
||||
{
|
||||
if (!cur_op->op_data)
|
||||
{
|
||||
cur_op->op_data = (osd_primary_op_data_t*)calloc_or_die(1, sizeof(osd_primary_op_data_t));
|
||||
}
|
||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||
if (op_data->st == 1) goto resume_1;
|
||||
else if (op_data->st == 2) goto resume_2;
|
||||
else if (op_data->st == 3) goto resume_3;
|
||||
else if (op_data->st == 4) goto resume_4;
|
||||
else if (op_data->st == 5) goto resume_5;
|
||||
else if (op_data->st == 6) goto resume_6;
|
||||
assert(op_data->st == 0);
|
||||
if (syncs_in_progress.size() > 0)
|
||||
{
|
||||
// Wait for previous syncs, if any
|
||||
// FIXME: We may try to execute the current one in parallel, like in Blockstore, but I'm not sure if it matters at all
|
||||
syncs_in_progress.push_back(cur_op);
|
||||
op_data->st = 1;
|
||||
resume_1:
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
syncs_in_progress.push_back(cur_op);
|
||||
}
|
||||
resume_2:
|
||||
if (dirty_osds.size() == 0)
|
||||
{
|
||||
// Nothing to sync
|
||||
goto finish;
|
||||
}
|
||||
// Save and clear unstable_writes
|
||||
// In theory it is possible to do in on a per-client basis, but this seems to be an unnecessary complication
|
||||
// It would be cool not to copy these here at all, but someone has to deduplicate them by object IDs anyway
|
||||
if (unstable_writes.size() > 0)
|
||||
{
|
||||
op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
|
||||
op_data->unstable_writes = new obj_ver_id[this->unstable_writes.size()];
|
||||
osd_num_t last_osd = 0;
|
||||
int last_start = 0, last_end = 0;
|
||||
for (auto it = this->unstable_writes.begin(); it != this->unstable_writes.end(); it++)
|
||||
{
|
||||
if (last_osd != it->first.osd_num)
|
||||
{
|
||||
if (last_osd != 0)
|
||||
{
|
||||
op_data->unstable_write_osds->push_back((unstable_osd_num_t){
|
||||
.osd_num = last_osd,
|
||||
.start = last_start,
|
||||
.len = last_end - last_start,
|
||||
});
|
||||
}
|
||||
last_osd = it->first.osd_num;
|
||||
last_start = last_end;
|
||||
}
|
||||
op_data->unstable_writes[last_end] = (obj_ver_id){
|
||||
.oid = it->first.oid,
|
||||
.version = it->second,
|
||||
};
|
||||
last_end++;
|
||||
}
|
||||
if (last_osd != 0)
|
||||
{
|
||||
op_data->unstable_write_osds->push_back((unstable_osd_num_t){
|
||||
.osd_num = last_osd,
|
||||
.start = last_start,
|
||||
.len = last_end - last_start,
|
||||
});
|
||||
}
|
||||
this->unstable_writes.clear();
|
||||
}
|
||||
{
|
||||
void *dirty_buf = malloc_or_die(sizeof(pool_pg_num_t)*dirty_pgs.size() + sizeof(osd_num_t)*dirty_osds.size());
|
||||
op_data->dirty_pgs = (pool_pg_num_t*)dirty_buf;
|
||||
op_data->dirty_osds = (osd_num_t*)(dirty_buf + sizeof(pool_pg_num_t)*dirty_pgs.size());
|
||||
op_data->dirty_pg_count = dirty_pgs.size();
|
||||
op_data->dirty_osd_count = dirty_osds.size();
|
||||
int dpg = 0;
|
||||
for (auto dirty_pg_num: dirty_pgs)
|
||||
{
|
||||
pgs.at(dirty_pg_num).inflight++;
|
||||
op_data->dirty_pgs[dpg++] = dirty_pg_num;
|
||||
}
|
||||
dirty_pgs.clear();
|
||||
dpg = 0;
|
||||
for (auto osd_num: dirty_osds)
|
||||
{
|
||||
op_data->dirty_osds[dpg++] = osd_num;
|
||||
}
|
||||
dirty_osds.clear();
|
||||
}
|
||||
if (immediate_commit != IMMEDIATE_ALL)
|
||||
{
|
||||
// SYNC
|
||||
submit_primary_sync_subops(cur_op);
|
||||
resume_3:
|
||||
op_data->st = 3;
|
||||
return;
|
||||
resume_4:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
goto resume_6;
|
||||
}
|
||||
}
|
||||
if (op_data->unstable_writes)
|
||||
{
|
||||
// Stabilize version sets, if any
|
||||
submit_primary_stab_subops(cur_op);
|
||||
resume_5:
|
||||
op_data->st = 5;
|
||||
return;
|
||||
}
|
||||
resume_6:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
// Return PGs and OSDs back into their dirty sets
|
||||
for (int i = 0; i < op_data->dirty_pg_count; i++)
|
||||
{
|
||||
dirty_pgs.insert(op_data->dirty_pgs[i]);
|
||||
}
|
||||
for (int i = 0; i < op_data->dirty_osd_count; i++)
|
||||
{
|
||||
dirty_osds.insert(op_data->dirty_osds[i]);
|
||||
}
|
||||
if (op_data->unstable_writes)
|
||||
{
|
||||
// Return objects back into the unstable write set
|
||||
for (auto unstable_osd: *(op_data->unstable_write_osds))
|
||||
{
|
||||
for (int i = 0; i < unstable_osd.len; i++)
|
||||
{
|
||||
// Except those from peered PGs
|
||||
auto & w = op_data->unstable_writes[i];
|
||||
pool_pg_num_t wpg = {
|
||||
.pool_id = INODE_POOL(w.oid.inode),
|
||||
.pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
|
||||
};
|
||||
if (pgs.at(wpg).state & PG_ACTIVE)
|
||||
{
|
||||
uint64_t & dest = this->unstable_writes[(osd_object_id_t){
|
||||
.osd_num = unstable_osd.osd_num,
|
||||
.oid = w.oid,
|
||||
}];
|
||||
dest = dest < w.version ? w.version : dest;
|
||||
dirty_pgs.insert(wpg);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < op_data->dirty_pg_count; i++)
|
||||
{
|
||||
auto & pg = pgs.at(op_data->dirty_pgs[i]);
|
||||
pg.inflight--;
|
||||
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
|
||||
{
|
||||
finish_stop_pg(pg);
|
||||
}
|
||||
}
|
||||
// FIXME: Free those in the destructor?
|
||||
free(op_data->dirty_pgs);
|
||||
op_data->dirty_pgs = NULL;
|
||||
op_data->dirty_osds = NULL;
|
||||
if (op_data->unstable_writes)
|
||||
{
|
||||
delete op_data->unstable_write_osds;
|
||||
delete[] op_data->unstable_writes;
|
||||
op_data->unstable_writes = NULL;
|
||||
op_data->unstable_write_osds = NULL;
|
||||
}
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||
}
|
||||
else
|
||||
{
|
||||
finish:
|
||||
if (cur_op->peer_fd)
|
||||
{
|
||||
auto it = c_cli.clients.find(cur_op->peer_fd);
|
||||
if (it != c_cli.clients.end())
|
||||
it->second->dirty_pgs.clear();
|
||||
}
|
||||
finish_op(cur_op, 0);
|
||||
}
|
||||
assert(syncs_in_progress.front() == cur_op);
|
||||
syncs_in_progress.pop_front();
|
||||
if (syncs_in_progress.size() > 0)
|
||||
{
|
||||
cur_op = syncs_in_progress.front();
|
||||
op_data = cur_op->op_data;
|
||||
op_data->st++;
|
||||
goto resume_2;
|
||||
}
|
||||
}
|
||||
|
||||
// Decrement pg_osd_set_state_t's object_count and change PG state accordingly
|
||||
void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t & pg)
|
||||
{
|
||||
@@ -736,10 +215,14 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object
|
||||
{
|
||||
throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
|
||||
}
|
||||
object_state->object_count--;
|
||||
if (!object_state->object_count)
|
||||
}
|
||||
|
||||
void osd_t::free_object_state(pg_t & pg, pg_osd_set_state_t **object_state)
|
||||
{
|
||||
if (*object_state && !(--(*object_state)->object_count))
|
||||
{
|
||||
pg.state_dict.erase(object_state->osd_set);
|
||||
pg.state_dict.erase((*object_state)->osd_set);
|
||||
*object_state = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -805,22 +288,21 @@ resume_5:
|
||||
else
|
||||
{
|
||||
remove_object_from_state(op_data->oid, op_data->object_state, pg);
|
||||
free_object_state(pg, &op_data->object_state);
|
||||
}
|
||||
pg.total_count--;
|
||||
object_id oid = op_data->oid;
|
||||
finish_op(cur_op, cur_op->req.rw.len);
|
||||
// Continue other write operations to the same object
|
||||
auto next_it = pg.write_queue.find(oid);
|
||||
auto this_it = next_it;
|
||||
if (this_it != pg.write_queue.end() && this_it->second == cur_op)
|
||||
osd_op_t *next_op = NULL;
|
||||
auto next_it = pg.write_queue.find(op_data->oid);
|
||||
if (next_it != pg.write_queue.end() && next_it->second == cur_op)
|
||||
{
|
||||
next_it++;
|
||||
pg.write_queue.erase(this_it);
|
||||
if (next_it != pg.write_queue.end() &&
|
||||
next_it->first == oid)
|
||||
{
|
||||
osd_op_t *next_op = next_it->second;
|
||||
continue_primary_write(next_op);
|
||||
}
|
||||
pg.write_queue.erase(next_it++);
|
||||
if (next_it != pg.write_queue.end() && next_it->first == op_data->oid)
|
||||
next_op = next_it->second;
|
||||
}
|
||||
finish_op(cur_op, cur_op->req.rw.len);
|
||||
if (next_op)
|
||||
{
|
||||
// Continue next write to the same object
|
||||
continue_primary_write(next_op);
|
||||
}
|
||||
}
|
||||
|
@@ -38,4 +38,8 @@ struct osd_primary_op_data_t
|
||||
osd_num_t *dirty_osds = NULL;
|
||||
int dirty_osd_count = 0;
|
||||
obj_ver_id *unstable_writes = NULL;
|
||||
obj_ver_osd_t *copies_to_delete = NULL;
|
||||
int copies_to_delete_count = 0;
|
||||
};
|
||||
|
||||
bool contains_osd(osd_num_t *osd_set, uint64_t size, osd_num_t osd_num);
|
||||
|
@@ -47,6 +47,10 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
|
||||
{
|
||||
finish_stop_pg(pg);
|
||||
}
|
||||
else if ((pg.state & PG_REPEERING) && pg.inflight == 0 && !pg.flush_batch)
|
||||
{
|
||||
start_pg_peering(pg);
|
||||
}
|
||||
}
|
||||
assert(!cur_op->op_data->subops);
|
||||
assert(!cur_op->op_data->unstable_write_osds);
|
||||
@@ -192,14 +196,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s
|
||||
}
|
||||
subops[i].callback = [cur_op, this](osd_op_t *subop)
|
||||
{
|
||||
int fail_fd = subop->req.hdr.opcode == OSD_OP_SEC_WRITE &&
|
||||
subop->reply.hdr.retval != subop->req.sec_rw.len ? subop->peer_fd : -1;
|
||||
handle_primary_subop(subop, cur_op);
|
||||
if (fail_fd >= 0)
|
||||
{
|
||||
// write operation failed, drop the connection
|
||||
c_cli.stop_client(fail_fd);
|
||||
}
|
||||
};
|
||||
c_cli.outbox_push(&subops[i]);
|
||||
}
|
||||
@@ -245,6 +242,7 @@ void osd_t::handle_primary_bs_subop(osd_op_t *subop)
|
||||
}
|
||||
delete bs_op;
|
||||
subop->bs_op = NULL;
|
||||
subop->peer_fd = -1;
|
||||
handle_primary_subop(subop, cur_op);
|
||||
}
|
||||
|
||||
@@ -286,6 +284,11 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
||||
op_data->epipe++;
|
||||
}
|
||||
op_data->errors++;
|
||||
if (subop->peer_fd >= 0)
|
||||
{
|
||||
// Drop connection on any error
|
||||
c_cli.stop_client(subop->peer_fd);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -353,7 +356,7 @@ void osd_t::cancel_primary_write(osd_op_t *cur_op)
|
||||
}
|
||||
}
|
||||
|
||||
static bool contains_osd(osd_num_t *osd_set, uint64_t size, osd_num_t osd_num)
|
||||
bool contains_osd(osd_num_t *osd_set, uint64_t size, osd_num_t osd_num)
|
||||
{
|
||||
for (uint64_t i = 0; i < size; i++)
|
||||
{
|
||||
@@ -369,83 +372,81 @@ void osd_t::submit_primary_del_subops(osd_op_t *cur_op, osd_num_t *cur_set, uint
|
||||
{
|
||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||
bool rep = op_data->scheme == POOL_SCHEME_REPLICATED;
|
||||
int extra_chunks = 0;
|
||||
// ordered comparison for EC/XOR, unordered for replicated pools
|
||||
obj_ver_osd_t extra_chunks[loc_set.size()];
|
||||
int chunks_to_del = 0;
|
||||
for (auto & chunk: loc_set)
|
||||
{
|
||||
if (!cur_set || (rep ? !contains_osd(cur_set, set_size, chunk.osd_num) : chunk.osd_num != cur_set[chunk.role]))
|
||||
// ordered comparison for EC/XOR, unordered for replicated pools
|
||||
if (!cur_set || (rep
|
||||
? !contains_osd(cur_set, set_size, chunk.osd_num)
|
||||
: (chunk.osd_num != cur_set[chunk.role])))
|
||||
{
|
||||
extra_chunks++;
|
||||
extra_chunks[chunks_to_del++] = (obj_ver_osd_t){
|
||||
.osd_num = chunk.osd_num,
|
||||
.oid = {
|
||||
.inode = op_data->oid.inode,
|
||||
.stripe = op_data->oid.stripe | (rep ? 0 : chunk.role),
|
||||
},
|
||||
// Same version as write
|
||||
.version = op_data->fact_ver,
|
||||
};
|
||||
}
|
||||
}
|
||||
op_data->n_subops = extra_chunks;
|
||||
submit_primary_del_batch(cur_op, extra_chunks, chunks_to_del);
|
||||
}
|
||||
|
||||
void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_delete, int chunks_to_delete_count)
|
||||
{
|
||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||
op_data->n_subops = chunks_to_delete_count;
|
||||
op_data->done = op_data->errors = 0;
|
||||
if (!extra_chunks)
|
||||
if (!op_data->n_subops)
|
||||
{
|
||||
return;
|
||||
}
|
||||
osd_op_t *subops = new osd_op_t[extra_chunks];
|
||||
osd_op_t *subops = new osd_op_t[chunks_to_delete_count];
|
||||
op_data->subops = subops;
|
||||
int i = 0;
|
||||
for (auto & chunk: loc_set)
|
||||
for (int i = 0; i < chunks_to_delete_count; i++)
|
||||
{
|
||||
if (!cur_set || (rep ? !contains_osd(cur_set, set_size, chunk.osd_num) : chunk.osd_num != cur_set[chunk.role]))
|
||||
auto & chunk = chunks_to_delete[i];
|
||||
if (chunk.osd_num == this->osd_num)
|
||||
{
|
||||
int stripe_num = op_data->scheme == POOL_SCHEME_REPLICATED ? 0 : chunk.role;
|
||||
if (chunk.osd_num == this->osd_num)
|
||||
{
|
||||
clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
|
||||
subops[i].op_type = (uint64_t)cur_op;
|
||||
subops[i].bs_op = new blockstore_op_t({
|
||||
.opcode = BS_OP_DELETE,
|
||||
.callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
|
||||
{
|
||||
handle_primary_bs_subop(subop);
|
||||
},
|
||||
.oid = {
|
||||
.inode = op_data->oid.inode,
|
||||
.stripe = op_data->oid.stripe | stripe_num,
|
||||
},
|
||||
// Same version as write
|
||||
.version = op_data->fact_ver,
|
||||
});
|
||||
bs->enqueue_op(subops[i].bs_op);
|
||||
}
|
||||
else
|
||||
{
|
||||
subops[i].op_type = OSD_OP_OUT;
|
||||
subops[i].peer_fd = c_cli.osd_peer_fds.at(chunk.osd_num);
|
||||
subops[i].req.sec_del = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = c_cli.next_subop_id++,
|
||||
.opcode = OSD_OP_SEC_DELETE,
|
||||
},
|
||||
.oid = {
|
||||
.inode = op_data->oid.inode,
|
||||
.stripe = op_data->oid.stripe | stripe_num,
|
||||
},
|
||||
// Same version as write
|
||||
.version = op_data->fact_ver,
|
||||
};
|
||||
subops[i].callback = [cur_op, this](osd_op_t *subop)
|
||||
clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
|
||||
subops[i].op_type = (uint64_t)cur_op;
|
||||
subops[i].bs_op = new blockstore_op_t({
|
||||
.opcode = BS_OP_DELETE,
|
||||
.callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
|
||||
{
|
||||
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
|
||||
handle_primary_subop(subop, cur_op);
|
||||
if (fail_fd >= 0)
|
||||
{
|
||||
// delete operation failed, drop the connection
|
||||
c_cli.stop_client(fail_fd);
|
||||
}
|
||||
};
|
||||
c_cli.outbox_push(&subops[i]);
|
||||
}
|
||||
i++;
|
||||
handle_primary_bs_subop(subop);
|
||||
},
|
||||
.oid = chunk.oid,
|
||||
.version = chunk.version,
|
||||
});
|
||||
bs->enqueue_op(subops[i].bs_op);
|
||||
}
|
||||
else
|
||||
{
|
||||
subops[i].op_type = OSD_OP_OUT;
|
||||
subops[i].peer_fd = c_cli.osd_peer_fds.at(chunk.osd_num);
|
||||
subops[i].req = (osd_any_op_t){ .sec_del = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = c_cli.next_subop_id++,
|
||||
.opcode = OSD_OP_SEC_DELETE,
|
||||
},
|
||||
.oid = chunk.oid,
|
||||
.version = chunk.version,
|
||||
} };
|
||||
subops[i].callback = [cur_op, this](osd_op_t *subop)
|
||||
{
|
||||
handle_primary_subop(subop, cur_op);
|
||||
};
|
||||
c_cli.outbox_push(&subops[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
|
||||
int osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
|
||||
{
|
||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||
int n_osds = op_data->dirty_osd_count;
|
||||
@@ -453,6 +454,7 @@ void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
|
||||
op_data->done = op_data->errors = 0;
|
||||
op_data->n_subops = n_osds;
|
||||
op_data->subops = subops;
|
||||
std::map<uint64_t, int>::iterator peer_it;
|
||||
for (int i = 0; i < n_osds; i++)
|
||||
{
|
||||
osd_num_t sync_osd = op_data->dirty_osds[i];
|
||||
@@ -469,30 +471,35 @@ void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
|
||||
});
|
||||
bs->enqueue_op(subops[i].bs_op);
|
||||
}
|
||||
else
|
||||
else if ((peer_it = c_cli.osd_peer_fds.find(sync_osd)) != c_cli.osd_peer_fds.end())
|
||||
{
|
||||
subops[i].op_type = OSD_OP_OUT;
|
||||
subops[i].peer_fd = c_cli.osd_peer_fds.at(sync_osd);
|
||||
subops[i].req.sec_sync = {
|
||||
subops[i].peer_fd = peer_it->second;
|
||||
subops[i].req = (osd_any_op_t){ .sec_sync = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = c_cli.next_subop_id++,
|
||||
.opcode = OSD_OP_SEC_SYNC,
|
||||
},
|
||||
};
|
||||
} };
|
||||
subops[i].callback = [cur_op, this](osd_op_t *subop)
|
||||
{
|
||||
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
|
||||
handle_primary_subop(subop, cur_op);
|
||||
if (fail_fd >= 0)
|
||||
{
|
||||
// sync operation failed, drop the connection
|
||||
c_cli.stop_client(fail_fd);
|
||||
}
|
||||
};
|
||||
c_cli.outbox_push(&subops[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
op_data->done++;
|
||||
}
|
||||
}
|
||||
if (op_data->done >= op_data->n_subops)
|
||||
{
|
||||
delete[] op_data->subops;
|
||||
op_data->subops = NULL;
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
||||
@@ -525,24 +532,18 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
||||
{
|
||||
subops[i].op_type = OSD_OP_OUT;
|
||||
subops[i].peer_fd = c_cli.osd_peer_fds.at(stab_osd.osd_num);
|
||||
subops[i].req.sec_stab = {
|
||||
subops[i].req = (osd_any_op_t){ .sec_stab = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = c_cli.next_subop_id++,
|
||||
.opcode = OSD_OP_SEC_STABILIZE,
|
||||
},
|
||||
.len = (uint64_t)(stab_osd.len * sizeof(obj_ver_id)),
|
||||
};
|
||||
} };
|
||||
subops[i].iov.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id));
|
||||
subops[i].callback = [cur_op, this](osd_op_t *subop)
|
||||
{
|
||||
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
|
||||
handle_primary_subop(subop, cur_op);
|
||||
if (fail_fd >= 0)
|
||||
{
|
||||
// sync operation failed, drop the connection
|
||||
c_cli.stop_client(fail_fd);
|
||||
}
|
||||
};
|
||||
c_cli.outbox_push(&subops[i]);
|
||||
}
|
||||
@@ -560,7 +561,7 @@ void osd_t::pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid,
|
||||
return;
|
||||
}
|
||||
std::vector<osd_op_t*> cancel_ops;
|
||||
while (it != pg.write_queue.end())
|
||||
while (it != pg.write_queue.end() && it->first == oid)
|
||||
{
|
||||
cancel_ops.push_back(it->second);
|
||||
it++;
|
||||
|
265
src/osd_primary_sync.cpp
Normal file
265
src/osd_primary_sync.cpp
Normal file
@@ -0,0 +1,265 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "osd_primary.h"
|
||||
|
||||
// Save and clear unstable_writes -> SYNC all -> STABLE all
|
||||
void osd_t::continue_primary_sync(osd_op_t *cur_op)
|
||||
{
|
||||
if (!cur_op->op_data)
|
||||
{
|
||||
cur_op->op_data = (osd_primary_op_data_t*)calloc_or_die(1, sizeof(osd_primary_op_data_t));
|
||||
}
|
||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||
if (op_data->st == 1) goto resume_1;
|
||||
else if (op_data->st == 2) goto resume_2;
|
||||
else if (op_data->st == 3) goto resume_3;
|
||||
else if (op_data->st == 4) goto resume_4;
|
||||
else if (op_data->st == 5) goto resume_5;
|
||||
else if (op_data->st == 6) goto resume_6;
|
||||
else if (op_data->st == 7) goto resume_7;
|
||||
else if (op_data->st == 8) goto resume_8;
|
||||
assert(op_data->st == 0);
|
||||
if (syncs_in_progress.size() > 0)
|
||||
{
|
||||
// Wait for previous syncs, if any
|
||||
// FIXME: We may try to execute the current one in parallel, like in Blockstore, but I'm not sure if it matters at all
|
||||
syncs_in_progress.push_back(cur_op);
|
||||
op_data->st = 1;
|
||||
resume_1:
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
syncs_in_progress.push_back(cur_op);
|
||||
}
|
||||
resume_2:
|
||||
if (dirty_osds.size() == 0)
|
||||
{
|
||||
// Nothing to sync
|
||||
goto finish;
|
||||
}
|
||||
// Save and clear unstable_writes
|
||||
// In theory it is possible to do in on a per-client basis, but this seems to be an unnecessary complication
|
||||
// It would be cool not to copy these here at all, but someone has to deduplicate them by object IDs anyway
|
||||
if (unstable_writes.size() > 0)
|
||||
{
|
||||
op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
|
||||
op_data->unstable_writes = new obj_ver_id[this->unstable_writes.size()];
|
||||
osd_num_t last_osd = 0;
|
||||
int last_start = 0, last_end = 0;
|
||||
for (auto it = this->unstable_writes.begin(); it != this->unstable_writes.end(); it++)
|
||||
{
|
||||
if (last_osd != it->first.osd_num)
|
||||
{
|
||||
if (last_osd != 0)
|
||||
{
|
||||
op_data->unstable_write_osds->push_back((unstable_osd_num_t){
|
||||
.osd_num = last_osd,
|
||||
.start = last_start,
|
||||
.len = last_end - last_start,
|
||||
});
|
||||
}
|
||||
last_osd = it->first.osd_num;
|
||||
last_start = last_end;
|
||||
}
|
||||
op_data->unstable_writes[last_end] = (obj_ver_id){
|
||||
.oid = it->first.oid,
|
||||
.version = it->second,
|
||||
};
|
||||
last_end++;
|
||||
}
|
||||
if (last_osd != 0)
|
||||
{
|
||||
op_data->unstable_write_osds->push_back((unstable_osd_num_t){
|
||||
.osd_num = last_osd,
|
||||
.start = last_start,
|
||||
.len = last_end - last_start,
|
||||
});
|
||||
}
|
||||
this->unstable_writes.clear();
|
||||
}
|
||||
{
|
||||
void *dirty_buf = malloc_or_die(
|
||||
sizeof(pool_pg_num_t)*dirty_pgs.size() +
|
||||
sizeof(osd_num_t)*dirty_osds.size() +
|
||||
sizeof(obj_ver_osd_t)*this->copies_to_delete_after_sync_count
|
||||
);
|
||||
op_data->dirty_pgs = (pool_pg_num_t*)dirty_buf;
|
||||
op_data->dirty_osds = (osd_num_t*)(dirty_buf + sizeof(pool_pg_num_t)*dirty_pgs.size());
|
||||
op_data->dirty_pg_count = dirty_pgs.size();
|
||||
op_data->dirty_osd_count = dirty_osds.size();
|
||||
if (this->copies_to_delete_after_sync_count)
|
||||
{
|
||||
op_data->copies_to_delete_count = 0;
|
||||
op_data->copies_to_delete = (obj_ver_osd_t*)(op_data->dirty_osds + op_data->dirty_osd_count);
|
||||
for (auto dirty_pg_num: dirty_pgs)
|
||||
{
|
||||
auto & pg = pgs.at(dirty_pg_num);
|
||||
assert(pg.copies_to_delete_after_sync.size() <= this->copies_to_delete_after_sync_count);
|
||||
memcpy(
|
||||
op_data->copies_to_delete + op_data->copies_to_delete_count,
|
||||
pg.copies_to_delete_after_sync.data(),
|
||||
sizeof(obj_ver_osd_t)*pg.copies_to_delete_after_sync.size()
|
||||
);
|
||||
op_data->copies_to_delete_count += pg.copies_to_delete_after_sync.size();
|
||||
this->copies_to_delete_after_sync_count -= pg.copies_to_delete_after_sync.size();
|
||||
pg.copies_to_delete_after_sync.clear();
|
||||
}
|
||||
assert(this->copies_to_delete_after_sync_count == 0);
|
||||
}
|
||||
int dpg = 0;
|
||||
for (auto dirty_pg_num: dirty_pgs)
|
||||
{
|
||||
pgs.at(dirty_pg_num).inflight++;
|
||||
op_data->dirty_pgs[dpg++] = dirty_pg_num;
|
||||
}
|
||||
dirty_pgs.clear();
|
||||
dpg = 0;
|
||||
for (auto osd_num: dirty_osds)
|
||||
{
|
||||
op_data->dirty_osds[dpg++] = osd_num;
|
||||
}
|
||||
dirty_osds.clear();
|
||||
}
|
||||
if (immediate_commit != IMMEDIATE_ALL)
|
||||
{
|
||||
// SYNC
|
||||
if (!submit_primary_sync_subops(cur_op))
|
||||
{
|
||||
goto resume_4;
|
||||
}
|
||||
resume_3:
|
||||
op_data->st = 3;
|
||||
return;
|
||||
resume_4:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
goto resume_6;
|
||||
}
|
||||
}
|
||||
if (op_data->unstable_writes)
|
||||
{
|
||||
// Stabilize version sets, if any
|
||||
submit_primary_stab_subops(cur_op);
|
||||
resume_5:
|
||||
op_data->st = 5;
|
||||
return;
|
||||
}
|
||||
resume_6:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
// Return PGs and OSDs back into their dirty sets
|
||||
for (int i = 0; i < op_data->dirty_pg_count; i++)
|
||||
{
|
||||
dirty_pgs.insert(op_data->dirty_pgs[i]);
|
||||
}
|
||||
for (int i = 0; i < op_data->dirty_osd_count; i++)
|
||||
{
|
||||
dirty_osds.insert(op_data->dirty_osds[i]);
|
||||
}
|
||||
if (op_data->unstable_writes)
|
||||
{
|
||||
// Return objects back into the unstable write set
|
||||
for (auto unstable_osd: *(op_data->unstable_write_osds))
|
||||
{
|
||||
for (int i = 0; i < unstable_osd.len; i++)
|
||||
{
|
||||
// Except those from peered PGs
|
||||
auto & w = op_data->unstable_writes[i];
|
||||
pool_pg_num_t wpg = {
|
||||
.pool_id = INODE_POOL(w.oid.inode),
|
||||
.pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
|
||||
};
|
||||
if (pgs.at(wpg).state & PG_ACTIVE)
|
||||
{
|
||||
uint64_t & dest = this->unstable_writes[(osd_object_id_t){
|
||||
.osd_num = unstable_osd.osd_num,
|
||||
.oid = w.oid,
|
||||
}];
|
||||
dest = dest < w.version ? w.version : dest;
|
||||
dirty_pgs.insert(wpg);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (op_data->copies_to_delete)
|
||||
{
|
||||
// Return 'copies to delete' back into respective PGs
|
||||
for (int i = 0; i < op_data->copies_to_delete_count; i++)
|
||||
{
|
||||
auto & w = op_data->copies_to_delete[i];
|
||||
auto & pg = pgs.at((pool_pg_num_t){
|
||||
.pool_id = INODE_POOL(w.oid.inode),
|
||||
.pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
|
||||
});
|
||||
if (pg.state & PG_ACTIVE)
|
||||
{
|
||||
pg.copies_to_delete_after_sync.push_back(w);
|
||||
copies_to_delete_after_sync_count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (op_data->copies_to_delete)
|
||||
{
|
||||
// Actually delete copies which we wanted to delete
|
||||
submit_primary_del_batch(cur_op, op_data->copies_to_delete, op_data->copies_to_delete_count);
|
||||
resume_7:
|
||||
op_data->st = 7;
|
||||
return;
|
||||
resume_8:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
goto resume_6;
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < op_data->dirty_pg_count; i++)
|
||||
{
|
||||
auto & pg = pgs.at(op_data->dirty_pgs[i]);
|
||||
pg.inflight--;
|
||||
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
|
||||
{
|
||||
finish_stop_pg(pg);
|
||||
}
|
||||
else if ((pg.state & PG_REPEERING) && pg.inflight == 0 && !pg.flush_batch)
|
||||
{
|
||||
start_pg_peering(pg);
|
||||
}
|
||||
}
|
||||
// FIXME: Free those in the destructor?
|
||||
free(op_data->dirty_pgs);
|
||||
op_data->dirty_pgs = NULL;
|
||||
op_data->dirty_osds = NULL;
|
||||
if (op_data->unstable_writes)
|
||||
{
|
||||
delete op_data->unstable_write_osds;
|
||||
delete[] op_data->unstable_writes;
|
||||
op_data->unstable_writes = NULL;
|
||||
op_data->unstable_write_osds = NULL;
|
||||
}
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||
}
|
||||
else
|
||||
{
|
||||
finish:
|
||||
if (cur_op->peer_fd)
|
||||
{
|
||||
auto it = c_cli.clients.find(cur_op->peer_fd);
|
||||
if (it != c_cli.clients.end())
|
||||
it->second->dirty_pgs.clear();
|
||||
}
|
||||
finish_op(cur_op, 0);
|
||||
}
|
||||
assert(syncs_in_progress.front() == cur_op);
|
||||
syncs_in_progress.pop_front();
|
||||
if (syncs_in_progress.size() > 0)
|
||||
{
|
||||
cur_op = syncs_in_progress.front();
|
||||
op_data = cur_op->op_data;
|
||||
op_data->st++;
|
||||
goto resume_2;
|
||||
}
|
||||
}
|
378
src/osd_primary_write.cpp
Normal file
378
src/osd_primary_write.cpp
Normal file
@@ -0,0 +1,378 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "osd_primary.h"
|
||||
#include "allocator.h"
|
||||
|
||||
// Put cur_op into the per-object write queue of <pg> and tell the caller whether it may run now.
// Returns true only when no flush action is pending for the object and no other
// write to the same object is already queued. The operation is queued in every case.
bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
{
    osd_primary_op_data_t *op_data = cur_op->op_data;
    const auto & target = op_data->oid;
    // Is a flush action pending for any piece of this object?
    // Look up the first action at or after (oid, osd 0) and compare it with the object,
    // ignoring the low stripe bits covered by STRIPE_MASK
    auto flush_it = pg.flush_actions.lower_bound((obj_piece_id_t){
        .oid = target,
        .osd_num = 0,
    });
    const bool flush_pending = flush_it != pg.flush_actions.end()
        && flush_it->first.oid.inode == target.inode
        && (flush_it->first.oid.stripe & ~STRIPE_MASK) == target.stripe;
    bool runnable = false;
    if (!flush_pending)
    {
        if (pg.write_queue.find(target) == pg.write_queue.end())
        {
            // Nobody else is writing to this object
            runnable = true;
        }
        else
        {
            // Another write to the same object is queued: park this one at state 1
            op_data->st = 1;
        }
    }
    pg.write_queue.emplace(target, cur_op);
    return runnable;
}
|
||||
|
||||
// Primary WRITE handler, implemented as a resumable state machine.
// op_data->st selects the resume_N label to continue from on re-entry.
//
// Stages: queue behind other writes to the same object -> read the blocks needed for
// read-modify-write -> compute new data/parity -> write -> remember or stabilize the new
// version -> remove extra (misplaced) object copies -> reply and start the next queued write.
//
// cur_op - the write operation to start or continue.
void osd_t::continue_primary_write(osd_op_t *cur_op)
{
    if (!cur_op->op_data && !prepare_primary_rw(cur_op))
    {
        return;
    }
    osd_primary_op_data_t *op_data = cur_op->op_data;
    auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
    if (op_data->st == 1) goto resume_1;
    else if (op_data->st == 2) goto resume_2;
    else if (op_data->st == 3) goto resume_3;
    else if (op_data->st == 4) goto resume_4;
    else if (op_data->st == 5) goto resume_5;
    else if (op_data->st == 6) goto resume_6;
    else if (op_data->st == 7) goto resume_7;
    else if (op_data->st == 8) goto resume_8;
    else if (op_data->st == 9) goto resume_9;
    else if (op_data->st == 10) goto resume_10;
    assert(op_data->st == 0);
    if (!check_write_queue(cur_op, pg))
    {
        return;
    }
resume_1:
    // Determine blocks to read and write
    // Missing chunks are allowed to be overwritten even in incomplete objects
    // FIXME: Allow to do small writes to the old (degraded/misplaced) OSD set for lower performance impact
    op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
    if (op_data->scheme == POOL_SCHEME_REPLICATED)
    {
        // Simplified algorithm
        op_data->stripes[0].write_start = op_data->stripes[0].req_start;
        op_data->stripes[0].write_end = op_data->stripes[0].req_end;
        op_data->stripes[0].write_buf = cur_op->buf;
        if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
            op_data->stripes[0].write_end != bs_block_size))
        {
            // Object is degraded/misplaced and will be moved to <write_osd_set>
            // A partial write then needs the whole old block to be read first
            op_data->stripes[0].read_start = 0;
            op_data->stripes[0].read_end = bs_block_size;
            cur_op->rmw_buf = op_data->stripes[0].read_buf = memalign_or_die(MEM_ALIGNMENT, bs_block_size);
        }
    }
    else
    {
        cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set,
            pg.pg_size, op_data->pg_data_size, pg.pg_cursize, pg.cur_set.data(), bs_block_size);
        if (!cur_op->rmw_buf)
        {
            // Refuse partial overwrite of an incomplete object
            cur_op->reply.hdr.retval = -EINVAL;
            goto continue_others;
        }
    }
    // Read required blocks
    submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, pg.pg_size, op_data->prev_set, cur_op);
resume_2:
    op_data->st = 2;
    return;
resume_3:
    if (op_data->errors > 0)
    {
        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
        return;
    }
    if (op_data->scheme == POOL_SCHEME_REPLICATED)
    {
        // Only (possibly) copy new data from the request into the recovery buffer
        if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
            op_data->stripes[0].write_end != bs_block_size))
        {
            memcpy(
                op_data->stripes[0].read_buf + op_data->stripes[0].req_start,
                op_data->stripes[0].write_buf,
                op_data->stripes[0].req_end - op_data->stripes[0].req_start
            );
            // From now on the whole block is written from the merged buffer
            op_data->stripes[0].write_buf = op_data->stripes[0].read_buf;
            op_data->stripes[0].write_start = 0;
            op_data->stripes[0].write_end = bs_block_size;
        }
    }
    else
    {
        // For EC/XOR pools, save version override to make it impossible
        // for parallel reads to read different versions of data and parity
        pg.ver_override[op_data->oid] = op_data->fact_ver;
        // Recover missing stripes, calculate parity
        if (pg.scheme == POOL_SCHEME_XOR)
        {
            calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
        }
        else if (pg.scheme == POOL_SCHEME_JERASURE)
        {
            calc_rmw_parity_jerasure(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
        }
    }
    // Send writes
    // The version is split into the PG epoch (upper PG_EPOCH_BITS bits) and a counter (the remaining lower bits)
    if ((op_data->fact_ver >> (64-PG_EPOCH_BITS)) < pg.epoch)
    {
        // First write of this object in the current epoch
        op_data->target_ver = ((uint64_t)pg.epoch << (64-PG_EPOCH_BITS)) | 1;
    }
    else
    {
        // Mask of all counter bits. The parentheses around the shift are required:
        // binary '-' binds tighter than '<<', so "1 << N - 1" would be a single bit, not a mask
        const uint64_t counter_mask = ((uint64_t)1 << (64-PG_EPOCH_BITS)) - 1;
        if ((op_data->fact_ver & counter_mask) == counter_mask)
        {
            // The counter is exhausted: fact_ver + 1 carries into the epoch bits,
            // so the PG epoch has to be advanced along with it
            assert(pg.epoch != ((1ul << PG_EPOCH_BITS)-1));
            pg.epoch++;
        }
        op_data->target_ver = op_data->fact_ver + 1;
    }
    if (pg.epoch > pg.reported_epoch)
    {
        // Report newer epoch before writing
        // FIXME: We may report only one PG state here...
        this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
        pg.history_changed = true;
        report_pg_states();
resume_10:
        if (pg.epoch > pg.reported_epoch)
        {
            // Still not reported: stay in state 10 until re-entered with the epoch reported
            op_data->st = 10;
            return;
        }
    }
    submit_primary_subops(SUBMIT_WRITE, op_data->target_ver, pg.pg_size, pg.cur_set.data(), cur_op);
resume_4:
    op_data->st = 4;
    return;
resume_5:
    if (op_data->scheme != POOL_SCHEME_REPLICATED)
    {
        // Remove version override just after the write, but before stabilizing
        pg.ver_override.erase(op_data->oid);
    }
    if (op_data->errors > 0)
    {
        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
        return;
    }
    if (op_data->object_state)
    {
        // We must forget the unclean state of the object before deleting it
        // so the next reads don't accidentally read a deleted version
        // And it should be done at the same time as the removal of the version override
        remove_object_from_state(op_data->oid, op_data->object_state, pg);
        pg.clean_count++;
    }
resume_6:
resume_7:
    // States 6 and 7 belong to remember_unstable_write(), which dispatches on op_data->st itself
    if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
    {
        return;
    }
    if (op_data->fact_ver == 1)
    {
        // Object is created
        pg.clean_count++;
        pg.total_count++;
    }
    if (op_data->object_state)
    {
        {
            // Account the write as recovery: index 0 for degraded/incomplete objects, 1 otherwise
            int recovery_type = op_data->object_state->state & (OBJ_DEGRADED|OBJ_INCOMPLETE) ? 0 : 1;
            recovery_stat_count[0][recovery_type]++;
            if (!recovery_stat_count[0][recovery_type])
            {
                // The counter wrapped around to zero: restart both statistics
                recovery_stat_count[0][recovery_type]++;
                recovery_stat_bytes[0][recovery_type] = 0;
            }
            for (int role = 0; role < (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size); role++)
            {
                recovery_stat_bytes[0][recovery_type] += op_data->stripes[role].write_end - op_data->stripes[role].write_start;
            }
        }
        // Any kind of a non-clean object can have extra chunks, because we don't record objects
        // as degraded & misplaced or incomplete & misplaced at the same time. So try to remove extra chunks
        if (immediate_commit != IMMEDIATE_ALL)
        {
            // We can't remove extra chunks yet if fsyncs are explicit, because
            // new copies may not be committed to stable storage yet
            // We can only remove extra chunks after a successful SYNC for this PG
            for (auto & chunk: op_data->object_state->osd_set)
            {
                // Check is the same as in submit_primary_del_subops()
                if (op_data->scheme == POOL_SCHEME_REPLICATED
                    ? !contains_osd(pg.cur_set.data(), pg.pg_size, chunk.osd_num)
                    : (chunk.osd_num != pg.cur_set[chunk.role]))
                {
                    pg.copies_to_delete_after_sync.push_back((obj_ver_osd_t){
                        .osd_num = chunk.osd_num,
                        .oid = {
                            .inode = op_data->oid.inode,
                            .stripe = op_data->oid.stripe | (op_data->scheme == POOL_SCHEME_REPLICATED ? 0 : chunk.role),
                        },
                        .version = op_data->fact_ver,
                    });
                    copies_to_delete_after_sync_count++;
                }
            }
            free_object_state(pg, &op_data->object_state);
        }
        else
        {
            submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
            free_object_state(pg, &op_data->object_state);
            if (op_data->n_subops > 0)
            {
resume_8:
                op_data->st = 8;
                return;
resume_9:
                if (op_data->errors > 0)
                {
                    pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
                    return;
                }
            }
        }
    }
    cur_op->reply.hdr.retval = cur_op->req.rw.len;
continue_others:
    osd_op_t *next_op = NULL;
    auto next_it = pg.write_queue.find(op_data->oid);
    // Remove the operation from queue before calling finish_op so it doesn't see the completed operation in queue
    if (next_it != pg.write_queue.end() && next_it->second == cur_op)
    {
        pg.write_queue.erase(next_it++);
        if (next_it != pg.write_queue.end() && next_it->first == op_data->oid)
            next_op = next_it->second;
    }
    // finish_op would invalidate next_it if it cleared pg.write_queue, but it doesn't do that :)
    // Reply with the stored result: it is the request length on success, but -EINVAL
    // when the write was refused above, and that error must not be replaced by the length
    finish_op(cur_op, cur_op->reply.hdr.retval);
    if (next_op)
    {
        // Continue next write to the same object
        continue_primary_write(next_op);
    }
}
|
||||
|
||||
// Post-write bookkeeping shared by primary operations: either stabilize the freshly written
// version immediately or remember it (and the involved OSDs/PG) as dirty until the next SYNC.
//
// The function is a resumable sub-state-machine embedded into its caller:
//   cur_op     - the operation being processed
//   pg         - the PG the object belongs to
//   loc_set    - OSD set (osd_num + role per chunk) the object was written to
//   base_state - value of op_data->st the caller reserves for "waiting for STABILIZE";
//                base_state+1 is the "STABILIZE finished" state
// Returns true when the caller may proceed, false when the operation was suspended
// (op_data->st is set to base_state) or cancelled because of subop errors.
bool osd_t::remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
{
    osd_primary_op_data_t *op_data = cur_op->op_data;
    if (op_data->st == base_state)
    {
        goto resume_6;
    }
    else if (op_data->st == base_state+1)
    {
        goto resume_7;
    }
    if (immediate_commit == IMMEDIATE_ALL)
    {
immediate:
        if (op_data->scheme != POOL_SCHEME_REPLICATED)
        {
            // Send STABILIZE ops immediately
            op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
            op_data->unstable_writes = new obj_ver_id[loc_set.size()];
            {
                // One (object chunk, version) entry per OSD of the set
                int last_start = 0;
                for (auto & chunk: loc_set)
                {
                    op_data->unstable_writes[last_start] = (obj_ver_id){
                        .oid = {
                            .inode = op_data->oid.inode,
                            .stripe = op_data->oid.stripe | chunk.role,
                        },
                        .version = op_data->fact_ver,
                    };
                    op_data->unstable_write_osds->push_back((unstable_osd_num_t){
                        .osd_num = chunk.osd_num,
                        .start = last_start,
                        .len = 1,
                    });
                    last_start++;
                }
            }
            submit_primary_stab_subops(cur_op);
resume_6:
            // Suspend in the state reserved by the caller. It must be base_state and not a fixed
            // number, otherwise the dispatch at the top can't recognise the operation on re-entry
            op_data->st = base_state;
            return false;
resume_7:
            // FIXME: Free those in the destructor?
            delete op_data->unstable_write_osds;
            delete[] op_data->unstable_writes;
            op_data->unstable_writes = NULL;
            op_data->unstable_write_osds = NULL;
            if (op_data->errors > 0)
            {
                pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
                return false;
            }
        }
    }
    else if (immediate_commit == IMMEDIATE_SMALL)
    {
        int stripe_count = (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : op_data->pg_size);
        for (int role = 0; role < stripe_count; role++)
        {
            if (op_data->stripes[role].write_start == 0 &&
                op_data->stripes[role].write_end == bs_block_size)
            {
                // Big write. Treat write as unsynced
                goto lazy;
            }
        }
        // No stripe was written as a whole block: handle it like IMMEDIATE_ALL
        goto immediate;
    }
    else
    {
lazy:
        if (op_data->scheme != POOL_SCHEME_REPLICATED)
        {
            // Remember version as unstable for EC/XOR
            for (auto & chunk: loc_set)
            {
                this->dirty_osds.insert(chunk.osd_num);
                this->unstable_writes[(osd_object_id_t){
                    .osd_num = chunk.osd_num,
                    .oid = {
                        .inode = op_data->oid.inode,
                        .stripe = op_data->oid.stripe | chunk.role,
                    },
                }] = op_data->fact_ver;
            }
        }
        else
        {
            // Only remember to sync OSDs for replicated pools
            for (auto & chunk: loc_set)
            {
                this->dirty_osds.insert(chunk.osd_num);
            }
        }
        // Remember PG as dirty to drop the connection when PG goes offline
        // (this is required because of the "lazy sync")
        auto cl_it = c_cli.clients.find(cur_op->peer_fd);
        if (cl_it != c_cli.clients.end())
        {
            cl_it->second->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
        }
        dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
    }
    return true;
}
|
@@ -3,13 +3,14 @@
|
||||
|
||||
#include "pg_states.h"
|
||||
|
||||
const int pg_state_bit_count = 14;
|
||||
const int pg_state_bit_count = 15;
|
||||
|
||||
const int pg_state_bits[14] = {
|
||||
const int pg_state_bits[15] = {
|
||||
PG_STARTING,
|
||||
PG_PEERING,
|
||||
PG_INCOMPLETE,
|
||||
PG_ACTIVE,
|
||||
PG_REPEERING,
|
||||
PG_STOPPING,
|
||||
PG_OFFLINE,
|
||||
PG_DEGRADED,
|
||||
@@ -21,11 +22,12 @@ const int pg_state_bits[14] = {
|
||||
PG_LEFT_ON_DEAD,
|
||||
};
|
||||
|
||||
const char *pg_state_names[14] = {
|
||||
const char *pg_state_names[15] = {
|
||||
"starting",
|
||||
"peering",
|
||||
"incomplete",
|
||||
"active",
|
||||
"repeering",
|
||||
"stopping",
|
||||
"offline",
|
||||
"degraded",
|
||||
|
@@ -10,16 +10,17 @@
|
||||
#define PG_PEERING (1<<1)
|
||||
#define PG_INCOMPLETE (1<<2)
|
||||
#define PG_ACTIVE (1<<3)
|
||||
#define PG_STOPPING (1<<4)
|
||||
#define PG_OFFLINE (1<<5)
|
||||
#define PG_REPEERING (1<<4)
|
||||
#define PG_STOPPING (1<<5)
|
||||
#define PG_OFFLINE (1<<6)
|
||||
// Plus any of these:
|
||||
#define PG_DEGRADED (1<<6)
|
||||
#define PG_HAS_INCOMPLETE (1<<7)
|
||||
#define PG_HAS_DEGRADED (1<<8)
|
||||
#define PG_HAS_MISPLACED (1<<9)
|
||||
#define PG_HAS_UNCLEAN (1<<10)
|
||||
#define PG_HAS_INVALID (1<<11)
|
||||
#define PG_LEFT_ON_DEAD (1<<12)
|
||||
#define PG_DEGRADED (1<<7)
|
||||
#define PG_HAS_INCOMPLETE (1<<8)
|
||||
#define PG_HAS_DEGRADED (1<<9)
|
||||
#define PG_HAS_MISPLACED (1<<10)
|
||||
#define PG_HAS_UNCLEAN (1<<11)
|
||||
#define PG_HAS_INVALID (1<<12)
|
||||
#define PG_LEFT_ON_DEAD (1<<13)
|
||||
|
||||
// Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
|
||||
// 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size
|
||||
|
@@ -47,7 +47,6 @@ public:
|
||||
|
||||
~QemuProxy()
|
||||
{
|
||||
cli->stop();
|
||||
delete cli;
|
||||
delete tfd;
|
||||
}
|
||||
|
@@ -2,20 +2,47 @@
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "allocator.h"
|
||||
|
||||
void alloc_all(int size)
|
||||
{
|
||||
allocator *a = new allocator(size);
|
||||
for (int i = 0; i < size; i++)
|
||||
{
|
||||
uint64_t x = a->find_free();
|
||||
if (x == UINT64_MAX)
|
||||
{
|
||||
printf("ran out of space %d allocated=%d\n", size, i);
|
||||
exit(1);
|
||||
}
|
||||
if (x != i)
|
||||
{
|
||||
printf("incorrect block allocated: expected %d, got %lu\n", i, x);
|
||||
}
|
||||
if (a->get(x))
|
||||
{
|
||||
printf("not free before set at %d\n", i);
|
||||
}
|
||||
a->set(x, true);
|
||||
if (!a->get(x))
|
||||
{
|
||||
printf("free after set at %d\n", i);
|
||||
}
|
||||
}
|
||||
uint64_t x = a->find_free();
|
||||
if (x != UINT64_MAX)
|
||||
{
|
||||
printf("extra free space found: %lx (%d)\n", x, size);
|
||||
exit(1);
|
||||
}
|
||||
delete a;
|
||||
}
|
||||
|
||||
int main(int narg, char *args[])
|
||||
{
|
||||
allocator a(8192);
|
||||
for (int i = 0; i < 8192; i++)
|
||||
{
|
||||
uint64_t x = a.find_free();
|
||||
if (x == UINT64_MAX)
|
||||
{
|
||||
printf("ran out of space %d\n", i);
|
||||
return 1;
|
||||
}
|
||||
a.set(x, true);
|
||||
}
|
||||
alloc_all(8192);
|
||||
alloc_all(8062);
|
||||
alloc_all(4096);
|
||||
return 0;
|
||||
}
|
||||
|
383
src/test_cluster_client.cpp
Normal file
383
src/test_cluster_client.cpp
Normal file
@@ -0,0 +1,383 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include "cluster_client.h"
|
||||
|
||||
// Feeds a minimal cluster description into the client's state tracker:
// pool 1 ("hddpool", replicated, pg_size 2, pg_minsize 1, a single PG),
// PG 1/1 placed on OSDs 1 and 2 with OSD 1 as primary, and a PG state record
// saying it is active. Finally the change hook is fired with an empty set.
void configure_single_pg_pool(cluster_client_t *cli)
{
    auto & st = cli->st_cli;
    st.on_load_pgs_hook(true);

    // Pool configuration
    json11::Json::object pool_cfg {
        { "name", "hddpool" },
        { "scheme", "replicated" },
        { "pg_size", 2 },
        { "pg_minsize", 1 },
        { "pg_count", 1 },
        { "failure_domain", "osd" },
    };
    json11::Json::object pools {
        { "1", pool_cfg },
    };
    st.parse_state((json_kv_t){
        .key = "/config/pools",
        .value = pools,
    });

    // PG placement
    json11::Json::object pg_cfg {
        { "osd_set", json11::Json::array { 1, 2 } },
        { "primary", 1 },
    };
    json11::Json::object pool_pgs {
        { "1", pg_cfg },
    };
    json11::Json::object pg_items {
        { "1", pool_pgs },
    };
    json11::Json::object pgs {
        { "items", pg_items },
    };
    st.parse_state((json_kv_t){
        .key = "/config/pgs",
        .value = pgs,
    });

    // Reported PG state
    json11::Json::object pg_state {
        { "peers", json11::Json::array { 1, 2 } },
        { "primary", 1 },
        { "state", json11::Json::array { "active" } },
    };
    st.parse_state((json_kv_t){
        .key = "/pg/state/1/1",
        .value = pg_state,
    });

    json11::Json::object no_changes;
    st.on_change_hook(no_changes);
}
|
||||
|
||||
// Posts a write of <len> bytes filled with <c> at <offset> of inode 0x1000000000001.
// Returns a heap-allocated status cell owned by the caller:
//   -1 = posted, completion not allowed yet (the callback asserts on it);
//    1 = completed with retval == len; 0 = completed with any other retval.
// The callback frees the data buffer and the operation itself.
int *test_write(cluster_client_t *cli, uint64_t offset, uint64_t len, uint8_t c)
{
    printf("Post write %lx+%lx\n", offset, len);
    int *status = new int(-1);
    cluster_op_t *write_op = new cluster_op_t();
    write_op->opcode = OSD_OP_WRITE;
    write_op->inode = 0x1000000000001;
    write_op->offset = offset;
    write_op->len = len;
    auto payload = malloc_or_die(len);
    write_op->iov.push_back(payload, len);
    memset(write_op->iov.buf[0].iov_base, c, len);
    write_op->callback = [status](cluster_op_t *done)
    {
        const bool too_early = (*status == -1);
        if (too_early)
            printf("Error: Not allowed to complete yet\n");
        assert(*status != -1);
        if (done->retval == done->len)
            *status = 1;
        else
            *status = 0;
        free(done->iov.buf[0].iov_base);
        printf("Done write %lx+%lx r=%d\n", done->offset, done->len, done->retval);
        delete done;
    };
    cli->execute(write_op);
    return status;
}
|
||||
|
||||
// Posts a sync operation.
// Returns a heap-allocated status cell owned by the caller:
//   -1 = posted, completion not allowed yet (the callback asserts on it);
//    1 = completed with retval == 0; 0 = completed with any other retval.
// The callback deletes the operation.
int *test_sync(cluster_client_t *cli)
{
    printf("Post sync\n");
    int *status = new int(-1);
    cluster_op_t *sync_op = new cluster_op_t();
    sync_op->opcode = OSD_OP_SYNC;
    sync_op->callback = [status](cluster_op_t *done)
    {
        const bool too_early = (*status == -1);
        if (too_early)
            printf("Error: Not allowed to complete yet\n");
        assert(*status != -1);
        if (done->retval == 0)
            *status = 1;
        else
            *status = 0;
        printf("Done sync r=%d\n", done->retval);
        delete done;
    };
    cli->execute(sync_op);
    return status;
}
|
||||
|
||||
// Marks the status cell as "completion permitted" (-2). From now on the
// operation's callback no longer trips its "completed too early" assertion,
// which is how the test detects operations finishing before they should.
void can_complete(int *r)
{
    const int completion_allowed = -2;
    *r = completion_allowed;
}
|
||||
|
||||
// Asserts that the status cell holds 1 (completed successfully)
// and releases the cell.
void check_completed(int *r)
{
    const int status = *r;
    assert(status == 1);
    delete r;
}
|
||||
|
||||
// Fakes an established connection to OSD <osd_num>: registers a new client
// object in PEER_CONNECTED state under a fresh fd number (highest known fd + 1,
// or 10 when there are no clients), removes the OSD from the wanted peers
// and calls repeer_pgs() for it.
void pretend_connected(cluster_client_t *cli, osd_num_t osd_num)
{
    printf("OSD %lu connected\n", osd_num);
    auto & msgr = cli->msgr;
    int peer_fd = 10;
    if (msgr.clients.size())
        peer_fd = std::prev(msgr.clients.end())->first+1;
    msgr.osd_peer_fds[osd_num] = peer_fd;
    auto peer = new osd_client_t();
    msgr.clients[peer_fd] = peer;
    peer->osd_num = osd_num;
    peer->peer_state = PEER_CONNECTED;
    msgr.wanted_peers.erase(osd_num);
    msgr.repeer_pgs(osd_num);
}
|
||||
|
||||
// Fakes a lost connection: stops the messenger client registered for
// OSD <osd_num>. at() throws if the OSD has no registered fd.
void pretend_disconnected(cluster_client_t *cli, osd_num_t osd_num)
{
    printf("OSD %lu disconnected\n", osd_num);
    auto & msgr = cli->msgr;
    msgr.stop_client(msgr.osd_peer_fds.at(osd_num));
}
|
||||
|
||||
// Fails the test if OSD <osd_num> still has a registered peer fd.
void check_disconnected(cluster_client_t *cli, osd_num_t osd_num)
{
    auto & peer_fds = cli->msgr.osd_peer_fds;
    if (peer_fds.find(osd_num) == peer_fds.end())
        return;
    printf("OSD %lu not disconnected as it ought to be\n", osd_num);
    assert(0);
}
|
||||
|
||||
// Fails the test unless exactly <ops> operations are recorded as sent
// to OSD <osd_num>.
void check_op_count(cluster_client_t *cli, osd_num_t osd_num, int ops)
{
    int peer_fd = cli->msgr.osd_peer_fds.at(osd_num);
    int real_ops = cli->msgr.clients[peer_fd]->sent_ops.size();
    if (real_ops == ops)
        return;
    printf("error: %d ops expected, but %d queued\n", ops, real_ops);
    assert(0);
}
|
||||
|
||||
// Looks through the operations sent to OSD <osd_num> and returns the first
// one with the given opcode. For anything but OSD_OP_SYNC the operation must
// also target inode 0x1000000000001 with exactly this offset and length.
// Returns NULL when nothing matches.
osd_op_t *find_op(cluster_client_t *cli, osd_num_t osd_num, uint64_t opcode, uint64_t offset, uint64_t len)
{
    int peer_fd = cli->msgr.osd_peer_fds.at(osd_num);
    auto & sent = cli->msgr.clients[peer_fd]->sent_ops;
    for (auto & entry: sent)
    {
        auto candidate = entry.second;
        if (candidate->req.hdr.opcode != opcode)
            continue;
        if (opcode == OSD_OP_SYNC)
            return candidate;
        if (candidate->req.rw.inode == 0x1000000000001 &&
            candidate->req.rw.offset == offset &&
            candidate->req.rw.len == len)
        {
            return candidate;
        }
    }
    return NULL;
}
|
||||
|
||||
// Fakes an OSD reply for <op>: removes it from the peer's sent operations,
// fills in the reply header and invokes the operation's callback.
// A negative <retval> is passed through as the reply code; otherwise the
// reply code is 0 for a sync and the requested length for reads and writes.
void pretend_op_completed(cluster_client_t *cli, osd_op_t *op, int64_t retval)
{
    assert(op);
    const bool is_sync = op->req.hdr.opcode == OSD_OP_SYNC;
    const char *op_name = "read";
    if (is_sync)
        op_name = "sync";
    else if (op->req.hdr.opcode == OSD_OP_WRITE)
        op_name = "write";
    printf("Pretend completed %s %lx+%x\n", op_name, op->req.rw.offset, op->req.rw.len);
    // Copy the key fields into locals before touching the container
    int peer_fd = op->peer_fd;
    uint64_t op_id = op->req.hdr.id;
    cli->msgr.clients[peer_fd]->sent_ops.erase(op_id);
    op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
    op->reply.hdr.id = op->req.hdr.id;
    op->reply.hdr.opcode = op->req.hdr.opcode;
    int64_t reply_code;
    if (retval < 0)
        reply_code = retval;
    else if (is_sync)
        reply_code = 0;
    else
        reply_code = op->req.rw.len;
    op->reply.hdr.retval = reply_code;
    // Call through a copy of the callback so that it stays valid
    // even if the callback itself deletes the operation
    std::function<void(osd_op_t*)> on_done(op->callback);
    on_done(op);
}
|
||||
|
||||
void test1()
|
||||
{
|
||||
json11::Json config;
|
||||
timerfd_manager_t *tfd = new timerfd_manager_t([](int fd, bool wr, std::function<void(int, int)> callback){});
|
||||
cluster_client_t *cli = new cluster_client_t(NULL, tfd, config);
|
||||
|
||||
int *r1 = test_write(cli, 0, 4096, 0x55);
|
||||
configure_single_pg_pool(cli);
|
||||
pretend_connected(cli, 1);
|
||||
cli->continue_ops(true);
|
||||
can_complete(r1);
|
||||
check_op_count(cli, 1, 1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0);
|
||||
check_completed(r1);
|
||||
pretend_disconnected(cli, 1);
|
||||
int *r2 = test_sync(cli);
|
||||
pretend_connected(cli, 1);
|
||||
check_op_count(cli, 1, 0);
|
||||
cli->continue_ops(true);
|
||||
check_op_count(cli, 1, 1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0);
|
||||
check_op_count(cli, 1, 1);
|
||||
can_complete(r2);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_SYNC, 0, 0), 0);
|
||||
check_completed(r2);
|
||||
// Check that the client doesn't repeat operations once more
|
||||
pretend_disconnected(cli, 1);
|
||||
pretend_connected(cli, 1);
|
||||
check_op_count(cli, 1, 0);
|
||||
|
||||
// Case:
|
||||
// Write(1) -> Complete Write(1) -> Overwrite(2) -> Complete Write(2)
|
||||
// -> Overwrite(3) -> Drop OSD connection -> Reestablish OSD connection
|
||||
// -> Complete All Posted Writes -> Sync -> Complete Sync
|
||||
// The resulting state of the block must be (3) over (2) over (1).
|
||||
// I.e. the part overwritten by (3) must remain as in (3) and so on.
|
||||
|
||||
// More interesting case:
|
||||
// Same, but both Write(2) and Write(3) must consist of two parts:
|
||||
// one from an OSD 2 that drops connection and other from OSD 1 that doesn't.
|
||||
// The idea is that if the whole Write(2) is repeated when OSD 2 drops connection
|
||||
// then it may also overwrite a part in OSD 1 which shouldn't be overwritten.
|
||||
|
||||
// Another interesting case:
|
||||
// A new operation added during replay (would also break with the previous implementation)
|
||||
|
||||
r1 = test_write(cli, 0, 0x10000, 0x56);
|
||||
can_complete(r1);
|
||||
check_op_count(cli, 1, 1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x10000), 0);
|
||||
check_completed(r1);
|
||||
|
||||
r1 = test_write(cli, 0xE000, 0x4000, 0x57);
|
||||
can_complete(r1);
|
||||
check_op_count(cli, 1, 1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0xE000, 0x4000), 0);
|
||||
check_completed(r1);
|
||||
|
||||
r1 = test_write(cli, 0x10000, 0x4000, 0x58);
|
||||
|
||||
pretend_disconnected(cli, 1);
|
||||
pretend_connected(cli, 1);
|
||||
cli->continue_ops(true);
|
||||
|
||||
// Check replay
|
||||
{
|
||||
uint64_t replay_start = UINT64_MAX;
|
||||
uint64_t replay_end = 0;
|
||||
std::vector<osd_op_t*> replay_ops;
|
||||
auto osd_cl = cli->msgr.clients.at(cli->msgr.osd_peer_fds.at(1));
|
||||
for (auto & op_p: osd_cl->sent_ops)
|
||||
{
|
||||
auto op = op_p.second;
|
||||
assert(op->req.hdr.opcode == OSD_OP_WRITE);
|
||||
uint64_t offset = op->req.rw.offset;
|
||||
if (op->req.rw.offset < replay_start)
|
||||
replay_start = op->req.rw.offset;
|
||||
if (op->req.rw.offset+op->req.rw.len > replay_end)
|
||||
replay_end = op->req.rw.offset+op->req.rw.len;
|
||||
for (int buf_idx = 0; buf_idx < op->iov.count; buf_idx++)
|
||||
{
|
||||
for (int i = 0; i < op->iov.buf[buf_idx].iov_len; i++, offset++)
|
||||
{
|
||||
uint8_t c = offset < 0xE000 ? 0x56 : (offset < 0x10000 ? 0x57 : 0x58);
|
||||
if (((uint8_t*)op->iov.buf[buf_idx].iov_base)[i] != c)
|
||||
{
|
||||
printf("Write replay: mismatch at %lu\n", offset-op->req.rw.offset);
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
}
|
||||
fail:
|
||||
assert(offset == op->req.rw.offset+op->req.rw.len);
|
||||
replay_ops.push_back(op);
|
||||
}
|
||||
if (replay_start != 0 || replay_end != 0x14000)
|
||||
{
|
||||
printf("Write replay: range mismatch: %lx-%lx\n", replay_start, replay_end);
|
||||
assert(0);
|
||||
}
|
||||
for (auto op: replay_ops)
|
||||
{
|
||||
pretend_op_completed(cli, op, 0);
|
||||
}
|
||||
}
|
||||
// Check that the following write finally proceeds
|
||||
check_op_count(cli, 1, 1);
|
||||
can_complete(r1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0x10000, 0x4000), 0);
|
||||
check_completed(r1);
|
||||
check_op_count(cli, 1, 0);
|
||||
|
||||
// Check sync
|
||||
r2 = test_sync(cli);
|
||||
can_complete(r2);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_SYNC, 0, 0), 0);
|
||||
check_completed(r2);
|
||||
|
||||
// Check disconnect during write
|
||||
r1 = test_write(cli, 0, 4096, 0x59);
|
||||
check_op_count(cli, 1, 1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), -EPIPE);
|
||||
check_disconnected(cli, 1);
|
||||
pretend_connected(cli, 1);
|
||||
check_op_count(cli, 1, 0);
|
||||
cli->continue_ops(true);
|
||||
check_op_count(cli, 1, 1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
|
||||
check_op_count(cli, 1, 1);
|
||||
can_complete(r1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
|
||||
check_completed(r1);
|
||||
|
||||
// Free client
|
||||
delete cli;
|
||||
delete tfd;
|
||||
printf("[ok] write replay test\n");
|
||||
}
|
||||
|
||||
void test2()
|
||||
{
|
||||
std::map<object_id, cluster_buffer_t> unsynced_writes;
|
||||
cluster_op_t *op = new cluster_op_t();
|
||||
op->opcode = OSD_OP_WRITE;
|
||||
op->inode = 1;
|
||||
op->offset = 0;
|
||||
op->len = 4096;
|
||||
op->iov.push_back(malloc_or_die(4096*1024), 4096);
|
||||
// 0-4k = 0x55
|
||||
memset(op->iov.buf[0].iov_base, 0x55, op->iov.buf[0].iov_len);
|
||||
cluster_client_t::copy_write(op, unsynced_writes);
|
||||
// 8k-12k = 0x66
|
||||
op->offset = 8192;
|
||||
memset(op->iov.buf[0].iov_base, 0x66, op->iov.buf[0].iov_len);
|
||||
cluster_client_t::copy_write(op, unsynced_writes);
|
||||
// 4k-1M+4k = 0x77
|
||||
op->len = op->iov.buf[0].iov_len = 1048576;
|
||||
op->offset = 4096;
|
||||
memset(op->iov.buf[0].iov_base, 0x77, op->iov.buf[0].iov_len);
|
||||
cluster_client_t::copy_write(op, unsynced_writes);
|
||||
// check it
|
||||
assert(unsynced_writes.size() == 4);
|
||||
auto uit = unsynced_writes.begin();
|
||||
int i;
|
||||
assert(uit->first.inode == 1);
|
||||
assert(uit->first.stripe == 0);
|
||||
assert(uit->second.len == 4096);
|
||||
for (i = 0; i < uit->second.len && ((uint8_t*)uit->second.buf)[i] == 0x55; i++) {}
|
||||
assert(i == uit->second.len);
|
||||
uit++;
|
||||
assert(uit->first.inode == 1);
|
||||
assert(uit->first.stripe == 4096);
|
||||
assert(uit->second.len == 4096);
|
||||
for (i = 0; i < uit->second.len && ((uint8_t*)uit->second.buf)[i] == 0x77; i++) {}
|
||||
assert(i == uit->second.len);
|
||||
uit++;
|
||||
assert(uit->first.inode == 1);
|
||||
assert(uit->first.stripe == 8192);
|
||||
assert(uit->second.len == 4096);
|
||||
for (i = 0; i < uit->second.len && ((uint8_t*)uit->second.buf)[i] == 0x77; i++) {}
|
||||
assert(i == uit->second.len);
|
||||
uit++;
|
||||
assert(uit->first.inode == 1);
|
||||
assert(uit->first.stripe == 12*1024);
|
||||
assert(uit->second.len == 1016*1024);
|
||||
for (i = 0; i < uit->second.len && ((uint8_t*)uit->second.buf)[i] == 0x77; i++) {}
|
||||
assert(i == uit->second.len);
|
||||
uit++;
|
||||
// free memory
|
||||
free(op->iov.buf[0].iov_base);
|
||||
delete op;
|
||||
for (auto p: unsynced_writes)
|
||||
{
|
||||
free(p.second.buf);
|
||||
}
|
||||
printf("[ok] copy_write test\n");
|
||||
}
|
||||
|
||||
int main(int narg, char *args[])
|
||||
{
|
||||
test1();
|
||||
test2();
|
||||
return 0;
|
||||
}
|
@@ -121,7 +121,7 @@ again:
|
||||
exp.it_value.tv_sec--;
|
||||
exp.it_value.tv_nsec += 1000000000;
|
||||
}
|
||||
if (exp.it_value.tv_sec < 0 || !exp.it_value.tv_sec && !exp.it_value.tv_nsec)
|
||||
if (exp.it_value.tv_sec < 0 || exp.it_value.tv_sec == 0 && exp.it_value.tv_nsec <= 0)
|
||||
{
|
||||
// It already happened
|
||||
trigger_nearest();
|
||||
@@ -159,6 +159,6 @@ void timerfd_manager_t::trigger_nearest()
|
||||
{
|
||||
timers.erase(timers.begin()+nearest, timers.begin()+nearest+1);
|
||||
}
|
||||
cb(nearest_id);
|
||||
nearest = -1;
|
||||
cb(nearest_id);
|
||||
}
|
||||
|
@@ -1,34 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Cheatsheet for CentOS 7 packaging (not a build script)
|
||||
|
||||
set -e
|
||||
rm -f /etc/yum.repos.d/CentOS-Media.repo
|
||||
yum -y --enablerepo=extras install centos-release-scl epel-release
|
||||
yum -y --enablerepo='*' install devtoolset-9-gcc-c++ devtoolset-9-libatomic-devel gperftools-devel
|
||||
yumdownloader --source qemu
|
||||
yumdownloader --source fio
|
||||
yum-builddep -y --enablerepo='*' qemu
|
||||
yum -y install rpm-build
|
||||
. /opt/rh/devtoolset-9/enable
|
||||
rpm --nomd5 -i qemu*.src.rpm
|
||||
rpm --nomd5 -i fio*.src.rpm
|
||||
cd ~/rpmbuild/SPECS
|
||||
rpmbuild -bp fio.spec
|
||||
perl -i -pe 's/^make V=1/exit 1; make V=1/' qemu.spec
|
||||
rpmbuild -bc qemu.spec
|
||||
perl -i -pe 's/^exit 1; make V=1/make V=1/' qemu.spec
|
||||
cd ~/rpmbuild/BUILD/qemu*/
|
||||
make qapi-types.h
|
||||
mkdir -p ~/vitastor/qemu/b/qemu
|
||||
cp config-host.h ~/vitastor/qemu/b/qemu
|
||||
cp qapi-types.h ~/vitastor/qemu/b/qemu
|
||||
cp -r include ~/vitastor/qemu
|
||||
cd ~/vitastor
|
||||
sh copy-qemu-includes.sh
|
||||
mv qemu qemu-old
|
||||
mv qemu-copy qemu
|
||||
ln -s ~/rpmbuild/BUILD/fio*/ fio
|
||||
sh copy-fio-includes.sh
|
||||
rm fio
|
||||
mv fio-copy fio
|
@@ -2,6 +2,14 @@
|
||||
|
||||
. `dirname $0`/common.sh
|
||||
|
||||
if [ "$EC" != "" ]; then
|
||||
POOLCFG='"scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1'
|
||||
NOBJ=512
|
||||
else
|
||||
POOLCFG='"scheme":"replicated","pg_size":2,"pg_minsize":2'
|
||||
NOBJ=1024
|
||||
fi
|
||||
|
||||
dd if=/dev/zero of=./testdata/test_osd1.bin bs=1024 count=1 seek=$((1024*1024-1))
|
||||
dd if=/dev/zero of=./testdata/test_osd2.bin bs=1024 count=1 seek=$((1024*1024-1))
|
||||
dd if=/dev/zero of=./testdata/test_osd3.bin bs=1024 count=1 seek=$((1024*1024-1))
|
||||
@@ -9,17 +17,17 @@ dd if=/dev/zero of=./testdata/test_osd4.bin bs=1024 count=1 seek=$((1024*1024-1)
|
||||
dd if=/dev/zero of=./testdata/test_osd5.bin bs=1024 count=1 seek=$((1024*1024-1))
|
||||
dd if=/dev/zero of=./testdata/test_osd6.bin bs=1024 count=1 seek=$((1024*1024-1))
|
||||
|
||||
build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd1.bin 2>/dev/null) &>./testdata/osd1.log &
|
||||
build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd1.bin 2>/dev/null) 2>&1 >>./testdata/osd1.log &
|
||||
OSD1_PID=$!
|
||||
build/src/vitastor-osd --osd_num 2 --bind_address 127.0.0.1 --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd2.bin 2>/dev/null) &>./testdata/osd2.log &
|
||||
build/src/vitastor-osd --osd_num 2 --bind_address 127.0.0.1 --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd2.bin 2>/dev/null) 2>&1 >>./testdata/osd2.log &
|
||||
OSD2_PID=$!
|
||||
build/src/vitastor-osd --osd_num 3 --bind_address 127.0.0.1 --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd3.bin 2>/dev/null) &>./testdata/osd3.log &
|
||||
build/src/vitastor-osd --osd_num 3 --bind_address 127.0.0.1 --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd3.bin 2>/dev/null) 2>&1 >>./testdata/osd3.log &
|
||||
OSD3_PID=$!
|
||||
build/src/vitastor-osd --osd_num 4 --bind_address 127.0.0.1 --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd4.bin 2>/dev/null) &>./testdata/osd4.log &
|
||||
build/src/vitastor-osd --osd_num 4 --bind_address 127.0.0.1 --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd4.bin 2>/dev/null) 2>&1 >>./testdata/osd4.log &
|
||||
OSD4_PID=$!
|
||||
build/src/vitastor-osd --osd_num 5 --bind_address 127.0.0.1 --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd5.bin 2>/dev/null) &>./testdata/osd5.log &
|
||||
build/src/vitastor-osd --osd_num 5 --bind_address 127.0.0.1 --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd5.bin 2>/dev/null) 2>&1 >>./testdata/osd5.log &
|
||||
OSD5_PID=$!
|
||||
build/src/vitastor-osd --osd_num 6 --bind_address 127.0.0.1 --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd6.bin 2>/dev/null) &>./testdata/osd6.log &
|
||||
build/src/vitastor-osd --osd_num 6 --bind_address 127.0.0.1 --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd6.bin 2>/dev/null) 2>&1 >>./testdata/osd6.log &
|
||||
OSD6_PID=$!
|
||||
|
||||
cd mon
|
||||
@@ -28,9 +36,7 @@ cd ..
|
||||
node mon/mon-main.js --etcd_url http://$ETCD_URL --etcd_prefix "/vitastor" --verbose 1 &>./testdata/mon.log &
|
||||
MON_PID=$!
|
||||
|
||||
$ETCDCTL put /vitastor/config/global '{"immediate_commit":"all"}'
|
||||
|
||||
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":2,"pg_count":16,"failure_domain":"osd"}}'
|
||||
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":16,"failure_domain":"osd"}}'
|
||||
|
||||
sleep 2
|
||||
|
||||
@@ -50,7 +56,11 @@ try_change()
|
||||
{
|
||||
n=$1
|
||||
|
||||
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":2,"pg_count":'$n',"failure_domain":"osd"}}'
|
||||
for i in {1..6}; do
|
||||
echo --- Change PG count to $n --- >>testdata/osd$i.log
|
||||
done
|
||||
|
||||
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":'$n',"failure_domain":"osd"}}'
|
||||
|
||||
for i in {1..10}; do
|
||||
($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | select(. > 0)) | length == 2) | length) == '$n) && \
|
||||
@@ -77,6 +87,12 @@ try_change()
|
||||
$ETCDCTL get --prefix /vitastor/pg/state/
|
||||
format_error "FAILED: $n PGS NOT UP"
|
||||
fi
|
||||
|
||||
# Check that no objects are lost !
|
||||
nobj=`$ETCDCTL get --prefix '/vitastor/pg/stats' --print-value-only | jq -s '[ .[].object_count ] | reduce .[] as $num (0; .+$num)'`
|
||||
if [ "$nobj" -ne $NOBJ ]; then
|
||||
format_error "Data lost after changing PG count to $n: $NOBJ objects expected, but got $nobj"
|
||||
fi
|
||||
}
|
||||
|
||||
# 16 -> 32
|
||||
|
109
tests/test_interrupted_rebalance.sh
Executable file
109
tests/test_interrupted_rebalance.sh
Executable file
@@ -0,0 +1,109 @@
|
||||
#!/bin/bash -ex

# Interrupted rebalance test.
# Starts 7 OSDs and a monitor, creates a 32-PG replicated pool, writes
# ${IMG_SIZE} MB with fio, then sets the reweight of OSDs 1..5 to 0 and back
# to 1 with only 3 seconds between changes. Afterwards all 32 PGs must be
# active again, the monitor log must not mention has_degraded, and the total
# object count must equal IMG_SIZE*8.

. $(dirname $0)/common.sh

if [ "$IMMEDIATE_COMMIT" != "" ]; then
    NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 1"
    $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":5,"immediate_commit":"all"}'
else
    NO_SAME="--journal_sector_buffer_count 1024 --log_level 1"
    $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":5}'
fi

# Sparse image file for each OSD (one 1 KB block written at the very end)
for i in 1 2 3 4 5 6 7; do
    dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((1024*1024-1))
done

# Start the OSDs; their PIDs are stored in OSD1_PID .. OSD7_PID.
# Redirections are processed left to right, so stdout has to be pointed at the
# log before "2>&1" - otherwise stderr stays on the script's own stdout
# and never reaches the log file.
for i in 1 2 3 4 5 6 7; do
    build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL \
        $(node mon/simple-offsets.js --format options --device ./testdata/test_osd$i.bin 2>/dev/null) \
        >>./testdata/osd$i.log 2>&1 &
    eval OSD${i}_PID=$!
done

cd mon
npm install
cd ..
node mon/mon-main.js --etcd_url http://$ETCD_URL --etcd_prefix "/vitastor" --verbose 1 &>./testdata/mon.log &
MON_PID=$!

$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":32,"failure_domain":"osd"}}'

sleep 2

if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | select(. > 0)) | length == 2) | length) == 32'); then
    format_error "FAILED: 32 PGS NOT CONFIGURED"
fi

if ! ($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == 32'); then
    format_error "FAILED: 32 PGS NOT UP"
fi

IMG_SIZE=960

LD_PRELOAD=libasan.so.5 \
    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=16 -fsync=16 -rw=write \
        -etcd=$ETCD_URL -pool=1 -inode=2 -size=${IMG_SIZE}M -cluster_log_level=10

# Set the reweight of OSD $1 to $2 and wait 3 seconds
try_reweight()
{
    osd=$1
    w=$2
    $ETCDCTL put /vitastor/config/osd/$osd '{"reweight":'$w'}'
    sleep 3
}

# Take OSDs 1..5 out one after another, then bring them back
for osd_num in 1 2 3 4 5; do
    try_reweight $osd_num 0
done

for osd_num in 1 2 3 4 5; do
    try_reweight $osd_num 1
done

# Wait for the rebalance to finish
for i in {1..60}; do
    ($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == 32') && \
        break
    if [ $i -eq 60 ]; then
        format_error "Rebalance couldn't finish in 60 seconds"
    fi
    sleep 1
done

# Check that PGs never had degraded objects !
if grep has_degraded ./testdata/mon.log; then
    format_error "Some copies of objects were lost during interrupted rebalancings"
fi

# Check that no objects are lost !
nobj=$($ETCDCTL get --prefix '/vitastor/pg/stats' --print-value-only | jq -s '[ .[].object_count ] | reduce .[] as $num (0; .+$num)')
if [ "$nobj" -ne $((IMG_SIZE*8)) ]; then
    format_error "Data lost after multiple interrupted rebalancings"
fi

format_green OK
|
79
tests/test_move_reappear.sh
Executable file
79
tests/test_move_reappear.sh
Executable file
@@ -0,0 +1,79 @@
|
||||
#!/bin/bash -ex

# Checks that objects written to a degraded PG (osd_set [1,0]) are moved to a
# new clean OSD set [4,5], and that after stopping and re-peering the PG
# OSD 4 logs no "has_..." state for PG 1/1 and the PG is reported as active.
# No monitor is started here: PG configuration is written to etcd by hand.

. $(dirname $0)/common.sh

# Succeeds if PG 1/1 has a state record whose state equals the JSON array in $1
pg_state_is()
{
    $ETCDCTL get /vitastor/pg/state/1/1 --print-value-only | jq -s -e '(. | length) != 0 and .[0].state == '"$1"
}

# Succeeds if PG 1/1 has no state record at all
pg_is_stopped()
{
    [ "$($ETCDCTL get /vitastor/pg/state/1/1 --print-value-only)" = "" ]
}

for i in 1 2 3 4 5; do
    dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((1024*1024-1))
done

# Start the OSDs; their PIDs are stored in OSD1_PID .. OSD5_PID
for i in 1 2 3 4 5; do
    build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 --etcd_address $ETCD_URL \
        $(node mon/simple-offsets.js --format options --device ./testdata/test_osd$i.bin 2>/dev/null) \
        &>./testdata/osd$i.log &
    eval OSD${i}_PID=$!
done

$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":1,"failure_domain":"osd"}}'

# Start the PG with only one of its two OSDs
$ETCDCTL put /vitastor/config/pgs '{"items":{"1":{"1":{"osd_set":[1,0],"primary":1}}}}'

sleep 2

if ! pg_state_is '["active","degraded"]'; then
    format_error "Failed to start the PG active+degraded"
fi

LD_PRELOAD=libasan.so.5 \
    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -fsync=1 -rw=write \
        -etcd=$ETCD_URL -pool=1 -inode=2 -size=32M -cluster_log_level=10

# Stop the PG (primary 0)
$ETCDCTL put /vitastor/config/pgs '{"items":{"1":{"1":{"osd_set":[1,0],"primary":0}}}}'

sleep 2

if ! pg_is_stopped; then
    format_error "Failed to stop the PG"
fi

# Restart the PG on a different OSD set
$ETCDCTL put /vitastor/pg/history/1/1 '{"all_peers":[1,2,3]}'

$ETCDCTL put /vitastor/config/pgs '{"items":{"1":{"1":{"osd_set":[4,5],"primary":4}}}}'

sleep 5

if ! pg_state_is '["active"]'; then
    format_error "Failed to move degraded objects to the clean OSD set"
fi

# Stop the PG again
$ETCDCTL put /vitastor/config/pgs '{"items":{"1":{"1":{"osd_set":[4,5],"primary":0}}}}'

$ETCDCTL put /vitastor/pg/history/1/1 '{"all_peers":[1,2,3]}'

sleep 2

if ! pg_is_stopped; then
    format_error "Failed to stop the PG after degraded recovery"
fi

# Keep the old log of OSD 4 and empty the live one, so that the grep below
# only sees what is logged during the next peering
cp testdata/osd4.log testdata/osd4_pre.log
: >testdata/osd4.log

$ETCDCTL put /vitastor/config/pgs '{"items":{"1":{"1":{"osd_set":[4,5],"primary":4}}}}'

sleep 2

if grep -q 'PG 1/1.*is.*has_' testdata/osd4.log; then
    format_error "PG has degraded or misplaced objects after a full re-peer following a degraded recovery"
fi

if ! pg_state_is '["active"]'; then
    format_error "PG not active+clean after a full re-peer following a degraded recovery"
fi

format_green OK
|
@@ -35,6 +35,18 @@ fi
|
||||
# fio -thread -name=test -ioengine=build/src/libfio_vitastor_sec.so -bs=4k -fsync=128 `$ETCDCTL get /vitastor/osd/state/1 --print-value-only | jq -r '"-host="+.addresses[0]+" -port="+(.port|tostring)'` -rw=write -size=32M
|
||||
|
||||
LD_PRELOAD=libasan.so.5 \
|
||||
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -fsync=1 -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=1G -cluster_log_level=10
|
||||
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -fsync=1 -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -cluster_log_level=10
|
||||
|
||||
LD_PRELOAD=libasan.so.5 \
|
||||
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=1 -fsync=32 -buffer_pattern=0xdeadface \
|
||||
-rw=randwrite -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -number_ios=1024
|
||||
|
||||
qemu-img convert -S 4096 -p \
|
||||
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=1:size=$((128*1024*1024))" \
|
||||
-O raw ./testdata/read.bin
|
||||
|
||||
qemu-img convert -S 4096 -p \
|
||||
-f raw ./testdata/read.bin \
|
||||
-O raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=1:size=$((128*1024*1024))"
|
||||
|
||||
format_green OK
|
||||
|
Reference in New Issue
Block a user