Compare commits
190 Commits
test-etcd-
...
heap-meta
Author | SHA1 | Date | |
---|---|---|---|
88cb2df946 | |||
7be9ed93d5 | |||
b019988e2b | |||
0db5400cb7 | |||
5a13db107b | |||
1288cfb0af | |||
2eba37db8a | |||
f9e0b0db27 | |||
f6acd5e79c | |||
6fbeb5c668 | |||
8e55869b71 | |||
7bb9004435 | |||
d9d484e8bb | |||
0c0ab64155 | |||
84fca8abca | |||
eaf0fe66a1 | |||
5ca25c0e7d | |||
1b7f2eac8e | |||
ddd16e8613 | |||
046a9f7a67 | |||
bd55b24827 | |||
09d69f7968 | |||
cc4d170ef0 | |||
335b73a3d5 | |||
fc6c5a853e | |||
445393dfc4 | |||
021762193b | |||
a6dee28f4e | |||
6e0ad777e3 | |||
13d069cf5f | |||
504246b1db | |||
fd647021eb | |||
049a7b260e | |||
17d0da1a74 | |||
fe277127bb | |||
79f3147d0c | |||
5126e67c3f | |||
90b1bdee43 | |||
5d501d0d43 | |||
157a62628a | |||
d4da42bb05 | |||
2883df733a | |||
77e3870f8f | |||
b82be8136a | |||
6c4f407575 | |||
c0489d237b | |||
7c250f165c | |||
b46dbbbefb | |||
1810fbe622 | |||
4d6c9bc294 | |||
e7df9683f1 | |||
a5dd943fcc | |||
c9b527f2e2 | |||
1551a49454 | |||
d46feccd03 | |||
959e2e2df9 | |||
b7bc3d652d | |||
fc2762d60a | |||
85b3c691e9 | |||
0ac4645a9e | |||
87922bc660 | |||
77b97b0613 | |||
0713120315 | |||
bce082a444 | |||
383305da88 | |||
639c809827 | |||
71d78c1409 | |||
3e2e2f9846 | |||
cb48c70083 | |||
989571bb74 | |||
0e1d069ad7 | |||
a36b4e5933 | |||
c809d86846 | |||
0bd5eb1f20 | |||
2c8ddc5431 | |||
e1f3829bb1 | |||
a980d65f78 | |||
bcc93e548e | |||
68e9f71723 | |||
1ae4b9a799 | |||
79d4b57f0e | |||
cc73e42488 | |||
6cb0fdb571 | |||
8e1ea15f58 | |||
d7f1b3a2dd | |||
0049a6ed4a | |||
05ed9a27a4 | |||
300a149513 | |||
0223016ce6 | |||
9368cc7d9b | |||
74743ccd3f | |||
ec1c7e6be4 | |||
bc643b24cf | |||
2a66cc3f11 | |||
bd74ce4b70 | |||
7c07303d12 | |||
ac6bacc46e | |||
f7fbfb8174 | |||
ebf85a7515 | |||
36413d89c3 | |||
fb9505d5db | |||
77e1badfad | |||
b83359fdbc | |||
2f4f46d7eb | |||
65a4aecb8c | |||
d08a2fb8ee | |||
5739d52600 | |||
d44dba43b3 | |||
8984556689 | |||
457b47d311 | |||
af62f6374b | |||
1c7f3710be | |||
a67a09da7c | |||
4ed29c19c4 | |||
dd3f64cf62 | |||
e1f4fcb76a | |||
2d24b4d70d | |||
d9e2705db7 | |||
5935762730 | |||
a945a46d56 | |||
82ac6416d3 | |||
df4661230e | |||
72a29b7031 | |||
2d42f29385 | |||
17240c6144 | |||
9e627a4414 | |||
90b1019636 | |||
df604afbd5 | |||
47c7aa62de | |||
9f2dc48d0f | |||
6d951b21fb | |||
552f28cb3e | |||
e87b6e26f7 | |||
0c89886374 | |||
e79bef8751 | |||
ad76f84e1c | |||
db827cb34c | |||
e5c6d85ea1 | |||
6cc44c1f54 | |||
c20450c1f1 | |||
db63e58b3d | |||
31b7021330 | |||
2ebe3a468c | |||
9892fccfb0 | |||
0be86a306d | |||
d77a775948 | |||
8cc82bab39 | |||
f9d5e33ddd | |||
![]() |
f83418d93e | ||
fbf14fb0cb | |||
fb1c3e00f4 | |||
![]() |
d8332171e9 | ||
c24cc9bf0b | |||
9f57c75acf | |||
53b12641d1 | |||
![]() |
5c5c8825dc | ||
3a261ac3fc | |||
04514435de | |||
07303020fc | |||
feaf7a15cf | |||
29dda5066f | |||
1de53ef7e6 | |||
4793dbe9c3 | |||
918ea34af2 | |||
2db8184cd8 | |||
0e964b3c8c | |||
1b9296ff6c | |||
6bf136c199 | |||
b529f77264 | |||
bf9519dcdc | |||
4ba687738b | |||
8427f6fe46 | |||
efa6bc3e70 | |||
da33e9b12d | |||
![]() |
265127c1a7 | ||
2b30acfc1d | |||
7fbc38ef29 | |||
e5070e991a | |||
625552c441 | |||
78c95c94f6 | |||
488e20bf55 | |||
25d6281b3e | |||
1676e50b3a | |||
8049e3c14a | |||
93a30efd86 | |||
83fb121f36 | |||
afc97b757b | |||
68905cbf41 | |||
3fff667f13 | |||
980aec1d9b |
@@ -20,7 +20,7 @@ RUN echo 'deb http://deb.debian.org/debian bullseye-backports main' >> /etc/apt/
|
||||
|
||||
RUN apt-get update
|
||||
RUN apt-get -y install etcd qemu-system-x86 qemu-block-extra qemu-utils fio libasan5 \
|
||||
liburing1 liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev
|
||||
libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev
|
||||
RUN apt-get -y build-dep fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
|
||||
RUN apt-get update && apt-get -y install jq lp-solve sudo nfs-common fdisk parted
|
||||
RUN apt-get --download-only source fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
|
||||
|
@@ -144,6 +144,24 @@ jobs:
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_change_pg_count_online:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: /root/vitastor/tests/test_change_pg_count_online.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_change_pg_size:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
@@ -792,6 +810,24 @@ jobs:
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_reweight_half:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: /root/vitastor/tests/test_reweight_half.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
test_heal_csum_32k_dmj:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
|
3
.gitmodules
vendored
3
.gitmodules
vendored
@@ -4,3 +4,6 @@
|
||||
[submodule "json11"]
|
||||
path = json11
|
||||
url = ../json11.git
|
||||
[submodule "emhash"]
|
||||
path = emhash
|
||||
url = ../emhash.git
|
||||
|
@@ -2,6 +2,19 @@ cmake_minimum_required(VERSION 2.8.12)
|
||||
|
||||
project(vitastor)
|
||||
|
||||
set(VITASTOR_VERSION "2.2.2")
|
||||
set(VITASTOR_VERSION "2.3.0")
|
||||
|
||||
include(CTest)
|
||||
|
||||
add_custom_target(build_tests)
|
||||
add_custom_target(test
|
||||
COMMAND
|
||||
echo leak:tcmalloc > ${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt &&
|
||||
env LSAN_OPTIONS=suppressions=${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt ${CMAKE_CTEST_COMMAND}
|
||||
)
|
||||
# make -j16 -C ../../build test_heap && ../../build/src/test/test_heap
|
||||
# make -j16 -C ../../build test_heap && rm -f $(find ../../build -name '*.gcda') && ctest -V -T test -T coverage -R heap --test-dir ../../build && (cd ../../build; gcovr -f ../src --html --html-nested -o coverage/index.html; cd ../src/test)
|
||||
# make -j16 -C ../../build test_blockstore && rm -f $(find ../../build -name '*.gcda') && ctest -V -T test -T coverage -R blockstore --test-dir ../../build && (cd ../../build; gcovr -f ../src --html --html-nested -o coverage/index.html; cd ../src/test)
|
||||
# kcov --include-path=../../../src ../../kcov ./test_blockstore
|
||||
add_dependencies(test build_tests)
|
||||
add_subdirectory(src)
|
||||
|
@@ -19,7 +19,7 @@ Vitastor нацелен в первую очередь на SSD и SSD+HDD кл
|
||||
TCP и RDMA и на хорошем железе может достигать задержки 4 КБ чтения и записи на уровне ~0.1 мс,
|
||||
что примерно в 10 раз быстрее, чем Ceph и другие популярные программные СХД.
|
||||
|
||||
Vitastor поддерживает QEMU-драйвер, протоколы NBD и NFS, драйверы OpenStack, OpenNebula, Proxmox, Kubernetes.
|
||||
Vitastor поддерживает QEMU-драйвер, протоколы UBLK, NBD и NFS, драйверы OpenStack, OpenNebula, Proxmox, Kubernetes.
|
||||
Другие драйверы могут также быть легко реализованы.
|
||||
|
||||
Подробности смотрите в документации по ссылкам. Можете начать отсюда: [Быстрый старт](docs/intro/quickstart.ru.md).
|
||||
@@ -64,8 +64,9 @@ Vitastor поддерживает QEMU-драйвер, протоколы NBD и
|
||||
- [vitastor-cli](docs/usage/cli.ru.md) (консольный интерфейс)
|
||||
- [vitastor-disk](docs/usage/disk.ru.md) (управление дисками)
|
||||
- [fio](docs/usage/fio.ru.md) для тестов производительности
|
||||
- [NBD](docs/usage/nbd.ru.md) для монтирования ядром
|
||||
- [QEMU и qemu-img](docs/usage/qemu.ru.md)
|
||||
- [UBLK](docs/usage/ublk.ru.md) для монтирования ядром
|
||||
- [NBD](docs/usage/nbd.ru.md) - старый интерфейс для монтирования ядром
|
||||
- [QEMU, qemu-img и VDUSE](docs/usage/qemu.ru.md)
|
||||
- [NFS](docs/usage/nfs.ru.md) кластерная файловая система и псевдо-ФС прокси
|
||||
- [Администрирование](docs/usage/admin.ru.md)
|
||||
- Производительность
|
||||
|
@@ -19,7 +19,7 @@ supports TCP and RDMA and may achieve 4 KB read and write latency as low as ~0.1
|
||||
with proper hardware which is ~10 times faster than other popular SDS's like Ceph
|
||||
or internal systems of public clouds.
|
||||
|
||||
Vitastor supports QEMU, NBD, NFS protocols, OpenStack, OpenNebula, Proxmox, Kubernetes drivers.
|
||||
Vitastor supports QEMU, UBLK, NBD, NFS protocols, OpenStack, OpenNebula, Proxmox, Kubernetes drivers.
|
||||
More drivers may be created easily.
|
||||
|
||||
Read more details in the documentation. You can start from here: [Quick Start](docs/intro/quickstart.en.md).
|
||||
@@ -64,8 +64,9 @@ Read more details in the documentation. You can start from here: [Quick Start](d
|
||||
- [vitastor-cli](docs/usage/cli.en.md) (command-line interface)
|
||||
- [vitastor-disk](docs/usage/disk.en.md) (disk management tool)
|
||||
- [fio](docs/usage/fio.en.md) for benchmarks
|
||||
- [NBD](docs/usage/nbd.en.md) for kernel mounts
|
||||
- [QEMU and qemu-img](docs/usage/qemu.en.md)
|
||||
- [UBLK](docs/usage/ublk.en.md) for kernel mounts
|
||||
- [NBD](docs/usage/nbd.en.md) - old interface for kernel mounts
|
||||
- [QEMU, qemu-img and VDUSE](docs/usage/qemu.en.md)
|
||||
- [NFS](docs/usage/nfs.en.md) clustered file system and pseudo-FS proxy
|
||||
- [Administration](docs/usage/admin.en.md)
|
||||
- Performance
|
||||
|
@@ -36,7 +36,7 @@ RUN (echo deb http://vitastor.io/debian bookworm main > /etc/apt/sources.list.d/
|
||||
((echo 'Package: *'; echo 'Pin: origin "vitastor.io"'; echo 'Pin-Priority: 1000') > /etc/apt/preferences.d/vitastor.pref) && \
|
||||
wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg && \
|
||||
apt-get update && \
|
||||
apt-get install -y vitastor-client && \
|
||||
apt-get install -y vitastor-client ibverbs-providers && \
|
||||
wget https://vitastor.io/archive/qemu/qemu-bookworm-9.2.2%2Bds-1%2Bvitastor4/qemu-utils_9.2.2%2Bds-1%2Bvitastor4_amd64.deb && \
|
||||
wget https://vitastor.io/archive/qemu/qemu-bookworm-9.2.2%2Bds-1%2Bvitastor4/qemu-block-extra_9.2.2%2Bds-1%2Bvitastor4_amd64.deb && \
|
||||
dpkg -x qemu-utils*.deb tmp1 && \
|
||||
|
@@ -1,4 +1,4 @@
|
||||
VITASTOR_VERSION ?= v2.2.2
|
||||
VITASTOR_VERSION ?= v2.3.0
|
||||
|
||||
all: build push
|
||||
|
||||
|
@@ -49,7 +49,7 @@ spec:
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
allowPrivilegeEscalation: true
|
||||
image: vitalif/vitastor-csi:v2.2.2
|
||||
image: vitalif/vitastor-csi:v2.3.0
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -121,7 +121,7 @@ spec:
|
||||
privileged: true
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
image: vitalif/vitastor-csi:v2.2.2
|
||||
image: vitalif/vitastor-csi:v2.3.0
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -5,7 +5,7 @@ package vitastor
|
||||
|
||||
const (
|
||||
vitastorCSIDriverName = "csi.vitastor.io"
|
||||
vitastorCSIDriverVersion = "2.2.2"
|
||||
vitastorCSIDriverVersion = "2.3.0"
|
||||
)
|
||||
|
||||
// Config struct fills the parameters of request or user input
|
||||
|
7
debian/build-vitastor-bookworm.sh
vendored
7
debian/build-vitastor-bookworm.sh
vendored
@@ -1,7 +1,4 @@
|
||||
#!/bin/bash
|
||||
|
||||
cat < vitastor.Dockerfile > ../Dockerfile
|
||||
cd ..
|
||||
mkdir -p packages
|
||||
sudo podman build --build-arg DISTRO=debian --build-arg REL=bookworm -v `pwd`/packages:/root/packages -f Dockerfile .
|
||||
rm Dockerfile
|
||||
docker build --build-arg DISTRO=debian --build-arg REL=bookworm -t vitastor-buildenv:bookworm -f vitastor-buildenv.Dockerfile .
|
||||
docker run -i --rm -e REL=bookworm -v `dirname $0`/../:/root/vitastor vitastor-buildenv:bookworm /root/vitastor/debian/vitastor-build.sh
|
||||
|
7
debian/build-vitastor-bullseye.sh
vendored
7
debian/build-vitastor-bullseye.sh
vendored
@@ -1,7 +1,4 @@
|
||||
#!/bin/bash
|
||||
|
||||
cat < vitastor.Dockerfile > ../Dockerfile
|
||||
cd ..
|
||||
mkdir -p packages
|
||||
sudo podman build --build-arg DISTRO=debian --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f Dockerfile .
|
||||
rm Dockerfile
|
||||
docker build --build-arg DISTRO=debian --build-arg REL=bullseye -t vitastor-buildenv:bullseye -f vitastor-buildenv.Dockerfile .
|
||||
docker run -i --rm -e REL=bullseye -v `dirname $0`/../:/root/vitastor vitastor-buildenv:bullseye /root/vitastor/debian/vitastor-build.sh
|
||||
|
7
debian/build-vitastor-buster.sh
vendored
7
debian/build-vitastor-buster.sh
vendored
@@ -1,7 +1,4 @@
|
||||
#!/bin/bash
|
||||
|
||||
cat < vitastor.Dockerfile > ../Dockerfile
|
||||
cd ..
|
||||
mkdir -p packages
|
||||
sudo podman build --build-arg DISTRO=debian --build-arg REL=buster -v `pwd`/packages:/root/packages -f Dockerfile .
|
||||
rm Dockerfile
|
||||
docker build --build-arg DISTRO=debian --build-arg REL=buster -t vitastor-buildenv:buster -f vitastor-buildenv.Dockerfile .
|
||||
docker run -i --rm -e REL=buster -v `dirname $0`/../:/root/vitastor vitastor-buildenv:buster /root/vitastor/debian/vitastor-build.sh
|
||||
|
4
debian/build-vitastor-trixie.sh
vendored
Executable file
4
debian/build-vitastor-trixie.sh
vendored
Executable file
@@ -0,0 +1,4 @@
|
||||
#!/bin/bash
|
||||
|
||||
docker build --build-arg DISTRO=debian --build-arg REL=trixie -t vitastor-buildenv:trixie -f vitastor-buildenv.Dockerfile .
|
||||
docker run -i --rm -e REL=trixie -v `dirname $0`/../:/root/vitastor vitastor-buildenv:trixie /root/vitastor/debian/vitastor-build.sh
|
8
debian/build-vitastor-ubuntu-jammy.sh
vendored
8
debian/build-vitastor-ubuntu-jammy.sh
vendored
@@ -1,7 +1,5 @@
|
||||
#!/bin/bash
|
||||
# Ubuntu 22.04 Jammy Jellyfish
|
||||
|
||||
cat < vitastor.Dockerfile > ../Dockerfile
|
||||
cd ..
|
||||
mkdir -p packages
|
||||
sudo podman build --build-arg DISTRO=ubuntu --build-arg REL=jammy -v `pwd`/packages:/root/packages -f Dockerfile .
|
||||
rm Dockerfile
|
||||
docker build --build-arg DISTRO=ubuntu --build-arg REL=jammy -t vitastor-buildenv:jammy -f vitastor-buildenv.Dockerfile .
|
||||
docker run -i --rm -e REL=jammy -v `dirname $0`/../:/root/vitastor vitastor-buildenv:jammy /root/vitastor/debian/vitastor-build.sh
|
||||
|
5
debian/build-vitastor-ubuntu-noble.sh
vendored
Executable file
5
debian/build-vitastor-ubuntu-noble.sh
vendored
Executable file
@@ -0,0 +1,5 @@
|
||||
#!/bin/bash
|
||||
# 24.04 Noble Numbat
|
||||
|
||||
docker build --build-arg DISTRO=ubuntu --build-arg REL=noble -t vitastor-buildenv:noble -f vitastor-buildenv.Dockerfile .
|
||||
docker run -i --rm -e REL=noble -v `dirname $0`/../:/root/vitastor vitastor-buildenv:noble /root/vitastor/debian/vitastor-build.sh
|
2
debian/changelog
vendored
2
debian/changelog
vendored
@@ -1,4 +1,4 @@
|
||||
vitastor (2.2.2-1) unstable; urgency=medium
|
||||
vitastor (2.3.0-1) unstable; urgency=medium
|
||||
|
||||
* Bugfixes
|
||||
|
||||
|
4
debian/control
vendored
4
debian/control
vendored
@@ -2,9 +2,9 @@ Source: vitastor
|
||||
Section: admin
|
||||
Priority: optional
|
||||
Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
|
||||
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8),
|
||||
Build-Depends: debhelper, g++ (>= 8), libstdc++6 (>= 8),
|
||||
linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev,
|
||||
libibverbs-dev, libisal-dev, cmake, pkg-config, libnl-3-dev, libnl-genl-3-dev,
|
||||
libibverbs-dev, librdmacm-dev, libisal-dev, cmake, pkg-config, libnl-3-dev, libnl-genl-3-dev,
|
||||
node-bindings <!nocheck>, node-gyp, node-nan
|
||||
Standards-Version: 4.5.0
|
||||
Homepage: https://vitastor.io/
|
||||
|
2
debian/patched-qemu.Dockerfile
vendored
2
debian/patched-qemu.Dockerfile
vendored
@@ -26,7 +26,7 @@ RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" -o "$REL" = "bookworm" ]; then
|
||||
echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
|
||||
|
||||
RUN apt-get update
|
||||
RUN DEBIAN_FRONTEND=noninteractive TZ=Europe/Moscow apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts
|
||||
RUN DEBIAN_FRONTEND=noninteractive TZ=Europe/Moscow apt-get -y install fio libgoogle-perftools-dev devscripts
|
||||
RUN DEBIAN_FRONTEND=noninteractive TZ=Europe/Moscow apt-get -y build-dep qemu
|
||||
# To build a custom version
|
||||
#RUN cp /root/packages/qemu-orig/* /root
|
||||
|
60
debian/vitastor-build.sh
vendored
Executable file
60
debian/vitastor-build.sh
vendored
Executable file
@@ -0,0 +1,60 @@
|
||||
#!/bin/bash
|
||||
# To be ran inside buildenv docker
|
||||
|
||||
set -e -x
|
||||
|
||||
[ -e /usr/lib/x86_64-linux-gnu/pkgconfig/libisal.pc ] || cp /root/vitastor/debian/libisal.pc /usr/lib/x86_64-linux-gnu/pkgconfig
|
||||
|
||||
mkdir -p /root/fio-build/
|
||||
cd /root/fio-build/
|
||||
rm -rf /root/fio-build/*
|
||||
dpkg-source -x /root/fio*.dsc
|
||||
|
||||
FULLVER=`head -n1 /root/vitastor/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'`
|
||||
VER=${FULLVER%%-*}
|
||||
rm -rf /root/vitastor-$VER
|
||||
mkdir /root/vitastor-$VER
|
||||
cd /root/vitastor
|
||||
cp -a $(ls | grep -v packages) /root/vitastor-$VER
|
||||
|
||||
rm -rf /root/vitastor/packages/vitastor-$REL
|
||||
mkdir -p /root/vitastor/packages/vitastor-$REL
|
||||
mv /root/vitastor-$VER /root/vitastor/packages/vitastor-$REL/
|
||||
|
||||
cd /root/vitastor/packages/vitastor-$REL/vitastor-$VER
|
||||
|
||||
rm -rf fio
|
||||
ln -s /root/fio-build/fio-*/ ./fio
|
||||
FIO=`head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'`
|
||||
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h
|
||||
sh copy-fio-includes.sh
|
||||
rm fio
|
||||
mkdir -p a b debian/patches
|
||||
mv fio-copy b/fio
|
||||
diff -NaurpbB a b > debian/patches/fio-headers.patch || true
|
||||
echo fio-headers.patch >> debian/patches/series
|
||||
rm -rf a b
|
||||
|
||||
echo "dep:fio=$FIO" > debian/fio_version
|
||||
|
||||
cd /root/vitastor/packages/vitastor-$REL/vitastor-$VER
|
||||
mkdir mon/node_modules
|
||||
cd mon/node_modules
|
||||
curl -s https://git.yourcmc.ru/vitalif/antietcd/archive/master.tar.gz | tar -zx
|
||||
curl -s https://git.yourcmc.ru/vitalif/tinyraft/archive/master.tar.gz | tar -zx
|
||||
|
||||
cd /root/vitastor/packages/vitastor-$REL
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_$VER.orig.tar.xz vitastor-$VER
|
||||
cd vitastor-$VER
|
||||
DEBEMAIL="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$FULLVER""$REL" "Rebuild for $REL"
|
||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa
|
||||
rm -rf /root/vitastor/packages/vitastor-$REL/vitastor-*/
|
||||
|
||||
# Why does ubuntu rename debug packages to *.ddeb?
|
||||
cd /root/vitastor/packages/vitastor-$REL
|
||||
if ls *.ddeb >/dev/null; then
|
||||
perl -i -pe 's/\.ddeb/.deb/' *.buildinfo *.changes
|
||||
for i in *.ddeb; do
|
||||
mv $i ${i%%.ddeb}.deb
|
||||
done
|
||||
fi
|
31
debian/vitastor-buildenv.Dockerfile
vendored
Normal file
31
debian/vitastor-buildenv.Dockerfile
vendored
Normal file
@@ -0,0 +1,31 @@
|
||||
# Build environment for building Vitastor packages for Debian inside a container
|
||||
# cd ..
|
||||
# docker build --build-arg DISTRO=debian --build-arg REL=bullseye -f debian/vitastor.Dockerfile -t vitastor-buildenv:bullseye .
|
||||
# docker run --rm -e REL=bullseye -v ./:/root/vitastor /root/vitastor/debian/vitastor-build.sh
|
||||
|
||||
ARG DISTRO=debian
|
||||
ARG REL=
|
||||
FROM $DISTRO:$REL
|
||||
ARG DISTRO=debian
|
||||
ARG REL=
|
||||
|
||||
WORKDIR /root
|
||||
|
||||
RUN set -e -x; \
|
||||
if [ "$REL" = "buster" ]; then \
|
||||
perl -i -pe 's/deb.debian.org/archive.debian.org/' /etc/apt/sources.list; \
|
||||
apt-get update; \
|
||||
apt-get -y install wget; \
|
||||
wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg; \
|
||||
echo "deb https://vitastor.io/debian $REL main" >> /etc/apt/sources.list; \
|
||||
fi; \
|
||||
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
|
||||
perl -i -pe 's/Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/*.sources || true; \
|
||||
echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
|
||||
echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get -y install fio libgoogle-perftools-dev devscripts libjerasure-dev cmake \
|
||||
libibverbs-dev librdmacm-dev libisal-dev libnl-3-dev libnl-genl-3-dev curl nodejs npm node-nan node-bindings && \
|
||||
apt-get -y build-dep fio && \
|
||||
apt-get --download-only source fio
|
1
debian/vitastor-client.install
vendored
1
debian/vitastor-client.install
vendored
@@ -2,6 +2,7 @@ usr/bin/vita
|
||||
usr/bin/vitastor-cli
|
||||
usr/bin/vitastor-rm
|
||||
usr/bin/vitastor-nbd
|
||||
usr/bin/vitastor-ublk
|
||||
usr/bin/vitastor-nfs
|
||||
usr/bin/vitastor-kv
|
||||
usr/bin/vitastor-kv-stress
|
||||
|
65
debian/vitastor.Dockerfile
vendored
65
debian/vitastor.Dockerfile
vendored
@@ -1,65 +0,0 @@
|
||||
# Build Vitastor packages for Debian inside a container
|
||||
# cd ..; podman build --build-arg DISTRO=debian --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/vitastor.Dockerfile .
|
||||
|
||||
ARG DISTRO=debian
|
||||
ARG REL=
|
||||
FROM $DISTRO:$REL
|
||||
ARG DISTRO=debian
|
||||
ARG REL=
|
||||
|
||||
WORKDIR /root
|
||||
|
||||
RUN set -e -x; \
|
||||
if [ "$REL" = "buster" ]; then \
|
||||
apt-get update; \
|
||||
apt-get -y install wget; \
|
||||
wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg; \
|
||||
echo "deb https://vitastor.io/debian $REL main" >> /etc/apt/sources.list; \
|
||||
fi; \
|
||||
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
|
||||
perl -i -pe 's/Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/debian.sources || true; \
|
||||
echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
|
||||
echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake \
|
||||
libibverbs-dev librdmacm-dev libisal-dev libnl-3-dev libnl-genl-3-dev curl nodejs npm node-nan node-bindings && \
|
||||
apt-get -y build-dep fio && \
|
||||
apt-get --download-only source fio
|
||||
|
||||
ADD . /root/vitastor
|
||||
RUN set -e -x; \
|
||||
[ -e /usr/lib/x86_64-linux-gnu/pkgconfig/libisal.pc ] || cp /root/vitastor/debian/libisal.pc /usr/lib/x86_64-linux-gnu/pkgconfig; \
|
||||
mkdir -p /root/fio-build/; \
|
||||
cd /root/fio-build/; \
|
||||
rm -rf /root/fio-build/*; \
|
||||
dpkg-source -x /root/fio*.dsc; \
|
||||
mkdir -p /root/packages/vitastor-$REL; \
|
||||
rm -rf /root/packages/vitastor-$REL/*; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
FULLVER=$(head -n1 /root/vitastor/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
VER=${FULLVER%%-*}; \
|
||||
cp -r /root/vitastor vitastor-$VER; \
|
||||
cd vitastor-$VER; \
|
||||
ln -s /root/fio-build/fio-*/ ./fio; \
|
||||
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
||||
sh copy-fio-includes.sh; \
|
||||
rm fio; \
|
||||
mkdir -p a b debian/patches; \
|
||||
mv fio-copy b/fio; \
|
||||
diff -NaurpbB a b > debian/patches/fio-headers.patch || true; \
|
||||
echo fio-headers.patch >> debian/patches/series; \
|
||||
rm -rf a b; \
|
||||
echo "dep:fio=$FIO" > debian/fio_version; \
|
||||
cd /root/packages/vitastor-$REL/vitastor-$VER; \
|
||||
mkdir mon/node_modules; \
|
||||
cd mon/node_modules; \
|
||||
curl -s https://git.yourcmc.ru/vitalif/antietcd/archive/master.tar.gz | tar -zx; \
|
||||
curl -s https://git.yourcmc.ru/vitalif/tinyraft/archive/master.tar.gz | tar -zx; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_$VER.orig.tar.xz vitastor-$VER; \
|
||||
cd vitastor-$VER; \
|
||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$FULLVER""$REL" "Rebuild for $REL"; \
|
||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||
rm -rf /root/packages/vitastor-$REL/vitastor-*/
|
@@ -3,7 +3,7 @@
|
||||
FROM debian:bookworm
|
||||
|
||||
ADD etc/apt /etc/apt/
|
||||
RUN apt-get update && apt-get -y install vitastor udev systemd qemu-system-x86 qemu-system-common qemu-block-extra qemu-utils jq nfs-common && apt-get clean
|
||||
RUN apt-get update && apt-get -y install vitastor ibverbs-providers udev systemd qemu-system-x86 qemu-system-common qemu-block-extra qemu-utils jq nfs-common && apt-get clean
|
||||
ADD sleep.sh /usr/bin/
|
||||
ADD install.sh /usr/bin/
|
||||
ADD scripts /opt/scripts/
|
||||
|
@@ -1,4 +1,4 @@
|
||||
VITASTOR_VERSION ?= v2.2.2
|
||||
VITASTOR_VERSION ?= v2.3.0
|
||||
|
||||
all: build push
|
||||
|
||||
|
3
docker/etc/apt/preferences
Normal file
3
docker/etc/apt/preferences
Normal file
@@ -0,0 +1,3 @@
|
||||
Package: *
|
||||
Pin: release n=bookworm-backports
|
||||
Pin-Priority: 500
|
@@ -4,7 +4,7 @@
|
||||
#
|
||||
|
||||
# Desired Vitastor version
|
||||
VITASTOR_VERSION=v2.2.2
|
||||
VITASTOR_VERSION=v2.3.0
|
||||
|
||||
# Additional arguments for all containers
|
||||
# For example, you may want to specify a custom logging driver here
|
||||
|
@@ -25,6 +25,9 @@ affect their interaction with the cluster.
|
||||
- [nbd_max_part](#nbd_max_part)
|
||||
- [osd_nearfull_ratio](#osd_nearfull_ratio)
|
||||
- [hostname](#hostname)
|
||||
- [ublk_queue_depth](#ublk_queue_depth)
|
||||
- [ublk_max_io_size](#ublk_max_io_size)
|
||||
- [qemu_file_mirror_path](#qemu_file_mirror_path)
|
||||
|
||||
## client_iothread_count
|
||||
|
||||
@@ -225,3 +228,28 @@ without destroying and recreating OSDs.
|
||||
Clients use host name to find their distance to OSDs when [localized reads](pool.en.md#local_reads)
|
||||
are enabled. By default, standard [gethostname](https://man7.org/linux/man-pages/man2/gethostname.2.html)
|
||||
function is used to determine host name, but you can also override it with this parameter.
|
||||
|
||||
## ublk_queue_depth
|
||||
|
||||
- Type: integer
|
||||
- Default: 256
|
||||
|
||||
Default queue depth for [Vitastor ublk servers](../usage/ublk.en.md).
|
||||
|
||||
## ublk_max_io_size
|
||||
|
||||
- Type: integer
|
||||
|
||||
Default maximum I/O size for Vitastor [ublk servers](../usage/ublk.en.md).
|
||||
The largest of 1 MB and pool block size multiplied by EC data chunk count is used if not specified.
|
||||
|
||||
## qemu_file_mirror_path
|
||||
|
||||
- Type: string
|
||||
|
||||
When set to an FS directory path (for example, `/mnt/vitastor/`), `qemu-img info` and similar
|
||||
QAPI commands return the name of the image inside this directory instead of normal
|
||||
`vitastor://?image=abc` URI as `filename`.
|
||||
|
||||
This allows to then mount this path using [vitastor-nfs](../usage/nfs.en.md) and trick
|
||||
third-party systems like Veeam which rely on `filename` in the image info but don't support Vitastor.
|
||||
|
@@ -25,6 +25,9 @@
|
||||
- [nbd_max_part](#nbd_max_part)
|
||||
- [osd_nearfull_ratio](#osd_nearfull_ratio)
|
||||
- [hostname](#hostname)
|
||||
- [ublk_queue_depth](#ublk_queue_depth)
|
||||
- [ublk_max_io_size](#ublk_max_io_size)
|
||||
- [qemu_file_mirror_path](#qemu_file_mirror_path)
|
||||
|
||||
## client_iothread_count
|
||||
|
||||
@@ -230,3 +233,30 @@ RDMA и хотите повысить пиковую производитель
|
||||
[локальные чтения](pool.ru.md#local_reads). По умолчанию для определения имени
|
||||
хоста используется стандартная функция [gethostname](https://man7.org/linux/man-pages/man2/gethostname.2.html),
|
||||
но вы также можете задать имя хоста вручную данным параметром.
|
||||
|
||||
## ublk_queue_depth
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 256
|
||||
|
||||
Глубина очереди по умолчанию для [ublk-серверов Vitastor](../usage/ublk.ru.md).
|
||||
|
||||
## ublk_max_io_size
|
||||
|
||||
- Тип: целое число
|
||||
|
||||
Максимальный размер запроса ввода-вывода для [ublk-серверов Vitastor](../usage/ublk.ru.md).
|
||||
Если не задан, используется максимум из 1 МБ и размера блока пула, умноженного на число частей
|
||||
данных EC-пула.
|
||||
|
||||
## qemu_file_mirror_path
|
||||
|
||||
- Тип: строка
|
||||
|
||||
Если установить эту опцию равной пути к каталогу в ФС, команда `qemu-img info` и подобные
|
||||
команды QAPI будут возвращать в поле `filename` имя образа внутри заданного каталога вместо
|
||||
обычного адреса типа `vitastor://?image=abc`.
|
||||
|
||||
Это позволяет смонтировать этот путь с помощью [vitastor-nfs](../usage/nfs.ru.md) и обмануть
|
||||
сторонние системы типа Veeam, которые полагаются на поле `filename` в информации об образе QEMU,
|
||||
но не поддерживают Vitastor.
|
||||
|
@@ -74,7 +74,7 @@ Consider `use_rdmacm` for such networks.
|
||||
## use_rdmacm
|
||||
|
||||
- Type: boolean
|
||||
- Default: true
|
||||
- Default: false
|
||||
|
||||
Use an alternative implementation of RDMA through RDMA-CM (Connection
|
||||
Manager). Works with all RDMA networks: Infiniband, iWARP and
|
||||
|
@@ -74,7 +74,7 @@ RDMA-устройства, но они не имеют соединения с
|
||||
## use_rdmacm
|
||||
|
||||
- Тип: булево (да/нет)
|
||||
- Значение по умолчанию: true
|
||||
- Значение по умолчанию: false
|
||||
|
||||
Использовать альтернативную реализацию RDMA на основе RDMA-CM (Connection
|
||||
Manager). Работает со всеми типами RDMA-сетей: Infiniband, iWARP и
|
||||
|
@@ -491,7 +491,7 @@ Can be used to slow down scrubbing if it affects user load too much.
|
||||
## scrub_list_limit
|
||||
|
||||
- Type: integer
|
||||
- Default: 1000
|
||||
- Default: 262144
|
||||
- Can be changed online: yes
|
||||
|
||||
Number of objects to list in one listing operation during scrub.
|
||||
|
@@ -514,7 +514,7 @@ fsync небезопасным даже с режимом "directsync".
|
||||
## scrub_list_limit
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 1000
|
||||
- Значение по умолчанию: 262144
|
||||
- Можно менять на лету: да
|
||||
|
||||
Размер загружаемых за одну операцию списков объектов в процессе фоновой
|
||||
|
@@ -283,3 +283,36 @@
|
||||
[локальные чтения](pool.ru.md#local_reads). По умолчанию для определения имени
|
||||
хоста используется стандартная функция [gethostname](https://man7.org/linux/man-pages/man2/gethostname.2.html),
|
||||
но вы также можете задать имя хоста вручную данным параметром.
|
||||
- name: ublk_queue_depth
|
||||
type: int
|
||||
default: 256
|
||||
online: false
|
||||
info: Default queue depth for [Vitastor ublk servers](../usage/ublk.en.md).
|
||||
info_ru: Глубина очереди по умолчанию для [ublk-серверов Vitastor](../usage/ublk.ru.md).
|
||||
- name: ublk_max_io_size
|
||||
type: int
|
||||
online: false
|
||||
info: |
|
||||
Default maximum I/O size for Vitastor [ublk servers](../usage/ublk.en.md).
|
||||
The largest of 1 MB and pool block size multiplied by EC data chunk count is used if not specified.
|
||||
info_ru: |
|
||||
Максимальный размер запроса ввода-вывода для [ublk-серверов Vitastor](../usage/ublk.ru.md).
|
||||
Если не задан, используется максимум из 1 МБ и размера блока пула, умноженного на число частей
|
||||
данных EC-пула.
|
||||
- name: qemu_file_mirror_path
|
||||
type: string
|
||||
info: |
|
||||
When set to an FS directory path (for example, `/mnt/vitastor/`), `qemu-img info` and similar
|
||||
QAPI commands return the name of the image inside this directory instead of normal
|
||||
`vitastor://?image=abc` URI as `filename`.
|
||||
|
||||
This allows to then mount this path using [vitastor-nfs](../usage/nfs.en.md) and trick
|
||||
third-party systems like Veeam which rely on `filename` in the image info but don't support Vitastor.
|
||||
info_ru: |
|
||||
Если установить эту опцию равной пути к каталогу в ФС, команда `qemu-img info` и подобные
|
||||
команды QAPI будут возвращать в поле `filename` имя образа внутри заданного каталога вместо
|
||||
обычного адреса типа `vitastor://?image=abc`.
|
||||
|
||||
Это позволяет смонтировать этот путь с помощью [vitastor-nfs](../usage/nfs.ru.md) и обмануть
|
||||
сторонние системы типа Veeam, которые полагаются на поле `filename` в информации об образе QEMU,
|
||||
но не поддерживают Vitastor.
|
||||
|
@@ -24,6 +24,8 @@
|
||||
|
||||
{{../../installation/kubernetes.en.md}}
|
||||
|
||||
{{../../installation/s3.en.md}}
|
||||
|
||||
{{../../installation/source.en.md}}
|
||||
|
||||
{{../../config.en.md|indent=1}}
|
||||
@@ -54,6 +56,8 @@
|
||||
|
||||
{{../../usage/fio.en.md}}
|
||||
|
||||
{{../../usage/ublk.en.md}}
|
||||
|
||||
{{../../usage/nbd.en.md}}
|
||||
|
||||
{{../../usage/qemu.en.md}}
|
||||
|
@@ -26,6 +26,8 @@
|
||||
|
||||
{{../../installation/source.ru.md}}
|
||||
|
||||
{{../../installation/s3.ru.md}}
|
||||
|
||||
{{../../config.ru.md|indent=1}}
|
||||
|
||||
{{../../config/common.ru.md|indent=2}}
|
||||
@@ -54,6 +56,8 @@
|
||||
|
||||
{{../../usage/fio.ru.md}}
|
||||
|
||||
{{../../usage/ublk.ru.md}}
|
||||
|
||||
{{../../usage/nbd.ru.md}}
|
||||
|
||||
{{../../usage/qemu.ru.md}}
|
||||
|
@@ -51,7 +51,7 @@
|
||||
Рассмотрите включение `use_rdmacm` для таких сетей.
|
||||
- name: use_rdmacm
|
||||
type: bool
|
||||
default: true
|
||||
default: false
|
||||
info: |
|
||||
Use an alternative implementation of RDMA through RDMA-CM (Connection
|
||||
Manager). Works with all RDMA networks: Infiniband, iWARP and
|
||||
|
@@ -566,7 +566,7 @@
|
||||
сильно влияет на пользовательскую нагрузку.
|
||||
- name: scrub_list_limit
|
||||
type: int
|
||||
default: 1000
|
||||
default: 262144
|
||||
online: true
|
||||
info: |
|
||||
Number of objects to list in one listing operation during scrub.
|
||||
|
@@ -26,9 +26,9 @@ at Vitastor Kubernetes operator: https://github.com/Antilles7227/vitastor-operat
|
||||
The instruction is very simple.
|
||||
|
||||
1. Download a Docker image of the desired version: \
|
||||
`docker pull vitastor:v2.2.2`
|
||||
`docker pull vitalif/vitastor:v2.3.0`
|
||||
2. Install scripts to the host system: \
|
||||
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:v2.2.2 install.sh`
|
||||
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitalif/vitastor:v2.3.0 install.sh`
|
||||
3. Reload udev rules: \
|
||||
`udevadm control --reload-rules`
|
||||
|
||||
|
@@ -25,9 +25,9 @@ Vitastor можно установить в Docker/Podman. При этом etcd,
|
||||
Инструкция по установке максимально простая.
|
||||
|
||||
1. Скачайте Docker-образ желаемой версии: \
|
||||
`docker pull vitastor:v2.2.2`
|
||||
`docker pull vitalif/vitastor:v2.3.0`
|
||||
2. Установите скрипты в хост-систему командой: \
|
||||
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:v2.2.2 install.sh`
|
||||
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitalif/vitastor:v2.3.0 install.sh`
|
||||
3. Перезагрузите правила udev: \
|
||||
`udevadm control --reload-rules`
|
||||
|
||||
|
@@ -11,12 +11,20 @@
|
||||
- Trust Vitastor package signing key:
|
||||
`wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
|
||||
- Add Vitastor package repository to your /etc/apt/sources.list:
|
||||
- Debian 12 (Bookworm/Sid): `deb https://vitastor.io/debian bookworm main`
|
||||
- Debian 13 (Trixie/Sid): `deb https://vitastor.io/debian trixie main`
|
||||
- Debian 12 (Bookworm): `deb https://vitastor.io/debian bookworm main`
|
||||
- Debian 11 (Bullseye): `deb https://vitastor.io/debian bullseye main`
|
||||
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
|
||||
- Ubuntu 22.04 (Jammy): `deb https://vitastor.io/debian jammy main`
|
||||
- Ubuntu 24.04 (Noble): `deb https://vitastor.io/debian noble main`
|
||||
- Add `-oldstable` to bookworm/bullseye/buster in this line to install the last
|
||||
stable version from 0.9.x branch instead of 1.x
|
||||
- To always prefer vitastor-patched QEMU and Libvirt versions, add the following to `/etc/apt/preferences`:
|
||||
```
|
||||
Package: *
|
||||
Pin: origin "vitastor.io"
|
||||
Pin-Priority: 501
|
||||
```
|
||||
- Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`
|
||||
|
||||
## CentOS
|
||||
@@ -42,7 +50,6 @@
|
||||
recommended because io_uring is a relatively new technology and there is
|
||||
at least one bug which reproduces with io_uring and HP SmartArray
|
||||
controllers in 5.4
|
||||
- liburing 0.4 or newer
|
||||
- lp_solve
|
||||
- etcd 3.4.15 or newer. Earlier versions won't work because of various bugs,
|
||||
for example [#12402](https://github.com/etcd-io/etcd/pull/12402).
|
||||
|
@@ -11,12 +11,20 @@
|
||||
- Добавьте ключ репозитория Vitastor:
|
||||
`wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
|
||||
- Добавьте репозиторий Vitastor в /etc/apt/sources.list:
|
||||
- Debian 12 (Bookworm/Sid): `deb https://vitastor.io/debian bookworm main`
|
||||
- Debian 13 (Trixie/Sid): `deb https://vitastor.io/debian trixie main`
|
||||
- Debian 12 (Bookworm): `deb https://vitastor.io/debian bookworm main`
|
||||
- Debian 11 (Bullseye): `deb https://vitastor.io/debian bullseye main`
|
||||
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
|
||||
- Ubuntu 22.04 (Jammy): `deb https://vitastor.io/debian jammy main`
|
||||
- Ubuntu 24.04 (Noble): `deb https://vitastor.io/debian noble main`
|
||||
- Добавьте `-oldstable` к слову bookworm/bullseye/buster в этой строке, чтобы
|
||||
установить последнюю стабильную версию из ветки 0.9.x вместо 1.x
|
||||
- Чтобы всегда предпочитались версии пакетов QEMU и Libvirt с патчами Vitastor, добавьте в `/etc/apt/preferences`:
|
||||
```
|
||||
Package: *
|
||||
Pin: origin "vitastor.io"
|
||||
Pin-Priority: 501
|
||||
```
|
||||
- Установите пакеты: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`
|
||||
|
||||
## CentOS
|
||||
@@ -41,7 +49,6 @@
|
||||
- Ядро Linux 5.4 или новее, для поддержки io_uring. Рекомендуется даже 5.8,
|
||||
так как io_uring - относительно новый интерфейс и в версиях до 5.8 встречались
|
||||
некоторые баги, например, зависание с io_uring и контроллером HP SmartArray
|
||||
- liburing 0.4 или новее
|
||||
- lp_solve
|
||||
- etcd 3.4.15 или новее. Более старые версии не будут работать из-за разных багов,
|
||||
например, [#12402](https://github.com/etcd-io/etcd/pull/12402).
|
||||
|
@@ -9,7 +9,7 @@
|
||||
To enable Vitastor support in Proxmox Virtual Environment (6.4-8.x are supported):
|
||||
|
||||
- Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts:
|
||||
bookworm for 8.1+, pve8.0 for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
|
||||
trixie for 9.0+, bookworm for 8.1+, pve8.0 for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
|
||||
- Install vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* or see note) packages from Vitastor repository
|
||||
- Define storage in `/etc/pve/storage.cfg` (see below)
|
||||
- Block network access from VMs to Vitastor network (to OSDs and etcd),
|
||||
|
@@ -9,7 +9,7 @@
|
||||
Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-8.x):
|
||||
|
||||
- Добавьте соответствующий Debian-репозиторий Vitastor в sources.list на хостах Proxmox:
|
||||
bookworm для 8.1+, pve8.0 для 8.0, bullseye для 7.4, pve7.3 для 7.3, pve7.2 для 7.2, pve7.1 для 7.1, buster для 6.4
|
||||
trixie для 9.0+, bookworm для 8.1+, pve8.0 для 8.0, bullseye для 7.4, pve7.3 для 7.3, pve7.2 для 7.2, pve7.1 для 7.1, buster для 6.4
|
||||
- Установите пакеты vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* или см. сноску) из репозитория Vitastor
|
||||
- Определите тип хранилища в `/etc/pve/storage.cfg` (см. ниже)
|
||||
- Обязательно заблокируйте доступ от виртуальных машин к сети Vitastor (OSD и etcd), т.к. Vitastor (пока) не поддерживает аутентификацию
|
||||
|
@@ -15,7 +15,7 @@
|
||||
- gcc and g++ 8 or newer, clang 10 or newer, or other compiler with C++11 plus
|
||||
designated initializers support from C++20
|
||||
- CMake
|
||||
- liburing, jerasure headers and libraries
|
||||
- jerasure headers and libraries
|
||||
- ISA-L, libibverbs and librdmacm headers and libraries (optional)
|
||||
- tcmalloc (google-perftools-dev)
|
||||
|
||||
|
@@ -15,7 +15,7 @@
|
||||
- gcc и g++ >= 8, либо clang >= 10, либо другой компилятор с поддержкой C++11 плюс
|
||||
назначенных инициализаторов (designated initializers) из C++20
|
||||
- CMake
|
||||
- Заголовки и библиотеки liburing, jerasure
|
||||
- Заголовки и библиотеки jerasure
|
||||
- Опционально - заголовки и библиотеки ISA-L, libibverbs, librdmacm
|
||||
- tcmalloc (google-perftools-dev)
|
||||
|
||||
|
@@ -52,7 +52,7 @@
|
||||
- Generic user-space client library
|
||||
- [Native QEMU driver](../usage/qemu.en.md)
|
||||
- [Loadable fio engine for benchmarks](../usage/fio.en.md)
|
||||
- [NBD proxy for kernel mounts](../usage/nbd.en.md)
|
||||
- [UBLK](../usage/ublk.en.md) and [NBD](../usage/nbd.en.md) servers for kernel mounts
|
||||
- [Simplified NFS proxy for file-based image access emulation (suitable for VMWare)](../usage/nfs.en.md#pseudo-fs)
|
||||
|
||||
## Roadmap
|
||||
|
@@ -54,7 +54,7 @@
|
||||
- Общая пользовательская клиентская библиотека для работы с кластером
|
||||
- [Драйвер диска для QEMU](../usage/qemu.ru.md)
|
||||
- [Драйвер диска для утилиты тестирования производительности fio](../usage/fio.ru.md)
|
||||
- [NBD-прокси для монтирования образов ядром](../usage/nbd.ru.md) ("блочное устройство в режиме пользователя")
|
||||
- [UBLK](../usage/ublk.ru.md) и [NBD](../usage/nbd.ru.md) серверы для монтирования образов ядром ("блочное устройство в режиме пользователя")
|
||||
- [Упрощённая NFS-прокси для эмуляции файлового доступа к образам (подходит для VMWare)](../usage/nfs.ru.md#псевдо-фс)
|
||||
|
||||
## Планы развития
|
||||
|
@@ -73,6 +73,8 @@ Options (automatic mode):
|
||||
--max_other 10%
|
||||
Use disks for OSD data even if they already have non-Vitastor partitions,
|
||||
but only if these take up no more than this percent of disk space.
|
||||
--dry-run
|
||||
Check and print new OSD count for each disk but do not actually create them.
|
||||
```
|
||||
|
||||
Options (single-device mode):
|
||||
|
@@ -74,6 +74,8 @@ vitastor-disk - инструмент командной строки для уп
|
||||
--max_other 10%
|
||||
Использовать диски под данные OSD, даже если на них уже есть не-Vitastor-овые
|
||||
разделы, но только в случае, если они занимают не более данного процента диска.
|
||||
--dry-run
|
||||
Проверить и вывести число новых OSD для каждого диска, но не создавать их.
|
||||
```
|
||||
|
||||
Опции для режима одного OSD:
|
||||
|
@@ -89,6 +89,8 @@ POSIX features currently not implemented in VitastorFS:
|
||||
instead of actually allocated space
|
||||
- Access times (`atime`) are not tracked (like `-o noatime`)
|
||||
- Modification time (`mtime`) is updated lazily every second (like `-o lazytime`)
|
||||
- Permission enforcement is disabled by default (and Linux NFS client doesn't
|
||||
enforce them too). Use `--enforce 1` to enable it.
|
||||
|
||||
Other notable missing features which should be addressed in the future:
|
||||
- Inode ID reuse. Currently inode IDs always grow, the limit is 2^48 inodes, so
|
||||
@@ -258,4 +260,5 @@ Options:
|
||||
| `--nfspath <PATH>` | set NFS export path to \<PATH> (default is /) |
|
||||
| `--pidfile <FILE>` | write process ID to the specified file |
|
||||
| `--logfile <FILE>` | log to the specified file |
|
||||
| `--enforce 1` | enforce permissions at the server side (no by default) |
|
||||
| `--foreground 1` | stay in foreground, do not daemonize |
|
||||
|
@@ -91,6 +91,8 @@ JSON-формате :-). Для инспекции содержимого БД
|
||||
stat(2), так что `du` всегда показывает сумму размеров файлов, а не фактически занятое место
|
||||
- Времена доступа (`atime`) не отслеживаются (как будто ФС смонтирована с `-o noatime`)
|
||||
- Времена модификации (`mtime`) отслеживаются асинхронно (как будто ФС смонтирована с `-o lazytime`)
|
||||
- Привилегии доступа по умолчанию не проверяются сервером (клиент NFS Linux их также не проверяет).
|
||||
Чтобы включить проверки, используйте опцию `--enforce 1`.
|
||||
|
||||
Другие недостающие функции, которые нужно добавить в будущем:
|
||||
- Переиспользование номеров инодов. В текущей реализации номера инодов всё время
|
||||
@@ -270,4 +272,5 @@ VitastorFS из GPUDirect.
|
||||
| `--nfspath <PATH>` | установить путь NFS-экспорта в \<PATH> (по умолчанию /) |
|
||||
| `--pidfile <FILE>` | записать ID процесса в заданный файл |
|
||||
| `--logfile <FILE>` | записывать логи в заданный файл |
|
||||
| `--enforce 1` | проверять права доступа на стороне сервера (по умолчанию нет) |
|
||||
| `--foreground 1` | не уходить в фон после запуска |
|
||||
|
@@ -130,23 +130,16 @@ Linux kernel, starting with version 5.15, supports a new interface for attaching
|
||||
to the host - VDUSE (vDPA Device in Userspace). QEMU, starting with 7.2, has support for
|
||||
exporting QEMU block devices over this protocol using qemu-storage-daemon.
|
||||
|
||||
VDUSE is currently the best interface to attach Vitastor disks as kernel devices because:
|
||||
- It avoids data copies and thus achieves much better performance than [NBD](nbd.en.md)
|
||||
- It doesn't have NBD timeout problem - the device doesn't die if an operation executes for too long
|
||||
VDUSE advantages:
|
||||
|
||||
- VDUSE copies memory 1 time instead of 2, and is thus faster than [NBD](nbd.en.md) for linear read/write.
|
||||
- It doesn't have NBD timeout problem - the device doesn't die if an operation executes for too long.
|
||||
- It doesn't have hung device problem - if the userspace process dies it can be restarted (!)
|
||||
and block device will continue operation
|
||||
- It doesn't seem to have the device number limit
|
||||
and block device will continue operation (UBLK can do it too).
|
||||
- It doesn't seem to have the device number limit (UBLK also doesn't).
|
||||
|
||||
Example performance comparison:
|
||||
|
||||
| | direct fio | NBD | VDUSE |
|
||||
|----------------------|-------------|-------------|-------------|
|
||||
| linear write | 3.85 GB/s | 1.12 GB/s | 3.85 GB/s |
|
||||
| 4k random write Q128 | 240000 iops | 120000 iops | 178000 iops |
|
||||
| 4k random write Q1 | 9500 iops | 7620 iops | 7640 iops |
|
||||
| linear read | 4.3 GB/s | 1.8 GB/s | 2.85 GB/s |
|
||||
| 4k random read Q128 | 287000 iops | 140000 iops | 189000 iops |
|
||||
| 4k random read Q1 | 9600 iops | 7640 iops | 7780 iops |
|
||||
At the same time, VDUSE may be slower or faster than [UBLK](ublk.en.md) for linear read/write,
|
||||
and iops-wise it's sometimes even slower than NBD. See performance comparison examples at the page [UBLK](ublk.en.md).
|
||||
|
||||
To try VDUSE you need at least Linux 5.15, built with VDUSE support
|
||||
(CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
|
||||
@@ -193,3 +186,12 @@ To remove the device:
|
||||
vdpa dev del test1
|
||||
kill <qemu-storage-daemon_process_PID>
|
||||
```
|
||||
|
||||
## Veeam
|
||||
|
||||
Vitastor QEMU driver has a feature that allows to trick third-party systems like Veeam not able to parse qemu-img
|
||||
vitastor URIs: [qemu_file_mirror_path](../config/client.en.md#qemu_file_mirror_path).
|
||||
|
||||
To make such systems work, you should set this option to an FS directory path (for example, `/mnt/vitastor/`) and
|
||||
mount this directory using [`vitastor-nfs mount --block`](../usage/nfs.en.md). It will make them access
|
||||
your images using files and, hopefully, succeed in doing their normal job :).
|
||||
|
@@ -132,24 +132,16 @@ qemu-system-x86_64 -enable-kvm -m 2048 -M accel=kvm,memory-backend=mem \
|
||||
к системе - VDUSE (vDPA Device in Userspace), а в QEMU, начиная с версии 7.2, есть поддержка
|
||||
экспорта блочных устройств QEMU по этому протоколу через qemu-storage-daemon.
|
||||
|
||||
VDUSE - на данный момент лучший интерфейс для подключения дисков Vitastor в виде блочных
|
||||
устройств на уровне ядра, ибо:
|
||||
- VDUSE не копирует данные и поэтому достигает значительно лучшей производительности, чем [NBD](nbd.ru.md)
|
||||
- Также оно не имеет проблемы NBD-таймаута - устройство не умирает, если операция выполняется слишком долго
|
||||
- Также оно не имеет проблемы подвисающих устройств - если процесс-обработчик умирает, его можно
|
||||
перезапустить (!) и блочное устройство продолжит работать
|
||||
- По-видимому, у него нет предела числа подключаемых в систему устройств
|
||||
Преимущества VDUSE:
|
||||
|
||||
Пример сравнения производительности:
|
||||
- VDUSE копирует данные 1 раз, а не 2, и поэтому он быстрее, чем [NBD](nbd.ru.md) при линейном доступе.
|
||||
- VDUSE не имеет проблемы NBD-таймаута - устройство не умирает, если операция выполняется слишком долго.
|
||||
- VDUSE не имеет проблемы подвисающих устройств - если процесс-обработчик умирает, его можно
|
||||
перезапустить (!) и блочное устройство продолжит работать (в UBLK это тоже поддерживается).
|
||||
- По-видимому, у него нет предела числа подключаемых в систему устройств (в UBLK лимита тоже нет).
|
||||
|
||||
| | Прямой fio | NBD | VDUSE |
|
||||
|--------------------------|-------------|-------------|-------------|
|
||||
| линейная запись | 3.85 GB/s | 1.12 GB/s | 3.85 GB/s |
|
||||
| 4k случайная запись Q128 | 240000 iops | 120000 iops | 178000 iops |
|
||||
| 4k случайная запись Q1 | 9500 iops | 7620 iops | 7640 iops |
|
||||
| линейное чтение | 4.3 GB/s | 1.8 GB/s | 2.85 GB/s |
|
||||
| 4k случайное чтение Q128 | 287000 iops | 140000 iops | 189000 iops |
|
||||
| 4k случайное чтение Q1 | 9600 iops | 7640 iops | 7780 iops |
|
||||
Однако, при линейном доступе VDUSE может быть медленнее UBLK (а может быть и быстрее), а по iops
|
||||
VDUSE иногда даже медленнее NBD. Пример сравнения производительности смотрите на странице [UBLK](ublk.ru.md).
|
||||
|
||||
Чтобы попробовать VDUSE, вам нужно ядро Linux как минимум версии 5.15, собранное с поддержкой
|
||||
VDUSE (CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
|
||||
@@ -196,3 +188,12 @@ vdpa dev add name test1 mgmtdev vduse
|
||||
vdpa dev del test1
|
||||
kill <PID_процесса_qemu-storage-daemon>
|
||||
```
|
||||
|
||||
## Veeam
|
||||
|
||||
Драйвер Vitastor QEMU имеет функцию, которая позволяет обманывать сторонние системы типа Veeam, которые
|
||||
не могут сами по себе разобрать адреса дисков в vitastor: [qemu_file_mirror_path](../config/client.ru.md#qemu_file_mirror_path).
|
||||
|
||||
Чтобы заставить такие системы работать, вам нужно установить эту опцию равной пути к некоторому каталогу
|
||||
в ФС (например, `/mnt/vitastor/`) и примонтировать этот каталог с помощью [`vitastor-nfs mount --block`](../usage/nfs.ru.md).
|
||||
Они начнут обращаться к образам как к файлам и, вероятно, смогут заработать корректно :).
|
||||
|
116
docs/usage/ublk.en.md
Normal file
116
docs/usage/ublk.en.md
Normal file
@@ -0,0 +1,116 @@
|
||||
[Documentation](../../README.md#documentation) → Usage → UBLK
|
||||
|
||||
-----
|
||||
|
||||
[Читать на русском](ublk.ru.md)
|
||||
|
||||
# UBLK
|
||||
|
||||
[ublk](https://docs.kernel.org/block/ublk.html) is a new io_uring-based Linux interface
|
||||
for user-space block device drivers, available since Linux 6.0.
|
||||
|
||||
It's not zero-copy, but it's still a fast implementation, outperforming both [NBD](nbd.en.md)
|
||||
and [VDUSE](qemu.en.md#vduse) iops-wise and may or may not outperform VDUSE in linear I/O MB/s.
|
||||
ublk also allows to recover devices even if the server (vitastor-ublk process) dies.
|
||||
|
||||
## Example performance comparison
|
||||
|
||||
TCP (100G), 3 hosts each with 6 NVMe OSDs, 3 replicas, single client
|
||||
|
||||
| | direct fio | NBD | VDUSE | UBLK |
|
||||
|----------------------|-------------|-------------|------------|-------------|
|
||||
| linear write | 3807 MB/s | 1832 MB/s | 3226 MB/s | 3027 MB/s |
|
||||
| linear read | 3067 MB/s | 1885 MB/s | 1800 MB/s | 2076 MB/s |
|
||||
| 4k random write Q128 | 128624 iops | 91060 iops | 94621 iops | 149450 iops |
|
||||
| 4k random read Q128 | 117769 iops | 153408 iops | 93157 iops | 171987 iops |
|
||||
| 4k random write Q1 | 8090 iops | 6442 iops | 6316 iops | 7272 iops |
|
||||
| 4k random read Q1 | 9474 iops | 7200 iops | 6840 iops | 8038 iops |
|
||||
|
||||
RDMA (100G), 3 hosts each with 6 NVMe OSDs, 3 replicas, single client
|
||||
|
||||
| | direct fio | NBD | VDUSE | UBLK |
|
||||
|----------------------|-------------|-------------|-------------|-------------|
|
||||
| linear write | 6998 MB/s | 1878 MB/s | 4249 MB/s | 3140 MB/s |
|
||||
| linear read | 8628 MB/s | 3389 MB/s | 5062 MB/s | 3674 MB/s |
|
||||
| 4k random write Q128 | 222541 iops | 181589 iops | 138281 iops | 218222 iops |
|
||||
| 4k random read Q128 | 412647 iops | 239987 iops | 151663 iops | 269583 iops |
|
||||
| 4k random write Q1 | 11601 iops | 8592 iops | 9111 iops | 10000 iops |
|
||||
| 4k random read Q1 | 10102 iops | 7788 iops | 8111 iops | 8965 iops |
|
||||
|
||||
## Commands
|
||||
|
||||
vitastor-ublk supports the following commands:
|
||||
|
||||
- [map](#map)
|
||||
- [unmap](#unmap)
|
||||
- [ls](#ls)
|
||||
|
||||
## map
|
||||
|
||||
To create a local block device for a Vitastor image run:
|
||||
|
||||
```
|
||||
vitastor-ublk map [/dev/ublkbN] --image testimg
|
||||
```
|
||||
|
||||
It will output a block device name like /dev/ublkb0 which you can then use as a normal disk.
|
||||
|
||||
You can also use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.
|
||||
|
||||
vitastor-ublk supports all usual Vitastor configuration options like `--config_path <path_to_config>` plus ublk-specific:
|
||||
|
||||
* `--recover` \
|
||||
Recover a mapped device if the previous ublk server is dead.
|
||||
* `--queue_depth 256` \
|
||||
Maximum queue size for the device.
|
||||
* `--max_io_size 1M` \
|
||||
Maximum single I/O size for the device. Default: `max(1 MB, pool block size * EC part count)`.
|
||||
* `--readonly` \
|
||||
Make the device read-only.
|
||||
* `--hdd` \
|
||||
Mark the device as rotational.
|
||||
* `--logfile /path/to/log/file.txt` \
|
||||
Write log messages to the specified file instead of dropping them (in background mode)
|
||||
or printing them to the standard output (in foreground mode).
|
||||
* `--dev_num N` \
|
||||
Use the specified device /dev/ublkbN instead of automatic selection (alternative syntax
|
||||
to /dev/ublkbN positional parameter).
|
||||
* `--foreground 1` \
|
||||
Stay in foreground, do not daemonize.
|
||||
|
||||
Note that `ublk_queue_depth` and `ublk_max_io_size` may also be specified
|
||||
in `/etc/vitastor/vitastor.conf` or in other configuration file specified with `--config_path`.
|
||||
|
||||
## unmap
|
||||
|
||||
To unmap the device run:
|
||||
|
||||
```
|
||||
vitastor-ublk unmap /dev/ublkb0
|
||||
```
|
||||
|
||||
## ls
|
||||
|
||||
```
|
||||
vitastor-ublk ls [--json]
|
||||
```
|
||||
|
||||
List mapped images.
|
||||
|
||||
Example output (normal format):
|
||||
|
||||
```
|
||||
/dev/ublkb0
|
||||
image: bench
|
||||
pid: 584536
|
||||
|
||||
/dev/ublkb1
|
||||
image: bench1
|
||||
pid: 584546
|
||||
```
|
||||
|
||||
Example output (JSON format):
|
||||
|
||||
```
|
||||
{"/dev/ublkb0": {"image": "bench", "pid": 584536}, "/dev/ublkb1": {"image": "bench1", "pid": 584546}}
|
||||
```
|
121
docs/usage/ublk.ru.md
Normal file
121
docs/usage/ublk.ru.md
Normal file
@@ -0,0 +1,121 @@
|
||||
[Документация](../../README-ru.md#документация) → Использование → UBLK
|
||||
|
||||
-----
|
||||
|
||||
[Read in English](ublk.en.md)
|
||||
|
||||
# UBLK
|
||||
|
||||
[ublk](https://docs.kernel.org/block/ublk.html) - это новый Linux-интерфейс на основе io_uring
|
||||
для реализации блочных устройств в пространстве пользователя, доступный, начиная с Linux 6.0.
|
||||
|
||||
ublk тоже копирует память (т.е. не является zero-copy), но по IOPS всё равно обгоняет и
|
||||
[NBD](nbd.ru.md), и [VDUSE](qemu.ru.md#vduse), и иногда может даже обгонять VDUSE по
|
||||
скорости линейного доступа. Также ublk позволяет оживлять устройства, у которых умер
|
||||
сервер (процесс-обработчик vitastor-ublk).
|
||||
|
||||
## Пример сравнения производительности
|
||||
|
||||
TCP (100G), 3 сервера с 6 NVMe OSD каждый, 3 реплики, один клиент
|
||||
|
||||
| | Прямой fio | NBD | VDUSE | UBLK |
|
||||
|--------------------------|-------------|-------------|------------|-------------|
|
||||
| линейная запись | 3807 MB/s | 1832 MB/s | 3226 MB/s | 3027 MB/s |
|
||||
| линейное чтение | 3067 MB/s | 1885 MB/s | 1800 MB/s | 2076 MB/s |
|
||||
| 4k случайная запись Q128 | 128624 iops | 91060 iops | 94621 iops | 149450 iops |
|
||||
| 4k случайное чтение Q128 | 117769 iops | 153408 iops | 93157 iops | 171987 iops |
|
||||
| 4k случайная запись Q1 | 8090 iops | 6442 iops | 6316 iops | 7272 iops |
|
||||
| 4k случайное чтение Q1 | 9474 iops | 7200 iops | 6840 iops | 8038 iops |
|
||||
|
||||
RDMA (100G), 3 сервера с 6 NVMe OSD каждый, 3 реплики, один клиент
|
||||
|
||||
| | Прямой fio | NBD | VDUSE | UBLK |
|
||||
|--------------------------|-------------|-------------|-------------|-------------|
|
||||
| линейная запись | 6998 MB/s | 1878 MB/s | 4249 MB/s | 3140 MB/s |
|
||||
| линейное чтение | 8628 MB/s | 3389 MB/s | 5062 MB/s | 3674 MB/s |
|
||||
| 4k случайная запись Q128 | 222541 iops | 181589 iops | 138281 iops | 218222 iops |
|
||||
| 4k случайное чтение Q128 | 412647 iops | 239987 iops | 151663 iops | 269583 iops |
|
||||
| 4k случайная запись Q1 | 11601 iops | 8592 iops | 9111 iops | 10000 iops |
|
||||
| 4k случайное чтение Q1 | 10102 iops | 7788 iops | 8111 iops | 8965 iops |
|
||||
|
||||
## Команды
|
||||
|
||||
vitastor-ublk поддерживает следующие команды:
|
||||
|
||||
- [map](#map)
|
||||
- [unmap](#unmap)
|
||||
- [ls](#ls)
|
||||
|
||||
## map
|
||||
|
||||
Чтобы создать локальное блочное устройство для образа, выполните команду:
|
||||
|
||||
```
|
||||
vitastor-ublk map [/dev/ublkbN] --image testimg
|
||||
```
|
||||
|
||||
Команда напечатает название блочного устройства вида /dev/ublkb0, которое потом можно
|
||||
будет использовать как обычный диск.
|
||||
|
||||
Для обращения по номеру инода, аналогично другим командам, можно использовать опции
|
||||
`--pool <POOL> --inode <INODE> --size <SIZE>` вместо `--image testimg`.
|
||||
|
||||
vitastor-ublk поддерживает все обычные опции Vitastor, например, `--config_path <path_to_config>`,
|
||||
плюс специфичные для ublk:
|
||||
|
||||
* `--recover` \
|
||||
Восстановить ранее подключённое устройство, у которого умер обработчик.
|
||||
* `--queue_depth 256` \
|
||||
Максимальная глубина очереди устройства.
|
||||
* `--max_io_size 1M` \
|
||||
Максимальный размер запроса ввода-вывода для устройства. По умолчанию: `max(1 MB, блок данных пула * число частей данных EC)`.
|
||||
* `--readonly` \
|
||||
Подключить устройство в режиме только для чтения.
|
||||
* `--hdd` \
|
||||
Пометить устройство как вращающийся жёсткий диск (флаг rotational).
|
||||
* `--logfile /path/to/log/file.txt` \
|
||||
Писать сообщения о процессе работы в заданный файл, вместо пропуска их
|
||||
при фоновом режиме запуска или печати на стандартный вывод при запуске
|
||||
в консоли с `--foreground 1`.
|
||||
* `--dev_num N` \
|
||||
Использовать заданное устройство `/dev/ublkbN` вместо автоматического подбора.
|
||||
* `--foreground 1` \
|
||||
Не уводить процесс в фоновый режим.
|
||||
|
||||
Обратите внимание, что опции `ublk_queue_depth` и `ublk_max_io_size` можно
|
||||
также задавать в `/etc/vitastor/vitastor.conf` или в другом файле конфигурации,
|
||||
заданном опцией `--config_path`.
|
||||
|
||||
## unmap
|
||||
|
||||
Для отключения устройства выполните:
|
||||
|
||||
```
|
||||
vitastor-ublk unmap /dev/ublkb0
|
||||
```
|
||||
|
||||
## ls
|
||||
|
||||
```
|
||||
vitastor-ublk ls [--json]
|
||||
```
|
||||
|
||||
Вывести подключённые устройства.
|
||||
|
||||
Пример вывода в обычном формате:
|
||||
|
||||
```
|
||||
/dev/ublkb0
|
||||
image: bench
|
||||
pid: 584536
|
||||
|
||||
/dev/ublkb1
|
||||
image: bench1
|
||||
pid: 584546
|
||||
```
|
||||
|
||||
Пример вывода в JSON-формате:
|
||||
|
||||
```
|
||||
{"/dev/ublkb0": {"image": "bench", "pid": 584536}, "/dev/ublkb1": {"image": "bench1", "pid": 584546}}
|
||||
```
|
1
emhash
Submodule
1
emhash
Submodule
Submodule emhash added at b7ff3147a5
@@ -96,6 +96,7 @@ class Mon
|
||||
}
|
||||
else
|
||||
{
|
||||
res.setHeader('Content-Type', 'text/plain; version=0.0.4; charset=utf-8');
|
||||
res.write(export_prometheus_metrics(this.state));
|
||||
}
|
||||
}
|
||||
|
@@ -15,7 +15,7 @@ function get_osd_tree(global_config, state)
|
||||
const stat = state.osd.stats[osd_num];
|
||||
const osd_cfg = state.config.osd[osd_num];
|
||||
let reweight = osd_cfg == null ? 1 : Number(osd_cfg.reweight);
|
||||
if (isNaN(reweight) || reweight < 0 || reweight > 0)
|
||||
if (isNaN(reweight) || reweight < 0 || reweight > 1)
|
||||
reweight = 1;
|
||||
if (stat && stat.size && reweight && (state.osd.state[osd_num] || Number(stat.time) >= down_time ||
|
||||
osd_cfg && osd_cfg.noout))
|
||||
@@ -179,7 +179,7 @@ function filter_osds_by_block_layout(orig_tree, osd_stats, block_size, bitmap_gr
|
||||
if (orig_tree[osd].level === 'osd')
|
||||
{
|
||||
const osd_stat = osd_stats[osd];
|
||||
if (osd_stat && (osd_stat.bs_block_size && osd_stat.bs_block_size != block_size ||
|
||||
if (osd_stat && (osd_stat.data_block_size && osd_stat.data_block_size != block_size ||
|
||||
osd_stat.bitmap_granularity && osd_stat.bitmap_granularity != bitmap_granularity ||
|
||||
osd_stat.immediate_commit == 'small' && immediate_commit == 'all' ||
|
||||
osd_stat.immediate_commit == 'none' && immediate_commit != 'none'))
|
||||
|
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "vitastor-mon",
|
||||
"version": "2.2.2",
|
||||
"version": "2.3.0",
|
||||
"description": "Vitastor SDS monitor service",
|
||||
"main": "mon-main.js",
|
||||
"scripts": {
|
||||
@@ -9,7 +9,7 @@
|
||||
"author": "Vitaliy Filippov",
|
||||
"license": "UNLICENSED",
|
||||
"dependencies": {
|
||||
"antietcd": "^1.1.2",
|
||||
"antietcd": "^1.1.3",
|
||||
"sprintf-js": "^1.1.2",
|
||||
"ws": "^7.2.5"
|
||||
},
|
||||
|
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "vitastor",
|
||||
"version": "2.2.2",
|
||||
"version": "2.3.0",
|
||||
"description": "Low-level native bindings to Vitastor client library",
|
||||
"main": "index.js",
|
||||
"keywords": [
|
||||
|
@@ -261,7 +261,7 @@ sub free_image
|
||||
my ($vtype, $name, $vmid, undef, undef, undef) = $class->parse_volname($volname);
|
||||
$class->deactivate_volume($storeid, $scfg, $volname);
|
||||
my $full_list = run_cli($scfg, [ 'ls', '-l' ]);
|
||||
my $list = _process_list($scfg, $storeid, $full_list);
|
||||
my $list = _process_list($scfg, $storeid, $full_list, 0);
|
||||
# Remove image and all its snapshots
|
||||
my $rm_names = {
|
||||
map { ($prefix.$_->{name} => 1) }
|
||||
@@ -269,6 +269,10 @@ sub free_image
|
||||
@$list
|
||||
};
|
||||
my $children = [ grep { $_->{parent_name} && $rm_names->{$_->{parent_name}} } @$full_list ];
|
||||
$children = [ grep {
|
||||
substr($_->{name}, 0, length($prefix.$name)) ne $prefix.$name &&
|
||||
substr($_->{name}, 0, length($prefix.$name)+1) ne $prefix.$name.'@'
|
||||
} @$children ];
|
||||
die "Image has children: ".join(', ', map {
|
||||
substr($_->{name}, 0, length $prefix) eq $prefix
|
||||
? substr($_->name, length $prefix)
|
||||
@@ -288,14 +292,15 @@ sub free_image
|
||||
|
||||
sub _process_list
|
||||
{
|
||||
my ($scfg, $storeid, $result) = @_;
|
||||
my ($scfg, $storeid, $result, $skip_snapshot) = @_;
|
||||
$skip_snapshot = 1 if !defined $skip_snapshot;
|
||||
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
|
||||
my $list = [];
|
||||
foreach my $el (@$result)
|
||||
{
|
||||
next if !$el->{name} || length($prefix) && substr($el->{name}, 0, length $prefix) ne $prefix;
|
||||
my $name = substr($el->{name}, length $prefix);
|
||||
next if $name =~ /@/;
|
||||
next if $skip_snapshot && $name =~ /@/;
|
||||
my ($owner) = $name =~ /^(?:vm|base)-(\d+)-/s;
|
||||
next if !defined $owner;
|
||||
my $parent = !defined $el->{parent_name}
|
||||
@@ -494,4 +499,55 @@ sub rename_volume
|
||||
return "${storeid}:${base_name}${target_volname}";
|
||||
}
|
||||
|
||||
sub _monkey_patch_qemu_blockdev_options
|
||||
{
|
||||
my ($cfg, $volid, $machine_version, $options) = @_;
|
||||
my ($storeid, $volname) = PVE::Storage::parse_volume_id($volid);
|
||||
|
||||
my $scfg = PVE::Storage::storage_config($cfg, $storeid);
|
||||
|
||||
my $plugin = PVE::Storage::Plugin->lookup($scfg->{type});
|
||||
|
||||
my ($vtype) = $plugin->parse_volname($volname);
|
||||
die "cannot use volume of type '$vtype' as a QEMU blockdevice\n"
|
||||
if $vtype ne 'images' && $vtype ne 'iso' && $vtype ne 'import';
|
||||
|
||||
return $plugin->qemu_blockdev_options($scfg, $storeid, $volname, $machine_version, $options);
|
||||
}
|
||||
|
||||
sub qemu_blockdev_options
|
||||
{
|
||||
my ($class, $scfg, $storeid, $volname, $machine_version, $options) = @_;
|
||||
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
|
||||
my ($vtype, $name, $vmid) = $class->parse_volname($volname);
|
||||
$name .= '@'.$options->{'snapshot-name'} if $options->{'snapshot-name'};
|
||||
if ($scfg->{vitastor_nbd})
|
||||
{
|
||||
my $mapped = run_cli($scfg, [ 'ls' ], binary => '/usr/bin/vitastor-nbd');
|
||||
my ($kerneldev) = grep { $mapped->{$_}->{image} eq $prefix.$name } keys %$mapped;
|
||||
die "Image not mapped via NBD" if !$kerneldev;
|
||||
return { driver => 'host_device', filename => $kerneldev };
|
||||
}
|
||||
my $blockdev = {
|
||||
driver => 'vitastor',
|
||||
image => $prefix.$name,
|
||||
};
|
||||
if ($scfg->{vitastor_config_path})
|
||||
{
|
||||
$blockdev->{'config-path'} = $scfg->{vitastor_config_path};
|
||||
}
|
||||
if ($scfg->{vitastor_etcd_address})
|
||||
{
|
||||
# FIXME This is the only exception: etcd_address -> etcd_host for qemu
|
||||
$blockdev->{'etcd-host'} = $scfg->{vitastor_etcd_address};
|
||||
}
|
||||
if ($scfg->{vitastor_etcd_prefix})
|
||||
{
|
||||
$blockdev->{'etcd-prefix'} = $scfg->{vitastor_etcd_prefix};
|
||||
}
|
||||
return $blockdev;
|
||||
}
|
||||
|
||||
*PVE::Storage::qemu_blockdev_options = *_monkey_patch_qemu_blockdev_options;
|
||||
|
||||
1;
|
||||
|
@@ -50,7 +50,7 @@ from cinder.volume import configuration
|
||||
from cinder.volume import driver
|
||||
from cinder.volume import volume_utils
|
||||
|
||||
VITASTOR_VERSION = '2.2.2'
|
||||
VITASTOR_VERSION = '2.3.0'
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
637
patches/libvirt-11.5-vitastor.diff
Normal file
637
patches/libvirt-11.5-vitastor.diff
Normal file
@@ -0,0 +1,637 @@
|
||||
diff --git a/include/libvirt/libvirt-storage.h b/include/libvirt/libvirt-storage.h
|
||||
index aaad4a3da1..5f5daa8341 100644
|
||||
--- a/include/libvirt/libvirt-storage.h
|
||||
+++ b/include/libvirt/libvirt-storage.h
|
||||
@@ -326,6 +326,7 @@ typedef enum {
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS = 1 << 17, /* (Since: 1.2.8) */
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE = 1 << 18, /* (Since: 3.1.0) */
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_ISCSI_DIRECT = 1 << 19, /* (Since: 5.6.0) */
|
||||
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR = 1 << 20, /* (Since: 5.0.0) */
|
||||
} virConnectListAllStoragePoolsFlags;
|
||||
|
||||
int virConnectListAllStoragePools(virConnectPtr conn,
|
||||
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
|
||||
index 1e24e41a48..ce359a4cf8 100644
|
||||
--- a/src/conf/domain_conf.c
|
||||
+++ b/src/conf/domain_conf.c
|
||||
@@ -7435,7 +7435,8 @@ virDomainDiskSourceNetworkParse(xmlNodePtr node,
|
||||
src->configFile = virXPathString("string(./config/@file)", ctxt);
|
||||
|
||||
if (src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTP ||
|
||||
- src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS)
|
||||
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS ||
|
||||
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_VITASTOR)
|
||||
src->query = virXMLPropString(node, "query");
|
||||
|
||||
if (virDomainStorageNetworkParseHosts(node, ctxt, &src->hosts, &src->nhosts) < 0)
|
||||
@@ -31871,6 +31872,7 @@ virDomainStorageSourceTranslateSourcePool(virStorageSource *src,
|
||||
|
||||
case VIR_STORAGE_POOL_MPATH:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_SHEEPDOG:
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
diff --git a/src/conf/domain_validate.c b/src/conf/domain_validate.c
|
||||
index b28af7fa56..d1aae6e43e 100644
|
||||
--- a/src/conf/domain_validate.c
|
||||
+++ b/src/conf/domain_validate.c
|
||||
@@ -504,6 +504,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NBD:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
@@ -576,7 +577,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
|
||||
}
|
||||
}
|
||||
|
||||
- /* internal snapshots and config files are currently supported only with rbd: */
|
||||
+ /* internal snapshots are currently supported only with rbd: */
|
||||
if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
|
||||
src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD) {
|
||||
if (src->snapshot) {
|
||||
@@ -584,10 +585,14 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
|
||||
_("<snapshot> element is currently supported only with 'rbd' disks"));
|
||||
return -1;
|
||||
}
|
||||
-
|
||||
+ }
|
||||
+ /* config files are currently supported only with rbd and vitastor: */
|
||||
+ if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD &&
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR) {
|
||||
if (src->configFile) {
|
||||
virReportError(VIR_ERR_XML_ERROR, "%s",
|
||||
- _("<config> element is currently supported only with 'rbd' disks"));
|
||||
+ _("<config> element is currently supported only with 'rbd' and 'vitastor' disks"));
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
diff --git a/src/conf/schemas/domaincommon.rng b/src/conf/schemas/domaincommon.rng
|
||||
index 183dd5db5e..dcc0d1a778 100644
|
||||
--- a/src/conf/schemas/domaincommon.rng
|
||||
+++ b/src/conf/schemas/domaincommon.rng
|
||||
@@ -2066,6 +2066,35 @@
|
||||
</element>
|
||||
</define>
|
||||
|
||||
+ <define name="diskSourceNetworkProtocolVitastor">
|
||||
+ <element name="source">
|
||||
+ <interleave>
|
||||
+ <attribute name="protocol">
|
||||
+ <value>vitastor</value>
|
||||
+ </attribute>
|
||||
+ <ref name="diskSourceCommon"/>
|
||||
+ <optional>
|
||||
+ <attribute name="name"/>
|
||||
+ </optional>
|
||||
+ <optional>
|
||||
+ <attribute name="query"/>
|
||||
+ </optional>
|
||||
+ <zeroOrMore>
|
||||
+ <ref name="diskSourceNetworkHost"/>
|
||||
+ </zeroOrMore>
|
||||
+ <optional>
|
||||
+ <element name="config">
|
||||
+ <attribute name="file">
|
||||
+ <ref name="absFilePath"/>
|
||||
+ </attribute>
|
||||
+ <empty/>
|
||||
+ </element>
|
||||
+ </optional>
|
||||
+ <empty/>
|
||||
+ </interleave>
|
||||
+ </element>
|
||||
+ </define>
|
||||
+
|
||||
<define name="diskSourceNetworkProtocolISCSI">
|
||||
<element name="source">
|
||||
<attribute name="protocol">
|
||||
@@ -2416,6 +2445,7 @@
|
||||
<ref name="diskSourceNetworkProtocolSimple"/>
|
||||
<ref name="diskSourceNetworkProtocolVxHS"/>
|
||||
<ref name="diskSourceNetworkProtocolNFS"/>
|
||||
+ <ref name="diskSourceNetworkProtocolVitastor"/>
|
||||
</choice>
|
||||
</define>
|
||||
|
||||
diff --git a/src/conf/storage_conf.c b/src/conf/storage_conf.c
|
||||
index 1dc9365bf2..a8a736be81 100644
|
||||
--- a/src/conf/storage_conf.c
|
||||
+++ b/src/conf/storage_conf.c
|
||||
@@ -56,7 +56,7 @@ VIR_ENUM_IMPL(virStoragePool,
|
||||
"logical", "disk", "iscsi",
|
||||
"iscsi-direct", "scsi", "mpath",
|
||||
"rbd", "sheepdog", "gluster",
|
||||
- "zfs", "vstorage",
|
||||
+ "zfs", "vstorage", "vitastor",
|
||||
);
|
||||
|
||||
VIR_ENUM_IMPL(virStoragePoolFormatFileSystem,
|
||||
@@ -242,6 +242,18 @@ static virStoragePoolTypeInfo poolTypeInfo[] = {
|
||||
.formatToString = virStorageFileFormatTypeToString,
|
||||
}
|
||||
},
|
||||
+ {.poolType = VIR_STORAGE_POOL_VITASTOR,
|
||||
+ .poolOptions = {
|
||||
+ .flags = (VIR_STORAGE_POOL_SOURCE_HOST |
|
||||
+ VIR_STORAGE_POOL_SOURCE_NETWORK |
|
||||
+ VIR_STORAGE_POOL_SOURCE_NAME),
|
||||
+ },
|
||||
+ .volOptions = {
|
||||
+ .defaultFormat = VIR_STORAGE_FILE_RAW,
|
||||
+ .formatFromString = virStorageVolumeFormatFromString,
|
||||
+ .formatToString = virStorageFileFormatTypeToString,
|
||||
+ }
|
||||
+ },
|
||||
{.poolType = VIR_STORAGE_POOL_SHEEPDOG,
|
||||
.poolOptions = {
|
||||
.flags = (VIR_STORAGE_POOL_SOURCE_HOST |
|
||||
@@ -538,6 +550,11 @@ virStoragePoolDefParseSource(xmlXPathContextPtr ctxt,
|
||||
_("element 'name' is mandatory for RBD pool"));
|
||||
return -1;
|
||||
}
|
||||
+ if (pool_type == VIR_STORAGE_POOL_VITASTOR && source->name == NULL) {
|
||||
+ virReportError(VIR_ERR_XML_ERROR, "%s",
|
||||
+ _("element 'name' is mandatory for Vitastor pool"));
|
||||
+ return -1;
|
||||
+ }
|
||||
|
||||
if (options->formatFromString) {
|
||||
g_autofree char *format = NULL;
|
||||
@@ -1127,6 +1144,7 @@ virStoragePoolDefFormatBuf(virBuffer *buf,
|
||||
/* RBD, Sheepdog, Gluster and Iscsi-direct devices are not local block devs nor
|
||||
* files, so they don't have a target */
|
||||
if (def->type != VIR_STORAGE_POOL_RBD &&
|
||||
+ def->type != VIR_STORAGE_POOL_VITASTOR &&
|
||||
def->type != VIR_STORAGE_POOL_SHEEPDOG &&
|
||||
def->type != VIR_STORAGE_POOL_GLUSTER &&
|
||||
def->type != VIR_STORAGE_POOL_ISCSI_DIRECT) {
|
||||
diff --git a/src/conf/storage_conf.h b/src/conf/storage_conf.h
|
||||
index fc67957cfe..720c07ef74 100644
|
||||
--- a/src/conf/storage_conf.h
|
||||
+++ b/src/conf/storage_conf.h
|
||||
@@ -103,6 +103,7 @@ typedef enum {
|
||||
VIR_STORAGE_POOL_GLUSTER, /* Gluster device */
|
||||
VIR_STORAGE_POOL_ZFS, /* ZFS */
|
||||
VIR_STORAGE_POOL_VSTORAGE, /* Virtuozzo Storage */
|
||||
+ VIR_STORAGE_POOL_VITASTOR, /* Vitastor */
|
||||
|
||||
VIR_STORAGE_POOL_LAST,
|
||||
} virStoragePoolType;
|
||||
@@ -454,6 +455,7 @@ VIR_ENUM_DECL(virStoragePartedFs);
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_SCSI | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_MPATH | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_RBD | \
|
||||
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS | \
|
||||
diff --git a/src/conf/storage_source_conf.c b/src/conf/storage_source_conf.c
|
||||
index 8a063be244..dd9c7f11a2 100644
|
||||
--- a/src/conf/storage_source_conf.c
|
||||
+++ b/src/conf/storage_source_conf.c
|
||||
@@ -89,6 +89,7 @@ VIR_ENUM_IMPL(virStorageNetProtocol,
|
||||
"ssh",
|
||||
"vxhs",
|
||||
"nfs",
|
||||
+ "vitastor",
|
||||
);
|
||||
|
||||
|
||||
@@ -1314,6 +1315,7 @@ virStorageSourceNetworkDefaultPort(virStorageNetProtocol protocol)
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
return 24007;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
/* we don't provide a default for RBD */
|
||||
return 0;
|
||||
diff --git a/src/conf/storage_source_conf.h b/src/conf/storage_source_conf.h
|
||||
index ebddf28cd6..873a2be65c 100644
|
||||
--- a/src/conf/storage_source_conf.h
|
||||
+++ b/src/conf/storage_source_conf.h
|
||||
@@ -130,6 +130,7 @@ typedef enum {
|
||||
VIR_STORAGE_NET_PROTOCOL_SSH,
|
||||
VIR_STORAGE_NET_PROTOCOL_VXHS,
|
||||
VIR_STORAGE_NET_PROTOCOL_NFS,
|
||||
+ VIR_STORAGE_NET_PROTOCOL_VITASTOR,
|
||||
|
||||
VIR_STORAGE_NET_PROTOCOL_LAST
|
||||
} virStorageNetProtocol;
|
||||
diff --git a/src/conf/virstorageobj.c b/src/conf/virstorageobj.c
|
||||
index 59fa5da372..4739167f5f 100644
|
||||
--- a/src/conf/virstorageobj.c
|
||||
+++ b/src/conf/virstorageobj.c
|
||||
@@ -1438,6 +1438,7 @@ virStoragePoolObjSourceFindDuplicateCb(const void *payload,
|
||||
return 1;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_ISCSI_DIRECT:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
@@ -1921,6 +1922,8 @@ virStoragePoolObjMatch(virStoragePoolObj *obj,
|
||||
(obj->def->type == VIR_STORAGE_POOL_MPATH)) ||
|
||||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_RBD) &&
|
||||
(obj->def->type == VIR_STORAGE_POOL_RBD)) ||
|
||||
+ (MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR) &&
|
||||
+ (obj->def->type == VIR_STORAGE_POOL_VITASTOR)) ||
|
||||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG) &&
|
||||
(obj->def->type == VIR_STORAGE_POOL_SHEEPDOG)) ||
|
||||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER) &&
|
||||
diff --git a/src/libvirt-storage.c b/src/libvirt-storage.c
|
||||
index db7660aac4..561df34709 100644
|
||||
--- a/src/libvirt-storage.c
|
||||
+++ b/src/libvirt-storage.c
|
||||
@@ -94,6 +94,7 @@ virStoragePoolGetConnect(virStoragePoolPtr pool)
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_SCSI
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_MPATH
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_RBD
|
||||
+ * VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_ZFS
|
||||
diff --git a/src/libxl/libxl_conf.c b/src/libxl/libxl_conf.c
|
||||
index bdd30dd65a..5353e00b4a 100644
|
||||
--- a/src/libxl/libxl_conf.c
|
||||
+++ b/src/libxl/libxl_conf.c
|
||||
@@ -1081,6 +1081,7 @@ libxlMakeNetworkDiskSrcStr(virStorageSource *src,
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
case VIR_STORAGE_NET_PROTOCOL_VXHS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NFS:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_LAST:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
virReportError(VIR_ERR_NO_SUPPORT,
|
||||
diff --git a/src/libxl/xen_xl.c b/src/libxl/xen_xl.c
|
||||
index ec8de30c01..61eab9606d 100644
|
||||
--- a/src/libxl/xen_xl.c
|
||||
+++ b/src/libxl/xen_xl.c
|
||||
@@ -1461,6 +1461,7 @@ xenFormatXLDiskSrcNet(virStorageSource *src)
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
case VIR_STORAGE_NET_PROTOCOL_VXHS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NFS:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_LAST:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
virReportError(VIR_ERR_NO_SUPPORT,
|
||||
diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c
|
||||
index 32568d4ae6..e625fa0720 100644
|
||||
--- a/src/qemu/qemu_block.c
|
||||
+++ b/src/qemu/qemu_block.c
|
||||
@@ -731,6 +731,38 @@ qemuBlockStorageSourceGetRBDProps(virStorageSource *src,
|
||||
}
|
||||
|
||||
|
||||
+static virJSONValue *
|
||||
+qemuBlockStorageSourceGetVitastorProps(virStorageSource *src)
|
||||
+{
|
||||
+ virJSONValue *ret = NULL;
|
||||
+ virStorageNetHostDef *host;
|
||||
+ size_t i;
|
||||
+ g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER;
|
||||
+ g_autofree char *etcd = NULL;
|
||||
+
|
||||
+ for (i = 0; i < src->nhosts; i++) {
|
||||
+ host = src->hosts + i;
|
||||
+ if ((virStorageNetHostTransport)host->transport != VIR_STORAGE_NET_HOST_TRANS_TCP) {
|
||||
+ return NULL;
|
||||
+ }
|
||||
+ virBufferAsprintf(&buf, i > 0 ? ",%s:%u" : "%s:%u", host->name, host->port);
|
||||
+ }
|
||||
+ if (src->nhosts > 0) {
|
||||
+ etcd = virBufferContentAndReset(&buf);
|
||||
+ }
|
||||
+
|
||||
+ if (virJSONValueObjectAdd(&ret,
|
||||
+ "S:etcd-host", etcd,
|
||||
+ "S:etcd-prefix", src->query,
|
||||
+ "S:config-path", src->configFile,
|
||||
+ "s:image", src->path,
|
||||
+ NULL) < 0)
|
||||
+ return NULL;
|
||||
+
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+
|
||||
static virJSONValue *
|
||||
qemuBlockStorageSourceGetSshProps(virStorageSource *src)
|
||||
{
|
||||
@@ -1082,6 +1114,12 @@ qemuBlockStorageSourceGetBackendProps(virStorageSource *src,
|
||||
return NULL;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
+ driver = "vitastor";
|
||||
+ if (!(fileprops = qemuBlockStorageSourceGetVitastorProps(src)))
|
||||
+ return NULL;
|
||||
+ break;
|
||||
+
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
driver = "ssh";
|
||||
if (!(fileprops = qemuBlockStorageSourceGetSshProps(src)))
|
||||
@@ -1985,6 +2023,7 @@ qemuBlockGetBackingStoreString(virStorageSource *src,
|
||||
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_VXHS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NFS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
@@ -2365,6 +2404,12 @@ qemuBlockStorageSourceCreateGetStorageProps(virStorageSource *src,
|
||||
return -1;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
+ driver = "vitastor";
|
||||
+ if (!(location = qemuBlockStorageSourceGetVitastorProps(src)))
|
||||
+ return -1;
|
||||
+ break;
|
||||
+
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
if (srcPriv->nbdkitProcess) {
|
||||
/* disk creation not yet supported with nbdkit, and even if it
|
||||
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
|
||||
index 0d2548d8d4..91121d6e1f 100644
|
||||
--- a/src/qemu/qemu_domain.c
|
||||
+++ b/src/qemu/qemu_domain.c
|
||||
@@ -4526,7 +4526,8 @@ qemuDomainValidateStorageSource(virStorageSource *src,
|
||||
if (src->query &&
|
||||
(actualType != VIR_STORAGE_TYPE_NETWORK ||
|
||||
(src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTPS &&
|
||||
- src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP))) {
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP &&
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR))) {
|
||||
virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
|
||||
_("query is supported only with HTTP(S) protocols"));
|
||||
return -1;
|
||||
@@ -8954,6 +8955,7 @@ qemuDomainPrepareStorageSourceTLS(virStorageSource *src,
|
||||
break;
|
||||
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
|
||||
diff --git a/src/qemu/qemu_snapshot.c b/src/qemu/qemu_snapshot.c
|
||||
index 8128154749..afb339b9b0 100644
|
||||
--- a/src/qemu/qemu_snapshot.c
|
||||
+++ b/src/qemu/qemu_snapshot.c
|
||||
@@ -662,6 +662,7 @@ qemuSnapshotPrepareDiskExternalInactive(virDomainSnapshotDiskDef *snapdisk,
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NBD:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
|
||||
@@ -887,6 +888,7 @@ qemuSnapshotPrepareDiskInternal(virDomainDiskDef *disk,
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NBD:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
|
||||
diff --git a/src/storage/storage_driver.c b/src/storage/storage_driver.c
|
||||
index e19e032427..59f91f4710 100644
|
||||
--- a/src/storage/storage_driver.c
|
||||
+++ b/src/storage/storage_driver.c
|
||||
@@ -1626,6 +1626,7 @@ storageVolLookupByPathCallback(virStoragePoolObj *obj,
|
||||
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_SHEEPDOG:
|
||||
case VIR_STORAGE_POOL_ZFS:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
diff --git a/src/storage_file/storage_source_backingstore.c b/src/storage_file/storage_source_backingstore.c
|
||||
index 80681924ea..8a3ade9ec0 100644
|
||||
--- a/src/storage_file/storage_source_backingstore.c
|
||||
+++ b/src/storage_file/storage_source_backingstore.c
|
||||
@@ -287,6 +287,75 @@ virStorageSourceParseRBDColonString(const char *rbdstr,
|
||||
}
|
||||
|
||||
|
||||
+static int
|
||||
+virStorageSourceParseVitastorColonString(const char *colonstr,
|
||||
+ virStorageSource *src)
|
||||
+{
|
||||
+ char *p, *e, *next;
|
||||
+ g_autofree char *options = NULL;
|
||||
+
|
||||
+ /* optionally skip the "vitastor:" prefix if provided */
|
||||
+ if (STRPREFIX(colonstr, "vitastor:"))
|
||||
+ colonstr += strlen("vitastor:");
|
||||
+
|
||||
+ options = g_strdup(colonstr);
|
||||
+
|
||||
+ p = options;
|
||||
+ while (*p) {
|
||||
+ /* find : delimiter or end of string */
|
||||
+ for (e = p; *e && *e != ':'; ++e) {
|
||||
+ if (*e == '\\') {
|
||||
+ e++;
|
||||
+ if (*e == '\0')
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+ if (*e == '\0') {
|
||||
+ next = e; /* last kv pair */
|
||||
+ } else {
|
||||
+ next = e + 1;
|
||||
+ *e = '\0';
|
||||
+ }
|
||||
+
|
||||
+ if (STRPREFIX(p, "image=")) {
|
||||
+ src->path = g_strdup(p + strlen("image="));
|
||||
+ } else if (STRPREFIX(p, "etcd-prefix=")) {
|
||||
+ src->query = g_strdup(p + strlen("etcd-prefix="));
|
||||
+ } else if (STRPREFIX(p, "config-path=")) {
|
||||
+ src->configFile = g_strdup(p + strlen("config-path="));
|
||||
+ } else if (STRPREFIX(p, "etcd-host=")) {
|
||||
+ char *h, *sep;
|
||||
+
|
||||
+ h = p + strlen("etcd-host=");
|
||||
+ while (h < e) {
|
||||
+ for (sep = h; sep < e; ++sep) {
|
||||
+ if (*sep == '\\' && (sep[1] == ',' ||
|
||||
+ sep[1] == ';' ||
|
||||
+ sep[1] == ' ')) {
|
||||
+ *sep = '\0';
|
||||
+ sep += 2;
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ if (virStorageSourceRBDAddHost(src, h) < 0)
|
||||
+ return -1;
|
||||
+
|
||||
+ h = sep;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ p = next;
|
||||
+ }
|
||||
+
|
||||
+ if (!src->path) {
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+
|
||||
static int
|
||||
virStorageSourceParseNBDColonString(const char *nbdstr,
|
||||
virStorageSource *src)
|
||||
@@ -399,6 +468,11 @@ virStorageSourceParseBackingColon(virStorageSource *src,
|
||||
return -1;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
+ if (virStorageSourceParseVitastorColonString(path, src) < 0)
|
||||
+ return -1;
|
||||
+ break;
|
||||
+
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_LAST:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
@@ -975,6 +1049,54 @@ virStorageSourceParseBackingJSONRBD(virStorageSource *src,
|
||||
return 0;
|
||||
}
|
||||
|
||||
+static int
|
||||
+virStorageSourceParseBackingJSONVitastor(virStorageSource *src,
|
||||
+ virJSONValue *json,
|
||||
+ const char *jsonstr G_GNUC_UNUSED,
|
||||
+ int opaque G_GNUC_UNUSED)
|
||||
+{
|
||||
+ const char *filename;
|
||||
+ const char *image = virJSONValueObjectGetString(json, "image");
|
||||
+ const char *conf = virJSONValueObjectGetString(json, "config-path");
|
||||
+ const char *etcd_prefix = virJSONValueObjectGetString(json, "etcd-prefix");
|
||||
+ virJSONValue *servers = virJSONValueObjectGetArray(json, "server");
|
||||
+ size_t nservers;
|
||||
+ size_t i;
|
||||
+
|
||||
+ src->type = VIR_STORAGE_TYPE_NETWORK;
|
||||
+ src->protocol = VIR_STORAGE_NET_PROTOCOL_VITASTOR;
|
||||
+
|
||||
+ /* legacy syntax passed via 'filename' option */
|
||||
+ if ((filename = virJSONValueObjectGetString(json, "filename")))
|
||||
+ return virStorageSourceParseVitastorColonString(filename, src);
|
||||
+
|
||||
+ if (!image) {
|
||||
+ virReportError(VIR_ERR_INVALID_ARG, "%s",
|
||||
+ _("missing image name in Vitastor backing volume "
|
||||
+ "JSON specification"));
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ src->path = g_strdup(image);
|
||||
+ src->configFile = g_strdup(conf);
|
||||
+ src->query = g_strdup(etcd_prefix);
|
||||
+
|
||||
+ if (servers) {
|
||||
+ nservers = virJSONValueArraySize(servers);
|
||||
+
|
||||
+ src->hosts = g_new0(virStorageNetHostDef, nservers);
|
||||
+ src->nhosts = nservers;
|
||||
+
|
||||
+ for (i = 0; i < nservers; i++) {
|
||||
+ if (virStorageSourceParseBackingJSONInetSocketAddress(src->hosts + i,
|
||||
+ virJSONValueArrayGet(servers, i)) < 0)
|
||||
+ return -1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
static int
|
||||
virStorageSourceParseBackingJSONRaw(virStorageSource *src,
|
||||
virJSONValue *json,
|
||||
@@ -1152,6 +1274,7 @@ static const struct virStorageSourceJSONDriverParser jsonParsers[] = {
|
||||
{"sheepdog", false, virStorageSourceParseBackingJSONSheepdog, 0},
|
||||
{"ssh", false, virStorageSourceParseBackingJSONSSH, 0},
|
||||
{"rbd", false, virStorageSourceParseBackingJSONRBD, 0},
|
||||
+ {"vitastor", false, virStorageSourceParseBackingJSONVitastor, 0},
|
||||
{"raw", true, virStorageSourceParseBackingJSONRaw, 0},
|
||||
{"nfs", false, virStorageSourceParseBackingJSONNFS, 0},
|
||||
{"vxhs", false, virStorageSourceParseBackingJSONVxHS, 0},
|
||||
diff --git a/src/test/test_driver.c b/src/test/test_driver.c
|
||||
index 25335d9002..cf54069fbe 100644
|
||||
--- a/src/test/test_driver.c
|
||||
+++ b/src/test/test_driver.c
|
||||
@@ -7340,6 +7340,7 @@ testStorageVolumeTypeForPool(int pooltype)
|
||||
case VIR_STORAGE_POOL_ISCSI_DIRECT:
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
return VIR_STORAGE_VOL_NETWORK;
|
||||
case VIR_STORAGE_POOL_LOGICAL:
|
||||
case VIR_STORAGE_POOL_DISK:
|
||||
diff --git a/tests/storagepoolcapsschemadata/poolcaps-fs.xml b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
|
||||
index eee75af746..8bd0a57bdd 100644
|
||||
--- a/tests/storagepoolcapsschemadata/poolcaps-fs.xml
|
||||
+++ b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
|
||||
@@ -204,4 +204,11 @@
|
||||
</enum>
|
||||
</volOptions>
|
||||
</pool>
|
||||
+ <pool type='vitastor' supported='no'>
|
||||
+ <volOptions>
|
||||
+ <defaultFormat type='raw'/>
|
||||
+ <enum name='targetFormatType'>
|
||||
+ </enum>
|
||||
+ </volOptions>
|
||||
+ </pool>
|
||||
</storagepoolCapabilities>
|
||||
diff --git a/tests/storagepoolcapsschemadata/poolcaps-full.xml b/tests/storagepoolcapsschemadata/poolcaps-full.xml
|
||||
index 805950a937..852df0de16 100644
|
||||
--- a/tests/storagepoolcapsschemadata/poolcaps-full.xml
|
||||
+++ b/tests/storagepoolcapsschemadata/poolcaps-full.xml
|
||||
@@ -204,4 +204,11 @@
|
||||
</enum>
|
||||
</volOptions>
|
||||
</pool>
|
||||
+ <pool type='vitastor' supported='yes'>
|
||||
+ <volOptions>
|
||||
+ <defaultFormat type='raw'/>
|
||||
+ <enum name='targetFormatType'>
|
||||
+ </enum>
|
||||
+ </volOptions>
|
||||
+ </pool>
|
||||
</storagepoolCapabilities>
|
||||
diff --git a/tests/storagepoolxml2argvtest.c b/tests/storagepoolxml2argvtest.c
|
||||
index d5c2531ab8..b19308ac38 100644
|
||||
--- a/tests/storagepoolxml2argvtest.c
|
||||
+++ b/tests/storagepoolxml2argvtest.c
|
||||
@@ -57,6 +57,7 @@ testCompareXMLToArgvFiles(bool shouldFail,
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_ZFS:
|
||||
case VIR_STORAGE_POOL_VSTORAGE:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
default:
|
||||
VIR_TEST_DEBUG("pool type '%s' has no xml2argv test", defTypeStr);
|
||||
diff --git a/tools/virsh-pool.c b/tools/virsh-pool.c
|
||||
index 2010ef1356..072e2ff9e8 100644
|
||||
--- a/tools/virsh-pool.c
|
||||
+++ b/tools/virsh-pool.c
|
||||
@@ -1187,6 +1187,9 @@ cmdPoolList(vshControl *ctl, const vshCmd *cmd G_GNUC_UNUSED)
|
||||
case VIR_STORAGE_POOL_VSTORAGE:
|
||||
flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE;
|
||||
break;
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
+ flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR;
|
||||
+ break;
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
break;
|
||||
}
|
172
patches/pve-qemu-10.0-vitastor.patch
Normal file
172
patches/pve-qemu-10.0-vitastor.patch
Normal file
@@ -0,0 +1,172 @@
|
||||
Index: pve-qemu-kvm-10.0.2/block/meson.build
|
||||
===================================================================
|
||||
--- pve-qemu-kvm-10.0.2.orig/block/meson.build
|
||||
+++ pve-qemu-kvm-10.0.2/block/meson.build
|
||||
@@ -126,6 +126,7 @@ foreach m : [
|
||||
[libnfs, 'nfs', files('nfs.c')],
|
||||
[libssh, 'ssh', files('ssh.c')],
|
||||
[rbd, 'rbd', files('rbd.c')],
|
||||
+ [vitastor, 'vitastor', files('vitastor.c')],
|
||||
]
|
||||
if m[0].found()
|
||||
module_ss = ss.source_set()
|
||||
Index: pve-qemu-kvm-10.0.2/meson.build
|
||||
===================================================================
|
||||
--- pve-qemu-kvm-10.0.2.orig/meson.build
|
||||
+++ pve-qemu-kvm-10.0.2/meson.build
|
||||
@@ -1622,6 +1622,26 @@ if not get_option('rbd').auto() or have_
|
||||
endif
|
||||
endif
|
||||
|
||||
+vitastor = not_found
|
||||
+if not get_option('vitastor').auto() or have_block
|
||||
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
||||
+ required: get_option('vitastor'))
|
||||
+ if libvitastor_client.found()
|
||||
+ if cc.links('''
|
||||
+ #include <vitastor_c.h>
|
||||
+ int main(void) {
|
||||
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
+ return 0;
|
||||
+ }''', dependencies: libvitastor_client)
|
||||
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
||||
+ elif get_option('vitastor').enabled()
|
||||
+ error('could not link libvitastor_client')
|
||||
+ else
|
||||
+ warning('could not link libvitastor_client, disabling')
|
||||
+ endif
|
||||
+ endif
|
||||
+endif
|
||||
+
|
||||
glusterfs = not_found
|
||||
glusterfs_ftruncate_has_stat = false
|
||||
glusterfs_iocb_has_stat = false
|
||||
@@ -2514,6 +2534,7 @@ endif
|
||||
config_host_data.set('CONFIG_OPENGL', opengl.found())
|
||||
config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
|
||||
config_host_data.set('CONFIG_RBD', rbd.found())
|
||||
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
||||
config_host_data.set('CONFIG_RDMA', rdma.found())
|
||||
config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
|
||||
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
|
||||
@@ -4812,6 +4833,7 @@ summary_info += {'fdt support': fd
|
||||
summary_info += {'libcap-ng support': libcap_ng}
|
||||
summary_info += {'bpf support': libbpf}
|
||||
summary_info += {'rbd support': rbd}
|
||||
+summary_info += {'vitastor support': vitastor}
|
||||
summary_info += {'smartcard support': cacard}
|
||||
summary_info += {'U2F support': u2f}
|
||||
summary_info += {'libusb': libusb}
|
||||
Index: pve-qemu-kvm-10.0.2/meson_options.txt
|
||||
===================================================================
|
||||
--- pve-qemu-kvm-10.0.2.orig/meson_options.txt
|
||||
+++ pve-qemu-kvm-10.0.2/meson_options.txt
|
||||
@@ -202,6 +202,8 @@ option('pvg', type: 'feature', value: 'a
|
||||
description: 'macOS paravirtualized graphics support')
|
||||
option('rbd', type : 'feature', value : 'auto',
|
||||
description: 'Ceph block device driver')
|
||||
+option('vitastor', type : 'feature', value : 'auto',
|
||||
+ description: 'Vitastor block device driver')
|
||||
option('opengl', type : 'feature', value : 'auto',
|
||||
description: 'OpenGL support')
|
||||
option('rdma', type : 'feature', value : 'auto',
|
||||
Index: pve-qemu-kvm-10.0.2/qapi/block-core.json
|
||||
===================================================================
|
||||
--- pve-qemu-kvm-10.0.2.orig/qapi/block-core.json
|
||||
+++ pve-qemu-kvm-10.0.2/qapi/block-core.json
|
||||
@@ -3599,7 +3599,7 @@
|
||||
'raw', 'rbd',
|
||||
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
||||
'pbs',
|
||||
- 'ssh', 'throttle', 'vdi', 'vhdx',
|
||||
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
|
||||
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
|
||||
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
|
||||
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
|
||||
@@ -4725,6 +4725,28 @@
|
||||
'*server': ['InetSocketAddressBase'] } }
|
||||
|
||||
##
|
||||
+# @BlockdevOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific block device options for vitastor
|
||||
+#
|
||||
+# @image: Image name
|
||||
+# @inode: Inode number
|
||||
+# @pool: Pool ID
|
||||
+# @size: Desired image size in bytes
|
||||
+# @config-path: Path to Vitastor configuration
|
||||
+# @etcd-host: etcd connection address(es)
|
||||
+# @etcd-prefix: etcd key/value prefix
|
||||
+##
|
||||
+{ 'struct': 'BlockdevOptionsVitastor',
|
||||
+ 'data': { '*inode': 'uint64',
|
||||
+ '*pool': 'uint64',
|
||||
+ '*size': 'uint64',
|
||||
+ '*image': 'str',
|
||||
+ '*config-path': 'str',
|
||||
+ '*etcd-host': 'str',
|
||||
+ '*etcd-prefix': 'str' } }
|
||||
+
|
||||
+##
|
||||
# @ReplicationMode:
|
||||
#
|
||||
# An enumeration of replication modes.
|
||||
@@ -5194,6 +5216,7 @@
|
||||
'throttle': 'BlockdevOptionsThrottle',
|
||||
'vdi': 'BlockdevOptionsGenericFormat',
|
||||
'vhdx': 'BlockdevOptionsGenericFormat',
|
||||
+ 'vitastor': 'BlockdevOptionsVitastor',
|
||||
'virtio-blk-vfio-pci':
|
||||
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
|
||||
'if': 'CONFIG_BLKIO' },
|
||||
@@ -5674,6 +5697,20 @@
|
||||
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
||||
|
||||
##
|
||||
+# @BlockdevCreateOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific image creation options for Vitastor.
|
||||
+#
|
||||
+# @location: Where to store the new image file. This location cannot
|
||||
+# point to a snapshot.
|
||||
+#
|
||||
+# @size: Size of the virtual disk in bytes
|
||||
+##
|
||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
||||
+ 'size': 'size' } }
|
||||
+
|
||||
+##
|
||||
# @BlockdevVmdkSubformat:
|
||||
#
|
||||
# Subformat options for VMDK images
|
||||
@@ -5895,6 +5932,7 @@
|
||||
'ssh': 'BlockdevCreateOptionsSsh',
|
||||
'vdi': 'BlockdevCreateOptionsVdi',
|
||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
||||
'vmdk': 'BlockdevCreateOptionsVmdk',
|
||||
'vpc': 'BlockdevCreateOptionsVpc'
|
||||
} }
|
||||
Index: pve-qemu-kvm-10.0.2/scripts/meson-buildoptions.sh
|
||||
===================================================================
|
||||
--- pve-qemu-kvm-10.0.2.orig/scripts/meson-buildoptions.sh
|
||||
+++ pve-qemu-kvm-10.0.2/scripts/meson-buildoptions.sh
|
||||
@@ -175,6 +175,7 @@ meson_options_help() {
|
||||
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
|
||||
printf "%s\n" ' qpl Query Processing Library support'
|
||||
printf "%s\n" ' rbd Ceph block device driver'
|
||||
+ printf "%s\n" ' vitastor Vitastor block device driver'
|
||||
printf "%s\n" ' rdma Enable RDMA-based migration'
|
||||
printf "%s\n" ' replication replication support'
|
||||
printf "%s\n" ' rust Rust support'
|
||||
@@ -458,6 +459,8 @@ _meson_option_parse() {
|
||||
--disable-qpl) printf "%s" -Dqpl=disabled ;;
|
||||
--enable-rbd) printf "%s" -Drbd=enabled ;;
|
||||
--disable-rbd) printf "%s" -Drbd=disabled ;;
|
||||
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
|
||||
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
|
||||
--enable-rdma) printf "%s" -Drdma=enabled ;;
|
||||
--disable-rdma) printf "%s" -Drdma=disabled ;;
|
||||
--enable-relocatable) printf "%s" -Drelocatable=true ;;
|
172
patches/qemu-10.0-vitastor.patch
Normal file
172
patches/qemu-10.0-vitastor.patch
Normal file
@@ -0,0 +1,172 @@
|
||||
diff --git a/block/meson.build b/block/meson.build
|
||||
index 34b1b2a306..24ca0f1e52 100644
|
||||
--- a/block/meson.build
|
||||
+++ b/block/meson.build
|
||||
@@ -114,6 +114,7 @@ foreach m : [
|
||||
[libnfs, 'nfs', files('nfs.c')],
|
||||
[libssh, 'ssh', files('ssh.c')],
|
||||
[rbd, 'rbd', files('rbd.c')],
|
||||
+ [vitastor, 'vitastor', files('vitastor.c')],
|
||||
]
|
||||
if m[0].found()
|
||||
module_ss = ss.source_set()
|
||||
diff --git a/meson.build b/meson.build
|
||||
index 41f68d3806..29eaed9ba4 100644
|
||||
--- a/meson.build
|
||||
+++ b/meson.build
|
||||
@@ -1622,6 +1622,26 @@ if not get_option('rbd').auto() or have_block
|
||||
endif
|
||||
endif
|
||||
|
||||
+vitastor = not_found
|
||||
+if not get_option('vitastor').auto() or have_block
|
||||
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
||||
+ required: get_option('vitastor'))
|
||||
+ if libvitastor_client.found()
|
||||
+ if cc.links('''
|
||||
+ #include <vitastor_c.h>
|
||||
+ int main(void) {
|
||||
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
+ return 0;
|
||||
+ }''', dependencies: libvitastor_client)
|
||||
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
||||
+ elif get_option('vitastor').enabled()
|
||||
+ error('could not link libvitastor_client')
|
||||
+ else
|
||||
+ warning('could not link libvitastor_client, disabling')
|
||||
+ endif
|
||||
+ endif
|
||||
+endif
|
||||
+
|
||||
glusterfs = not_found
|
||||
glusterfs_ftruncate_has_stat = false
|
||||
glusterfs_iocb_has_stat = false
|
||||
@@ -2506,6 +2526,7 @@ endif
|
||||
config_host_data.set('CONFIG_OPENGL', opengl.found())
|
||||
config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
|
||||
config_host_data.set('CONFIG_RBD', rbd.found())
|
||||
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
||||
config_host_data.set('CONFIG_RDMA', rdma.found())
|
||||
config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
|
||||
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
|
||||
@@ -4813,6 +4834,7 @@ summary_info += {'fdt support': fdt_opt == 'internal' ? 'internal' : fdt}
|
||||
summary_info += {'libcap-ng support': libcap_ng}
|
||||
summary_info += {'bpf support': libbpf}
|
||||
summary_info += {'rbd support': rbd}
|
||||
+summary_info += {'vitastor support': vitastor}
|
||||
summary_info += {'smartcard support': cacard}
|
||||
summary_info += {'U2F support': u2f}
|
||||
summary_info += {'libusb': libusb}
|
||||
diff --git a/meson_options.txt b/meson_options.txt
|
||||
index 59d973bca0..a3e7123980 100644
|
||||
--- a/meson_options.txt
|
||||
+++ b/meson_options.txt
|
||||
@@ -202,6 +202,8 @@ option('pvg', type: 'feature', value: 'auto',
|
||||
description: 'macOS paravirtualized graphics support')
|
||||
option('rbd', type : 'feature', value : 'auto',
|
||||
description: 'Ceph block device driver')
|
||||
+option('vitastor', type : 'feature', value : 'auto',
|
||||
+ description: 'Vitastor block device driver')
|
||||
option('opengl', type : 'feature', value : 'auto',
|
||||
description: 'OpenGL support')
|
||||
option('rdma', type : 'feature', value : 'auto',
|
||||
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
||||
index b1937780e1..a511193620 100644
|
||||
--- a/qapi/block-core.json
|
||||
+++ b/qapi/block-core.json
|
||||
@@ -3216,7 +3216,7 @@
|
||||
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
|
||||
'raw', 'rbd',
|
||||
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
||||
- 'ssh', 'throttle', 'vdi', 'vhdx',
|
||||
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
|
||||
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
|
||||
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
|
||||
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
|
||||
@@ -4299,6 +4299,28 @@
|
||||
'*key-secret': 'str',
|
||||
'*server': ['InetSocketAddressBase'] } }
|
||||
|
||||
+##
|
||||
+# @BlockdevOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific block device options for vitastor
|
||||
+#
|
||||
+# @image: Image name
|
||||
+# @inode: Inode number
|
||||
+# @pool: Pool ID
|
||||
+# @size: Desired image size in bytes
|
||||
+# @config-path: Path to Vitastor configuration
|
||||
+# @etcd-host: etcd connection address(es)
|
||||
+# @etcd-prefix: etcd key/value prefix
|
||||
+##
|
||||
+{ 'struct': 'BlockdevOptionsVitastor',
|
||||
+ 'data': { '*inode': 'uint64',
|
||||
+ '*pool': 'uint64',
|
||||
+ '*size': 'uint64',
|
||||
+ '*image': 'str',
|
||||
+ '*config-path': 'str',
|
||||
+ '*etcd-host': 'str',
|
||||
+ '*etcd-prefix': 'str' } }
|
||||
+
|
||||
##
|
||||
# @ReplicationMode:
|
||||
#
|
||||
@@ -4767,6 +4789,7 @@
|
||||
'throttle': 'BlockdevOptionsThrottle',
|
||||
'vdi': 'BlockdevOptionsGenericFormat',
|
||||
'vhdx': 'BlockdevOptionsGenericFormat',
|
||||
+ 'vitastor': 'BlockdevOptionsVitastor',
|
||||
'virtio-blk-vfio-pci':
|
||||
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
|
||||
'if': 'CONFIG_BLKIO' },
|
||||
@@ -5240,6 +5263,20 @@
|
||||
'*cluster-size' : 'size',
|
||||
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
||||
|
||||
+##
|
||||
+# @BlockdevCreateOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific image creation options for Vitastor.
|
||||
+#
|
||||
+# @location: Where to store the new image file. This location cannot
|
||||
+# point to a snapshot.
|
||||
+#
|
||||
+# @size: Size of the virtual disk in bytes
|
||||
+##
|
||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
||||
+ 'size': 'size' } }
|
||||
+
|
||||
##
|
||||
# @BlockdevVmdkSubformat:
|
||||
#
|
||||
@@ -5462,6 +5499,7 @@
|
||||
'ssh': 'BlockdevCreateOptionsSsh',
|
||||
'vdi': 'BlockdevCreateOptionsVdi',
|
||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
||||
'vmdk': 'BlockdevCreateOptionsVmdk',
|
||||
'vpc': 'BlockdevCreateOptionsVpc'
|
||||
} }
|
||||
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
|
||||
index 3e8e00852b..45aff3b6a9 100644
|
||||
--- a/scripts/meson-buildoptions.sh
|
||||
+++ b/scripts/meson-buildoptions.sh
|
||||
@@ -175,6 +175,7 @@ meson_options_help() {
|
||||
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
|
||||
printf "%s\n" ' qpl Query Processing Library support'
|
||||
printf "%s\n" ' rbd Ceph block device driver'
|
||||
+ printf "%s\n" ' vitastor Vitastor block device driver'
|
||||
printf "%s\n" ' rdma Enable RDMA-based migration'
|
||||
printf "%s\n" ' replication replication support'
|
||||
printf "%s\n" ' rust Rust support'
|
||||
@@ -458,6 +459,8 @@ _meson_option_parse() {
|
||||
--disable-qpl) printf "%s" -Dqpl=disabled ;;
|
||||
--enable-rbd) printf "%s" -Drbd=enabled ;;
|
||||
--disable-rbd) printf "%s" -Drbd=disabled ;;
|
||||
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
|
||||
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
|
||||
--enable-rdma) printf "%s" -Drdma=enabled ;;
|
||||
--disable-rdma) printf "%s" -Drdma=disabled ;;
|
||||
--enable-relocatable) printf "%s" -Drelocatable=true ;;
|
@@ -7,22 +7,24 @@ set -e
|
||||
VITASTOR=$(dirname $0)
|
||||
VITASTOR=$(realpath "$VITASTOR/..")
|
||||
|
||||
EL=$(rpm --eval '%dist')
|
||||
if [ "$EL" = ".el8" ]; then
|
||||
REL=$(rpm --eval '%dist')
|
||||
REL=${REL##.}
|
||||
if [ "$REL" = "el8" ]; then
|
||||
# CentOS 8
|
||||
. /opt/rh/gcc-toolset-9/enable
|
||||
elif [ "$EL" = ".el7" ]; then
|
||||
elif [ "$REL" = "el7" ]; then
|
||||
# CentOS 7
|
||||
. /opt/rh/devtoolset-9/enable
|
||||
fi
|
||||
cd ~/rpmbuild/SPECS
|
||||
rpmbuild -bp fio.spec
|
||||
cd $VITASTOR
|
||||
VER=$(grep ^Version: rpm/vitastor-el7.spec | awk '{print $2}')
|
||||
VER=$(grep ^Version: rpm/vitastor-$REL.spec | awk '{print $2}')
|
||||
rm -rf fio
|
||||
ln -s ~/rpmbuild/BUILD/fio*/ fio
|
||||
sh copy-fio-includes.sh
|
||||
rm fio
|
||||
mv fio-copy fio
|
||||
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||
tar --transform "s#^#vitastor-$VER/#" --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-$VER$(rpm --eval '%dist').tar.gz *
|
||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-$REL.spec
|
||||
tar --transform "s#^#vitastor-$VER/#" --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-$VER.$REL.tar.gz $(ls | grep -v packages)
|
||||
|
16
rpm/vitastor-build.sh
Executable file
16
rpm/vitastor-build.sh
Executable file
@@ -0,0 +1,16 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e -x
|
||||
REL=$(rpm --eval '%dist')
|
||||
REL=${REL##.}
|
||||
cd /root/vitastor/rpm
|
||||
./build-tarball.sh
|
||||
VER=$(grep ^Version: vitastor-$REL.spec | awk '{print $2}')
|
||||
cp /root/vitastor-$VER.$REL.tar.gz ~/rpmbuild/SOURCES
|
||||
cp vitastor-$REL.spec ~/rpmbuild/SPECS/vitastor.spec
|
||||
cd ~/rpmbuild/SPECS/
|
||||
rpmbuild -ba vitastor.spec
|
||||
mkdir -p /root/vitastor/packages/vitastor-$REL
|
||||
rm -rf /root/vitastor/packages/vitastor-$REL/*
|
||||
cp ~/rpmbuild/RPMS/*/*vitastor* /root/vitastor/packages/vitastor-$REL/
|
||||
cp ~/rpmbuild/SRPMS/vitastor* /root/vitastor/packages/vitastor-$REL/
|
@@ -1,5 +1,8 @@
|
||||
# Build packages for CentOS 7 inside a container
|
||||
# cd ..; podman build -t vitastor-el7 -v `pwd`/packages:/root/packages -f rpm/vitastor-el7.Dockerfile .
|
||||
# cd ..
|
||||
# docker build -t vitastor-buildenv:el7 -f rpm/vitastor-el7.Dockerfile .
|
||||
# docker run -i --rm -v ./:/root/vitastor vitastor-buildenv:el7 /root/vitastor/rpm/vitastor-build.sh
|
||||
|
||||
# localedef -i ru_RU -f UTF-8 ru_RU.UTF-8
|
||||
|
||||
FROM centos:7
|
||||
@@ -7,7 +10,9 @@ FROM centos:7
|
||||
WORKDIR /root
|
||||
|
||||
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
|
||||
RUN sed -i 's/^mirrorlist=/#mirrorlist=/; s!#baseurl=http://mirror.centos.org/centos/\$releasever!baseurl=http://vault.centos.org/7.9.2009!' /etc/yum.repos.d/*.repo
|
||||
RUN yum -y --enablerepo=extras install centos-release-scl epel-release yum-utils rpm-build
|
||||
RUN perl -i -pe 's!mirrorlist=!#mirrorlist=!s; s!#\s*baseurl=http://mirror.centos.org!baseurl=http://vault.centos.org!' /etc/yum.repos.d/CentOS-SCLo-scl*.repo
|
||||
RUN yum -y install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm
|
||||
RUN yum -y install devtoolset-9-gcc-c++ devtoolset-9-libatomic-devel gcc make cmake gperftools-devel \
|
||||
fio rh-nodejs12 jerasure-devel libisa-l-devel gf-complete-devel rdma-core-devel libnl3-devel
|
||||
@@ -16,32 +21,3 @@ RUN rpm --nomd5 -i fio*.src.rpm
|
||||
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
|
||||
RUN cd ~/rpmbuild/SPECS && yum-builddep -y fio.spec
|
||||
RUN yum -y install cmake3
|
||||
|
||||
ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root
|
||||
|
||||
RUN set -e; \
|
||||
rpm -i liburing*.src.rpm; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
. /opt/rh/devtoolset-9/enable; \
|
||||
rpmbuild -ba liburing.spec; \
|
||||
mkdir -p /root/packages/liburing-el7; \
|
||||
rm -rf /root/packages/liburing-el7/*; \
|
||||
cp ~/rpmbuild/RPMS/*/liburing* /root/packages/liburing-el7/; \
|
||||
cp ~/rpmbuild/SRPMS/liburing* /root/packages/liburing-el7/
|
||||
|
||||
RUN rpm -i `ls /root/packages/liburing-el7/liburing-*.x86_64.rpm | grep -v debug`
|
||||
|
||||
ADD . /root/vitastor
|
||||
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
VER=$(grep ^Version: vitastor-el7.spec | awk '{print $2}'); \
|
||||
cp /root/vitastor-$VER.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
mkdir -p /root/packages/vitastor-el7; \
|
||||
rm -rf /root/packages/vitastor-el7/*; \
|
||||
cp ~/rpmbuild/RPMS/*/*vitastor* /root/packages/vitastor-el7/; \
|
||||
cp ~/rpmbuild/SRPMS/vitastor* /root/packages/vitastor-el7/
|
||||
|
@@ -1,13 +1,12 @@
|
||||
Name: vitastor
|
||||
Version: 2.2.2
|
||||
Version: 2.3.0
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-2.2.2.el7.tar.gz
|
||||
Source0: vitastor-2.3.0.el7.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
BuildRequires: devtoolset-9-gcc-c++
|
||||
BuildRequires: rh-nodejs12
|
||||
@@ -35,8 +34,6 @@ size with configurable redundancy (replication or erasure codes/XOR).
|
||||
Summary: Vitastor - OSD
|
||||
Requires: libJerasure2
|
||||
Requires: libisa-l
|
||||
Requires: liburing >= 0.6
|
||||
Requires: liburing < 2
|
||||
Requires: vitastor-client = %{version}-%{release}
|
||||
Requires: util-linux
|
||||
Requires: parted
|
||||
@@ -60,8 +57,6 @@ scheduling cluster-level operations.
|
||||
|
||||
%package -n vitastor-client
|
||||
Summary: Vitastor - client
|
||||
Requires: liburing >= 0.6
|
||||
Requires: liburing < 2
|
||||
|
||||
|
||||
%description -n vitastor-client
|
||||
@@ -82,7 +77,7 @@ Vitastor library headers for development.
|
||||
Summary: Vitastor - fio drivers
|
||||
Group: Development/Libraries
|
||||
Requires: vitastor-client = %{version}-%{release}
|
||||
Requires: fio = 3.7-1.el7
|
||||
Requires: fio = 3.7-2.el7
|
||||
|
||||
|
||||
%description -n vitastor-fio
|
||||
@@ -169,6 +164,7 @@ chown vitastor:vitastor /var/lib/vitastor
|
||||
|
||||
%files -n vitastor-client
|
||||
%_bindir/vitastor-nbd
|
||||
%_bindir/vitastor-ublk
|
||||
%_bindir/vitastor-nfs
|
||||
%_bindir/vitastor-cli
|
||||
%_bindir/vitastor-rm
|
||||
|
@@ -1,5 +1,7 @@
|
||||
# Build packages for CentOS 8 inside a container
|
||||
# cd ..; podman build -t vitastor-el8 -v `pwd`/packages:/root/packages -f rpm/vitastor-el8.Dockerfile .
|
||||
# cd ..
|
||||
# docker build -t vitastor-buildenv:el8 -f rpm/vitastor-el8.Dockerfile .
|
||||
# docker run -i --rm -v ./:/root/vitastor vitastor-buildenv:el8 /root/vitastor/rpm/vitastor-build.sh
|
||||
|
||||
FROM centos:8
|
||||
|
||||
@@ -15,32 +17,3 @@ RUN dnf -y install gcc-toolset-9 gcc-toolset-9-gcc-c++ gperftools-devel \
|
||||
RUN dnf download --source fio
|
||||
RUN rpm --nomd5 -i fio*.src.rpm
|
||||
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=powertools --spec fio.spec
|
||||
|
||||
ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root
|
||||
|
||||
RUN set -e; \
|
||||
rpm -i liburing*.src.rpm; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
. /opt/rh/gcc-toolset-9/enable; \
|
||||
rpmbuild -ba liburing.spec; \
|
||||
mkdir -p /root/packages/liburing-el8; \
|
||||
rm -rf /root/packages/liburing-el8/*; \
|
||||
cp ~/rpmbuild/RPMS/*/liburing* /root/packages/liburing-el8/; \
|
||||
cp ~/rpmbuild/SRPMS/liburing* /root/packages/liburing-el8/
|
||||
|
||||
RUN rpm -i `ls /root/packages/liburing-el8/liburing-*.x86_64.rpm | grep -v debug`
|
||||
|
||||
ADD . /root/vitastor
|
||||
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
VER=$(grep ^Version: vitastor-el8.spec | awk '{print $2}'); \
|
||||
cp /root/vitastor-$VER.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
mkdir -p /root/packages/vitastor-el8; \
|
||||
rm -rf /root/packages/vitastor-el8/*; \
|
||||
cp ~/rpmbuild/RPMS/*/*vitastor* /root/packages/vitastor-el8/; \
|
||||
cp ~/rpmbuild/SRPMS/vitastor* /root/packages/vitastor-el8/
|
||||
|
@@ -1,13 +1,12 @@
|
||||
Name: vitastor
|
||||
Version: 2.2.2
|
||||
Version: 2.3.0
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-2.2.2.el8.tar.gz
|
||||
Source0: vitastor-2.3.0.el8.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
BuildRequires: gcc-toolset-9-gcc-c++
|
||||
BuildRequires: nodejs >= 10
|
||||
@@ -34,8 +33,6 @@ size with configurable redundancy (replication or erasure codes/XOR).
|
||||
Summary: Vitastor - OSD
|
||||
Requires: libJerasure2
|
||||
Requires: libisa-l
|
||||
Requires: liburing >= 0.6
|
||||
Requires: liburing < 2
|
||||
Requires: vitastor-client = %{version}-%{release}
|
||||
Requires: util-linux
|
||||
Requires: parted
|
||||
@@ -58,8 +55,6 @@ scheduling cluster-level operations.
|
||||
|
||||
%package -n vitastor-client
|
||||
Summary: Vitastor - client
|
||||
Requires: liburing >= 0.6
|
||||
Requires: liburing < 2
|
||||
|
||||
|
||||
%description -n vitastor-client
|
||||
@@ -80,7 +75,7 @@ Vitastor library headers for development.
|
||||
Summary: Vitastor - fio drivers
|
||||
Group: Development/Libraries
|
||||
Requires: vitastor-client = %{version}-%{release}
|
||||
Requires: fio = 3.7-3.el8
|
||||
Requires: fio = 3.19-3.el8
|
||||
|
||||
|
||||
%description -n vitastor-fio
|
||||
@@ -166,6 +161,7 @@ chown vitastor:vitastor /var/lib/vitastor
|
||||
|
||||
%files -n vitastor-client
|
||||
%_bindir/vitastor-nbd
|
||||
%_bindir/vitastor-ublk
|
||||
%_bindir/vitastor-nfs
|
||||
%_bindir/vitastor-cli
|
||||
%_bindir/vitastor-rm
|
||||
|
@@ -1,5 +1,7 @@
|
||||
# Build packages for AlmaLinux 9 inside a container
|
||||
# cd ..; podman build -t vitastor-el9 -v `pwd`/packages:/root/packages -f rpm/vitastor-el9.Dockerfile .
|
||||
# cd ..
|
||||
# docker build -t vitastor-buildenv:el9 -f rpm/vitastor-el9.Dockerfile .
|
||||
# docker run -i --rm -v ./:/root/vitastor vitastor-buildenv:el9 /root/vitastor/rpm/vitastor-build.sh
|
||||
|
||||
FROM almalinux:9
|
||||
|
||||
@@ -8,22 +10,7 @@ WORKDIR /root
|
||||
RUN sed -i 's/enabled=0/enabled=1/' /etc/yum.repos.d/*.repo
|
||||
RUN dnf -y install epel-release dnf-plugins-core
|
||||
RUN dnf -y install https://vitastor.io/rpms/centos/9/vitastor-release-1.0-1.el9.noarch.rpm
|
||||
RUN dnf -y install gcc-c++ gperftools-devel fio nodejs rpm-build jerasure-devel libisa-l-devel gf-complete-devel rdma-core-devel libarchive liburing-devel cmake libnl3-devel
|
||||
RUN dnf -y install gcc-c++ gperftools-devel fio nodejs rpm-build jerasure-devel libisa-l-devel gf-complete-devel rdma-core-devel libarchive cmake libnl3-devel
|
||||
RUN dnf download --source fio
|
||||
RUN rpm --nomd5 -i fio*.src.rpm
|
||||
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --spec fio.spec
|
||||
|
||||
ADD . /root/vitastor
|
||||
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
VER=$(grep ^Version: vitastor-el9.spec | awk '{print $2}'); \
|
||||
cp /root/vitastor-$VER.el9.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
mkdir -p /root/packages/vitastor-el9; \
|
||||
rm -rf /root/packages/vitastor-el9/*; \
|
||||
cp ~/rpmbuild/RPMS/*/*vitastor* /root/packages/vitastor-el9/; \
|
||||
cp ~/rpmbuild/SRPMS/vitastor* /root/packages/vitastor-el9/
|
||||
|
@@ -1,13 +1,12 @@
|
||||
Name: vitastor
|
||||
Version: 2.2.2
|
||||
Version: 2.3.0
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-2.2.2.el9.tar.gz
|
||||
Source0: vitastor-2.3.0.el9.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
BuildRequires: gcc-c++
|
||||
BuildRequires: nodejs >= 10
|
||||
@@ -159,6 +158,7 @@ chown vitastor:vitastor /var/lib/vitastor
|
||||
|
||||
%files -n vitastor-client
|
||||
%_bindir/vitastor-nbd
|
||||
%_bindir/vitastor-ublk
|
||||
%_bindir/vitastor-nfs
|
||||
%_bindir/vitastor-cli
|
||||
%_bindir/vitastor-rm
|
||||
|
@@ -12,20 +12,30 @@ set(WITH_QEMU false CACHE BOOL "Build QEMU driver inside Vitastor source tree")
|
||||
set(WITH_FIO true CACHE BOOL "Build FIO driver")
|
||||
set(QEMU_PLUGINDIR qemu CACHE STRING "QEMU plugin directory suffix (qemu-kvm on RHEL)")
|
||||
set(WITH_ASAN false CACHE BOOL "Build with AddressSanitizer")
|
||||
set(WITH_SYSTEM_LIBURING false CACHE BOOL "Use system liburing")
|
||||
if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
||||
if(EXISTS "/etc/debian_version")
|
||||
set(CMAKE_INSTALL_LIBDIR "lib/${CMAKE_LIBRARY_ARCHITECTURE}")
|
||||
endif()
|
||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||
endif()
|
||||
set(ENABLE_COVERAGE false CACHE BOOL "Enable code coverage")
|
||||
|
||||
add_definitions(-DVITASTOR_VERSION="2.2.2")
|
||||
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
|
||||
add_definitions(-DVITASTOR_VERSION="2.3.0")
|
||||
add_definitions(-D_GNU_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -fvisibility=hidden -I ${CMAKE_SOURCE_DIR}/src)
|
||||
add_link_options(-fno-omit-frame-pointer)
|
||||
if (${WITH_ASAN})
|
||||
add_definitions(-fsanitize=address)
|
||||
add_link_options(-fsanitize=address -fno-omit-frame-pointer)
|
||||
endif (${WITH_ASAN})
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fvisibility-inlines-hidden")
|
||||
set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} -fvisibility-inlines-hidden")
|
||||
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -fvisibility-inlines-hidden")
|
||||
|
||||
if (${ENABLE_COVERAGE})
|
||||
add_definitions(-coverage)
|
||||
add_link_options(-coverage)
|
||||
endif()
|
||||
|
||||
set(CMAKE_BUILD_TYPE RelWithDebInfo)
|
||||
string(REGEX REPLACE "([\\/\\-]O)[^ \t\r\n]*" "\\13" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
|
||||
@@ -49,7 +59,6 @@ endmacro(install_symlink)
|
||||
check_include_file("linux/nbd-netlink.h" HAVE_NBD_NETLINK_H)
|
||||
|
||||
find_package(PkgConfig)
|
||||
pkg_check_modules(LIBURING REQUIRED liburing)
|
||||
if (${WITH_QEMU})
|
||||
pkg_check_modules(GLIB REQUIRED glib-2.0)
|
||||
endif (${WITH_QEMU})
|
||||
@@ -66,13 +75,14 @@ if (RDMACM_LIBRARIES)
|
||||
add_definitions(-DWITH_RDMACM)
|
||||
endif (RDMACM_LIBRARIES)
|
||||
|
||||
add_custom_target(build_tests)
|
||||
add_custom_target(test
|
||||
COMMAND
|
||||
echo leak:tcmalloc > ${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt &&
|
||||
env LSAN_OPTIONS=suppressions=${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt ${CMAKE_CTEST_COMMAND}
|
||||
)
|
||||
add_dependencies(test build_tests)
|
||||
if (${WITH_SYSTEM_LIBURING})
|
||||
pkg_check_modules(LIBURING REQUIRED liburing>=2.10)
|
||||
include_directories(${LIBURING_INCLUDE_DIRS})
|
||||
else()
|
||||
include_directories(${CMAKE_SOURCE_DIR}/src/liburing/include)
|
||||
add_subdirectory(liburing)
|
||||
set(LIBURING_LIBRARIES uring)
|
||||
endif (${WITH_SYSTEM_LIBURING})
|
||||
|
||||
include_directories(
|
||||
../
|
||||
@@ -86,7 +96,6 @@ include_directories(
|
||||
${CMAKE_SOURCE_DIR}/src/test
|
||||
${CMAKE_SOURCE_DIR}/src/util
|
||||
/usr/include/jerasure
|
||||
${LIBURING_INCLUDE_DIRS}
|
||||
${IBVERBS_INCLUDE_DIRS}
|
||||
)
|
||||
|
||||
@@ -101,7 +110,7 @@ add_subdirectory(test)
|
||||
|
||||
### Install
|
||||
|
||||
install(TARGETS vitastor-osd vitastor-disk vitastor-nbd vitastor-nfs vitastor-cli vitastor-kv vitastor-kv-stress RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||
install(TARGETS vitastor-osd vitastor-disk vitastor-nbd vitastor-ublk vitastor-nfs vitastor-cli vitastor-kv vitastor-kv-stress RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||
install_symlink(vitastor-disk ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vitastor-dump-journal)
|
||||
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vitastor-rm)
|
||||
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vita)
|
||||
|
@@ -2,14 +2,17 @@ cmake_minimum_required(VERSION 2.8.12)
|
||||
|
||||
project(vitastor)
|
||||
|
||||
# libvitastor_blk.so
|
||||
add_library(vitastor_blk SHARED
|
||||
../util/allocator.cpp blockstore.cpp blockstore_impl.cpp blockstore_disk.cpp blockstore_init.cpp blockstore_open.cpp blockstore_journal.cpp blockstore_read.cpp
|
||||
blockstore_write.cpp blockstore_sync.cpp blockstore_stable.cpp blockstore_rollback.cpp blockstore_flush.cpp ../util/crc32c.c ../util/ringloop.cpp
|
||||
# libvitastor_blk.a
|
||||
add_library(vitastor_blk STATIC
|
||||
../util/allocator.cpp ../util/crc32c.c ../util/ringloop.cpp
|
||||
multilist.cpp blockstore_heap.cpp blockstore_disk.cpp
|
||||
blockstore.cpp blockstore_impl.cpp blockstore_init.cpp blockstore_open.cpp
|
||||
blockstore_flush.cpp blockstore_read.cpp blockstore_stable.cpp blockstore_sync.cpp blockstore_write.cpp
|
||||
)
|
||||
target_compile_options(vitastor_blk PUBLIC -fPIC)
|
||||
target_link_libraries(vitastor_blk
|
||||
${LIBURING_LIBRARIES}
|
||||
tcmalloc_minimal
|
||||
${ISAL_LIBRARIES}
|
||||
# for timerfd_manager
|
||||
vitastor_common
|
||||
)
|
||||
|
@@ -3,7 +3,7 @@
|
||||
|
||||
#include "blockstore_impl.h"
|
||||
|
||||
blockstore_t::blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd)
|
||||
blockstore_t::blockstore_t(blockstore_config_t & config, ring_loop_i *ringloop, timerfd_manager_t *tfd)
|
||||
{
|
||||
impl = new blockstore_impl_t(config, ringloop, tfd);
|
||||
}
|
||||
@@ -48,9 +48,9 @@ int blockstore_t::read_bitmap(object_id oid, uint64_t target_version, void *bitm
|
||||
return impl->read_bitmap(oid, target_version, bitmap, result_version);
|
||||
}
|
||||
|
||||
std::map<uint64_t, uint64_t> & blockstore_t::get_inode_space_stats()
|
||||
const std::map<uint64_t, uint64_t> & blockstore_t::get_inode_space_stats()
|
||||
{
|
||||
return impl->inode_space_stats;
|
||||
return impl->get_inode_space_stats();
|
||||
}
|
||||
|
||||
void blockstore_t::dump_diagnostics()
|
||||
@@ -82,8 +82,3 @@ uint32_t blockstore_t::get_bitmap_granularity()
|
||||
{
|
||||
return impl->get_bitmap_granularity();
|
||||
}
|
||||
|
||||
void blockstore_t::set_no_inode_stats(const std::vector<uint64_t> & pool_ids)
|
||||
{
|
||||
impl->set_no_inode_stats(pool_ids);
|
||||
}
|
||||
|
@@ -22,17 +22,20 @@
|
||||
#define DIRECT_IO_ALIGNMENT 512
|
||||
#endif
|
||||
|
||||
// Memory allocation alignment (page size is usually optimal)
|
||||
#ifndef MEM_ALIGNMENT
|
||||
#define MEM_ALIGNMENT 4096
|
||||
#endif
|
||||
|
||||
// Default block size is 128 KB, current allowed range is 4K - 128M
|
||||
#define DEFAULT_DATA_BLOCK_ORDER 17
|
||||
#define MIN_DATA_BLOCK_SIZE 4*1024
|
||||
#define MAX_DATA_BLOCK_SIZE 128*1024*1024
|
||||
#define DEFAULT_BITMAP_GRANULARITY 4096
|
||||
|
||||
#define MIN_JOURNAL_SIZE 1024*1024
|
||||
|
||||
// "VITAstor"
|
||||
#define BLOCKSTORE_META_MAGIC_V1 0x726F747341544956l
|
||||
#define BLOCKSTORE_META_FORMAT_V1 1
|
||||
#define BLOCKSTORE_META_FORMAT_V2 2
|
||||
#define BLOCKSTORE_META_FORMAT_HEAP 3
|
||||
|
||||
#define BS_OP_MIN 1
|
||||
#define BS_OP_READ 1
|
||||
#define BS_OP_WRITE 2
|
||||
@@ -42,13 +45,18 @@
|
||||
#define BS_OP_DELETE 6
|
||||
#define BS_OP_LIST 7
|
||||
#define BS_OP_ROLLBACK 8
|
||||
#define BS_OP_SYNC_STAB_ALL 9
|
||||
#define BS_OP_MAX 9
|
||||
#define BS_OP_MAX 8
|
||||
|
||||
#define BS_OP_PRIVATE_DATA_SIZE 256
|
||||
|
||||
/*
|
||||
|
||||
All operations may be submitted in any order, because reads only see completed writes,
|
||||
syncs only sync completed writes and writes don't depend on each other.
|
||||
|
||||
The only restriction is that the external code MUST NOT submit multiple writes for one
|
||||
object in parallel. This is a natural restriction because `version` numbers are used though.
|
||||
|
||||
Blockstore opcode documentation:
|
||||
|
||||
## BS_OP_READ / BS_OP_WRITE / BS_OP_WRITE_STABLE
|
||||
@@ -113,14 +121,6 @@ Input:
|
||||
Output:
|
||||
- retval = 0 or negative error number (-ENOENT if no such version for stabilize)
|
||||
|
||||
## BS_OP_SYNC_STAB_ALL
|
||||
|
||||
ONLY FOR TESTS! Sync and mark all unstable object versions as stable, at once.
|
||||
|
||||
Input: Nothing except opcode
|
||||
Output:
|
||||
- retval = 0 or negative error number (-EINVAL)
|
||||
|
||||
## BS_OP_LIST
|
||||
|
||||
Get a list of all objects in this Blockstore.
|
||||
@@ -144,10 +144,10 @@ Output:
|
||||
|
||||
*/
|
||||
|
||||
struct blockstore_op_t
|
||||
struct __attribute__ ((visibility("default"))) blockstore_op_t
|
||||
{
|
||||
// operation
|
||||
uint64_t opcode;
|
||||
uint64_t opcode = 0;
|
||||
// finish callback
|
||||
std::function<void (blockstore_op_t*)> callback;
|
||||
union __attribute__((__packed__))
|
||||
@@ -171,9 +171,9 @@ struct blockstore_op_t
|
||||
uint32_t list_stable_limit;
|
||||
};
|
||||
};
|
||||
void *buf;
|
||||
void *bitmap;
|
||||
int retval;
|
||||
uint8_t *buf = NULL;
|
||||
uint8_t *bitmap = NULL;
|
||||
int retval = 0;
|
||||
|
||||
uint8_t private_data[BS_OP_PRIVATE_DATA_SIZE];
|
||||
};
|
||||
@@ -182,11 +182,11 @@ typedef std::map<std::string, std::string> blockstore_config_t;
|
||||
|
||||
class blockstore_impl_t;
|
||||
|
||||
class blockstore_t
|
||||
class __attribute__((visibility("default"))) blockstore_t
|
||||
{
|
||||
blockstore_impl_t *impl;
|
||||
public:
|
||||
blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
|
||||
blockstore_t(blockstore_config_t & config, ring_loop_i *ringloop, timerfd_manager_t *tfd);
|
||||
~blockstore_t();
|
||||
|
||||
// Update configuration
|
||||
@@ -214,10 +214,7 @@ public:
|
||||
int read_bitmap(object_id oid, uint64_t target_version, void *bitmap, uint64_t *result_version = NULL);
|
||||
|
||||
// Get per-inode space usage statistics
|
||||
std::map<uint64_t, uint64_t> & get_inode_space_stats();
|
||||
|
||||
// Set per-pool no_inode_stats
|
||||
void set_no_inode_stats(const std::vector<uint64_t> & pool_ids);
|
||||
const std::map<uint64_t, uint64_t> & get_inode_space_stats();
|
||||
|
||||
// Print diagnostics to stdout
|
||||
void dump_diagnostics();
|
||||
|
@@ -2,11 +2,14 @@
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include <sys/file.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <stdexcept>
|
||||
|
||||
#include "blockstore_impl.h"
|
||||
#include "blockstore.h"
|
||||
#include "blockstore_disk.h"
|
||||
#include "blockstore_heap.h"
|
||||
#include "str_util.h"
|
||||
#include "allocator.h"
|
||||
|
||||
@@ -44,8 +47,11 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
|
||||
disk_alignment = parse_size(config["disk_alignment"]);
|
||||
journal_block_size = parse_size(config["journal_block_size"]);
|
||||
meta_block_size = parse_size(config["meta_block_size"]);
|
||||
meta_block_target_free_space = parse_size(config["meta_block_target_free_space"]);
|
||||
bitmap_granularity = parse_size(config["bitmap_granularity"]);
|
||||
meta_format = stoull_full(config["meta_format"]);
|
||||
atomic_write_size = (config.find("atomic_write_size") != config.end()
|
||||
? parse_size(config["atomic_write_size"]) : 4096);
|
||||
if (config.find("data_io") == config.end() &&
|
||||
config.find("meta_io") == config.end() &&
|
||||
config.find("journal_io") == config.end())
|
||||
@@ -90,12 +96,28 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
|
||||
if (!min_discard_size)
|
||||
min_discard_size = 1024*1024;
|
||||
discard_granularity = parse_size(config["discard_granularity"]);
|
||||
inmemory_meta = config["inmemory_metadata"] != "false" && config["inmemory_metadata"] != "0" &&
|
||||
config["inmemory_metadata"] != "no";
|
||||
inmemory_journal = config["inmemory_journal"] != "false" && config["inmemory_journal"] != "0" &&
|
||||
config["inmemory_journal"] != "no";
|
||||
disable_data_fsync = config["disable_data_fsync"] == "true" || config["disable_data_fsync"] == "1" || config["disable_data_fsync"] == "yes";
|
||||
disable_meta_fsync = config["disable_meta_fsync"] == "true" || config["disable_meta_fsync"] == "1" || config["disable_meta_fsync"] == "yes";
|
||||
disable_journal_fsync = config["disable_journal_fsync"] == "true" || config["disable_journal_fsync"] == "1" || config["disable_journal_fsync"] == "yes";
|
||||
if (mock_mode)
|
||||
{
|
||||
data_device_size = parse_size(config["data_device_size"]);
|
||||
data_device_sect = parse_size(config["data_device_sect"]);
|
||||
meta_device_size = parse_size(config["meta_device_size"]);
|
||||
meta_device_sect = parse_size(config["meta_device_sect"]);
|
||||
journal_device_size = parse_size(config["journal_device_size"]);
|
||||
journal_device_sect = parse_size(config["journal_device_sect"]);
|
||||
}
|
||||
// Validate
|
||||
if (!data_block_size)
|
||||
{
|
||||
data_block_size = (1 << DEFAULT_DATA_BLOCK_ORDER);
|
||||
}
|
||||
if ((block_order = is_power_of_two(data_block_size)) >= 64 || data_block_size < MIN_DATA_BLOCK_SIZE || data_block_size >= MAX_DATA_BLOCK_SIZE)
|
||||
if (is_power_of_two(data_block_size) >= 64 || data_block_size < MIN_DATA_BLOCK_SIZE || data_block_size >= MAX_DATA_BLOCK_SIZE)
|
||||
{
|
||||
throw std::runtime_error("Bad block size");
|
||||
}
|
||||
@@ -131,6 +153,14 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
|
||||
{
|
||||
throw std::runtime_error("meta_block_size must not exceed "+std::to_string(MAX_DATA_BLOCK_SIZE));
|
||||
}
|
||||
if (!meta_block_target_free_space)
|
||||
{
|
||||
meta_block_target_free_space = 800;
|
||||
}
|
||||
if (meta_block_target_free_space >= meta_block_size)
|
||||
{
|
||||
throw std::runtime_error("meta_block_target_free_space must not exceed "+std::to_string(meta_block_size));
|
||||
}
|
||||
if (data_offset % disk_alignment)
|
||||
{
|
||||
throw std::runtime_error("data_offset must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
|
||||
@@ -179,17 +209,29 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
|
||||
{
|
||||
throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size));
|
||||
}
|
||||
if (!meta_format)
|
||||
{
|
||||
meta_format = BLOCKSTORE_META_FORMAT_HEAP;
|
||||
}
|
||||
if (meta_device == data_device)
|
||||
{
|
||||
disable_meta_fsync = disable_data_fsync;
|
||||
}
|
||||
if (journal_device == meta_device)
|
||||
{
|
||||
disable_journal_fsync = disable_meta_fsync;
|
||||
}
|
||||
}
|
||||
|
||||
void blockstore_disk_t::calc_lengths(bool skip_meta_check)
|
||||
void blockstore_disk_t::calc_lengths()
|
||||
{
|
||||
// data
|
||||
data_len = data_device_size - data_offset;
|
||||
if (data_fd == meta_fd && data_offset < meta_offset)
|
||||
if (data_device == meta_device && data_offset < meta_offset)
|
||||
{
|
||||
data_len = meta_offset - data_offset;
|
||||
}
|
||||
if (data_fd == journal_fd && data_offset < journal_offset)
|
||||
if (data_device == journal_device && data_offset < journal_offset)
|
||||
{
|
||||
data_len = data_len < journal_offset-data_offset
|
||||
? data_len : journal_offset-data_offset;
|
||||
@@ -204,23 +246,23 @@ void blockstore_disk_t::calc_lengths(bool skip_meta_check)
|
||||
data_len = cfg_data_size;
|
||||
}
|
||||
// meta
|
||||
uint64_t meta_area_size = (meta_fd == data_fd ? data_device_size : meta_device_size) - meta_offset;
|
||||
if (meta_fd == data_fd && meta_offset <= data_offset)
|
||||
meta_area_size = (meta_device == data_device ? data_device_size : meta_device_size) - meta_offset;
|
||||
if (meta_device == data_device && meta_offset <= data_offset)
|
||||
{
|
||||
meta_area_size = data_offset - meta_offset;
|
||||
}
|
||||
if (meta_fd == journal_fd && meta_offset <= journal_offset)
|
||||
if (meta_device == journal_device && meta_offset <= journal_offset)
|
||||
{
|
||||
meta_area_size = meta_area_size < journal_offset-meta_offset
|
||||
? meta_area_size : journal_offset-meta_offset;
|
||||
}
|
||||
// journal
|
||||
journal_len = (journal_fd == data_fd ? data_device_size : (journal_fd == meta_fd ? meta_device_size : journal_device_size)) - journal_offset;
|
||||
if (journal_fd == data_fd && journal_offset <= data_offset)
|
||||
journal_len = (journal_device == data_device ? data_device_size : (journal_device == meta_device ? meta_device_size : journal_device_size)) - journal_offset;
|
||||
if (journal_device == data_device && journal_offset <= data_offset)
|
||||
{
|
||||
journal_len = data_offset - journal_offset;
|
||||
}
|
||||
if (journal_fd == meta_fd && journal_offset <= meta_offset)
|
||||
if (journal_device == meta_device && journal_offset <= meta_offset)
|
||||
{
|
||||
journal_len = journal_len < meta_offset-journal_offset
|
||||
? journal_len : meta_offset-journal_offset;
|
||||
@@ -230,37 +272,37 @@ void blockstore_disk_t::calc_lengths(bool skip_meta_check)
|
||||
clean_entry_bitmap_size = data_block_size / bitmap_granularity / 8;
|
||||
clean_dyn_size = clean_entry_bitmap_size*2 + (csum_block_size
|
||||
? data_block_size/csum_block_size*(data_csum_type & 0xFF) : 0);
|
||||
clean_entry_size = sizeof(clean_disk_entry) + clean_dyn_size + 4 /*entry_csum*/;
|
||||
meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
|
||||
bool new_doesnt_fit = (!meta_format && !skip_meta_check && meta_area_size < meta_len && !data_csum_type);
|
||||
if (meta_format == BLOCKSTORE_META_FORMAT_V1 || new_doesnt_fit)
|
||||
if (meta_format == BLOCKSTORE_META_FORMAT_HEAP)
|
||||
{
|
||||
uint64_t clean_entry_v0_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;
|
||||
uint64_t meta_v0_len = (1 + (block_count - 1 + meta_block_size / clean_entry_v0_size)
|
||||
/ (meta_block_size / clean_entry_v0_size)) * meta_block_size;
|
||||
if (meta_format == BLOCKSTORE_META_FORMAT_V1 || meta_area_size >= meta_v0_len)
|
||||
{
|
||||
// Old metadata fits.
|
||||
if (new_doesnt_fit)
|
||||
{
|
||||
printf("Warning: Using old metadata format without checksums because the new format"
|
||||
" doesn't fit into provided area (%ju bytes required, %ju bytes available)\n", meta_len, meta_area_size);
|
||||
}
|
||||
clean_entry_size = clean_entry_v0_size;
|
||||
meta_len = meta_v0_len;
|
||||
meta_format = BLOCKSTORE_META_FORMAT_V1;
|
||||
}
|
||||
else
|
||||
meta_format = BLOCKSTORE_META_FORMAT_V2;
|
||||
uint32_t entries_per_block = ((meta_block_size-meta_block_target_free_space) /
|
||||
(sizeof(heap_object_t) + sizeof(heap_write_t) + clean_dyn_size));
|
||||
min_meta_len = (block_count+entries_per_block-1) / entries_per_block * meta_block_size;
|
||||
}
|
||||
else if (meta_format == BLOCKSTORE_META_FORMAT_V1)
|
||||
{
|
||||
clean_entry_size = 24 /*sizeof(clean_disk_entry)*/ + 2*clean_entry_bitmap_size;
|
||||
min_meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size)
|
||||
/ (meta_block_size / clean_entry_size)) * meta_block_size;
|
||||
}
|
||||
else if (meta_format == BLOCKSTORE_META_FORMAT_V2)
|
||||
{
|
||||
clean_entry_size = 24 /*sizeof(clean_disk_entry)*/ + clean_dyn_size + 4 /*entry_csum*/;
|
||||
min_meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
|
||||
}
|
||||
else
|
||||
meta_format = BLOCKSTORE_META_FORMAT_V2;
|
||||
if (!skip_meta_check && meta_area_size < meta_len)
|
||||
{
|
||||
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes, have only "+std::to_string(meta_area_size)+" bytes");
|
||||
throw std::runtime_error("meta_format = "+std::to_string(meta_format)+" is not supported");
|
||||
}
|
||||
}
|
||||
|
||||
void blockstore_disk_t::check_lengths()
|
||||
{
|
||||
if (meta_area_size < min_meta_len)
|
||||
{
|
||||
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(min_meta_len)+" bytes, have only "+std::to_string(meta_area_size)+" bytes");
|
||||
}
|
||||
// requested journal size
|
||||
if (!skip_meta_check && cfg_journal_size > journal_len)
|
||||
if (cfg_journal_size > journal_len)
|
||||
{
|
||||
throw std::runtime_error("Requested journal_size is too large");
|
||||
}
|
||||
@@ -321,12 +363,19 @@ static int bs_openmode(const std::string & mode)
|
||||
|
||||
void blockstore_disk_t::open_data()
|
||||
{
|
||||
data_fd = open(data_device.c_str(), bs_openmode(data_io) | O_RDWR);
|
||||
if (data_fd >= 0)
|
||||
{
|
||||
throw std::runtime_error("data device is already opened");
|
||||
}
|
||||
data_fd = mock_mode ? MOCK_DATA_FD : open(data_device.c_str(), bs_openmode(data_io) | O_RDWR);
|
||||
if (data_fd == -1)
|
||||
{
|
||||
throw std::runtime_error("Failed to open data device "+data_device+": "+std::string(strerror(errno)));
|
||||
}
|
||||
check_size(data_fd, &data_device_size, &data_device_sect, "data device");
|
||||
if (!mock_mode)
|
||||
{
|
||||
check_size(data_fd, &data_device_size, &data_device_sect, "data device");
|
||||
}
|
||||
if (disk_alignment % data_device_sect)
|
||||
{
|
||||
throw std::runtime_error(
|
||||
@@ -338,7 +387,7 @@ void blockstore_disk_t::open_data()
|
||||
{
|
||||
throw std::runtime_error("data_offset exceeds device size = "+std::to_string(data_device_size));
|
||||
}
|
||||
if (!disable_flock && flock(data_fd, LOCK_EX|LOCK_NB) != 0)
|
||||
if (!mock_mode && !disable_flock && flock(data_fd, LOCK_EX|LOCK_NB) != 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("Failed to lock data device: ") + strerror(errno));
|
||||
}
|
||||
@@ -346,19 +395,26 @@ void blockstore_disk_t::open_data()
|
||||
|
||||
void blockstore_disk_t::open_meta()
|
||||
{
|
||||
if (meta_fd >= 0)
|
||||
{
|
||||
throw std::runtime_error("metadata device is already opened");
|
||||
}
|
||||
if (meta_device != data_device || meta_io != data_io)
|
||||
{
|
||||
meta_fd = open(meta_device.c_str(), bs_openmode(meta_io) | O_RDWR);
|
||||
meta_fd = mock_mode ? MOCK_META_FD : open(meta_device.c_str(), bs_openmode(meta_io) | O_RDWR);
|
||||
if (meta_fd == -1)
|
||||
{
|
||||
throw std::runtime_error("Failed to open metadata device "+meta_device+": "+std::string(strerror(errno)));
|
||||
}
|
||||
check_size(meta_fd, &meta_device_size, &meta_device_sect, "metadata device");
|
||||
if (!mock_mode)
|
||||
{
|
||||
check_size(meta_fd, &meta_device_size, &meta_device_sect, "metadata device");
|
||||
}
|
||||
if (meta_offset >= meta_device_size)
|
||||
{
|
||||
throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_device_size));
|
||||
}
|
||||
if (!disable_flock && meta_device != data_device && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
|
||||
if (!mock_mode && !disable_flock && meta_device != data_device && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno));
|
||||
}
|
||||
@@ -384,15 +440,26 @@ void blockstore_disk_t::open_meta()
|
||||
|
||||
void blockstore_disk_t::open_journal()
|
||||
{
|
||||
if (journal_fd >= 0)
|
||||
{
|
||||
throw std::runtime_error("journal device is already opened");
|
||||
}
|
||||
if (journal_device != meta_device || journal_io != meta_io)
|
||||
{
|
||||
journal_fd = open(journal_device.c_str(), bs_openmode(journal_io) | O_RDWR);
|
||||
journal_fd = mock_mode ? MOCK_JOURNAL_FD : open(journal_device.c_str(), bs_openmode(journal_io) | O_RDWR);
|
||||
if (journal_fd == -1)
|
||||
{
|
||||
throw std::runtime_error("Failed to open journal device "+journal_device+": "+std::string(strerror(errno)));
|
||||
}
|
||||
check_size(journal_fd, &journal_device_size, &journal_device_sect, "journal device");
|
||||
if (!disable_flock && journal_device != meta_device && flock(journal_fd, LOCK_EX|LOCK_NB) != 0)
|
||||
if (!mock_mode)
|
||||
{
|
||||
check_size(journal_fd, &journal_device_size, &journal_device_sect, "journal device");
|
||||
}
|
||||
if (journal_offset >= journal_device_size)
|
||||
{
|
||||
throw std::runtime_error("journal_offset exceeds device size = "+std::to_string(journal_device_size));
|
||||
}
|
||||
if (!mock_mode && !disable_flock && journal_device != meta_device && flock(journal_fd, LOCK_EX|LOCK_NB) != 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("Failed to lock journal device: ") + strerror(errno));
|
||||
}
|
||||
@@ -418,25 +485,32 @@ void blockstore_disk_t::open_journal()
|
||||
|
||||
void blockstore_disk_t::close_all()
|
||||
{
|
||||
if (data_fd >= 0)
|
||||
close(data_fd);
|
||||
if (meta_fd >= 0 && meta_fd != data_fd)
|
||||
close(meta_fd);
|
||||
if (journal_fd >= 0 && journal_fd != meta_fd)
|
||||
close(journal_fd);
|
||||
if (!mock_mode)
|
||||
{
|
||||
if (data_fd >= 0)
|
||||
close(data_fd);
|
||||
if (meta_fd >= 0 && meta_fd != data_fd)
|
||||
close(meta_fd);
|
||||
if (journal_fd >= 0 && journal_fd != meta_fd)
|
||||
close(journal_fd);
|
||||
}
|
||||
data_fd = meta_fd = journal_fd = -1;
|
||||
}
|
||||
|
||||
// Sadly DISCARD only works through ioctl(), but it seems to always block the device queue,
|
||||
// so it's not a big deal that we can only run it synchronously.
|
||||
int blockstore_disk_t::trim_data(allocator_t *alloc)
|
||||
int blockstore_disk_t::trim_data(std::function<bool(uint64_t)> is_free)
|
||||
{
|
||||
if (mock_mode)
|
||||
{
|
||||
return -EINVAL;
|
||||
}
|
||||
int r = 0;
|
||||
uint64_t j = 0, i = 0;
|
||||
uint64_t discarded = 0;
|
||||
for (; i <= block_count; i++)
|
||||
{
|
||||
if (i >= block_count || alloc->get(i))
|
||||
if (i >= block_count || is_free(i))
|
||||
{
|
||||
if (i > j && (i-j)*data_block_size >= min_discard_size)
|
||||
{
|
||||
|
@@ -12,6 +12,10 @@
|
||||
// Lower byte of checksum type is its length
|
||||
#define BLOCKSTORE_CSUM_CRC32C 0x104
|
||||
|
||||
#define MOCK_DATA_FD 1000
|
||||
#define MOCK_META_FD 1001
|
||||
#define MOCK_JOURNAL_FD 1002
|
||||
|
||||
class allocator_t;
|
||||
|
||||
struct blockstore_disk_t
|
||||
@@ -22,11 +26,15 @@ struct blockstore_disk_t
|
||||
// Required write alignment and journal/metadata/data areas' location alignment
|
||||
uint32_t disk_alignment = 4096;
|
||||
// Journal block size - minimum_io_size of the journal device is the best choice
|
||||
uint64_t journal_block_size = 4096;
|
||||
uint32_t journal_block_size = 4096;
|
||||
// Metadata block size - minimum_io_size of the metadata device is the best choice
|
||||
uint64_t meta_block_size = 4096;
|
||||
uint32_t meta_block_size = 4096;
|
||||
// Atomic write size of the data block device
|
||||
uint32_t atomic_write_size = 4096;
|
||||
// Target free space in metadata blocks
|
||||
uint32_t meta_block_target_free_space = 800;
|
||||
// Sparse write tracking granularity. 4 KB is a good choice. Must be a multiple of disk_alignment
|
||||
uint64_t bitmap_granularity = 4096;
|
||||
uint32_t bitmap_granularity = 4096;
|
||||
// Data checksum type, BLOCKSTORE_CSUM_NONE or BLOCKSTORE_CSUM_CRC32C
|
||||
uint32_t data_csum_type = BLOCKSTORE_CSUM_NONE;
|
||||
// Checksum block size, must be a multiple of bitmap_granularity
|
||||
@@ -36,27 +44,36 @@ struct blockstore_disk_t
|
||||
// I/O modes for data, metadata and journal: direct or "" = O_DIRECT, cached = O_SYNC, directsync = O_DIRECT|O_SYNC
|
||||
// O_SYNC without O_DIRECT = use Linux page cache for reads and writes
|
||||
std::string data_io, meta_io, journal_io;
|
||||
// It is safe to disable fsync() if drive write cache is writethrough
|
||||
bool disable_data_fsync = false, disable_meta_fsync = false, disable_journal_fsync = false;
|
||||
// Keep journal (buffered data) in memory?
|
||||
bool inmemory_meta = true;
|
||||
// Keep metadata in memory?
|
||||
bool inmemory_journal = true;
|
||||
// Data discard granularity and minimum size (for the sake of performance)
|
||||
bool discard_on_start = false;
|
||||
uint64_t min_discard_size = 1024*1024;
|
||||
uint64_t discard_granularity = 0;
|
||||
|
||||
int meta_fd = -1, data_fd = -1, journal_fd = -1;
|
||||
uint64_t meta_offset, meta_device_sect, meta_device_size, meta_len, meta_format = 0;
|
||||
uint64_t meta_offset, meta_device_sect, meta_device_size, meta_area_size, min_meta_len, meta_format = 0;
|
||||
uint64_t data_offset, data_device_sect, data_device_size, data_len;
|
||||
uint64_t journal_offset, journal_device_sect, journal_device_size, journal_len;
|
||||
|
||||
uint32_t block_order = 0;
|
||||
uint64_t block_count = 0;
|
||||
uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0, clean_dyn_size = 0;
|
||||
uint32_t clean_entry_bitmap_size = 0;
|
||||
uint32_t clean_entry_size = 0, clean_dyn_size = 0; // for meta_v1/2
|
||||
|
||||
bool mock_mode = false;
|
||||
|
||||
void parse_config(std::map<std::string, std::string> & config);
|
||||
void open_data();
|
||||
void open_meta();
|
||||
void open_journal();
|
||||
void calc_lengths(bool skip_meta_check = false);
|
||||
void calc_lengths();
|
||||
void check_lengths();
|
||||
void close_all();
|
||||
int trim_data(allocator_t *alloc);
|
||||
int trim_data(std::function<bool(uint64_t)> is_free);
|
||||
|
||||
inline uint64_t dirty_dyn_size(uint64_t offset, uint64_t len)
|
||||
{
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -1,22 +1,20 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#define COPY_BUF_JOURNAL 1
|
||||
#define COPY_BUF_DATA 2
|
||||
#define COPY_BUF_ZERO 4
|
||||
#define COPY_BUF_CSUM_FILL 8
|
||||
#define COPY_BUF_COALESCED 16
|
||||
#define COPY_BUF_META_BLOCK 32
|
||||
#define COPY_BUF_JOURNALED_BIG 64
|
||||
#define COPY_BUF_JOURNAL 0x01
|
||||
#define COPY_BUF_DATA 0x02
|
||||
#define COPY_BUF_ZERO 0x04
|
||||
#define COPY_BUF_CSUM_FILL 0x08
|
||||
#define COPY_BUF_COALESCED 0x10
|
||||
#define COPY_BUF_PADDED 0x20
|
||||
#define COPY_BUF_SKIP_CSUM 0x40
|
||||
|
||||
struct copy_buffer_t
|
||||
{
|
||||
int copy_flags;
|
||||
uint64_t offset, len, disk_offset;
|
||||
uint64_t journal_sector; // only for reads: sector+1 if used and !journal.inmemory, otherwise 0
|
||||
void *buf;
|
||||
uint8_t *csum_buf;
|
||||
int *dyn_data;
|
||||
uint32_t copy_flags;
|
||||
uint64_t offset, len, disk_loc, disk_offset, disk_len;
|
||||
uint8_t *buf;
|
||||
uint64_t wr_lsn;
|
||||
};
|
||||
|
||||
struct meta_sector_t
|
||||
@@ -27,13 +25,6 @@ struct meta_sector_t
|
||||
int usage_count;
|
||||
};
|
||||
|
||||
struct flusher_sync_t
|
||||
{
|
||||
bool fsync_meta;
|
||||
int ready_count;
|
||||
int state;
|
||||
};
|
||||
|
||||
struct flusher_meta_write_t
|
||||
{
|
||||
uint64_t sector, pos;
|
||||
@@ -49,93 +40,75 @@ class journal_flusher_co
|
||||
{
|
||||
blockstore_impl_t *bs;
|
||||
journal_flusher_t *flusher;
|
||||
int wait_state, wait_count, wait_journal_count;
|
||||
int co_id;
|
||||
int wait_state, wait_count;
|
||||
struct io_uring_sqe *sqe;
|
||||
struct ring_data_t *data;
|
||||
|
||||
std::list<flusher_sync_t>::iterator cur_sync;
|
||||
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
|
||||
|
||||
obj_ver_id cur;
|
||||
std::map<obj_ver_id, dirty_entry>::iterator dirty_it, dirty_start, dirty_end;
|
||||
std::map<object_id, uint64_t>::iterator repeat_it;
|
||||
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_rj, simple_callback_w;
|
||||
object_id cur_oid;
|
||||
uint64_t copy_id;
|
||||
uint64_t compact_lsn;
|
||||
uint64_t cur_version;
|
||||
heap_object_t *cur_obj;
|
||||
heap_write_t *begin_wr, *end_wr;
|
||||
uint32_t modified_block;
|
||||
bool should_repeat;
|
||||
|
||||
bool try_trim = false;
|
||||
bool skip_copy, has_delete, has_writes;
|
||||
std::vector<copy_buffer_t> v;
|
||||
std::vector<copy_buffer_t>::iterator it;
|
||||
int i;
|
||||
bool fill_incomplete, cleared_incomplete;
|
||||
int read_to_fill_incomplete;
|
||||
std::vector<copy_buffer_t> read_vec;
|
||||
uint32_t overwrite_start, overwrite_end;
|
||||
uint32_t big_start, big_end;
|
||||
int i, res;
|
||||
bool read_to_fill_incomplete;
|
||||
int copy_count;
|
||||
uint64_t clean_loc, clean_ver, old_clean_loc, old_clean_ver;
|
||||
uint64_t clean_loc;
|
||||
flusher_meta_write_t meta_old, meta_new;
|
||||
bool clean_init_bitmap;
|
||||
uint64_t clean_bitmap_offset, clean_bitmap_len;
|
||||
uint8_t *clean_init_dyn_ptr;
|
||||
uint8_t *new_clean_bitmap;
|
||||
|
||||
uint64_t new_trim_pos;
|
||||
bool do_repeat = false;
|
||||
|
||||
friend class journal_flusher_t;
|
||||
void scan_dirty();
|
||||
bool read_dirty(int wait_base);
|
||||
bool modify_meta_do_reads(int wait_base);
|
||||
bool wait_meta_reads(int wait_base);
|
||||
bool modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base);
|
||||
bool clear_incomplete_csum_block_bits(int wait_base);
|
||||
void calc_block_checksums(uint32_t *new_data_csums, bool skip_overwrites);
|
||||
void update_metadata_entry();
|
||||
bool write_meta_block(flusher_meta_write_t & meta_block, int wait_base);
|
||||
void update_clean_db();
|
||||
void free_data_blocks();
|
||||
bool fsync_batch(bool fsync_meta, int wait_base);
|
||||
bool trim_journal(int wait_base);
|
||||
|
||||
void iterate_checksum_holes(std::function<void(int & pos, uint32_t hole_start, uint32_t hole_end)> cb);
|
||||
void fill_partial_checksum_blocks();
|
||||
void free_buffers();
|
||||
int check_and_punch_checksums();
|
||||
bool calc_block_checksums();
|
||||
bool write_meta_block(int wait_base);
|
||||
bool read_buffered(int wait_base);
|
||||
bool fsync_meta(int wait_base);
|
||||
int fsync_buffer(int wait_base);
|
||||
bool trim_lsn(int wait_base);
|
||||
public:
|
||||
journal_flusher_co();
|
||||
~journal_flusher_co();
|
||||
bool loop();
|
||||
};
|
||||
|
||||
// Journal flusher itself
|
||||
class journal_flusher_t
|
||||
{
|
||||
int trim_wanted = 0;
|
||||
bool dequeuing;
|
||||
int min_flusher_count, max_flusher_count, cur_flusher_count, target_flusher_count;
|
||||
int flusher_start_threshold;
|
||||
int force_start = 0;
|
||||
int min_flusher_count = 0, max_flusher_count = 0, cur_flusher_count = 0, target_flusher_count = 0;
|
||||
journal_flusher_co *co;
|
||||
blockstore_impl_t *bs;
|
||||
friend class journal_flusher_co;
|
||||
|
||||
int journal_trim_counter;
|
||||
bool trimming;
|
||||
void* journal_superblock;
|
||||
int advance_lsn_counter = 0;
|
||||
uint64_t compact_counter = 0;
|
||||
|
||||
int active_flushers;
|
||||
int syncing_flushers;
|
||||
std::list<flusher_sync_t> syncs;
|
||||
std::map<object_id, uint64_t> sync_to_repeat;
|
||||
|
||||
std::map<uint64_t, meta_sector_t> meta_sectors;
|
||||
std::deque<object_id> flush_queue;
|
||||
std::map<object_id, uint64_t> flush_versions; // FIXME: consider unordered_map?
|
||||
|
||||
bool try_find_older(std::map<obj_ver_id, dirty_entry>::iterator & dirty_end, obj_ver_id & cur);
|
||||
bool try_find_other(std::map<obj_ver_id, dirty_entry>::iterator & dirty_end, obj_ver_id & cur);
|
||||
int active_flushers = 0;
|
||||
int wanting_meta_fsync = 0;
|
||||
bool fsyncing_meta = false;
|
||||
int syncing_buffer = 0;
|
||||
|
||||
public:
|
||||
journal_flusher_t(blockstore_impl_t *bs);
|
||||
~journal_flusher_t();
|
||||
void loop();
|
||||
bool is_trim_wanted() { return trim_wanted; }
|
||||
int get_syncing_buffer();
|
||||
uint64_t get_compact_counter();
|
||||
bool is_active();
|
||||
void mark_trim_possible();
|
||||
void request_trim();
|
||||
void release_trim();
|
||||
void enqueue_flush(obj_ver_id oid);
|
||||
void unshift_flush(obj_ver_id oid, bool force);
|
||||
void remove_flush(object_id oid);
|
||||
void dump_diagnostics();
|
||||
bool is_mutated(uint64_t clean_loc);
|
||||
};
|
||||
|
2341
src/blockstore/blockstore_heap.cpp
Normal file
2341
src/blockstore/blockstore_heap.cpp
Normal file
File diff suppressed because it is too large
Load Diff
344
src/blockstore/blockstore_heap.h
Normal file
344
src/blockstore/blockstore_heap.h
Normal file
@@ -0,0 +1,344 @@
|
||||
// Metadata storage version 3 ("heap")
|
||||
// Copyright (c) Vitaliy Filippov, 2025+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#pragma once
|
||||
|
||||
#define EMH_EXT
|
||||
|
||||
#include <map>
|
||||
#include <unordered_map>
|
||||
#include <set>
|
||||
#include <deque>
|
||||
#include <vector>
|
||||
|
||||
#include "../client/object_id.h"
|
||||
#include "../../emhash/hash_table7.hpp"
|
||||
#include "../util/wyhash.h"
|
||||
#include "blockstore_disk.h"
|
||||
#include "multilist.h"
|
||||
|
||||
struct pool_shard_settings_t
|
||||
{
|
||||
uint32_t pg_count;
|
||||
uint32_t pg_stripe_size;
|
||||
};
|
||||
|
||||
#define BS_HEAP_TYPE 7
|
||||
#define BS_HEAP_SMALL_WRITE 1
|
||||
#define BS_HEAP_BIG_WRITE 2
|
||||
#define BS_HEAP_TOMBSTONE 3
|
||||
#define BS_HEAP_INTENT_WRITE 4
|
||||
#define BS_HEAP_STABLE 8
|
||||
|
||||
class blockstore_heap_t;
|
||||
|
||||
struct __attribute__((__packed__)) heap_write_t
|
||||
{
|
||||
// size should have top bit cleared
|
||||
uint16_t size = 0;
|
||||
int16_t next_pos = 0;
|
||||
uint64_t lsn = 0;
|
||||
uint64_t version = 0;
|
||||
uint32_t offset = 0;
|
||||
uint32_t len = 0;
|
||||
uint64_t location = 0;
|
||||
uint8_t flags = 0; // 1|2|3 = small|big|tombstone, 4|0 = stable|unstable
|
||||
|
||||
// uint8_t[] external_bitmap
|
||||
// uint8_t[] internal_bitmap
|
||||
// uint32_t[] checksums
|
||||
|
||||
heap_write_t *next();
|
||||
inline uint8_t type() const { return (flags & BS_HEAP_TYPE); }
|
||||
uint32_t get_size(blockstore_heap_t *heap);
|
||||
uint32_t get_csum_size(blockstore_heap_t *heap);
|
||||
bool needs_recheck(blockstore_heap_t *heap);
|
||||
bool needs_compact(blockstore_heap_t *heap);
|
||||
bool is_compacted(uint64_t compacted_lsn);
|
||||
bool can_be_collapsed(blockstore_heap_t *heap);
|
||||
bool is_allowed_before_compacted(uint64_t compacted_lsn, bool is_last_entry);
|
||||
uint8_t *get_ext_bitmap(blockstore_heap_t *heap);
|
||||
uint8_t *get_int_bitmap(blockstore_heap_t *heap);
|
||||
uint8_t *get_checksums(blockstore_heap_t *heap);
|
||||
uint32_t *get_checksum(blockstore_heap_t *heap);
|
||||
};
|
||||
|
||||
struct __attribute__((__packed__)) heap_object_t
|
||||
{
|
||||
// size should have top bit cleared
|
||||
uint16_t size = 0;
|
||||
// linked list of write entries...
|
||||
// newest entries are stored first to simplify scanning
|
||||
int16_t write_pos = 0;
|
||||
uint32_t crc32c = 0;
|
||||
uint64_t inode = 0;
|
||||
uint64_t stripe = 0;
|
||||
|
||||
heap_write_t *get_writes();
|
||||
uint32_t calc_crc32c();
|
||||
};
|
||||
|
||||
struct heap_object_lsn_t
|
||||
{
|
||||
object_id oid;
|
||||
uint64_t lsn;
|
||||
};
|
||||
|
||||
inline bool operator < (const heap_object_lsn_t & a, const heap_object_lsn_t & b)
|
||||
{
|
||||
return a.oid < b.oid || a.oid == b.oid && a.lsn < b.lsn;
|
||||
}
|
||||
|
||||
struct tmp_compact_item_t
|
||||
{
|
||||
object_id oid;
|
||||
uint64_t lsn;
|
||||
bool compact;
|
||||
};
|
||||
|
||||
struct heap_mvcc_copy_id_t
|
||||
{
|
||||
object_id oid;
|
||||
uint64_t copy_id;
|
||||
};
|
||||
|
||||
inline bool operator == (const heap_mvcc_copy_id_t & a, const heap_mvcc_copy_id_t & b)
|
||||
{
|
||||
return a.oid.inode == b.oid.inode && a.oid.stripe == b.oid.stripe && a.copy_id == b.copy_id;
|
||||
}
|
||||
|
||||
namespace std
|
||||
{
|
||||
template<> struct hash<heap_mvcc_copy_id_t>
|
||||
{
|
||||
inline size_t operator()(const heap_mvcc_copy_id_t &s) const
|
||||
{
|
||||
size_t seed = std::hash<object_id>()(s.oid);
|
||||
// Copy-pasted from spp::hash_combine()
|
||||
seed ^= (s.copy_id + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
struct heap_object_mvcc_t
|
||||
{
|
||||
uint32_t readers = 0;
|
||||
heap_object_t *entry_copy = NULL;
|
||||
};
|
||||
|
||||
struct __attribute__((__packed__)) heap_block_info_t
|
||||
{
|
||||
uint32_t used_space = 0;
|
||||
uint32_t free_pos = 0;
|
||||
uint8_t *data = NULL;
|
||||
};
|
||||
|
||||
struct heap_inflight_lsn_t
|
||||
{
|
||||
object_id oid;
|
||||
uint64_t flags;
|
||||
};
|
||||
|
||||
struct heap_refqi_t
|
||||
{
|
||||
uint64_t lsn;
|
||||
uint64_t inode;
|
||||
uint64_t location;
|
||||
uint32_t len;
|
||||
bool is_data;
|
||||
};
|
||||
|
||||
using i64hash_t = wyhash::hash<uint64_t>;
|
||||
using heap_block_index_t = emhash7::HashMap<uint64_t, emhash7::HashMap<inode_t, emhash7::HashMap<uint64_t, uint64_t, i64hash_t>, i64hash_t>, i64hash_t>;
|
||||
using heap_mvcc_map_t = emhash7::HashMap<heap_mvcc_copy_id_t, heap_object_mvcc_t>;
|
||||
|
||||
class blockstore_heap_t
|
||||
{
|
||||
friend class heap_write_t;
|
||||
friend class heap_object_t;
|
||||
|
||||
blockstore_disk_t *dsk = NULL;
|
||||
uint8_t* buffer_area = NULL;
|
||||
bool abort_on_corruption = false;
|
||||
bool abort_on_overlap = true;
|
||||
int log_level = 0;
|
||||
|
||||
const uint32_t meta_block_count = 0;
|
||||
uint32_t target_block_free_space = 800;
|
||||
|
||||
uint64_t next_lsn = 0;
|
||||
emhash7::HashMap<pool_id_t, pool_shard_settings_t> pool_shard_settings;
|
||||
// PG => inode => stripe => block number
|
||||
heap_block_index_t block_index;
|
||||
std::vector<heap_block_info_t> block_info;
|
||||
allocator_t *data_alloc = NULL;
|
||||
multilist_index_t *meta_alloc = NULL;
|
||||
uint32_t meta_alloc_count = 0;
|
||||
uint64_t meta_used_space = 0;
|
||||
multilist_alloc_t *buffer_alloc = NULL;
|
||||
heap_mvcc_map_t object_mvcc;
|
||||
std::unordered_map<uint64_t, uint32_t> mvcc_data_refs;
|
||||
std::unordered_map<uint64_t, uint32_t> mvcc_buffer_refs;
|
||||
std::map<uint64_t, uint64_t> inode_space_stats;
|
||||
uint64_t buffer_area_used_space = 0;
|
||||
uint64_t data_used_space = 0;
|
||||
|
||||
// LSN queue: inflight (writing) -> completed [-> fsynced] -> compactable -> compacted [-> fsynced] -> trimmed and removed
|
||||
std::deque<heap_inflight_lsn_t> inflight_lsn;
|
||||
uint32_t to_compact_count = 0;
|
||||
uint64_t first_inflight_lsn = 0;
|
||||
uint64_t completed_lsn = 0;
|
||||
uint64_t fsynced_lsn = 0;
|
||||
uint64_t compacted_lsn = 0;
|
||||
uint64_t next_compact_lsn = 0;
|
||||
std::deque<heap_refqi_t> overwrite_ref_queue;
|
||||
|
||||
std::vector<tmp_compact_item_t> tmp_compact_queue;
|
||||
std::deque<object_id> recheck_queue;
|
||||
int recheck_in_progress = 0;
|
||||
bool in_recheck = false;
|
||||
std::function<void(bool is_data, uint64_t offset, uint64_t len, uint8_t* buf, std::function<void()>)> recheck_cb;
|
||||
int recheck_queue_depth = 0;
|
||||
|
||||
const uint32_t max_write_entry_size;
|
||||
|
||||
uint64_t get_pg_id(inode_t inode, uint64_t stripe);
|
||||
void defragment_block(uint32_t block_num);
|
||||
uint32_t find_block_run(heap_block_info_t & block, uint32_t space);
|
||||
uint32_t find_block_space(uint32_t block_num, uint32_t space);
|
||||
uint32_t block_has_compactable(uint8_t *data);
|
||||
uint32_t compact_object_to(heap_object_t *obj, uint64_t lsn, uint8_t *new_csums, bool do_free);
|
||||
void copy_full_object(uint8_t *dst, heap_object_t *obj);
|
||||
bool mvcc_save_copy(heap_object_t *obj);
|
||||
bool mvcc_check_tracking(object_id oid);
|
||||
void free_mvcc(heap_mvcc_map_t::iterator mvcc_it);
|
||||
void allocate_block(heap_block_info_t & inf);
|
||||
int allocate_new_object(object_id oid, uint32_t full_object_size, uint32_t *modified_block, heap_object_t **new_obj);
|
||||
int add_object(object_id oid, heap_write_t *wr, uint32_t *modified_block);
|
||||
void mark_overwritten(uint64_t over_lsn, uint64_t inode, heap_write_t *wr, heap_write_t *end_wr, bool tracking_active);
|
||||
int update_object(uint32_t block_num, heap_object_t *obj, heap_write_t *wr, uint32_t *modified_block, uint32_t *moved_from_block);
|
||||
void init_erase(uint32_t block_num, heap_object_t *obj);
|
||||
void erase_object(uint32_t block_num, heap_object_t *obj, uint64_t lsn, bool tracking_active);
|
||||
void reindex_block(uint32_t block_num, heap_object_t *from_obj);
|
||||
void erase_block_index(inode_t inode, uint64_t stripe);
|
||||
void deref_data(uint64_t inode, uint64_t location, bool free_at_0);
|
||||
void deref_buffer(uint64_t inode, uint64_t location, uint32_t len, bool free_at_0);
|
||||
void deref_overwrites(uint64_t lsn);
|
||||
void free_object_space(inode_t inode, heap_write_t *from, heap_write_t *to, int mode = 0);
|
||||
void add_used_space(uint32_t block_num, int32_t used_delta);
|
||||
void push_inflight_lsn(object_id oid, uint64_t lsn, uint64_t flags);
|
||||
|
||||
public:
|
||||
blockstore_heap_t(blockstore_disk_t *dsk, uint8_t *buffer_area, int log_level = 0);
|
||||
~blockstore_heap_t();
|
||||
// set initially compacted lsn - should be done before loading
|
||||
void set_compacted_lsn(uint64_t compacted_lsn);
|
||||
uint64_t get_compacted_lsn();
|
||||
// load data from the disk, returns count of loaded write entries
|
||||
void read_blocks(uint64_t disk_offset, uint64_t size, uint8_t *buf,
|
||||
std::function<void(heap_object_t*)> handle_object, std::function<void(uint32_t, uint32_t, uint8_t*)> handle_block);
|
||||
uint64_t load_blocks(uint64_t disk_offset, uint64_t size, uint8_t *buf);
|
||||
// finish loading
|
||||
void finish_load();
|
||||
// recheck small write data after reading the database from disk
|
||||
bool recheck_small_writes(std::function<void(bool is_data, uint64_t offset, uint64_t len, uint8_t* buf, std::function<void()>)> read_buffer, int queue_depth);
|
||||
// initialize metadata area (fill it with empty data)
|
||||
// returns 0 when done, EAGAIN when the caller has to wait more
|
||||
int initialize();
|
||||
// read from the metadata area
|
||||
// returns 0 when done, EAGAIN when the caller has to wait more
|
||||
int read();
|
||||
// reshard database according to the pool's PG count
|
||||
void reshard(pool_id_t pool, uint32_t pg_count, uint32_t pg_stripe_size);
|
||||
// read an object entry and lock it against removal
|
||||
// in the future, may become asynchronous
|
||||
heap_object_t *lock_and_read_entry(object_id oid, uint64_t & copy_id);
|
||||
// re-read a locked object entry with the given lsn (pointer may be invalidated)
|
||||
heap_object_t *read_locked_entry(object_id oid, uint64_t copy_id);
|
||||
// read an object entry without locking it
|
||||
heap_object_t *read_entry(object_id oid, uint32_t *block_num_ptr, bool for_update = false);
|
||||
// unlock an entry
|
||||
bool unlock_entry(object_id oid, uint64_t copy_id);
|
||||
// set or verify checksums in a write request
|
||||
bool calc_checksums(heap_write_t *wr, uint8_t *data, bool set);
|
||||
// set or verify raw block checksums
|
||||
bool calc_block_checksums(uint32_t *block_csums, uint8_t *data, uint8_t *bitmap, uint32_t start, uint32_t end,
|
||||
bool set, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
|
||||
bool calc_block_checksums(uint32_t *block_csums, uint8_t *bitmap,
|
||||
uint32_t start, uint32_t end, std::function<uint8_t*(uint32_t start, uint32_t & len)> next,
|
||||
bool set, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
|
||||
// copy an object as is
|
||||
int copy_object(heap_object_t *obj, uint32_t *modified_block);
|
||||
// auto-compacts the object, then adds a write entry to it and to the compaction queue
|
||||
// return 0 if OK, or maybe ENOSPC
|
||||
int post_write(object_id oid, heap_write_t *wr, uint32_t *modified_block, uint32_t *moved_from_block);
|
||||
int post_write(uint32_t & block_num, object_id oid, heap_object_t *obj, heap_write_t *wr, uint32_t *moved_from_block);
|
||||
// stabilize an unstable object version
|
||||
// return 0 if OK, ENOENT if not exists
|
||||
int post_stabilize(object_id oid, uint64_t version, uint32_t *modified_block, uint64_t *new_lsn, uint64_t *new_to_lsn);
|
||||
// rollback an unstable object version
|
||||
// return 0 if OK, ENOENT if not exists, EBUSY if already stable
|
||||
int post_rollback(object_id oid, uint64_t version, uint64_t *new_lsn, uint32_t *modified_block);
|
||||
// forget an object
|
||||
// return error code
|
||||
int post_delete(object_id oid, uint64_t *new_lsn, uint32_t *modified_block);
|
||||
int post_delete(uint32_t block_num, heap_object_t *obj, uint64_t *new_lsn);
|
||||
// get the next object to compact
|
||||
// guaranteed to return objects in min lsn order
|
||||
// returns 0 if OK, ENOENT if nothing to compact
|
||||
int get_next_compact(object_id & oid);
|
||||
// get the range of an object eligible for compaction
|
||||
void get_compact_range(heap_object_t *obj, uint64_t max_lsn, heap_write_t **begin_wr, heap_write_t **end_wr);
|
||||
// mark an object as compacted up to the given lsn
|
||||
int compact_object(object_id oid, uint64_t lsn, uint8_t *new_csums);
|
||||
// retrieve object listing from a PG
|
||||
int list_objects(uint32_t pg_num, object_id min_oid, object_id max_oid,
|
||||
obj_ver_id **result_list, size_t *stable_count, size_t *unstable_count);
|
||||
// set a block number for a new object and returns error status: 0, EAGAIN or ENOSPC
|
||||
int get_block_for_new_object(uint32_t & out_block_num, uint32_t size = 0);
|
||||
|
||||
// inflight write tracking
|
||||
void mark_lsn_completed(uint64_t lsn);
|
||||
void mark_lsn_fsynced(uint64_t lsn);
|
||||
void mark_lsn_compacted(uint64_t lsn, bool allow_undone = false);
|
||||
void mark_object_compacted(heap_object_t *obj, uint64_t max_lsn);
|
||||
void mark_lsn_trimmed(uint64_t lsn);
|
||||
uint64_t get_completed_lsn();
|
||||
uint64_t get_fsynced_lsn();
|
||||
|
||||
// data device block allocator functions
|
||||
uint64_t find_free_data();
|
||||
bool is_data_used(uint64_t location);
|
||||
void use_data(inode_t inode, uint64_t location);
|
||||
void free_data(inode_t inode, uint64_t location);
|
||||
|
||||
// buffer device allocator functions
|
||||
uint64_t find_free_buffer_area(uint64_t size);
|
||||
bool is_buffer_area_free(uint64_t location, uint64_t size);
|
||||
void use_buffer_area(inode_t inode, uint64_t location, uint64_t size);
|
||||
void free_buffer_area(inode_t inode, uint64_t location, uint64_t size);
|
||||
uint64_t get_buffer_area_used_space();
|
||||
|
||||
// get metadata block data buffer and used space
|
||||
uint8_t *get_meta_block(uint32_t block_num);
|
||||
uint32_t get_meta_block_used_space(uint32_t block_num);
|
||||
|
||||
// get space usage statistics
|
||||
uint64_t get_data_used_space();
|
||||
const std::map<uint64_t, uint64_t> & get_inode_space_stats();
|
||||
uint64_t get_meta_total_space();
|
||||
uint64_t get_meta_used_space();
|
||||
uint32_t get_meta_nearfull_blocks();
|
||||
uint32_t get_inflight_queue_size();
|
||||
uint32_t get_compact_queue_size();
|
||||
uint32_t get_to_compact_count();
|
||||
|
||||
// get maximum size for a temporary heap_write_t buffer
|
||||
uint32_t get_max_write_entry_size();
|
||||
|
||||
// only for tests
|
||||
void set_abort_on_corruption(bool fail);
|
||||
void set_abort_on_overlap(bool fail);
|
||||
};
|
@@ -1,13 +1,17 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "blockstore_impl.h"
|
||||
#include <stdexcept>
|
||||
|
||||
blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd)
|
||||
#include "blockstore_impl.h"
|
||||
#include "crc32c.h"
|
||||
|
||||
blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_i *ringloop, timerfd_manager_t *tfd, bool mock_mode)
|
||||
{
|
||||
assert(sizeof(blockstore_op_private_t) <= BS_OP_PRIVATE_DATA_SIZE);
|
||||
this->tfd = tfd;
|
||||
this->ringloop = ringloop;
|
||||
dsk.mock_mode = mock_mode;
|
||||
ring_consumer.loop = [this]() { loop(); };
|
||||
ringloop->register_consumer(&ring_consumer);
|
||||
initialized = 0;
|
||||
@@ -17,31 +21,43 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
|
||||
dsk.open_data();
|
||||
dsk.open_meta();
|
||||
dsk.open_journal();
|
||||
calc_lengths();
|
||||
alloc_dyn_data = dsk.clean_dyn_size > sizeof(void*) || dsk.csum_block_size > 0;
|
||||
dsk.calc_lengths();
|
||||
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.data_block_size);
|
||||
data_alloc = new allocator_t(dsk.block_count);
|
||||
}
|
||||
catch (std::exception & e)
|
||||
{
|
||||
dsk.close_all();
|
||||
throw;
|
||||
}
|
||||
memset(zero_object, 0, dsk.data_block_size);
|
||||
meta_superblock = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.meta_block_size);
|
||||
memset(meta_superblock, 0, dsk.meta_block_size);
|
||||
}
|
||||
|
||||
void blockstore_impl_t::init()
|
||||
{
|
||||
flusher = new journal_flusher_t(this);
|
||||
if (dsk.inmemory_journal)
|
||||
{
|
||||
buffer_area = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.journal_len);
|
||||
}
|
||||
heap = new blockstore_heap_t(&dsk, buffer_area, log_level);
|
||||
}
|
||||
|
||||
blockstore_impl_t::~blockstore_impl_t()
|
||||
{
|
||||
delete data_alloc;
|
||||
delete flusher;
|
||||
if (flusher)
|
||||
delete flusher;
|
||||
if (heap)
|
||||
delete heap;
|
||||
if (buffer_area)
|
||||
free(buffer_area);
|
||||
if (meta_superblock)
|
||||
free(meta_superblock);
|
||||
if (zero_object)
|
||||
free(zero_object);
|
||||
ringloop->unregister_consumer(&ring_consumer);
|
||||
dsk.close_all();
|
||||
if (metadata_buffer)
|
||||
free(metadata_buffer);
|
||||
if (clean_bitmaps)
|
||||
free(clean_bitmaps);
|
||||
}
|
||||
|
||||
bool blockstore_impl_t::is_started()
|
||||
@@ -57,10 +73,9 @@ bool blockstore_impl_t::is_stalled()
|
||||
// main event loop - produce requests
|
||||
void blockstore_impl_t::loop()
|
||||
{
|
||||
// FIXME: initialized == 10 is ugly
|
||||
if (initialized != 10)
|
||||
{
|
||||
// read metadata, then journal
|
||||
// read metadata
|
||||
if (initialized == 0)
|
||||
{
|
||||
metadata_init_reader = new blockstore_init_meta(this);
|
||||
@@ -73,69 +88,41 @@ void blockstore_impl_t::loop()
|
||||
{
|
||||
delete metadata_init_reader;
|
||||
metadata_init_reader = NULL;
|
||||
journal_init_reader = new blockstore_init_journal(this);
|
||||
initialized = 2;
|
||||
}
|
||||
}
|
||||
if (initialized == 2)
|
||||
{
|
||||
int res = journal_init_reader->loop();
|
||||
if (!res)
|
||||
{
|
||||
delete journal_init_reader;
|
||||
journal_init_reader = NULL;
|
||||
initialized = 3;
|
||||
ringloop->wakeup();
|
||||
}
|
||||
}
|
||||
if (initialized == 3)
|
||||
{
|
||||
if (!readonly && dsk.discard_on_start)
|
||||
dsk.trim_data(data_alloc);
|
||||
if (journal.flush_journal)
|
||||
initialized = 4;
|
||||
else
|
||||
initialized = 10;
|
||||
}
|
||||
if (initialized == 4)
|
||||
{
|
||||
if (readonly)
|
||||
{
|
||||
printf("Can't flush the journal in readonly mode\n");
|
||||
exit(1);
|
||||
dsk.trim_data([this](uint64_t block_num){ return heap->is_data_used(block_num * dsk.data_block_size); });
|
||||
}
|
||||
flusher->loop();
|
||||
ringloop->submit();
|
||||
initialized = 10;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// try to submit ops
|
||||
unsigned initial_ring_space = ringloop->space_left();
|
||||
// has_writes == 0 - no writes before the current queue item
|
||||
// has_writes == 1 - some writes in progress
|
||||
// has_writes == 2 - tried to submit some writes, but failed
|
||||
int has_writes = 0, op_idx = 0, new_idx = 0;
|
||||
int op_idx = 0, new_idx = 0;
|
||||
bool has_unfinished_writes = false;
|
||||
for (; op_idx < submit_queue.size(); op_idx++, new_idx++)
|
||||
{
|
||||
auto op = submit_queue[op_idx];
|
||||
submit_queue[new_idx] = op;
|
||||
// FIXME: This needs some simplification
|
||||
// Writes should not block reads if the ring is not full and reads don't depend on them
|
||||
// In all other cases we should stop submission
|
||||
if (PRIV(op)->wait_for)
|
||||
{
|
||||
check_wait(op);
|
||||
if (PRIV(op)->wait_for == WAIT_SQE)
|
||||
{
|
||||
// ring is full, stop submission
|
||||
break;
|
||||
}
|
||||
else if (PRIV(op)->wait_for)
|
||||
{
|
||||
if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE)
|
||||
{
|
||||
has_writes = 2;
|
||||
}
|
||||
has_unfinished_writes = has_unfinished_writes || op->opcode == BS_OP_WRITE ||
|
||||
op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE ||
|
||||
op->opcode == BS_OP_STABLE || op->opcode == BS_OP_ROLLBACK;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -148,46 +135,33 @@ void blockstore_impl_t::loop()
|
||||
{
|
||||
wr_st = dequeue_read(op);
|
||||
}
|
||||
else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE)
|
||||
else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE)
|
||||
{
|
||||
if (has_writes == 2)
|
||||
{
|
||||
// Some writes already could not be submitted
|
||||
continue;
|
||||
}
|
||||
wr_st = dequeue_write(op);
|
||||
has_writes = wr_st > 0 ? 1 : 2;
|
||||
}
|
||||
else if (op->opcode == BS_OP_DELETE)
|
||||
{
|
||||
if (has_writes == 2)
|
||||
{
|
||||
// Some writes already could not be submitted
|
||||
continue;
|
||||
}
|
||||
wr_st = dequeue_del(op);
|
||||
has_writes = wr_st > 0 ? 1 : 2;
|
||||
has_unfinished_writes = has_unfinished_writes || (wr_st != 2);
|
||||
}
|
||||
else if (op->opcode == BS_OP_SYNC)
|
||||
{
|
||||
// sync only completed writes?
|
||||
// wait for the data device fsync to complete, then submit journal writes for big writes
|
||||
// then submit an fsync operation
|
||||
// syncs only completed writes, so doesn't have to be blocked by anything
|
||||
wr_st = continue_sync(op);
|
||||
}
|
||||
else if (op->opcode == BS_OP_STABLE)
|
||||
else if (op->opcode == BS_OP_STABLE || op->opcode == BS_OP_ROLLBACK)
|
||||
{
|
||||
wr_st = dequeue_stable(op);
|
||||
}
|
||||
else if (op->opcode == BS_OP_ROLLBACK)
|
||||
{
|
||||
wr_st = dequeue_rollback(op);
|
||||
has_unfinished_writes = has_unfinished_writes || (wr_st != 2);
|
||||
}
|
||||
else if (op->opcode == BS_OP_LIST)
|
||||
{
|
||||
// LIST doesn't have to be blocked by previous modifications
|
||||
process_list(op);
|
||||
wr_st = 2;
|
||||
// LIST has to be blocked by previous writes and commits/rollbacks
|
||||
if (!has_unfinished_writes)
|
||||
{
|
||||
process_list(op);
|
||||
wr_st = 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
wr_st = 0;
|
||||
}
|
||||
}
|
||||
if (wr_st == 2)
|
||||
{
|
||||
@@ -196,16 +170,13 @@ void blockstore_impl_t::loop()
|
||||
}
|
||||
if (wr_st == 0)
|
||||
{
|
||||
PRIV(op)->pending_ops = 0;
|
||||
ringloop->restore(prev_sqe_pos);
|
||||
if (PRIV(op)->wait_for == WAIT_SQE)
|
||||
{
|
||||
// ring is full, stop submission
|
||||
break;
|
||||
}
|
||||
else if (PRIV(op)->wait_for == WAIT_JOURNAL)
|
||||
{
|
||||
PRIV(op)->wait_detail2 = (unstable_writes.size()+unstable_unsynced);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (op_idx != new_idx)
|
||||
@@ -225,12 +196,6 @@ void blockstore_impl_t::loop()
|
||||
{
|
||||
throw std::runtime_error(std::string("io_uring_submit: ") + strerror(-ret));
|
||||
}
|
||||
for (auto s: journal.submitting_sectors)
|
||||
{
|
||||
// Mark journal sector writes as submitted
|
||||
journal.sector_info[s].submit_id = 0;
|
||||
}
|
||||
journal.submitting_sectors.clear();
|
||||
if ((initial_ring_space - ringloop->space_left()) > 0)
|
||||
{
|
||||
live = true;
|
||||
@@ -248,7 +213,7 @@ bool blockstore_impl_t::is_safe_to_stop()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
if (unsynced_big_writes.size() > 0 || unsynced_small_writes.size() > 0)
|
||||
if (unsynced_big_write_count > 0 || unsynced_small_write_count > 0)
|
||||
{
|
||||
if (!readonly && !stop_sync_submitted)
|
||||
{
|
||||
@@ -272,7 +237,7 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
|
||||
{
|
||||
if (PRIV(op)->wait_for == WAIT_SQE)
|
||||
{
|
||||
if (ringloop->sqes_left() < PRIV(op)->wait_detail)
|
||||
if (ringloop->space_left() < PRIV(op)->wait_detail)
|
||||
{
|
||||
// stop submission if there's still no free space
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
@@ -282,40 +247,13 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
|
||||
}
|
||||
PRIV(op)->wait_for = 0;
|
||||
}
|
||||
else if (PRIV(op)->wait_for == WAIT_JOURNAL)
|
||||
else if (PRIV(op)->wait_for == WAIT_COMPACTION)
|
||||
{
|
||||
if (journal.used_start == PRIV(op)->wait_detail &&
|
||||
(unstable_writes.size()+unstable_unsynced) == PRIV(op)->wait_detail2)
|
||||
if (flusher->get_compact_counter() <= PRIV(op)->wait_detail)
|
||||
{
|
||||
// do not submit
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Still waiting to flush journal offset %08jx\n", PRIV(op)->wait_detail);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
flusher->release_trim();
|
||||
PRIV(op)->wait_for = 0;
|
||||
}
|
||||
else if (PRIV(op)->wait_for == WAIT_JOURNAL_BUFFER)
|
||||
{
|
||||
int next = ((journal.cur_sector + 1) % journal.sector_count);
|
||||
if (journal.sector_info[next].flush_count > 0 ||
|
||||
journal.sector_info[next].dirty)
|
||||
{
|
||||
// do not submit
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Still waiting for a journal buffer\n");
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
PRIV(op)->wait_for = 0;
|
||||
}
|
||||
else if (PRIV(op)->wait_for == WAIT_FREE)
|
||||
{
|
||||
if (!data_alloc->get_free_count() && big_to_flush > 0)
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Still waiting for free space on the data device\n");
|
||||
printf("Still waiting for more flushes\n");
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
@@ -343,44 +281,6 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
|
||||
ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
|
||||
return;
|
||||
}
|
||||
if (op->opcode == BS_OP_SYNC_STAB_ALL)
|
||||
{
|
||||
std::function<void(blockstore_op_t*)> *old_callback = new std::function<void(blockstore_op_t*)>(op->callback);
|
||||
op->opcode = BS_OP_SYNC;
|
||||
op->callback = [this, old_callback](blockstore_op_t *op)
|
||||
{
|
||||
if (op->retval >= 0 && unstable_writes.size() > 0)
|
||||
{
|
||||
op->opcode = BS_OP_STABLE;
|
||||
op->len = unstable_writes.size();
|
||||
obj_ver_id *vers = new obj_ver_id[op->len];
|
||||
op->buf = vers;
|
||||
int i = 0;
|
||||
for (auto it = unstable_writes.begin(); it != unstable_writes.end(); it++, i++)
|
||||
{
|
||||
vers[i] = {
|
||||
.oid = it->first,
|
||||
.version = it->second,
|
||||
};
|
||||
}
|
||||
unstable_writes.clear();
|
||||
op->callback = [old_callback](blockstore_op_t *op)
|
||||
{
|
||||
obj_ver_id *vers = (obj_ver_id*)op->buf;
|
||||
delete[] vers;
|
||||
op->buf = NULL;
|
||||
(*old_callback)(op);
|
||||
delete old_callback;
|
||||
};
|
||||
this->enqueue_op(op);
|
||||
}
|
||||
else
|
||||
{
|
||||
(*old_callback)(op);
|
||||
delete old_callback;
|
||||
}
|
||||
};
|
||||
}
|
||||
if ((op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE) && !enqueue_write(op))
|
||||
{
|
||||
ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
|
||||
@@ -399,75 +299,11 @@ void blockstore_impl_t::init_op(blockstore_op_t *op)
|
||||
{
|
||||
// Call constructor without allocating memory. We'll call destructor before returning op back
|
||||
new ((void*)op->private_data) blockstore_op_private_t;
|
||||
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
|
||||
PRIV(op)->wait_for = 0;
|
||||
PRIV(op)->op_state = 0;
|
||||
PRIV(op)->pending_ops = 0;
|
||||
}
|
||||
|
||||
static bool replace_stable(object_id oid, uint64_t version, int search_start, int search_end, obj_ver_id* list)
|
||||
{
|
||||
while (search_start < search_end)
|
||||
{
|
||||
int pos = search_start+(search_end-search_start)/2;
|
||||
if (oid < list[pos].oid)
|
||||
{
|
||||
search_end = pos;
|
||||
}
|
||||
else if (list[pos].oid < oid)
|
||||
{
|
||||
search_start = pos+1;
|
||||
}
|
||||
else
|
||||
{
|
||||
list[pos].version = version;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
blockstore_clean_db_t& blockstore_impl_t::clean_db_shard(object_id oid)
|
||||
{
|
||||
uint64_t pg_num = 0;
|
||||
uint64_t pool_id = (oid.inode >> (64-POOL_ID_BITS));
|
||||
auto sh_it = clean_db_settings.find(pool_id);
|
||||
if (sh_it != clean_db_settings.end())
|
||||
{
|
||||
// like map_to_pg()
|
||||
pg_num = (oid.stripe / sh_it->second.pg_stripe_size) % sh_it->second.pg_count + 1;
|
||||
}
|
||||
return clean_db_shards[(pool_id << (64-POOL_ID_BITS)) | pg_num];
|
||||
}
|
||||
|
||||
void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint32_t pg_stripe_size)
|
||||
{
|
||||
uint64_t pool_id = (uint64_t)pool;
|
||||
std::map<pool_pg_id_t, blockstore_clean_db_t> new_shards;
|
||||
auto sh_it = clean_db_shards.lower_bound((pool_id << (64-POOL_ID_BITS)));
|
||||
while (sh_it != clean_db_shards.end() &&
|
||||
(sh_it->first >> (64-POOL_ID_BITS)) == pool_id)
|
||||
{
|
||||
for (auto & pair: sh_it->second)
|
||||
{
|
||||
// like map_to_pg()
|
||||
uint64_t pg_num = (pair.first.stripe / pg_stripe_size) % pg_count + 1;
|
||||
uint64_t shard_id = (pool_id << (64-POOL_ID_BITS)) | pg_num;
|
||||
new_shards[shard_id][pair.first] = pair.second;
|
||||
}
|
||||
clean_db_shards.erase(sh_it++);
|
||||
}
|
||||
for (sh_it = new_shards.begin(); sh_it != new_shards.end(); sh_it++)
|
||||
{
|
||||
auto & to = clean_db_shards[sh_it->first];
|
||||
to.swap(sh_it->second);
|
||||
}
|
||||
clean_db_settings[pool_id] = (pool_shard_settings_t){
|
||||
.pg_count = pg_count,
|
||||
.pg_stripe_size = pg_stripe_size,
|
||||
};
|
||||
}
|
||||
|
||||
void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
{
|
||||
uint32_t list_pg = op->pg_number+1;
|
||||
@@ -476,7 +312,8 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
uint64_t min_inode = op->min_oid.inode;
|
||||
uint64_t max_inode = op->max_oid.inode;
|
||||
// Check PG
|
||||
if (pg_count != 0 && (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count))
|
||||
if (!pg_count || (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count) ||
|
||||
!INODE_POOL(min_inode) || INODE_POOL(min_inode) != INODE_POOL(max_inode))
|
||||
{
|
||||
op->retval = -EINVAL;
|
||||
FINISH_OP(op);
|
||||
@@ -484,250 +321,40 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
}
|
||||
// Check if the DB needs resharding
|
||||
// (we don't know about PGs from the beginning, we only create "shards" here)
|
||||
uint64_t first_shard = 0, last_shard = UINT64_MAX;
|
||||
if (min_inode != 0 &&
|
||||
// Check if min_inode == max_inode == pool_id<<N, i.e. this is a pool listing
|
||||
(min_inode >> (64-POOL_ID_BITS)) == (max_inode >> (64-POOL_ID_BITS)))
|
||||
heap->reshard(INODE_POOL(min_inode), pg_count, pg_stripe_size);
|
||||
obj_ver_id *result = NULL;
|
||||
size_t stable_count = 0, unstable_count = 0;
|
||||
int res = heap->list_objects(list_pg, op->min_oid, op->max_oid, &result, &stable_count, &unstable_count);
|
||||
if (op->list_stable_limit)
|
||||
{
|
||||
pool_id_t pool_id = (min_inode >> (64-POOL_ID_BITS));
|
||||
if (pg_count > 1)
|
||||
// Ordered result is expected - used by scrub
|
||||
// We use an unordered map
|
||||
std::sort(result, result + stable_count);
|
||||
if (stable_count > op->list_stable_limit)
|
||||
{
|
||||
// Per-pg listing
|
||||
auto sh_it = clean_db_settings.find(pool_id);
|
||||
if (sh_it == clean_db_settings.end() ||
|
||||
sh_it->second.pg_count != pg_count ||
|
||||
sh_it->second.pg_stripe_size != pg_stripe_size)
|
||||
{
|
||||
reshard_clean_db(pool_id, pg_count, pg_stripe_size);
|
||||
}
|
||||
first_shard = last_shard = ((uint64_t)pool_id << (64-POOL_ID_BITS)) | list_pg;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Per-pool listing
|
||||
first_shard = ((uint64_t)pool_id << (64-POOL_ID_BITS));
|
||||
last_shard = ((uint64_t)(pool_id+1) << (64-POOL_ID_BITS)) - 1;
|
||||
memmove(result + op->list_stable_limit, result + stable_count, unstable_count);
|
||||
stable_count = op->list_stable_limit;
|
||||
}
|
||||
}
|
||||
// Copy clean_db entries
|
||||
int stable_count = 0, stable_alloc = 0;
|
||||
if (min_inode != max_inode)
|
||||
{
|
||||
for (auto shard_it = clean_db_shards.lower_bound(first_shard);
|
||||
shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
|
||||
shard_it++)
|
||||
{
|
||||
auto & clean_db = shard_it->second;
|
||||
stable_alloc += clean_db.size();
|
||||
}
|
||||
}
|
||||
if (op->list_stable_limit > 0)
|
||||
{
|
||||
stable_alloc = op->list_stable_limit;
|
||||
if (stable_alloc > 1024*1024)
|
||||
stable_alloc = 1024*1024;
|
||||
}
|
||||
if (stable_alloc < 32768)
|
||||
{
|
||||
stable_alloc = 32768;
|
||||
}
|
||||
obj_ver_id *stable = (obj_ver_id*)malloc(sizeof(obj_ver_id) * stable_alloc);
|
||||
if (!stable)
|
||||
{
|
||||
op->retval = -ENOMEM;
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
auto max_oid = op->max_oid;
|
||||
bool limited = false;
|
||||
pool_pg_id_t last_shard_id = 0;
|
||||
for (auto shard_it = clean_db_shards.lower_bound(first_shard);
|
||||
shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
|
||||
shard_it++)
|
||||
{
|
||||
auto & clean_db = shard_it->second;
|
||||
auto clean_it = clean_db.begin(), clean_end = clean_db.end();
|
||||
if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
|
||||
{
|
||||
clean_it = clean_db.lower_bound(op->min_oid);
|
||||
}
|
||||
if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
|
||||
{
|
||||
clean_end = clean_db.upper_bound(max_oid);
|
||||
}
|
||||
for (; clean_it != clean_end; clean_it++)
|
||||
{
|
||||
if (stable_count >= stable_alloc)
|
||||
{
|
||||
stable_alloc *= 2;
|
||||
obj_ver_id* nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
|
||||
if (!nst)
|
||||
{
|
||||
op->retval = -ENOMEM;
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
stable = nst;
|
||||
}
|
||||
stable[stable_count++] = {
|
||||
.oid = clean_it->first,
|
||||
.version = clean_it->second.version,
|
||||
};
|
||||
if (op->list_stable_limit > 0 && stable_count >= op->list_stable_limit)
|
||||
{
|
||||
if (!limited)
|
||||
{
|
||||
limited = true;
|
||||
max_oid = stable[stable_count-1].oid;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (op->list_stable_limit > 0)
|
||||
{
|
||||
// To maintain the order, we have to include objects in the same range from other shards
|
||||
if (last_shard_id != 0 && last_shard_id != shard_it->first)
|
||||
std::sort(stable, stable+stable_count);
|
||||
if (stable_count > op->list_stable_limit)
|
||||
stable_count = op->list_stable_limit;
|
||||
}
|
||||
last_shard_id = shard_it->first;
|
||||
}
|
||||
if (op->list_stable_limit == 0 && first_shard != last_shard)
|
||||
{
|
||||
// If that's not a per-PG listing, sort clean entries (already sorted if list_stable_limit != 0)
|
||||
std::sort(stable, stable+stable_count);
|
||||
}
|
||||
int clean_stable_count = stable_count;
|
||||
// Copy dirty_db entries (sorted, too)
|
||||
int unstable_count = 0, unstable_alloc = 0;
|
||||
obj_ver_id *unstable = NULL;
|
||||
{
|
||||
auto dirty_it = dirty_db.begin(), dirty_end = dirty_db.end();
|
||||
if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
|
||||
{
|
||||
dirty_it = dirty_db.lower_bound({
|
||||
.oid = op->min_oid,
|
||||
.version = 0,
|
||||
});
|
||||
}
|
||||
if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
|
||||
{
|
||||
dirty_end = dirty_db.upper_bound({
|
||||
.oid = max_oid,
|
||||
.version = UINT64_MAX,
|
||||
});
|
||||
}
|
||||
for (; dirty_it != dirty_end; dirty_it++)
|
||||
{
|
||||
if (!pg_count || ((dirty_it->first.oid.stripe / pg_stripe_size) % pg_count + 1) == list_pg) // like map_to_pg()
|
||||
{
|
||||
if (IS_DELETE(dirty_it->second.state))
|
||||
{
|
||||
// Deletions are always stable, so try to zero out two possible entries
|
||||
if (!replace_stable(dirty_it->first.oid, 0, 0, clean_stable_count, stable))
|
||||
{
|
||||
replace_stable(dirty_it->first.oid, 0, clean_stable_count, stable_count, stable);
|
||||
}
|
||||
}
|
||||
else if (IS_STABLE(dirty_it->second.state) || (dirty_it->second.state & BS_ST_INSTANT))
|
||||
{
|
||||
// First try to replace a clean stable version in the first part of the list
|
||||
if (!replace_stable(dirty_it->first.oid, dirty_it->first.version, 0, clean_stable_count, stable))
|
||||
{
|
||||
// Then try to replace the last dirty stable version in the second part of the list
|
||||
if (stable_count > 0 && stable[stable_count-1].oid == dirty_it->first.oid)
|
||||
{
|
||||
stable[stable_count-1].version = dirty_it->first.version;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (stable_count >= stable_alloc)
|
||||
{
|
||||
stable_alloc += 32768;
|
||||
obj_ver_id *nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
|
||||
if (!nst)
|
||||
{
|
||||
if (unstable)
|
||||
free(unstable);
|
||||
op->retval = -ENOMEM;
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
stable = nst;
|
||||
}
|
||||
stable[stable_count++] = dirty_it->first;
|
||||
}
|
||||
}
|
||||
if (op->list_stable_limit > 0 && stable_count >= op->list_stable_limit)
|
||||
{
|
||||
// Stop here
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (unstable_count >= unstable_alloc)
|
||||
{
|
||||
unstable_alloc += 32768;
|
||||
obj_ver_id *nst = (obj_ver_id*)realloc(unstable, sizeof(obj_ver_id) * unstable_alloc);
|
||||
if (!nst)
|
||||
{
|
||||
if (stable)
|
||||
free(stable);
|
||||
op->retval = -ENOMEM;
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
unstable = nst;
|
||||
}
|
||||
unstable[unstable_count++] = dirty_it->first;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Remove zeroed out stable entries
|
||||
int j = 0;
|
||||
for (int i = 0; i < stable_count; i++)
|
||||
{
|
||||
if (stable[i].version != 0)
|
||||
{
|
||||
stable[j++] = stable[i];
|
||||
}
|
||||
}
|
||||
stable_count = j;
|
||||
if (stable_count+unstable_count > stable_alloc)
|
||||
{
|
||||
stable_alloc = stable_count+unstable_count;
|
||||
obj_ver_id *nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
|
||||
if (!nst)
|
||||
{
|
||||
if (unstable)
|
||||
free(unstable);
|
||||
op->retval = -ENOMEM;
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
stable = nst;
|
||||
}
|
||||
// Copy unstable entries
|
||||
for (int i = 0; i < unstable_count; i++)
|
||||
{
|
||||
stable[j++] = unstable[i];
|
||||
}
|
||||
free(unstable);
|
||||
op->version = stable_count;
|
||||
op->retval = stable_count+unstable_count;
|
||||
op->buf = stable;
|
||||
op->retval = res == 0 ? stable_count+unstable_count : -res;
|
||||
op->buf = (uint8_t*)result;
|
||||
FINISH_OP(op);
|
||||
}
|
||||
|
||||
void blockstore_impl_t::dump_diagnostics()
|
||||
{
|
||||
journal.dump_diagnostics();
|
||||
flusher->dump_diagnostics();
|
||||
}
|
||||
|
||||
void blockstore_meta_header_v3_t::set_crc32c()
|
||||
{
|
||||
header_csum = 0;
|
||||
uint32_t calc = crc32c(0, this, version == BLOCKSTORE_META_FORMAT_HEAP
|
||||
? sizeof(blockstore_meta_header_v3_t) : sizeof(blockstore_meta_header_v2_t));
|
||||
header_csum = calc;
|
||||
}
|
||||
|
||||
void blockstore_impl_t::disk_error_abort(const char *op, int retval, int expected)
|
||||
{
|
||||
if (retval == -EAGAIN)
|
||||
@@ -741,85 +368,7 @@ void blockstore_impl_t::disk_error_abort(const char *op, int retval, int expecte
|
||||
exit(1);
|
||||
}
|
||||
|
||||
void blockstore_impl_t::set_no_inode_stats(const std::vector<uint64_t> & pool_ids)
|
||||
uint64_t blockstore_impl_t::get_free_block_count()
|
||||
{
|
||||
for (auto & np: no_inode_stats)
|
||||
{
|
||||
np.second = 2;
|
||||
}
|
||||
for (auto pool_id: pool_ids)
|
||||
{
|
||||
if (!no_inode_stats[pool_id])
|
||||
recalc_inode_space_stats(pool_id, false);
|
||||
no_inode_stats[pool_id] = 1;
|
||||
}
|
||||
for (auto np_it = no_inode_stats.begin(); np_it != no_inode_stats.end(); )
|
||||
{
|
||||
if (np_it->second == 2)
|
||||
{
|
||||
recalc_inode_space_stats(np_it->first, true);
|
||||
no_inode_stats.erase(np_it++);
|
||||
}
|
||||
else
|
||||
np_it++;
|
||||
}
|
||||
}
|
||||
|
||||
void blockstore_impl_t::recalc_inode_space_stats(uint64_t pool_id, bool per_inode)
|
||||
{
|
||||
auto sp_begin = inode_space_stats.lower_bound((pool_id << (64-POOL_ID_BITS)));
|
||||
auto sp_end = inode_space_stats.lower_bound(((pool_id+1) << (64-POOL_ID_BITS)));
|
||||
inode_space_stats.erase(sp_begin, sp_end);
|
||||
auto sh_it = clean_db_shards.lower_bound((pool_id << (64-POOL_ID_BITS)));
|
||||
while (sh_it != clean_db_shards.end() &&
|
||||
(sh_it->first >> (64-POOL_ID_BITS)) == pool_id)
|
||||
{
|
||||
for (auto & pair: sh_it->second)
|
||||
{
|
||||
uint64_t space_id = per_inode ? pair.first.inode : (pool_id << (64-POOL_ID_BITS));
|
||||
inode_space_stats[space_id] += dsk.data_block_size;
|
||||
}
|
||||
sh_it++;
|
||||
}
|
||||
object_id last_oid = {};
|
||||
bool last_exists = false;
|
||||
auto dirty_it = dirty_db.lower_bound((obj_ver_id){ .oid = { .inode = (pool_id << (64-POOL_ID_BITS)) } });
|
||||
while (dirty_it != dirty_db.end() && (dirty_it->first.oid.inode >> (64-POOL_ID_BITS)) == pool_id)
|
||||
{
|
||||
if (IS_STABLE(dirty_it->second.state) && (IS_BIG_WRITE(dirty_it->second.state) || IS_DELETE(dirty_it->second.state)))
|
||||
{
|
||||
bool exists = false;
|
||||
if (last_oid == dirty_it->first.oid)
|
||||
{
|
||||
exists = last_exists;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto & clean_db = clean_db_shard(dirty_it->first.oid);
|
||||
auto clean_it = clean_db.find(dirty_it->first.oid);
|
||||
exists = clean_it != clean_db.end();
|
||||
}
|
||||
uint64_t space_id = per_inode ? dirty_it->first.oid.inode : (pool_id << (64-POOL_ID_BITS));
|
||||
if (IS_BIG_WRITE(dirty_it->second.state))
|
||||
{
|
||||
if (!exists)
|
||||
inode_space_stats[space_id] += dsk.data_block_size;
|
||||
last_exists = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (exists)
|
||||
{
|
||||
auto & sp = inode_space_stats[space_id];
|
||||
if (sp > dsk.data_block_size)
|
||||
sp -= dsk.data_block_size;
|
||||
else
|
||||
inode_space_stats.erase(space_id);
|
||||
}
|
||||
last_exists = false;
|
||||
}
|
||||
last_oid = dirty_it->first.oid;
|
||||
}
|
||||
dirty_it++;
|
||||
}
|
||||
return dsk.block_count - heap->get_data_used_space()/dsk.data_block_size;
|
||||
}
|
||||
|
@@ -5,6 +5,7 @@
|
||||
|
||||
#include "blockstore.h"
|
||||
#include "blockstore_disk.h"
|
||||
#include "blockstore_heap.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/ioctl.h>
|
||||
@@ -19,46 +20,18 @@
|
||||
#include <deque>
|
||||
#include <new>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "cpp-btree/btree_map.h"
|
||||
#include <unordered_set>
|
||||
|
||||
#include "malloc_or_die.h"
|
||||
#include "allocator.h"
|
||||
|
||||
//#define BLOCKSTORE_DEBUG
|
||||
|
||||
// States are not stored on disk. Instead, they're deduced from the journal
|
||||
|
||||
#define BS_ST_SMALL_WRITE 0x01
|
||||
#define BS_ST_BIG_WRITE 0x02
|
||||
#define BS_ST_DELETE 0x03
|
||||
|
||||
#define BS_ST_WAIT_DEL 0x10
|
||||
#define BS_ST_WAIT_BIG 0x20
|
||||
#define BS_ST_IN_FLIGHT 0x30
|
||||
#define BS_ST_SUBMITTED 0x40
|
||||
#define BS_ST_WRITTEN 0x50
|
||||
#define BS_ST_SYNCED 0x60
|
||||
#define BS_ST_STABLE 0x70
|
||||
|
||||
#define BS_ST_INSTANT 0x100
|
||||
|
||||
#define IMMEDIATE_NONE 0
|
||||
#define IMMEDIATE_SMALL 1
|
||||
#define IMMEDIATE_ALL 2
|
||||
|
||||
#define BS_ST_TYPE_MASK 0x0F
|
||||
#define BS_ST_WORKFLOW_MASK 0xF0
|
||||
#define IS_IN_FLIGHT(st) (((st) & 0xF0) <= BS_ST_SUBMITTED)
|
||||
#define IS_STABLE(st) (((st) & 0xF0) == BS_ST_STABLE)
|
||||
#define IS_SYNCED(st) (((st) & 0xF0) >= BS_ST_SYNCED)
|
||||
#define IS_JOURNAL(st) (((st) & 0x0F) == BS_ST_SMALL_WRITE)
|
||||
#define IS_BIG_WRITE(st) (((st) & 0x0F) == BS_ST_BIG_WRITE)
|
||||
#define IS_DELETE(st) (((st) & 0x0F) == BS_ST_DELETE)
|
||||
#define IS_INSTANT(st) (((st) & BS_ST_TYPE_MASK) == BS_ST_DELETE || ((st) & BS_ST_INSTANT))
|
||||
|
||||
#define BS_SUBMIT_CHECK_SQES(n) \
|
||||
if (ringloop->sqes_left() < (n))\
|
||||
if (ringloop->space_left() < (n))\
|
||||
{\
|
||||
/* Pause until there are more requests available */\
|
||||
PRIV(op)->wait_detail = (n);\
|
||||
@@ -90,13 +63,6 @@
|
||||
return 0;\
|
||||
}
|
||||
|
||||
#include "blockstore_journal.h"
|
||||
|
||||
// "VITAstor"
|
||||
#define BLOCKSTORE_META_MAGIC_V1 0x726F747341544956l
|
||||
#define BLOCKSTORE_META_FORMAT_V1 1
|
||||
#define BLOCKSTORE_META_FORMAT_V2 2
|
||||
|
||||
// metadata header (superblock)
|
||||
struct __attribute__((__packed__)) blockstore_meta_header_v1_t
|
||||
{
|
||||
@@ -121,75 +87,26 @@ struct __attribute__((__packed__)) blockstore_meta_header_v2_t
|
||||
uint32_t header_csum;
|
||||
};
|
||||
|
||||
// 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default)
|
||||
// per "clean" entry on disk with fixed metadata tables
|
||||
struct __attribute__((__packed__)) clean_disk_entry
|
||||
struct __attribute__((__packed__)) blockstore_meta_header_v3_t
|
||||
{
|
||||
object_id oid;
|
||||
uint64_t zero;
|
||||
uint64_t magic;
|
||||
uint64_t version;
|
||||
uint8_t bitmap[];
|
||||
// Two more fields come after bitmap in metadata version 2:
|
||||
// uint32_t data_csum[];
|
||||
// uint32_t entry_csum;
|
||||
};
|
||||
uint32_t meta_block_size;
|
||||
uint32_t data_block_size;
|
||||
uint32_t bitmap_granularity;
|
||||
uint32_t data_csum_type;
|
||||
uint32_t csum_block_size;
|
||||
uint32_t header_csum;
|
||||
uint64_t compacted_lsn;
|
||||
|
||||
// 32 = 16 + 16 bytes per "clean" entry in memory (object_id => clean_entry)
|
||||
struct __attribute__((__packed__)) clean_entry
|
||||
{
|
||||
uint64_t version;
|
||||
uint64_t location;
|
||||
void set_crc32c();
|
||||
};
|
||||
|
||||
// 64 = 24 + 40 bytes per dirty entry in memory (obj_ver_id => dirty_entry). Plus checksums
|
||||
struct __attribute__((__packed__)) dirty_entry
|
||||
{
|
||||
uint32_t state;
|
||||
uint32_t flags; // unneeded, but present for alignment
|
||||
uint64_t location; // location in either journal or data -> in BYTES
|
||||
uint32_t offset; // data offset within object (stripe)
|
||||
uint32_t len; // data length
|
||||
uint64_t journal_sector; // journal sector used for this entry
|
||||
void* dyn_data; // dynamic data: external bitmap and data block checksums. may be a pointer to the in-memory journal
|
||||
};
|
||||
|
||||
// - Sync must be submitted after previous writes/deletes (not before!)
|
||||
// - Reads to the same object must be submitted after previous writes/deletes
|
||||
// are written (not necessarily synced) in their location. This is because we
|
||||
// rely on read-modify-write for erasure coding and we must return new data
|
||||
// to calculate parity for subsequent writes
|
||||
// - Writes may be submitted in any order, because they don't overlap. Each write
|
||||
// goes into a new location - either on the journal device or on the data device
|
||||
// - Stable (stabilize) must be submitted after sync of that object is completed
|
||||
// It's even OK to return an error to the caller if that object is not synced yet
|
||||
// - Journal trim may be processed only after all versions are moved to
|
||||
// the main storage AND after all read operations for older versions complete
|
||||
// - If an operation can not be submitted because the ring is full
|
||||
// we should stop submission of other operations. Otherwise some "scatter" reads
|
||||
// may end up blocked for a long time.
|
||||
// Otherwise, the submit order is free, that is all operations may be submitted immediately
|
||||
// In fact, adding a write operation must immediately result in dirty_db being populated
|
||||
|
||||
// Suspend operation until there are more free SQEs
|
||||
#define WAIT_SQE 1
|
||||
// Suspend operation until there are <wait_detail> bytes of free space in the journal on disk
|
||||
#define WAIT_JOURNAL 3
|
||||
// Suspend operation until the next journal sector buffer is free
|
||||
#define WAIT_JOURNAL_BUFFER 4
|
||||
// Suspend operation until there is some free space on the data device
|
||||
#define WAIT_FREE 5
|
||||
|
||||
struct used_clean_obj_t
|
||||
{
|
||||
int refs;
|
||||
bool was_freed; // was freed by a parallel flush?
|
||||
bool was_changed; // was changed by a parallel flush?
|
||||
};
|
||||
|
||||
// https://github.com/algorithm-ninja/cpp-btree
|
||||
// https://github.com/greg7mdp/sparsepp/ was used previously, but it was TERRIBLY slow after resizing
|
||||
// with sparsepp, random reads dropped to ~700 iops very fast with just as much as ~32k objects in the DB
|
||||
typedef btree::btree_map<object_id, clean_entry> blockstore_clean_db_t;
|
||||
typedef std::map<obj_ver_id, dirty_entry> blockstore_dirty_db_t;
|
||||
// Suspend until something is compacted
|
||||
#define WAIT_COMPACTION 2
|
||||
|
||||
#include "blockstore_init.h"
|
||||
|
||||
@@ -202,58 +119,47 @@ struct blockstore_op_private_t
|
||||
{
|
||||
// Wait status
|
||||
int wait_for;
|
||||
uint64_t wait_detail, wait_detail2;
|
||||
uint64_t wait_detail;
|
||||
int pending_ops;
|
||||
int op_state;
|
||||
|
||||
// Read, write, sync, stabilize
|
||||
uint64_t lsn;
|
||||
|
||||
// Read
|
||||
uint64_t clean_block_used;
|
||||
std::vector<copy_buffer_t> read_vec;
|
||||
|
||||
// Sync, write
|
||||
uint64_t min_flushed_journal_sector, max_flushed_journal_sector;
|
||||
// Write
|
||||
uint64_t location;
|
||||
bool is_big;
|
||||
|
||||
// Stabilize, rollback
|
||||
int stab_pos;
|
||||
|
||||
// Stabilize
|
||||
uint64_t to_lsn;
|
||||
|
||||
// Write
|
||||
struct iovec iov_zerofill[3];
|
||||
// Warning: must not have a default value here because it's written to before calling constructor in blockstore_write.cpp O_o
|
||||
uint64_t real_version;
|
||||
timespec tv_begin;
|
||||
|
||||
// Sync
|
||||
std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
|
||||
};
|
||||
|
||||
typedef uint32_t pool_id_t;
|
||||
typedef uint64_t pool_pg_id_t;
|
||||
|
||||
#define POOL_ID_BITS 16
|
||||
|
||||
struct pool_shard_settings_t
|
||||
{
|
||||
uint32_t pg_count;
|
||||
uint32_t pg_stripe_size;
|
||||
};
|
||||
|
||||
#define STAB_SPLIT_DONE 1
|
||||
#define STAB_SPLIT_WAIT 2
|
||||
#define STAB_SPLIT_SYNC 3
|
||||
#define STAB_SPLIT_TODO 4
|
||||
|
||||
class blockstore_impl_t
|
||||
{
|
||||
public:
|
||||
blockstore_disk_t dsk;
|
||||
|
||||
/******* OPTIONS *******/
|
||||
bool readonly = false;
|
||||
// It is safe to disable fsync() if drive write cache is writethrough
|
||||
bool disable_data_fsync = false, disable_meta_fsync = false, disable_journal_fsync = false;
|
||||
// Enable if you want every operation to be executed with an "implicit fsync"
|
||||
// Suitable only for server SSDs with capacitors, requires disabled data and journal fsyncs
|
||||
int immediate_commit = IMMEDIATE_NONE;
|
||||
bool inmemory_meta = false;
|
||||
uint32_t meta_write_recheck_parallelism = 0;
|
||||
// Maximum and minimum flusher count
|
||||
unsigned max_flusher_count, min_flusher_count;
|
||||
unsigned journal_trim_interval;
|
||||
unsigned max_flusher_count = 0, min_flusher_count = 0;
|
||||
unsigned journal_trim_interval = 0;
|
||||
unsigned flusher_start_threshold = 0;
|
||||
// Maximum queue depth
|
||||
unsigned max_write_iodepth = 128;
|
||||
// Enable small (journaled) write throttling, useful for the SSD+HDD case
|
||||
@@ -268,139 +174,89 @@ class blockstore_impl_t
|
||||
uint64_t autosync_writes = 128;
|
||||
// Log level (0-10)
|
||||
int log_level = 0;
|
||||
// Enable correct block checksum validation on objects updated with small writes when checksum block
|
||||
// is larger than bitmap_granularity, at the expense of extra metadata fsyncs during compaction
|
||||
bool perfect_csum_update = false;
|
||||
/******* END OF OPTIONS *******/
|
||||
|
||||
struct ring_consumer_t ring_consumer;
|
||||
|
||||
std::map<pool_id_t, pool_shard_settings_t> clean_db_settings;
|
||||
std::map<pool_pg_id_t, blockstore_clean_db_t> clean_db_shards;
|
||||
std::map<uint64_t, int> no_inode_stats;
|
||||
uint8_t *clean_bitmaps = NULL;
|
||||
blockstore_dirty_db_t dirty_db;
|
||||
blockstore_heap_t *heap = NULL;
|
||||
uint8_t* meta_superblock = NULL;
|
||||
uint8_t *buffer_area = NULL;
|
||||
std::vector<blockstore_op_t*> submit_queue;
|
||||
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
|
||||
int unsynced_big_write_count = 0, unstable_unsynced = 0;
|
||||
int unsynced_big_write_count = 0, unsynced_small_write_count = 0, unsynced_meta_write_count = 0;
|
||||
int unsynced_queued_ops = 0;
|
||||
allocator_t *data_alloc = NULL;
|
||||
uint64_t used_blocks = 0;
|
||||
uint8_t *zero_object = NULL;
|
||||
|
||||
void *metadata_buffer = NULL;
|
||||
|
||||
struct journal_t journal;
|
||||
journal_flusher_t *flusher;
|
||||
int big_to_flush = 0;
|
||||
int write_iodepth = 0;
|
||||
bool alloc_dyn_data = false;
|
||||
|
||||
// clean data blocks referenced by read operations
|
||||
std::map<uint64_t, used_clean_obj_t> used_clean_objects;
|
||||
int inflight_big = 0;
|
||||
bool fsyncing_data = false;
|
||||
|
||||
bool live = false, queue_stall = false;
|
||||
ring_loop_t *ringloop;
|
||||
timerfd_manager_t *tfd;
|
||||
ring_loop_i *ringloop = NULL;
|
||||
timerfd_manager_t *tfd = NULL;
|
||||
|
||||
bool stop_sync_submitted;
|
||||
bool stop_sync_submitted = false;
|
||||
|
||||
inline struct io_uring_sqe* get_sqe()
|
||||
{
|
||||
return ringloop->get_sqe();
|
||||
}
|
||||
|
||||
friend class blockstore_init_meta;
|
||||
friend class blockstore_init_journal;
|
||||
friend struct blockstore_journal_check_t;
|
||||
friend class journal_flusher_t;
|
||||
friend class journal_flusher_co;
|
||||
|
||||
void calc_lengths();
|
||||
void open_data();
|
||||
void open_meta();
|
||||
void open_journal();
|
||||
uint8_t* get_clean_entry_bitmap(uint64_t block_loc, int offset);
|
||||
|
||||
blockstore_clean_db_t& clean_db_shard(object_id oid);
|
||||
void reshard_clean_db(pool_id_t pool_id, uint32_t pg_count, uint32_t pg_stripe_size);
|
||||
void recalc_inode_space_stats(uint64_t pool_id, bool per_inode);
|
||||
|
||||
// Journaling
|
||||
void prepare_journal_sector_write(int sector, blockstore_op_t *op);
|
||||
void handle_journal_write(ring_data_t *data, uint64_t flush_id);
|
||||
void disk_error_abort(const char *op, int retval, int expected);
|
||||
|
||||
// Asynchronous init
|
||||
int initialized;
|
||||
int metadata_buf_size;
|
||||
blockstore_init_meta* metadata_init_reader;
|
||||
blockstore_init_journal* journal_init_reader;
|
||||
|
||||
void init();
|
||||
void check_wait(blockstore_op_t *op);
|
||||
void init_op(blockstore_op_t *op);
|
||||
|
||||
// Read
|
||||
int dequeue_read(blockstore_op_t *read_op);
|
||||
int dequeue_read(blockstore_op_t *op);
|
||||
int fulfill_read(blockstore_op_t *op);
|
||||
uint32_t prepare_read(std::vector<copy_buffer_t> & read_vec, heap_object_t *obj, heap_write_t *wr, uint32_t start, uint32_t end);
|
||||
uint32_t prepare_read_with_bitmaps(std::vector<copy_buffer_t> & read_vec, heap_object_t *obj, heap_write_t *wr, uint32_t start, uint32_t end);
|
||||
uint32_t prepare_read_zero(std::vector<copy_buffer_t> & read_vec, uint32_t start, uint32_t end);
|
||||
uint32_t prepare_read_simple(std::vector<copy_buffer_t> & read_vec, heap_object_t *obj, heap_write_t *wr, uint32_t start, uint32_t end);
|
||||
void prepare_disk_read(std::vector<copy_buffer_t> & read_vec, int pos, heap_object_t *obj, heap_write_t *wr,
|
||||
uint32_t blk_start, uint32_t blk_end, uint32_t start, uint32_t end, uint32_t copy_flags);
|
||||
void find_holes(std::vector<copy_buffer_t> & read_vec, uint32_t item_start, uint32_t item_end,
|
||||
std::function<int(int, bool, uint32_t, uint32_t)> callback);
|
||||
int fulfill_read(blockstore_op_t *read_op,
|
||||
uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
|
||||
uint32_t item_state, uint64_t item_version, uint64_t item_location,
|
||||
uint64_t journal_sector, uint8_t *csum, int *dyn_data);
|
||||
bool fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled,
|
||||
uint8_t *clean_entry_bitmap, int *dyn_data,
|
||||
uint32_t item_start, uint32_t item_end, uint64_t clean_loc, uint64_t clean_ver);
|
||||
int fill_partial_checksum_blocks(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled,
|
||||
uint8_t *clean_entry_bitmap, int *dyn_data, bool from_journal, uint8_t *read_buf, uint64_t read_offset, uint64_t read_end);
|
||||
int pad_journal_read(std::vector<copy_buffer_t> & rv, copy_buffer_t & cp,
|
||||
uint64_t dirty_offset, uint64_t dirty_end, uint64_t dirty_loc, uint8_t *csum_ptr, int *dyn_data,
|
||||
uint64_t offset, uint64_t submit_len, uint64_t & blk_begin, uint64_t & blk_end, uint8_t* & blk_buf);
|
||||
bool read_range_fulfilled(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled, uint8_t *read_buf,
|
||||
uint8_t *clean_entry_bitmap, uint32_t item_start, uint32_t item_end);
|
||||
bool read_checksum_block(blockstore_op_t *op, int rv_pos, uint64_t &fulfilled, uint64_t clean_loc);
|
||||
uint8_t* read_clean_meta_block(blockstore_op_t *read_op, uint64_t clean_loc, int rv_pos);
|
||||
bool verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset,
|
||||
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
|
||||
bool verify_journal_checksums(uint8_t *csums, uint32_t offset,
|
||||
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
|
||||
bool verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *dyn_data, bool from_journal,
|
||||
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
|
||||
int fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len,
|
||||
uint32_t item_state, uint64_t item_version);
|
||||
std::function<void(int&, uint32_t, uint32_t)> callback);
|
||||
void free_read_buffers(std::vector<copy_buffer_t> & rv);
|
||||
void handle_read_event(ring_data_t *data, blockstore_op_t *op);
|
||||
bool verify_read_checksums(blockstore_op_t *op);
|
||||
|
||||
// Write
|
||||
bool enqueue_write(blockstore_op_t *op);
|
||||
void cancel_all_writes(blockstore_op_t *op, blockstore_dirty_db_t::iterator dirty_it, int retval);
|
||||
void prepare_meta_block_write(blockstore_op_t *op, uint64_t modified_block, io_uring_sqe *sqe = NULL);
|
||||
int dequeue_write(blockstore_op_t *op);
|
||||
int dequeue_del(blockstore_op_t *op);
|
||||
int make_big_write(blockstore_op_t *op, uint32_t offset, uint32_t len, uint32_t *modified_block, uint32_t *moved_from_block);
|
||||
int continue_write(blockstore_op_t *op);
|
||||
void release_journal_sectors(blockstore_op_t *op);
|
||||
void handle_write_event(ring_data_t *data, blockstore_op_t *op);
|
||||
|
||||
// Sync
|
||||
int continue_sync(blockstore_op_t *op);
|
||||
void ack_sync(blockstore_op_t *op);
|
||||
bool submit_fsyncs(int & wait_count);
|
||||
int do_sync(blockstore_op_t *op, int base_state);
|
||||
|
||||
// Stabilize
|
||||
int dequeue_stable(blockstore_op_t *op);
|
||||
int continue_stable(blockstore_op_t *op);
|
||||
void mark_stable(obj_ver_id ov, bool forget_dirty = false);
|
||||
void stabilize_object(object_id oid, uint64_t max_ver);
|
||||
blockstore_op_t* selective_sync(blockstore_op_t *op);
|
||||
int split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider);
|
||||
|
||||
// Rollback
|
||||
int dequeue_rollback(blockstore_op_t *op);
|
||||
int continue_rollback(blockstore_op_t *op);
|
||||
void mark_rolled_back(const obj_ver_id & ov);
|
||||
void erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc);
|
||||
void free_dirty_dyn_data(dirty_entry & e);
|
||||
|
||||
// List
|
||||
void process_list(blockstore_op_t *op);
|
||||
|
||||
public:
|
||||
/*public:*/
|
||||
|
||||
blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
|
||||
blockstore_impl_t(blockstore_config_t & config, ring_loop_i *ringloop, timerfd_manager_t *tfd, bool mock_mode = false);
|
||||
~blockstore_impl_t();
|
||||
|
||||
void parse_config(blockstore_config_t & config, bool init);
|
||||
@@ -426,21 +282,13 @@ public:
|
||||
// Simplified synchronous operation: get object bitmap & current version
|
||||
int read_bitmap(object_id oid, uint64_t target_version, void *bitmap, uint64_t *result_version = NULL);
|
||||
|
||||
// Unstable writes are added here (map of object_id -> version)
|
||||
std::unordered_map<object_id, uint64_t> unstable_writes;
|
||||
|
||||
// Space usage statistics
|
||||
std::map<uint64_t, uint64_t> inode_space_stats;
|
||||
|
||||
// Set per-pool no_inode_stats
|
||||
void set_no_inode_stats(const std::vector<uint64_t> & pool_ids);
|
||||
|
||||
// Print diagnostics to stdout
|
||||
void dump_diagnostics();
|
||||
|
||||
const std::map<uint64_t, uint64_t> & get_inode_space_stats() { return heap->get_inode_space_stats(); }
|
||||
inline uint32_t get_block_size() { return dsk.data_block_size; }
|
||||
inline uint64_t get_block_count() { return dsk.block_count; }
|
||||
inline uint64_t get_free_block_count() { return dsk.block_count - used_blocks; }
|
||||
uint64_t get_free_block_count();
|
||||
inline uint32_t get_bitmap_granularity() { return dsk.disk_alignment; }
|
||||
inline uint64_t get_journal_size() { return dsk.journal_len; }
|
||||
};
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -25,47 +25,10 @@ class blockstore_init_meta
|
||||
uint64_t next_offset = 0;
|
||||
uint64_t last_read_offset = 0;
|
||||
uint64_t entries_loaded = 0;
|
||||
unsigned entries_per_block = 0;
|
||||
int i = 0, j = 0;
|
||||
std::vector<uint64_t> entries_to_zero;
|
||||
bool handle_meta_block(uint8_t *buf, uint64_t count, uint64_t done_cnt);
|
||||
void handle_event(ring_data_t *data, int buf_num);
|
||||
public:
|
||||
blockstore_init_meta(blockstore_impl_t *bs);
|
||||
int loop();
|
||||
};
|
||||
|
||||
struct bs_init_journal_done
|
||||
{
|
||||
void *buf;
|
||||
uint64_t pos, len;
|
||||
};
|
||||
|
||||
class blockstore_init_journal
|
||||
{
|
||||
blockstore_impl_t *bs;
|
||||
int wait_state = 0, wait_count = 0, handle_res = 0;
|
||||
uint64_t entries_loaded = 0;
|
||||
uint32_t crc32_last = 0;
|
||||
bool started = false;
|
||||
uint64_t next_free;
|
||||
std::vector<bs_init_journal_done> done;
|
||||
std::vector<obj_ver_id> double_allocs;
|
||||
std::vector<iovec> small_write_data;
|
||||
uint64_t journal_pos = 0;
|
||||
uint64_t continue_pos = 0;
|
||||
void *init_write_buf = NULL;
|
||||
uint64_t init_write_sector = 0;
|
||||
bool wrapped = false;
|
||||
void *submitted_buf;
|
||||
struct io_uring_sqe *sqe;
|
||||
struct ring_data_t *data;
|
||||
journal_entry_start *je_start;
|
||||
std::function<void(ring_data_t*)> simple_callback;
|
||||
int handle_journal_part(void *buf, uint64_t done_pos, uint64_t len);
|
||||
void handle_event(ring_data_t *data);
|
||||
void erase_dirty_object(blockstore_dirty_db_t::iterator dirty_it);
|
||||
public:
|
||||
blockstore_init_journal(blockstore_impl_t* bs);
|
||||
int loop();
|
||||
};
|
||||
|
@@ -1,356 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "blockstore_impl.h"
|
||||
|
||||
blockstore_journal_check_t::blockstore_journal_check_t(blockstore_impl_t *bs)
|
||||
{
|
||||
this->bs = bs;
|
||||
sectors_to_write = 0;
|
||||
next_pos = bs->journal.next_free;
|
||||
next_sector = bs->journal.cur_sector;
|
||||
first_sector = -1;
|
||||
next_in_pos = bs->journal.in_sector_pos;
|
||||
right_dir = next_pos >= bs->journal.used_start;
|
||||
}
|
||||
|
||||
// Check if we can write <required> entries of <size> bytes and <data_after> data bytes after them to the journal
|
||||
int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries_required, int size, int data_after)
|
||||
{
|
||||
uint64_t prev_next = next_sector;
|
||||
int required = entries_required;
|
||||
while (1)
|
||||
{
|
||||
int fits = bs->journal.no_same_sector_overwrites && next_pos == bs->journal.next_free && bs->journal.sector_info[next_sector].written
|
||||
? 0
|
||||
: (bs->journal.block_size - next_in_pos) / size;
|
||||
if (fits > 0)
|
||||
{
|
||||
if (fits > required)
|
||||
{
|
||||
fits = required;
|
||||
}
|
||||
if (first_sector == -1)
|
||||
{
|
||||
first_sector = next_sector;
|
||||
}
|
||||
required -= fits;
|
||||
next_in_pos += fits * size;
|
||||
if (next_sector != prev_next || !sectors_to_write)
|
||||
{
|
||||
// Except the previous call to this function
|
||||
sectors_to_write++;
|
||||
}
|
||||
}
|
||||
else if (bs->journal.sector_info[next_sector].dirty)
|
||||
{
|
||||
if (next_sector != prev_next || !sectors_to_write)
|
||||
{
|
||||
// Except the previous call to this function
|
||||
sectors_to_write++;
|
||||
}
|
||||
}
|
||||
if (required <= 0)
|
||||
{
|
||||
break;
|
||||
}
|
||||
next_pos = next_pos + bs->journal.block_size;
|
||||
if (next_pos >= bs->journal.len)
|
||||
{
|
||||
next_pos = bs->journal.block_size;
|
||||
right_dir = false;
|
||||
}
|
||||
next_in_pos = 0;
|
||||
next_sector = ((next_sector + 1) % bs->journal.sector_count);
|
||||
if (next_sector == first_sector)
|
||||
{
|
||||
// next_sector may wrap when all sectors are flushed and the incoming batch is too big
|
||||
// This is an error condition, we can't wait for anything in this case
|
||||
throw std::runtime_error(
|
||||
"Blockstore journal_sector_buffer_count="+std::to_string(bs->journal.sector_count)+
|
||||
" is too small for a batch of "+std::to_string(entries_required)+" entries of "+std::to_string(size)+" bytes"
|
||||
);
|
||||
}
|
||||
if (bs->journal.sector_info[next_sector].flush_count > 0 ||
|
||||
bs->journal.sector_info[next_sector].dirty)
|
||||
{
|
||||
// No memory buffer available. Wait for it.
|
||||
int used = 0, dirty = 0;
|
||||
for (int i = 0; i < bs->journal.sector_count; i++)
|
||||
{
|
||||
if (bs->journal.sector_info[i].dirty)
|
||||
{
|
||||
dirty++;
|
||||
used++;
|
||||
}
|
||||
if (bs->journal.sector_info[i].flush_count > 0)
|
||||
{
|
||||
used++;
|
||||
}
|
||||
}
|
||||
// In fact, it's even more rare than "ran out of journal space", so print a warning
|
||||
printf(
|
||||
"Ran out of journal sector buffers: %d/%ju buffers used (%d dirty), next buffer (%jd)"
|
||||
" is %s and flushed %ju times. Consider increasing \'journal_sector_buffer_count\'\n",
|
||||
used, bs->journal.sector_count, dirty, next_sector,
|
||||
bs->journal.sector_info[next_sector].dirty ? "dirty" : "not dirty",
|
||||
bs->journal.sector_info[next_sector].flush_count
|
||||
);
|
||||
PRIV(op)->wait_for = WAIT_JOURNAL_BUFFER;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
if (data_after > 0)
|
||||
{
|
||||
next_pos = next_pos + data_after;
|
||||
if (next_pos >= bs->journal.len)
|
||||
{
|
||||
if (right_dir)
|
||||
next_pos = bs->journal.block_size + data_after;
|
||||
right_dir = false;
|
||||
}
|
||||
}
|
||||
if (!right_dir && next_pos >= bs->journal.used_start-bs->journal.block_size)
|
||||
{
|
||||
// No space in the journal. Wait until used_start changes.
|
||||
if (bs->log_level > 5)
|
||||
{
|
||||
printf(
|
||||
"Ran out of journal space (used_start=%08jx, next_free=%08jx, dirty_start=%08jx)\n",
|
||||
bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start
|
||||
);
|
||||
}
|
||||
PRIV(op)->wait_for = WAIT_JOURNAL;
|
||||
bs->flusher->request_trim();
|
||||
PRIV(op)->wait_detail = bs->journal.used_start;
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Reserve space for one journal entry of <size> bytes in the current journal
// sector, advancing to the next sector (and zero-filling its buffer) when the
// entry doesn't fit. Initializes the entry header (magic/type/size) and links
// it into the CRC chain via crc32_prev. Returns a pointer into either the
// in-memory journal mirror or the sector ring buffer; the caller fills in the
// entry-specific fields and the final crc32.
journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size)
{
    if (!journal.entry_fits(size))
    {
        // The current sector must already be queued for writing (not dirty)
        // before we can leave it behind
        assert(!journal.sector_info[journal.cur_sector].dirty);
        // Move to the next journal sector
        if (journal.sector_info[journal.cur_sector].flush_count > 0)
        {
            // Also select next sector buffer in memory
            journal.cur_sector = ((journal.cur_sector + 1) % journal.sector_count);
            // The next buffer must be free of in-flight writes (the caller is
            // responsible for checking buffer availability beforehand)
            assert(!journal.sector_info[journal.cur_sector].flush_count);
        }
        else
        {
            // Nothing in flight for this buffer: the dirty region now starts here
            journal.dirty_start = journal.next_free;
        }
        journal.sector_info[journal.cur_sector].written = false;
        journal.sector_info[journal.cur_sector].offset = journal.next_free;
        journal.in_sector_pos = 0;
        // Advance next_free by one block, wrapping past the end of the journal
        // back to block_size (block 0 is presumably the journal header)
        auto next_next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
        // double check that next_free doesn't cross used_start from the left
        assert(journal.next_free >= journal.used_start && next_next_free >= journal.next_free || next_next_free < journal.used_start);
        journal.next_free = next_next_free;
        // Zero the buffer backing the new sector
        memset(journal.inmemory
            ? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
            : (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector, 0, journal.block_size);
    }
    // Entry is placed at the current in-sector position of the current buffer
    journal_entry *je = (struct journal_entry*)(
        (journal.inmemory
            ? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
            : (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector) + journal.in_sector_pos
    );
    journal.in_sector_pos += size;
    je->magic = JOURNAL_MAGIC;
    je->type = type;
    je->size = size;
    // Chain entries: each records the CRC of the previous one
    je->crc32_prev = journal.crc32_last;
    journal.sector_info[journal.cur_sector].dirty = true;
    return je;
}
|
||||
|
||||
// Queue an io_uring write of journal sector <cur_sector> and register <op>
// as waiting for that exact write. If the sector is already part of the
// current batch (submit_id set), no new SQE is created - the op just joins
// the waiters of the existing write. The caller must guarantee an SQE is
// available when a new write is needed.
void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_op_t *op)
{
    // Don't submit the same sector twice in the same batch
    if (!journal.sector_info[cur_sector].submit_id)
    {
        io_uring_sqe *sqe = get_sqe();
        // Caller must ensure availability of an SQE
        assert(sqe != NULL);
        ring_data_t *data = ((ring_data_t*)sqe->user_data);
        journal.sector_info[cur_sector].written = true;
        // Assign a new monotonically increasing id to this write
        journal.sector_info[cur_sector].submit_id = ++journal.submit_id;
        assert(journal.submit_id != 0); // check overflow
        journal.submitting_sectors.push_back(cur_sector);
        // One more write in flight for this sector buffer
        journal.sector_info[cur_sector].flush_count++;
        // Write either from the in-memory journal mirror or from the
        // per-sector ring buffer
        data->iov = (struct iovec){
            (journal.inmemory
                ? (uint8_t*)journal.buffer + journal.sector_info[cur_sector].offset
                : (uint8_t*)journal.sector_buf + journal.block_size*cur_sector),
            (size_t)journal.block_size
        };
        // Capture the id by value - the sector's submit_id may be reused later
        data->callback = [this, flush_id = journal.submit_id](ring_data_t *data) { handle_journal_write(data, flush_id); };
        my_uring_prep_writev(
            sqe, dsk.journal_fd, &data->iov, 1, journal.offset + journal.sector_info[cur_sector].offset
        );
    }
    // The sector content is now captured by a pending write
    journal.sector_info[cur_sector].dirty = false;
    // But always remember that this operation has to wait until this exact journal write is finished
    journal.flushing_ops.emplace(journal.sector_info[cur_sector].submit_id, (pending_journaling_t){
        .pending = 1,
        .sector = cur_sector,
        .op = op,
    });
    auto priv = PRIV(op);
    priv->pending_ops++;
    // Track the 1-based range of sectors this op touched (0 = none yet)
    if (!priv->min_flushed_journal_sector)
        priv->min_flushed_journal_sector = 1+cur_sector;
    assert(priv->min_flushed_journal_sector <= journal.sector_count);
    priv->max_flushed_journal_sector = 1+cur_sector;
}
|
||||
|
||||
// io_uring completion handler for one journal sector write (identified by
// flush_id, assigned in prepare_journal_sector_write). Marks the write as
// finished, releases the sector buffer reference, and completes the
// operations tracked in journal.flushing_ops whose writes are done.
void blockstore_impl_t::handle_journal_write(ring_data_t *data, uint64_t flush_id)
{
    live = true;
    if (data->res != data->iov.iov_len)
    {
        // FIXME: our state becomes corrupted after a write error. maybe do something better than just die
        disk_error_abort("journal write", data->res, data->iov.iov_len);
    }
    auto fl_it = journal.flushing_ops.lower_bound(flush_id);
    // Only the first record for this flush_id carries the sector reference
    if (fl_it != journal.flushing_ops.end() && fl_it->first == flush_id && fl_it->second.sector >= 0)
    {
        journal.sector_info[fl_it->second.sector].flush_count--;
    }
    auto is_first = fl_it == journal.flushing_ops.begin();
    // Walk the tracked writes starting at flush_id: mark our entries done,
    // and erase/complete entries whose writes have finished. Records of this
    // flush_id are only erased when it is the oldest tracked write (is_first)
    // so that an op is not completed while earlier journal writes are still
    // in flight.
    // NOTE(review): later entries with pending == 0 are also erased on this
    // pass - presumably they can only exist once everything before them has
    // finished; verify against prepare_journal_sector_write's ordering.
    while (fl_it != journal.flushing_ops.end())
    {
        bool del = false;
        if (fl_it->first == flush_id)
        {
            fl_it->second.pending = 0;
            del = is_first;
        }
        else
        {
            del = !fl_it->second.pending;
        }
        if (del)
        {
            // Do not complete this operation if previous writes are unfinished
            // Otherwise also complete following operations waiting for this one
            auto priv = PRIV(fl_it->second.op);
            priv->pending_ops--;
            assert(priv->pending_ops >= 0);
            if (priv->pending_ops == 0)
            {
                // All journal writes of this op are done: advance its state
                // machine and wake the ring loop to continue it
                release_journal_sectors(fl_it->second.op);
                priv->op_state++;
                ringloop->wakeup();
            }
            journal.flushing_ops.erase(fl_it++);
        }
        else
        {
            fl_it++;
        }
    }
}
|
||||
|
||||
// Release the journal's heap buffers. free(NULL) is a defined no-op per the
// C standard, so no guards are needed; pointers are reset afterwards so a
// double destruction (or later inspection) stays safe.
journal_t::~journal_t()
{
    free(sector_buf);
    free(sector_info);
    free(buffer);
    sector_buf = NULL;
    sector_info = NULL;
    buffer = NULL;
}
|
||||
|
||||
uint64_t journal_t::get_trim_pos()
|
||||
{
|
||||
auto journal_used_it = used_sectors.lower_bound(used_start);
|
||||
if (journal_used_it == used_sectors.end())
|
||||
{
|
||||
// Journal is cleared to its end, restart from the beginning
|
||||
journal_used_it = used_sectors.begin();
|
||||
if (journal_used_it == used_sectors.end())
|
||||
{
|
||||
// Journal is empty
|
||||
return next_free;
|
||||
}
|
||||
else
|
||||
{
|
||||
// next_free does not need updating during trim
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"Trimming journal (used_start=%08jx, next_free=%08jx, dirty_start=%08jx, new_start=%08jx, new_refcount=%jd)\n",
|
||||
used_start, next_free, dirty_start,
|
||||
journal_used_it->first, journal_used_it->second
|
||||
);
|
||||
#endif
|
||||
return journal_used_it->first;
|
||||
}
|
||||
}
|
||||
else if (journal_used_it->first > used_start)
|
||||
{
|
||||
// Journal is cleared up to <journal_used_it>
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"Trimming journal (used_start=%08jx, next_free=%08jx, dirty_start=%08jx, new_start=%08jx, new_refcount=%jd)\n",
|
||||
used_start, next_free, dirty_start,
|
||||
journal_used_it->first, journal_used_it->second
|
||||
);
|
||||
#endif
|
||||
return journal_used_it->first;
|
||||
}
|
||||
// Can't trim journal
|
||||
return used_start;
|
||||
}
|
||||
|
||||
void journal_t::dump_diagnostics()
|
||||
{
|
||||
auto journal_used_it = used_sectors.lower_bound(used_start);
|
||||
if (journal_used_it == used_sectors.end())
|
||||
{
|
||||
// Journal is cleared to its end, restart from the beginning
|
||||
journal_used_it = used_sectors.begin();
|
||||
}
|
||||
printf(
|
||||
"Journal: used_start=%08jx next_free=%08jx dirty_start=%08jx trim_to=%08jx trim_to_refs=%jd\n",
|
||||
used_start, next_free, dirty_start,
|
||||
journal_used_it == used_sectors.end() ? 0 : journal_used_it->first,
|
||||
journal_used_it == used_sectors.end() ? 0 : journal_used_it->second
|
||||
);
|
||||
}
|
||||
|
||||
// Zero-filled scratch buffer used as the padding source by crc32c_pad();
// callers hash at most 4096 bytes of it per call.
// NOTE(review): declared as uint64_t[4096] = 32 KiB, although only the first
// 4096 bytes are ever read - possibly intended as uint8_t[4096]; confirm.
static uint64_t zero_page[4096];
|
||||
|
||||
uint32_t crc32c_pad(uint32_t prev_crc, const void *buf, size_t len, size_t left_pad, size_t right_pad)
|
||||
{
|
||||
uint32_t r = prev_crc;
|
||||
while (left_pad >= 4096)
|
||||
{
|
||||
r = crc32c(r, zero_page, 4096);
|
||||
left_pad -= 4096;
|
||||
}
|
||||
if (left_pad > 0)
|
||||
r = crc32c(r, zero_page, left_pad);
|
||||
r = crc32c(r, buf, len);
|
||||
while (right_pad >= 4096)
|
||||
{
|
||||
r = crc32c(r, zero_page, 4096);
|
||||
right_pad -= 4096;
|
||||
}
|
||||
if (left_pad > 0)
|
||||
r = crc32c(r, zero_page, right_pad);
|
||||
return r;
|
||||
}
|
||||
|
||||
// "No padding" checksum variant: hashes only the payload bytes, starting
// from a zero seed. prev_crc, left_pad and right_pad are accepted but
// deliberately unused so the signature matches crc32c_pad() - presumably so
// callers can select either behavior through a common function pointer;
// TODO confirm at the call sites.
uint32_t crc32c_nopad(uint32_t prev_crc, const void *buf, size_t len, size_t left_pad, size_t right_pad)
{
    return crc32c(0, buf, len);
}
|
@@ -2,6 +2,7 @@
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include <sys/file.h>
|
||||
#include <stdexcept>
|
||||
#include "blockstore_impl.h"
|
||||
|
||||
void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
||||
@@ -14,12 +15,14 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
||||
}
|
||||
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
|
||||
journal_trim_interval = strtoull(config["journal_trim_interval"].c_str(), NULL, 10);
|
||||
flusher_start_threshold = strtoull(config["flusher_start_threshold"].c_str(), NULL, 10);
|
||||
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
|
||||
throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
|
||||
throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
|
||||
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
|
||||
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
|
||||
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
|
||||
perfect_csum_update = config["perfect_csum_update"] == "true" || config["perfect_csum_update"] == "1" || config["perfect_csum_update"] == "yes";
|
||||
if (config["autosync_writes"] != "")
|
||||
{
|
||||
autosync_writes = strtoull(config["autosync_writes"].c_str(), NULL, 10);
|
||||
@@ -28,13 +31,17 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
||||
{
|
||||
max_flusher_count = 256;
|
||||
}
|
||||
if (!min_flusher_count || journal.flush_journal)
|
||||
if (!min_flusher_count)
|
||||
{
|
||||
min_flusher_count = 1;
|
||||
}
|
||||
if (!journal_trim_interval)
|
||||
{
|
||||
journal_trim_interval = 512;
|
||||
journal_trim_interval = 1024;
|
||||
}
|
||||
if (!flusher_start_threshold)
|
||||
{
|
||||
flusher_start_threshold = 32;
|
||||
}
|
||||
if (!max_write_iodepth)
|
||||
{
|
||||
@@ -68,23 +75,6 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
||||
{
|
||||
readonly = true;
|
||||
}
|
||||
if (config["disable_data_fsync"] == "true" || config["disable_data_fsync"] == "1" || config["disable_data_fsync"] == "yes")
|
||||
{
|
||||
disable_data_fsync = true;
|
||||
}
|
||||
if (config["disable_meta_fsync"] == "true" || config["disable_meta_fsync"] == "1" || config["disable_meta_fsync"] == "yes")
|
||||
{
|
||||
disable_meta_fsync = true;
|
||||
}
|
||||
if (config["disable_journal_fsync"] == "true" || config["disable_journal_fsync"] == "1" || config["disable_journal_fsync"] == "yes")
|
||||
{
|
||||
disable_journal_fsync = true;
|
||||
}
|
||||
if (config["flush_journal"] == "true" || config["flush_journal"] == "1" || config["flush_journal"] == "yes")
|
||||
{
|
||||
// Only flush journal and exit
|
||||
journal.flush_journal = true;
|
||||
}
|
||||
if (config["immediate_commit"] == "all")
|
||||
{
|
||||
immediate_commit = IMMEDIATE_ALL;
|
||||
@@ -94,85 +84,23 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
||||
immediate_commit = IMMEDIATE_SMALL;
|
||||
}
|
||||
metadata_buf_size = strtoull(config["meta_buf_size"].c_str(), NULL, 10);
|
||||
inmemory_meta = config["inmemory_metadata"] != "false" && config["inmemory_metadata"] != "0" &&
|
||||
config["inmemory_metadata"] != "no";
|
||||
journal.sector_count = strtoull(config["journal_sector_buffer_count"].c_str(), NULL, 10);
|
||||
journal.no_same_sector_overwrites = config["journal_no_same_sector_overwrites"] == "true" ||
|
||||
config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
|
||||
journal.inmemory = config["inmemory_journal"] != "false" && config["inmemory_journal"] != "0" &&
|
||||
config["inmemory_journal"] != "no";
|
||||
meta_write_recheck_parallelism = strtoull(config["meta_write_recheck_parallelism"].c_str(), NULL, 10);
|
||||
log_level = strtoull(config["log_level"].c_str(), NULL, 10);
|
||||
// Validate
|
||||
if (journal.sector_count < 2)
|
||||
{
|
||||
journal.sector_count = 32;
|
||||
}
|
||||
if (metadata_buf_size < 65536)
|
||||
{
|
||||
metadata_buf_size = 4*1024*1024;
|
||||
}
|
||||
if (dsk.meta_device == dsk.data_device)
|
||||
if (!meta_write_recheck_parallelism)
|
||||
{
|
||||
disable_meta_fsync = disable_data_fsync;
|
||||
meta_write_recheck_parallelism = 16;
|
||||
}
|
||||
if (dsk.journal_device == dsk.meta_device)
|
||||
{
|
||||
disable_journal_fsync = disable_meta_fsync;
|
||||
}
|
||||
if (immediate_commit != IMMEDIATE_NONE && !disable_journal_fsync)
|
||||
if (immediate_commit != IMMEDIATE_NONE && !dsk.disable_journal_fsync)
|
||||
{
|
||||
throw std::runtime_error("immediate_commit requires disable_journal_fsync");
|
||||
}
|
||||
if (immediate_commit == IMMEDIATE_ALL && !disable_data_fsync)
|
||||
if (immediate_commit == IMMEDIATE_ALL && !dsk.disable_data_fsync)
|
||||
{
|
||||
throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync");
|
||||
}
|
||||
// init some fields
|
||||
journal.block_size = dsk.journal_block_size;
|
||||
journal.next_free = dsk.journal_block_size;
|
||||
journal.used_start = dsk.journal_block_size;
|
||||
// no free space because sector is initially unmapped
|
||||
journal.in_sector_pos = dsk.journal_block_size;
|
||||
}
|
||||
|
||||
// Compute derived layout lengths (delegated to dsk.calc_lengths()) and
// allocate the in-memory structures that depend on them: the metadata
// mirror or clean-bitmap cache, the journal mirror or sector ring buffer,
// and the per-sector bookkeeping array.
// Throws std::runtime_error or std::bad_alloc on allocation failure.
void blockstore_impl_t::calc_lengths()
{
    dsk.calc_lengths();
    journal.len = dsk.journal_len;
    journal.block_size = dsk.journal_block_size;
    journal.offset = dsk.journal_offset;
    if (inmemory_meta)
    {
        // Whole metadata area is mirrored in RAM
        metadata_buffer = memalign(MEM_ALIGNMENT, dsk.meta_len);
        if (!metadata_buffer)
            throw std::runtime_error("Failed to allocate memory for the metadata ("+std::to_string(dsk.meta_len/1024/1024)+" MB)");
    }
    else if (dsk.clean_entry_bitmap_size || dsk.data_csum_type)
    {
        // Metadata stays on disk, but clean-entry bitmaps are cached in RAM
        // (2 bitmaps per data block, hence the factor of 2)
        clean_bitmaps = (uint8_t*)malloc(dsk.block_count * 2 * dsk.clean_entry_bitmap_size);
        if (!clean_bitmaps)
        {
            throw std::runtime_error(
                "Failed to allocate memory for the metadata sparse write bitmap ("+
                std::to_string(dsk.block_count * 2 * dsk.clean_entry_bitmap_size / 1024 / 1024)+" MB)"
            );
        }
    }
    if (journal.inmemory)
    {
        // Whole journal is mirrored in RAM
        journal.buffer = memalign(MEM_ALIGNMENT, journal.len);
        if (!journal.buffer)
            throw std::runtime_error("Failed to allocate memory for journal ("+std::to_string(journal.len/1024/1024)+" MB)");
    }
    else
    {
        // Only a ring of sector_count one-block buffers is kept in RAM
        journal.sector_buf = (uint8_t*)memalign(MEM_ALIGNMENT, journal.sector_count * dsk.journal_block_size);
        if (!journal.sector_buf)
            throw std::bad_alloc();
    }
    // Zero-initialized per-sector state (offset, dirty, flush_count, ...)
    journal.sector_info = (journal_sector_info_t*)calloc(journal.sector_count, sizeof(journal_sector_info_t));
    if (!journal.sector_info)
    {
        throw std::bad_alloc();
    }
}
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -1,258 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "blockstore_impl.h"
|
||||
|
||||
// Queue entry point for a rollback operation: for each (oid, version) in
// op->buf, discard all newer unstable versions. First classifies every
// version via split_stab_op() (done / must wait / must sync / can proceed),
// then writes JE_ROLLBACK journal entries for the remaining ones.
// Returns split_stab_op()-style codes: 2 = finished/dequeue, 0 = wait,
// 1 = journal writes submitted (state machine continues in continue_rollback).
int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
{
    if (PRIV(op)->op_state)
    {
        // Operation already started - resume its state machine
        return continue_rollback(op);
    }
    int r = split_stab_op(op, [this](obj_ver_id ov)
    {
        // Check that there are some versions greater than v->version (which may be zero),
        // check that they're unstable, synced, and not currently written to
        auto dirty_it = dirty_db.lower_bound((obj_ver_id){
            .oid = ov.oid,
            .version = UINT64_MAX,
        });
        if (dirty_it == dirty_db.begin())
        {
            // Already rolled back, skip this object version
            return STAB_SPLIT_DONE;
        }
        else
        {
            dirty_it--;
            if (dirty_it->first.oid != ov.oid || dirty_it->first.version < ov.version)
            {
                // Already rolled back, skip this object version
                return STAB_SPLIT_DONE;
            }
            // Scan this object's dirty versions above ov.version, newest first
            while (dirty_it->first.oid == ov.oid && dirty_it->first.version > ov.version)
            {
                if (IS_IN_FLIGHT(dirty_it->second.state))
                {
                    // Object write is still in progress. Wait until the write request completes
                    return STAB_SPLIT_WAIT;
                }
                else if (!IS_SYNCED(dirty_it->second.state) ||
                    IS_STABLE(dirty_it->second.state))
                {
                    // Sync the object
                    return STAB_SPLIT_SYNC;
                }
                if (dirty_it == dirty_db.begin())
                {
                    break;
                }
                dirty_it--;
            }
            return STAB_SPLIT_TODO;
        }
    });
    if (r != 1)
    {
        return r;
    }
    // Check journal space
    blockstore_journal_check_t space_check(this);
    if (!space_check.check_available(op, op->len, sizeof(journal_entry_rollback), 0))
    {
        return 0;
    }
    // There is sufficient space. Check SQEs
    BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
    // Prepare and submit journal entries
    int s = 0;
    auto v = (obj_ver_id*)op->buf;
    for (int i = 0; i < op->len; i++, v++)
    {
        // Flush the current sector first when the next entry doesn't fit
        if (!journal.entry_fits(sizeof(journal_entry_rollback)) &&
            journal.sector_info[journal.cur_sector].dirty)
        {
            prepare_journal_sector_write(journal.cur_sector, op);
            s++;
        }
        journal_entry_rollback *je = (journal_entry_rollback*)
            prefill_single_journal_entry(journal, JE_ROLLBACK, sizeof(journal_entry_rollback));
        je->oid = v->oid;
        je->version = v->version;
        // Seal the entry and extend the journal CRC chain
        je->crc32 = je_crc32((journal_entry*)je);
        journal.crc32_last = je->crc32;
    }
    // Flush the last (possibly partial) sector
    prepare_journal_sector_write(journal.cur_sector, op);
    s++;
    assert(s == space_check.sectors_to_write);
    PRIV(op)->op_state = 1;
    return 1;
}
|
||||
|
||||
// Rollback state machine continuation.
// op_state 2 = journal entries written, journal fsync still needed;
// op_state 4 = fsync (if any) completed, apply the rollback in memory.
// Odd states mean I/O is in flight; the completion callback (see
// handle_write_event) is expected to advance them to the next even state.
int blockstore_impl_t::continue_rollback(blockstore_op_t *op)
{
    if (PRIV(op)->op_state == 2)
        goto resume_2;
    else if (PRIV(op)->op_state == 4)
        goto resume_4;
    else
        return 1;
resume_2:
    if (!disable_journal_fsync)
    {
        // Sync the journal device before mutating in-memory state
        BS_SUBMIT_GET_SQE(sqe, data);
        my_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
        data->iov = { 0 };
        data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
        PRIV(op)->pending_ops = 1;
        PRIV(op)->op_state = 3;
        return 1;
    }
    // Journal fsync disabled: fall straight through to the final step
resume_4:
    obj_ver_id* v;
    int i;
    // Apply the rollback to dirty_db for every requested object version
    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
    {
        mark_rolled_back(*v);
    }
    // Acknowledge op
    op->retval = 0;
    FINISH_OP(op);
    return 2;
}
|
||||
|
||||
// Remove from dirty_db all versions of ov.oid newer than ov.version (stopping
// at in-flight or stable entries), releasing their resources via
// erase_dirty(), and update the unstable_writes map to the newest unstable
// version that remains (or drop the record when none remains).
void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
{
    // Start just past the newest possible version of this object
    auto it = dirty_db.lower_bound((obj_ver_id){
        .oid = ov.oid,
        .version = UINT64_MAX,
    });
    if (it != dirty_db.begin())
    {
        uint64_t max_unstable = 0;
        auto rm_start = it;
        auto rm_end = it;
        it--;
        // Walk this object's versions from newest to oldest, growing the
        // [rm_start, rm_end) removal range
        while (1)
        {
            if (it->first.oid != ov.oid)
                break;
            else if (it->first.version <= ov.version)
            {
                // Reached the target version: it (and older ones) stay.
                // Remember it as the newest surviving unstable version.
                if (!IS_STABLE(it->second.state))
                    max_unstable = it->first.version;
                break;
            }
            else if (IS_IN_FLIGHT(it->second.state) || IS_STABLE(it->second.state))
                break;
            // Remove entry
            rm_start = it;
            if (it == dirty_db.begin())
                break;
            it--;
        }
        if (rm_start != rm_end)
        {
            erase_dirty(rm_start, rm_end, UINT64_MAX);
            // Fix up the "newest unstable version" record for this object
            auto unstab_it = unstable_writes.find(ov.oid);
            if (unstab_it != unstable_writes.end())
            {
                if (max_unstable == 0)
                    unstable_writes.erase(unstab_it);
                else
                    unstab_it->second = max_unstable;
            }
        }
    }
}
|
||||
|
||||
// Erase the dirty_db range [dirty_start, dirty_end), releasing everything
// the entries own: data blocks (except the one equal to clean_loc), journal
// sector references, and dynamic bitmap/checksum buffers. If the newest
// erased entry is a delete, operations queued behind that delete are
// unblocked first.
void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc)
{
    if (dirty_end == dirty_start)
    {
        return;
    }
    auto dirty_it = dirty_end;
    dirty_it--;
    if (IS_DELETE(dirty_it->second.state))
    {
        object_id oid = dirty_it->first.oid;
#ifdef BLOCKSTORE_DEBUG
        printf("Unblock writes-after-delete %jx:%jx v%ju\n", oid.inode, oid.stripe, dirty_it->first.version);
#endif
        dirty_it = dirty_end;
        // Unblock operations blocked by delete flushing
        uint32_t next_state = BS_ST_IN_FLIGHT;
        while (dirty_it != dirty_db.end() && dirty_it->first.oid == oid)
        {
            if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_DEL)
            {
                dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | next_state;
                // Only the first unblocked big write proceeds immediately;
                // later ones are re-queued behind it
                if (IS_BIG_WRITE(dirty_it->second.state))
                {
                    next_state = BS_ST_WAIT_BIG;
                }
            }
            dirty_it++;
        }
        dirty_it = dirty_end;
        dirty_it--;
    }
    // Walk the range from newest to oldest, releasing each entry's resources
    while (1)
    {
        if ((IS_BIG_WRITE(dirty_it->second.state) || IS_DELETE(dirty_it->second.state)) &&
            IS_STABLE(dirty_it->second.state))
        {
            // One fewer stable big-write/delete awaiting the flusher
            big_to_flush--;
        }
        if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc &&
            dirty_it->second.location != UINT64_MAX)
        {
            // Free the data block unless it became the clean location
#ifdef BLOCKSTORE_DEBUG
            printf("Free block %ju from %jx:%jx v%ju\n", dirty_it->second.location >> dsk.block_order,
                dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
#endif
            data_alloc->set(dirty_it->second.location >> dsk.block_order, false);
        }
        // Drop this entry's reference to its journal sector
        // (.at() throws if the sector is untracked - treated as an invariant)
        auto used = --journal.used_sectors.at(dirty_it->second.journal_sector);
#ifdef BLOCKSTORE_DEBUG
        printf(
            "remove usage of journal offset %08jx by %jx:%jx v%ju (%ju refs)\n", dirty_it->second.journal_sector,
            dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, used
        );
#endif
        if (used == 0)
        {
            journal.used_sectors.erase(dirty_it->second.journal_sector);
            if (dirty_it->second.journal_sector == journal.sector_info[journal.cur_sector].offset)
            {
                // Mark current sector as "full" to select the new one
                journal.in_sector_pos = dsk.journal_block_size;
            }
            flusher->mark_trim_possible();
        }
        free_dirty_dyn_data(dirty_it->second);
        if (dirty_it == dirty_start)
        {
            break;
        }
        dirty_it--;
    }
    // Finally drop the map entries themselves
    dirty_db.erase(dirty_start, dirty_end);
}
|
||||
|
||||
// Drop a dirty entry's reference to its dynamic data (bitmap + checksums)
// and clear the pointer. The buffer is only heap-owned (refcounted via its
// leading int) when alloc_dyn_data is set; otherwise it is merely unlinked.
void blockstore_impl_t::free_dirty_dyn_data(dirty_entry & e)
{
    if (!e.dyn_data)
        return;
    if (alloc_dyn_data)
    {
        // First int of the buffer is the reference count; free on last drop
        int & refs = *(int*)e.dyn_data;
        if (--refs == 0)
            free(e.dyn_data);
    }
    e.dyn_data = NULL;
}
|
@@ -3,559 +3,87 @@
|
||||
|
||||
#include "blockstore_impl.h"
|
||||
|
||||
// Stabilize small write:
|
||||
// 1) Copy data from the journal to the data device
|
||||
// 2) Increase version on the metadata device and sync it
|
||||
// 3) Advance clean_db entry's version, clear previous journal entries
|
||||
//
|
||||
// This makes 1 4K small write+sync look like:
|
||||
// 512b+4K (journal) + sync + 512b (journal) + sync + 4K (data) [+ sync?] + 512b (metadata) + sync.
|
||||
// WA = 2.375. It's not the best, SSD FTL-like redirect-write could probably be lower
|
||||
// even with defragmentation. But it's fixed and it's still better than in Ceph. :)
|
||||
// except for HDD-only clusters, because each write results in 3 seeks.
|
||||
|
||||
// Stabilize big write:
|
||||
// 1) Copy metadata from the journal to the metadata device
|
||||
// 2) Move dirty_db entry to clean_db and clear previous journal entries
|
||||
//
|
||||
// This makes 1 128K big write+sync look like:
|
||||
// 128K (data) + sync + 512b (journal) + sync + 512b (journal) + sync + 512b (metadata) + sync.
|
||||
// WA = 1.012. Very good :)
|
||||
|
||||
// Stabilize delete:
|
||||
// 1) Remove metadata entry and sync it
|
||||
// 2) Remove dirty_db entry and clear previous journal entries
|
||||
// We have 2 problems here:
|
||||
// - In the cluster environment, we must store the "tombstones" of deleted objects until
|
||||
// all replicas (not just quorum) agrees about their deletion. That is, "stabilize" is
|
||||
// not possible for deletes in degraded placement groups
|
||||
// - With simple "fixed" metadata tables we can't just clear the metadata entry of the latest
|
||||
// object version. We must clear all previous entries, too.
|
||||
// FIXME Fix both problems - probably, by switching from "fixed" metadata tables to "dynamic"
|
||||
|
||||
// AND We must do it in batches, for the sake of reduced fsync call count
|
||||
// AND We must know what we stabilize. Basic workflow is like:
|
||||
// 1) primary OSD receives sync request
|
||||
// 2) it submits syncs to blockstore and peers
|
||||
// 3) after everyone acks sync it acks sync to the client
|
||||
// 4) after a while it takes his synced object list and sends stabilize requests
|
||||
// to peers and to its own blockstore, thus freeing the old version
|
||||
|
||||
// Manually managed growable array of object versions, used by
// split_stab_op() to collect "good"/"bad" version subsets. The items array
// is malloc'd lazily (see init_versions/append_version) and freed by the
// wrapped completion callback in split_stab_op().
struct ver_vector_t
{
    obj_ver_id *items = NULL;      // malloc_or_die'd storage; NULL until first use
    uint64_t alloc = 0, size = 0;  // capacity and used element count
};
|
||||
|
||||
// Lazily materialize <vec>: on the first call, allocate room for <len>
// versions and copy everything in [start, end) into it. Subsequent calls
// (vec.items already set) are no-ops, so callers may invoke it every time
// they discover a version that must be excluded.
static void init_versions(ver_vector_t & vec, obj_ver_id *start, obj_ver_id *end, uint64_t len)
{
    if (vec.items)
        return;
    vec.alloc = len;
    vec.items = (obj_ver_id*)malloc_or_die(sizeof(obj_ver_id) * vec.alloc);
    for (obj_ver_id *sv = start; sv < end; sv++)
    {
        vec.items[vec.size] = *sv;
        vec.size++;
    }
}
|
||||
|
||||
// Append one version to <vec>, growing the storage geometrically
// (4, 8, 16, ...) when full.
static void append_version(ver_vector_t & vec, obj_ver_id ov)
{
    if (vec.size >= vec.alloc)
    {
        vec.alloc = vec.alloc ? vec.alloc * 2 : 4;
        vec.items = (obj_ver_id*)realloc_or_die(vec.items, sizeof(obj_ver_id) * vec.alloc);
    }
    vec.items[vec.size] = ov;
    vec.size++;
}
|
||||
|
||||
static bool check_unsynced(std::vector<obj_ver_id> & check, obj_ver_id ov, std::vector<obj_ver_id> & to, int *count)
|
||||
{
|
||||
bool found = false;
|
||||
int j = 0, k = 0;
|
||||
while (j < check.size())
|
||||
{
|
||||
if (check[j] == ov)
|
||||
found = true;
|
||||
if (check[j].oid == ov.oid && check[j].version <= ov.version)
|
||||
{
|
||||
to.push_back(check[j++]);
|
||||
if (count)
|
||||
(*count)--;
|
||||
}
|
||||
else
|
||||
check[k++] = check[j++];
|
||||
}
|
||||
check.resize(k);
|
||||
return found;
|
||||
}
|
||||
|
||||
// Run an internal SYNC covering only the writes recorded in
// PRIV(op)->sync_(big|small)_writes (used by split_stab_op). The trick: the
// global unsynced lists are temporarily swapped with the op's own selection
// so continue_sync() only sees those writes, then swapped back.
// Returns the sync op if it is still in flight, or NULL if it completed
// immediately.
blockstore_op_t* blockstore_impl_t::selective_sync(blockstore_op_t *op)
{
    // Substitute the global unsynced lists with the op's selection,
    // keeping the big-write counter consistent with the list contents
    unsynced_big_write_count -= unsynced_big_writes.size();
    unsynced_big_writes.swap(PRIV(op)->sync_big_writes);
    unsynced_big_write_count += unsynced_big_writes.size();
    unsynced_small_writes.swap(PRIV(op)->sync_small_writes);
    // Create a sync operation, insert into the end of the queue
    // And move ourselves into the end too!
    // Rather hacky but that's what we need...
    blockstore_op_t *sync_op = new blockstore_op_t;
    sync_op->opcode = BS_OP_SYNC;
    sync_op->buf = NULL;
    // Default callback just frees the op (split_stab_op may replace it)
    sync_op->callback = [](blockstore_op_t *sync_op)
    {
        delete sync_op;
    };
    init_op(sync_op);
    int sync_res = continue_sync(sync_op);
    if (sync_res != 2)
    {
        // Put SYNC into the queue if it's not finished yet
        submit_queue.push_back(sync_op);
    }
    // Restore unsynced_writes
    unsynced_small_writes.swap(PRIV(op)->sync_small_writes);
    unsynced_big_write_count -= unsynced_big_writes.size();
    unsynced_big_writes.swap(PRIV(op)->sync_big_writes);
    unsynced_big_write_count += unsynced_big_writes.size();
    if (sync_res == 2)
    {
        // Sync is immediately completed
        return NULL;
    }
    return sync_op;
}
|
||||
|
||||
// Returns: 2 = stop processing and dequeue, 0 = stop processing and do not dequeue, 1 = proceed with op itself
|
||||
// Classify every version in op->buf via <decider> and split the operation
// accordingly: versions that are already done are dropped, versions needing
// a sync get a selective SYNC, versions that must wait are split into a
// separate queued operation, and the remainder stays in <op> (op->buf is
// narrowed to them). A wrapped callback fires the original one only after
// all split parts complete.
// Returns: 2 = stop processing and dequeue, 0 = stop processing and do not
// dequeue, 1 = proceed with op itself.
int blockstore_impl_t::split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider)
{
    bool add_sync = false;
    ver_vector_t good_vers, bad_vers;
    obj_ver_id* v;
    int i, todo = 0;
    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
    {
        int action = decider(*v);
        if (action < 0)
        {
            // Rollback changes
            for (auto & ov: PRIV(op)->sync_big_writes)
            {
                unsynced_big_writes.push_back(ov);
                unsynced_big_write_count++;
            }
            for (auto & ov: PRIV(op)->sync_small_writes)
            {
                unsynced_small_writes.push_back(ov);
            }
            free(good_vers.items);
            good_vers.items = NULL;
            free(bad_vers.items);
            bad_vers.items = NULL;
            // Error
            op->retval = action;
            FINISH_OP(op);
            return 2;
        }
        else if (action == STAB_SPLIT_DONE)
        {
            // Already done
            init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
        }
        else if (action == STAB_SPLIT_WAIT)
        {
            // Already in progress, we just have to wait until it finishes
            init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
            append_version(bad_vers, *v);
        }
        else if (action == STAB_SPLIT_SYNC)
        {
            // Needs a SYNC, we have to send a SYNC if not already in progress
            //
            // If the object is not present in unsynced_(big|small)_writes then
            // it's currently being synced. If it's present then we can initiate
            // its sync ourselves.
            init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
            append_version(bad_vers, *v);
            if (!add_sync)
            {
                PRIV(op)->sync_big_writes.clear();
                PRIV(op)->sync_small_writes.clear();
                add_sync = true;
            }
            check_unsynced(unsynced_small_writes, *v, PRIV(op)->sync_small_writes, NULL);
            check_unsynced(unsynced_big_writes, *v, PRIV(op)->sync_big_writes, &unsynced_big_write_count);
        }
        else /* if (action == STAB_SPLIT_TODO) */
        {
            if (good_vers.items)
            {
                // If we're selecting versions then append it
                // Main idea is that 99% of the time all versions passed to BS_OP_STABLE are synced
                // And we don't want to select/allocate anything in that optimistic case
                append_version(good_vers, *v);
            }
            todo++;
        }
    }
    // In a pessimistic scenario, an operation may be split into 3:
    // - Stabilize synced entries
    // - Sync unsynced entries
    // - Continue for unsynced entries after sync
    add_sync = add_sync && (PRIV(op)->sync_big_writes.size() || PRIV(op)->sync_small_writes.size());
    if (!todo && !bad_vers.size)
    {
        // Already stable
        op->retval = 0;
        FINISH_OP(op);
        return 2;
    }
    op->retval = 0;
    if (!todo && !add_sync)
    {
        // Only wait for inflight writes or current in-progress syncs
        return 0;
    }
    blockstore_op_t *sync_op = NULL, *split_stab_op = NULL;
    if (add_sync)
    {
        // Initiate a selective sync for PRIV(op)->sync_(big|small)_writes
        sync_op = selective_sync(op);
    }
    if (bad_vers.size)
    {
        // Split part of the request into a separate operation
        split_stab_op = new blockstore_op_t;
        split_stab_op->opcode = op->opcode;
        split_stab_op->buf = bad_vers.items;
        split_stab_op->len = bad_vers.size;
        init_op(split_stab_op);
        submit_queue.push_back(split_stab_op);
    }
    if (sync_op || split_stab_op || good_vers.items)
    {
        void *orig_buf = op->buf;
        if (good_vers.items)
        {
            // Narrow this op to the versions it can handle itself
            op->buf = good_vers.items;
            op->len = good_vers.size;
        }
        // Make a wrapped callback: it fires the original callback only once
        // all split parts (sync, bad-version op, and op itself) are done,
        // then restores op->buf and frees the temporary arrays
        int *split_op_counter = (int*)malloc_or_die(sizeof(int));
        *split_op_counter = (sync_op ? 1 : 0) + (split_stab_op ? 1 : 0) + (todo ? 1 : 0);
        auto cb = [op, good_items = good_vers.items,
            bad_items = bad_vers.items, split_op_counter,
            orig_buf, real_cb = op->callback](blockstore_op_t *split_op)
        {
            // Any failing part fails the whole operation
            if (split_op->retval != 0)
                op->retval = split_op->retval;
            (*split_op_counter)--;
            assert((*split_op_counter) >= 0);
            if (op != split_op)
                delete split_op;
            if (!*split_op_counter)
            {
                free(good_items);
                free(bad_items);
                free(split_op_counter);
                op->buf = orig_buf;
                real_cb(op);
            }
        };
        if (sync_op)
        {
            sync_op->callback = cb;
        }
        if (split_stab_op)
        {
            split_stab_op->callback = cb;
        }
        op->callback = cb;
    }
    if (!todo)
    {
        // All work is postponed
        op->callback = NULL;
        return 2;
    }
    return 1;
}
|
||||
|
||||
// Handles both stabilize (commit) and rollback
|
||||
int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
||||
{
|
||||
if (PRIV(op)->op_state)
|
||||
{
|
||||
return continue_stable(op);
|
||||
}
|
||||
int r = split_stab_op(op, [this](obj_ver_id ov)
|
||||
{
|
||||
auto dirty_it = dirty_db.find(ov);
|
||||
if (dirty_it == dirty_db.end())
|
||||
{
|
||||
auto & clean_db = clean_db_shard(ov.oid);
|
||||
auto clean_it = clean_db.find(ov.oid);
|
||||
if (clean_it == clean_db.end() || clean_it->second.version < ov.version)
|
||||
{
|
||||
// No such object version
|
||||
printf("Error: %jx:%jx v%ju not found while stabilizing\n", ov.oid.inode, ov.oid.stripe, ov.version);
|
||||
return -ENOENT;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Already stable
|
||||
return STAB_SPLIT_DONE;
|
||||
}
|
||||
}
|
||||
else if (IS_STABLE(dirty_it->second.state))
|
||||
{
|
||||
// Already stable
|
||||
return STAB_SPLIT_DONE;
|
||||
}
|
||||
while (true)
|
||||
{
|
||||
if (IS_IN_FLIGHT(dirty_it->second.state))
|
||||
{
|
||||
// Object write is still in progress. Wait until the write request completes
|
||||
return STAB_SPLIT_WAIT;
|
||||
}
|
||||
else if (!IS_SYNCED(dirty_it->second.state))
|
||||
{
|
||||
// Object not synced yet - sync it
|
||||
// In previous versions we returned EBUSY here and required
|
||||
// the caller (OSD) to issue a global sync first. But a global sync
|
||||
// waits for all writes in the queue including inflight writes. And
|
||||
// inflight writes may themselves be blocked by unstable writes being
|
||||
// still present in the journal and not flushed away from it.
|
||||
// So we must sync specific objects here.
|
||||
//
|
||||
// Even more, we have to process "stabilize" request in parts. That is,
|
||||
// we must stabilize all objects which are already synced. Otherwise
|
||||
// they may block objects which are NOT synced yet.
|
||||
return STAB_SPLIT_SYNC;
|
||||
}
|
||||
else if (IS_STABLE(dirty_it->second.state))
|
||||
{
|
||||
break;
|
||||
}
|
||||
// Check previous versions too
|
||||
if (dirty_it == dirty_db.begin())
|
||||
{
|
||||
break;
|
||||
}
|
||||
dirty_it--;
|
||||
if (dirty_it->first.oid != ov.oid)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
return STAB_SPLIT_TODO;
|
||||
});
|
||||
if (r != 1)
|
||||
{
|
||||
return r;
|
||||
}
|
||||
// Check journal space
|
||||
blockstore_journal_check_t space_check(this);
|
||||
if (!space_check.check_available(op, op->len, sizeof(journal_entry_stable), 0))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
// There is sufficient space. Check SQEs
|
||||
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
|
||||
// Prepare and submit journal entries
|
||||
int s = 0;
|
||||
auto v = (obj_ver_id*)op->buf;
|
||||
for (int i = 0; i < op->len; i++, v++)
|
||||
{
|
||||
if (!journal.entry_fits(sizeof(journal_entry_stable)) &&
|
||||
journal.sector_info[journal.cur_sector].dirty)
|
||||
{
|
||||
prepare_journal_sector_write(journal.cur_sector, op);
|
||||
s++;
|
||||
}
|
||||
journal_entry_stable *je = (journal_entry_stable*)
|
||||
prefill_single_journal_entry(journal, JE_STABLE, sizeof(journal_entry_stable));
|
||||
je->oid = v->oid;
|
||||
je->version = v->version;
|
||||
je->crc32 = je_crc32((journal_entry*)je);
|
||||
journal.crc32_last = je->crc32;
|
||||
}
|
||||
prepare_journal_sector_write(journal.cur_sector, op);
|
||||
s++;
|
||||
assert(s == space_check.sectors_to_write);
|
||||
PRIV(op)->op_state = 1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
int blockstore_impl_t::continue_stable(blockstore_op_t *op)
|
||||
{
|
||||
if (PRIV(op)->op_state == 2)
|
||||
goto resume_2;
|
||||
else if (PRIV(op)->op_state == 4)
|
||||
goto resume_4;
|
||||
else
|
||||
return 1;
|
||||
resume_2:
|
||||
if (!disable_journal_fsync)
|
||||
{
|
||||
BS_SUBMIT_GET_SQE(sqe, data);
|
||||
my_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
|
||||
data->iov = { 0 };
|
||||
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
||||
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
|
||||
PRIV(op)->pending_ops = 1;
|
||||
PRIV(op)->op_state = 3;
|
||||
return 1;
|
||||
}
|
||||
resume_4:
|
||||
// Mark dirty_db entries as stable, acknowledge op completion
|
||||
obj_ver_id* v;
|
||||
int i;
|
||||
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
||||
{
|
||||
// Mark all dirty_db entries up to op->version as stable
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Stabilize %jx:%jx v%ju\n", v->oid.inode, v->oid.stripe, v->version);
|
||||
#endif
|
||||
mark_stable(*v);
|
||||
}
|
||||
// Acknowledge op
|
||||
obj_ver_id *v = (obj_ver_id*)op->buf;
|
||||
auto priv = PRIV(op);
|
||||
if (priv->op_state == 1) goto resume_1;
|
||||
else if (priv->op_state == 2) goto resume_2;
|
||||
else if (priv->op_state == 3) goto resume_3;
|
||||
else if (priv->op_state == 4) goto resume_4;
|
||||
assert(!priv->op_state);
|
||||
// Modify in-memory state and assign contiguous LSNs
|
||||
priv->stab_pos = 0;
|
||||
priv->lsn = priv->to_lsn = 0;
|
||||
op->retval = 0;
|
||||
while (priv->stab_pos < op->len)
|
||||
{
|
||||
uint32_t modified_block = 0;
|
||||
uint64_t new_lsn = 0;
|
||||
uint64_t new_to_lsn = 0;
|
||||
int res = op->opcode == BS_OP_STABLE
|
||||
? heap->post_stabilize(v[priv->stab_pos].oid, v[priv->stab_pos].version, &modified_block, &new_lsn, &new_to_lsn)
|
||||
: heap->post_rollback(v[priv->stab_pos].oid, v[priv->stab_pos].version, &new_lsn, &modified_block);
|
||||
if (res != 0)
|
||||
{
|
||||
assert(res == ENOENT || res == EBUSY);
|
||||
op->retval = -res;
|
||||
}
|
||||
if (new_lsn)
|
||||
{
|
||||
assert(priv->lsn == 0 || priv->to_lsn == new_lsn-1);
|
||||
if (!priv->lsn)
|
||||
priv->lsn = new_lsn;
|
||||
priv->to_lsn = op->opcode == BS_OP_STABLE ? new_to_lsn : new_lsn;
|
||||
}
|
||||
priv->stab_pos++;
|
||||
}
|
||||
// Submit metadata writes
|
||||
priv->stab_pos = 0;
|
||||
resume_1:
|
||||
priv->op_state = 1;
|
||||
while (priv->stab_pos < op->len)
|
||||
{
|
||||
uint32_t block_num = 0;
|
||||
heap_object_t *obj = heap->read_entry(v[priv->stab_pos].oid, &block_num);
|
||||
if (obj)
|
||||
{
|
||||
io_uring_sqe *sqe = get_sqe();
|
||||
if (!sqe)
|
||||
{
|
||||
if (priv->pending_ops > 0)
|
||||
return 1;
|
||||
priv->wait_detail = 1;
|
||||
priv->wait_for = WAIT_SQE;
|
||||
return 0;
|
||||
}
|
||||
prepare_meta_block_write(op, block_num, sqe);
|
||||
}
|
||||
priv->stab_pos++;
|
||||
}
|
||||
if (priv->pending_ops > 0)
|
||||
{
|
||||
priv->op_state = 1;
|
||||
return 1;
|
||||
}
|
||||
// Mark writes as completed to allow compaction
|
||||
for (uint64_t lsn = priv->lsn; lsn <= priv->to_lsn; lsn++)
|
||||
{
|
||||
heap->mark_lsn_completed(lsn);
|
||||
}
|
||||
unsynced_meta_write_count++;
|
||||
// Fsync, just because our semantics imply that commit (stabilize) is immediately fsynced
|
||||
priv->op_state = 2;
|
||||
resume_2:
|
||||
resume_3:
|
||||
resume_4:
|
||||
int res = do_sync(op, 2);
|
||||
if (res != 2)
|
||||
{
|
||||
return res;
|
||||
}
|
||||
// Done. Don't touch op->retval - if anything resulted in ENOENT, return it as is
|
||||
FINISH_OP(op);
|
||||
return 2;
|
||||
}
|
||||
|
||||
void blockstore_impl_t::mark_stable(obj_ver_id v, bool forget_dirty)
|
||||
{
|
||||
auto dirty_it = dirty_db.find(v);
|
||||
if (dirty_it != dirty_db.end())
|
||||
{
|
||||
if (IS_INSTANT(dirty_it->second.state))
|
||||
{
|
||||
// 'Instant' (non-EC) operations may complete and try to become stable out of order. Prevent it.
|
||||
auto back_it = dirty_it;
|
||||
while (back_it != dirty_db.begin())
|
||||
{
|
||||
back_it--;
|
||||
if (back_it->first.oid != v.oid)
|
||||
{
|
||||
break;
|
||||
}
|
||||
if (!IS_STABLE(back_it->second.state))
|
||||
{
|
||||
// There are preceding unstable versions, can't flush <v>
|
||||
return;
|
||||
}
|
||||
}
|
||||
while (true)
|
||||
{
|
||||
dirty_it++;
|
||||
if (dirty_it == dirty_db.end() || dirty_it->first.oid != v.oid ||
|
||||
!IS_SYNCED(dirty_it->second.state))
|
||||
{
|
||||
dirty_it--;
|
||||
break;
|
||||
}
|
||||
v.version = dirty_it->first.version;
|
||||
}
|
||||
}
|
||||
while (1)
|
||||
{
|
||||
bool was_stable = IS_STABLE(dirty_it->second.state);
|
||||
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_SYNCED)
|
||||
{
|
||||
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_STABLE;
|
||||
// Allocations and deletions are counted when they're stabilized
|
||||
if (IS_BIG_WRITE(dirty_it->second.state))
|
||||
{
|
||||
int exists = -1;
|
||||
if (dirty_it != dirty_db.begin())
|
||||
{
|
||||
auto prev_it = dirty_it;
|
||||
prev_it--;
|
||||
if (prev_it->first.oid == v.oid)
|
||||
{
|
||||
exists = IS_DELETE(prev_it->second.state) ? 0 : 1;
|
||||
}
|
||||
}
|
||||
if (exists == -1)
|
||||
{
|
||||
auto & clean_db = clean_db_shard(v.oid);
|
||||
auto clean_it = clean_db.find(v.oid);
|
||||
exists = clean_it != clean_db.end() ? 1 : 0;
|
||||
}
|
||||
if (!exists)
|
||||
{
|
||||
uint64_t space_id = dirty_it->first.oid.inode;
|
||||
if (no_inode_stats[dirty_it->first.oid.inode >> (64-POOL_ID_BITS)])
|
||||
space_id = space_id & ~(((uint64_t)1 << (64-POOL_ID_BITS)) - 1);
|
||||
inode_space_stats[space_id] += dsk.data_block_size;
|
||||
used_blocks++;
|
||||
}
|
||||
big_to_flush++;
|
||||
}
|
||||
else if (IS_DELETE(dirty_it->second.state))
|
||||
{
|
||||
uint64_t space_id = dirty_it->first.oid.inode;
|
||||
if (no_inode_stats[dirty_it->first.oid.inode >> (64-POOL_ID_BITS)])
|
||||
space_id = space_id & ~(((uint64_t)1 << (64-POOL_ID_BITS)) - 1);
|
||||
auto & sp = inode_space_stats[space_id];
|
||||
if (sp > dsk.data_block_size)
|
||||
sp -= dsk.data_block_size;
|
||||
else
|
||||
inode_space_stats.erase(space_id);
|
||||
used_blocks--;
|
||||
big_to_flush++;
|
||||
}
|
||||
}
|
||||
else if (IS_IN_FLIGHT(dirty_it->second.state))
|
||||
{
|
||||
// mark_stable should never be called for in-flight or submitted writes
|
||||
printf(
|
||||
"BUG: Attempt to mark_stable object %jx:%jx v%ju state of which is %x\n",
|
||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
|
||||
dirty_it->second.state
|
||||
);
|
||||
exit(1);
|
||||
}
|
||||
if (forget_dirty && (IS_BIG_WRITE(dirty_it->second.state) ||
|
||||
IS_DELETE(dirty_it->second.state)))
|
||||
{
|
||||
// Big write overrides all previous dirty entries
|
||||
auto erase_end = dirty_it;
|
||||
while (dirty_it != dirty_db.begin())
|
||||
{
|
||||
dirty_it--;
|
||||
if (dirty_it->first.oid != v.oid)
|
||||
{
|
||||
dirty_it++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
auto & clean_db = clean_db_shard(v.oid);
|
||||
auto clean_it = clean_db.find(v.oid);
|
||||
uint64_t clean_loc = clean_it != clean_db.end()
|
||||
? clean_it->second.location : UINT64_MAX;
|
||||
erase_dirty(dirty_it, erase_end, clean_loc);
|
||||
break;
|
||||
}
|
||||
if (was_stable || dirty_it == dirty_db.begin())
|
||||
{
|
||||
break;
|
||||
}
|
||||
dirty_it--;
|
||||
if (dirty_it->first.oid != v.oid)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
flusher->enqueue_flush(v);
|
||||
}
|
||||
auto unstab_it = unstable_writes.find(v.oid);
|
||||
if (unstab_it != unstable_writes.end() &&
|
||||
unstab_it->second <= v.version)
|
||||
{
|
||||
unstable_writes.erase(unstab_it);
|
||||
}
|
||||
}
|
||||
|
@@ -3,231 +3,112 @@
|
||||
|
||||
#include "blockstore_impl.h"
|
||||
|
||||
#define SYNC_HAS_SMALL 1
|
||||
#define SYNC_HAS_BIG 2
|
||||
#define SYNC_DATA_SYNC_SENT 3
|
||||
#define SYNC_DATA_SYNC_DONE 4
|
||||
#define SYNC_JOURNAL_WRITE_SENT 5
|
||||
#define SYNC_JOURNAL_WRITE_DONE 6
|
||||
#define SYNC_JOURNAL_SYNC_SENT 7
|
||||
#define SYNC_DONE 8
|
||||
|
||||
int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
||||
{
|
||||
if (immediate_commit == IMMEDIATE_ALL)
|
||||
if (!PRIV(op)->op_state)
|
||||
{
|
||||
// We can return immediately because sync is only dequeued after all previous writes
|
||||
op->retval = 0;
|
||||
}
|
||||
int res = do_sync(op, 0);
|
||||
if (res == 2)
|
||||
{
|
||||
FINISH_OP(op);
|
||||
return 2;
|
||||
}
|
||||
if (PRIV(op)->op_state == 0)
|
||||
{
|
||||
stop_sync_submitted = false;
|
||||
unsynced_big_write_count -= unsynced_big_writes.size();
|
||||
PRIV(op)->sync_big_writes.swap(unsynced_big_writes);
|
||||
PRIV(op)->sync_small_writes.swap(unsynced_small_writes);
|
||||
unsynced_big_writes.clear();
|
||||
unsynced_small_writes.clear();
|
||||
if (PRIV(op)->sync_big_writes.size() > 0)
|
||||
PRIV(op)->op_state = SYNC_HAS_BIG;
|
||||
else if (PRIV(op)->sync_small_writes.size() > 0)
|
||||
PRIV(op)->op_state = SYNC_HAS_SMALL;
|
||||
else
|
||||
PRIV(op)->op_state = SYNC_DONE;
|
||||
}
|
||||
if (PRIV(op)->op_state == SYNC_HAS_SMALL)
|
||||
{
|
||||
// No big writes, just fsync the journal
|
||||
if (journal.sector_info[journal.cur_sector].dirty)
|
||||
{
|
||||
// Write out the last journal sector if it happens to be dirty
|
||||
BS_SUBMIT_CHECK_SQES(1);
|
||||
prepare_journal_sector_write(journal.cur_sector, op);
|
||||
PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
|
||||
return 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
PRIV(op)->op_state = SYNC_JOURNAL_WRITE_DONE;
|
||||
}
|
||||
}
|
||||
if (PRIV(op)->op_state == SYNC_HAS_BIG)
|
||||
{
|
||||
// 1st step: fsync data
|
||||
if (!disable_data_fsync)
|
||||
{
|
||||
BS_SUBMIT_GET_SQE(sqe, data);
|
||||
my_uring_prep_fsync(sqe, dsk.data_fd, IORING_FSYNC_DATASYNC);
|
||||
data->iov = { 0 };
|
||||
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
||||
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
|
||||
PRIV(op)->pending_ops = 1;
|
||||
PRIV(op)->op_state = SYNC_DATA_SYNC_SENT;
|
||||
return 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
PRIV(op)->op_state = SYNC_DATA_SYNC_DONE;
|
||||
}
|
||||
}
|
||||
if (PRIV(op)->op_state == SYNC_DATA_SYNC_DONE)
|
||||
{
|
||||
// 2nd step: Data device is synced, prepare & write journal entries
|
||||
// Check space in the journal and journal memory buffers
|
||||
blockstore_journal_check_t space_check(this);
|
||||
if (dsk.csum_block_size)
|
||||
{
|
||||
// More complex check because all journal entries have different lengths
|
||||
int left = PRIV(op)->sync_big_writes.size();
|
||||
for (auto & sbw: PRIV(op)->sync_big_writes)
|
||||
{
|
||||
left--;
|
||||
auto & dirty_entry = dirty_db.at(sbw);
|
||||
uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
|
||||
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size, 0))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
|
||||
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, 0))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
// Check SQEs. Don't bother about merging, submit each journal sector as a separate request
|
||||
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
|
||||
// Prepare and submit journal entries
|
||||
auto it = PRIV(op)->sync_big_writes.begin();
|
||||
int s = 0;
|
||||
while (it != PRIV(op)->sync_big_writes.end())
|
||||
{
|
||||
auto & dirty_entry = dirty_db.at(*it);
|
||||
uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
|
||||
if (!journal.entry_fits(sizeof(journal_entry_big_write) + dyn_size) &&
|
||||
journal.sector_info[journal.cur_sector].dirty)
|
||||
{
|
||||
prepare_journal_sector_write(journal.cur_sector, op);
|
||||
s++;
|
||||
}
|
||||
journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
|
||||
journal, (dirty_entry.state & BS_ST_INSTANT) ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
|
||||
sizeof(journal_entry_big_write) + dyn_size
|
||||
);
|
||||
auto jsec = dirty_entry.journal_sector = journal.sector_info[journal.cur_sector].offset;
|
||||
assert(journal.next_free >= journal.used_start
|
||||
? (jsec >= journal.used_start && jsec < journal.next_free)
|
||||
: (jsec >= journal.used_start || jsec < journal.next_free));
|
||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"journal offset %08jx is used by %jx:%jx v%ju (%ju refs)\n",
|
||||
dirty_entry.journal_sector, it->oid.inode, it->oid.stripe, it->version,
|
||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
|
||||
);
|
||||
#endif
|
||||
je->oid = it->oid;
|
||||
je->version = it->version;
|
||||
je->offset = dirty_entry.offset;
|
||||
je->len = dirty_entry.len;
|
||||
je->location = dirty_entry.location;
|
||||
memcpy((void*)(je+1), (alloc_dyn_data
|
||||
? (uint8_t*)dirty_entry.dyn_data+sizeof(int) : (uint8_t*)&dirty_entry.dyn_data), dyn_size);
|
||||
je->crc32 = je_crc32((journal_entry*)je);
|
||||
journal.crc32_last = je->crc32;
|
||||
it++;
|
||||
}
|
||||
prepare_journal_sector_write(journal.cur_sector, op);
|
||||
s++;
|
||||
assert(s == space_check.sectors_to_write);
|
||||
PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
|
||||
return 1;
|
||||
}
|
||||
if (PRIV(op)->op_state == SYNC_JOURNAL_WRITE_DONE)
|
||||
{
|
||||
if (!disable_journal_fsync)
|
||||
{
|
||||
BS_SUBMIT_GET_SQE(sqe, data);
|
||||
my_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
|
||||
data->iov = { 0 };
|
||||
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
||||
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
|
||||
PRIV(op)->pending_ops = 1;
|
||||
PRIV(op)->op_state = SYNC_JOURNAL_SYNC_SENT;
|
||||
return 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
PRIV(op)->op_state = SYNC_DONE;
|
||||
}
|
||||
}
|
||||
if (PRIV(op)->op_state == SYNC_DONE)
|
||||
{
|
||||
ack_sync(op);
|
||||
return 2;
|
||||
}
|
||||
return 1;
|
||||
return res;
|
||||
}
|
||||
|
||||
void blockstore_impl_t::ack_sync(blockstore_op_t *op)
|
||||
bool blockstore_impl_t::submit_fsyncs(int & wait_count)
|
||||
{
|
||||
// Handle states
|
||||
for (auto it = PRIV(op)->sync_big_writes.begin(); it != PRIV(op)->sync_big_writes.end(); it++)
|
||||
int n = ((unsynced_small_write_count > 0 || unsynced_big_write_count > 0 || unsynced_meta_write_count > 0) && !dsk.disable_meta_fsync) +
|
||||
(unsynced_small_write_count > 0 && !dsk.disable_journal_fsync && dsk.journal_fd != dsk.meta_fd) +
|
||||
(unsynced_big_write_count > 0 && !dsk.disable_data_fsync && dsk.data_fd != dsk.meta_fd && dsk.data_fd != dsk.journal_fd);
|
||||
if (ringloop->space_left() < n)
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Ack sync big %jx:%jx v%ju\n", it->oid.inode, it->oid.stripe, it->version);
|
||||
#endif
|
||||
auto & unstab = unstable_writes[it->oid];
|
||||
unstab = unstab < it->version ? it->version : unstab;
|
||||
auto dirty_it = dirty_db.find(*it);
|
||||
dirty_it->second.state = ((dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SYNCED);
|
||||
if (dirty_it->second.state & BS_ST_INSTANT)
|
||||
{
|
||||
mark_stable(dirty_it->first);
|
||||
}
|
||||
else
|
||||
{
|
||||
unstable_unsynced--;
|
||||
assert(unstable_unsynced >= 0);
|
||||
}
|
||||
dirty_it++;
|
||||
while (dirty_it != dirty_db.end() && dirty_it->first.oid == it->oid)
|
||||
{
|
||||
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG)
|
||||
{
|
||||
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_IN_FLIGHT;
|
||||
}
|
||||
dirty_it++;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
for (auto it = PRIV(op)->sync_small_writes.begin(); it != PRIV(op)->sync_small_writes.end(); it++)
|
||||
if (!n)
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Ack sync small %jx:%jx v%ju\n", it->oid.inode, it->oid.stripe, it->version);
|
||||
#endif
|
||||
auto & unstab = unstable_writes[it->oid];
|
||||
unstab = unstab < it->version ? it->version : unstab;
|
||||
if (dirty_db[*it].state == (BS_ST_DELETE | BS_ST_WRITTEN))
|
||||
{
|
||||
dirty_db[*it].state = (BS_ST_DELETE | BS_ST_SYNCED);
|
||||
// Deletions are treated as immediately stable
|
||||
mark_stable(*it);
|
||||
}
|
||||
else /* (BS_ST_INSTANT?) | BS_ST_SMALL_WRITE | BS_ST_WRITTEN */
|
||||
{
|
||||
dirty_db[*it].state = (dirty_db[*it].state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SYNCED;
|
||||
if (dirty_db[*it].state & BS_ST_INSTANT)
|
||||
{
|
||||
mark_stable(*it);
|
||||
}
|
||||
else
|
||||
{
|
||||
unstable_unsynced--;
|
||||
assert(unstable_unsynced >= 0);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
op->retval = 0;
|
||||
FINISH_OP(op);
|
||||
auto cb = [this, & wait_count](ring_data_t *data)
|
||||
{
|
||||
if (data->res != 0)
|
||||
disk_error_abort("sync meta", data->res, 0);
|
||||
wait_count--;
|
||||
assert(wait_count >= 0);
|
||||
if (!wait_count)
|
||||
ringloop->wakeup();
|
||||
};
|
||||
if ((unsynced_small_write_count > 0 || unsynced_big_write_count > 0 || unsynced_meta_write_count > 0) && !dsk.disable_meta_fsync)
|
||||
{
|
||||
// fsync meta
|
||||
io_uring_sqe *sqe = get_sqe();
|
||||
assert(sqe);
|
||||
ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
||||
io_uring_prep_fsync(sqe, dsk.meta_fd, IORING_FSYNC_DATASYNC);
|
||||
data->iov = { 0 };
|
||||
data->callback = cb;
|
||||
wait_count++;
|
||||
}
|
||||
if (unsynced_small_write_count > 0 && !dsk.disable_journal_fsync && dsk.meta_fd != dsk.journal_fd)
|
||||
{
|
||||
// fsync buffer
|
||||
io_uring_sqe *sqe = get_sqe();
|
||||
assert(sqe);
|
||||
ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
||||
io_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
|
||||
data->iov = { 0 };
|
||||
data->callback = cb;
|
||||
wait_count++;
|
||||
}
|
||||
if (unsynced_big_write_count > 0 && !dsk.disable_data_fsync && dsk.data_fd != dsk.meta_fd && dsk.data_fd != dsk.journal_fd)
|
||||
{
|
||||
// fsync data
|
||||
io_uring_sqe *sqe = get_sqe();
|
||||
assert(sqe);
|
||||
ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
||||
io_uring_prep_fsync(sqe, dsk.data_fd, IORING_FSYNC_DATASYNC);
|
||||
data->iov = { 0 };
|
||||
data->callback = cb;
|
||||
wait_count++;
|
||||
}
|
||||
unsynced_big_write_count = 0;
|
||||
unsynced_small_write_count = 0;
|
||||
unsynced_meta_write_count = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
int blockstore_impl_t::do_sync(blockstore_op_t *op, int base_state)
|
||||
{
|
||||
int op_state = PRIV(op)->op_state - base_state;
|
||||
if (op_state == 1) goto resume_1;
|
||||
if (op_state == 2) goto resume_2;
|
||||
assert(!op_state);
|
||||
if (flusher->get_syncing_buffer())
|
||||
{
|
||||
// Wait for flusher-initiated sync
|
||||
return 0;
|
||||
}
|
||||
if (dsk.disable_journal_fsync && dsk.disable_meta_fsync && dsk.disable_data_fsync || !unsynced_big_write_count && !unsynced_small_write_count)
|
||||
{
|
||||
// We can return immediately because sync only syncs previous writes
|
||||
unsynced_big_write_count = unsynced_small_write_count = unsynced_meta_write_count = 0;
|
||||
return 2;
|
||||
}
|
||||
PRIV(op)->lsn = heap->get_completed_lsn();
|
||||
if (!submit_fsyncs(PRIV(op)->pending_ops))
|
||||
{
|
||||
PRIV(op)->wait_detail = 1;
|
||||
PRIV(op)->wait_for = WAIT_SQE;
|
||||
return 0;
|
||||
}
|
||||
resume_1:
|
||||
if (PRIV(op)->pending_ops > 0)
|
||||
{
|
||||
PRIV(op)->op_state = base_state+1;
|
||||
return 1;
|
||||
}
|
||||
resume_2:
|
||||
heap->mark_lsn_fsynced(PRIV(op)->lsn);
|
||||
return 2;
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -5,14 +5,14 @@
|
||||
//
|
||||
// Initialize storage for tests:
|
||||
//
|
||||
// dd if=/dev/zero of=test_data.bin bs=1024 count=1048576
|
||||
// dd if=/dev/zero of=test_meta.bin bs=1024 count=256
|
||||
// dd if=/dev/zero of=test_journal.bin bs=1024 count=4096
|
||||
// dd if=/dev/zero of=test_data.bin bs=1M count=1024
|
||||
//
|
||||
// Random write:
|
||||
//
|
||||
// fio -thread -ioengine=./libfio_blockstore.so -name=test -bs=4k -direct=1 -fsync=16 -iodepth=16 -rw=randwrite \
|
||||
// -bs_config='{"data_device":"./test_data.bin"}' -size=1000M
|
||||
// [LD_PRELOAD=libasan.so.8] \
|
||||
// fio -name=test -thread -ioengine=../build/src/blockstore/libfio_vitastor_blk.so \
|
||||
// -bs=4k -direct=1 -rw=randwrite -iodepth=16 -size=900M -loops=10 \
|
||||
// -bs_config='{"data_device":"./test_data.bin","meta_offset":0,"journal_offset":16777216,"data_offset":33554432,"disable_data_fsync":true,"meta_format":3,"immediate_commit":"all","log_level":100,"journal_no_same_sector_overwrites":true,"journal_sector_buffer_count":1024}'
|
||||
//
|
||||
// Linear write:
|
||||
//
|
||||
@@ -38,12 +38,14 @@ struct bs_data
|
||||
std::vector<io_u*> completed;
|
||||
int op_n = 0, inflight = 0;
|
||||
bool last_sync = false;
|
||||
bool trace = false;
|
||||
};
|
||||
|
||||
struct bs_options
|
||||
{
|
||||
int __pad;
|
||||
char *json_config = NULL;
|
||||
int trace = 0;
|
||||
};
|
||||
|
||||
static struct fio_option options[] = {
|
||||
@@ -56,6 +58,16 @@ static struct fio_option options[] = {
|
||||
.category = FIO_OPT_C_ENGINE,
|
||||
.group = FIO_OPT_G_FILENAME,
|
||||
},
|
||||
{
|
||||
.name = "bs_trace",
|
||||
.lname = "trace",
|
||||
.type = FIO_OPT_BOOL,
|
||||
.off1 = offsetof(struct bs_options, trace),
|
||||
.help = "Trace operations",
|
||||
.def = "0",
|
||||
.category = FIO_OPT_C_ENGINE,
|
||||
.group = FIO_OPT_G_FILENAME,
|
||||
},
|
||||
{
|
||||
.name = NULL,
|
||||
},
|
||||
@@ -63,6 +75,7 @@ static struct fio_option options[] = {
|
||||
|
||||
static int bs_setup(struct thread_data *td)
|
||||
{
|
||||
bs_options *o = (bs_options*)td->eo;
|
||||
bs_data *bsd;
|
||||
//fio_file *f;
|
||||
//int r;
|
||||
@@ -83,6 +96,8 @@ static int bs_setup(struct thread_data *td)
|
||||
td->o.open_files++;
|
||||
}
|
||||
|
||||
bsd->trace = o->trace ? true : false;
|
||||
|
||||
//f = td->files[0];
|
||||
//f->real_file_size = size;
|
||||
return 0;
|
||||
@@ -168,7 +183,7 @@ static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
|
||||
{
|
||||
case DDIR_READ:
|
||||
op->opcode = BS_OP_READ;
|
||||
op->buf = io->xfer_buf;
|
||||
op->buf = (uint8_t*)io->xfer_buf;
|
||||
op->oid = {
|
||||
.inode = 1,
|
||||
.stripe = io->offset / bsd->bs->get_block_size(),
|
||||
@@ -176,21 +191,20 @@ static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
|
||||
op->version = UINT64_MAX; // last unstable
|
||||
op->offset = io->offset % bsd->bs->get_block_size();
|
||||
op->len = io->xfer_buflen;
|
||||
op->callback = [io](blockstore_op_t *op)
|
||||
op->callback = [io, n = bsd->op_n](blockstore_op_t *op)
|
||||
{
|
||||
io->error = op->retval < 0 ? -op->retval : 0;
|
||||
bs_data *bsd = (bs_data*)io->engine_data;
|
||||
bsd->inflight--;
|
||||
bsd->completed.push_back(io);
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("--- OP_READ %llx n=%d retval=%d\n", io, n, op->retval);
|
||||
#endif
|
||||
if (bsd->trace)
|
||||
printf("--- OP_READ %zx n=%d retval=%d\n", (size_t)op, n, op->retval);
|
||||
delete op;
|
||||
};
|
||||
break;
|
||||
case DDIR_WRITE:
|
||||
op->opcode = BS_OP_WRITE;
|
||||
op->buf = io->xfer_buf;
|
||||
op->opcode = BS_OP_WRITE_STABLE;
|
||||
op->buf = (uint8_t*)io->xfer_buf;
|
||||
op->oid = {
|
||||
.inode = 1,
|
||||
.stripe = io->offset / bsd->bs->get_block_size(),
|
||||
@@ -198,30 +212,28 @@ static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
|
||||
op->version = 0; // assign automatically
|
||||
op->offset = io->offset % bsd->bs->get_block_size();
|
||||
op->len = io->xfer_buflen;
|
||||
op->callback = [io](blockstore_op_t *op)
|
||||
op->callback = [io, n = bsd->op_n](blockstore_op_t *op)
|
||||
{
|
||||
io->error = op->retval < 0 ? -op->retval : 0;
|
||||
bs_data *bsd = (bs_data*)io->engine_data;
|
||||
bsd->inflight--;
|
||||
bsd->completed.push_back(io);
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("--- OP_WRITE %llx n=%d retval=%d\n", io, n, op->retval);
|
||||
#endif
|
||||
if (bsd->trace)
|
||||
printf("--- OP_WRITE %zx n=%d retval=%d\n", (size_t)op, n, op->retval);
|
||||
delete op;
|
||||
};
|
||||
bsd->last_sync = false;
|
||||
break;
|
||||
case DDIR_SYNC:
|
||||
op->opcode = BS_OP_SYNC_STAB_ALL;
|
||||
op->callback = [io](blockstore_op_t *op)
|
||||
op->opcode = BS_OP_SYNC;
|
||||
op->callback = [io, n = bsd->op_n](blockstore_op_t *op)
|
||||
{
|
||||
bs_data *bsd = (bs_data*)io->engine_data;
|
||||
io->error = op->retval < 0 ? -op->retval : 0;
|
||||
bsd->completed.push_back(io);
|
||||
bsd->inflight--;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("--- OP_SYNC %llx n=%d retval=%d\n", io, n, op->retval);
|
||||
#endif
|
||||
if (bsd->trace)
|
||||
printf("--- OP_SYNC %zx n=%d retval=%d\n", (size_t)op, n, op->retval);
|
||||
delete op;
|
||||
};
|
||||
bsd->last_sync = true;
|
||||
@@ -232,9 +244,8 @@ static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
|
||||
return FIO_Q_COMPLETED;
|
||||
}
|
||||
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("+++ %s %llx n=%d\n", op->opcode == OP_READ ? "OP_READ" : (op->opcode == OP_WRITE ? "OP_WRITE" : "OP_SYNC"), io, n);
|
||||
#endif
|
||||
if (bsd->trace)
|
||||
printf("+++ %s %zx n=%d\n", op->opcode == BS_OP_READ ? "OP_READ" : (op->opcode == BS_OP_WRITE_STABLE ? "OP_WRITE" : "OP_SYNC"), (size_t)op, bsd->op_n);
|
||||
io->error = 0;
|
||||
bsd->inflight++;
|
||||
bsd->bs->enqueue_op(op);
|
||||
@@ -290,7 +301,7 @@ static int bs_invalidate(struct thread_data *td, struct fio_file *f)
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct ioengine_ops ioengine = {
|
||||
struct ioengine_ops __attribute__((visibility("default"))) ioengine = {
|
||||
.name = "vitastor_blockstore",
|
||||
.version = FIO_IOOPS_VERSION,
|
||||
.flags = FIO_MEMALIGN | FIO_DISKLESSIO | FIO_NOEXTEND,
|
||||
|
@@ -1,12 +1,11 @@
|
||||
// Old metadata format on-disk structures
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "crc32c.h"
|
||||
#include <set>
|
||||
|
||||
#define MIN_JOURNAL_SIZE 4*1024*1024
|
||||
#define JOURNAL_MAGIC 0x4A33
|
||||
#define JOURNAL_VERSION_V1 1
|
||||
#define JOURNAL_VERSION_V2 2
|
||||
@@ -145,77 +144,14 @@ inline uint32_t je_crc32(journal_entry *je)
|
||||
return crc32c(0x48674bc7, ((uint8_t*)je)+4, je->size-4);
|
||||
}
|
||||
|
||||
struct journal_sector_info_t
|
||||
// 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default)
|
||||
// per "clean" entry on disk with fixed metadata tables
|
||||
struct __attribute__((__packed__)) clean_disk_entry
|
||||
{
|
||||
uint64_t offset;
|
||||
uint64_t flush_count;
|
||||
bool written;
|
||||
bool dirty;
|
||||
uint64_t submit_id;
|
||||
object_id oid;
|
||||
uint64_t version;
|
||||
uint8_t bitmap[];
|
||||
// Two more fields come after bitmap in metadata version 2:
|
||||
// uint32_t data_csum[];
|
||||
// uint32_t entry_csum;
|
||||
};
|
||||
|
||||
struct pending_journaling_t
|
||||
{
|
||||
int pending;
|
||||
int sector;
|
||||
blockstore_op_t *op;
|
||||
};
|
||||
|
||||
// In-memory state of the journal: a ring buffer of <block_size>-sized
// blocks occupying [offset, offset+len) on the journal device
// (or held entirely in <buffer> when inmemory is set — TODO confirm)
struct journal_t
{
    // journal device file descriptor
    int fd;
    bool inmemory = false;
    bool flush_journal = false;
    void *buffer = NULL;

    uint64_t block_size;
    // location and length of the journal area
    uint64_t offset, len;
    // Next free block offset
    uint64_t next_free = 0;
    // First occupied block offset
    uint64_t used_start = 0;
    // End of the last block not used for writing anymore
    uint64_t dirty_start = 0;
    // CRC of the most recent entry — presumably chained into the next
    // entry's checksum; verify against je_crc32 usage
    uint32_t crc32_last = 0;

    // Current sector(s) used for writing
    void *sector_buf = NULL;
    journal_sector_info_t *sector_info = NULL;
    uint64_t sector_count;
    // when set, a sector is never rewritten in place after being submitted
    bool no_same_sector_overwrites = false;
    // sector currently being filled and the write position inside it
    int cur_sector = 0;
    int in_sector_pos = 0;
    // sectors accumulated for the next submission batch
    std::vector<int> submitting_sectors;
    // in-flight writes waiting for their sectors (see pending_journaling_t)
    std::multimap<uint64_t, pending_journaling_t> flushing_ops;
    uint64_t submit_id = 0;

    // Used sector map
    // May use ~ 80 MB per 1 GB of used journal space in the worst case
    std::map<uint64_t, uint64_t> used_sectors;

    ~journal_t();
    bool trim();
    uint64_t get_trim_pos();
    void dump_diagnostics();
    // True if an entry of <size> bytes fits into the current sector:
    // there must be room after in_sector_pos, and with
    // no_same_sector_overwrites the sector must not be already written.
    inline bool entry_fits(int size)
    {
        return !(block_size - in_sector_pos < size ||
            no_same_sector_overwrites && sector_info[cur_sector].written);
    }
};
|
||||
|
||||
// Helper for checking whether the journal ring buffer has room for a new
// batch of entries before they are actually filled in.
struct blockstore_journal_check_t
{
    blockstore_impl_t *bs;
    // prospective write position: byte offset, sector index, offset in sector
    uint64_t next_pos, next_sector, next_in_pos;
    int sectors_to_write, first_sector;
    bool right_dir; // writing to the end or the beginning of the ring buffer

    blockstore_journal_check_t(blockstore_impl_t *bs);
    // Returns nonzero if <required> entries of <size> bytes (plus
    // <data_after> bytes of data) fit — semantics of the return value
    // are defined by the implementation; see blockstore_journal.cpp
    int check_available(blockstore_op_t *op, int required, int size, int data_after);
};
|
||||
|
||||
journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size);
|
||||
|
||||
uint32_t crc32c_pad(uint32_t prev_crc, const void *buf, size_t len, size_t left_pad, size_t right_pad);
|
||||
uint32_t crc32c_nopad(uint32_t prev_crc, const void *buf, size_t len, size_t left_pad, size_t right_pad);
|
338
src/blockstore/multilist.cpp
Normal file
338
src/blockstore/multilist.cpp
Normal file
@@ -0,0 +1,338 @@
|
||||
// Variable-length O(1) disk space allocator
|
||||
// Copyright (c) Vitaliy Filippov, 2025+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <set>
|
||||
#include "multilist.h"
|
||||
|
||||
// Build the allocator over <count> units, initially one single free area.
// Freelists are bucketed by area length: heads[i] holds areas of exactly
// length i+1, except the last bucket heads[maxn-1] which holds all areas
// of length >= maxn (see verify()).
// Encoding invariants used throughout:
//   sizes[start] = +len of the area; sizes[start+len-1] = -len (end
//   marker, only meaningful for len > 1); interior cells are 0;
//   nexts[i]: 0 = used, 1 = free with no next, >= 2 = next free area at nexts[i]-2;
//   prevs[i]: 0 = no previous, >= 1 = previous free area at prevs[i]-1;
//   heads[i]: 0 = empty bucket, >= 1 = bucket head at heads[i]-1.
multilist_alloc_t::multilist_alloc_t(uint32_t count, uint32_t maxn):
    count(count), maxn(maxn)
{
    // not-so-memory-efficient: 16 MB memory per 1 GB buffer space, but buffer spaces are small, so OK
    // count < 2^31 so that area lengths also fit into the int32_t size markers
    assert(count > 1 && count < 0x80000000);
    sizes.resize(count);
    nexts.resize(count); // nexts[i] = 0 -> area is used; nexts[i] = 1 -> no next; nexts[i] >= 2 -> next item
    prevs.resize(count);
    heads.resize(maxn); // heads[i] = 0 -> empty list; heads[i] >= 1 -> list head
    // the whole space starts as a single free area [0, count)
    sizes[0] = count;
    sizes[count-1] = -count; // end
    nexts[0] = 1;
    // ...linked into the "maxn or longer" bucket
    heads[maxn-1] = 1;
#ifdef MULTILIST_TRACE
    print();
#endif
}
|
||||
|
||||
// Check whether the unit at <pos> lies inside a free area.
// Any position is accepted: an end marker (negative cell) or an interior
// zero is first resolved to the start of the enclosing area, whose
// nexts[] cell says whether it is linked into a freelist (free) or not.
bool multilist_alloc_t::is_free(uint32_t pos)
{
    assert(pos < count);
    uint32_t start = pos;
    if (sizes[start] < 0)
    {
        // negative cell is an end marker: -len points back to the start
        start += sizes[start]+1;
    }
    else
    {
        // interior cells are zero: scan backwards to the start marker
        while (start > 0 && !sizes[start])
            start--;
    }
    return nexts[start] > 0;
}
|
||||
|
||||
// First-fit lookup: return the start of a free area of at least <size>
// units, or UINT32_MAX if no suitable area exists.
uint32_t multilist_alloc_t::find(uint32_t size)
{
    assert(size > 0);
    assert(size <= maxn);
    // buckets below size-1 only contain smaller areas, so begin at size-1
    for (uint32_t bucket = size-1; bucket < maxn; bucket++)
    {
        uint32_t head = heads[bucket];
        if (head)
        {
            // heads are stored +1-encoded; 0 means the bucket is empty
            return head-1;
        }
    }
    return UINT32_MAX;
}
|
||||
|
||||
// Full consistency check; aborts with a diagnostic dump on any violation.
// Pass 1 walks every freelist and checks link encoding, bucket placement
// and uniqueness; pass 2 walks the area markers and cross-checks them
// against the reachability set collected in pass 1.
// NOTE(review): loop counters are int while count/maxn are uint32_t —
// safe only because the constructor asserts count < 2^31.
void multilist_alloc_t::verify()
{
    std::set<uint32_t> reachable;
    // pass 1: every item reachable from heads must be free, correctly
    // linked, in the right size bucket, and claimed only once
    for (int i = 0; i < maxn; i++)
    {
        uint32_t cur = heads[i];
        while (cur)
        {
            if (!nexts[cur-1])
            {
                fprintf(stderr, "ERROR: item %d from freelist %d is not free\n", cur-1, i);
                print();
                abort();
            }
            if (nexts[cur-1] >= count+2)
            {
                fprintf(stderr, "ERROR: next out of range at %d: %d\n", cur-1, nexts[cur-1]);
                print();
                abort();
            }
            // buckets 0..maxn-2 hold exact sizes, the last bucket holds >= maxn
            if (!(i < maxn-1 ? sizes[cur-1] == i+1 : (sizes[cur-1] >= i+1)))
            {
                fprintf(stderr, "ERROR: item %d is in wrong freelist: expected size %d, but actual size is %d\n", cur-1, i+1, sizes[cur-1]);
                print();
                abort();
            }
            if (reachable.find(cur-1) != reachable.end())
            {
                fprintf(stderr, "ERROR: doubly-claimed item %d\n", cur-1);
                print();
                abort();
            }
            reachable.insert(cur-1);
            cur = nexts[cur-1]-1;
        }
    }
    // pass 2: walk areas by their start markers
    for (int i = 0; i < count; )
    {
        if (sizes[i])
        {
            assert(i+sizes[i] <= count);
            // start marker +len must be mirrored by end marker -len
            if (sizes[i] > 1 && sizes[i+sizes[i]-1] != -sizes[i])
            {
                fprintf(stderr, "ERROR: start/end mismatch at %d: sizes[%d] should be %d, but is %d\n", i, i+sizes[i]-1, -sizes[i], sizes[i+sizes[i]-1]);
                print();
                abort();
            }
            // interior cells of an area must be zero
            for (int j = i+1; j < i+sizes[i]-1; j++)
            {
                if (sizes[j])
                {
                    fprintf(stderr, "ERROR: internal non-zero at %d: %d\n", j, sizes[j]);
                    print();
                    abort();
                }
            }
            // every free area must have been visited in pass 1
            if (nexts[i] && reachable.find(i) == reachable.end())
            {
                fprintf(stderr, "ERROR: %d is unreachable from heads\n", i);
                print();
                abort();
            }
            if (nexts[i] >= 2)
            {
                if (nexts[i] >= 2+count)
                {
                    fprintf(stderr, "ERROR: next out of range at %d: %d\n", i, nexts[i]);
                    print();
                    abort();
                }
                // doubly-linked list back-pointer must match
                if (prevs[nexts[i]-2] != i+1)
                {
                    fprintf(stderr, "ERROR: prev[next] (%d) != this (%d) at %d", prevs[nexts[i]-2], i+1, i);
                    print();
                    abort();
                }
            }
            i += (sizes[i] > 1 ? sizes[i] : 1);
        }
        else
            i++;
    }
}
|
||||
|
||||
// Dump raw allocator state (heads/sizes/prevs/nexts, non-zero cells only)
// plus a per-area summary to stdout. Used by verify() and MULTILIST_TRACE.
void multilist_alloc_t::print()
{
    printf("heads:");
    for (int i = 0; i < maxn; i++)
        if (heads[i])
            printf(" %u=%u", i, heads[i]);
    printf("\n");
    printf("sizes:");
    for (int i = 0; i < count; i++)
        if (sizes[i])
            printf(" %d=%d", i, sizes[i]);
    printf("\n");
    printf("prevs:");
    for (int i = 0; i < count; i++)
        if (prevs[i])
            printf(" %d=%d", i, prevs[i]);
    printf("\n");
    printf("nexts:");
    for (int i = 0; i < count; i++)
        if (nexts[i])
            printf(" %d=%d", i, nexts[i]);
    printf("\n");
    printf("items:");
    // walk area start markers: s = length, n/p = encoded freelist links
    for (int i = 0; i < count; )
    {
        if (sizes[i])
        {
            printf(" %u=(s:%d,n:%u,p:%u)", i, sizes[i], nexts[i], prevs[i]);
            assert(i+sizes[i] <= count);
            i += (sizes[i] > 1 ? sizes[i] : 1);
        }
        else
            i++;
    }
    printf("\n");
}
|
||||
|
||||
// Allocate <size> units at exactly <pos>. <pos> may point anywhere inside
// a free area: the area is split into (optional) head fragment, the
// allocated range, and (optional) tail fragment; the fragments are
// re-linked as smaller free areas.
// NOTE(review): the contract that the enclosing free area fully covers
// [pos, pos+size) is only partially asserted (sizes[start] >= size, not
// >= pos-start+size) — confirm callers guarantee it.
void multilist_alloc_t::use(uint32_t pos, uint32_t size)
{
    assert(pos+size <= count && size > 0);
    if (sizes[pos] <= 0)
    {
        // <pos> is not an area start: resolve the enclosing area's start
        uint32_t start = pos;
        if (sizes[start] < 0)
            start += sizes[start]+1;
        else
            while (start > 0 && !sizes[start])
                start--;
        assert(sizes[start] >= size);
        // detach the whole area, then re-create the pieces
        use_full(start);
        // capture full length before the start marker is overwritten
        uint32_t full = sizes[start];
        // head fragment [start, pos): end marker at pos-1, start at start
        sizes[pos-1] = -pos+start;
        sizes[start] = pos-start;
        free(start);
        // allocated range [pos, pos+size): markers set, left unlinked (used)
        sizes[pos+size-1] = -size;
        sizes[pos] = size;
        if (pos+size < start+full)
        {
            // tail fragment [pos+size, start+full)
            sizes[start+full-1] = -(start+full-pos-size);
            sizes[pos+size] = start+full-pos-size;
            free(pos+size);
        }
    }
    else
    {
        // <pos> is an area start: cut <size> off the front
        assert(sizes[pos] >= size);
        use_full(pos);
        if (sizes[pos] > size)
        {
            uint32_t full = sizes[pos];
            sizes[pos+size-1] = -size;
            sizes[pos] = size;
            // tail fragment [pos+size, pos+full) goes back to the freelists
            sizes[pos+full-1] = -full+size;
            sizes[pos+size] = full-size;
            free(pos+size);
        }
    }
#ifdef MULTILIST_TRACE
    print();
#endif
}
|
||||
|
||||
// Unlink the free area starting at <pos> from its size-bucketed freelist
// and mark it used (nexts[pos] = 0). Size markers are left untouched.
void multilist_alloc_t::use_full(uint32_t pos)
{
    uint32_t prevsize = sizes[pos];
    assert(prevsize);
    assert(nexts[pos]);
    // bucket index: areas of length >= maxn all live in bucket maxn-1
    uint32_t pi = (prevsize < maxn ? prevsize : maxn)-1;
    if (heads[pi] == pos+1)
        heads[pi] = nexts[pos]-1; // nexts is +2-encoded, heads is +1-encoded
    if (prevs[pos])
        nexts[prevs[pos]-1] = nexts[pos];
    if (nexts[pos] >= 2)
        prevs[nexts[pos]-2] = prevs[pos];
    prevs[pos] = 0;
    nexts[pos] = 0;
}
|
||||
|
||||
// Public entry point: release the used area starting at <pos>
// (merging with free neighbours), with optional state tracing.
void multilist_alloc_t::free(uint32_t pos)
{
    do_free(pos);
#ifdef MULTILIST_TRACE
    print();
#endif
}
|
||||
|
||||
// Release the used area starting at <pos>: coalesce with a free neighbour
// on either side, then link the resulting area into the freelist for its
// (possibly grown) size.
void multilist_alloc_t::do_free(uint32_t pos)
{
    assert(!nexts[pos]); // must currently be used
    uint32_t size = sizes[pos];
    assert(size > 0); // must be an area start marker
    // merge with previous?
    // sizes[pos-1] is either the previous area's end marker (-len) or 1
    // for a single-unit area; index its start and test whether it's free
    if (pos > 0 && nexts[pos+(sizes[pos-1] == 1 ? -1 : sizes[pos-1])] > 0)
    {
        assert(sizes[pos-1] < 0 || sizes[pos-1] == 1);
        uint32_t prevsize = sizes[pos-1] < 0 ? -sizes[pos-1] : 1;
        // detach the neighbour, erase the old boundary markers,
        // then write markers for the combined area
        use_full(pos-prevsize);
        sizes[pos] = 0;
        sizes[pos-1] = 0;
        size += prevsize;
        pos -= prevsize;
        sizes[pos+size-1] = -size;
        sizes[pos] = size;
    }
    // merge with next?
    if (pos+size < count && nexts[pos+size] >= 1)
    {
        uint32_t nextsize = sizes[pos+size];
        use_full(pos+size);
        sizes[pos+size] = 0;
        sizes[pos+size-1] = 0;
        size += nextsize;
        sizes[pos+size-1] = -size;
        sizes[pos] = size;
    }
    // push the (merged) area onto the head of its size bucket
    uint32_t ni = (size < maxn ? size : maxn)-1;
    nexts[pos] = heads[ni]+1; // heads +1-encoding -> nexts +2-encoding
    prevs[pos] = 0;
    if (heads[ni])
        prevs[heads[ni]-1] = pos+1;
    heads[ni] = pos+1;
}
|
||||
|
||||
// Build the index over <count> items, all starting in usage class
// <init_used>: every item is chained into one doubly-linked list headed
// at heads[init_used].
// Encoding: UINT32_MAX = "no link" / "empty list"; any other value is a
// plain item index (no +1/+2 offsets, unlike multilist_alloc_t).
multilist_index_t::multilist_index_t(uint32_t count, uint32_t max_used, uint32_t init_used):
    count(count), max_used(max_used)
{
    // count == 0 would make the unsigned (count-1) loop bound below wrap
    // to UINT32_MAX (out-of-bounds writes), and heads[init_used] = 0
    // would reference a nonexistent item
    assert(count > 0);
    assert(init_used < max_used);
    nexts.resize(count, UINT32_MAX);
    prevs.resize(count, UINT32_MAX);
    heads.resize(max_used, UINT32_MAX);
    // chain all items together: 0 <-> 1 <-> ... <-> count-1
    for (size_t i = 0; i < count-1; i++)
    {
        nexts[i] = i+1;
        prevs[i+1] = i;
    }
    prevs[0] = UINT32_MAX;
    nexts[count-1] = UINT32_MAX;
    heads[init_used] = 0;
}
|
||||
|
||||
// Return the first item currently in usage class <wanted_used>,
// or UINT32_MAX if that class is empty.
uint32_t multilist_index_t::find(uint32_t wanted_used)
{
    assert(wanted_used < max_used);
    return heads[wanted_used];
}
|
||||
|
||||
// Move item <pos> from the list of usage class <old_used> to the list of
// class <new_used>. The caller must pass the item's actual current class
// as <old_used> — the lists themselves are not searched.
void multilist_index_t::change(uint32_t pos, uint32_t old_used, uint32_t new_used)
{
    if (new_used == old_used)
        return;
    assert(old_used < max_used && new_used < max_used);
    // unlink from the old list
    if (prevs[pos] != UINT32_MAX)
        nexts[prevs[pos]] = nexts[pos];
    if (nexts[pos] != UINT32_MAX)
        prevs[nexts[pos]] = prevs[pos];
    if (heads[old_used] == pos)
        heads[old_used] = nexts[pos];
    // push onto the head of the new list
    prevs[pos] = UINT32_MAX;
    if (heads[new_used] != UINT32_MAX)
        prevs[heads[new_used]] = pos;
    nexts[pos] = heads[new_used];
    heads[new_used] = pos;
}
|
||||
|
||||
// Dump the index state (non-empty heads, non-UINT32_MAX links) to stdout.
void multilist_index_t::print()
{
    printf("heads:");
    for (int i = 0; i < max_used; i++)
        if (heads[i] != UINT32_MAX)
            printf(" %u=%u", i, heads[i]);
    printf("\n");
    printf("prevs:");
    for (int i = 0; i < count; i++)
        if (prevs[i] != UINT32_MAX)
            printf(" %d=%d", i, prevs[i]);
    printf("\n");
    printf("nexts:");
    for (int i = 0; i < count; i++)
        if (nexts[i] != UINT32_MAX)
            printf(" %d=%d", i, nexts[i]);
    printf("\n");
}
|
37
src/blockstore/multilist.h
Normal file
37
src/blockstore/multilist.h
Normal file
@@ -0,0 +1,37 @@
|
||||
// Variable-length O(1) disk space allocator
|
||||
// Copyright (c) Vitaliy Filippov, 2025+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <stdint.h>
|
||||
#include <vector>
|
||||
|
||||
// O(1) allocator over a contiguous space of <count> fixed-size units.
// Free areas are kept in doubly-linked freelists bucketed by length:
// heads[i] lists areas of length i+1; the last bucket (maxn-1) lists all
// areas of length >= maxn.
struct multilist_alloc_t
{
    const uint32_t count, maxn;
    // sizes[start] = +len, sizes[start+len-1] = -len (len > 1), interiors 0
    std::vector<int32_t> sizes;
    // offset-encoded freelist links; 0 = none (see multilist.cpp for details)
    std::vector<uint32_t> nexts, prevs, heads;

    multilist_alloc_t(uint32_t count, uint32_t maxn);
    // true if the unit at <pos> belongs to a free area
    bool is_free(uint32_t pos);
    // first-fit: start of a free area of >= size units, or UINT32_MAX
    uint32_t find(uint32_t size);
    // unlink the whole free area starting at <pos> from its freelist
    void use_full(uint32_t pos);
    // allocate <size> units at <pos> (pos may lie inside a free area)
    void use(uint32_t pos, uint32_t size);
    void do_free(uint32_t pos);
    // release the area starting at <pos>, merging with free neighbours
    void free(uint32_t pos);
    // consistency check; aborts with a diagnostic dump on corruption
    void verify();
    // dump internal state to stdout
    void print();
};
|
||||
|
||||
// Groups <count> items into doubly-linked lists keyed by their "used"
// level (always < max_used), giving O(1) lookup of an item with a given
// usage level and O(1) moves between levels.
struct multilist_index_t
{
    const uint32_t count, max_used;
    // UINT32_MAX = no link / empty list; otherwise a plain item index
    std::vector<uint32_t> nexts, prevs, heads;

    // used should be always < max_used
    multilist_index_t(uint32_t count, uint32_t max_used, uint32_t init_used);
    // first item in usage class <wanted_used>, or UINT32_MAX
    uint32_t find(uint32_t wanted_used);
    // move item <pos> from class <old_used> to class <new_used>
    void change(uint32_t pos, uint32_t old_used, uint32_t new_used);
    void print();
};
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user