Compare commits

..

2 Commits

Author SHA1 Message Date
2f3b1f37a2 WIP "Heap" metadata storage scheme
Some checks failed
Test / test_rebalance_verify_ec_imm (push) Successful in 1m39s
Test / test_write_no_same (push) Successful in 8s
Test / test_write (push) Successful in 32s
Test / test_rebalance_verify_imm (push) Failing after 2m15s
Test / test_write_xor (push) Successful in 35s
Test / test_heal_pg_size_2 (push) Successful in 2m18s
Test / test_heal_local_read (push) Successful in 2m17s
Test / test_heal_ec (push) Successful in 2m20s
Test / test_heal_antietcd (push) Successful in 2m20s
Test / test_heal_csum_32k_dmj (push) Successful in 2m20s
Test / test_heal_csum_32k (push) Successful in 2m19s
Test / test_heal_csum_4k_dmj (push) Successful in 2m18s
Test / test_heal_csum_32k_dj (push) Failing after 2m29s
Test / test_resize_auto (push) Successful in 8s
Test / test_snapshot_pool2 (push) Successful in 14s
Test / test_osd_tags (push) Successful in 7s
Test / test_enospc (push) Successful in 9s
Test / test_enospc_xor (push) Successful in 13s
Test / test_enospc_imm (push) Successful in 10s
Test / test_enospc_imm_xor (push) Successful in 13s
Test / test_scrub (push) Successful in 15s
Test / test_scrub_zero_osd_2 (push) Successful in 13s
Test / test_scrub_xor (push) Successful in 15s
Test / test_heal_csum_4k_dj (push) Successful in 2m18s
Test / test_heal_csum_4k (push) Successful in 2m18s
Test / test_scrub_pg_size_3 (push) Successful in 16s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 18s
Test / test_scrub_ec (push) Successful in 15s
Test / test_nfs (push) Successful in 13s
Test / test_resize (push) Failing after 3m5s
2025-05-18 11:56:16 +03:00
abd5cbfbe4 Pass clean_bitmap explicitly 2025-05-18 11:55:04 +03:00
243 changed files with 8024 additions and 22586 deletions

View File

@@ -20,7 +20,7 @@ RUN echo 'deb http://deb.debian.org/debian bullseye-backports main' >> /etc/apt/
RUN apt-get update
RUN apt-get -y install etcd qemu-system-x86 qemu-block-extra qemu-utils fio libasan5 \
libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev
liburing1 liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev
RUN apt-get -y build-dep fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
RUN apt-get update && apt-get -y install jq lp-solve sudo nfs-common fdisk parted
RUN apt-get --download-only source fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`

View File

@@ -144,24 +144,6 @@ jobs:
echo ""
done
test_change_pg_count_online:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_change_pg_count_online.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_change_pg_size:
runs-on: ubuntu-latest
needs: build
@@ -702,24 +684,6 @@ jobs:
echo ""
done
test_write_iothreads:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: TEST_NAME=iothreads GLOBAL_CONFIG=',"client_iothread_count":4' /root/vitastor/tests/test_write.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_write_no_same:
runs-on: ubuntu-latest
needs: build
@@ -810,60 +774,6 @@ jobs:
echo ""
done
test_reweight_half:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_reweight_half.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_snapshot_pool2:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_snapshot_pool2.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_snapshot_read_bitmap:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_snapshot_read_bitmap.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_heal_csum_32k_dmj:
runs-on: ubuntu-latest
needs: build
@@ -1008,6 +918,24 @@ jobs:
echo ""
done
test_snapshot_pool2:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_snapshot_pool2.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_osd_tags:
runs-on: ubuntu-latest
needs: build

View File

@@ -2,19 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
project(vitastor)
set(VITASTOR_VERSION "2.4.0")
set(VITASTOR_VERSION "2.2.0")
include(CTest)
add_custom_target(build_tests)
add_custom_target(test
COMMAND
echo leak:tcmalloc > ${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt &&
env LSAN_OPTIONS=suppressions=${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt ${CMAKE_CTEST_COMMAND}
)
# make -j16 -C ../../build test_heap && ../../build/src/test/test_heap
# make -j16 -C ../../build test_heap && rm -f $(find ../../build -name '*.gcda') && ctest -V -T test -T coverage -R heap --test-dir ../../build && (cd ../../build; gcovr -f ../src --html --html-nested -o coverage/index.html; cd ../src/test)
# make -j16 -C ../../build test_blockstore && rm -f $(find ../../build -name '*.gcda') && ctest -V -T test -T coverage -R blockstore --test-dir ../../build && (cd ../../build; gcovr -f ../src --html --html-nested -o coverage/index.html; cd ../src/test)
# kcov --include-path=../../../src ../../kcov ./test_blockstore
add_dependencies(test build_tests)
add_subdirectory(src)

View File

@@ -19,7 +19,7 @@ Vitastor нацелен в первую очередь на SSD и SSD+HDD кл
TCP и RDMA и на хорошем железе может достигать задержки 4 КБ чтения и записи на уровне ~0.1 мс,
что примерно в 10 раз быстрее, чем Ceph и другие популярные программные СХД.
Vitastor поддерживает QEMU-драйвер, протоколы UBLK, NBD и NFS, драйверы OpenStack, OpenNebula, Proxmox, Kubernetes.
Vitastor поддерживает QEMU-драйвер, протоколы NBD и NFS, драйверы OpenStack, OpenNebula, Proxmox, Kubernetes.
Другие драйверы могут также быть легко реализованы.
Подробности смотрите в документации по ссылкам. Можете начать отсюда: [Быстрый старт](docs/intro/quickstart.ru.md).
@@ -64,9 +64,8 @@ Vitastor поддерживает QEMU-драйвер, протоколы UBLK,
- [vitastor-cli](docs/usage/cli.ru.md) (консольный интерфейс)
- [vitastor-disk](docs/usage/disk.ru.md) (управление дисками)
- [fio](docs/usage/fio.ru.md) для тестов производительности
- [UBLK](docs/usage/ublk.ru.md) для монтирования ядром
- [NBD](docs/usage/nbd.ru.md) - старый интерфейс для монтирования ядром
- [QEMU, qemu-img и VDUSE](docs/usage/qemu.ru.md)
- [NBD](docs/usage/nbd.ru.md) для монтирования ядром
- [QEMU и qemu-img](docs/usage/qemu.ru.md)
- [NFS](docs/usage/nfs.ru.md) кластерная файловая система и псевдо-ФС прокси
- [Администрирование](docs/usage/admin.ru.md)
- Производительность

View File

@@ -19,7 +19,7 @@ supports TCP and RDMA and may achieve 4 KB read and write latency as low as ~0.1
with proper hardware which is ~10 times faster than other popular SDS's like Ceph
or internal systems of public clouds.
Vitastor supports QEMU, UBLK, NBD, NFS protocols, OpenStack, OpenNebula, Proxmox, Kubernetes drivers.
Vitastor supports QEMU, NBD, NFS protocols, OpenStack, OpenNebula, Proxmox, Kubernetes drivers.
More drivers may be created easily.
Read more details in the documentation. You can start from here: [Quick Start](docs/intro/quickstart.en.md).
@@ -64,9 +64,8 @@ Read more details in the documentation. You can start from here: [Quick Start](d
- [vitastor-cli](docs/usage/cli.en.md) (command-line interface)
- [vitastor-disk](docs/usage/disk.en.md) (disk management tool)
- [fio](docs/usage/fio.en.md) for benchmarks
- [UBLK](docs/usage/ublk.en.md) for kernel mounts
- [NBD](docs/usage/nbd.en.md) - old interface for kernel mounts
- [QEMU, qemu-img and VDUSE](docs/usage/qemu.en.md)
- [NBD](docs/usage/nbd.en.md) for kernel mounts
- [QEMU and qemu-img](docs/usage/qemu.en.md)
- [NFS](docs/usage/nfs.en.md) clustered file system and pseudo-FS proxy
- [Administration](docs/usage/admin.en.md)
- Performance

View File

@@ -36,7 +36,7 @@ RUN (echo deb http://vitastor.io/debian bookworm main > /etc/apt/sources.list.d/
((echo 'Package: *'; echo 'Pin: origin "vitastor.io"'; echo 'Pin-Priority: 1000') > /etc/apt/preferences.d/vitastor.pref) && \
wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg && \
apt-get update && \
apt-get install -y vitastor-client ibverbs-providers && \
apt-get install -y vitastor-client && \
wget https://vitastor.io/archive/qemu/qemu-bookworm-9.2.2%2Bds-1%2Bvitastor4/qemu-utils_9.2.2%2Bds-1%2Bvitastor4_amd64.deb && \
wget https://vitastor.io/archive/qemu/qemu-bookworm-9.2.2%2Bds-1%2Bvitastor4/qemu-block-extra_9.2.2%2Bds-1%2Bvitastor4_amd64.deb && \
dpkg -x qemu-utils*.deb tmp1 && \

View File

@@ -1,49 +0,0 @@
# Compile stage: builds the vitastor-csi binary from the sources in the build context
FROM golang:bookworm AS build
# Only the Go module files are added first and the dependencies are downloaded
# before the rest of the sources are added
ADD go.sum go.mod /app/
RUN cd /app; CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go mod download -x
ADD . /app
# The perl one-liner rewrites all *.go files in place before the build:
# a "{" standing alone on its own line is moved to the end of the previous line
# (an empty line is left in its place), and "}" followed by "else" on the next
# line is joined into "} else"
RUN perl -i -e '$/ = undef; while(<>) { s/\n\s*(\{\s*\n)/$1\n/g; s/\}(\s*\n\s*)else\b/$1} else/g; print; }' `find /app -name '*.go'` && \
cd /app && \
CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o vitastor-csi
# Final stage: runtime image with the CSI binary, vitastor-client and qemu-storage-daemon
FROM debian:bookworm
LABEL maintainers="Vitaliy Filippov <vitalif@yourcmc.ru>"
LABEL description="Vitastor CSI Driver"
# Both variables are declared empty here
# NOTE(review): presumably they are set by the pod spec (the manifests pass
# --node=$(NODE_ID) and --endpoint=$(CSI_ENDPOINT)) - confirm
ENV NODE_ID=""
ENV CSI_ENDPOINT=""
# Install runtime dependencies without recommended packages and
# write a modprobe option file that sets nbds_max=128 for the nbd module
RUN apt-get update && \
apt-get install -y wget && \
(echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
apt-get update && \
apt-get install -y e2fsprogs xfsprogs kmod iproute2 \
# NFS mount dependencies
nfs-common netbase \
# dependencies of qemu-storage-daemon
libnuma1 liburing2 libglib2.0-0 libfuse3-3 libaio1 libzstd1 libnettle8 \
libgmp10 libhogweed6 libp11-kit0 libidn2-0 libunistring2 libtasn1-6 libpcre2-8-0 libffi8 && \
apt-get clean && \
(echo options nbd nbds_max=128 > /etc/modprobe.d/nbd.conf)
COPY --from=build /app/vitastor-csi /bin/
# vitastor-client is installed from a local .deb placed in the deb/ directory of the build context
ADD deb /deb
# The qemu packages are not installed: they are only unpacked into tmp1, and
# just qemu-storage-daemon and the block-vitastor.so driver are copied out of them
RUN apt-get update && \
apt-get -y install /deb/vitastor-client_*.deb && \
wget https://vitastor.io/archive/qemu/qemu-bookworm-9.2.2%2Bds-1%2Bvitastor4/qemu-utils_9.2.2%2Bds-1%2Bvitastor4_amd64.deb && \
wget https://vitastor.io/archive/qemu/qemu-bookworm-9.2.2%2Bds-1%2Bvitastor4/qemu-block-extra_9.2.2%2Bds-1%2Bvitastor4_amd64.deb && \
dpkg -x qemu-utils*.deb tmp1 && \
dpkg -x qemu-block-extra*.deb tmp1 && \
cp -a tmp1/usr/bin/qemu-storage-daemon /usr/bin/ && \
mkdir -p /usr/lib/x86_64-linux-gnu/qemu && \
cp -a tmp1/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so /usr/lib/x86_64-linux-gnu/qemu/ && \
rm -rf tmp1 *.deb && \
apt-get clean
ENTRYPOINT ["/bin/vitastor-csi"]

View File

@@ -1,4 +1,4 @@
VITASTOR_VERSION ?= v2.4.0
VITASTOR_VERSION ?= v2.2.0
all: build push

View File

@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v2.4.0
image: vitalif/vitastor-csi:v2.2.0
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -121,7 +121,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v2.4.0
image: vitalif/vitastor-csi:v2.2.0
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "2.4.0"
vitastorCSIDriverVersion = "2.2.0"
)
// Config struct fills the parameters of request or user input

View File

@@ -33,7 +33,7 @@ import (
type NodeServer struct
{
*Driver
method MountMethod
useVduse bool
stateDir string
nfsStageDir string
mounter mount.Interface
@@ -81,23 +81,16 @@ func NewNodeServer(driver *Driver) *NodeServer
}
ns := &NodeServer{
Driver: driver,
method: selectMountMethod(),
useVduse: checkVduseSupport(),
stateDir: stateDir,
nfsStageDir: nfsStageDir,
mounter: mount.New(""),
volumeLocks: make(map[string]bool),
}
ns.cond = sync.NewCond(&ns.mu)
if (ns.method == MOUNT_VDUSE)
if (ns.useVduse)
{
ns.restoreVduseDaemons()
}
else if (ns.method == MOUNT_UBLK)
{
ns.restoreUblkDaemons()
}
if (ns.method == MOUNT_VDUSE || ns.method == MOUNT_UBLK)
{
dur, err := time.ParseDuration(os.Getenv("RESTART_INTERVAL"))
if (err != nil)
{
@@ -143,14 +136,7 @@ func (ns *NodeServer) restarter()
for
{
<-ticker.C
if (ns.method == MOUNT_VDUSE)
{
ns.restoreVduseDaemons()
}
else if (ns.method == MOUNT_UBLK)
{
ns.restoreUblkDaemons()
}
ns.restoreVduseDaemons()
}
}
@@ -245,78 +231,6 @@ func (ns *NodeServer) checkVduseState(stateFile string, devs map[string]interfac
}
}
func (ns *NodeServer) restoreUblkDaemons()
{
pattern := ns.stateDir+"vitastor-ublk-*.json"
stateFiles, err := filepath.Glob(pattern)
if (err != nil)
{
klog.Errorf("failed to list %s: %v", pattern, err)
}
if (len(stateFiles) == 0)
{
return
}
for _, stateFile := range stateFiles
{
deviceNum := stateFile[len(ns.stateDir) + len("vitastor-ublk-") :]
deviceNum = deviceNum[0:len(deviceNum)-5]
ns.checkUblkState(deviceNum)
}
}
func (ns *NodeServer) checkUblkState(deviceNum string)
{
// Check if the ublk daemon is still active
// Read state file
stateFile := ns.stateDir + "vitastor-ublk-" + deviceNum + ".json"
stateJSON, err := os.ReadFile(stateFile)
if (err != nil)
{
klog.Warningf("error reading state file %v: %v", stateFile, err)
return
}
var state DeviceState
err = json.Unmarshal(stateJSON, &state)
if (err != nil)
{
klog.Warningf("state file %v contains invalid JSON (error %v): %v", stateFile, err, string(stateJSON))
return
}
// Lock volume
ns.lockVolume(state.ConfigPath+":block:"+state.Image)
defer ns.unlockVolume(state.ConfigPath+":block:"+state.Image)
// Recheck state file after locking
_, err = os.ReadFile(stateFile)
if (err != nil)
{
klog.Warningf("state file %v disappeared, skipping volume", stateFile)
return
}
// Check if the vitastor-ublk process is still active
pidFile := ns.stateDir + "vitastor-ublk-" + deviceNum + ".pid"
exists := false
proc, err := findByPidFile(pidFile)
if (err == nil)
{
exists = proc.Signal(syscall.Signal(0)) == nil
}
if (!exists)
{
// Restart daemon
klog.Warningf("recovering UBLK device /dev/ublkb%v for volume %v", deviceNum, state.Image)
_, err = mapUblk(ns.stateDir, state.Image, state.ConfigPath, state.Readonly, "/dev/ublkb"+deviceNum)
if (err != nil)
{
klog.Warningf("failed to recover ublk device for volume %v: %v", state.Image, err)
}
}
}
func (ns *NodeServer) restoreNfsDaemons()
{
pattern := ns.stateDir+"vitastor-nfs-*.json"
@@ -503,18 +417,14 @@ func (ns *NodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVol
}
var devicePath, vdpaId string
if (ns.method == MOUNT_UBLK)
{
devicePath, err = mapUblk(ns.stateDir, volName, ctxVars["configPath"], false, "")
}
else if (ns.method == MOUNT_VDUSE)
{
devicePath, vdpaId, err = mapVduse(ns.stateDir, volName, ctxVars, false)
}
else /* if (ns.method == MOUNT_NBD) */
if (!ns.useVduse)
{
devicePath, err = mapNbd(volName, ctxVars, false)
}
else
{
devicePath, vdpaId, err = mapVduse(ns.stateDir, volName, ctxVars, false)
}
if (err != nil)
{
return nil, err
@@ -529,8 +439,7 @@ func (ns *NodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVol
else
{
// Check existing format
var existingFormat string
existingFormat, err = diskMounter.GetDiskFormat(devicePath)
existingFormat, err := diskMounter.GetDiskFormat(devicePath)
if (err != nil)
{
klog.Errorf("failed to get disk format for path %s, error: %v", err)
@@ -586,6 +495,10 @@ func (ns *NodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVol
case "xfs":
_, err = systemCombined("xfs_growfs", devicePath)
}
if (err != nil)
{
goto unmap
}
}
}
if (err != nil)
@@ -599,18 +512,14 @@ func (ns *NodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVol
return &csi.NodeStageVolumeResponse{}, nil
unmap:
if (ns.method == MOUNT_UBLK)
{
unmapUblk(ns.stateDir, devicePath)
}
else if (ns.method == MOUNT_VDUSE)
{
unmapVduseById(ns.stateDir, vdpaId)
}
else /* if (ns.method == MOUNT_NBD) */
if (!ns.useVduse || len(devicePath) >= 8 && devicePath[0:8] == "/dev/nbd")
{
unmapNbd(devicePath)
}
else
{
unmapVduseById(ns.stateDir, vdpaId)
}
return nil, err
}
@@ -636,7 +545,7 @@ func (ns *NodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstag
defer ns.unlockVolume(ctxVars["configPath"]+":block:"+volName)
targetPath := req.GetStagingTargetPath()
devicePath, err := GetDeviceNameFromMount(targetPath)
devicePath, _, err := mount.GetDeviceNameFromMount(ns.mounter, targetPath)
if (err != nil)
{
if (os.IsNotExist(err))
@@ -673,18 +582,14 @@ func (ns *NodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstag
// unmap device
if (len(refList) == 0)
{
if (ns.method == MOUNT_UBLK)
{
unmapUblk(ns.stateDir, devicePath)
}
else if (ns.method == MOUNT_VDUSE)
{
unmapVduse(ns.stateDir, devicePath)
}
else /* if (ns.method == MOUNT_NBD) */
if (!ns.useVduse)
{
unmapNbd(devicePath)
}
else
{
unmapVduse(ns.stateDir, devicePath)
}
}
return &csi.NodeUnstageVolumeResponse{}, nil
@@ -992,7 +897,7 @@ func (ns *NodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpu
}
targetPath := req.GetTargetPath()
devicePath, err := GetDeviceNameFromMount(targetPath)
devicePath, _, err := mount.GetDeviceNameFromMount(ns.mounter, targetPath)
if (err != nil)
{
if (os.IsNotExist(err))

View File

@@ -16,20 +16,10 @@ import (
"syscall"
"k8s.io/klog"
"k8s.io/utils/mount"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)
type MountMethod int
const (
MOUNT_NBD MountMethod = 0
MOUNT_VDUSE MountMethod = 1
MOUNT_UBLK MountMethod = 2
)
func Contains(list []string, s string) bool
{
for i := 0; i < len(list); i++
@@ -42,26 +32,29 @@ func Contains(list []string, s string) bool
return false
}
func selectMountMethod() MountMethod
func checkVduseSupport() bool
{
// Check UBLK support (ublk_drv kernel module)
if (checkModule("ublk_drv"))
{
klog.Infof("UBLK support enabled successfully")
return MOUNT_UBLK
}
klog.Errorf(
"Your host apparently has no UBLK support. UBLK support disabled."+
" For UBLK you need at least Linux 6.0 and the ublk_drv kernel module.",
)
// Check VDUSE support (vdpa, vduse, virtio-vdpa kernel modules)
vduse := true
for _, mod := range []string{"vdpa", "vduse", "virtio-vdpa"}
{
if (!checkModule(mod))
_, err := os.Stat("/sys/module/"+mod)
if (err != nil)
{
vduse = false
break
if (!errors.Is(err, os.ErrNotExist))
{
klog.Errorf("failed to check /sys/module/%s: %v", mod, err)
}
c := exec.Command("/sbin/modprobe", mod)
c.Stdout = os.Stderr
c.Stderr = os.Stderr
err := c.Run()
if (err != nil)
{
klog.Errorf("/sbin/modprobe %s failed: %v", mod, err)
vduse = false
break
}
}
}
// Check that vdpa tool functions
@@ -76,38 +69,18 @@ func selectMountMethod() MountMethod
vduse = false
}
}
if (vduse)
if (!vduse)
{
klog.Errorf(
"Your host apparently has no VDUSE support. VDUSE support disabled, NBD will be used to map devices."+
" For VDUSE you need at least Linux 5.15 and the following kernel modules: vdpa, virtio-vdpa, vduse.",
)
}
else
{
klog.Infof("VDUSE support enabled successfully")
return MOUNT_VDUSE
}
klog.Errorf(
"Your host apparently has no VDUSE support. VDUSE support disabled, NBD will be used to map devices."+
" For VDUSE you need at least Linux 5.15 and the following kernel modules: vdpa, virtio-vdpa, vduse.",
)
return MOUNT_NBD
}
func checkModule(mod string) bool
{
_, err := os.Stat("/sys/module/"+mod)
if (err != nil)
{
if (!errors.Is(err, os.ErrNotExist))
{
klog.Errorf("failed to check /sys/module/%s: %v", mod, err)
}
c := exec.Command("/sbin/modprobe", mod)
c.Stdout = os.Stderr
c.Stderr = os.Stderr
err := c.Run()
if (err != nil)
{
klog.Errorf("/sbin/modprobe %s failed: %v", mod, err)
return false
}
}
return true
return vduse
}
func mapNbd(volName string, ctxVars map[string]string, readonly bool) (string, error)
@@ -244,7 +217,6 @@ func mapVduse(stateDir string, volName string, ctxVars map[string]string, readon
stateJSON, _ := json.Marshal(&DeviceState{
ConfigPath: ctxVars["configPath"],
VdpaId: vdpaId,
Image: volName,
Blockdev: blockdev,
Readonly: readonly,
@@ -337,117 +309,6 @@ func unmapVduseById(stateDir, vdpaId string)
}
}
func mapUblk(stateDir string, volName string, configPath string, readonly bool, recoverDev string) (string, error)
{
pidFile := ""
if (recoverDev != "")
{
if (len(recoverDev) < 10 || recoverDev[0:10] != "/dev/ublkb")
{
return "", fmt.Errorf("recover: %s does not start with /dev/ublkb", recoverDev)
}
pidFile = stateDir + "vitastor-ublk-" + recoverDev[10:] + ".pid"
}
else
{
pidFd, err := os.CreateTemp(stateDir, "vitastor-tmp-*.pid")
if (err != nil)
{
return "", err
}
pidFile = pidFd.Name()
pidFd.Close()
}
// Map device via vitastor-ublk
args := []string{
"map", "--image", volName, "--pidfile", pidFile,
}
if (configPath != "")
{
args = append(args, "--config_path", configPath)
}
if (readonly)
{
args = append(args, "--readonly")
}
if (recoverDev != "")
{
args = append(args, "--recover", recoverDev)
}
stdout, stderr, err := system("/usr/bin/vitastor-ublk", args...)
if (err != nil)
{
return "", err
}
devicePath := strings.TrimSpace(string(stdout))
if (devicePath == "")
{
return "", fmt.Errorf("vitastor-ublk did not return the name of the device. output: %s", stderr)
}
if (len(devicePath) >= 10 && devicePath[0:10] == "/dev/ublkb")
{
// Generate state file
devNum := devicePath[10:]
pidNew := stateDir + "vitastor-ublk-" + devNum + ".pid"
if (pidFile != pidNew)
{
err := os.Rename(pidFile, pidNew)
if (err != nil)
{
klog.Errorf("Failed to rename PID file %s to %s: %v", pidFile, pidNew, err)
}
else
{
pidFile = pidNew
}
}
stateFile := stateDir + "vitastor-ublk-" + devNum + ".json"
stateJSON, _ := json.Marshal(&DeviceState{
ConfigPath: configPath,
Image: volName,
Readonly: readonly,
PidFile: pidFile,
})
err = os.WriteFile(stateFile, stateJSON, 0600)
if (err == nil)
{
klog.Infof("Attached volume %s via UBLK as %s", volName, devicePath)
return devicePath, nil
}
os.Remove(stateFile)
}
killErr := killByPidFile(pidFile)
if (killErr != nil)
{
klog.Errorf("Failed to kill started vitastor-ublk: %v", killErr)
}
os.Remove(pidFile)
return "", err
}
func unmapUblk(stateDir, devicePath string)
{
if (len(devicePath) < 10 || devicePath[0:10] != "/dev/ublkb")
{
klog.Errorf("%s does not start with /dev/ublkb", devicePath)
return
}
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-ublk", "unmap", devicePath).CombinedOutput()
if (unmapErr != nil)
{
klog.Errorf("failed to unmap UBLK device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
}
for _, ext := range []string{"json", "pid"}
{
fn := stateDir + "vitastor-ublk-" + devicePath[10:] + "." + ext
err := os.Remove(fn)
if (err != nil)
{
klog.Errorf("failed to remove %s: %v", fn, err)
}
}
}
func system(program string, args ...string) ([]byte, []byte, error)
{
klog.Infof("Running "+program+" "+strings.Join(args, " "))
@@ -479,43 +340,3 @@ func systemCombined(program string, args ...string) ([]byte, error)
}
return out.Bytes(), nil
}
func GetDeviceNameFromMount(mountPath string) (string, error)
{
// Use /proc/self/mountinfo to correctly parse bind mounts for block device files
mps, err := mount.ParseMountInfo("/proc/self/mountinfo")
if (err != nil)
{
return "", err
}
slTarget, err := filepath.EvalSymlinks(mountPath)
if (err != nil)
{
slTarget = mountPath
}
device := ""
for _, mp := range mps
{
if (mp.MountPoint == slTarget)
{
device = mp.Source
if (device[0] != '/' && mp.Root != "/")
{
// Handle {Source=udev Root=/vdb MountPoint=/var/lib/kubelet/tralaleylo/tralala}
for _, other := range mps
{
if (other.Root == "/" && other.Source == mp.Source)
{
device = other.MountPoint + mp.Root
break
}
}
}
break
}
}
return device, nil
}

View File

@@ -1,4 +1,7 @@
#!/bin/bash
docker build --build-arg DISTRO=debian --build-arg REL=bookworm -t vitastor-buildenv:bookworm -f vitastor-buildenv.Dockerfile .
docker run -i --rm -e REL=bookworm -v `dirname $0`/../:/root/vitastor vitastor-buildenv:bookworm /root/vitastor/debian/vitastor-build.sh
cat < vitastor.Dockerfile > ../Dockerfile
cd ..
mkdir -p packages
sudo podman build --build-arg DISTRO=debian --build-arg REL=bookworm -v `pwd`/packages:/root/packages -f Dockerfile .
rm Dockerfile

View File

@@ -1,4 +1,7 @@
#!/bin/bash
docker build --build-arg DISTRO=debian --build-arg REL=bullseye -t vitastor-buildenv:bullseye -f vitastor-buildenv.Dockerfile .
docker run -i --rm -e REL=bullseye -v `dirname $0`/../:/root/vitastor vitastor-buildenv:bullseye /root/vitastor/debian/vitastor-build.sh
cat < vitastor.Dockerfile > ../Dockerfile
cd ..
mkdir -p packages
sudo podman build --build-arg DISTRO=debian --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f Dockerfile .
rm Dockerfile

View File

@@ -1,4 +1,7 @@
#!/bin/bash
docker build --build-arg DISTRO=debian --build-arg REL=buster -t vitastor-buildenv:buster -f vitastor-buildenv.Dockerfile .
docker run -i --rm -e REL=buster -v `dirname $0`/../:/root/vitastor vitastor-buildenv:buster /root/vitastor/debian/vitastor-build.sh
cat < vitastor.Dockerfile > ../Dockerfile
cd ..
mkdir -p packages
sudo podman build --build-arg DISTRO=debian --build-arg REL=buster -v `pwd`/packages:/root/packages -f Dockerfile .
rm Dockerfile

View File

@@ -1,4 +0,0 @@
#!/bin/bash
docker build --build-arg DISTRO=debian --build-arg REL=trixie -t vitastor-buildenv:trixie -f vitastor-buildenv.Dockerfile .
docker run -i --rm -e REL=trixie -v `dirname $0`/../:/root/vitastor vitastor-buildenv:trixie /root/vitastor/debian/vitastor-build.sh

View File

@@ -1,5 +1,7 @@
#!/bin/bash
# Ubuntu 22.04 Jammy Jellyfish
docker build --build-arg DISTRO=ubuntu --build-arg REL=jammy -t vitastor-buildenv:jammy -f vitastor-buildenv.Dockerfile .
docker run -i --rm -e REL=jammy -v `dirname $0`/../:/root/vitastor vitastor-buildenv:jammy /root/vitastor/debian/vitastor-build.sh
cat < vitastor.Dockerfile > ../Dockerfile
cd ..
mkdir -p packages
sudo podman build --build-arg DISTRO=ubuntu --build-arg REL=jammy -v `pwd`/packages:/root/packages -f Dockerfile .
rm Dockerfile

View File

@@ -1,5 +0,0 @@
#!/bin/bash
# 24.04 Noble Numbat
docker build --build-arg DISTRO=ubuntu --build-arg REL=noble -t vitastor-buildenv:noble -f vitastor-buildenv.Dockerfile .
docker run -i --rm -e REL=noble -v `dirname $0`/../:/root/vitastor vitastor-buildenv:noble /root/vitastor/debian/vitastor-build.sh

2
debian/changelog vendored
View File

@@ -1,4 +1,4 @@
vitastor (2.4.0-1) unstable; urgency=medium
vitastor (2.2.0-1) unstable; urgency=medium
* Bugfixes

4
debian/control vendored
View File

@@ -2,9 +2,9 @@ Source: vitastor
Section: admin
Priority: optional
Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
Build-Depends: debhelper, g++ (>= 8), libstdc++6 (>= 8),
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8),
linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev,
libibverbs-dev, librdmacm-dev, libisal-dev, cmake, pkg-config, libnl-3-dev, libnl-genl-3-dev,
libibverbs-dev, libisal-dev, cmake, pkg-config, libnl-3-dev, libnl-genl-3-dev,
node-bindings <!nocheck>, node-gyp, node-nan
Standards-Version: 4.5.0
Homepage: https://vitastor.io/

View File

@@ -26,7 +26,7 @@ RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" -o "$REL" = "bookworm" ]; then
echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
RUN apt-get update
RUN DEBIAN_FRONTEND=noninteractive TZ=Europe/Moscow apt-get -y install fio libgoogle-perftools-dev devscripts
RUN DEBIAN_FRONTEND=noninteractive TZ=Europe/Moscow apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts
RUN DEBIAN_FRONTEND=noninteractive TZ=Europe/Moscow apt-get -y build-dep qemu
# To build a custom version
#RUN cp /root/packages/qemu-orig/* /root

View File

@@ -1,60 +0,0 @@
#!/bin/bash
# To be run inside the buildenv docker container.
# Builds the Vitastor Debian packages from the source tree in /root/vitastor and
# leaves the results in /root/vitastor/packages/vitastor-$REL.
# $REL (distribution release name) is not set here and must come from the environment.
# Stop on the first failing command and trace every command
set -e -x
# Install the bundled libisal.pc when pkg-config has none
[ -e /usr/lib/x86_64-linux-gnu/pkgconfig/libisal.pc ] || cp /root/vitastor/debian/libisal.pc /usr/lib/x86_64-linux-gnu/pkgconfig
# Unpack the fio source package into an emptied /root/fio-build
mkdir -p /root/fio-build/
cd /root/fio-build/
rm -rf /root/fio-build/*
dpkg-source -x /root/fio*.dsc
# FULLVER is the text in parentheses on the first changelog line,
# VER is FULLVER with everything from the first "-" removed
FULLVER=`head -n1 /root/vitastor/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'`
VER=${FULLVER%%-*}
# Copy the source tree (every top-level entry whose name does not contain "packages")
# into vitastor-$VER and move the copy to packages/vitastor-$REL/
rm -rf /root/vitastor-$VER
mkdir /root/vitastor-$VER
cd /root/vitastor
cp -a $(ls | grep -v packages) /root/vitastor-$VER
rm -rf /root/vitastor/packages/vitastor-$REL
mkdir -p /root/vitastor/packages/vitastor-$REL
mv /root/vitastor-$VER /root/vitastor/packages/vitastor-$REL/
cd /root/vitastor/packages/vitastor-$REL/vitastor-$VER
# Temporarily link the unpacked fio sources as ./fio and read the fio version from its changelog
rm -rf fio
ln -s /root/fio-build/fio-*/ ./fio
FIO=`head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'`
# Provide linux/raw.h from the bundled copy when the system has none
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h
# NOTE(review): copy-fio-includes.sh is not visible here; presumably it creates ./fio-copy - confirm
sh copy-fio-includes.sh
rm fio
# Turn the copied fio headers into a quilt patch: diff an empty directory "a" against
# "b" which contains only fio/. diff exits with 1 when the trees differ, hence "|| true"
mkdir -p a b debian/patches
mv fio-copy b/fio
diff -NaurpbB a b > debian/patches/fio-headers.patch || true
echo fio-headers.patch >> debian/patches/series
rm -rf a b
# Record the fio version in debian/fio_version
echo "dep:fio=$FIO" > debian/fio_version
# Download the master archives of antietcd and tinyraft into mon/node_modules
cd /root/vitastor/packages/vitastor-$REL/vitastor-$VER
mkdir mon/node_modules
cd mon/node_modules
curl -s https://git.yourcmc.ru/vitalif/antietcd/archive/master.tar.gz | tar -zx
curl -s https://git.yourcmc.ru/vitalif/tinyraft/archive/master.tar.gz | tar -zx
# Create the .orig tarball without the debian directory, with a fixed entry order, mtime and owner
cd /root/vitastor/packages/vitastor-$REL
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_$VER.orig.tar.xz vitastor-$VER
# Add a changelog entry with the version suffixed by the release name and build the packages
cd vitastor-$VER
DEBEMAIL="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$FULLVER""$REL" "Rebuild for $REL"
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa
# Remove the unpacked source directory, only the build results stay
rm -rf /root/vitastor/packages/vitastor-$REL/vitastor-*/
# Why does ubuntu rename debug packages to *.ddeb?
# Rename them back to *.deb and fix the names in *.buildinfo and *.changes
cd /root/vitastor/packages/vitastor-$REL
if ls *.ddeb >/dev/null; then
perl -i -pe 's/\.ddeb/.deb/' *.buildinfo *.changes
for i in *.ddeb; do
mv $i ${i%%.ddeb}.deb
done
fi

View File

@@ -1,31 +0,0 @@
# Build environment for building Vitastor packages for Debian inside a container
# cd ..
# docker build --build-arg DISTRO=debian --build-arg REL=bullseye -f debian/vitastor-buildenv.Dockerfile -t vitastor-buildenv:bullseye .
# docker run --rm -e REL=bullseye -v ./:/root/vitastor vitastor-buildenv:bullseye /root/vitastor/debian/vitastor-build.sh
# DISTRO and REL select the base image; REL has no default and must be passed with --build-arg
ARG DISTRO=debian
ARG REL=
FROM $DISTRO:$REL
# The arguments are declared again after FROM so that the RUN instructions below can use them
ARG DISTRO=debian
ARG REL=
WORKDIR /root
# Set up APT:
# - buster only: switch to archive.debian.org and add the vitastor.io repository with its key
# - add deb-src entries for all configured repositories (both sources.list formats)
# - disable installation of recommended and suggested packages
RUN set -e -x; \
if [ "$REL" = "buster" ]; then \
perl -i -pe 's/deb.debian.org/archive.debian.org/' /etc/apt/sources.list; \
apt-get update; \
apt-get -y install wget; \
wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg; \
echo "deb https://vitastor.io/debian $REL main" >> /etc/apt/sources.list; \
fi; \
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
perl -i -pe 's/Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/*.sources || true; \
echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
# Install the build dependencies and download (without unpacking) the fio source package into /root
RUN apt-get update && \
apt-get -y install fio libgoogle-perftools-dev devscripts libjerasure-dev cmake \
libibverbs-dev librdmacm-dev libisal-dev libnl-3-dev libnl-genl-3-dev curl nodejs npm node-nan node-bindings && \
apt-get -y build-dep fio && \
apt-get --download-only source fio

View File

@@ -2,7 +2,6 @@ usr/bin/vita
usr/bin/vitastor-cli
usr/bin/vitastor-rm
usr/bin/vitastor-nbd
usr/bin/vitastor-ublk
usr/bin/vitastor-nfs
usr/bin/vitastor-kv
usr/bin/vitastor-kv-stress

65
debian/vitastor.Dockerfile vendored Normal file
View File

@@ -0,0 +1,65 @@
# Build Vitastor packages for Debian inside a container
# cd ..; podman build --build-arg DISTRO=debian --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/vitastor.Dockerfile .
# DISTRO and REL select the base image ($DISTRO:$REL). REL has an empty default and must be passed explicitly.
# The resulting packages are written to /root/packages/vitastor-$REL (the volume mounted above).
ARG DISTRO=debian
ARG REL=
FROM $DISTRO:$REL
# Re-declared after FROM so that the RUN steps below can reference $DISTRO / $REL.
ARG DISTRO=debian
ARG REL=
WORKDIR /root
# Configure apt:
# - buster only: install wget, trust the Vitastor signing key and add the Vitastor repository;
# - all releases: add deb-src entries, both to the one-line sources.list and to every deb822
#   *.sources file (the glob also covers distributions whose file is not named debian.sources;
#   a failure here is deliberately ignored for releases that have no such files);
# - disable installation of recommended and suggested packages.
RUN set -e -x; \
if [ "$REL" = "buster" ]; then \
apt-get update; \
apt-get -y install wget; \
wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg; \
echo "deb https://vitastor.io/debian $REL main" >> /etc/apt/sources.list; \
fi; \
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
perl -i -pe 's/Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/*.sources || true; \
echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
# Install the build toolchain and libraries, then fio's own build dependencies, and finally
# download (without unpacking) the fio source package into /root.
RUN apt-get update && \
apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake \
libibverbs-dev librdmacm-dev libisal-dev libnl-3-dev libnl-genl-3-dev curl nodejs npm node-nan node-bindings && \
apt-get -y build-dep fio && \
apt-get --download-only source fio
ADD . /root/vitastor
# Build the packages. Steps, in order:
# 1. Install the bundled libisal.pc if the system does not provide one.
# 2. Unpack the previously downloaded fio source package into /root/fio-build.
# 3. Start from an empty /root/packages/vitastor-$REL; read FULLVER from the first line of
#    debian/changelog and derive VER by cutting everything from the first '-' onwards.
# 4. Copy the source tree to vitastor-$VER, temporarily symlink the fio sources as ./fio, read the
#    fio version, make sure linux/raw.h exists and run copy-fio-includes.sh (presumably it creates
#    ./fio-copy, which is moved to b/fio right after - confirm against that script).
# 5. Turn the copied fio headers into debian/patches/fio-headers.patch (diff of empty "a" against
#    "b") and record the fio version in debian/fio_version.
# 6. Download antietcd and tinyraft (master) into mon/node_modules.
# 7. Create the .orig tarball (sorted names, fixed mtime, owner/group 0, debian/ excluded).
# 8. Add a "Rebuild for $REL" changelog entry and build with dpkg-buildpackage (tests disabled).
#    The maintainer identity is passed via DEBEMAIL: dch accepts the "Name <email>" form only in
#    that variable, whereas DEBFULLNAME is used verbatim as the name and would yield a malformed
#    maintainer field with a second, auto-generated address.
# 9. Remove the unpacked build tree, leaving only the build artifacts.
RUN set -e -x; \
[ -e /usr/lib/x86_64-linux-gnu/pkgconfig/libisal.pc ] || cp /root/vitastor/debian/libisal.pc /usr/lib/x86_64-linux-gnu/pkgconfig; \
mkdir -p /root/fio-build/; \
cd /root/fio-build/; \
rm -rf /root/fio-build/*; \
dpkg-source -x /root/fio*.dsc; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
FULLVER=$(head -n1 /root/vitastor/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
VER=${FULLVER%%-*}; \
cp -r /root/vitastor vitastor-$VER; \
cd vitastor-$VER; \
ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
sh copy-fio-includes.sh; \
rm fio; \
mkdir -p a b debian/patches; \
mv fio-copy b/fio; \
diff -NaurpbB a b > debian/patches/fio-headers.patch || true; \
echo fio-headers.patch >> debian/patches/series; \
rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL/vitastor-$VER; \
mkdir mon/node_modules; \
cd mon/node_modules; \
curl -s https://git.yourcmc.ru/vitalif/antietcd/archive/master.tar.gz | tar -zx; \
curl -s https://git.yourcmc.ru/vitalif/tinyraft/archive/master.tar.gz | tar -zx; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_$VER.orig.tar.xz vitastor-$VER; \
cd vitastor-$VER; \
DEBEMAIL="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$FULLVER""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
rm -rf /root/packages/vitastor-$REL/vitastor-*/

View File

@@ -3,7 +3,7 @@
FROM debian:bookworm
ADD etc/apt /etc/apt/
RUN apt-get update && apt-get -y install vitastor ibverbs-providers udev systemd qemu-system-x86 qemu-system-common qemu-block-extra qemu-utils jq nfs-common && apt-get clean
RUN apt-get update && apt-get -y install vitastor udev systemd qemu-system-x86 qemu-system-common qemu-block-extra qemu-utils jq nfs-common && apt-get clean
ADD sleep.sh /usr/bin/
ADD install.sh /usr/bin/
ADD scripts /opt/scripts/

View File

@@ -1,4 +1,4 @@
VITASTOR_VERSION ?= v2.4.0
VITASTOR_VERSION ?= v2.2.0
all: build push

View File

@@ -1,3 +0,0 @@
Package: *
Pin: release n=bookworm-backports
Pin-Priority: 500

View File

@@ -4,7 +4,7 @@
#
# Desired Vitastor version
VITASTOR_VERSION=v2.4.0
VITASTOR_VERSION=v2.2.0
# Additional arguments for all containers
# For example, you may want to specify a custom logging driver here

View File

@@ -25,9 +25,6 @@ affect their interaction with the cluster.
- [nbd_max_part](#nbd_max_part)
- [osd_nearfull_ratio](#osd_nearfull_ratio)
- [hostname](#hostname)
- [ublk_queue_depth](#ublk_queue_depth)
- [ublk_max_io_size](#ublk_max_io_size)
- [qemu_file_mirror_path](#qemu_file_mirror_path)
## client_iothread_count
@@ -228,28 +225,3 @@ without destroying and recreating OSDs.
Clients use host name to find their distance to OSDs when [localized reads](pool.en.md#local_reads)
are enabled. By default, standard [gethostname](https://man7.org/linux/man-pages/man2/gethostname.2.html)
function is used to determine host name, but you can also override it with this parameter.
## ublk_queue_depth
- Type: integer
- Default: 256
Default queue depth for [Vitastor ublk servers](../usage/ublk.en.md).
## ublk_max_io_size
- Type: integer
Default maximum I/O size for Vitastor [ublk servers](../usage/ublk.en.md).
The largest of 1 MB and pool block size multiplied by EC data chunk count is used if not specified.
## qemu_file_mirror_path
- Type: string
When set to an FS directory path (for example, `/mnt/vitastor/`), `qemu-img info` and similar
QAPI commands return the name of the image inside this directory instead of normal
`vitastor://?image=abc` URI as `filename`.
This allows you to then mount this path using [vitastor-nfs](../usage/nfs.en.md) and trick
third-party systems like Veeam which rely on `filename` in the image info but don't support Vitastor.

View File

@@ -25,9 +25,6 @@
- [nbd_max_part](#nbd_max_part)
- [osd_nearfull_ratio](#osd_nearfull_ratio)
- [hostname](#hostname)
- [ublk_queue_depth](#ublk_queue_depth)
- [ublk_max_io_size](#ublk_max_io_size)
- [qemu_file_mirror_path](#qemu_file_mirror_path)
## client_iothread_count
@@ -233,30 +230,3 @@ RDMA и хотите повысить пиковую производитель
[локальные чтения](pool.ru.md#local_reads). По умолчанию для определения имени
хоста используется стандартная функция [gethostname](https://man7.org/linux/man-pages/man2/gethostname.2.html),
но вы также можете задать имя хоста вручную данным параметром.
## ublk_queue_depth
- Тип: целое число
- Значение по умолчанию: 256
Глубина очереди по умолчанию для [ublk-серверов Vitastor](../usage/ublk.ru.md).
## ublk_max_io_size
- Тип: целое число
Максимальный размер запроса ввода-вывода для [ublk-серверов Vitastor](../usage/ublk.ru.md).
Если не задан, используется максимум из 1 МБ и размера блока пула, умноженного на число частей
данных EC-пула.
## qemu_file_mirror_path
- Тип: строка
Если установить эту опцию равной пути к каталогу в ФС, команда `qemu-img info` и подобные
команды QAPI будут возвращать в поле `filename` имя образа внутри заданного каталога вместо
обычного адреса типа `vitastor://?image=abc`.
Это позволяет смонтировать этот путь с помощью [vitastor-nfs](../usage/nfs.ru.md) и обмануть
сторонние системы типа Veeam, которые полагаются на поле `filename` в информации об образе QEMU,
но не поддерживают Vitastor.

View File

@@ -74,7 +74,7 @@ Consider `use_rdmacm` for such networks.
## use_rdmacm
- Type: boolean
- Default: false
- Default: true
Use an alternative implementation of RDMA through RDMA-CM (Connection
Manager). Works with all RDMA networks: Infiniband, iWARP and

View File

@@ -74,7 +74,7 @@ RDMA-устройства, но они не имеют соединения с
## use_rdmacm
- Тип: булево (да/нет)
- Значение по умолчанию: false
- Значение по умолчанию: true
Использовать альтернативную реализацию RDMA на основе RDMA-CM (Connection
Manager). Работает со всеми типами RDMA-сетей: Infiniband, iWARP и

View File

@@ -491,7 +491,7 @@ Can be used to slow down scrubbing if it affects user load too much.
## scrub_list_limit
- Type: integer
- Default: 262144
- Default: 1000
- Can be changed online: yes
Number of objects to list in one listing operation during scrub.

View File

@@ -514,7 +514,7 @@ fsync небезопасным даже с режимом "directsync".
## scrub_list_limit
- Тип: целое число
- Значение по умолчанию: 262144
- Значение по умолчанию: 1000
- Можно менять на лету: да
Размер загружаемых за одну операцию списков объектов в процессе фоновой

View File

@@ -283,36 +283,3 @@
[локальные чтения](pool.ru.md#local_reads). По умолчанию для определения имени
хоста используется стандартная функция [gethostname](https://man7.org/linux/man-pages/man2/gethostname.2.html),
но вы также можете задать имя хоста вручную данным параметром.
- name: ublk_queue_depth
type: int
default: 256
online: false
info: Default queue depth for [Vitastor ublk servers](../usage/ublk.en.md).
info_ru: Глубина очереди по умолчанию для [ublk-серверов Vitastor](../usage/ublk.ru.md).
- name: ublk_max_io_size
type: int
online: false
info: |
Default maximum I/O size for Vitastor [ublk servers](../usage/ublk.en.md).
The largest of 1 MB and pool block size multiplied by EC data chunk count is used if not specified.
info_ru: |
Максимальный размер запроса ввода-вывода для [ublk-серверов Vitastor](../usage/ublk.ru.md).
Если не задан, используется максимум из 1 МБ и размера блока пула, умноженного на число частей
данных EC-пула.
- name: qemu_file_mirror_path
type: string
info: |
When set to an FS directory path (for example, `/mnt/vitastor/`), `qemu-img info` and similar
QAPI commands return the name of the image inside this directory instead of normal
`vitastor://?image=abc` URI as `filename`.
This allows you to then mount this path using [vitastor-nfs](../usage/nfs.en.md) and trick
third-party systems like Veeam which rely on `filename` in the image info but don't support Vitastor.
info_ru: |
Если установить эту опцию равной пути к каталогу в ФС, команда `qemu-img info` и подобные
команды QAPI будут возвращать в поле `filename` имя образа внутри заданного каталога вместо
обычного адреса типа `vitastor://?image=abc`.
Это позволяет смонтировать этот путь с помощью [vitastor-nfs](../usage/nfs.ru.md) и обмануть
сторонние системы типа Veeam, которые полагаются на поле `filename` в информации об образе QEMU,
но не поддерживают Vitastor.

View File

@@ -24,8 +24,6 @@
{{../../installation/kubernetes.en.md}}
{{../../installation/s3.en.md}}
{{../../installation/source.en.md}}
{{../../config.en.md|indent=1}}
@@ -56,8 +54,6 @@
{{../../usage/fio.en.md}}
{{../../usage/ublk.en.md}}
{{../../usage/nbd.en.md}}
{{../../usage/qemu.en.md}}

View File

@@ -26,8 +26,6 @@
{{../../installation/source.ru.md}}
{{../../installation/s3.ru.md}}
{{../../config.ru.md|indent=1}}
{{../../config/common.ru.md|indent=2}}
@@ -56,8 +54,6 @@
{{../../usage/fio.ru.md}}
{{../../usage/ublk.ru.md}}
{{../../usage/nbd.ru.md}}
{{../../usage/qemu.ru.md}}

View File

@@ -51,7 +51,7 @@
Рассмотрите включение `use_rdmacm` для таких сетей.
- name: use_rdmacm
type: bool
default: false
default: true
info: |
Use an alternative implementation of RDMA through RDMA-CM (Connection
Manager). Works with all RDMA networks: Infiniband, iWARP and

View File

@@ -566,7 +566,7 @@
сильно влияет на пользовательскую нагрузку.
- name: scrub_list_limit
type: int
default: 262144
default: 1000
online: true
info: |
Number of objects to list in one listing operation during scrub.

View File

@@ -26,9 +26,9 @@ at Vitastor Kubernetes operator: https://github.com/Antilles7227/vitastor-operat
The instruction is very simple.
1. Download a Docker image of the desired version: \
`docker pull vitalif/vitastor:v2.4.0`
`docker pull vitastor:v2.2.0`
2. Install scripts to the host system: \
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitalif/vitastor:v2.4.0 install.sh`
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:v2.2.0 install.sh`
3. Reload udev rules: \
`udevadm control --reload-rules`

View File

@@ -25,9 +25,9 @@ Vitastor можно установить в Docker/Podman. При этом etcd,
Инструкция по установке максимально простая.
1. Скачайте Docker-образ желаемой версии: \
`docker pull vitalif/vitastor:v2.4.0`
`docker pull vitastor:v2.2.0`
2. Установите скрипты в хост-систему командой: \
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitalif/vitastor:v2.4.0 install.sh`
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:v2.2.0 install.sh`
3. Перезагрузите правила udev: \
`udevadm control --reload-rules`

View File

@@ -11,20 +11,12 @@
- Trust Vitastor package signing key:
`wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
- Add Vitastor package repository to your /etc/apt/sources.list:
- Debian 13 (Trixie/Sid): `deb https://vitastor.io/debian trixie main`
- Debian 12 (Bookworm): `deb https://vitastor.io/debian bookworm main`
- Debian 12 (Bookworm/Sid): `deb https://vitastor.io/debian bookworm main`
- Debian 11 (Bullseye): `deb https://vitastor.io/debian bullseye main`
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
- Ubuntu 22.04 (Jammy): `deb https://vitastor.io/debian jammy main`
- Ubuntu 24.04 (Noble): `deb https://vitastor.io/debian noble main`
- Add `-oldstable` to bookworm/bullseye/buster in this line to install the last
stable version from 0.9.x branch instead of 1.x
- To always prefer vitastor-patched QEMU and Libvirt versions, add the following to `/etc/apt/preferences`:
```
Package: *
Pin: origin "vitastor.io"
Pin-Priority: 501
```
- Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`
## CentOS
@@ -50,6 +42,7 @@
recommended because io_uring is a relatively new technology and there is
at least one bug which reproduces with io_uring and HP SmartArray
controllers in 5.4
- liburing 0.4 or newer
- lp_solve
- etcd 3.4.15 or newer. Earlier versions won't work because of various bugs,
for example [#12402](https://github.com/etcd-io/etcd/pull/12402).

View File

@@ -11,20 +11,12 @@
- Добавьте ключ репозитория Vitastor:
`wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
- Добавьте репозиторий Vitastor в /etc/apt/sources.list:
- Debian 13 (Trixie/Sid): `deb https://vitastor.io/debian trixie main`
- Debian 12 (Bookworm): `deb https://vitastor.io/debian bookworm main`
- Debian 12 (Bookworm/Sid): `deb https://vitastor.io/debian bookworm main`
- Debian 11 (Bullseye): `deb https://vitastor.io/debian bullseye main`
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
- Ubuntu 22.04 (Jammy): `deb https://vitastor.io/debian jammy main`
- Ubuntu 24.04 (Noble): `deb https://vitastor.io/debian noble main`
- Добавьте `-oldstable` к слову bookworm/bullseye/buster в этой строке, чтобы
установить последнюю стабильную версию из ветки 0.9.x вместо 1.x
- Чтобы всегда предпочитались версии пакетов QEMU и Libvirt с патчами Vitastor, добавьте в `/etc/apt/preferences`:
```
Package: *
Pin: origin "vitastor.io"
Pin-Priority: 501
```
- Установите пакеты: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`
## CentOS
@@ -49,6 +41,7 @@
- Ядро Linux 5.4 или новее, для поддержки io_uring. Рекомендуется даже 5.8,
так как io_uring - относительно новый интерфейс и в версиях до 5.8 встречались
некоторые баги, например, зависание с io_uring и контроллером HP SmartArray
- liburing 0.4 или новее
- lp_solve
- etcd 3.4.15 или новее. Более старые версии не будут работать из-за разных багов,
например, [#12402](https://github.com/etcd-io/etcd/pull/12402).

View File

@@ -6,10 +6,10 @@
# Proxmox VE
To enable Vitastor support in Proxmox Virtual Environment (6.4-8.x are supported):
To enable Vitastor support in Proxmox Virtual Environment (6.4-8.1 are supported):
- Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts:
trixie for 9.0+, bookworm for 8.1+, pve8.0 for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
bookworm for 8.1, pve8.0 for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
- Install vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* or see note) packages from Vitastor repository
- Define storage in `/etc/pve/storage.cfg` (see below)
- Block network access from VMs to Vitastor network (to OSDs and etcd),

View File

@@ -6,10 +6,10 @@
# Proxmox VE
Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-8.x):
Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-8.1):
- Добавьте соответствующий Debian-репозиторий Vitastor в sources.list на хостах Proxmox:
trixie для 9.0+, bookworm для 8.1+, pve8.0 для 8.0, bullseye для 7.4, pve7.3 для 7.3, pve7.2 для 7.2, pve7.1 для 7.1, buster для 6.4
bookworm для 8.1, pve8.0 для 8.0, bullseye для 7.4, pve7.3 для 7.3, pve7.2 для 7.2, pve7.1 для 7.1, buster для 6.4
- Установите пакеты vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* или см. сноску) из репозитория Vitastor
- Определите тип хранилища в `/etc/pve/storage.cfg` (см. ниже)
- Обязательно заблокируйте доступ от виртуальных машин к сети Vitastor (OSD и etcd), т.к. Vitastor (пока) не поддерживает аутентификацию

View File

@@ -15,7 +15,7 @@
- gcc and g++ 8 or newer, clang 10 or newer, or other compiler with C++11 plus
designated initializers support from C++20
- CMake
- jerasure headers and libraries
- liburing, jerasure headers and libraries
- ISA-L, libibverbs and librdmacm headers and libraries (optional)
- tcmalloc (google-perftools-dev)

View File

@@ -15,7 +15,7 @@
- gcc и g++ >= 8, либо clang >= 10, либо другой компилятор с поддержкой C++11 плюс
назначенных инициализаторов (designated initializers) из C++20
- CMake
- Заголовки и библиотеки jerasure
- Заголовки и библиотеки liburing, jerasure
- Опционально - заголовки и библиотеки ISA-L, libibverbs, librdmacm
- tcmalloc (google-perftools-dev)

View File

@@ -10,17 +10,8 @@ Copyright (c) Vitaliy Filippov (vitalif [at] yourcmc.ru), 2019+
Join Vitastor Telegram Chat: https://t.me/vitastor
License: VNPL 1.1 for server-side code and dual VNPL 1.1 + GPL 2.0+ for client tools.
Server-side code is licensed only under the terms of VNPL.
Client libraries (cluster_client and so on) are dual-licensed under the same
VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
software like QEMU and fio.
## VNPL
Vitastor Network Public License 1.1 (VNPL 1.1) is a copyleft license based on
All server-side code (OSD, Monitor and so on) is licensed under the terms of
Vitastor Network Public License 1.1 (VNPL 1.1), a copyleft license based on
GNU GPLv3.0 with the additional "Network Interaction" clause which requires
opensourcing all programs directly or indirectly interacting with Vitastor
through a computer network and expressly designed to be used in conjunction
@@ -29,83 +20,18 @@ the terms of the same license, but also under the terms of any GPL-Compatible
Free Software License, as listed by the Free Software Foundation.
This is a stricter copyleft license than the Affero GPL.
The idea of VNPL is, in addition to modules linked to Vitastor code in a single
binary file, to extend copyleft action to micro-service modules only interacting
with it over the network.
Please note that VNPL doesn't require you to open the code of proprietary
software running inside a VM if it's not specially designed to be used with
Vitastor.
Basically, you can't use the software in a proprietary environment to provide
its functionality to users without opensourcing all intermediary components
standing between the user and Vitastor or purchasing a commercial license
from the author 😀.
At the same time, VNPL doesn't impose any restrictions on software *not specially designed*
to be used with Vitastor, for example, on Windows running inside a VM with a Vitastor disk.
Client libraries (cluster_client and so on) are dual-licensed under the same
VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
software like QEMU and fio.
## Explanation
Network copyleft is governed by the clause **13. Remote Network Interaction** of VNPL.
A program is considered to be a "Proxy Program" if it meets both conditions:
- It is specially designed to be used with Vitastor. Basically, it means that the program
has any functionality specific to Vitastor and thus "knows" that it works with Vitastor,
not with something random.
- It interacts with Vitastor directly or indirectly through any programming interface,
including API, CLI, network or any wrapper (also considered a Proxy Program itself).
If, in addition to that:
- You give any user an opportunity to interact with Vitastor directly or indirectly through
any computer interface including the network or any number of wrappers (Proxy Programs).
Then VNPL requires you to publish the code of all above Proxy Programs to all above users
under the terms of any GPL-compatible license - that is, GPL, LGPL, MIT/BSD or Apache 2,
because "GPL compatibility" is treated as an ability to legally include licensed code in
a GPL application.
So, if you have a "Proxy Program", but it's not open to the user who directly or indirectly
interacts with Vitastor - you are forbidden to use Vitastor under the terms of VNPL and you
need a commercial license which doesn't contain open-source requirements.
## Examples
- Vitastor Kubernetes CSI driver which creates PersistentVolumes by calling `vitastor-cli create`.
- Yes, it interacts with Vitastor through vitastor-cli.
- Yes, it is designed specially for use with Vitastor (it makes no sense otherwise).
- So, CSI driver **definitely IS** a Proxy Program and must be published under the terms of
a free software license.
- Windows, installed in a VM with the system disk on Vitastor storage.
- Yes, it interacts with Vitastor indirectly - it reads and writes data through the block
device interface, emulated by QEMU.
- No, it definitely isn't designed specially for use with Vitastor - Windows was created long
ago before Vitastor and doesn't know anything about it.
- So, Windows **definitely IS NOT** a Proxy Program and VNPL doesn't require its code to be opened.
- Cloud control panel which makes requests to Vitastor Kubernetes CSI driver.
- Yes, it interacts with Vitastor indirectly through the CSI driver, which is a Proxy Program.
- May or may not be designed specially for use with Vitastor. How to determine exactly?
Imagine that Vitastor is replaced with any other storage (for example, with a proprietary one).
Do control panel functions change in any way? If they do (for example, if snapshots stop working),
then the panel contains specific functionality and thus is designed specially for use with Vitastor.
Otherwise, the panel is universal and isn't designed specially for Vitastor.
- So, whether you are required to open-source the panel also **depends** on whether it
contains specific functionality or not.
## Why?
Because I believe in the spirit of copyleft (Linux wouldn't have become so popular without the GPL!)
and, at the same time, I want to have a way to monetize the product.
Existing licenses, including AGPL, are useless for this purpose with an SDS: an SDS is deeply
internal software which is almost certainly invisible to the user, and thus AGPL doesn't
require anyone to open the code even if they make a proprietary fork.
And, in fact, the current situation in the world, where GPL is thought to only restrict direct
linking of programs into a single executable file, isn't quite correct. Nowadays, programs
are more often linked with network API calls, not with /usr/bin/ld, and a software product
may consist of dozens of microservices interacting with each other over the network.
That's why we need VNPL to keep the license sufficiently copyleft.
## License Texts
- VNPL 1.1 in English: [VNPL-1.1.txt](../../VNPL-1.1.txt)
- VNPL 1.1 in Russian: [VNPL-1.1-RU.txt](../../VNPL-1.1-RU.txt)
- GPL 2.0: [GPL-2.0.txt](../../GPL-2.0.txt)
You can find the full text of VNPL-1.1 in the file [VNPL-1.1.txt](../../VNPL-1.1.txt).
GPL 2.0 is also included in this repository as [GPL-2.0.txt](../../GPL-2.0.txt).

View File

@@ -12,14 +12,6 @@
Лицензия: VNPL 1.1 на серверный код и двойная VNPL 1.1 + GPL 2.0+ на клиентский.
Серверные компоненты распространяются только на условиях VNPL.
Клиентские библиотеки распространяются на условиях двойной лицензии VNPL 1.1
и также на условиях GNU GPL 2.0 или более поздней версии. Так сделано в целях
совместимости с таким ПО, как QEMU и fio.
## VNPL
VNPL - "сетевой копилефт", собственная свободная копилефт-лицензия
Vitastor Network Public License 1.1, основанная на GNU GPL 3.0 с дополнительным
условием "Сетевого взаимодействия", требующим распространять все программы,
@@ -37,70 +29,9 @@ Vitastor Network Public License 1.1, основанная на GNU GPL 3.0 с д
На Windows и любое другое ПО, не разработанное *специально* для использования
вместе с Vitastor, никакие ограничения не накладываются.
## Пояснение
Клиентские библиотеки распространяются на условиях двойной лицензии VNPL 1.1
и также на условиях GNU GPL 2.0 или более поздней версии. Так сделано в целях
совместимости с таким ПО, как QEMU и fio.
Сетевой копилефт регулируется пунктом лицензии **13. Удалённое сетевое взаимодействие**.
Программа считается "прокси-программой", если верны оба условия:
- Она создана специально для работы вместе с Vitastor. По сути это означает, что программа
должна иметь специфичный для Vitastor функционал, то есть, "знать", что она взаимодействует
именно с Vitastor.
- Она прямо или косвенно взаимодействует с Vitastor через абсолютно любой программный
интерфейс, включая любые способы вызова: API, CLI, сеть или через какую-то обёртку (в
свою очередь тоже являющуюся прокси-программой).
Если в дополнение к этому также:
- Вы предоставляете любому пользователю возможность взаимодействовать с Vitastor по сети,
опять-таки, через любой интерфейс или любую серию "обёрток" (прокси-программ)
То, согласно VNPL, вы должны открыть код "прокси-программ" **таким пользователям** на условиях
любой GPL-совместимой лицензии - то есть, GPL, LGPL, MIT/BSD или Apache 2 - "совместимость с GPL"
понимается как возможность включать лицензируемый код в GPL-приложение.
Соответственно, если у вас есть "прокси-программа", но её код не открыт пользователю,
который прямо или косвенно взаимодействует с Vitastor - вам запрещено использовать Vitastor
на условиях VNPL и вам нужна коммерческая лицензия, не содержащая требований об открытии кода.
## Примеры
- Kubernetes CSI-драйвер Vitastor, создающий PersistentVolume с помощью вызова `vitastor-cli create`.
- Да, взаимодействует с Vitastor через vitastor-cli.
- Да, создавался специально для работы с Vitastor (иначе в чём же ещё его смысл).
- Значит, CSI-драйвер **точно считается** "прокси-программой" и должен быть открыт под свободной
лицензией.
- Windows, установленный в виртуальную машину на диске Vitastor.
- Да, взаимодействует с Vitastor "прямо или косвенно" - пишет и читает данные через интерфейс
блочного устройства, эмулируемый QEMU.
- Нет, точно не создан *специально для работы с Vitastor* - когда его создавали, никакого
Vitastor ещё и в помине не было.
- Значит, Windows **точно не считается** "прокси-программой" и на него требования VNPL не распространяются.
- Панель управления облака, делающая запросы к Kubernetes CSI-драйверу Vitastor.
- Да, взаимодействует с Vitastor косвенно через CSI-драйвер, являющийся "прокси-программой".
- Сходу не известно, создавалась ли конкретно для работы с Vitastor. Как понять, да или нет?
Представьте, что Vitastor заменён на любую другую систему хранения (например, на проприетарную).
Работа панели управления изменится? Если да (например, перестанут работать снапшоты) - значит,
панель содержит специфичный функционал и "создана специально для работы с Vitastor".
Если нет - значит, специфичного функционала панель не содержит и в принципе она универсальна.
- Нужно ли открывать панель - **зависит** от того, содержит она специфичный функционал или нет.
## Почему так?
Потому что я одновременно верю в дух копилефт-лицензий (Linux не стал бы так популярен,
если бы не GPL!) и хочу иметь возможность монетизации продукта.
При этом использовать даже AGPL для программной СХД бессмысленно - это глубоко внутреннее
ПО, которое пользователь почти наверняка не увидит вообще, поэтому и открывать код никому
никогда не придётся, даже при создании производного продукта.
Да и в целом сложившаяся в мире ситуация, при которой действие GPL ограничивается только
прямым связыванием в один исполняемый файл, не очень корректна. В настоящее время программы
гораздо чаще интегрируют сетевыми вызовами, а не с помощью /usr/bin/ld, и общий программный
продукт может состоять из нескольких десятков микросервисов, взаимодействующих по сети.
Поэтому для сохранения достаточной "копилефтности" и придумана VNPL.
## Тексты лицензий
- VNPL 1.1 на английском языке: [VNPL-1.1.txt](../../VNPL-1.1.txt)
- VNPL 1.1 на русском языке: [VNPL-1.1-RU.txt](../../VNPL-1.1-RU.txt)
- GPL 2.0: [GPL-2.0.txt](../../GPL-2.0.txt)
Вы можете найти полный текст VNPL 1.1 на английском языке в файле [VNPL-1.1.txt](../../VNPL-1.1.txt),
VNPL 1.1 на русском языке в файле [VNPL-1.1-RU.txt](../../VNPL-1.1-RU.txt), а GPL 2.0 в файле [GPL-2.0.txt](../../GPL-2.0.txt).

View File

@@ -52,7 +52,7 @@
- Generic user-space client library
- [Native QEMU driver](../usage/qemu.en.md)
- [Loadable fio engine for benchmarks](../usage/fio.en.md)
- [UBLK](../usage/ublk.en.md) and [NBD](../usage/nbd.en.md) servers for kernel mounts
- [NBD proxy for kernel mounts](../usage/nbd.en.md)
- [Simplified NFS proxy for file-based image access emulation (suitable for VMWare)](../usage/nfs.en.md#pseudo-fs)
## Roadmap

View File

@@ -54,7 +54,7 @@
- Общая пользовательская клиентская библиотека для работы с кластером
- [Драйвер диска для QEMU](../usage/qemu.ru.md)
- [Драйвер диска для утилиты тестирования производительности fio](../usage/fio.ru.md)
- [UBLK](../usage/ublk.ru.md) и [NBD](../usage/nbd.ru.md) серверы для монтирования образов ядром ("блочное устройство в режиме пользователя")
- [NBD-прокси для монтирования образов ядром](../usage/nbd.ru.md) ("блочное устройство в режиме пользователя")
- [Упрощённая NFS-прокси для эмуляции файлового доступа к образам (подходит для VMWare)](../usage/nfs.ru.md#псевдо-фс)
## Планы развития

View File

@@ -100,14 +100,12 @@ List images (only matching `<glob>` pattern(s) if passed).
Options:
```
--exact Do not match glob patterns as names, select only exact name matches.
-p|--pool POOL Filter images by pool ID or name
-l|--long Also report allocated size and I/O statistics
--del Also include delete operation statistics
--sort FIELD Sort by specified field (name, size, used_size, <read|write|delete>_<iops|bps|lat|queue>)
-r|--reverse Sort in descending order
-n|--count N Only list first N items
--tree Show image snapshot/clone tree
```
Example output:

View File

@@ -102,14 +102,12 @@ kaveri 2/1 32 0 B 10 G 0 B 100% 0%
Опции:
```
--exact Не применять ФС-шаблоны к именам, выводить только точные совпадения
-p|--pool POOL Фильтровать образы по пулу (ID или имени)
-l|--long Также выводить статистику занятого места и ввода-вывода
--del Также выводить статистику операций удаления
--sort FIELD Сортировать по заданному полю (name, size, used_size, <read|write|delete>_<iops|bps|lat|queue>)
-r|--reverse Сортировать в обратном порядке
-n|--count N Показывать только первые N записей
--tree Вывести снапшоты и клоны в виде дерева
```
Пример вывода:

View File

@@ -73,8 +73,6 @@ Options (automatic mode):
--max_other 10%
Use disks for OSD data even if they already have non-Vitastor partitions,
but only if these take up no more than this percent of disk space.
--dry-run
Check and print new OSD count for each disk but do not actually create them.
```
Options (single-device mode):

View File

@@ -74,8 +74,6 @@ vitastor-disk - инструмент командной строки для уп
--max_other 10%
Использовать диски под данные OSD, даже если на них уже есть не-Vitastor-овые
разделы, но только в случае, если они занимают не более данного процента диска.
--dry-run
Проверить и вывести число новых OSD для каждого диска, но не создавать их.
```
Опции для режима одного OSD:

View File

@@ -89,8 +89,6 @@ POSIX features currently not implemented in VitastorFS:
instead of actually allocated space
- Access times (`atime`) are not tracked (like `-o noatime`)
- Modification time (`mtime`) is updated lazily every second (like `-o lazytime`)
- Permission enforcement is disabled by default (and the Linux NFS client doesn't
enforce permissions either). Use `--enforce 1` to enable it.
Other notable missing features which should be addressed in the future:
- Inode ID reuse. Currently inode IDs always grow, the limit is 2^48 inodes, so
@@ -260,5 +258,4 @@ Options:
| `--nfspath <PATH>` | set NFS export path to \<PATH> (default is /) |
| `--pidfile <FILE>` | write process ID to the specified file |
| `--logfile <FILE>` | log to the specified file |
| `--enforce 1` | enforce permissions at the server side (no by default) |
| `--foreground 1` | stay in foreground, do not daemonize |

View File

@@ -91,8 +91,6 @@ JSON-формате :-). Для инспекции содержимого БД
stat(2), так что `du` всегда показывает сумму размеров файлов, а не фактически занятое место
- Времена доступа (`atime`) не отслеживаются (как будто ФС смонтирована с `-o noatime`)
- Времена модификации (`mtime`) отслеживаются асинхронно (как будто ФС смонтирована с `-o lazytime`)
- Привилегии доступа по умолчанию не проверяются сервером (клиент NFS Linux их также не проверяет).
Чтобы включить проверки, используйте опцию `--enforce 1`.
Другие недостающие функции, которые нужно добавить в будущем:
- Переиспользование номеров инодов. В текущей реализации номера инодов всё время
@@ -272,5 +270,4 @@ VitastorFS из GPUDirect.
| `--nfspath <PATH>` | установить путь NFS-экспорта в \<PATH> (по умолчанию /) |
| `--pidfile <FILE>` | записать ID процесса в заданный файл |
| `--logfile <FILE>` | записывать логи в заданный файл |
| `--enforce 1` | проверять права доступа на стороне сервера (по умолчанию нет) |
| `--foreground 1` | не уходить в фон после запуска |

View File

@@ -130,16 +130,23 @@ Linux kernel, starting with version 5.15, supports a new interface for attaching
to the host - VDUSE (vDPA Device in Userspace). QEMU, starting with 7.2, has support for
exporting QEMU block devices over this protocol using qemu-storage-daemon.
VDUSE advantages:
- VDUSE copies memory 1 time instead of 2, and is thus faster than [NBD](nbd.en.md) for linear read/write.
- It doesn't have NBD timeout problem - the device doesn't die if an operation executes for too long.
VDUSE is currently the best interface to attach Vitastor disks as kernel devices because:
- It avoids data copies and thus achieves much better performance than [NBD](nbd.en.md)
- It doesn't have NBD timeout problem - the device doesn't die if an operation executes for too long
- It doesn't have hung device problem - if the userspace process dies it can be restarted (!)
and block device will continue operation (UBLK can do it too).
- It doesn't seem to have the device number limit (UBLK also doesn't).
and block device will continue operation
- It doesn't seem to have the device number limit
At the same time, VDUSE may be slower or faster than [UBLK](ublk.en.md) for linear read/write,
and iops-wise it's sometimes even slower than NBD. See performance comparison examples at the page [UBLK](ublk.en.md).
Example performance comparison:
| | direct fio | NBD | VDUSE |
|----------------------|-------------|-------------|-------------|
| linear write | 3.85 GB/s | 1.12 GB/s | 3.85 GB/s |
| 4k random write Q128 | 240000 iops | 120000 iops | 178000 iops |
| 4k random write Q1 | 9500 iops | 7620 iops | 7640 iops |
| linear read | 4.3 GB/s | 1.8 GB/s | 2.85 GB/s |
| 4k random read Q128 | 287000 iops | 140000 iops | 189000 iops |
| 4k random read Q1 | 9600 iops | 7640 iops | 7780 iops |
To try VDUSE you need at least Linux 5.15, built with VDUSE support
(CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
@@ -186,12 +193,3 @@ To remove the device:
vdpa dev del test1
kill <qemu-storage-daemon_process_PID>
```
## Veeam
The Vitastor QEMU driver has a feature that allows you to trick third-party systems like Veeam, which are unable to parse qemu-img
vitastor URIs: [qemu_file_mirror_path](../config/client.en.md#qemu_file_mirror_path).
To make such systems work, you should set this option to an FS directory path (for example, `/mnt/vitastor/`) and
mount this directory using [`vitastor-nfs mount --block`](../usage/nfs.en.md). This makes them access
your images as files and, hopefully, succeed in doing their normal job :).

View File

@@ -132,16 +132,24 @@ qemu-system-x86_64 -enable-kvm -m 2048 -M accel=kvm,memory-backend=mem \
к системе - VDUSE (vDPA Device in Userspace), а в QEMU, начиная с версии 7.2, есть поддержка
экспорта блочных устройств QEMU по этому протоколу через qemu-storage-daemon.
Преимущества VDUSE:
VDUSE - на данный момент лучший интерфейс для подключения дисков Vitastor в виде блочных
устройств на уровне ядра, ибо:
- VDUSE не копирует данные и поэтому достигает значительно лучшей производительности, чем [NBD](nbd.ru.md)
- Также оно не имеет проблемы NBD-таймаута - устройство не умирает, если операция выполняется слишком долго
- Также оно не имеет проблемы подвисающих устройств - если процесс-обработчик умирает, его можно
перезапустить (!) и блочное устройство продолжит работать
- По-видимому, у него нет предела числа подключаемых в систему устройств
- VDUSE копирует данные 1 раз, а не 2, и поэтому он быстрее, чем [NBD](nbd.ru.md) при линейном доступе.
- VDUSE не имеет проблемы NBD-таймаута - устройство не умирает, если операция выполняется слишком долго.
- VDUSE не имеет проблемы подвисающих устройств - если процесс-обработчик умирает, его можно
перезапустить (!) и блочное устройство продолжит работать (в UBLK это тоже поддерживается).
- По-видимому, у него нет предела числа подключаемых в систему устройств (в UBLK лимита тоже нет).
Пример сравнения производительности:
Однако, при линейном доступе VDUSE может быть медленнее UBLK (а может быть и быстрее), а по iops
VDUSE иногда даже медленнее NBD. Пример сравнения производительности смотрите на странице [UBLK](ublk.ru.md).
| | Прямой fio | NBD | VDUSE |
|--------------------------|-------------|-------------|-------------|
| линейная запись | 3.85 GB/s | 1.12 GB/s | 3.85 GB/s |
| 4k случайная запись Q128 | 240000 iops | 120000 iops | 178000 iops |
| 4k случайная запись Q1 | 9500 iops | 7620 iops | 7640 iops |
| линейное чтение | 4.3 GB/s | 1.8 GB/s | 2.85 GB/s |
| 4k случайное чтение Q128 | 287000 iops | 140000 iops | 189000 iops |
| 4k случайное чтение Q1 | 9600 iops | 7640 iops | 7780 iops |
Чтобы попробовать VDUSE, вам нужно ядро Linux как минимум версии 5.15, собранное с поддержкой
VDUSE (CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
@@ -188,12 +196,3 @@ vdpa dev add name test1 mgmtdev vduse
vdpa dev del test1
kill <PID_процесса_qemu-storage-daemon>
```
## Veeam
Драйвер Vitastor QEMU имеет функцию, которая позволяет обманывать сторонние системы типа Veeam, которые
не могут сами по себе разобрать адреса дисков в vitastor: [qemu_file_mirror_path](../config/client.ru.md#qemu_file_mirror_path).
Чтобы заставить такие системы работать, вам нужно установить эту опцию равной пути к некоторому каталогу
в ФС (например, `/mnt/vitastor/`) и примонтировать этот каталог с помощью [`vitastor-nfs mount --block`](../usage/nfs.ru.md).
Они начнут обращаться к образам как к файлам и, вероятно, смогут заработать корректно :).

View File

@@ -1,116 +0,0 @@
[Documentation](../../README.md#documentation) → Usage → UBLK
-----
[Читать на русском](ublk.ru.md)
# UBLK
[ublk](https://docs.kernel.org/block/ublk.html) is a new io_uring-based Linux interface
for user-space block device drivers, available since Linux 6.0.
It's not zero-copy, but it's still a fast implementation: it outperforms both [NBD](nbd.en.md)
and [VDUSE](qemu.en.md#vduse) iops-wise, and it may or may not outperform VDUSE in linear I/O MB/s.
ublk also allows recovering devices even if the server (vitastor-ublk process) dies.
## Example performance comparison
TCP (100G), 3 hosts each with 6 NVMe OSDs, 3 replicas, single client
| | direct fio | NBD | VDUSE | UBLK |
|----------------------|-------------|-------------|------------|-------------|
| linear write | 3807 MB/s | 1832 MB/s | 3226 MB/s | 3027 MB/s |
| linear read | 3067 MB/s | 1885 MB/s | 1800 MB/s | 2076 MB/s |
| 4k random write Q128 | 128624 iops | 91060 iops | 94621 iops | 149450 iops |
| 4k random read Q128 | 117769 iops | 153408 iops | 93157 iops | 171987 iops |
| 4k random write Q1 | 8090 iops | 6442 iops | 6316 iops | 7272 iops |
| 4k random read Q1 | 9474 iops | 7200 iops | 6840 iops | 8038 iops |
RDMA (100G), 3 hosts each with 6 NVMe OSDs, 3 replicas, single client
| | direct fio | NBD | VDUSE | UBLK |
|----------------------|-------------|-------------|-------------|-------------|
| linear write | 6998 MB/s | 1878 MB/s | 4249 MB/s | 3140 MB/s |
| linear read | 8628 MB/s | 3389 MB/s | 5062 MB/s | 3674 MB/s |
| 4k random write Q128 | 222541 iops | 181589 iops | 138281 iops | 218222 iops |
| 4k random read Q128 | 412647 iops | 239987 iops | 151663 iops | 269583 iops |
| 4k random write Q1 | 11601 iops | 8592 iops | 9111 iops | 10000 iops |
| 4k random read Q1 | 10102 iops | 7788 iops | 8111 iops | 8965 iops |
## Commands
vitastor-ublk supports the following commands:
- [map](#map)
- [unmap](#unmap)
- [ls](#ls)
## map
To create a local block device for a Vitastor image run:
```
vitastor-ublk map [/dev/ublkbN] --image testimg
```
It will output a block device name like /dev/ublkb0 which you can then use as a normal disk.
You can also use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.
vitastor-ublk supports all usual Vitastor configuration options like `--config_path <path_to_config>` plus ublk-specific:
* `--recover` \
Recover a mapped device if the previous ublk server is dead.
* `--queue_depth 256` \
Maximum queue size for the device.
* `--max_io_size 1M` \
Maximum single I/O size for the device. Default: `max(1 MB, pool block size * EC part count)`.
* `--readonly` \
Make the device read-only.
* `--hdd` \
Mark the device as rotational.
* `--logfile /path/to/log/file.txt` \
Write log messages to the specified file instead of dropping them (in background mode)
or printing them to the standard output (in foreground mode).
* `--dev_num N` \
Use the specified device /dev/ublkbN instead of automatic selection (alternative syntax
to /dev/ublkbN positional parameter).
* `--foreground 1` \
Stay in foreground, do not daemonize.
Note that `ublk_queue_depth` and `ublk_max_io_size` may also be specified
in `/etc/vitastor/vitastor.conf` or in other configuration file specified with `--config_path`.
## unmap
To unmap the device run:
```
vitastor-ublk unmap /dev/ublkb0
```
## ls
```
vitastor-ublk ls [--json]
```
List mapped images.
Example output (normal format):
```
/dev/ublkb0
image: bench
pid: 584536
/dev/ublkb1
image: bench1
pid: 584546
```
Example output (JSON format):
```
{"/dev/ublkb0": {"image": "bench", "pid": 584536}, "/dev/ublkb1": {"image": "bench1", "pid": 584546}}
```

View File

@@ -1,121 +0,0 @@
[Документация](../../README-ru.md#документация) → Использование → UBLK
-----
[Read in English](ublk.en.md)
# UBLK
[ublk](https://docs.kernel.org/block/ublk.html) - это новый Linux-интерфейс на основе io_uring
для реализации блочных устройств в пространстве пользователя, доступный, начиная с Linux 6.0.
ublk тоже копирует память (т.е. не является zero-copy), но по IOPS всё равно обгоняет и
[NBD](nbd.ru.md), и [VDUSE](qemu.ru.md#vduse), и иногда может даже обгонять VDUSE по
скорости линейного доступа. Также ublk позволяет оживлять устройства, у которых умер
сервер (процесс-обработчик vitastor-ublk).
## Пример сравнения производительности
TCP (100G), 3 сервера с 6 NVMe OSD каждый, 3 реплики, один клиент
| | Прямой fio | NBD | VDUSE | UBLK |
|--------------------------|-------------|-------------|------------|-------------|
| линейная запись | 3807 MB/s | 1832 MB/s | 3226 MB/s | 3027 MB/s |
| линейное чтение | 3067 MB/s | 1885 MB/s | 1800 MB/s | 2076 MB/s |
| 4k случайная запись Q128 | 128624 iops | 91060 iops | 94621 iops | 149450 iops |
| 4k случайное чтение Q128 | 117769 iops | 153408 iops | 93157 iops | 171987 iops |
| 4k случайная запись Q1 | 8090 iops | 6442 iops | 6316 iops | 7272 iops |
| 4k случайное чтение Q1 | 9474 iops | 7200 iops | 6840 iops | 8038 iops |
RDMA (100G), 3 сервера с 6 NVMe OSD каждый, 3 реплики, один клиент
| | Прямой fio | NBD | VDUSE | UBLK |
|--------------------------|-------------|-------------|-------------|-------------|
| линейная запись | 6998 MB/s | 1878 MB/s | 4249 MB/s | 3140 MB/s |
| линейное чтение | 8628 MB/s | 3389 MB/s | 5062 MB/s | 3674 MB/s |
| 4k случайная запись Q128 | 222541 iops | 181589 iops | 138281 iops | 218222 iops |
| 4k случайное чтение Q128 | 412647 iops | 239987 iops | 151663 iops | 269583 iops |
| 4k случайная запись Q1 | 11601 iops | 8592 iops | 9111 iops | 10000 iops |
| 4k случайное чтение Q1 | 10102 iops | 7788 iops | 8111 iops | 8965 iops |
## Команды
vitastor-ublk поддерживает следующие команды:
- [map](#map)
- [unmap](#unmap)
- [ls](#ls)
## map
Чтобы создать локальное блочное устройство для образа, выполните команду:
```
vitastor-ublk map [/dev/ublkbN] --image testimg
```
Команда напечатает название блочного устройства вида /dev/ublkb0, которое потом можно
будет использовать как обычный диск.
Для обращения по номеру инода, аналогично другим командам, можно использовать опции
`--pool <POOL> --inode <INODE> --size <SIZE>` вместо `--image testimg`.
vitastor-ublk поддерживает все обычные опции Vitastor, например, `--config_path <path_to_config>`,
плюс специфичные для ublk:
* `--recover` \
Восстановить ранее подключённое устройство, у которого умер обработчик.
* `--queue_depth 256` \
Максимальная глубина очереди устройства.
* `--max_io_size 1M` \
Максимальный размер запроса ввода-вывода для устройства. По умолчанию: `max(1 MB, блок данных пула * число частей данных EC)`.
* `--readonly` \
Подключить устройство в режиме только для чтения.
* `--hdd` \
Пометить устройство как вращающийся жёсткий диск (флаг rotational).
* `--logfile /path/to/log/file.txt` \
Писать сообщения о процессе работы в заданный файл, вместо пропуска их
при фоновом режиме запуска или печати на стандартный вывод при запуске
в консоли с `--foreground 1`.
* `--dev_num N` \
Использовать заданное устройство `/dev/ublkbN` вместо автоматического подбора.
* `--foreground 1` \
Не уводить процесс в фоновый режим.
Обратите внимание, что опции `ublk_queue_depth` и `ublk_max_io_size` можно
также задавать в `/etc/vitastor/vitastor.conf` или в другом файле конфигурации,
заданном опцией `--config_path`.
## unmap
Для отключения устройства выполните:
```
vitastor-ublk unmap /dev/ublkb0
```
## ls
```
vitastor-ublk ls [--json]
```
Вывести подключённые устройства.
Пример вывода в обычном формате:
```
/dev/ublkb0
image: bench
pid: 584536
/dev/ublkb1
image: bench1
pid: 584546
```
Пример вывода в JSON-формате:
```
{"/dev/ublkb0": {"image": "bench", "pid": 584536}, "/dev/ublkb1": {"image": "bench1", "pid": 584546}}
```

View File

@@ -96,7 +96,6 @@ class Mon
}
else
{
res.setHeader('Content-Type', 'text/plain; version=0.0.4; charset=utf-8');
res.write(export_prometheus_metrics(this.state));
}
}

View File

@@ -15,7 +15,7 @@ function get_osd_tree(global_config, state)
const stat = state.osd.stats[osd_num];
const osd_cfg = state.config.osd[osd_num];
let reweight = osd_cfg == null ? 1 : Number(osd_cfg.reweight);
if (isNaN(reweight) || reweight < 0 || reweight > 1)
if (isNaN(reweight) || reweight < 0 || reweight > 0)
reweight = 1;
if (stat && stat.size && reweight && (state.osd.state[osd_num] || Number(stat.time) >= down_time ||
osd_cfg && osd_cfg.noout))
@@ -179,7 +179,7 @@ function filter_osds_by_block_layout(orig_tree, osd_stats, block_size, bitmap_gr
if (orig_tree[osd].level === 'osd')
{
const osd_stat = osd_stats[osd];
if (osd_stat && (osd_stat.data_block_size && osd_stat.data_block_size != block_size ||
if (osd_stat && (osd_stat.bs_block_size && osd_stat.bs_block_size != block_size ||
osd_stat.bitmap_granularity && osd_stat.bitmap_granularity != bitmap_granularity ||
osd_stat.immediate_commit == 'small' && immediate_commit == 'all' ||
osd_stat.immediate_commit == 'none' && immediate_commit != 'none'))

View File

@@ -1,6 +1,6 @@
{
"name": "vitastor-mon",
"version": "2.4.0",
"version": "2.2.0",
"description": "Vitastor SDS monitor service",
"main": "mon-main.js",
"scripts": {
@@ -9,7 +9,7 @@
"author": "Vitaliy Filippov",
"license": "UNLICENSED",
"dependencies": {
"antietcd": "^1.1.3",
"antietcd": "^1.1.2",
"sprintf-js": "^1.1.2",
"ws": "^7.2.5"
},

View File

@@ -1,6 +1,6 @@
{
"name": "vitastor",
"version": "2.4.0",
"version": "2.2.0",
"description": "Low-level native bindings to Vitastor client library",
"main": "index.js",
"keywords": [

View File

@@ -261,7 +261,7 @@ sub free_image
my ($vtype, $name, $vmid, undef, undef, undef) = $class->parse_volname($volname);
$class->deactivate_volume($storeid, $scfg, $volname);
my $full_list = run_cli($scfg, [ 'ls', '-l' ]);
my $list = _process_list($scfg, $storeid, $full_list, 0);
my $list = _process_list($scfg, $storeid, $full_list);
# Remove image and all its snapshots
my $rm_names = {
map { ($prefix.$_->{name} => 1) }
@@ -269,10 +269,6 @@ sub free_image
@$list
};
my $children = [ grep { $_->{parent_name} && $rm_names->{$_->{parent_name}} } @$full_list ];
$children = [ grep {
substr($_->{name}, 0, length($prefix.$name)) ne $prefix.$name &&
substr($_->{name}, 0, length($prefix.$name)+1) ne $prefix.$name.'@'
} @$children ];
die "Image has children: ".join(', ', map {
substr($_->{name}, 0, length $prefix) eq $prefix
? substr($_->name, length $prefix)
@@ -292,15 +288,14 @@ sub free_image
sub _process_list
{
my ($scfg, $storeid, $result, $skip_snapshot) = @_;
$skip_snapshot = 1 if !defined $skip_snapshot;
my ($scfg, $storeid, $result) = @_;
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
my $list = [];
foreach my $el (@$result)
{
next if !$el->{name} || length($prefix) && substr($el->{name}, 0, length $prefix) ne $prefix;
my $name = substr($el->{name}, length $prefix);
next if $skip_snapshot && $name =~ /@/;
next if $name =~ /@/;
my ($owner) = $name =~ /^(?:vm|base)-(\d+)-/s;
next if !defined $owner;
my $parent = !defined $el->{parent_name}
@@ -415,8 +410,8 @@ sub volume_size_info
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
my ($vtype, $name, $vmid) = $class->parse_volname($volname);
my $info = _process_list($scfg, $storeid, run_cli($scfg, [ 'ls', $prefix.$name ]))->[0];
# (size, format, used, parent, ctime)
return wantarray ? ($info->{size}, $info->{format}, $info->{size}, $info->{parent}, 0) : $info->{size};
#return wantarray ? ($size, $format, $used, $parent, $st->ctime) : $size;
return $info->{size};
}
sub volume_resize
@@ -499,55 +494,4 @@ sub rename_volume
return "${storeid}:${base_name}${target_volname}";
}
# Replacement for PVE::Storage::qemu_blockdev_options (it is assigned over that
# glob near the end of this file).
#
# Arguments: ($cfg, $volid, $machine_version, $options).
# Resolves the storage plugin responsible for $volid and returns whatever that
# plugin's qemu_blockdev_options() method returns.
# Dies if the volume type is not one of 'images', 'iso' or 'import'.
sub _monkey_patch_qemu_blockdev_options
{
my ($cfg, $volid, $machine_version, $options) = @_;
# Split the volume ID into the storage ID and the storage-local volume name
my ($storeid, $volname) = PVE::Storage::parse_volume_id($volid);
my $scfg = PVE::Storage::storage_config($cfg, $storeid);
# Pick the plugin class by the storage type from the storage configuration
my $plugin = PVE::Storage::Plugin->lookup($scfg->{type});
# Only the first element returned by parse_volname() (the volume type) is used here
my ($vtype) = $plugin->parse_volname($volname);
die "cannot use volume of type '$vtype' as a QEMU blockdevice\n"
if $vtype ne 'images' && $vtype ne 'iso' && $vtype ne 'import';
# Delegate the actual blockdev description to the plugin
return $plugin->qemu_blockdev_options($scfg, $storeid, $volname, $machine_version, $options);
}
# Build the QEMU blockdev description (a hash reference) for a Vitastor volume.
#
# Arguments: ($class, $scfg, $storeid, $volname, $machine_version, $options).
# Returns either
#   { driver => 'host_device', filename => <device> }  when $scfg->{vitastor_nbd} is set, or
#   { driver => 'vitastor', image => <prefixed name>, ... } otherwise, with
#   'config-path', 'etcd-host' and 'etcd-prefix' added only when the
#   corresponding vitastor_* storage options are set.
# Dies with "Image not mapped via NBD" if NBD mode is enabled and no mapped
# device reports the requested image.
sub qemu_blockdev_options
{
my ($class, $scfg, $storeid, $volname, $machine_version, $options) = @_;
# Image name prefix; falls back to 'pve/' when vitastor_prefix is not defined
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
my ($vtype, $name, $vmid) = $class->parse_volname($volname);
# Address a snapshot as "<image>@<snapshot>" when a snapshot name is requested
$name .= '@'.$options->{'snapshot-name'} if $options->{'snapshot-name'};
if ($scfg->{vitastor_nbd})
{
# NBD mode: list the mappings with `vitastor-nbd ls` and find the key whose
# 'image' equals the prefixed image name; that key is used as the device filename
my $mapped = run_cli($scfg, [ 'ls' ], binary => '/usr/bin/vitastor-nbd');
my ($kerneldev) = grep { $mapped->{$_}->{image} eq $prefix.$name } keys %$mapped;
die "Image not mapped via NBD" if !$kerneldev;
return { driver => 'host_device', filename => $kerneldev };
}
# Native mode: describe the image for the 'vitastor' blockdev driver
my $blockdev = {
driver => 'vitastor',
image => $prefix.$name,
};
if ($scfg->{vitastor_config_path})
{
$blockdev->{'config-path'} = $scfg->{vitastor_config_path};
}
if ($scfg->{vitastor_etcd_address})
{
# FIXME This is the only exception: etcd_address -> etcd_host for qemu
$blockdev->{'etcd-host'} = $scfg->{vitastor_etcd_address};
}
if ($scfg->{vitastor_etcd_prefix})
{
$blockdev->{'etcd-prefix'} = $scfg->{vitastor_etcd_prefix};
}
return $blockdev;
}
*PVE::Storage::qemu_blockdev_options = *_monkey_patch_qemu_blockdev_options;
1;

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VITASTOR_VERSION = '2.4.0'
VITASTOR_VERSION = '2.2.0'
LOG = logging.getLogger(__name__)

View File

@@ -1,637 +0,0 @@
diff --git a/include/libvirt/libvirt-storage.h b/include/libvirt/libvirt-storage.h
index aaad4a3da1..5f5daa8341 100644
--- a/include/libvirt/libvirt-storage.h
+++ b/include/libvirt/libvirt-storage.h
@@ -326,6 +326,7 @@ typedef enum {
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS = 1 << 17, /* (Since: 1.2.8) */
VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE = 1 << 18, /* (Since: 3.1.0) */
VIR_CONNECT_LIST_STORAGE_POOLS_ISCSI_DIRECT = 1 << 19, /* (Since: 5.6.0) */
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR = 1 << 20, /* (Since: 5.0.0) */
} virConnectListAllStoragePoolsFlags;
int virConnectListAllStoragePools(virConnectPtr conn,
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index 1e24e41a48..ce359a4cf8 100644
--- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c
@@ -7435,7 +7435,8 @@ virDomainDiskSourceNetworkParse(xmlNodePtr node,
src->configFile = virXPathString("string(./config/@file)", ctxt);
if (src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTP ||
- src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS)
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS ||
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_VITASTOR)
src->query = virXMLPropString(node, "query");
if (virDomainStorageNetworkParseHosts(node, ctxt, &src->hosts, &src->nhosts) < 0)
@@ -31871,6 +31872,7 @@ virDomainStorageSourceTranslateSourcePool(virStorageSource *src,
case VIR_STORAGE_POOL_MPATH:
case VIR_STORAGE_POOL_RBD:
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_SHEEPDOG:
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_LAST:
diff --git a/src/conf/domain_validate.c b/src/conf/domain_validate.c
index b28af7fa56..d1aae6e43e 100644
--- a/src/conf/domain_validate.c
+++ b/src/conf/domain_validate.c
@@ -504,6 +504,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
case VIR_STORAGE_NET_PROTOCOL_RBD:
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_NBD:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
@@ -576,7 +577,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
}
}
- /* internal snapshots and config files are currently supported only with rbd: */
+ /* internal snapshots are currently supported only with rbd: */
if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD) {
if (src->snapshot) {
@@ -584,10 +585,14 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
_("<snapshot> element is currently supported only with 'rbd' disks"));
return -1;
}
-
+ }
+ /* config files are currently supported only with rbd and vitastor: */
+ if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD &&
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR) {
if (src->configFile) {
virReportError(VIR_ERR_XML_ERROR, "%s",
- _("<config> element is currently supported only with 'rbd' disks"));
+ _("<config> element is currently supported only with 'rbd' and 'vitastor' disks"));
return -1;
}
}
diff --git a/src/conf/schemas/domaincommon.rng b/src/conf/schemas/domaincommon.rng
index 183dd5db5e..dcc0d1a778 100644
--- a/src/conf/schemas/domaincommon.rng
+++ b/src/conf/schemas/domaincommon.rng
@@ -2066,6 +2066,35 @@
</element>
</define>
+ <define name="diskSourceNetworkProtocolVitastor">
+ <element name="source">
+ <interleave>
+ <attribute name="protocol">
+ <value>vitastor</value>
+ </attribute>
+ <ref name="diskSourceCommon"/>
+ <optional>
+ <attribute name="name"/>
+ </optional>
+ <optional>
+ <attribute name="query"/>
+ </optional>
+ <zeroOrMore>
+ <ref name="diskSourceNetworkHost"/>
+ </zeroOrMore>
+ <optional>
+ <element name="config">
+ <attribute name="file">
+ <ref name="absFilePath"/>
+ </attribute>
+ <empty/>
+ </element>
+ </optional>
+ <empty/>
+ </interleave>
+ </element>
+ </define>
+
<define name="diskSourceNetworkProtocolISCSI">
<element name="source">
<attribute name="protocol">
@@ -2416,6 +2445,7 @@
<ref name="diskSourceNetworkProtocolSimple"/>
<ref name="diskSourceNetworkProtocolVxHS"/>
<ref name="diskSourceNetworkProtocolNFS"/>
+ <ref name="diskSourceNetworkProtocolVitastor"/>
</choice>
</define>
diff --git a/src/conf/storage_conf.c b/src/conf/storage_conf.c
index 1dc9365bf2..a8a736be81 100644
--- a/src/conf/storage_conf.c
+++ b/src/conf/storage_conf.c
@@ -56,7 +56,7 @@ VIR_ENUM_IMPL(virStoragePool,
"logical", "disk", "iscsi",
"iscsi-direct", "scsi", "mpath",
"rbd", "sheepdog", "gluster",
- "zfs", "vstorage",
+ "zfs", "vstorage", "vitastor",
);
VIR_ENUM_IMPL(virStoragePoolFormatFileSystem,
@@ -242,6 +242,18 @@ static virStoragePoolTypeInfo poolTypeInfo[] = {
.formatToString = virStorageFileFormatTypeToString,
}
},
+ {.poolType = VIR_STORAGE_POOL_VITASTOR,
+ .poolOptions = {
+ .flags = (VIR_STORAGE_POOL_SOURCE_HOST |
+ VIR_STORAGE_POOL_SOURCE_NETWORK |
+ VIR_STORAGE_POOL_SOURCE_NAME),
+ },
+ .volOptions = {
+ .defaultFormat = VIR_STORAGE_FILE_RAW,
+ .formatFromString = virStorageVolumeFormatFromString,
+ .formatToString = virStorageFileFormatTypeToString,
+ }
+ },
{.poolType = VIR_STORAGE_POOL_SHEEPDOG,
.poolOptions = {
.flags = (VIR_STORAGE_POOL_SOURCE_HOST |
@@ -538,6 +550,11 @@ virStoragePoolDefParseSource(xmlXPathContextPtr ctxt,
_("element 'name' is mandatory for RBD pool"));
return -1;
}
+ if (pool_type == VIR_STORAGE_POOL_VITASTOR && source->name == NULL) {
+ virReportError(VIR_ERR_XML_ERROR, "%s",
+ _("element 'name' is mandatory for Vitastor pool"));
+ return -1;
+ }
if (options->formatFromString) {
g_autofree char *format = NULL;
@@ -1127,6 +1144,7 @@ virStoragePoolDefFormatBuf(virBuffer *buf,
/* RBD, Sheepdog, Gluster and Iscsi-direct devices are not local block devs nor
* files, so they don't have a target */
if (def->type != VIR_STORAGE_POOL_RBD &&
+ def->type != VIR_STORAGE_POOL_VITASTOR &&
def->type != VIR_STORAGE_POOL_SHEEPDOG &&
def->type != VIR_STORAGE_POOL_GLUSTER &&
def->type != VIR_STORAGE_POOL_ISCSI_DIRECT) {
diff --git a/src/conf/storage_conf.h b/src/conf/storage_conf.h
index fc67957cfe..720c07ef74 100644
--- a/src/conf/storage_conf.h
+++ b/src/conf/storage_conf.h
@@ -103,6 +103,7 @@ typedef enum {
VIR_STORAGE_POOL_GLUSTER, /* Gluster device */
VIR_STORAGE_POOL_ZFS, /* ZFS */
VIR_STORAGE_POOL_VSTORAGE, /* Virtuozzo Storage */
+ VIR_STORAGE_POOL_VITASTOR, /* Vitastor */
VIR_STORAGE_POOL_LAST,
} virStoragePoolType;
@@ -454,6 +455,7 @@ VIR_ENUM_DECL(virStoragePartedFs);
VIR_CONNECT_LIST_STORAGE_POOLS_SCSI | \
VIR_CONNECT_LIST_STORAGE_POOLS_MPATH | \
VIR_CONNECT_LIST_STORAGE_POOLS_RBD | \
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR | \
VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG | \
VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER | \
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS | \
diff --git a/src/conf/storage_source_conf.c b/src/conf/storage_source_conf.c
index 8a063be244..dd9c7f11a2 100644
--- a/src/conf/storage_source_conf.c
+++ b/src/conf/storage_source_conf.c
@@ -89,6 +89,7 @@ VIR_ENUM_IMPL(virStorageNetProtocol,
"ssh",
"vxhs",
"nfs",
+ "vitastor",
);
@@ -1314,6 +1315,7 @@ virStorageSourceNetworkDefaultPort(virStorageNetProtocol protocol)
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
return 24007;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_RBD:
/* we don't provide a default for RBD */
return 0;
diff --git a/src/conf/storage_source_conf.h b/src/conf/storage_source_conf.h
index ebddf28cd6..873a2be65c 100644
--- a/src/conf/storage_source_conf.h
+++ b/src/conf/storage_source_conf.h
@@ -130,6 +130,7 @@ typedef enum {
VIR_STORAGE_NET_PROTOCOL_SSH,
VIR_STORAGE_NET_PROTOCOL_VXHS,
VIR_STORAGE_NET_PROTOCOL_NFS,
+ VIR_STORAGE_NET_PROTOCOL_VITASTOR,
VIR_STORAGE_NET_PROTOCOL_LAST
} virStorageNetProtocol;
diff --git a/src/conf/virstorageobj.c b/src/conf/virstorageobj.c
index 59fa5da372..4739167f5f 100644
--- a/src/conf/virstorageobj.c
+++ b/src/conf/virstorageobj.c
@@ -1438,6 +1438,7 @@ virStoragePoolObjSourceFindDuplicateCb(const void *payload,
return 1;
break;
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_ISCSI_DIRECT:
case VIR_STORAGE_POOL_RBD:
case VIR_STORAGE_POOL_LAST:
@@ -1921,6 +1922,8 @@ virStoragePoolObjMatch(virStoragePoolObj *obj,
(obj->def->type == VIR_STORAGE_POOL_MPATH)) ||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_RBD) &&
(obj->def->type == VIR_STORAGE_POOL_RBD)) ||
+ (MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR) &&
+ (obj->def->type == VIR_STORAGE_POOL_VITASTOR)) ||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG) &&
(obj->def->type == VIR_STORAGE_POOL_SHEEPDOG)) ||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER) &&
diff --git a/src/libvirt-storage.c b/src/libvirt-storage.c
index db7660aac4..561df34709 100644
--- a/src/libvirt-storage.c
+++ b/src/libvirt-storage.c
@@ -94,6 +94,7 @@ virStoragePoolGetConnect(virStoragePoolPtr pool)
* VIR_CONNECT_LIST_STORAGE_POOLS_SCSI
* VIR_CONNECT_LIST_STORAGE_POOLS_MPATH
* VIR_CONNECT_LIST_STORAGE_POOLS_RBD
+ * VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR
* VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG
* VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER
* VIR_CONNECT_LIST_STORAGE_POOLS_ZFS
diff --git a/src/libxl/libxl_conf.c b/src/libxl/libxl_conf.c
index bdd30dd65a..5353e00b4a 100644
--- a/src/libxl/libxl_conf.c
+++ b/src/libxl/libxl_conf.c
@@ -1081,6 +1081,7 @@ libxlMakeNetworkDiskSrcStr(virStorageSource *src,
case VIR_STORAGE_NET_PROTOCOL_SSH:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
case VIR_STORAGE_NET_PROTOCOL_NFS:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_LAST:
case VIR_STORAGE_NET_PROTOCOL_NONE:
virReportError(VIR_ERR_NO_SUPPORT,
diff --git a/src/libxl/xen_xl.c b/src/libxl/xen_xl.c
index ec8de30c01..61eab9606d 100644
--- a/src/libxl/xen_xl.c
+++ b/src/libxl/xen_xl.c
@@ -1461,6 +1461,7 @@ xenFormatXLDiskSrcNet(virStorageSource *src)
case VIR_STORAGE_NET_PROTOCOL_SSH:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
case VIR_STORAGE_NET_PROTOCOL_NFS:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_LAST:
case VIR_STORAGE_NET_PROTOCOL_NONE:
virReportError(VIR_ERR_NO_SUPPORT,
diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c
index 32568d4ae6..e625fa0720 100644
--- a/src/qemu/qemu_block.c
+++ b/src/qemu/qemu_block.c
@@ -731,6 +731,38 @@ qemuBlockStorageSourceGetRBDProps(virStorageSource *src,
}
+static virJSONValue *
+qemuBlockStorageSourceGetVitastorProps(virStorageSource *src)
+{
+ virJSONValue *ret = NULL;
+ virStorageNetHostDef *host;
+ size_t i;
+ g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER;
+ g_autofree char *etcd = NULL;
+
+ for (i = 0; i < src->nhosts; i++) {
+ host = src->hosts + i;
+ if ((virStorageNetHostTransport)host->transport != VIR_STORAGE_NET_HOST_TRANS_TCP) {
+ return NULL;
+ }
+ virBufferAsprintf(&buf, i > 0 ? ",%s:%u" : "%s:%u", host->name, host->port);
+ }
+ if (src->nhosts > 0) {
+ etcd = virBufferContentAndReset(&buf);
+ }
+
+ if (virJSONValueObjectAdd(&ret,
+ "S:etcd-host", etcd,
+ "S:etcd-prefix", src->query,
+ "S:config-path", src->configFile,
+ "s:image", src->path,
+ NULL) < 0)
+ return NULL;
+
+ return ret;
+}
+
+
static virJSONValue *
qemuBlockStorageSourceGetSshProps(virStorageSource *src)
{
@@ -1082,6 +1114,12 @@ qemuBlockStorageSourceGetBackendProps(virStorageSource *src,
return NULL;
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ driver = "vitastor";
+ if (!(fileprops = qemuBlockStorageSourceGetVitastorProps(src)))
+ return NULL;
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_SSH:
driver = "ssh";
if (!(fileprops = qemuBlockStorageSourceGetSshProps(src)))
@@ -1985,6 +2023,7 @@ qemuBlockGetBackingStoreString(virStorageSource *src,
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
case VIR_STORAGE_NET_PROTOCOL_NFS:
case VIR_STORAGE_NET_PROTOCOL_SSH:
@@ -2365,6 +2404,12 @@ qemuBlockStorageSourceCreateGetStorageProps(virStorageSource *src,
return -1;
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ driver = "vitastor";
+ if (!(location = qemuBlockStorageSourceGetVitastorProps(src)))
+ return -1;
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_SSH:
if (srcPriv->nbdkitProcess) {
/* disk creation not yet supported with nbdkit, and even if it
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index 0d2548d8d4..91121d6e1f 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -4526,7 +4526,8 @@ qemuDomainValidateStorageSource(virStorageSource *src,
if (src->query &&
(actualType != VIR_STORAGE_TYPE_NETWORK ||
(src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTPS &&
- src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP))) {
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP &&
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR))) {
virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
_("query is supported only with HTTP(S) protocols"));
return -1;
@@ -8954,6 +8955,7 @@ qemuDomainPrepareStorageSourceTLS(virStorageSource *src,
break;
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
diff --git a/src/qemu/qemu_snapshot.c b/src/qemu/qemu_snapshot.c
index 8128154749..afb339b9b0 100644
--- a/src/qemu/qemu_snapshot.c
+++ b/src/qemu/qemu_snapshot.c
@@ -662,6 +662,7 @@ qemuSnapshotPrepareDiskExternalInactive(virDomainSnapshotDiskDef *snapdisk,
case VIR_STORAGE_NET_PROTOCOL_NONE:
case VIR_STORAGE_NET_PROTOCOL_NBD:
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
@@ -887,6 +888,7 @@ qemuSnapshotPrepareDiskInternal(virDomainDiskDef *disk,
case VIR_STORAGE_NET_PROTOCOL_NONE:
case VIR_STORAGE_NET_PROTOCOL_NBD:
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
diff --git a/src/storage/storage_driver.c b/src/storage/storage_driver.c
index e19e032427..59f91f4710 100644
--- a/src/storage/storage_driver.c
+++ b/src/storage/storage_driver.c
@@ -1626,6 +1626,7 @@ storageVolLookupByPathCallback(virStoragePoolObj *obj,
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_RBD:
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_SHEEPDOG:
case VIR_STORAGE_POOL_ZFS:
case VIR_STORAGE_POOL_LAST:
diff --git a/src/storage_file/storage_source_backingstore.c b/src/storage_file/storage_source_backingstore.c
index 80681924ea..8a3ade9ec0 100644
--- a/src/storage_file/storage_source_backingstore.c
+++ b/src/storage_file/storage_source_backingstore.c
@@ -287,6 +287,75 @@ virStorageSourceParseRBDColonString(const char *rbdstr,
}
+/**
+ * virStorageSourceParseVitastorColonString:
+ * @colonstr: legacy "[vitastor:]key=value[:key=value...]" specification
+ * @src: storage source to fill in
+ *
+ * Splits @colonstr on ':' (a backslash escapes the following character) and
+ * handles these keys, ignoring any other:
+ *   - image=        stored in @src->path
+ *   - etcd-prefix=  stored in @src->query
+ *   - config-path=  stored in @src->configFile
+ *   - etcd-host=    one or more hosts separated by a backslash-escaped
+ *                   ',', ';' or ' ', each added via virStorageSourceRBDAddHost()
+ *
+ * A key given more than once overrides the earlier value.
+ *
+ * Returns 0 on success, -1 if a host could not be added or if no image name
+ * was found (an error is reported for the latter here).
+ */
+static int
+virStorageSourceParseVitastorColonString(const char *colonstr,
+                                         virStorageSource *src)
+{
+    char *p, *e, *next;
+    g_autofree char *options = NULL;
+
+    /* optionally skip the "vitastor:" prefix if provided */
+    if (STRPREFIX(colonstr, "vitastor:"))
+        colonstr += strlen("vitastor:");
+
+    /* work on a private copy, the parser below writes NUL terminators into it */
+    options = g_strdup(colonstr);
+
+    p = options;
+    while (*p) {
+        /* find : delimiter or end of string */
+        for (e = p; *e && *e != ':'; ++e) {
+            if (*e == '\\') {
+                e++;
+                if (*e == '\0')
+                    break;
+            }
+        }
+        if (*e == '\0') {
+            next = e;    /* last kv pair */
+        } else {
+            next = e + 1;
+            *e = '\0';
+        }
+
+        if (STRPREFIX(p, "image=")) {
+            /* free a value set by an earlier duplicate key instead of leaking it */
+            g_free(src->path);
+            src->path = g_strdup(p + strlen("image="));
+        } else if (STRPREFIX(p, "etcd-prefix=")) {
+            g_free(src->query);
+            src->query = g_strdup(p + strlen("etcd-prefix="));
+        } else if (STRPREFIX(p, "config-path=")) {
+            g_free(src->configFile);
+            src->configFile = g_strdup(p + strlen("config-path="));
+        } else if (STRPREFIX(p, "etcd-host=")) {
+            char *h, *sep;
+
+            h = p + strlen("etcd-host=");
+            while (h < e) {
+                /* cut the list at the next escaped separator, if any */
+                for (sep = h; sep < e; ++sep) {
+                    if (*sep == '\\' && (sep[1] == ',' ||
+                                         sep[1] == ';' ||
+                                         sep[1] == ' ')) {
+                        *sep = '\0';
+                        sep += 2;
+                        break;
+                    }
+                }
+
+                if (virStorageSourceRBDAddHost(src, h) < 0)
+                    return -1;
+
+                h = sep;
+            }
+        }
+
+        p = next;
+    }
+
+    if (!src->path) {
+        /* report the problem so the caller does not fail without an error set */
+        virReportError(VIR_ERR_INVALID_ARG, "%s",
+                       _("missing image name in Vitastor backing volume "
+                         "specification"));
+        return -1;
+    }
+
+    return 0;
+}
+
+
static int
virStorageSourceParseNBDColonString(const char *nbdstr,
virStorageSource *src)
@@ -399,6 +468,11 @@ virStorageSourceParseBackingColon(virStorageSource *src,
return -1;
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ if (virStorageSourceParseVitastorColonString(path, src) < 0)
+ return -1;
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_LAST:
case VIR_STORAGE_NET_PROTOCOL_NONE:
@@ -975,6 +1049,54 @@ virStorageSourceParseBackingJSONRBD(virStorageSource *src,
return 0;
}
+/**
+ * virStorageSourceParseBackingJSONVitastor:
+ * @src: storage source to fill in
+ * @json: JSON object of the 'vitastor' backing store specification
+ * @jsonstr: unused
+ * @opaque: unused
+ *
+ * Marks @src as a Vitastor network source and fills it from @json. When a
+ * legacy 'filename' string is present it is handed to
+ * virStorageSourceParseVitastorColonString() and nothing else is read.
+ * Otherwise 'image' (mandatory), 'config-path', 'etcd-prefix' and the
+ * optional 'server' array are used.
+ *
+ * NOTE(review): the properties generated for QEMU in qemu_block.c carry the
+ * hosts as a single 'etcd-host' string, not as a 'server' array - confirm
+ * which key is expected here.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int
+virStorageSourceParseBackingJSONVitastor(virStorageSource *src,
+                                         virJSONValue *json,
+                                         const char *jsonstr G_GNUC_UNUSED,
+                                         int opaque G_GNUC_UNUSED)
+{
+    virJSONValue *serverList = virJSONValueObjectGetArray(json, "server");
+    const char *legacy = NULL;
+    const char *imageName = NULL;
+    size_t idx;
+
+    src->type = VIR_STORAGE_TYPE_NETWORK;
+    src->protocol = VIR_STORAGE_NET_PROTOCOL_VITASTOR;
+
+    /* legacy syntax passed via 'filename' option */
+    legacy = virJSONValueObjectGetString(json, "filename");
+    if (legacy)
+        return virStorageSourceParseVitastorColonString(legacy, src);
+
+    imageName = virJSONValueObjectGetString(json, "image");
+    if (!imageName) {
+        virReportError(VIR_ERR_INVALID_ARG, "%s",
+                       _("missing image name in Vitastor backing volume "
+                         "JSON specification"));
+        return -1;
+    }
+
+    src->path = g_strdup(imageName);
+    src->configFile = g_strdup(virJSONValueObjectGetString(json, "config-path"));
+    src->query = g_strdup(virJSONValueObjectGetString(json, "etcd-prefix"));
+
+    /* no host list given: nothing more to parse */
+    if (!serverList)
+        return 0;
+
+    src->nhosts = virJSONValueArraySize(serverList);
+    src->hosts = g_new0(virStorageNetHostDef, src->nhosts);
+
+    for (idx = 0; idx < src->nhosts; idx++) {
+        virJSONValue *entry = virJSONValueArrayGet(serverList, idx);
+
+        if (virStorageSourceParseBackingJSONInetSocketAddress(&src->hosts[idx],
+                                                              entry) < 0)
+            return -1;
+    }
+
+    return 0;
+}
+
static int
virStorageSourceParseBackingJSONRaw(virStorageSource *src,
virJSONValue *json,
@@ -1152,6 +1274,7 @@ static const struct virStorageSourceJSONDriverParser jsonParsers[] = {
{"sheepdog", false, virStorageSourceParseBackingJSONSheepdog, 0},
{"ssh", false, virStorageSourceParseBackingJSONSSH, 0},
{"rbd", false, virStorageSourceParseBackingJSONRBD, 0},
+ {"vitastor", false, virStorageSourceParseBackingJSONVitastor, 0},
{"raw", true, virStorageSourceParseBackingJSONRaw, 0},
{"nfs", false, virStorageSourceParseBackingJSONNFS, 0},
{"vxhs", false, virStorageSourceParseBackingJSONVxHS, 0},
diff --git a/src/test/test_driver.c b/src/test/test_driver.c
index 25335d9002..cf54069fbe 100644
--- a/src/test/test_driver.c
+++ b/src/test/test_driver.c
@@ -7340,6 +7340,7 @@ testStorageVolumeTypeForPool(int pooltype)
case VIR_STORAGE_POOL_ISCSI_DIRECT:
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_RBD:
+ case VIR_STORAGE_POOL_VITASTOR:
return VIR_STORAGE_VOL_NETWORK;
case VIR_STORAGE_POOL_LOGICAL:
case VIR_STORAGE_POOL_DISK:
diff --git a/tests/storagepoolcapsschemadata/poolcaps-fs.xml b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
index eee75af746..8bd0a57bdd 100644
--- a/tests/storagepoolcapsschemadata/poolcaps-fs.xml
+++ b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
@@ -204,4 +204,11 @@
</enum>
</volOptions>
</pool>
+ <pool type='vitastor' supported='no'>
+ <volOptions>
+ <defaultFormat type='raw'/>
+ <enum name='targetFormatType'>
+ </enum>
+ </volOptions>
+ </pool>
</storagepoolCapabilities>
diff --git a/tests/storagepoolcapsschemadata/poolcaps-full.xml b/tests/storagepoolcapsschemadata/poolcaps-full.xml
index 805950a937..852df0de16 100644
--- a/tests/storagepoolcapsschemadata/poolcaps-full.xml
+++ b/tests/storagepoolcapsschemadata/poolcaps-full.xml
@@ -204,4 +204,11 @@
</enum>
</volOptions>
</pool>
+ <pool type='vitastor' supported='yes'>
+ <volOptions>
+ <defaultFormat type='raw'/>
+ <enum name='targetFormatType'>
+ </enum>
+ </volOptions>
+ </pool>
</storagepoolCapabilities>
diff --git a/tests/storagepoolxml2argvtest.c b/tests/storagepoolxml2argvtest.c
index d5c2531ab8..b19308ac38 100644
--- a/tests/storagepoolxml2argvtest.c
+++ b/tests/storagepoolxml2argvtest.c
@@ -57,6 +57,7 @@ testCompareXMLToArgvFiles(bool shouldFail,
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_ZFS:
case VIR_STORAGE_POOL_VSTORAGE:
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_LAST:
default:
VIR_TEST_DEBUG("pool type '%s' has no xml2argv test", defTypeStr);
diff --git a/tools/virsh-pool.c b/tools/virsh-pool.c
index 2010ef1356..072e2ff9e8 100644
--- a/tools/virsh-pool.c
+++ b/tools/virsh-pool.c
@@ -1187,6 +1187,9 @@ cmdPoolList(vshControl *ctl, const vshCmd *cmd G_GNUC_UNUSED)
case VIR_STORAGE_POOL_VSTORAGE:
flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE;
break;
+ case VIR_STORAGE_POOL_VITASTOR:
+ flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR;
+ break;
case VIR_STORAGE_POOL_LAST:
break;
}

View File

@@ -1,172 +0,0 @@
Index: pve-qemu-kvm-10.0.2/block/meson.build
===================================================================
--- pve-qemu-kvm-10.0.2.orig/block/meson.build
+++ pve-qemu-kvm-10.0.2/block/meson.build
@@ -126,6 +126,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
Index: pve-qemu-kvm-10.0.2/meson.build
===================================================================
--- pve-qemu-kvm-10.0.2.orig/meson.build
+++ pve-qemu-kvm-10.0.2/meson.build
@@ -1622,6 +1622,26 @@ if not get_option('rbd').auto() or have_
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'))
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -2514,6 +2534,7 @@ endif
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
@@ -4812,6 +4833,7 @@ summary_info += {'fdt support': fd
summary_info += {'libcap-ng support': libcap_ng}
summary_info += {'bpf support': libbpf}
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
Index: pve-qemu-kvm-10.0.2/meson_options.txt
===================================================================
--- pve-qemu-kvm-10.0.2.orig/meson_options.txt
+++ pve-qemu-kvm-10.0.2/meson_options.txt
@@ -202,6 +202,8 @@ option('pvg', type: 'feature', value: 'a
description: 'macOS paravirtualized graphics support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
Index: pve-qemu-kvm-10.0.2/qapi/block-core.json
===================================================================
--- pve-qemu-kvm-10.0.2.orig/qapi/block-core.json
+++ pve-qemu-kvm-10.0.2/qapi/block-core.json
@@ -3599,7 +3599,7 @@
'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
'pbs',
- 'ssh', 'throttle', 'vdi', 'vhdx',
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
@@ -4725,6 +4725,28 @@
'*server': ['InetSocketAddressBase'] } }
##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
# @ReplicationMode:
#
# An enumeration of replication modes.
@@ -5194,6 +5216,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'virtio-blk-vfio-pci':
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
'if': 'CONFIG_BLKIO' },
@@ -5674,6 +5697,20 @@
'*encrypt' : 'RbdEncryptionCreateOptions' } }
##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @location: Where to store the new image file. This location cannot
+# point to a snapshot.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
# @BlockdevVmdkSubformat:
#
# Subformat options for VMDK images
@@ -5895,6 +5932,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
Index: pve-qemu-kvm-10.0.2/scripts/meson-buildoptions.sh
===================================================================
--- pve-qemu-kvm-10.0.2.orig/scripts/meson-buildoptions.sh
+++ pve-qemu-kvm-10.0.2/scripts/meson-buildoptions.sh
@@ -175,6 +175,7 @@ meson_options_help() {
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
printf "%s\n" ' qpl Query Processing Library support'
printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
printf "%s\n" ' rdma Enable RDMA-based migration'
printf "%s\n" ' replication replication support'
printf "%s\n" ' rust Rust support'
@@ -458,6 +459,8 @@ _meson_option_parse() {
--disable-qpl) printf "%s" -Dqpl=disabled ;;
--enable-rbd) printf "%s" -Drbd=enabled ;;
--disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
--enable-rdma) printf "%s" -Drdma=enabled ;;
--disable-rdma) printf "%s" -Drdma=disabled ;;
--enable-relocatable) printf "%s" -Drelocatable=true ;;

View File

@@ -1,172 +0,0 @@
diff --git a/block/meson.build b/block/meson.build
index 34b1b2a306..24ca0f1e52 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -114,6 +114,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
diff --git a/meson.build b/meson.build
index 41f68d3806..29eaed9ba4 100644
--- a/meson.build
+++ b/meson.build
@@ -1622,6 +1622,26 @@ if not get_option('rbd').auto() or have_block
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'))
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -2506,6 +2526,7 @@ endif
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
@@ -4813,6 +4834,7 @@ summary_info += {'fdt support': fdt_opt == 'internal' ? 'internal' : fdt}
summary_info += {'libcap-ng support': libcap_ng}
summary_info += {'bpf support': libbpf}
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
diff --git a/meson_options.txt b/meson_options.txt
index 59d973bca0..a3e7123980 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -202,6 +202,8 @@ option('pvg', type: 'feature', value: 'auto',
description: 'macOS paravirtualized graphics support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
diff --git a/qapi/block-core.json b/qapi/block-core.json
index b1937780e1..a511193620 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3216,7 +3216,7 @@
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
- 'ssh', 'throttle', 'vdi', 'vhdx',
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
@@ -4299,6 +4299,28 @@
'*key-secret': 'str',
'*server': ['InetSocketAddressBase'] } }
+##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
##
# @ReplicationMode:
#
@@ -4767,6 +4789,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'virtio-blk-vfio-pci':
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
'if': 'CONFIG_BLKIO' },
@@ -5240,6 +5263,20 @@
'*cluster-size' : 'size',
'*encrypt' : 'RbdEncryptionCreateOptions' } }
+##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @location: Where to store the new image file. This location cannot
+# point to a snapshot.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
##
# @BlockdevVmdkSubformat:
#
@@ -5462,6 +5499,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 3e8e00852b..45aff3b6a9 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -175,6 +175,7 @@ meson_options_help() {
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
printf "%s\n" ' qpl Query Processing Library support'
printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
printf "%s\n" ' rdma Enable RDMA-based migration'
printf "%s\n" ' replication replication support'
printf "%s\n" ' rust Rust support'
@@ -458,6 +459,8 @@ _meson_option_parse() {
--disable-qpl) printf "%s" -Dqpl=disabled ;;
--enable-rbd) printf "%s" -Drbd=enabled ;;
--disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
--enable-rdma) printf "%s" -Drdma=enabled ;;
--disable-rdma) printf "%s" -Drdma=disabled ;;
--enable-relocatable) printf "%s" -Drelocatable=true ;;

View File

@@ -7,24 +7,22 @@ set -e
VITASTOR=$(dirname $0)
VITASTOR=$(realpath "$VITASTOR/..")
REL=$(rpm --eval '%dist')
REL=${REL##.}
if [ "$REL" = "el8" ]; then
EL=$(rpm --eval '%dist')
if [ "$EL" = ".el8" ]; then
# CentOS 8
. /opt/rh/gcc-toolset-9/enable
elif [ "$REL" = "el7" ]; then
elif [ "$EL" = ".el7" ]; then
# CentOS 7
. /opt/rh/devtoolset-9/enable
fi
cd ~/rpmbuild/SPECS
rpmbuild -bp fio.spec
cd $VITASTOR
VER=$(grep ^Version: rpm/vitastor-$REL.spec | awk '{print $2}')
rm -rf fio
VER=$(grep ^Version: rpm/vitastor-el7.spec | awk '{print $2}')
ln -s ~/rpmbuild/BUILD/fio*/ fio
sh copy-fio-includes.sh
rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-$REL.spec
tar --transform "s#^#vitastor-$VER/#" --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-$VER.$REL.tar.gz $(ls | grep -v packages)
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform "s#^#vitastor-$VER/#" --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-$VER$(rpm --eval '%dist').tar.gz *

View File

@@ -1,16 +0,0 @@
#!/bin/bash
set -e -x
REL=$(rpm --eval '%dist')
REL=${REL##.}
cd /root/vitastor/rpm
./build-tarball.sh
VER=$(grep ^Version: vitastor-$REL.spec | awk '{print $2}')
cp /root/vitastor-$VER.$REL.tar.gz ~/rpmbuild/SOURCES
cp vitastor-$REL.spec ~/rpmbuild/SPECS/vitastor.spec
cd ~/rpmbuild/SPECS/
rpmbuild -ba vitastor.spec
mkdir -p /root/vitastor/packages/vitastor-$REL
rm -rf /root/vitastor/packages/vitastor-$REL/*
cp ~/rpmbuild/RPMS/*/*vitastor* /root/vitastor/packages/vitastor-$REL/
cp ~/rpmbuild/SRPMS/vitastor* /root/vitastor/packages/vitastor-$REL/

View File

@@ -1,8 +1,5 @@
# Build packages for CentOS 7 inside a container
# cd ..
# docker build -t vitastor-buildenv:el7 -f rpm/vitastor-el7.Dockerfile .
# docker run -i --rm -v ./:/root/vitastor vitastor-buildenv:el7 /root/vitastor/rpm/vitastor-build.sh
# cd ..; podman build -t vitastor-el7 -v `pwd`/packages:/root/packages -f rpm/vitastor-el7.Dockerfile .
# localedef -i ru_RU -f UTF-8 ru_RU.UTF-8
FROM centos:7
@@ -10,9 +7,7 @@ FROM centos:7
WORKDIR /root
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
RUN sed -i 's/^mirrorlist=/#mirrorlist=/; s!#baseurl=http://mirror.centos.org/centos/\$releasever!baseurl=http://vault.centos.org/7.9.2009!' /etc/yum.repos.d/*.repo
RUN yum -y --enablerepo=extras install centos-release-scl epel-release yum-utils rpm-build
RUN perl -i -pe 's!mirrorlist=!#mirrorlist=!s; s!#\s*baseurl=http://mirror.centos.org!baseurl=http://vault.centos.org!' /etc/yum.repos.d/CentOS-SCLo-scl*.repo
RUN yum -y install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm
RUN yum -y install devtoolset-9-gcc-c++ devtoolset-9-libatomic-devel gcc make cmake gperftools-devel \
fio rh-nodejs12 jerasure-devel libisa-l-devel gf-complete-devel rdma-core-devel libnl3-devel
@@ -21,3 +16,32 @@ RUN rpm --nomd5 -i fio*.src.rpm
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
RUN cd ~/rpmbuild/SPECS && yum-builddep -y fio.spec
RUN yum -y install cmake3
ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root
RUN set -e; \
rpm -i liburing*.src.rpm; \
cd ~/rpmbuild/SPECS/; \
. /opt/rh/devtoolset-9/enable; \
rpmbuild -ba liburing.spec; \
mkdir -p /root/packages/liburing-el7; \
rm -rf /root/packages/liburing-el7/*; \
cp ~/rpmbuild/RPMS/*/liburing* /root/packages/liburing-el7/; \
cp ~/rpmbuild/SRPMS/liburing* /root/packages/liburing-el7/
RUN rpm -i `ls /root/packages/liburing-el7/liburing-*.x86_64.rpm | grep -v debug`
ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
VER=$(grep ^Version: vitastor-el7.spec | awk '{print $2}'); \
cp /root/vitastor-$VER.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \
mkdir -p /root/packages/vitastor-el7; \
rm -rf /root/packages/vitastor-el7/*; \
cp ~/rpmbuild/RPMS/*/*vitastor* /root/packages/vitastor-el7/; \
cp ~/rpmbuild/SRPMS/vitastor* /root/packages/vitastor-el7/

View File

@@ -1,12 +1,13 @@
Name: vitastor
Version: 2.4.0
Version: 2.2.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-2.4.0.el7.tar.gz
Source0: vitastor-2.2.0.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
BuildRequires: devtoolset-9-gcc-c++
BuildRequires: rh-nodejs12
@@ -34,6 +35,8 @@ size with configurable redundancy (replication or erasure codes/XOR).
Summary: Vitastor - OSD
Requires: libJerasure2
Requires: libisa-l
Requires: liburing >= 0.6
Requires: liburing < 2
Requires: vitastor-client = %{version}-%{release}
Requires: util-linux
Requires: parted
@@ -57,6 +60,8 @@ scheduling cluster-level operations.
%package -n vitastor-client
Summary: Vitastor - client
Requires: liburing >= 0.6
Requires: liburing < 2
%description -n vitastor-client
@@ -77,7 +82,7 @@ Vitastor library headers for development.
Summary: Vitastor - fio drivers
Group: Development/Libraries
Requires: vitastor-client = %{version}-%{release}
Requires: fio = 3.7-2.el7
Requires: fio = 3.7-1.el7
%description -n vitastor-fio
@@ -164,7 +169,6 @@ chown vitastor:vitastor /var/lib/vitastor
%files -n vitastor-client
%_bindir/vitastor-nbd
%_bindir/vitastor-ublk
%_bindir/vitastor-nfs
%_bindir/vitastor-cli
%_bindir/vitastor-rm

View File

@@ -1,7 +1,5 @@
# Build packages for CentOS 8 inside a container
# cd ..
# docker build -t vitastor-buildenv:el8 -f rpm/vitastor-el8.Dockerfile .
# docker run -i --rm -v ./:/root/vitastor vitastor-buildenv:el8 /root/vitastor/rpm/vitastor-build.sh
# cd ..; podman build -t vitastor-el8 -v `pwd`/packages:/root/packages -f rpm/vitastor-el8.Dockerfile .
FROM centos:8
@@ -17,3 +15,32 @@ RUN dnf -y install gcc-toolset-9 gcc-toolset-9-gcc-c++ gperftools-devel \
RUN dnf download --source fio
RUN rpm --nomd5 -i fio*.src.rpm
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=powertools --spec fio.spec
ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root
RUN set -e; \
rpm -i liburing*.src.rpm; \
cd ~/rpmbuild/SPECS/; \
. /opt/rh/gcc-toolset-9/enable; \
rpmbuild -ba liburing.spec; \
mkdir -p /root/packages/liburing-el8; \
rm -rf /root/packages/liburing-el8/*; \
cp ~/rpmbuild/RPMS/*/liburing* /root/packages/liburing-el8/; \
cp ~/rpmbuild/SRPMS/liburing* /root/packages/liburing-el8/
RUN rpm -i `ls /root/packages/liburing-el8/liburing-*.x86_64.rpm | grep -v debug`
ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
VER=$(grep ^Version: vitastor-el8.spec | awk '{print $2}'); \
cp /root/vitastor-$VER.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \
mkdir -p /root/packages/vitastor-el8; \
rm -rf /root/packages/vitastor-el8/*; \
cp ~/rpmbuild/RPMS/*/*vitastor* /root/packages/vitastor-el8/; \
cp ~/rpmbuild/SRPMS/vitastor* /root/packages/vitastor-el8/

View File

@@ -1,12 +1,13 @@
Name: vitastor
Version: 2.4.0
Version: 2.2.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-2.4.0.el8.tar.gz
Source0: vitastor-2.2.0.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
BuildRequires: gcc-toolset-9-gcc-c++
BuildRequires: nodejs >= 10
@@ -33,6 +34,8 @@ size with configurable redundancy (replication or erasure codes/XOR).
Summary: Vitastor - OSD
Requires: libJerasure2
Requires: libisa-l
Requires: liburing >= 0.6
Requires: liburing < 2
Requires: vitastor-client = %{version}-%{release}
Requires: util-linux
Requires: parted
@@ -55,6 +58,8 @@ scheduling cluster-level operations.
%package -n vitastor-client
Summary: Vitastor - client
Requires: liburing >= 0.6
Requires: liburing < 2
%description -n vitastor-client
@@ -75,7 +80,7 @@ Vitastor library headers for development.
Summary: Vitastor - fio drivers
Group: Development/Libraries
Requires: vitastor-client = %{version}-%{release}
Requires: fio = 3.19-3.el8
Requires: fio = 3.7-3.el8
%description -n vitastor-fio
@@ -161,7 +166,6 @@ chown vitastor:vitastor /var/lib/vitastor
%files -n vitastor-client
%_bindir/vitastor-nbd
%_bindir/vitastor-ublk
%_bindir/vitastor-nfs
%_bindir/vitastor-cli
%_bindir/vitastor-rm

View File

@@ -1,7 +1,5 @@
# Build packages for AlmaLinux 9 inside a container
# cd ..
# docker build -t vitastor-buildenv:el9 -f rpm/vitastor-el9.Dockerfile .
# docker run -i --rm -v ./:/root/vitastor vitastor-buildenv:el9 /root/vitastor/rpm/vitastor-build.sh
# cd ..; podman build -t vitastor-el9 -v `pwd`/packages:/root/packages -f rpm/vitastor-el9.Dockerfile .
FROM almalinux:9
@@ -10,7 +8,22 @@ WORKDIR /root
RUN sed -i 's/enabled=0/enabled=1/' /etc/yum.repos.d/*.repo
RUN dnf -y install epel-release dnf-plugins-core
RUN dnf -y install https://vitastor.io/rpms/centos/9/vitastor-release-1.0-1.el9.noarch.rpm
RUN dnf -y install gcc-c++ gperftools-devel fio nodejs rpm-build jerasure-devel libisa-l-devel gf-complete-devel rdma-core-devel libarchive cmake libnl3-devel
RUN dnf -y install gcc-c++ gperftools-devel fio nodejs rpm-build jerasure-devel libisa-l-devel gf-complete-devel rdma-core-devel libarchive liburing-devel cmake libnl3-devel
RUN dnf download --source fio
RUN rpm --nomd5 -i fio*.src.rpm
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --spec fio.spec
ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
VER=$(grep ^Version: vitastor-el9.spec | awk '{print $2}'); \
cp /root/vitastor-$VER.el9.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \
mkdir -p /root/packages/vitastor-el9; \
rm -rf /root/packages/vitastor-el9/*; \
cp ~/rpmbuild/RPMS/*/*vitastor* /root/packages/vitastor-el9/; \
cp ~/rpmbuild/SRPMS/vitastor* /root/packages/vitastor-el9/

View File

@@ -1,12 +1,13 @@
Name: vitastor
Version: 2.4.0
Version: 2.2.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-2.4.0.el9.tar.gz
Source0: vitastor-2.2.0.el9.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
BuildRequires: gcc-c++
BuildRequires: nodejs >= 10
@@ -158,7 +159,6 @@ chown vitastor:vitastor /var/lib/vitastor
%files -n vitastor-client
%_bindir/vitastor-nbd
%_bindir/vitastor-ublk
%_bindir/vitastor-nfs
%_bindir/vitastor-cli
%_bindir/vitastor-rm

View File

@@ -12,30 +12,20 @@ set(WITH_QEMU false CACHE BOOL "Build QEMU driver inside Vitastor source tree")
set(WITH_FIO true CACHE BOOL "Build FIO driver")
set(QEMU_PLUGINDIR qemu CACHE STRING "QEMU plugin directory suffix (qemu-kvm on RHEL)")
set(WITH_ASAN false CACHE BOOL "Build with AddressSanitizer")
set(WITH_SYSTEM_LIBURING false CACHE BOOL "Use system liburing")
if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
if(EXISTS "/etc/debian_version")
set(CMAKE_INSTALL_LIBDIR "lib/${CMAKE_LIBRARY_ARCHITECTURE}")
endif()
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
set(ENABLE_COVERAGE false CACHE BOOL "Enable code coverage")
add_definitions(-DVITASTOR_VERSION="2.4.0")
add_definitions(-D_GNU_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -fvisibility=hidden -I ${CMAKE_SOURCE_DIR}/src)
add_definitions(-DVITASTOR_VERSION="2.2.0")
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
add_link_options(-fno-omit-frame-pointer)
if (${WITH_ASAN})
add_definitions(-fsanitize=address)
add_link_options(-fsanitize=address -fno-omit-frame-pointer)
endif (${WITH_ASAN})
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fvisibility-inlines-hidden")
set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} -fvisibility-inlines-hidden")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -fvisibility-inlines-hidden")
if (${ENABLE_COVERAGE})
add_definitions(-coverage)
add_link_options(-coverage)
endif()
set(CMAKE_BUILD_TYPE RelWithDebInfo)
string(REGEX REPLACE "([\\/\\-]O)[^ \t\r\n]*" "\\13" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
@@ -59,6 +49,7 @@ endmacro(install_symlink)
check_include_file("linux/nbd-netlink.h" HAVE_NBD_NETLINK_H)
find_package(PkgConfig)
pkg_check_modules(LIBURING REQUIRED liburing)
if (${WITH_QEMU})
pkg_check_modules(GLIB REQUIRED glib-2.0)
endif (${WITH_QEMU})
@@ -75,14 +66,13 @@ if (RDMACM_LIBRARIES)
add_definitions(-DWITH_RDMACM)
endif (RDMACM_LIBRARIES)
if (${WITH_SYSTEM_LIBURING})
pkg_check_modules(LIBURING REQUIRED liburing>=2.10)
include_directories(${LIBURING_INCLUDE_DIRS})
else()
include_directories(${CMAKE_SOURCE_DIR}/src/liburing/include)
add_subdirectory(liburing)
set(LIBURING_LIBRARIES uring)
endif (${WITH_SYSTEM_LIBURING})
add_custom_target(build_tests)
add_custom_target(test
COMMAND
echo leak:tcmalloc > ${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt &&
env LSAN_OPTIONS=suppressions=${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt ${CMAKE_CTEST_COMMAND}
)
add_dependencies(test build_tests)
include_directories(
../
@@ -96,6 +86,7 @@ include_directories(
${CMAKE_SOURCE_DIR}/src/test
${CMAKE_SOURCE_DIR}/src/util
/usr/include/jerasure
${LIBURING_INCLUDE_DIRS}
${IBVERBS_INCLUDE_DIRS}
)
@@ -110,7 +101,7 @@ add_subdirectory(test)
### Install
install(TARGETS vitastor-osd vitastor-disk vitastor-nbd vitastor-ublk vitastor-nfs vitastor-cli vitastor-kv vitastor-kv-stress RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
install(TARGETS vitastor-osd vitastor-disk vitastor-nbd vitastor-nfs vitastor-cli vitastor-kv vitastor-kv-stress RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
install_symlink(vitastor-disk ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vitastor-dump-journal)
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vitastor-rm)
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vita)

View File

@@ -2,17 +2,14 @@ cmake_minimum_required(VERSION 2.8.12)
project(vitastor)
# libvitastor_blk.a
add_library(vitastor_blk STATIC
../util/allocator.cpp ../util/crc32c.c ../util/ringloop.cpp
multilist.cpp blockstore_heap.cpp blockstore_disk.cpp
blockstore.cpp blockstore_impl.cpp blockstore_init.cpp blockstore_open.cpp
blockstore_flush.cpp blockstore_read.cpp blockstore_stable.cpp blockstore_sync.cpp blockstore_write.cpp
# libvitastor_blk.so
add_library(vitastor_blk SHARED
../util/allocator.cpp blockstore.cpp blockstore_impl.cpp blockstore_disk.cpp blockstore_init.cpp blockstore_open.cpp blockstore_journal.cpp blockstore_read.cpp
blockstore_write.cpp blockstore_sync.cpp blockstore_stable.cpp blockstore_rollback.cpp blockstore_flush.cpp ../util/crc32c.c ../util/ringloop.cpp
)
target_compile_options(vitastor_blk PUBLIC -fPIC)
target_link_libraries(vitastor_blk
${LIBURING_LIBRARIES}
${ISAL_LIBRARIES}
tcmalloc_minimal
# for timerfd_manager
vitastor_common
)

View File

@@ -3,7 +3,7 @@
#include "blockstore_impl.h"
blockstore_t::blockstore_t(blockstore_config_t & config, ring_loop_i *ringloop, timerfd_manager_t *tfd)
blockstore_t::blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd)
{
impl = new blockstore_impl_t(config, ringloop, tfd);
}
@@ -48,9 +48,9 @@ int blockstore_t::read_bitmap(object_id oid, uint64_t target_version, void *bitm
return impl->read_bitmap(oid, target_version, bitmap, result_version);
}
const std::map<uint64_t, uint64_t> & blockstore_t::get_inode_space_stats()
std::map<uint64_t, uint64_t> & blockstore_t::get_inode_space_stats()
{
return impl->get_inode_space_stats();
return impl->inode_space_stats;
}
void blockstore_t::dump_diagnostics()
@@ -82,3 +82,8 @@ uint32_t blockstore_t::get_bitmap_granularity()
{
return impl->get_bitmap_granularity();
}
// Set per-pool no_inode_stats: hands the given list of pool IDs to the
// implementation object unchanged.
// NOTE(review): judging by the name, these are the pools for which per-inode
// space statistics should not be tracked - confirm in blockstore_impl_t.
void blockstore_t::set_no_inode_stats(const std::vector<uint64_t> & pool_ids)
{
impl->set_no_inode_stats(pool_ids);
}

View File

@@ -22,20 +22,18 @@
#define DIRECT_IO_ALIGNMENT 512
#endif
// Memory allocation alignment (page size is usually optimal)
#ifndef MEM_ALIGNMENT
#define MEM_ALIGNMENT 4096
#endif
// Default block size is 128 KB, current allowed range is 4K - 128M
#define DEFAULT_DATA_BLOCK_ORDER 17
#define MIN_DATA_BLOCK_SIZE 4*1024
#define MAX_DATA_BLOCK_SIZE 128*1024*1024
#define MAX_META_BLOCK_SIZE 64*1024
#define DEFAULT_BITMAP_GRANULARITY 4096
#define MIN_JOURNAL_SIZE 1024*1024
// "VITAstor"
#define BLOCKSTORE_META_MAGIC_V1 0x726F747341544956l
#define BLOCKSTORE_META_FORMAT_V1 1
#define BLOCKSTORE_META_FORMAT_V2 2
#define BLOCKSTORE_META_FORMAT_HEAP 3
#define BS_OP_MIN 1
#define BS_OP_READ 1
#define BS_OP_WRITE 2
@@ -45,18 +43,13 @@
#define BS_OP_DELETE 6
#define BS_OP_LIST 7
#define BS_OP_ROLLBACK 8
#define BS_OP_MAX 8
#define BS_OP_SYNC_STAB_ALL 9
#define BS_OP_MAX 9
#define BS_OP_PRIVATE_DATA_SIZE 256
/*
All operations may be submitted in any order, because reads only see completed writes,
syncs only sync completed writes and writes don't depend on each other.
The only restriction is that the external code MUST NOT submit multiple writes for one
object in parallel. This restriction comes naturally anyway, because `version` numbers are used.
Blockstore opcode documentation:
## BS_OP_READ / BS_OP_WRITE / BS_OP_WRITE_STABLE
@@ -121,6 +114,14 @@ Input:
Output:
- retval = 0 or negative error number (-ENOENT if no such version for stabilize)
## BS_OP_SYNC_STAB_ALL
ONLY FOR TESTS! Sync and mark all unstable object versions as stable, at once.
Input: Nothing except opcode
Output:
- retval = 0 or negative error number (-EINVAL)
## BS_OP_LIST
Get a list of all objects in this Blockstore.
@@ -144,10 +145,10 @@ Output:
*/
struct __attribute__ ((visibility("default"))) blockstore_op_t
struct blockstore_op_t
{
// operation
uint64_t opcode = 0;
uint64_t opcode;
// finish callback
std::function<void (blockstore_op_t*)> callback;
union __attribute__((__packed__))
@@ -171,9 +172,9 @@ struct __attribute__ ((visibility("default"))) blockstore_op_t
uint32_t list_stable_limit;
};
};
uint8_t *buf = NULL;
uint8_t *bitmap = NULL;
int retval = 0;
void *buf;
void *bitmap;
int retval;
uint8_t private_data[BS_OP_PRIVATE_DATA_SIZE];
};
@@ -182,11 +183,11 @@ typedef std::map<std::string, std::string> blockstore_config_t;
class blockstore_impl_t;
class __attribute__((visibility("default"))) blockstore_t
class blockstore_t
{
blockstore_impl_t *impl;
public:
blockstore_t(blockstore_config_t & config, ring_loop_i *ringloop, timerfd_manager_t *tfd);
blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
~blockstore_t();
// Update configuration
@@ -214,7 +215,10 @@ public:
int read_bitmap(object_id oid, uint64_t target_version, void *bitmap, uint64_t *result_version = NULL);
// Get per-inode space usage statistics
const std::map<uint64_t, uint64_t> & get_inode_space_stats();
std::map<uint64_t, uint64_t> & get_inode_space_stats();
// Set per-pool no_inode_stats
void set_no_inode_stats(const std::vector<uint64_t> & pool_ids);
// Print diagnostics to stdout
void dump_diagnostics();

View File

@@ -2,14 +2,11 @@
// License: VNPL-1.1 (see README.md for details)
#include <sys/file.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <stdexcept>
#include "blockstore.h"
#include "blockstore_impl.h"
#include "blockstore_disk.h"
#include "blockstore_heap.h"
#include "str_util.h"
#include "allocator.h"
@@ -47,11 +44,8 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
disk_alignment = parse_size(config["disk_alignment"]);
journal_block_size = parse_size(config["journal_block_size"]);
meta_block_size = parse_size(config["meta_block_size"]);
meta_block_target_free_space = parse_size(config["meta_block_target_free_space"]);
bitmap_granularity = parse_size(config["bitmap_granularity"]);
meta_format = stoull_full(config["meta_format"]);
atomic_write_size = (config.find("atomic_write_size") != config.end()
? parse_size(config["atomic_write_size"]) : 4096);
if (config.find("data_io") == config.end() &&
config.find("meta_io") == config.end() &&
config.find("journal_io") == config.end())
@@ -96,28 +90,12 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
if (!min_discard_size)
min_discard_size = 1024*1024;
discard_granularity = parse_size(config["discard_granularity"]);
inmemory_meta = config["inmemory_metadata"] != "false" && config["inmemory_metadata"] != "0" &&
config["inmemory_metadata"] != "no";
inmemory_journal = config["inmemory_journal"] != "false" && config["inmemory_journal"] != "0" &&
config["inmemory_journal"] != "no";
disable_data_fsync = config["disable_data_fsync"] == "true" || config["disable_data_fsync"] == "1" || config["disable_data_fsync"] == "yes";
disable_meta_fsync = config["disable_meta_fsync"] == "true" || config["disable_meta_fsync"] == "1" || config["disable_meta_fsync"] == "yes";
disable_journal_fsync = config["disable_journal_fsync"] == "true" || config["disable_journal_fsync"] == "1" || config["disable_journal_fsync"] == "yes";
if (mock_mode)
{
data_device_size = parse_size(config["data_device_size"]);
data_device_sect = parse_size(config["data_device_sect"]);
meta_device_size = parse_size(config["meta_device_size"]);
meta_device_sect = parse_size(config["meta_device_sect"]);
journal_device_size = parse_size(config["journal_device_size"]);
journal_device_sect = parse_size(config["journal_device_sect"]);
}
// Validate
if (!data_block_size)
{
data_block_size = (1 << DEFAULT_DATA_BLOCK_ORDER);
}
if (is_power_of_two(data_block_size) >= 64 || data_block_size < MIN_DATA_BLOCK_SIZE || data_block_size >= MAX_DATA_BLOCK_SIZE)
if ((block_order = is_power_of_two(data_block_size)) >= 64 || data_block_size < MIN_DATA_BLOCK_SIZE || data_block_size >= MAX_DATA_BLOCK_SIZE)
{
throw std::runtime_error("Bad block size");
}
@@ -149,17 +127,9 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
{
throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
}
else if (meta_block_size > MAX_DATA_BLOCK_SIZE)
else if (meta_block_size > MAX_META_BLOCK_SIZE)
{
throw std::runtime_error("meta_block_size must not exceed "+std::to_string(MAX_DATA_BLOCK_SIZE));
}
if (!meta_block_target_free_space)
{
meta_block_target_free_space = 800;
}
if (meta_block_target_free_space >= meta_block_size)
{
throw std::runtime_error("meta_block_target_free_space must not exceed "+std::to_string(meta_block_size));
throw std::runtime_error("meta_block_size must not exceed "+std::to_string(MAX_META_BLOCK_SIZE));
}
if (data_offset % disk_alignment)
{
@@ -209,29 +179,17 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
{
throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size));
}
if (!meta_format)
{
meta_format = BLOCKSTORE_META_FORMAT_HEAP;
}
if (meta_device == data_device)
{
disable_meta_fsync = disable_data_fsync;
}
if (journal_device == meta_device)
{
disable_journal_fsync = disable_meta_fsync;
}
}
void blockstore_disk_t::calc_lengths()
void blockstore_disk_t::calc_lengths(bool skip_meta_check)
{
// data
data_len = data_device_size - data_offset;
if (data_device == meta_device && data_offset < meta_offset)
if (data_fd == meta_fd && data_offset < meta_offset)
{
data_len = meta_offset - data_offset;
}
if (data_device == journal_device && data_offset < journal_offset)
if (data_fd == journal_fd && data_offset < journal_offset)
{
data_len = data_len < journal_offset-data_offset
? data_len : journal_offset-data_offset;
@@ -246,23 +204,23 @@ void blockstore_disk_t::calc_lengths()
data_len = cfg_data_size;
}
// meta
meta_area_size = (meta_device == data_device ? data_device_size : meta_device_size) - meta_offset;
if (meta_device == data_device && meta_offset <= data_offset)
uint64_t meta_area_size = (meta_fd == data_fd ? data_device_size : meta_device_size) - meta_offset;
if (meta_fd == data_fd && meta_offset <= data_offset)
{
meta_area_size = data_offset - meta_offset;
}
if (meta_device == journal_device && meta_offset <= journal_offset)
if (meta_fd == journal_fd && meta_offset <= journal_offset)
{
meta_area_size = meta_area_size < journal_offset-meta_offset
? meta_area_size : journal_offset-meta_offset;
}
// journal
journal_len = (journal_device == data_device ? data_device_size : (journal_device == meta_device ? meta_device_size : journal_device_size)) - journal_offset;
if (journal_device == data_device && journal_offset <= data_offset)
journal_len = (journal_fd == data_fd ? data_device_size : (journal_fd == meta_fd ? meta_device_size : journal_device_size)) - journal_offset;
if (journal_fd == data_fd && journal_offset <= data_offset)
{
journal_len = data_offset - journal_offset;
}
if (journal_device == meta_device && journal_offset <= meta_offset)
if (journal_fd == meta_fd && journal_offset <= meta_offset)
{
journal_len = journal_len < meta_offset-journal_offset
? journal_len : meta_offset-journal_offset;
@@ -272,37 +230,37 @@ void blockstore_disk_t::calc_lengths()
clean_entry_bitmap_size = data_block_size / bitmap_granularity / 8;
clean_dyn_size = clean_entry_bitmap_size*2 + (csum_block_size
? data_block_size/csum_block_size*(data_csum_type & 0xFF) : 0);
if (meta_format == BLOCKSTORE_META_FORMAT_HEAP)
clean_entry_size = sizeof(clean_disk_entry) + clean_dyn_size + 4 /*entry_csum*/;
meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
bool new_doesnt_fit = (!meta_format && !skip_meta_check && meta_area_size < meta_len && !data_csum_type);
if (meta_format == BLOCKSTORE_META_FORMAT_V1 || new_doesnt_fit)
{
uint32_t entries_per_block = ((meta_block_size-meta_block_target_free_space) /
(sizeof(heap_object_t) + sizeof(heap_write_t) + clean_dyn_size));
min_meta_len = (block_count+entries_per_block-1) / entries_per_block * meta_block_size;
}
else if (meta_format == BLOCKSTORE_META_FORMAT_V1)
{
clean_entry_size = 24 /*sizeof(clean_disk_entry)*/ + 2*clean_entry_bitmap_size;
min_meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size)
/ (meta_block_size / clean_entry_size)) * meta_block_size;
}
else if (meta_format == BLOCKSTORE_META_FORMAT_V2)
{
clean_entry_size = 24 /*sizeof(clean_disk_entry)*/ + clean_dyn_size + 4 /*entry_csum*/;
min_meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
uint64_t clean_entry_v0_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;
uint64_t meta_v0_len = (1 + (block_count - 1 + meta_block_size / clean_entry_v0_size)
/ (meta_block_size / clean_entry_v0_size)) * meta_block_size;
if (meta_format == BLOCKSTORE_META_FORMAT_V1 || meta_area_size >= meta_v0_len)
{
// Old metadata fits.
if (new_doesnt_fit)
{
printf("Warning: Using old metadata format without checksums because the new format"
" doesn't fit into provided area (%ju bytes required, %ju bytes available)\n", meta_len, meta_area_size);
}
clean_entry_size = clean_entry_v0_size;
meta_len = meta_v0_len;
meta_format = BLOCKSTORE_META_FORMAT_V1;
}
else
meta_format = BLOCKSTORE_META_FORMAT_V2;
}
else
meta_format = BLOCKSTORE_META_FORMAT_V2;
if (!skip_meta_check && meta_area_size < meta_len)
{
throw std::runtime_error("meta_format = "+std::to_string(meta_format)+" is not supported");
}
}
void blockstore_disk_t::check_lengths()
{
if (meta_area_size < min_meta_len)
{
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(min_meta_len)+" bytes, have only "+std::to_string(meta_area_size)+" bytes");
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes, have only "+std::to_string(meta_area_size)+" bytes");
}
// requested journal size
if (cfg_journal_size > journal_len)
if (!skip_meta_check && cfg_journal_size > journal_len)
{
throw std::runtime_error("Requested journal_size is too large");
}
@@ -363,19 +321,12 @@ static int bs_openmode(const std::string & mode)
void blockstore_disk_t::open_data()
{
if (data_fd >= 0)
{
throw std::runtime_error("data device is already opened");
}
data_fd = mock_mode ? MOCK_DATA_FD : open(data_device.c_str(), bs_openmode(data_io) | O_RDWR);
data_fd = open(data_device.c_str(), bs_openmode(data_io) | O_RDWR);
if (data_fd == -1)
{
throw std::runtime_error("Failed to open data device "+data_device+": "+std::string(strerror(errno)));
}
if (!mock_mode)
{
check_size(data_fd, &data_device_size, &data_device_sect, "data device");
}
check_size(data_fd, &data_device_size, &data_device_sect, "data device");
if (disk_alignment % data_device_sect)
{
throw std::runtime_error(
@@ -387,7 +338,7 @@ void blockstore_disk_t::open_data()
{
throw std::runtime_error("data_offset exceeds device size = "+std::to_string(data_device_size));
}
if (!mock_mode && !disable_flock && flock(data_fd, LOCK_EX|LOCK_NB) != 0)
if (!disable_flock && flock(data_fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock data device: ") + strerror(errno));
}
@@ -395,26 +346,19 @@ void blockstore_disk_t::open_data()
void blockstore_disk_t::open_meta()
{
if (meta_fd >= 0)
{
throw std::runtime_error("metadata device is already opened");
}
if (meta_device != data_device || meta_io != data_io)
{
meta_fd = mock_mode ? MOCK_META_FD : open(meta_device.c_str(), bs_openmode(meta_io) | O_RDWR);
meta_fd = open(meta_device.c_str(), bs_openmode(meta_io) | O_RDWR);
if (meta_fd == -1)
{
throw std::runtime_error("Failed to open metadata device "+meta_device+": "+std::string(strerror(errno)));
}
if (!mock_mode)
{
check_size(meta_fd, &meta_device_size, &meta_device_sect, "metadata device");
}
check_size(meta_fd, &meta_device_size, &meta_device_sect, "metadata device");
if (meta_offset >= meta_device_size)
{
throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_device_size));
}
if (!mock_mode && !disable_flock && meta_device != data_device && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
if (!disable_flock && meta_device != data_device && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno));
}
@@ -440,26 +384,15 @@ void blockstore_disk_t::open_meta()
void blockstore_disk_t::open_journal()
{
if (journal_fd >= 0)
{
throw std::runtime_error("journal device is already opened");
}
if (journal_device != meta_device || journal_io != meta_io)
{
journal_fd = mock_mode ? MOCK_JOURNAL_FD : open(journal_device.c_str(), bs_openmode(journal_io) | O_RDWR);
journal_fd = open(journal_device.c_str(), bs_openmode(journal_io) | O_RDWR);
if (journal_fd == -1)
{
throw std::runtime_error("Failed to open journal device "+journal_device+": "+std::string(strerror(errno)));
}
if (!mock_mode)
{
check_size(journal_fd, &journal_device_size, &journal_device_sect, "journal device");
}
if (journal_offset >= journal_device_size)
{
throw std::runtime_error("journal_offset exceeds device size = "+std::to_string(journal_device_size));
}
if (!mock_mode && !disable_flock && journal_device != meta_device && flock(journal_fd, LOCK_EX|LOCK_NB) != 0)
check_size(journal_fd, &journal_device_size, &journal_device_sect, "journal device");
if (!disable_flock && journal_device != meta_device && flock(journal_fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock journal device: ") + strerror(errno));
}
@@ -485,32 +418,25 @@ void blockstore_disk_t::open_journal()
void blockstore_disk_t::close_all()
{
if (!mock_mode)
{
if (data_fd >= 0)
close(data_fd);
if (meta_fd >= 0 && meta_fd != data_fd)
close(meta_fd);
if (journal_fd >= 0 && journal_fd != meta_fd)
close(journal_fd);
}
if (data_fd >= 0)
close(data_fd);
if (meta_fd >= 0 && meta_fd != data_fd)
close(meta_fd);
if (journal_fd >= 0 && journal_fd != meta_fd)
close(journal_fd);
data_fd = meta_fd = journal_fd = -1;
}
// Sadly DISCARD only works through ioctl(), but it seems to always block the device queue,
// so it's not a big deal that we can only run it synchronously.
int blockstore_disk_t::trim_data(std::function<bool(uint64_t)> is_free)
int blockstore_disk_t::trim_data(allocator_t *alloc)
{
if (mock_mode)
{
return -EINVAL;
}
int r = 0;
uint64_t j = 0, i = 0;
uint64_t discarded = 0;
for (; i <= block_count; i++)
{
if (i >= block_count || is_free(i))
if (i >= block_count || alloc->get(i))
{
if (i > j && (i-j)*data_block_size >= min_discard_size)
{

View File

@@ -12,10 +12,6 @@
// Lower byte of checksum type is its length
#define BLOCKSTORE_CSUM_CRC32C 0x104
#define MOCK_DATA_FD 1000
#define MOCK_META_FD 1001
#define MOCK_JOURNAL_FD 1002
class allocator_t;
struct blockstore_disk_t
@@ -26,15 +22,11 @@ struct blockstore_disk_t
// Required write alignment and journal/metadata/data areas' location alignment
uint32_t disk_alignment = 4096;
// Journal block size - minimum_io_size of the journal device is the best choice
uint32_t journal_block_size = 4096;
uint64_t journal_block_size = 4096;
// Metadata block size - minimum_io_size of the metadata device is the best choice
uint32_t meta_block_size = 4096;
// Atomic write size of the data block device
uint32_t atomic_write_size = 4096;
// Target free space in metadata blocks
uint32_t meta_block_target_free_space = 800;
uint64_t meta_block_size = 4096;
// Sparse write tracking granularity. 4 KB is a good choice. Must be a multiple of disk_alignment
uint32_t bitmap_granularity = 4096;
uint64_t bitmap_granularity = 4096;
// Data checksum type, BLOCKSTORE_CSUM_NONE or BLOCKSTORE_CSUM_CRC32C
uint32_t data_csum_type = BLOCKSTORE_CSUM_NONE;
// Checksum block size, must be a multiple of bitmap_granularity
@@ -44,36 +36,27 @@ struct blockstore_disk_t
// I/O modes for data, metadata and journal: direct or "" = O_DIRECT, cached = O_SYNC, directsync = O_DIRECT|O_SYNC
// O_SYNC without O_DIRECT = use Linux page cache for reads and writes
std::string data_io, meta_io, journal_io;
// It is safe to disable fsync() if drive write cache is writethrough
bool disable_data_fsync = false, disable_meta_fsync = false, disable_journal_fsync = false;
// Keep metadata in memory?
bool inmemory_meta = true;
// Keep journal (buffered data) in memory?
bool inmemory_journal = true;
// Data discard granularity and minimum size (for the sake of performance)
bool discard_on_start = false;
uint64_t min_discard_size = 1024*1024;
uint64_t discard_granularity = 0;
int meta_fd = -1, data_fd = -1, journal_fd = -1;
uint64_t meta_offset, meta_device_sect, meta_device_size, meta_area_size, min_meta_len, meta_format = 0;
uint64_t meta_offset, meta_device_sect, meta_device_size, meta_len, meta_format = 0;
uint64_t data_offset, data_device_sect, data_device_size, data_len;
uint64_t journal_offset, journal_device_sect, journal_device_size, journal_len;
uint32_t block_order = 0;
uint64_t block_count = 0;
uint32_t clean_entry_bitmap_size = 0;
uint32_t clean_entry_size = 0, clean_dyn_size = 0; // for meta_v1/2
bool mock_mode = false;
uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0, clean_dyn_size = 0;
void parse_config(std::map<std::string, std::string> & config);
void open_data();
void open_meta();
void open_journal();
void calc_lengths();
void check_lengths();
void calc_lengths(bool skip_meta_check = false);
void close_all();
int trim_data(std::function<bool(uint64_t)> is_free);
int trim_data(allocator_t *alloc);
inline uint64_t dirty_dyn_size(uint64_t offset, uint64_t len)
{

File diff suppressed because it is too large Load Diff

View File

@@ -1,20 +1,22 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#define COPY_BUF_JOURNAL 0x01
#define COPY_BUF_DATA 0x02
#define COPY_BUF_ZERO 0x04
#define COPY_BUF_CSUM_FILL 0x08
#define COPY_BUF_COALESCED 0x10
#define COPY_BUF_PADDED 0x20
#define COPY_BUF_SKIP_CSUM 0x40
#define COPY_BUF_JOURNAL 1
#define COPY_BUF_DATA 2
#define COPY_BUF_ZERO 4
#define COPY_BUF_CSUM_FILL 8
#define COPY_BUF_COALESCED 16
#define COPY_BUF_META_BLOCK 32
#define COPY_BUF_JOURNALED_BIG 64
struct copy_buffer_t
{
uint32_t copy_flags;
uint64_t offset, len, disk_loc, disk_offset, disk_len;
uint8_t *buf;
uint64_t wr_lsn;
int copy_flags;
uint64_t offset, len, disk_offset;
uint64_t journal_sector; // only for reads: sector+1 if used and !journal.inmemory, otherwise 0
void *buf;
uint8_t *csum_buf;
int *dyn_data;
};
struct meta_sector_t
@@ -25,6 +27,13 @@ struct meta_sector_t
int usage_count;
};
// One entry of the flusher's sync list: journal_flusher_t keeps these in
// std::list<flusher_sync_t> syncs, and each journal_flusher_co refers to one
// through its cur_sync iterator.
struct flusher_sync_t
{
    // NOTE(review): presumably true when the metadata device also has to be fsynced - confirm
    bool fsync_meta;
    // NOTE(review): presumably the number of flushers attached to this sync - confirm
    int ready_count;
    // NOTE(review): progress marker of the sync; the exact values are not visible here
    int state;
};
struct flusher_meta_write_t
{
uint64_t sector, pos;
@@ -40,75 +49,93 @@ class journal_flusher_co
{
blockstore_impl_t *bs;
journal_flusher_t *flusher;
int co_id;
int wait_state, wait_count;
int wait_state, wait_count, wait_journal_count;
struct io_uring_sqe *sqe;
struct ring_data_t *data;
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
std::list<flusher_sync_t>::iterator cur_sync;
object_id cur_oid;
uint64_t copy_id;
uint64_t compact_lsn;
uint64_t cur_version;
heap_object_t *cur_obj;
heap_write_t *begin_wr, *end_wr;
uint32_t modified_block;
bool should_repeat;
obj_ver_id cur;
std::map<obj_ver_id, dirty_entry>::iterator dirty_it, dirty_start, dirty_end;
std::map<object_id, uint64_t>::iterator repeat_it;
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_rj, simple_callback_w;
std::vector<copy_buffer_t> read_vec;
uint32_t overwrite_start, overwrite_end;
uint32_t big_start, big_end;
int i, res;
bool read_to_fill_incomplete;
bool try_trim = false;
bool skip_copy, has_delete, has_writes;
std::vector<copy_buffer_t> v;
std::vector<copy_buffer_t>::iterator it;
int i;
bool fill_incomplete, cleared_incomplete;
int read_to_fill_incomplete;
int copy_count;
uint64_t clean_loc;
uint64_t clean_loc, clean_ver, old_clean_loc, old_clean_ver;
flusher_meta_write_t meta_old, meta_new;
bool do_repeat = false;
bool clean_init_bitmap;
uint64_t clean_bitmap_offset, clean_bitmap_len;
uint8_t *clean_init_dyn_ptr;
uint8_t *new_clean_bitmap;
uint64_t new_trim_pos;
friend class journal_flusher_t;
void iterate_checksum_holes(std::function<void(int & pos, uint32_t hole_start, uint32_t hole_end)> cb);
void fill_partial_checksum_blocks();
void scan_dirty();
bool read_dirty(int wait_base);
bool modify_meta_do_reads(int wait_base);
bool wait_meta_reads(int wait_base);
bool modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base);
bool clear_incomplete_csum_block_bits(int wait_base);
void calc_block_checksums(uint32_t *new_data_csums, bool skip_overwrites);
void update_metadata_entry();
bool write_meta_block(flusher_meta_write_t & meta_block, int wait_base);
void update_clean_db();
void free_data_blocks();
bool fsync_batch(bool fsync_meta, int wait_base);
bool trim_journal(int wait_base);
void free_buffers();
int check_and_punch_checksums();
bool calc_block_checksums();
bool write_meta_block(int wait_base);
bool read_buffered(int wait_base);
bool fsync_meta(int wait_base);
int fsync_buffer(int wait_base);
bool trim_lsn(int wait_base);
public:
journal_flusher_co();
~journal_flusher_co();
bool loop();
};
// Journal flusher itself
class journal_flusher_t
{
int force_start = 0;
int min_flusher_count = 0, max_flusher_count = 0, cur_flusher_count = 0, target_flusher_count = 0;
int trim_wanted = 0;
bool dequeuing;
int min_flusher_count, max_flusher_count, cur_flusher_count, target_flusher_count;
int flusher_start_threshold;
journal_flusher_co *co;
blockstore_impl_t *bs;
friend class journal_flusher_co;
int advance_lsn_counter = 0;
uint64_t compact_counter = 0;
int journal_trim_counter;
bool trimming;
void* journal_superblock;
int active_flushers = 0;
int wanting_meta_fsync = 0;
bool fsyncing_meta = false;
int syncing_buffer = 0;
int active_flushers;
int syncing_flushers;
std::list<flusher_sync_t> syncs;
std::map<object_id, uint64_t> sync_to_repeat;
std::map<uint64_t, meta_sector_t> meta_sectors;
std::deque<object_id> flush_queue;
std::map<object_id, uint64_t> flush_versions; // FIXME: consider unordered_map?
bool try_find_older(std::map<obj_ver_id, dirty_entry>::iterator & dirty_end, obj_ver_id & cur);
bool try_find_other(std::map<obj_ver_id, dirty_entry>::iterator & dirty_end, obj_ver_id & cur);
public:
journal_flusher_t(blockstore_impl_t *bs);
~journal_flusher_t();
void loop();
int get_syncing_buffer();
uint64_t get_compact_counter();
bool is_trim_wanted() { return trim_wanted; }
bool is_active();
void mark_trim_possible();
void request_trim();
void release_trim();
void enqueue_flush(obj_ver_id oid);
void unshift_flush(obj_ver_id oid, bool force);
void remove_flush(object_id oid);
void dump_diagnostics();
bool is_mutated(uint64_t clean_loc);
};

File diff suppressed because it is too large Load Diff

View File

@@ -1,376 +0,0 @@
// Metadata storage version 3 ("heap")
// Copyright (c) Vitaliy Filippov, 2025+
// License: VNPL-1.1 (see README.md for details)
#pragma once
#include <map>
#include <unordered_map>
#include <set>
#include <deque>
#include <vector>
#include "../client/object_id.h"
#include "../util/robin_hood.h"
#include "blockstore_disk.h"
#include "multilist.h"
// Pair of per-pool numbers: PG count and PG stripe size.
// NOTE(review): presumably used to map an object to its PG/shard - confirm against the users of this struct.
struct pool_shard_settings_t
{
    // Number of PGs
    uint32_t pg_count;
    // PG stripe size (units are not visible from this header)
    uint32_t pg_stripe_size;
};
// Mask applied to entry_type to extract the entry kind (see heap_write_t::type())
#define BS_HEAP_TYPE 7
// Entry kinds; all of them fit inside the BS_HEAP_TYPE mask
#define BS_HEAP_OBJECT 1
#define BS_HEAP_SMALL_WRITE 2
#define BS_HEAP_BIG_WRITE 3
#define BS_HEAP_TOMBSTONE 4
#define BS_HEAP_INTENT_WRITE 5
// Bit outside of the BS_HEAP_TYPE mask
// NOTE(review): presumably ORed into entry_type to mark an entry as stable - confirm
#define BS_HEAP_STABLE 8
class blockstore_heap_t;
// "Small write" view of a write entry: heap_write_t::small() reinterprets
// a heap_write_t as this struct. The first five fields line up with the
// heap_write_t header (flags sits where heap_write_t keeps entry_type);
// location, offset and len come right after that header.
// The struct is packed, so no padding is inserted between the fields.
struct __attribute__((__packed__)) heap_small_write_t
{
    // --- same positions as the heap_write_t header ---
    uint16_t size;
    int16_t next_pos;
    uint8_t flags;
    uint64_t lsn;
    uint64_t version;
    // --- small-write specific part ---
    // NOTE(review): presumably where the written data is stored - confirm
    uint64_t location;
    // NOTE(review): presumably the written range inside the object - confirm
    uint32_t offset;
    uint32_t len;
};
// "Big write" view of a write entry: heap_write_t::big() reinterprets
// a heap_write_t as this struct. The first five fields line up with the
// heap_write_t header (flags sits where heap_write_t keeps entry_type);
// block_num comes right after that header.
// The struct is packed, so no padding is inserted between the fields.
struct __attribute__((__packed__)) heap_big_write_t
{
    // --- same positions as the heap_write_t header ---
    uint16_t size;
    int16_t next_pos;
    uint8_t flags;
    uint64_t lsn;
    uint64_t version;
    // --- big-write specific part ---
    // NOTE(review): presumably the index of the data block holding this write;
    // heap_write_t declares big_location()/set_big_location() which likely convert it - confirm
    uint32_t block_num;
};
// Write entry that carries nothing beyond the common header: the fields are
// the same, in the same order, as the data members of heap_write_t
// (with flags in place of entry_type).
// NOTE(review): presumably the layout of BS_HEAP_TOMBSTONE entries - confirm.
// The struct is packed, so no padding is inserted between the fields.
struct __attribute__((__packed__)) heap_tombstone_t
{
    uint16_t size;
    int16_t next_pos;
    uint8_t flags;
    uint64_t lsn;
    uint64_t version;
};
// Common header of a write entry. The kind-specific views (heap_small_write_t,
// heap_big_write_t) are obtained by reinterpreting this very object, see
// small() and big() below. The struct is packed, so no padding is inserted
// between the fields.
struct __attribute__((__packed__)) heap_write_t
{
    // size should have top bit cleared
    uint16_t size = 0;
    // NOTE(review): presumably a relative position of the next entry, consumed by next() - confirm
    int16_t next_pos = 0;
    // BS_HEAP_*; the kind is extracted with the BS_HEAP_TYPE mask, see type()
    uint8_t entry_type = 0;
    uint64_t lsn = 0;
    uint64_t version = 0;
    // The header is followed by variable-length data:
    // uint8_t[] external_bitmap
    // uint8_t[] internal_bitmap
    // uint32_t[] checksums

    // Defined outside of this header
    heap_write_t *next();

    // Entry kind: entry_type with everything outside of BS_HEAP_TYPE masked off
    uint8_t type() const
    {
        return entry_type & BS_HEAP_TYPE;
    }

    // View this entry as a small write (no check of type() is performed here)
    heap_small_write_t& small()
    {
        return *reinterpret_cast<heap_small_write_t*>(this);
    }

    // View this entry as a big write (no check of type() is performed here)
    heap_big_write_t& big()
    {
        return *reinterpret_cast<heap_big_write_t*>(this);
    }

    // The remaining members are only declared here; their definitions are not
    // part of this header.
    uint32_t get_size(blockstore_heap_t *heap);
    uint32_t get_csum_size(blockstore_heap_t *heap);
    bool needs_recheck(blockstore_heap_t *heap);
    bool needs_compact(blockstore_heap_t *heap);
    bool is_compacted(uint64_t compacted_lsn);
    bool can_be_collapsed(blockstore_heap_t *heap);
    bool is_allowed_before_compacted(uint64_t compacted_lsn, bool is_last_entry);
    uint8_t *get_ext_bitmap(blockstore_heap_t *heap);
    uint8_t *get_int_bitmap(blockstore_heap_t *heap);
    uint8_t *get_checksums(blockstore_heap_t *heap);
    uint32_t *get_checksum(blockstore_heap_t *heap);
    uint64_t big_location(blockstore_heap_t *heap);
    void set_big_location(blockstore_heap_t *heap, uint64_t location);
};
// Packed header of an object entry in the "heap" metadata format.
// Identifies the object by (inode, stripe) and points to its write entries via write_pos.
// get_writes() and calc_crc32c() are only declared here; their bodies live elsewhere.
struct __attribute__((__packed__)) heap_object_t
{
// size should have top bit cleared
uint16_t size = 0;
// linked list of write entries...
// newest entries are stored first to simplify scanning
int16_t write_pos = 0;
uint8_t entry_type = 0; // BS_HEAP_*
// NOTE(review): presumably holds the value produced by calc_crc32c() - confirm which bytes it covers
uint32_t crc32c = 0;
uint64_t inode = 0;
uint64_t stripe = 0;
heap_write_t *get_writes();
uint32_t calc_crc32c();
};
// Pair of an object ID and an LSN; ordered by oid first, then by lsn (see operator < below).
struct heap_object_lsn_t
{
object_id oid;
uint64_t lsn;
};
// Lexicographic ordering of heap_object_lsn_t: object ID first, LSN as the tie-breaker.
inline bool operator < (const heap_object_lsn_t & a, const heap_object_lsn_t & b)
{
    if (a.oid < b.oid)
    {
        return true;
    }
    // Only equal object IDs fall through to the LSN comparison
    return (a.oid == b.oid) && (a.lsn < b.lsn);
}
// Element type of blockstore_heap_t::tmp_compact_queue: an object, an LSN and a flag.
// NOTE(review): `compact` presumably tells whether the object needs compaction - confirm.
struct tmp_compact_item_t
{
object_id oid;
uint64_t lsn;
bool compact;
};
// Key of heap_mvcc_map_t: an object ID plus a copy ID.
// Equality and hashing for it are defined right below.
struct heap_mvcc_copy_id_t
{
object_id oid;
uint64_t copy_id;
};
// Two MVCC copy IDs are equal when the copy ID and both object ID parts (inode, stripe) match.
inline bool operator == (const heap_mvcc_copy_id_t & a, const heap_mvcc_copy_id_t & b)
{
    if (a.copy_id != b.copy_id)
    {
        return false;
    }
    return a.oid.inode == b.oid.inode && a.oid.stripe == b.oid.stripe;
}
namespace std
{
    // Hash for heap_mvcc_copy_id_t: hashes the object ID with std::hash<object_id>
    // and then mixes the copy ID into it.
    template<> struct hash<heap_mvcc_copy_id_t>
    {
        inline size_t operator()(const heap_mvcc_copy_id_t &s) const
        {
            const size_t oid_hash = std::hash<object_id>()(s.oid);
            // Mixing step copy-pasted from spp::hash_combine()
            const size_t mixed = (s.copy_id + 0xc6a4a7935bd1e995 + (oid_hash << 6) + (oid_hash >> 2));
            return oid_hash ^ mixed;
        }
    };
}
// Value of heap_mvcc_map_t: a reader counter and a pointer to a copy of the object entry.
// NOTE(review): ownership of entry_copy is not visible here - presumably released by
// blockstore_heap_t::free_mvcc(); confirm.
struct heap_object_mvcc_t
{
uint32_t readers = 0;
heap_object_t *entry_copy = NULL;
};
// Per-block bookkeeping record; blockstore_heap_t keeps a vector of these (block_info).
// NOTE(review): the struct is packed and holds a pointer after two uint32_t fields;
// used_space/free_pos are presumably byte counts/offsets inside `data` - confirm.
struct __attribute__((__packed__)) heap_block_info_t
{
uint32_t used_space = 0;
uint32_t free_pos = 0;
uint8_t *data = NULL;
};
// Element of blockstore_heap_t::inflight_lsn (the LSN queue): an object ID and a flags word.
// The LSN itself is not stored in the element.
struct heap_inflight_lsn_t
{
object_id oid;
uint64_t flags;
};
// Element of blockstore_heap_t::overwrite_ref_queue.
// NOTE(review): presumably describes a data (is_data=true) or buffer (is_data=false) area
// overwritten at `lsn`, to be dereferenced later (see deref_overwrites()) - confirm.
struct heap_refqi_t
{
uint64_t lsn;
uint64_t inode;
uint64_t location;
uint32_t len;
bool is_data;
};
using i64hash_t = robin_hood::hash<uint64_t>;
using heap_block_index_t = robin_hood::unordered_flat_map<uint64_t,
robin_hood::unordered_flat_map<inode_t, robin_hood::unordered_flat_map<uint64_t, uint64_t, i64hash_t, std::equal_to<uint64_t>, 88>, i64hash_t>, i64hash_t>;
using heap_mvcc_map_t = robin_hood::unordered_flat_map<heap_mvcc_copy_id_t, heap_object_mvcc_t>;
// In-memory manager of the "heap" (version 3) metadata store.
// Its state includes: the block index (PG => inode => stripe => block number), per-block
// info records, data/metadata/buffer allocators, MVCC copies of object entries, space usage
// counters and the LSN queue used to track in-flight, completed, fsynced and compacted writes.
// Only declarations are present here; all method bodies live elsewhere.
// NOTE(review): std::function is used below, but <functional> is not among the headers
// included directly by this file - it is presumably pulled in transitively; confirm.
class blockstore_heap_t
{
friend class heap_write_t;
friend class heap_object_t;
blockstore_disk_t *dsk = NULL;
uint8_t* buffer_area = NULL;
bool abort_on_corruption = false;
bool abort_on_overlap = true;
int log_level = 0;
const uint32_t meta_block_count = 0;
uint32_t target_block_free_space = 800;
uint64_t next_lsn = 0;
robin_hood::unordered_flat_map<pool_id_t, pool_shard_settings_t> pool_shard_settings;
// PG => inode => stripe => block number
heap_block_index_t block_index;
std::vector<heap_block_info_t> block_info;
allocator_t *data_alloc = NULL;
multilist_index_t *meta_alloc = NULL;
uint32_t meta_alloc_count = 0;
uint64_t meta_used_space = 0;
multilist_alloc_t *buffer_alloc = NULL;
heap_mvcc_map_t object_mvcc;
std::unordered_map<uint64_t, uint32_t> mvcc_data_refs;
std::unordered_map<uint64_t, uint32_t> mvcc_buffer_refs;
std::map<uint64_t, uint64_t> inode_space_stats;
uint64_t buffer_area_used_space = 0;
uint64_t data_used_space = 0;
// LSN queue: inflight (writing) -> completed [-> fsynced] -> compactable -> compacted [-> fsynced] -> trimmed and removed
std::deque<heap_inflight_lsn_t> inflight_lsn;
uint32_t to_compact_count = 0;
uint64_t first_inflight_lsn = 0;
uint64_t completed_lsn = 0;
uint64_t fsynced_lsn = 0;
uint64_t compacted_lsn = 0;
uint64_t next_compact_lsn = 0;
std::deque<heap_refqi_t> overwrite_ref_queue;
std::vector<tmp_compact_item_t> tmp_compact_queue;
std::deque<object_id> recheck_queue;
int recheck_in_progress = 0;
bool in_recheck = false;
std::function<void(bool is_data, uint64_t offset, uint64_t len, uint8_t* buf, std::function<void()>)> recheck_cb;
int recheck_queue_depth = 0;
// const member without a default initializer: has to be set in the constructor's initializer list
const uint32_t max_write_entry_size;
// private helpers
uint64_t get_pg_id(inode_t inode, uint64_t stripe);
void defragment_block(uint32_t block_num);
uint32_t find_block_run(heap_block_info_t & block, uint32_t space);
uint32_t find_block_space(uint32_t block_num, uint32_t space);
uint32_t block_has_compactable(uint8_t *data);
uint32_t compact_object_to(heap_object_t *obj, uint64_t lsn, uint8_t *new_csums, bool do_free);
void copy_full_object(uint8_t *dst, heap_object_t *obj);
bool mvcc_save_copy(heap_object_t *obj);
bool mvcc_check_tracking(object_id oid);
void free_mvcc(heap_mvcc_map_t::iterator mvcc_it);
void allocate_block(heap_block_info_t & inf);
int allocate_new_object(object_id oid, uint32_t full_object_size, uint32_t *modified_block, heap_object_t **new_obj);
int add_object(object_id oid, heap_write_t *wr, uint32_t *modified_block);
void mark_overwritten(uint64_t over_lsn, uint64_t inode, heap_write_t *wr, heap_write_t *end_wr, bool tracking_active);
int update_object(uint32_t block_num, heap_object_t *obj, heap_write_t *wr, uint32_t *modified_block, uint32_t *moved_from_block);
void init_erase(uint32_t block_num, heap_object_t *obj);
void erase_object(uint32_t block_num, heap_object_t *obj, uint64_t lsn, bool tracking_active);
void reindex_block(uint32_t block_num, heap_object_t *from_obj);
void erase_block_index(inode_t inode, uint64_t stripe);
void deref_data(uint64_t inode, uint64_t location, bool free_at_0);
void deref_buffer(uint64_t inode, uint64_t location, uint32_t len, bool free_at_0);
void deref_overwrites(uint64_t lsn);
void free_object_space(inode_t inode, heap_write_t *from, heap_write_t *to, int mode = 0);
void add_used_space(uint32_t block_num, int32_t used_delta);
void push_inflight_lsn(object_id oid, uint64_t lsn, uint64_t flags);
public:
blockstore_heap_t(blockstore_disk_t *dsk, uint8_t *buffer_area, int log_level = 0);
~blockstore_heap_t();
// set initially compacted lsn - should be done before loading
void set_compacted_lsn(uint64_t compacted_lsn);
uint64_t get_compacted_lsn();
// load data from the disk, returns count of loaded write entries
// NOTE(review): read_blocks() is declared void, so the "returns count" part can only
// apply to load_blocks() - confirm and split the comment if so
void read_blocks(uint64_t disk_offset, uint64_t size, uint8_t *buf,
std::function<void(heap_object_t*)> handle_object, std::function<void(uint32_t, uint32_t, uint8_t*)> handle_block);
uint64_t load_blocks(uint64_t disk_offset, uint64_t size, uint8_t *buf);
// finish loading
void finish_load();
// recheck small write data after reading the database from disk
bool recheck_small_writes(std::function<void(bool is_data, uint64_t offset, uint64_t len, uint8_t* buf, std::function<void()>)> read_buffer, int queue_depth);
// initialize metadata area (fill it with empty data)
// returns 0 when done, EAGAIN when the caller has to wait more
int initialize();
// read from the metadata area
// returns 0 when done, EAGAIN when the caller has to wait more
int read();
// reshard database according to the pool's PG count
void reshard(pool_id_t pool, uint32_t pg_count, uint32_t pg_stripe_size);
// read an object entry and lock it against removal
// in the future, may become asynchronous
heap_object_t *lock_and_read_entry(object_id oid, uint64_t & copy_id);
// re-read a locked object entry with the given lsn (pointer may be invalidated)
// NOTE(review): the parameter is named copy_id while the comment above says "lsn" - confirm which is meant
heap_object_t *read_locked_entry(object_id oid, uint64_t copy_id);
// read an object entry without locking it
heap_object_t *read_entry(object_id oid, uint32_t *block_num_ptr, bool for_update = false);
// unlock an entry
bool unlock_entry(object_id oid, uint64_t copy_id);
// set or verify checksums in a write request
bool calc_checksums(heap_write_t *wr, uint8_t *data, bool set, uint32_t offset = 0, uint32_t len = 0);
// set or verify raw block checksums
bool calc_block_checksums(uint32_t *block_csums, uint8_t *data, uint8_t *bitmap, uint32_t start, uint32_t end,
bool set, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
// same, but the data is obtained piece by piece through the `next` callback
bool calc_block_checksums(uint32_t *block_csums, uint8_t *bitmap,
uint32_t start, uint32_t end, std::function<uint8_t*(uint32_t start, uint32_t & len)> next,
bool set, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
// copy an object as is
int copy_object(heap_object_t *obj, uint32_t *modified_block);
// auto-compacts the object, then adds a write entry to it and to the compaction queue
// return 0 if OK, or maybe ENOSPC
int post_write(object_id oid, heap_write_t *wr, uint32_t *modified_block, uint32_t *moved_from_block);
int post_write(uint32_t & block_num, object_id oid, heap_object_t *obj, heap_write_t *wr, uint32_t *moved_from_block);
// stabilize an unstable object version
// return 0 if OK, ENOENT if not exists
int post_stabilize(object_id oid, uint64_t version, uint32_t *modified_block, uint64_t *new_lsn, uint64_t *new_to_lsn);
// rollback an unstable object version
// return 0 if OK, ENOENT if not exists, EBUSY if already stable
int post_rollback(object_id oid, uint64_t version, uint64_t *new_lsn, uint32_t *modified_block);
// forget an object
// return error code
int post_delete(object_id oid, uint64_t *new_lsn, uint32_t *modified_block);
int post_delete(uint32_t block_num, heap_object_t *obj, uint64_t *new_lsn);
// get the next object to compact
// guaranteed to return objects in min lsn order
// returns 0 if OK, ENOENT if nothing to compact
int get_next_compact(object_id & oid);
// get the range of an object eligible for compaction
void get_compact_range(heap_object_t *obj, uint64_t max_lsn, heap_write_t **begin_wr, heap_write_t **end_wr);
// mark an object as compacted up to the given lsn
int compact_object(object_id oid, uint64_t lsn, uint8_t *new_csums);
// retrieve object listing from a PG
int list_objects(uint32_t pg_num, object_id min_oid, object_id max_oid,
obj_ver_id **result_list, size_t *stable_count, size_t *unstable_count);
// set a block number for a new object and returns error status: 0, EAGAIN or ENOSPC
int get_block_for_new_object(uint32_t & out_block_num, uint32_t size = 0);
// inflight write tracking
void mark_lsn_completed(uint64_t lsn);
void mark_lsn_fsynced(uint64_t lsn);
void mark_lsn_compacted(uint64_t lsn, bool allow_undone = false);
void mark_object_compacted(heap_object_t *obj, uint64_t max_lsn);
void mark_lsn_trimmed(uint64_t lsn);
uint64_t get_completed_lsn();
uint64_t get_fsynced_lsn();
// data device block allocator functions
uint64_t find_free_data();
bool is_data_used(uint64_t location);
void use_data(inode_t inode, uint64_t location);
void free_data(inode_t inode, uint64_t location);
// buffer device allocator functions
uint64_t find_free_buffer_area(uint64_t size);
bool is_buffer_area_free(uint64_t location, uint64_t size);
void use_buffer_area(inode_t inode, uint64_t location, uint64_t size);
void free_buffer_area(inode_t inode, uint64_t location, uint64_t size);
uint64_t get_buffer_area_used_space();
// get metadata block data buffer and used space
uint8_t *get_meta_block(uint32_t block_num);
uint32_t get_meta_block_used_space(uint32_t block_num);
// get space usage statistics
uint64_t get_data_used_space();
const std::map<uint64_t, uint64_t> & get_inode_space_stats();
uint64_t get_meta_total_space();
uint64_t get_meta_used_space();
uint32_t get_meta_nearfull_blocks();
uint32_t get_inflight_queue_size();
uint32_t get_compact_queue_size();
uint32_t get_to_compact_count();
// get maximum size for a temporary heap_write_t buffer
uint32_t get_max_write_entry_size();
// only for tests
void set_abort_on_corruption(bool fail);
void set_abort_on_overlap(bool fail);
};

View File

@@ -1,17 +1,13 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include <stdexcept>
#include "blockstore_impl.h"
#include "crc32c.h"
blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_i *ringloop, timerfd_manager_t *tfd, bool mock_mode)
blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd)
{
assert(sizeof(blockstore_op_private_t) <= BS_OP_PRIVATE_DATA_SIZE);
this->tfd = tfd;
this->ringloop = ringloop;
dsk.mock_mode = mock_mode;
ring_consumer.loop = [this]() { loop(); };
ringloop->register_consumer(&ring_consumer);
initialized = 0;
@@ -21,43 +17,33 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_i *
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
dsk.calc_lengths();
calc_lengths();
alloc_dyn_data = dsk.clean_dyn_size > sizeof(void*) || dsk.csum_block_size > 0;
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.data_block_size);
data_alloc = new allocator_t(dsk.block_count);
}
catch (std::exception & e)
{
dsk.close_all();
throw;
}
memset(zero_object, 0, dsk.data_block_size);
meta_superblock = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.meta_block_size);
memset(meta_superblock, 0, dsk.meta_block_size);
}
// Creates the flusher and the heap metadata store.
// The statement order matters: buffer_area has to be decided before it is handed to the heap.
void blockstore_impl_t::init()
{
flusher = new journal_flusher_t(this);
if (dsk.inmemory_journal)
{
// in-memory journal mode: allocate an aligned buffer of dsk.journal_len bytes
buffer_area = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.journal_len);
}
// buffer_area is passed as is; it is only assigned above when dsk.inmemory_journal is set
heap = new blockstore_heap_t(&dsk, buffer_area, log_level);
}
blockstore_impl_t::~blockstore_impl_t()
{
if (flusher)
delete flusher;
if (heap)
delete heap;
if (buffer_area)
free(buffer_area);
if (meta_superblock)
free(meta_superblock);
delete data_alloc;
delete flusher;
if (zero_object)
free(zero_object);
ringloop->unregister_consumer(&ring_consumer);
dsk.close_all();
if (metadata_buffer)
free(metadata_buffer);
if (clean_bitmaps)
free(clean_bitmaps);
if (heap_meta.blocks)
delete[] heap_meta.blocks;
}
bool blockstore_impl_t::is_started()
@@ -73,9 +59,10 @@ bool blockstore_impl_t::is_stalled()
// main event loop - produce requests
void blockstore_impl_t::loop()
{
// FIXME: initialized == 10 is ugly
if (initialized != 10)
{
// read metadata
// read metadata, then journal
if (initialized == 0)
{
metadata_init_reader = new blockstore_init_meta(this);
@@ -88,41 +75,69 @@ void blockstore_impl_t::loop()
{
delete metadata_init_reader;
metadata_init_reader = NULL;
journal_init_reader = new blockstore_init_journal(this);
initialized = 2;
}
}
if (initialized == 2)
{
int res = journal_init_reader->loop();
if (!res)
{
delete journal_init_reader;
journal_init_reader = NULL;
initialized = 3;
ringloop->wakeup();
}
}
if (initialized == 3)
{
if (!readonly && dsk.discard_on_start)
dsk.trim_data(data_alloc);
if (journal.flush_journal)
initialized = 4;
else
initialized = 10;
}
if (initialized == 4)
{
if (readonly)
{
dsk.trim_data([this](uint64_t block_num){ return heap->is_data_used(block_num * dsk.data_block_size); });
printf("Can't flush the journal in readonly mode\n");
exit(1);
}
initialized = 10;
flusher->loop();
ringloop->submit();
}
}
else
{
// try to submit ops
unsigned initial_ring_space = ringloop->space_left();
int op_idx = 0, new_idx = 0;
bool has_unfinished_writes = false;
// has_writes == 0 - no writes before the current queue item
// has_writes == 1 - some writes in progress
// has_writes == 2 - tried to submit some writes, but failed
int has_writes = 0, op_idx = 0, new_idx = 0;
for (; op_idx < submit_queue.size(); op_idx++, new_idx++)
{
auto op = submit_queue[op_idx];
submit_queue[new_idx] = op;
// FIXME: This needs some simplification
// Writes should not block reads if the ring is not full and reads don't depend on them
// In all other cases we should stop submission
if (PRIV(op)->wait_for)
{
check_wait(op);
if (PRIV(op)->wait_for == WAIT_SQE)
{
// ring is full, stop submission
break;
}
else if (PRIV(op)->wait_for)
{
has_unfinished_writes = has_unfinished_writes || op->opcode == BS_OP_WRITE ||
op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE ||
op->opcode == BS_OP_STABLE || op->opcode == BS_OP_ROLLBACK;
if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE)
{
has_writes = 2;
}
continue;
}
}
@@ -135,33 +150,46 @@ void blockstore_impl_t::loop()
{
wr_st = dequeue_read(op);
}
else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE)
else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE)
{
if (has_writes == 2)
{
// Some writes already could not be submitted
continue;
}
wr_st = dequeue_write(op);
has_unfinished_writes = has_unfinished_writes || (wr_st != 2);
has_writes = wr_st > 0 ? 1 : 2;
}
else if (op->opcode == BS_OP_DELETE)
{
if (has_writes == 2)
{
// Some writes already could not be submitted
continue;
}
wr_st = dequeue_del(op);
has_writes = wr_st > 0 ? 1 : 2;
}
else if (op->opcode == BS_OP_SYNC)
{
// syncs only completed writes, so doesn't have to be blocked by anything
// sync only completed writes?
// wait for the data device fsync to complete, then submit journal writes for big writes
// then submit an fsync operation
wr_st = continue_sync(op);
}
else if (op->opcode == BS_OP_STABLE || op->opcode == BS_OP_ROLLBACK)
else if (op->opcode == BS_OP_STABLE)
{
wr_st = dequeue_stable(op);
has_unfinished_writes = has_unfinished_writes || (wr_st != 2);
}
else if (op->opcode == BS_OP_ROLLBACK)
{
wr_st = dequeue_rollback(op);
}
else if (op->opcode == BS_OP_LIST)
{
// LIST has to be blocked by previous writes and commits/rollbacks
if (!has_unfinished_writes)
{
process_list(op);
wr_st = 2;
}
else
{
wr_st = 0;
}
// LIST doesn't have to be blocked by previous modifications
process_list(op);
wr_st = 2;
}
if (wr_st == 2)
{
@@ -170,13 +198,16 @@ void blockstore_impl_t::loop()
}
if (wr_st == 0)
{
PRIV(op)->pending_ops = 0;
ringloop->restore(prev_sqe_pos);
if (PRIV(op)->wait_for == WAIT_SQE)
{
// ring is full, stop submission
break;
}
else if (PRIV(op)->wait_for == WAIT_JOURNAL)
{
PRIV(op)->wait_detail2 = (unstable_writes.size()+unstable_unsynced);
}
}
}
if (op_idx != new_idx)
@@ -196,6 +227,12 @@ void blockstore_impl_t::loop()
{
throw std::runtime_error(std::string("io_uring_submit: ") + strerror(-ret));
}
for (auto s: journal.submitting_sectors)
{
// Mark journal sector writes as submitted
journal.sector_info[s].submit_id = 0;
}
journal.submitting_sectors.clear();
if ((initial_ring_space - ringloop->space_left()) > 0)
{
live = true;
@@ -213,7 +250,7 @@ bool blockstore_impl_t::is_safe_to_stop()
{
return false;
}
if (unsynced_big_write_count > 0 || unsynced_small_write_count > 0)
if (unsynced_big_writes.size() > 0 || unsynced_small_writes.size() > 0)
{
if (!readonly && !stop_sync_submitted)
{
@@ -237,7 +274,7 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
{
if (PRIV(op)->wait_for == WAIT_SQE)
{
if (ringloop->space_left() < PRIV(op)->wait_detail)
if (ringloop->sqes_left() < PRIV(op)->wait_detail)
{
// stop submission if there's still no free space
#ifdef BLOCKSTORE_DEBUG
@@ -247,13 +284,40 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
}
PRIV(op)->wait_for = 0;
}
else if (PRIV(op)->wait_for == WAIT_COMPACTION)
else if (PRIV(op)->wait_for == WAIT_JOURNAL)
{
if (flusher->get_compact_counter() <= PRIV(op)->wait_detail)
if (journal.used_start == PRIV(op)->wait_detail &&
(unstable_writes.size()+unstable_unsynced) == PRIV(op)->wait_detail2)
{
// do not submit
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting for more flushes\n");
printf("Still waiting to flush journal offset %08jx\n", PRIV(op)->wait_detail);
#endif
return;
}
flusher->release_trim();
PRIV(op)->wait_for = 0;
}
else if (PRIV(op)->wait_for == WAIT_JOURNAL_BUFFER)
{
int next = ((journal.cur_sector + 1) % journal.sector_count);
if (journal.sector_info[next].flush_count > 0 ||
journal.sector_info[next].dirty)
{
// do not submit
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting for a journal buffer\n");
#endif
return;
}
PRIV(op)->wait_for = 0;
}
else if (PRIV(op)->wait_for == WAIT_FREE)
{
if (!data_alloc->get_free_count() && big_to_flush > 0)
{
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting for free space on the data device\n");
#endif
return;
}
@@ -281,6 +345,44 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
return;
}
if (op->opcode == BS_OP_SYNC_STAB_ALL)
{
std::function<void(blockstore_op_t*)> *old_callback = new std::function<void(blockstore_op_t*)>(op->callback);
op->opcode = BS_OP_SYNC;
op->callback = [this, old_callback](blockstore_op_t *op)
{
if (op->retval >= 0 && unstable_writes.size() > 0)
{
op->opcode = BS_OP_STABLE;
op->len = unstable_writes.size();
obj_ver_id *vers = new obj_ver_id[op->len];
op->buf = vers;
int i = 0;
for (auto it = unstable_writes.begin(); it != unstable_writes.end(); it++, i++)
{
vers[i] = {
.oid = it->first,
.version = it->second,
};
}
unstable_writes.clear();
op->callback = [old_callback](blockstore_op_t *op)
{
obj_ver_id *vers = (obj_ver_id*)op->buf;
delete[] vers;
op->buf = NULL;
(*old_callback)(op);
delete old_callback;
};
this->enqueue_op(op);
}
else
{
(*old_callback)(op);
delete old_callback;
}
};
}
if ((op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE) && !enqueue_write(op))
{
ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
@@ -299,11 +401,91 @@ void blockstore_impl_t::init_op(blockstore_op_t *op)
{
// Call constructor without allocating memory. We'll call destructor before returning op back
new ((void*)op->private_data) blockstore_op_private_t;
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->wait_for = 0;
PRIV(op)->op_state = 0;
PRIV(op)->pending_ops = 0;
}
// Binary search for `oid` in list[search_start, search_end), which must be sorted by object ID.
// When the object is found, its version is overwritten with `version` and true is returned.
// Returns false and leaves the list untouched when the object is not in the range.
static bool replace_stable(object_id oid, uint64_t version, int search_start, int search_end, obj_ver_id* list)
{
    int lo = search_start;
    int hi = search_end;
    while (lo < hi)
    {
        const int mid = lo + (hi - lo) / 2;
        obj_ver_id & candidate = list[mid];
        if (oid < candidate.oid)
        {
            // target is in the lower half
            hi = mid;
            continue;
        }
        if (candidate.oid < oid)
        {
            // target is in the upper half
            lo = mid + 1;
            continue;
        }
        // neither is less than the other => same object
        candidate.version = version;
        return true;
    }
    return false;
}
// Returns the clean_db shard responsible for `oid`, creating it on first access.
// The shard ID is (pool ID in the top POOL_ID_BITS bits) | (PG number); the PG number stays 0
// while no sharding settings are known for the pool.
blockstore_clean_db_t& blockstore_impl_t::clean_db_shard(object_id oid)
{
    const uint64_t pool_id = (oid.inode >> (64-POOL_ID_BITS));
    uint64_t pg_num = 0;
    auto sett_it = clean_db_settings.find(pool_id);
    if (sett_it != clean_db_settings.end())
    {
        // like map_to_pg()
        auto & sett = sett_it->second;
        pg_num = (oid.stripe / sett.pg_stripe_size) % sett.pg_count + 1;
    }
    auto shard_id = (pool_id << (64-POOL_ID_BITS)) | pg_num;
    if (dsk.meta_format != BLOCKSTORE_META_FORMAT_HEAP)
    {
        // Other formats: a default-constructed shard is fine
        return clean_db_shards[shard_id];
    }
    auto found_it = clean_db_shards.find(shard_id);
    if (found_it != clean_db_shards.end())
    {
        return found_it->second;
    }
    // clean_db_t stores larger entries with heap_meta, but we disguise it as smaller clean_entry :)
    // patched cpp-btree with extra_data
    auto & created = clean_db_shards[shard_id];
    created = blockstore_clean_db_t(
        sizeof(clean_entry_heap_t) - sizeof(clean_entry)
        + (inmemory_meta ? dsk.clean_dyn_size : 2*dsk.clean_entry_bitmap_size)
    );
    return created;
}
// Redistributes all clean_db entries of one pool into per-PG shards for the given
// PG count and stripe size, then remembers these settings for the pool.
// Callers must pass non-zero pg_count and pg_stripe_size (both are used as divisors).
void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint32_t pg_stripe_size)
{
    uint64_t pool_id = (uint64_t)pool;
    const uint64_t pool_prefix = (pool_id << (64-POOL_ID_BITS));
    std::map<pool_pg_id_t, blockstore_clean_db_t> new_shards;
    // Step 1: move every entry of the pool's existing shards into its new shard
    auto old_it = clean_db_shards.lower_bound(pool_prefix);
    while (old_it != clean_db_shards.end() &&
        (old_it->first >> (64-POOL_ID_BITS)) == pool_id)
    {
        for (auto & entry: old_it->second)
        {
            // like map_to_pg()
            uint64_t pg_num = (entry.first.stripe / pg_stripe_size) % pg_count + 1;
            uint64_t shard_id = pool_prefix | pg_num;
            new_shards[shard_id][entry.first] = entry.second;
        }
        old_it = clean_db_shards.erase(old_it);
    }
    // Step 2: put the regrouped shards back into the main map
    for (auto & regrouped: new_shards)
    {
        auto & to = clean_db_shards[regrouped.first];
        to.swap(regrouped.second);
    }
    // Step 3: remember the sharding parameters
    clean_db_settings[pool_id] = (pool_shard_settings_t){
        .pg_count = pg_count,
        .pg_stripe_size = pg_stripe_size,
    };
}
void blockstore_impl_t::process_list(blockstore_op_t *op)
{
uint32_t list_pg = op->pg_number+1;
@@ -312,8 +494,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
uint64_t min_inode = op->min_oid.inode;
uint64_t max_inode = op->max_oid.inode;
// Check PG
if (!pg_count || (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count) ||
!INODE_POOL(min_inode) || INODE_POOL(min_inode) != INODE_POOL(max_inode))
if (pg_count != 0 && (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count))
{
op->retval = -EINVAL;
FINISH_OP(op);
@@ -321,40 +502,250 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
}
// Check if the DB needs resharding
// (we don't know about PGs from the beginning, we only create "shards" here)
heap->reshard(INODE_POOL(min_inode), pg_count, pg_stripe_size);
obj_ver_id *result = NULL;
size_t stable_count = 0, unstable_count = 0;
int res = heap->list_objects(list_pg, op->min_oid, op->max_oid, &result, &stable_count, &unstable_count);
if (op->list_stable_limit)
uint64_t first_shard = 0, last_shard = UINT64_MAX;
if (min_inode != 0 &&
// Check if min_inode == max_inode == pool_id<<N, i.e. this is a pool listing
(min_inode >> (64-POOL_ID_BITS)) == (max_inode >> (64-POOL_ID_BITS)))
{
// Ordered result is expected - used by scrub
// We use an unordered map
std::sort(result, result + stable_count);
if (stable_count > op->list_stable_limit)
pool_id_t pool_id = (min_inode >> (64-POOL_ID_BITS));
if (pg_count > 1)
{
memmove(result + op->list_stable_limit, result + stable_count, unstable_count);
stable_count = op->list_stable_limit;
// Per-pg listing
auto sh_it = clean_db_settings.find(pool_id);
if (sh_it == clean_db_settings.end() ||
sh_it->second.pg_count != pg_count ||
sh_it->second.pg_stripe_size != pg_stripe_size)
{
reshard_clean_db(pool_id, pg_count, pg_stripe_size);
}
first_shard = last_shard = ((uint64_t)pool_id << (64-POOL_ID_BITS)) | list_pg;
}
else
{
// Per-pool listing
first_shard = ((uint64_t)pool_id << (64-POOL_ID_BITS));
last_shard = ((uint64_t)(pool_id+1) << (64-POOL_ID_BITS)) - 1;
}
}
// Copy clean_db entries
int stable_count = 0, stable_alloc = 0;
if (min_inode != max_inode)
{
for (auto shard_it = clean_db_shards.lower_bound(first_shard);
shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
shard_it++)
{
auto & clean_db = shard_it->second;
stable_alloc += clean_db.size();
}
}
if (op->list_stable_limit > 0)
{
stable_alloc = op->list_stable_limit;
if (stable_alloc > 1024*1024)
stable_alloc = 1024*1024;
}
if (stable_alloc < 32768)
{
stable_alloc = 32768;
}
obj_ver_id *stable = (obj_ver_id*)malloc(sizeof(obj_ver_id) * stable_alloc);
if (!stable)
{
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
auto max_oid = op->max_oid;
bool limited = false;
pool_pg_id_t last_shard_id = 0;
for (auto shard_it = clean_db_shards.lower_bound(first_shard);
shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
shard_it++)
{
auto & clean_db = shard_it->second;
auto clean_it = clean_db.begin(), clean_end = clean_db.end();
if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
{
clean_it = clean_db.lower_bound(op->min_oid);
}
if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
{
clean_end = clean_db.upper_bound(max_oid);
}
for (; clean_it != clean_end; clean_it++)
{
if (stable_count >= stable_alloc)
{
stable_alloc *= 2;
obj_ver_id* nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
if (!nst)
{
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
stable = nst;
}
stable[stable_count++] = {
.oid = clean_it->first,
.version = clean_it->second.version,
};
if (op->list_stable_limit > 0 && stable_count >= op->list_stable_limit)
{
if (!limited)
{
limited = true;
max_oid = stable[stable_count-1].oid;
}
break;
}
}
if (op->list_stable_limit > 0)
{
// To maintain the order, we have to include objects in the same range from other shards
if (last_shard_id != 0 && last_shard_id != shard_it->first)
std::sort(stable, stable+stable_count);
if (stable_count > op->list_stable_limit)
stable_count = op->list_stable_limit;
}
last_shard_id = shard_it->first;
}
if (op->list_stable_limit == 0 && first_shard != last_shard)
{
// If that's not a per-PG listing, sort clean entries (already sorted if list_stable_limit != 0)
std::sort(stable, stable+stable_count);
}
int clean_stable_count = stable_count;
// Copy dirty_db entries (sorted, too)
int unstable_count = 0, unstable_alloc = 0;
obj_ver_id *unstable = NULL;
{
auto dirty_it = dirty_db.begin(), dirty_end = dirty_db.end();
if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
{
dirty_it = dirty_db.lower_bound({
.oid = op->min_oid,
.version = 0,
});
}
if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
{
dirty_end = dirty_db.upper_bound({
.oid = max_oid,
.version = UINT64_MAX,
});
}
for (; dirty_it != dirty_end; dirty_it++)
{
if (!pg_count || ((dirty_it->first.oid.stripe / pg_stripe_size) % pg_count + 1) == list_pg) // like map_to_pg()
{
if (IS_DELETE(dirty_it->second.state))
{
// Deletions are always stable, so try to zero out two possible entries
if (!replace_stable(dirty_it->first.oid, 0, 0, clean_stable_count, stable))
{
replace_stable(dirty_it->first.oid, 0, clean_stable_count, stable_count, stable);
}
}
else if (IS_STABLE(dirty_it->second.state) || (dirty_it->second.state & BS_ST_INSTANT))
{
// First try to replace a clean stable version in the first part of the list
if (!replace_stable(dirty_it->first.oid, dirty_it->first.version, 0, clean_stable_count, stable))
{
// Then try to replace the last dirty stable version in the second part of the list
if (stable_count > 0 && stable[stable_count-1].oid == dirty_it->first.oid)
{
stable[stable_count-1].version = dirty_it->first.version;
}
else
{
if (stable_count >= stable_alloc)
{
stable_alloc += 32768;
obj_ver_id *nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
if (!nst)
{
if (unstable)
free(unstable);
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
stable = nst;
}
stable[stable_count++] = dirty_it->first;
}
}
if (op->list_stable_limit > 0 && stable_count >= op->list_stable_limit)
{
// Stop here
break;
}
}
else
{
if (unstable_count >= unstable_alloc)
{
unstable_alloc += 32768;
obj_ver_id *nst = (obj_ver_id*)realloc(unstable, sizeof(obj_ver_id) * unstable_alloc);
if (!nst)
{
if (stable)
free(stable);
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
unstable = nst;
}
unstable[unstable_count++] = dirty_it->first;
}
}
}
}
// Remove zeroed out stable entries
int j = 0;
for (int i = 0; i < stable_count; i++)
{
if (stable[i].version != 0)
{
stable[j++] = stable[i];
}
}
stable_count = j;
if (stable_count+unstable_count > stable_alloc)
{
stable_alloc = stable_count+unstable_count;
obj_ver_id *nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
if (!nst)
{
if (unstable)
free(unstable);
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
stable = nst;
}
// Copy unstable entries
for (int i = 0; i < unstable_count; i++)
{
stable[j++] = unstable[i];
}
free(unstable);
op->version = stable_count;
op->retval = res == 0 ? stable_count+unstable_count : -res;
op->buf = (uint8_t*)result;
op->retval = stable_count+unstable_count;
op->buf = stable;
FINISH_OP(op);
}
// Forwards the diagnostics dump request to the journal first, then to the flusher.
void blockstore_impl_t::dump_diagnostics()
{
    journal.dump_diagnostics();
    flusher->dump_diagnostics();
}
// Recomputes header_csum over this header.
// The covered length depends on the format version: the full v3 header for the heap format,
// only the v2 header size otherwise.
void blockstore_meta_header_v3_t::set_crc32c()
{
    // The checksum field is zeroed while the checksum is being computed
    header_csum = 0;
    auto covered_len = sizeof(blockstore_meta_header_v2_t);
    if (version == BLOCKSTORE_META_FORMAT_HEAP)
    {
        covered_len = sizeof(blockstore_meta_header_v3_t);
    }
    uint32_t calc = crc32c(0, this, covered_len);
    header_csum = calc;
}
void blockstore_impl_t::disk_error_abort(const char *op, int retval, int expected)
{
if (retval == -EAGAIN)
@@ -368,7 +759,85 @@ void blockstore_impl_t::disk_error_abort(const char *op, int retval, int expecte
exit(1);
}
uint64_t blockstore_impl_t::get_free_block_count()
void blockstore_impl_t::set_no_inode_stats(const std::vector<uint64_t> & pool_ids)
{
return dsk.block_count - heap->get_data_used_space()/dsk.data_block_size;
for (auto & np: no_inode_stats)
{
np.second = 2;
}
for (auto pool_id: pool_ids)
{
if (!no_inode_stats[pool_id])
recalc_inode_space_stats(pool_id, false);
no_inode_stats[pool_id] = 1;
}
for (auto np_it = no_inode_stats.begin(); np_it != no_inode_stats.end(); )
{
if (np_it->second == 2)
{
recalc_inode_space_stats(np_it->first, true);
no_inode_stats.erase(np_it++);
}
else
np_it++;
}
}
void blockstore_impl_t::recalc_inode_space_stats(uint64_t pool_id, bool per_inode)
{
auto sp_begin = inode_space_stats.lower_bound((pool_id << (64-POOL_ID_BITS)));
auto sp_end = inode_space_stats.lower_bound(((pool_id+1) << (64-POOL_ID_BITS)));
inode_space_stats.erase(sp_begin, sp_end);
auto sh_it = clean_db_shards.lower_bound((pool_id << (64-POOL_ID_BITS)));
while (sh_it != clean_db_shards.end() &&
(sh_it->first >> (64-POOL_ID_BITS)) == pool_id)
{
for (auto & pair: sh_it->second)
{
uint64_t space_id = per_inode ? pair.first.inode : (pool_id << (64-POOL_ID_BITS));
inode_space_stats[space_id] += dsk.data_block_size;
}
sh_it++;
}
object_id last_oid = {};
bool last_exists = false;
auto dirty_it = dirty_db.lower_bound((obj_ver_id){ .oid = { .inode = (pool_id << (64-POOL_ID_BITS)) } });
while (dirty_it != dirty_db.end() && (dirty_it->first.oid.inode >> (64-POOL_ID_BITS)) == pool_id)
{
if (IS_STABLE(dirty_it->second.state) && (IS_BIG_WRITE(dirty_it->second.state) || IS_DELETE(dirty_it->second.state)))
{
bool exists = false;
if (last_oid == dirty_it->first.oid)
{
exists = last_exists;
}
else
{
auto & clean_db = clean_db_shard(dirty_it->first.oid);
auto clean_it = clean_db.find(dirty_it->first.oid);
exists = clean_it != clean_db.end();
}
uint64_t space_id = per_inode ? dirty_it->first.oid.inode : (pool_id << (64-POOL_ID_BITS));
if (IS_BIG_WRITE(dirty_it->second.state))
{
if (!exists)
inode_space_stats[space_id] += dsk.data_block_size;
last_exists = true;
}
else
{
if (exists)
{
auto & sp = inode_space_stats[space_id];
if (sp > dsk.data_block_size)
sp -= dsk.data_block_size;
else
inode_space_stats.erase(space_id);
}
last_exists = false;
}
last_oid = dirty_it->first.oid;
}
dirty_it++;
}
}

View File

@@ -5,7 +5,6 @@
#include "blockstore.h"
#include "blockstore_disk.h"
#include "blockstore_heap.h"
#include <sys/types.h>
#include <sys/ioctl.h>
@@ -20,18 +19,46 @@
#include <deque>
#include <new>
#include <unordered_map>
#include <unordered_set>
#include "cpp-btree/btree_map.h"
#include "malloc_or_die.h"
#include "allocator.h"
//#define BLOCKSTORE_DEBUG
// States are not stored on disk. Instead, they're deduced from the journal
#define BS_ST_SMALL_WRITE 0x01
#define BS_ST_BIG_WRITE 0x02
#define BS_ST_DELETE 0x03
#define BS_ST_WAIT_DEL 0x10
#define BS_ST_WAIT_BIG 0x20
#define BS_ST_IN_FLIGHT 0x30
#define BS_ST_SUBMITTED 0x40
#define BS_ST_WRITTEN 0x50
#define BS_ST_SYNCED 0x60
#define BS_ST_STABLE 0x70
#define BS_ST_INSTANT 0x100
#define IMMEDIATE_NONE 0
#define IMMEDIATE_SMALL 1
#define IMMEDIATE_ALL 2
#define BS_ST_TYPE_MASK 0x0F
#define BS_ST_WORKFLOW_MASK 0xF0
#define IS_IN_FLIGHT(st) (((st) & 0xF0) <= BS_ST_SUBMITTED)
#define IS_STABLE(st) (((st) & 0xF0) == BS_ST_STABLE)
#define IS_SYNCED(st) (((st) & 0xF0) >= BS_ST_SYNCED)
#define IS_JOURNAL(st) (((st) & 0x0F) == BS_ST_SMALL_WRITE)
#define IS_BIG_WRITE(st) (((st) & 0x0F) == BS_ST_BIG_WRITE)
#define IS_DELETE(st) (((st) & 0x0F) == BS_ST_DELETE)
#define IS_INSTANT(st) (((st) & BS_ST_TYPE_MASK) == BS_ST_DELETE || ((st) & BS_ST_INSTANT))
#define BS_SUBMIT_CHECK_SQES(n) \
if (ringloop->space_left() < (n))\
if (ringloop->sqes_left() < (n))\
{\
/* Pause until there are more requests available */\
PRIV(op)->wait_detail = (n);\
@@ -63,6 +90,16 @@
return 0;\
}
#include "blockstore_journal.h"
// "VITAstor"
#define BLOCKSTORE_META_MAGIC_V1 0x726F747341544956l
#define BLOCKSTORE_META_FORMAT_V1 1
#define BLOCKSTORE_META_FORMAT_V2 2
#define BLOCKSTORE_META_FORMAT_HEAP 3
#define BLOCKSTORE_META_HEADER_V1_SIZE 36
#define BLOCKSTORE_META_HEADER_V2_SIZE 48
// metadata header (superblock)
struct __attribute__((__packed__)) blockstore_meta_header_v1_t
{
@@ -85,28 +122,134 @@ struct __attribute__((__packed__)) blockstore_meta_header_v2_t
uint32_t data_csum_type;
uint32_t csum_block_size;
uint32_t header_csum;
uint32_t block_id_bits; // 32 by default in heap meta
};
struct __attribute__((__packed__)) blockstore_meta_header_v3_t
// 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default)
// per "clean" entry on disk with fixed metadata tables
struct __attribute__((__packed__)) clean_disk_entry
{
uint64_t zero;
uint64_t magic;
object_id oid;
uint64_t version;
uint32_t meta_block_size;
uint32_t data_block_size;
uint32_t bitmap_granularity;
uint32_t data_csum_type;
uint32_t csum_block_size;
uint32_t header_csum;
uint64_t compacted_lsn;
void set_crc32c();
uint8_t bitmap[];
// Two more fields come after bitmap in metadata version 2:
// uint32_t data_csum[];
// uint32_t entry_csum;
};
// 32 = 16 + 16 bytes per "clean" entry in memory (object_id => clean_entry)
// The value part is this packed structure of two 64-bit fields (16 bytes)
struct __attribute__((__packed__)) clean_entry
{
// Object version
uint64_t version;
// NOTE(review): presumably the position of the object data on the data device - confirm against users
uint64_t location;
};
typedef uint32_t heap_block_num_t;
// 50 = 16 (key=object_id) + 26 (value) + 8 (bitmap) + N (checksum) bytes per "clean" entry in memory
// The fixed part of the value is 8+8+4+4+2 = 26 bytes, followed by a variable-length bitmap
struct __attribute__((__packed__)) clean_entry_heap_t
{
// Object version
uint64_t version;
uint64_t location; // UINT64_MAX = deleted
// previous versions invalidated by this version
heap_block_num_t prev_versions;
// metadata block number
heap_block_num_t meta_block;
// offset within block
uint16_t block_offset;
// Flexible array member: bitmap bytes follow the fixed fields
// (8 bytes according to the size estimate above)
uint8_t bitmap[];
};
// Packed 20-byte (8+8+4) header structure of a "heap" metadata block
// NOTE(review): presumably stored at the beginning of every on-disk heap metadata block - confirm
struct __attribute__((__packed__)) heap_meta_block_header_t
{
// Magic number identifying the block
uint64_t magic;
// Sequence number of the block
uint64_t seq_num;
// NOTE(review): looks like the number of blocks invalidated by this block
// (compare heap_meta_block_t::invalidates_blocks) - confirm
uint32_t invalidates_blocks;
};
// 48+checksums = (40+bitmap)+checksums bytes per on-disk "heap" entry
// for 128 KB block without checksums, it's 48 bytes - 84 entries per 4 kb metadata block
// for 128 KB block with 4k checksums, it's 176 bytes - 22 entries per 4 kb metadata block
// for 1 MB block without checksums, it's 80 bytes - 50 entries per 4 kb metadata block
// for 1 MB block with 4k checksums, it's 1104 bytes O_o - only 3 entries per 4 kb metadata block
// for 1 MB block with 32k checksums, it's 176 bytes again
// The fixed part below is oid + 3 * 8 bytes (40 bytes per the estimate above); the bitmap is variable-length
struct __attribute__((__packed__)) heap_meta_entry_t
{
// Object identifier
object_id oid;
// Object version
uint64_t version;
uint64_t location; // UINT64_MAX = deleted
// Reserved, no meaning assigned in this structure
uint64_t reserved;
// Flexible array member: bitmap bytes follow the fixed fields
uint8_t bitmap[];
};
// In-memory descriptor of one "heap" metadata block. All numeric fields default to 0
struct heap_meta_block_t
{
// NOTE(review): presumably the block number/position of this metadata block - confirm
heap_block_num_t offset = 0;
// Sequence number of the block (compare heap_meta_block_header_t::seq_num)
uint64_t seq_num = 0;
// Space used inside the block
// NOTE(review): unit is presumably bytes - confirm
uint32_t used_space = 0;
// NOTE(review): presumably identifiers of the blocks invalidated by this one - confirm whether
// these are sequence numbers or block numbers
std::vector<uint64_t> invalidates_blocks;
};
// In-memory state of the "heap" metadata area
struct heap_meta_t
{
// Number of elements in <blocks>
// NOTE(review): presumably equal to the number of metadata blocks - confirm
heap_block_num_t block_count = 0;
// Array of block descriptors, NULL until allocated
heap_meta_block_t *blocks = NULL;
// used space => block number
std::multimap<uint32_t, heap_block_num_t> used_space_map;
// NOTE(review): presumably the block currently being filled/written - confirm
heap_block_num_t cur_written_block = 0;
// Buffer associated with the currently written block, NULL until allocated
uint8_t *written_block_buf = NULL;
};
// 64 = 24 + 40 bytes per dirty entry in memory (obj_ver_id => dirty_entry). Plus checksums
// The 40 bytes of the value are 4+4+8+4+4+8 plus one pointer; the structure is packed
struct __attribute__((__packed__)) dirty_entry
{
// Combination of BS_ST_* values, inspected with the IS_*() macros
uint32_t state;
uint32_t flags; // unneeded, but present for alignment
uint64_t location; // location in either journal or data -> in BYTES
uint32_t offset; // data offset within object (stripe)
uint32_t len; // data length
uint64_t journal_sector; // journal sector used for this entry
void* dyn_data; // dynamic data: external bitmap and data block checksums. may be a pointer to the in-memory journal
};
// - Sync must be submitted after previous writes/deletes (not before!)
// - Reads to the same object must be submitted after previous writes/deletes
// are written (not necessarily synced) in their location. This is because we
// rely on read-modify-write for erasure coding and we must return new data
// to calculate parity for subsequent writes
// - Writes may be submitted in any order, because they don't overlap. Each write
// goes into a new location - either on the journal device or on the data device
// - Stable (stabilize) must be submitted after sync of that object is completed
// It's even OK to return an error to the caller if that object is not synced yet
// - Journal trim may be processed only after all versions are moved to
// the main storage AND after all read operations for older versions complete
// - If an operation can not be submitted because the ring is full
// we should stop submission of other operations. Otherwise some "scatter" reads
// may end up blocked for a long time.
// Otherwise, the submit order is free, that is all operations may be submitted immediately
// In fact, adding a write operation must immediately result in dirty_db being populated
// Suspend operation until there are more free SQEs
#define WAIT_SQE 1
// Suspend until something is compacted
#define WAIT_COMPACTION 2
// Suspend operation until there are <wait_detail> bytes of free space in the journal on disk
#define WAIT_JOURNAL 3
// Suspend operation until the next journal sector buffer is free
#define WAIT_JOURNAL_BUFFER 4
// Suspend operation until there is some free space on the data device
#define WAIT_FREE 5
// Reference-tracking record for a clean data block.
// Used as the value type of blockstore_impl_t::used_clean_objects
// ("clean data blocks referenced by read operations")
struct used_clean_obj_t
{
// Number of references to the block
int refs;
bool was_freed; // was freed by a parallel flush?
bool was_changed; // was changed by a parallel flush?
};
// https://github.com/algorithm-ninja/cpp-btree
// https://github.com/greg7mdp/sparsepp/ was used previously, but it was TERRIBLY slow after resizing
// with sparsepp, random reads dropped to ~700 iops very fast with just as much as ~32k objects in the DB
typedef btree::btree_map<object_id, clean_entry> blockstore_clean_db_t;
typedef std::map<obj_ver_id, dirty_entry> blockstore_dirty_db_t;
#include "blockstore_init.h"
@@ -119,47 +262,58 @@ struct blockstore_op_private_t
{
// Wait status
int wait_for;
uint64_t wait_detail;
uint64_t wait_detail, wait_detail2;
int pending_ops;
int op_state;
// Read, write, sync, stabilize
uint64_t lsn;
// Read
uint64_t clean_block_used;
std::vector<copy_buffer_t> read_vec;
// Write
uint64_t location;
bool is_big;
// Stabilize, rollback
int stab_pos;
// Stabilize
uint64_t to_lsn;
// Sync, write
uint64_t min_flushed_journal_sector, max_flushed_journal_sector;
// Write
struct iovec iov_zerofill[3];
// Warning: must not have a default value here because it's written to before calling constructor in blockstore_write.cpp O_o
uint64_t real_version;
timespec tv_begin;
// Sync
std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
};
typedef uint32_t pool_id_t;
typedef uint64_t pool_pg_id_t;
#define POOL_ID_BITS 16
// Per-pool sharding settings, used as the value type of blockstore_impl_t::clean_db_settings
// (keyed by pool_id_t). The same two values are passed to reshard_clean_db()
struct pool_shard_settings_t
{
// Number of PGs in the pool
uint32_t pg_count;
// PG stripe size
// NOTE(review): unit is presumably bytes - confirm
uint32_t pg_stripe_size;
};
#define STAB_SPLIT_DONE 1
#define STAB_SPLIT_WAIT 2
#define STAB_SPLIT_SYNC 3
#define STAB_SPLIT_TODO 4
class blockstore_impl_t
{
public:
blockstore_disk_t dsk;
/******* OPTIONS *******/
bool readonly = false;
// It is safe to disable fsync() if drive write cache is writethrough
bool disable_data_fsync = false, disable_meta_fsync = false, disable_journal_fsync = false;
// Enable if you want every operation to be executed with an "implicit fsync"
// Suitable only for server SSDs with capacitors, requires disabled data and journal fsyncs
int immediate_commit = IMMEDIATE_NONE;
bool inmemory_meta = false;
uint32_t meta_write_recheck_parallelism = 0;
// Maximum and minimum flusher count
unsigned max_flusher_count = 0, min_flusher_count = 0;
unsigned journal_trim_interval = 0;
unsigned flusher_start_threshold = 0;
unsigned max_flusher_count, min_flusher_count;
unsigned journal_trim_interval;
// Maximum queue depth
unsigned max_write_iodepth = 128;
// Enable small (journaled) write throttling, useful for the SSD+HDD case
@@ -174,89 +328,141 @@ public:
uint64_t autosync_writes = 128;
// Log level (0-10)
int log_level = 0;
// Enable correct block checksum validation on objects updated with small writes when checksum block
// is larger than bitmap_granularity, at the expense of extra metadata fsyncs during compaction
bool perfect_csum_update = false;
/******* END OF OPTIONS *******/
struct ring_consumer_t ring_consumer;
blockstore_heap_t *heap = NULL;
uint8_t* meta_superblock = NULL;
uint8_t *buffer_area = NULL;
heap_meta_t heap_meta;
std::map<pool_id_t, pool_shard_settings_t> clean_db_settings;
std::map<pool_pg_id_t, blockstore_clean_db_t> clean_db_shards;
std::map<uint64_t, int> no_inode_stats;
uint8_t *clean_bitmaps = NULL;
blockstore_dirty_db_t dirty_db;
std::vector<blockstore_op_t*> submit_queue;
int unsynced_big_write_count = 0, unsynced_small_write_count = 0, unsynced_meta_write_count = 0;
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
int unsynced_big_write_count = 0, unstable_unsynced = 0;
int unsynced_queued_ops = 0;
allocator_t *data_alloc = NULL;
uint64_t used_blocks = 0;
uint8_t *zero_object = NULL;
void *metadata_buffer = NULL;
struct journal_t journal;
journal_flusher_t *flusher;
int big_to_flush = 0;
int write_iodepth = 0;
int inflight_big = 0;
bool fsyncing_data = false;
bool alloc_dyn_data = false;
// clean data blocks referenced by read operations
std::map<uint64_t, used_clean_obj_t> used_clean_objects;
bool live = false, queue_stall = false;
ring_loop_i *ringloop = NULL;
timerfd_manager_t *tfd = NULL;
ring_loop_t *ringloop;
timerfd_manager_t *tfd;
bool stop_sync_submitted = false;
bool stop_sync_submitted;
inline struct io_uring_sqe* get_sqe()
{
return ringloop->get_sqe();
}
friend class blockstore_init_meta;
friend class blockstore_init_journal;
friend struct blockstore_journal_check_t;
friend class journal_flusher_t;
friend class journal_flusher_co;
void calc_lengths();
void open_data();
void open_meta();
void open_journal();
uint8_t* get_clean_entry_bitmap(blockstore_clean_db_t::iterator clean_it, int offset);
blockstore_clean_db_t& clean_db_shard(object_id oid);
void reshard_clean_db(pool_id_t pool_id, uint32_t pg_count, uint32_t pg_stripe_size);
void recalc_inode_space_stats(uint64_t pool_id, bool per_inode);
// Journaling
void prepare_journal_sector_write(int sector, blockstore_op_t *op);
void handle_journal_write(ring_data_t *data, uint64_t flush_id);
void disk_error_abort(const char *op, int retval, int expected);
// Asynchronous init
int initialized;
int metadata_buf_size;
blockstore_init_meta* metadata_init_reader;
blockstore_init_journal* journal_init_reader;
void init();
void check_wait(blockstore_op_t *op);
void init_op(blockstore_op_t *op);
// Read
int dequeue_read(blockstore_op_t *op);
int fulfill_read(blockstore_op_t *op);
uint32_t prepare_read(std::vector<copy_buffer_t> & read_vec, heap_object_t *obj, heap_write_t *wr, uint32_t start, uint32_t end);
uint32_t prepare_read_with_bitmaps(std::vector<copy_buffer_t> & read_vec, heap_object_t *obj, heap_write_t *wr, uint32_t start, uint32_t end);
uint32_t prepare_read_zero(std::vector<copy_buffer_t> & read_vec, uint32_t start, uint32_t end);
uint32_t prepare_read_simple(std::vector<copy_buffer_t> & read_vec, heap_object_t *obj, heap_write_t *wr, uint32_t start, uint32_t end);
void prepare_disk_read(std::vector<copy_buffer_t> & read_vec, int pos, heap_object_t *obj, heap_write_t *wr,
uint32_t blk_start, uint32_t blk_end, uint32_t start, uint32_t end, uint32_t copy_flags);
int dequeue_read(blockstore_op_t *read_op);
void find_holes(std::vector<copy_buffer_t> & read_vec, uint32_t item_start, uint32_t item_end,
std::function<void(int&, uint32_t, uint32_t)> callback);
void free_read_buffers(std::vector<copy_buffer_t> & rv);
std::function<int(int, bool, uint32_t, uint32_t)> callback);
int fulfill_read(blockstore_op_t *read_op,
uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
uint32_t item_state, uint64_t item_version, uint64_t item_location,
uint64_t journal_sector, uint8_t *csum, int *dyn_data);
bool fulfill_clean_read_journal(blockstore_op_t *read_op, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, int *dyn_data, uint32_t item_start, uint32_t item_end, uint64_t clean_loc, uint64_t clean_ver);
bool fulfill_clean_read_meta(blockstore_op_t *read_op, uint64_t & fulfilled, blockstore_clean_db_t::iterator clean_it);
int fill_partial_checksum_blocks(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, int *dyn_data, bool from_journal, uint8_t *read_buf, uint64_t read_offset, uint64_t read_end);
int pad_journal_read(std::vector<copy_buffer_t> & rv, copy_buffer_t & cp,
uint64_t dirty_offset, uint64_t dirty_end, uint64_t dirty_loc, uint8_t *csum_ptr, int *dyn_data,
uint64_t offset, uint64_t submit_len, uint64_t & blk_begin, uint64_t & blk_end, uint8_t* & blk_buf);
bool read_range_fulfilled(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled, uint8_t *read_buf,
uint8_t *clean_entry_bitmap, uint32_t item_start, uint32_t item_end);
bool read_checksum_block(blockstore_op_t *op, int rv_pos, uint64_t &fulfilled, uint64_t clean_loc);
uint8_t* read_clean_meta_block(blockstore_op_t *op, blockstore_clean_db_t::iterator clean_it, int rv_pos);
bool verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset,
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
bool verify_journal_checksums(uint8_t *csums, uint32_t offset,
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
bool verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *dyn_data, bool from_journal,
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
int fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len,
uint32_t item_state, uint64_t item_version);
void handle_read_event(ring_data_t *data, blockstore_op_t *op);
bool verify_read_checksums(blockstore_op_t *op);
// Write
bool enqueue_write(blockstore_op_t *op);
void prepare_meta_block_write(blockstore_op_t *op, uint64_t modified_block, io_uring_sqe *sqe = NULL);
void cancel_all_writes(blockstore_op_t *op, blockstore_dirty_db_t::iterator dirty_it, int retval);
int dequeue_write(blockstore_op_t *op);
int make_big_write(blockstore_op_t *op, uint32_t offset, uint32_t len, uint32_t *modified_block, uint32_t *moved_from_block);
int dequeue_del(blockstore_op_t *op);
int continue_write(blockstore_op_t *op);
void release_journal_sectors(blockstore_op_t *op);
void handle_write_event(ring_data_t *data, blockstore_op_t *op);
// Sync
int continue_sync(blockstore_op_t *op);
bool submit_fsyncs(int & wait_count);
int do_sync(blockstore_op_t *op, int base_state);
void ack_sync(blockstore_op_t *op);
// Stabilize
int dequeue_stable(blockstore_op_t *op);
int continue_stable(blockstore_op_t *op);
void mark_stable(obj_ver_id ov, bool forget_dirty = false);
void stabilize_object(object_id oid, uint64_t max_ver);
blockstore_op_t* selective_sync(blockstore_op_t *op);
int split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider);
// Rollback
int dequeue_rollback(blockstore_op_t *op);
int continue_rollback(blockstore_op_t *op);
void mark_rolled_back(const obj_ver_id & ov);
void erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc);
void free_dirty_dyn_data(dirty_entry & e);
// List
void process_list(blockstore_op_t *op);
/*public:*/
public:
blockstore_impl_t(blockstore_config_t & config, ring_loop_i *ringloop, timerfd_manager_t *tfd, bool mock_mode = false);
blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
~blockstore_impl_t();
void parse_config(blockstore_config_t & config, bool init);
@@ -282,13 +488,21 @@ public:
// Simplified synchronous operation: get object bitmap & current version
int read_bitmap(object_id oid, uint64_t target_version, void *bitmap, uint64_t *result_version = NULL);
// Unstable writes are added here (map of object_id -> version)
std::unordered_map<object_id, uint64_t> unstable_writes;
// Space usage statistics
std::map<uint64_t, uint64_t> inode_space_stats;
// Set per-pool no_inode_stats
void set_no_inode_stats(const std::vector<uint64_t> & pool_ids);
// Print diagnostics to stdout
void dump_diagnostics();
const std::map<uint64_t, uint64_t> & get_inode_space_stats() { return heap->get_inode_space_stats(); }
inline uint32_t get_block_size() { return dsk.data_block_size; }
inline uint64_t get_block_count() { return dsk.block_count; }
uint64_t get_free_block_count();
inline uint64_t get_free_block_count() { return dsk.block_count - used_blocks; }
inline uint32_t get_bitmap_granularity() { return dsk.disk_alignment; }
inline uint64_t get_journal_size() { return dsk.journal_len; }
};

File diff suppressed because it is too large Load Diff

View File

@@ -25,10 +25,53 @@ class blockstore_init_meta
uint64_t next_offset = 0;
uint64_t last_read_offset = 0;
uint64_t entries_loaded = 0;
unsigned entries_per_block = 0;
int i = 0, j = 0;
bool handle_meta_block(uint8_t *buf, uint64_t count, uint64_t done_cnt);
std::vector<uint64_t> entries_to_zero;
std::map<uint64_t, heap_block_num_t> heap_block_by_seq;
std::set<uint64_t> heap_invalidated_block_seq;
std::vector<heap_block_num_t> heap_invalidated_block_nums;
bool handle_array_meta_block(uint8_t *buf, uint64_t block_offset);
bool handle_heap_meta_block(uint8_t *buf, uint64_t block_offset);
void handle_event(ring_data_t *data, int buf_num);
public:
blockstore_init_meta(blockstore_impl_t *bs);
int loop();
};
// Descriptor of one buffer with its position and length.
// blockstore_init_journal keeps a vector of these in its <done> member
// NOTE(review): presumably describes a journal part which has already been read - confirm
struct bs_init_journal_done
{
// Buffer pointer
void *buf;
// Position and length of the buffer contents
uint64_t pos, len;
};
// State of the journal initialisation procedure of a blockstore.
// The work is driven by repeated calls to loop(); the return value convention of loop()
// is not visible in this declaration
class blockstore_init_journal
{
// Owning blockstore
blockstore_impl_t *bs;
// State machine position, number of awaited events and result of the last handled part
int wait_state = 0, wait_count = 0, handle_res = 0;
// Statistics: number of loaded journal entries
uint64_t entries_loaded = 0;
// NOTE(review): presumably the CRC32 of the last processed journal entry
// (journal entries carry crc32_prev) - confirm
uint32_t crc32_last = 0;
bool started = false;
// NOTE(review): no default value here - presumably assigned before use in the implementation, confirm
uint64_t next_free;
// Processed buffers (see bs_init_journal_done)
std::vector<bs_init_journal_done> done;
std::vector<obj_ver_id> double_allocs;
std::vector<iovec> small_write_data;
uint64_t journal_pos = 0;
uint64_t continue_pos = 0;
void *init_write_buf = NULL;
uint64_t init_write_sector = 0;
bool wrapped = false;
// The following members have no default values
// NOTE(review): presumably they are always assigned before being read - confirm
void *submitted_buf;
struct io_uring_sqe *sqe;
struct ring_data_t *data;
journal_entry_start *je_start;
std::function<void(ring_data_t*)> simple_callback;
// Process <len> bytes of journal contents in <buf> related to position <done_pos>
int handle_journal_part(void *buf, uint64_t done_pos, uint64_t len);
// I/O completion handler
void handle_event(ring_data_t *data);
void erase_dirty_object(blockstore_dirty_db_t::iterator dirty_it);
public:
blockstore_init_journal(blockstore_impl_t* bs);
int loop();
};

View File

@@ -0,0 +1,356 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "blockstore_impl.h"
// Create a journal space checker positioned at the current write position of the journal of <bs>.
// Nothing in the journal is modified; only the checker's own counters are initialised
blockstore_journal_check_t::blockstore_journal_check_t(blockstore_impl_t *bs)
{
    auto & jrn = bs->journal;
    this->bs = bs;
    // No sectors are planned for writing yet and no sector was visited
    sectors_to_write = 0;
    first_sector = -1;
    // Start at the current end of the journal
    next_pos = jrn.next_free;
    next_sector = jrn.cur_sector;
    next_in_pos = jrn.in_sector_pos;
    // true while the write position is not below the start of the used area
    right_dir = next_pos >= jrn.used_start;
}
// Check if we can write <required> entries of <size> bytes and <data_after> data bytes after them to the journal
// Returns 1 when there is enough room.
// Returns 0 when the operation has to wait; in this case PRIV(op)->wait_for is set to
// WAIT_JOURNAL_BUFFER (no free in-memory sector buffer) or to WAIT_JOURNAL (no space in the journal,
// PRIV(op)->wait_detail is set to the current used_start and a journal trim is requested).
// Throws std::runtime_error when the batch doesn't fit into all sector buffers at once.
// The checker keeps its position (next_pos, next_sector, next_in_pos, sectors_to_write, ...) between calls,
// so consecutive calls account for several batches written one after another
int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries_required, int size, int data_after)
{
// Sector at which this call starts, used to avoid counting it twice in sectors_to_write
uint64_t prev_next = next_sector;
int required = entries_required;
while (1)
{
// Number of entries fitting into the rest of the current sector. It's 0 when same-sector overwrites
// are disabled and the sector at the real journal position is already written
int fits = bs->journal.no_same_sector_overwrites && next_pos == bs->journal.next_free && bs->journal.sector_info[next_sector].written
? 0
: (bs->journal.block_size - next_in_pos) / size;
if (fits > 0)
{
if (fits > required)
{
fits = required;
}
if (first_sector == -1)
{
// Remember the first sector which receives entries to detect wrap-around below
first_sector = next_sector;
}
required -= fits;
next_in_pos += fits * size;
if (next_sector != prev_next || !sectors_to_write)
{
// Except the previous call to this function
sectors_to_write++;
}
}
else if (bs->journal.sector_info[next_sector].dirty)
{
// Nothing fits, but the sector is dirty, so it's still counted as a sector to write
if (next_sector != prev_next || !sectors_to_write)
{
// Except the previous call to this function
sectors_to_write++;
}
}
if (required <= 0)
{
break;
}
// Advance to the next journal block; positions wrap to block_size, not to 0
next_pos = next_pos + bs->journal.block_size;
if (next_pos >= bs->journal.len)
{
next_pos = bs->journal.block_size;
right_dir = false;
}
next_in_pos = 0;
next_sector = ((next_sector + 1) % bs->journal.sector_count);
if (next_sector == first_sector)
{
// next_sector may wrap when all sectors are flushed and the incoming batch is too big
// This is an error condition, we can't wait for anything in this case
throw std::runtime_error(
"Blockstore journal_sector_buffer_count="+std::to_string(bs->journal.sector_count)+
" is too small for a batch of "+std::to_string(entries_required)+" entries of "+std::to_string(size)+" bytes"
);
}
if (bs->journal.sector_info[next_sector].flush_count > 0 ||
bs->journal.sector_info[next_sector].dirty)
{
// No memory buffer available. Wait for it.
// Count used and dirty buffers only for the warning message.
// A buffer which is both dirty and being flushed is counted twice in <used>
int used = 0, dirty = 0;
for (int i = 0; i < bs->journal.sector_count; i++)
{
if (bs->journal.sector_info[i].dirty)
{
dirty++;
used++;
}
if (bs->journal.sector_info[i].flush_count > 0)
{
used++;
}
}
// In fact, it's even more rare than "ran out of journal space", so print a warning
printf(
"Ran out of journal sector buffers: %d/%ju buffers used (%d dirty), next buffer (%jd)"
" is %s and flushed %ju times. Consider increasing \'journal_sector_buffer_count\'\n",
used, bs->journal.sector_count, dirty, next_sector,
bs->journal.sector_info[next_sector].dirty ? "dirty" : "not dirty",
bs->journal.sector_info[next_sector].flush_count
);
PRIV(op)->wait_for = WAIT_JOURNAL_BUFFER;
return 0;
}
}
if (data_after > 0)
{
// Account for the data following the entries
next_pos = next_pos + data_after;
if (next_pos >= bs->journal.len)
{
// The data doesn't fit before the end of the journal. If the position hasn't wrapped yet,
// the data is placed at the beginning of the journal, right after the first block
if (right_dir)
next_pos = bs->journal.block_size + data_after;
right_dir = false;
}
}
// After wrapping, the position must stay at least one block before used_start
if (!right_dir && next_pos >= bs->journal.used_start-bs->journal.block_size)
{
// No space in the journal. Wait until used_start changes.
if (bs->log_level > 5)
{
printf(
"Ran out of journal space (used_start=%08jx, next_free=%08jx, dirty_start=%08jx)\n",
bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start
);
}
PRIV(op)->wait_for = WAIT_JOURNAL;
bs->flusher->request_trim();
PRIV(op)->wait_detail = bs->journal.used_start;
return 0;
}
return 1;
}
// Reserve room for one journal entry of <size> bytes in the current journal sector,
// switching to the next sector first if the entry doesn't fit.
// Fills magic, type, size and crc32_prev of the new entry, marks the sector dirty
// and returns a pointer to the entry inside the sector buffer
journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size)
{
    // Memory image of the current sector: a part of the whole in-memory journal
    // or one of the separate sector buffers
    auto cur_sector_mem = [&journal]() -> uint8_t*
    {
        if (journal.inmemory)
        {
            return (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset;
        }
        return (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector;
    };
    if (!journal.entry_fits(size))
    {
        assert(!journal.sector_info[journal.cur_sector].dirty);
        // Move to the next journal sector
        if (journal.sector_info[journal.cur_sector].flush_count > 0)
        {
            // Also select next sector buffer in memory
            journal.cur_sector = ((journal.cur_sector + 1) % journal.sector_count);
            assert(!journal.sector_info[journal.cur_sector].flush_count);
        }
        else
        {
            journal.dirty_start = journal.next_free;
        }
        auto & sect = journal.sector_info[journal.cur_sector];
        sect.written = false;
        sect.offset = journal.next_free;
        journal.in_sector_pos = 0;
        // Position following the new sector; wraps to the second block of the journal
        auto following_free = journal.next_free + journal.block_size;
        if (!(following_free < journal.len))
        {
            following_free = journal.block_size;
        }
        // double check that next_free doesn't cross used_start from the left
        assert(
            (journal.next_free >= journal.used_start && following_free >= journal.next_free) ||
            following_free < journal.used_start
        );
        journal.next_free = following_free;
        // The new sector starts empty
        memset(cur_sector_mem(), 0, journal.block_size);
    }
    journal_entry *je = (struct journal_entry*)(cur_sector_mem() + journal.in_sector_pos);
    journal.in_sector_pos += size;
    je->magic = JOURNAL_MAGIC;
    je->type = type;
    je->size = size;
    je->crc32_prev = journal.crc32_last;
    journal.sector_info[journal.cur_sector].dirty = true;
    return je;
}
// Queue the write of journal sector buffer <cur_sector> to the journal device (unless it is
// already queued in the current batch) and make <op> wait for the completion of that write.
// The caller must ensure that a free SQE is available
void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_op_t *op)
{
    auto & sect = journal.sector_info[cur_sector];
    // Don't submit the same sector twice in the same batch
    if (!sect.submit_id)
    {
        io_uring_sqe *sqe = get_sqe();
        // Caller must ensure availability of an SQE
        assert(sqe != NULL);
        ring_data_t *data = ((ring_data_t*)sqe->user_data);
        sect.written = true;
        sect.submit_id = ++journal.submit_id;
        assert(journal.submit_id != 0); // check overflow
        journal.submitting_sectors.push_back(cur_sector);
        sect.flush_count++;
        // Sector contents live either inside the in-memory journal or in a separate sector buffer
        uint8_t *sector_mem = journal.inmemory
            ? (uint8_t*)journal.buffer + sect.offset
            : (uint8_t*)journal.sector_buf + journal.block_size*cur_sector;
        data->iov = (struct iovec){ sector_mem, (size_t)journal.block_size };
        data->callback = [this, flush_id = journal.submit_id](ring_data_t *data) { handle_journal_write(data, flush_id); };
        my_uring_prep_writev(sqe, dsk.journal_fd, &data->iov, 1, journal.offset + sect.offset);
    }
    sect.dirty = false;
    // But always remember that this operation has to wait until this exact journal write is finished
    journal.flushing_ops.emplace(sect.submit_id, (pending_journaling_t){
        .pending = 1,
        .sector = cur_sector,
        .op = op,
    });
    auto op_priv = PRIV(op);
    op_priv->pending_ops++;
    // Sector numbers are stored incremented by 1 so that 0 means "nothing flushed yet"
    if (!op_priv->min_flushed_journal_sector)
    {
        op_priv->min_flushed_journal_sector = 1+cur_sector;
    }
    assert(op_priv->min_flushed_journal_sector <= journal.sector_count);
    op_priv->max_flushed_journal_sector = 1+cur_sector;
}
// Completion handler of a journal sector write submitted by prepare_journal_sector_write().
// <data> is the completed ring request, <flush_id> is the journal.submit_id value assigned to that write.
// Aborts through disk_error_abort() if the write result differs from the requested length.
// Otherwise it walks journal.flushing_ops and advances the operations which waited for this write
void blockstore_impl_t::handle_journal_write(ring_data_t *data, uint64_t flush_id)
{
live = true;
if (data->res != data->iov.iov_len)
{
// FIXME: our state becomes corrupted after a write error. maybe do something better than just die
disk_error_abort("journal write", data->res, data->iov.iov_len);
}
// First entry waiting for this flush_id (or for a later one if there is none)
auto fl_it = journal.flushing_ops.lower_bound(flush_id);
if (fl_it != journal.flushing_ops.end() && fl_it->first == flush_id && fl_it->second.sector >= 0)
{
// This write of the sector buffer is finished
journal.sector_info[fl_it->second.sector].flush_count--;
}
// true if no entries with smaller flush ids remain, i.e. all previous writes are finished
auto is_first = fl_it == journal.flushing_ops.begin();
while (fl_it != journal.flushing_ops.end())
{
bool del = false;
if (fl_it->first == flush_id)
{
// Entries of this write are marked as finished, but removed only when this write is the oldest one
fl_it->second.pending = 0;
del = is_first;
}
else
{
// Entries of later writes are removed when they are already marked as finished
// NOTE(review): this doesn't depend on is_first or on whether an earlier entry visited
// in this loop is still pending - confirm it matches the ordering intent described below
del = !fl_it->second.pending;
}
if (del)
{
// Do not complete this operation if previous writes are unfinished
// Otherwise also complete following operations waiting for this one
auto priv = PRIV(fl_it->second.op);
priv->pending_ops--;
assert(priv->pending_ops >= 0);
if (priv->pending_ops == 0)
{
// The last awaited write of the operation is done: move it to its next state
release_journal_sectors(fl_it->second.op);
priv->op_state++;
ringloop->wakeup();
}
journal.flushing_ops.erase(fl_it++);
}
else
{
fl_it++;
}
}
}
// Release the sector buffers, the sector info array and the in-memory journal buffer
journal_t::~journal_t()
{
    // free() ignores NULL pointers, so unallocated buffers need no special handling
    free(sector_buf);
    sector_buf = NULL;
    free(sector_info);
    sector_info = NULL;
    free(buffer);
    buffer = NULL;
}
// Calculate the position up to which the journal may be trimmed, based on used_sectors.
// Returns next_free if no sectors are used at all, used_start if nothing can be trimmed,
// or the offset of the first used sector at/after used_start (wrapping around to the beginning)
uint64_t journal_t::get_trim_pos()
{
    auto first_used_it = used_sectors.lower_bound(used_start);
    if (first_used_it == used_sectors.end())
    {
        // Journal is cleared to its end, restart from the beginning
        first_used_it = used_sectors.begin();
        if (first_used_it == used_sectors.end())
        {
            // Journal is empty
            return next_free;
        }
        // next_free does not need updating during trim
    }
    else if (!(first_used_it->first > used_start))
    {
        // Can't trim journal
        return used_start;
    }
    // Journal is cleared up to <first_used_it>
#ifdef BLOCKSTORE_DEBUG
    printf(
        "Trimming journal (used_start=%08jx, next_free=%08jx, dirty_start=%08jx, new_start=%08jx, new_refcount=%jd)\n",
        used_start, next_free, dirty_start,
        first_used_it->first, first_used_it->second
    );
#endif
    return first_used_it->first;
}
// Print the journal positions and the next trim target (offset and reference count
// of the first used sector at/after used_start, wrapping around) to stdout
void journal_t::dump_diagnostics()
{
    // Both values stay 0 when there are no used sectors at all
    uint64_t trim_to = 0;
    uint64_t trim_to_refs = 0;
    auto first_used_it = used_sectors.lower_bound(used_start);
    if (first_used_it == used_sectors.end())
    {
        // Journal is cleared to its end, restart from the beginning
        first_used_it = used_sectors.begin();
    }
    if (first_used_it != used_sectors.end())
    {
        trim_to = first_used_it->first;
        trim_to_refs = first_used_it->second;
    }
    printf(
        "Journal: used_start=%08jx next_free=%08jx dirty_start=%08jx trim_to=%08jx trim_to_refs=%jd\n",
        used_start, next_free, dirty_start, trim_to, trim_to_refs
    );
}
static uint64_t zero_page[4096];
// Continue CRC32C <prev_crc> over <buf> of <len> bytes as if it was surrounded by
// <left_pad> zero bytes before it and <right_pad> zero bytes after it.
// Zeroes are fed from the static zero_page in chunks of at most 4096 bytes.
// Returns the resulting CRC32C value
uint32_t crc32c_pad(uint32_t prev_crc, const void *buf, size_t len, size_t left_pad, size_t right_pad)
{
    uint32_t r = prev_crc;
    // Left padding: whole 4096-byte chunks first, then the remainder
    while (left_pad >= 4096)
    {
        r = crc32c(r, zero_page, 4096);
        left_pad -= 4096;
    }
    if (left_pad > 0)
        r = crc32c(r, zero_page, left_pad);
    // The data itself
    r = crc32c(r, buf, len);
    // Right padding: whole 4096-byte chunks first, then the remainder
    while (right_pad >= 4096)
    {
        r = crc32c(r, zero_page, 4096);
        right_pad -= 4096;
    }
    // The remainder check must look at right_pad. Checking left_pad here silently dropped
    // the tail of the right padding whenever the left padding was a multiple of 4096 (including 0).
    // NOTE(review): checksums computed for such inputs by the previous code will differ from the new ones
    if (right_pad > 0)
        r = crc32c(r, zero_page, right_pad);
    return r;
}
// Variant with the same signature as crc32c_pad() which ignores the paddings and <prev_crc>:
// the result is always the CRC32C of just <buf> of <len> bytes, started from 0
uint32_t crc32c_nopad(uint32_t prev_crc, const void *buf, size_t len, size_t left_pad, size_t right_pad)
{
    // These parameters exist only for signature compatibility
    (void)prev_crc;
    (void)left_pad;
    (void)right_pad;
    const uint32_t initial_crc = 0;
    return crc32c(initial_crc, buf, len);
}

View File

@@ -1,11 +1,12 @@
// Old metadata format on-disk structures
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#pragma once
#include "crc32c.h"
#include <set>
// 4 MiB; per its name, the minimum journal size.
// Parenthesized so that the macro expands as a single value inside
// expressions such as `x / MIN_JOURNAL_SIZE` or `x % MIN_JOURNAL_SIZE`.
#define MIN_JOURNAL_SIZE (4*1024*1024)
#define JOURNAL_MAGIC 0x4A33
#define JOURNAL_VERSION_V1 1
#define JOURNAL_VERSION_V2 2
@@ -144,14 +145,77 @@ inline uint32_t je_crc32(journal_entry *je)
return crc32c(0x48674bc7, ((uint8_t*)je)+4, je->size-4);
}
// NOTE(review): the lines below appear to interleave two definitions from a rendered diff:
// the packed on-disk clean_disk_entry (oid, version, bitmap[]) and the in-memory
// journal_sector_info_t (offset ... submit_id). Confirm against the real header before editing.
// 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default)
// per "clean" entry on disk with fixed metadata tables
struct __attribute__((__packed__)) clean_disk_entry
struct journal_sector_info_t
{
object_id oid;
uint64_t version;
uint8_t bitmap[];
// Two more fields come after bitmap in metadata version 2:
// uint32_t data_csum[];
// uint32_t entry_csum;
// Compared against dirty_entry.journal_sector in blockstore_impl_t::erase_dirty()
uint64_t offset;
uint64_t flush_count;
// Checked by journal_t::entry_fits() when no_same_sector_overwrites is enabled
bool written;
// Checked in dequeue_rollback() before the current sector is queued for writing
bool dirty;
uint64_t submit_id;
};
// Value type of journal_t::flushing_ops (a multimap keyed by uint64_t).
// NOTE(review): field meanings below are inferred from names only - confirm against users.
struct pending_journaling_t
{
// presumably the number of outstanding writes for this record
int pending;
// presumably an index into the journal sector buffers
int sector;
// operation associated with this record
blockstore_op_t *op;
};
// In-memory state of the journal: device location, ring buffer positions,
// the sector buffers currently used for writing and the used sector map.
struct journal_t
{
    int fd;
    bool inmemory = false;
    bool flush_journal = false;
    void *buffer = nullptr;

    uint64_t block_size;
    uint64_t offset, len;
    // Offset of the next free block
    uint64_t next_free = 0;
    // Offset of the first occupied block
    uint64_t used_start = 0;
    // End of the last block which is not used for writing anymore
    uint64_t dirty_start = 0;
    uint32_t crc32_last = 0;

    // Sector buffer(s) currently used for writing
    void *sector_buf = nullptr;
    journal_sector_info_t *sector_info = nullptr;
    uint64_t sector_count;
    bool no_same_sector_overwrites = false;
    int cur_sector = 0;
    int in_sector_pos = 0;
    std::vector<int> submitting_sectors;
    std::multimap<uint64_t, pending_journaling_t> flushing_ops;
    uint64_t submit_id = 0;

    // Map of used sectors
    // Worst case memory usage is ~ 80 MB per 1 GB of used journal space
    std::map<uint64_t, uint64_t> used_sectors;

    ~journal_t();
    bool trim();
    uint64_t get_trim_pos();
    void dump_diagnostics();

    // Tells whether an entry of <size> bytes may be appended to the current sector
    inline bool entry_fits(int size)
    {
        if (block_size - in_sector_pos < size)
        {
            // Not enough room left in the sector
            return false;
        }
        if (no_same_sector_overwrites && sector_info[cur_sector].written)
        {
            // The sector was already written and rewriting it is not allowed
            return false;
        }
        return true;
    }
};
// Helper used before appending journal entries: it is constructed from the
// blockstore, then check_available() is asked whether the entries fit.
struct blockstore_journal_check_t
{
    blockstore_impl_t *bs;
    uint64_t next_pos;
    uint64_t next_sector;
    uint64_t next_in_pos;
    // Callers use sectors_to_write to reserve SQEs and to verify the number
    // of sector writes they have prepared
    int sectors_to_write;
    int first_sector;
    // Writing to the end or the beginning of the ring buffer
    bool right_dir;

    blockstore_journal_check_t(blockstore_impl_t *bs);
    // Callers treat a zero result as "no space available yet" and postpone the operation.
    // NOTE(review): <required> looks like the number of entries and <size> the size
    // of each one, judging by the rollback call site - confirm against the definition.
    int check_available(blockstore_op_t *op, int required, int size, int data_after);
};
// Returns a journal entry of the given <type> and <size> inside <journal>.
// The rollback code in this diff fills the payload and sets crc32 itself after the call.
// NOTE(review): definition is not visible here - presumably it reserves space in the current sector.
journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size);
// CRC32C of <buf> treated as if surrounded by <left_pad> / <right_pad> zero bytes, continued from <prev_crc>
uint32_t crc32c_pad(uint32_t prev_crc, const void *buf, size_t len, size_t left_pad, size_t right_pad);
// Same signature as crc32c_pad, but checksums only <buf>; <prev_crc> and both paddings are ignored
uint32_t crc32c_nopad(uint32_t prev_crc, const void *buf, size_t len, size_t left_pad, size_t right_pad);

View File

@@ -2,7 +2,6 @@
// License: VNPL-1.1 (see README.md for details)
#include <sys/file.h>
#include <stdexcept>
#include "blockstore_impl.h"
// Reads blockstore tunables from <config>, substitutes defaults for missing or
// too small values, validates the fsync / immediate_commit combination and
// initializes the journal position fields.
// NOTE(review): this span appears to be a rendered diff - it keeps the "@@" hunk headers
// and, where a line was changed, both variants back to back (e.g. the two
// min_flusher_count checks below). Read it as a change listing, not as one compilable body.
void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
@@ -15,14 +14,12 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
}
// Numeric options are parsed as base-10; a missing or non-numeric value yields 0
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
journal_trim_interval = strtoull(config["journal_trim_interval"].c_str(), NULL, 10);
flusher_start_threshold = strtoull(config["flusher_start_threshold"].c_str(), NULL, 10);
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
// Boolean options are enabled by the literal values "true", "1" or "yes"
throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
perfect_csum_update = config["perfect_csum_update"] == "true" || config["perfect_csum_update"] == "1" || config["perfect_csum_update"] == "yes";
if (config["autosync_writes"] != "")
{
autosync_writes = strtoull(config["autosync_writes"].c_str(), NULL, 10);
@@ -31,17 +28,13 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
{
max_flusher_count = 256;
}
if (!min_flusher_count)
// NOTE(review): journal.flush_journal is assigned from config["flush_journal"] further
// below in this function, so this check sees the value it had before this call - confirm intended.
if (!min_flusher_count || journal.flush_journal)
{
min_flusher_count = 1;
}
if (!journal_trim_interval)
{
journal_trim_interval = 1024;
}
if (!flusher_start_threshold)
{
flusher_start_threshold = 32;
journal_trim_interval = 512;
}
if (!max_write_iodepth)
{
@@ -75,6 +68,23 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
{
readonly = true;
}
// The disable_*_fsync flags are only ever switched on here, never reset to false
if (config["disable_data_fsync"] == "true" || config["disable_data_fsync"] == "1" || config["disable_data_fsync"] == "yes")
{
disable_data_fsync = true;
}
if (config["disable_meta_fsync"] == "true" || config["disable_meta_fsync"] == "1" || config["disable_meta_fsync"] == "yes")
{
disable_meta_fsync = true;
}
if (config["disable_journal_fsync"] == "true" || config["disable_journal_fsync"] == "1" || config["disable_journal_fsync"] == "yes")
{
disable_journal_fsync = true;
}
if (config["flush_journal"] == "true" || config["flush_journal"] == "1" || config["flush_journal"] == "yes")
{
// Only flush journal and exit
journal.flush_journal = true;
}
if (config["immediate_commit"] == "all")
{
immediate_commit = IMMEDIATE_ALL;
@@ -84,23 +94,89 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
immediate_commit = IMMEDIATE_SMALL;
}
metadata_buf_size = strtoull(config["meta_buf_size"].c_str(), NULL, 10);
meta_write_recheck_parallelism = strtoull(config["meta_write_recheck_parallelism"].c_str(), NULL, 10);
// In-memory metadata and journal are on unless explicitly set to "false", "0" or "no"
inmemory_meta = config["inmemory_metadata"] != "false" && config["inmemory_metadata"] != "0" &&
config["inmemory_metadata"] != "no";
journal.sector_count = strtoull(config["journal_sector_buffer_count"].c_str(), NULL, 10);
journal.no_same_sector_overwrites = config["journal_no_same_sector_overwrites"] == "true" ||
config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
journal.inmemory = config["inmemory_journal"] != "false" && config["inmemory_journal"] != "0" &&
config["inmemory_journal"] != "no";
log_level = strtoull(config["log_level"].c_str(), NULL, 10);
// Validate
// Fewer than 2 sector buffers (including "not set") falls back to 32
if (journal.sector_count < 2)
{
journal.sector_count = 32;
}
// A buffer smaller than 64 KiB (including "not set") falls back to 4 MiB
if (metadata_buf_size < 65536)
{
metadata_buf_size = 4*1024*1024;
}
if (!meta_write_recheck_parallelism)
// Round the metadata buffer size up to a multiple of the metadata block size
if (metadata_buf_size % dsk.meta_block_size)
{
meta_write_recheck_parallelism = 16;
metadata_buf_size = ((metadata_buf_size+dsk.meta_block_size-1) / dsk.meta_block_size) * dsk.meta_block_size;
}
if (immediate_commit != IMMEDIATE_NONE && !dsk.disable_journal_fsync)
// When two areas share a device, the fsync setting is inherited: data -> meta -> journal
if (dsk.meta_device == dsk.data_device)
{
disable_meta_fsync = disable_data_fsync;
}
if (dsk.journal_device == dsk.meta_device)
{
disable_journal_fsync = disable_meta_fsync;
}
if (immediate_commit != IMMEDIATE_NONE && !disable_journal_fsync)
{
throw std::runtime_error("immediate_commit requires disable_journal_fsync");
}
if (immediate_commit == IMMEDIATE_ALL && !dsk.disable_data_fsync)
if (immediate_commit == IMMEDIATE_ALL && !disable_data_fsync)
{
throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync");
}
// init some fields
// Both journal positions start one journal block into the journal area
journal.block_size = dsk.journal_block_size;
journal.next_free = dsk.journal_block_size;
journal.used_start = dsk.journal_block_size;
// no free space because sector is initially unmapped
journal.in_sector_pos = dsk.journal_block_size;
}
// Lets the disk layout object compute its lengths, copies the journal
// geometry from it and allocates the in-memory buffers:
// - either the whole metadata area or only the clean entry bitmaps,
// - either the whole journal or only the journal sector buffers,
// - the per-sector bookkeeping array.
// Throws std::runtime_error or std::bad_alloc when an allocation fails.
void blockstore_impl_t::calc_lengths()
{
    dsk.calc_lengths();
    journal.len = dsk.journal_len;
    journal.block_size = dsk.journal_block_size;
    journal.offset = dsk.journal_offset;
    // Metadata
    if (inmemory_meta)
    {
        metadata_buffer = memalign(MEM_ALIGNMENT, dsk.meta_len);
        if (metadata_buffer == NULL)
        {
            throw std::runtime_error("Failed to allocate memory for the metadata ("+std::to_string(dsk.meta_len/1024/1024)+" MB)");
        }
    }
    else if (dsk.clean_entry_bitmap_size || dsk.data_csum_type)
    {
        const auto bitmap_bytes = dsk.block_count * 2 * dsk.clean_entry_bitmap_size;
        clean_bitmaps = (uint8_t*)malloc(bitmap_bytes);
        if (clean_bitmaps == NULL)
        {
            throw std::runtime_error(
                "Failed to allocate memory for the metadata sparse write bitmap ("+
                std::to_string(bitmap_bytes / 1024 / 1024)+" MB)"
            );
        }
    }
    // Journal
    if (!journal.inmemory)
    {
        // Only the sector buffers are kept in memory
        journal.sector_buf = (uint8_t*)memalign(MEM_ALIGNMENT, journal.sector_count * dsk.journal_block_size);
        if (journal.sector_buf == NULL)
        {
            throw std::bad_alloc();
        }
    }
    else
    {
        journal.buffer = memalign(MEM_ALIGNMENT, journal.len);
        if (journal.buffer == NULL)
        {
            throw std::runtime_error("Failed to allocate memory for journal ("+std::to_string(journal.len/1024/1024)+" MB)");
        }
    }
    // Zero-initialized state for every sector buffer
    journal.sector_info = (journal_sector_info_t*)calloc(journal.sector_count, sizeof(journal_sector_info_t));
    if (journal.sector_info == NULL)
    {
        throw std::bad_alloc();
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,258 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "blockstore_impl.h"
// Starts (or resumes) a rollback operation. op->buf holds op->len obj_ver_id items.
// - When the operation already has a non-zero op_state, control goes to continue_rollback().
// - Otherwise every requested object version is classified by the lambda passed to
//   split_stab_op(); any result other than 1 is returned to the caller as is.
// - Then journal space and SQEs are checked and one JE_ROLLBACK journal entry is
//   written per requested object version.
// Returns 0 when there is not enough journal space yet, 1 after the journal writes
// were prepared (op_state becomes 1).
int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
{
if (PRIV(op)->op_state)
{
return continue_rollback(op);
}
int r = split_stab_op(op, [this](obj_ver_id ov)
{
// Check that there are some versions greater than ov.version (which may be zero),
// check that they're unstable, synced, and not currently written to
auto dirty_it = dirty_db.lower_bound((obj_ver_id){
.oid = ov.oid,
.version = UINT64_MAX,
});
if (dirty_it == dirty_db.begin())
{
// Already rolled back, skip this object version
return STAB_SPLIT_DONE;
}
else
{
// Step back to the newest dirty entry preceding (oid, UINT64_MAX)
dirty_it--;
if (dirty_it->first.oid != ov.oid || dirty_it->first.version < ov.version)
{
// Already rolled back, skip this object version
return STAB_SPLIT_DONE;
}
// Walk from the newest version down to (but excluding) ov.version
while (dirty_it->first.oid == ov.oid && dirty_it->first.version > ov.version)
{
if (IS_IN_FLIGHT(dirty_it->second.state))
{
// Object write is still in progress. Wait until the write request completes
return STAB_SPLIT_WAIT;
}
else if (!IS_SYNCED(dirty_it->second.state) ||
IS_STABLE(dirty_it->second.state))
{
// Sync the object
return STAB_SPLIT_SYNC;
}
if (dirty_it == dirty_db.begin())
{
break;
}
dirty_it--;
}
return STAB_SPLIT_TODO;
}
});
if (r != 1)
{
return r;
}
// Check journal space
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, op->len, sizeof(journal_entry_rollback), 0))
{
return 0;
}
// There is sufficient space. Check SQEs
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
// Prepare and submit journal entries
// s counts the sector writes prepared here; it must match the space check result
int s = 0;
auto v = (obj_ver_id*)op->buf;
for (int i = 0; i < op->len; i++, v++)
{
// The current sector is queued for writing before moving on
// when the next entry does not fit into it and it has unwritten changes
if (!journal.entry_fits(sizeof(journal_entry_rollback)) &&
journal.sector_info[journal.cur_sector].dirty)
{
prepare_journal_sector_write(journal.cur_sector, op);
s++;
}
journal_entry_rollback *je = (journal_entry_rollback*)
prefill_single_journal_entry(journal, JE_ROLLBACK, sizeof(journal_entry_rollback));
je->oid = v->oid;
je->version = v->version;
je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32;
}
// Queue the last (partially filled) sector too
prepare_journal_sector_write(journal.cur_sector, op);
s++;
assert(s == space_check.sectors_to_write);
PRIV(op)->op_state = 1;
return 1;
}
// Resumes a rollback operation according to PRIV(op)->op_state:
// - state 2: submit a journal fsync (unless disable_journal_fsync is set) and move to state 3;
//   with fsync disabled, execution falls straight through to the final step,
// - state 4: apply the rollback in memory and complete the operation,
// - any other state: nothing to do yet, return 1.
// NOTE(review): transitions 1 -> 2 and 3 -> 4 are not visible here - presumably done
// by the write completion handler (handle_write_event).
// Returns 2 after the operation has been finished, 1 otherwise.
int blockstore_impl_t::continue_rollback(blockstore_op_t *op)
{
if (PRIV(op)->op_state == 2)
goto resume_2;
else if (PRIV(op)->op_state == 4)
goto resume_4;
else
return 1;
resume_2:
if (!disable_journal_fsync)
{
BS_SUBMIT_GET_SQE(sqe, data);
my_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = 3;
return 1;
}
resume_4:
// Declared without initializers because "goto resume_4" jumps to this point
obj_ver_id* v;
int i;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
mark_rolled_back(*v);
}
// Acknowledge op
op->retval = 0;
FINISH_OP(op);
return 2;
}
// Drops in-memory dirty entries of object <ov.oid> with versions greater than
// <ov.version>. The scan goes from the newest version downwards and stops at
// the first entry which is in flight or stable, so only the newest run of
// removable entries is erased. The unstable_writes record of the object is
// then lowered to the remaining unstable version or removed altogether.
void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
{
    const auto range_end = dirty_db.lower_bound((obj_ver_id){
        .oid = ov.oid,
        .version = UINT64_MAX,
    });
    if (range_end == dirty_db.begin())
    {
        // No dirty entries precede (oid, UINT64_MAX)
        return;
    }
    uint64_t remaining_unstable = 0;
    auto range_begin = range_end;
    auto cur = range_end;
    --cur;
    for (;;)
    {
        if (cur->first.oid != ov.oid)
        {
            break;
        }
        if (cur->first.version <= ov.version)
        {
            // Reached the version we roll back to
            if (!IS_STABLE(cur->second.state))
            {
                remaining_unstable = cur->first.version;
            }
            break;
        }
        if (IS_IN_FLIGHT(cur->second.state) || IS_STABLE(cur->second.state))
        {
            break;
        }
        // This entry is to be removed
        range_begin = cur;
        if (cur == dirty_db.begin())
        {
            break;
        }
        --cur;
    }
    if (range_begin == range_end)
    {
        // Nothing to remove
        return;
    }
    erase_dirty(range_begin, range_end, UINT64_MAX);
    auto unstab_it = unstable_writes.find(ov.oid);
    if (unstab_it == unstable_writes.end())
    {
        return;
    }
    if (remaining_unstable != 0)
    {
        unstab_it->second = remaining_unstable;
    }
    else
    {
        unstable_writes.erase(unstab_it);
    }
}
// Removes dirty entries in the range [dirty_start, dirty_end) from dirty_db and
// releases everything they hold: data blocks of big writes, journal sector
// references and dynamically allocated per-entry data.
// dirty_start / dirty_end - iterators into dirty_db; an empty range is a no-op.
// clean_loc - data location which must NOT be freed even if a removed big write
//   points to it; callers pass UINT64_MAX when there is no such location.
void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc)
{
if (dirty_end == dirty_start)
{
return;
}
// Look at the last entry of the range first
auto dirty_it = dirty_end;
dirty_it--;
if (IS_DELETE(dirty_it->second.state))
{
object_id oid = dirty_it->first.oid;
#ifdef BLOCKSTORE_DEBUG
printf("Unblock writes-after-delete %jx:%jx v%ju\n", oid.inode, oid.stripe, dirty_it->first.version);
#endif
dirty_it = dirty_end;
// Unblock operations blocked by delete flushing
// Entries waiting for the delete switch to <next_state>; once a big write is
// unblocked, the following waiting entries are switched to BS_ST_WAIT_BIG instead
uint32_t next_state = BS_ST_IN_FLIGHT;
while (dirty_it != dirty_db.end() && dirty_it->first.oid == oid)
{
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_DEL)
{
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | next_state;
if (IS_BIG_WRITE(dirty_it->second.state))
{
next_state = BS_ST_WAIT_BIG;
}
}
dirty_it++;
}
// Return to the last entry of the range for the release loop
dirty_it = dirty_end;
dirty_it--;
}
// Release resources entry by entry, walking backwards from the end of the range
while (1)
{
if ((IS_BIG_WRITE(dirty_it->second.state) || IS_DELETE(dirty_it->second.state)) &&
IS_STABLE(dirty_it->second.state))
{
big_to_flush--;
}
// Free the data block unless it is the one named by clean_loc or is unset (UINT64_MAX)
if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc &&
dirty_it->second.location != UINT64_MAX)
{
#ifdef BLOCKSTORE_DEBUG
printf("Free block %ju from %jx:%jx v%ju\n", dirty_it->second.location >> dsk.block_order,
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
#endif
data_alloc->set(dirty_it->second.location >> dsk.block_order, false);
}
// Drop one reference to the journal sector; at() throws if the sector is not tracked
auto used = --journal.used_sectors.at(dirty_it->second.journal_sector);
#ifdef BLOCKSTORE_DEBUG
printf(
"remove usage of journal offset %08jx by %jx:%jx v%ju (%ju refs)\n", dirty_it->second.journal_sector,
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, used
);
#endif
if (used == 0)
{
journal.used_sectors.erase(dirty_it->second.journal_sector);
if (dirty_it->second.journal_sector == journal.sector_info[journal.cur_sector].offset)
{
// Mark current sector as "full" to select the new one
journal.in_sector_pos = dsk.journal_block_size;
}
flusher->mark_trim_possible();
}
free_dirty_dyn_data(dirty_it->second);
if (dirty_it == dirty_start)
{
break;
}
dirty_it--;
}
dirty_db.erase(dirty_start, dirty_end);
}
// Detaches the dynamic data (bitmap and checksums) from a dirty entry.
// When alloc_dyn_data is set, the data starts with an int reference counter:
// it is decremented and the memory is freed once it drops to zero.
// The pointer in the entry is cleared in any case.
void blockstore_impl_t::free_dirty_dyn_data(dirty_entry & e)
{
    if (!e.dyn_data)
    {
        return;
    }
    if (alloc_dyn_data)
    {
        int *refcount = (int*)e.dyn_data;
        --*refcount;
        if (*refcount == 0)
        {
            // dyn_data is owned by the entries rather than
            // referring to the in-memory journal, so release it
            free(e.dyn_data);
        }
    }
    e.dyn_data = NULL;
}

Some files were not shown because too many files have changed in this diff Show More