forked from vitalif/vitastor
Compare commits
46 Commits
7dba1148e7 · 6b69db73ac · d48a824846 · 40985282ff · acf403e886 · cf03b9c84d · 7c2379d458 · a2189100dd · bb84379db6 · 714dda8151 · 834554c523 · e718116f54 · 98e3528a14 · 8e88f77101 · caa2cc2e6c · 842ba8b831 · 1493823f9e · c857272f44 · 340a4b4f27 · 5118980315 · d71cc174e3 · 0eb929f1ba · 83146fa3e2 · 15dcaf7903 · cd18ef7323 · 39531ef1a6 · d334914948 · c373425562 · 3615e57879 · 0edc6fe5a6 · 9c30df83e3 · a420c77107 · 4100d829c7 · 79ebda933e · 65d08e067e · d289753df4 · 85298ddae2 · e23296a327 · 839ec9e6e0 · 7cbfdff41a · 951272f27f · a3fb1d4c98 · 88402e6eb6 · 390239c51b · b7b2adfa32 · 36c276358b
CMakeLists.txt

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8)
 
 project(vitastor)
 
-set(VERSION "0.6.14")
+set(VERSION "0.6.17")
 
 add_subdirectory(src)
README-ru.md (54 lines changed)
@@ -52,6 +52,7 @@ Vitastor is currently in pre-release status
 - Snapshot merge (vitastor-cli {snap-rm,flatten,merge})
 - Console interface for image management (vitastor-cli {ls,create,modify})
 - Proxmox plugin
+- Simplified NFS proxy for emulating file-based access to images (suitable for VMWare)
 
 ## Development plans

@@ -59,7 +60,6 @@ Vitastor is currently in pre-release status
 - Other administration tools
 - Plugins for OpenNebula and other cloud systems
 - iSCSI proxy
-- Simplified NFS proxy
 - Faster failover
 - Background integrity checks without checksums (replica verification)
 - Checksums

@@ -407,6 +407,7 @@ Vitastor with a single-threaded NBD proxy on the same testbed
 - On the monitor hosts:
   - Set the values you need in the `/usr/lib/vitastor/mon/make-units.sh` file
   - Create systemd units for etcd and the monitors: `/usr/lib/vitastor/mon/make-units.sh`
   - Start etcd and the monitors: `systemctl start etcd vitastor-mon`
+- Put etcd_address and osd_network into `/etc/vitastor/vitastor.conf`. For example:
 ```
 {

@@ -414,7 +415,14 @@ Vitastor with a single-threaded NBD proxy on the same testbed
   "osd_network": "10.200.1.0/24"
 }
 ```
-- Create systemd units for the OSDs: `/usr/lib/vitastor/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
+- Initialize the OSDs:
+  - SSD: `/usr/lib/vitastor/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
+  - Hybrid, HDD+SSD: `/usr/lib/vitastor/mon/make-osd-hybrid.js /dev/sda /dev/sdb ...` - pass all your
+    SSDs and HDDs to the script on the command line; it will automatically allocate partitions
+    for journals on the SSDs and for data on the HDDs. The script skips HDDs which already have
+    partitions or any data on them, so if your disks are not empty, wipe them first with
+    `wipefs -a`. SSDs with a partition table are not skipped, but because the script creates new
+    partitions for the journals, the SSDs must have free unpartitioned space available.
 - You can change OSD parameters in the systemd units or in `vitastor.conf`. The meaning of some parameters:
   - `disable_data_fsync 1` - disables fsync; used with SSDs with capacitors.
   - `immediate_commit all` - used with SSDs with capacitors.

@@ -430,7 +438,6 @@ Vitastor with a single-threaded NBD proxy on the same testbed
 the drives used on one of the test setups - Intel D3-S4510 - really dislike such
 overwrites, and this option was added for them. When this mode is enabled, you also need to raise
 the `journal_sector_buffer_count` value, because otherwise Vitastor will not have enough journal write buffers.
-- Start all etcd: `systemctl start etcd`
 - Create the global configuration in etcd: `etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`
   (if all your drives are server-grade with capacitors).
 - Create pools: `etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'`.

@@ -523,9 +530,48 @@ vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
 To address an image by inode number, like in other commands, you can use the
 `--pool <POOL> --inode <INODE> --size <SIZE>` options instead of `--image testimg`.
 
+### NFS
+
+Vitastor implements a simplified NFS 3.0 proxy for emulating file-based access to images.
+It is not a full-featured file system, because the metadata of all files (images) is stored
+in etcd and kept in memory all the time - so you can't put many files into it.
+
+However, as a way to access virtual machine images, the NFS proxy works fine and
+lets you plug Vitastor into, for example, VMWare.
+
+Also, if you use the immediate_commit=all mode (for SSDs with capacitors or HDDs
+with disabled cache), the NFS server is stateless and you can freely run multiple
+instances of it and put a network load balancer or a failover scheme on top of them.
+
+vitastor-nfs usage:
+
+```
+vitastor-nfs [--etcd_address ADDR] [OTHER OPTIONS]
+
+--subdir <DIR>    export a "subdirectory" - images with the name prefix <DIR>/ (empty by default - export all images)
+--portmap 0       disable the portmap/rpcbind service on port 111 (enabled by default and requires root privileges)
+--bind <IP>       accept connections on address <IP> (default 0.0.0.0 - on all addresses)
+--nfspath <PATH>  set the NFS export path to <PATH> (default /)
+--port <PORT>     use port <PORT> for NFS services (default 2049)
+--pool <POOL>     use pool <POOL> for new images (required if the cluster has more than one pool)
+--foreground 1    stay in the foreground, do not daemonize
+```
+
+An example of mounting Vitastor via NFS:
+
+```
+vitastor-nfs --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool
+```
+
+```
+mount localhost:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
+```
+
 ### Kubernetes
 
-Vitastor has a CSI plugin for Kubernetes which supports RWO volumes.
+Vitastor has a CSI plugin for Kubernetes which supports RWO volumes, as well as block RWX volumes.
 
 To install it, take the manifests from the [csi/deploy/](csi/deploy/) directory, put
 your Vitastor connection configuration into [csi/deploy/001-csi-config-map.yaml](001-csi-config-map.yaml),
README.md (53 lines changed)
@@ -46,6 +46,7 @@ breaking changes in the future. However, the following is implemented:
 - Snapshot merge tool (vitastor-cli {snap-rm,flatten,merge})
 - Image management CLI (vitastor-cli {ls,create,modify})
 - Proxmox storage plugin
+- Simplified NFS proxy for file-based image access emulation (suitable for VMWare)
 
 ## Roadmap

@@ -53,7 +54,6 @@ breaking changes in the future. However, the following is implemented:
 - Other administrative tools
 - Plugins for OpenNebula and other cloud systems
 - iSCSI proxy
-- Simplified NFS proxy
 - Faster failover
 - Scrubbing without checksums (verification of replicas)
 - Checksums

@@ -360,6 +360,7 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
 - On the monitor hosts:
   - Edit variables at the top of `/usr/lib/vitastor/mon/make-units.sh` to desired values.
   - Create systemd units for the monitor and etcd: `/usr/lib/vitastor/mon/make-units.sh`
   - Start etcd and monitors: `systemctl start etcd vitastor-mon`
+- Put etcd_address and osd_network into `/etc/vitastor/vitastor.conf`. Example:
 ```
 {

@@ -367,7 +368,13 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
   "osd_network": "10.200.1.0/24"
 }
 ```
-- Create systemd units for your OSDs: `/usr/lib/vitastor/mon/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
+- Initialize OSDs:
+  - Simplest, SSD-only: `/usr/lib/vitastor/mon/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
+  - Hybrid, HDD+SSD: `/usr/lib/vitastor/mon/make-osd-hybrid.js /dev/sda /dev/sdb ...` - pass all your
+    devices (HDD and SSD) to this script - it will partition disks and initialize journals on its own.
+    This script skips HDDs which are already partitioned, so if you want to use non-empty disks for
+    Vitastor you should first wipe them with `wipefs -a`. SSDs with a GPT partition table are not skipped,
+    but some free unpartitioned space must be available because the script creates new partitions for journals.
 - You can change OSD configuration in units or in `vitastor.conf`. Notable configuration variables:
   - `disable_data_fsync 1` - only safe with server-grade drives with capacitors.
   - `immediate_commit all` - use this if all your drives are server-grade.

@@ -472,9 +479,49 @@ It will output the device name, like /dev/nbd0 which you can then format and mount
 
 Again, you can use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.
 
+### NFS
+
+Vitastor has a simplified NFS 3.0 proxy for file-based image access emulation. It's not
+suitable as a full-featured file system, at least because all file/image metadata is stored
+in etcd and kept in memory all the time - thus you can't put a lot of files in it.
+
+However, the NFS proxy is totally fine as a method to provide VM image access and allows you to
+plug Vitastor into, for example, VMWare. It's important to note that for VMWare it's a much
+better access method than iSCSI, because with iSCSI we'd have to put all VM images into one
+Vitastor image exported as a LUN to VMWare and formatted with VMFS. VMWare doesn't use VMFS
+over NFS.
+
+The NFS proxy is stateless if you use immediate_commit=all mode (for SSDs with capacitors or
+HDDs with disabled cache), so in that case you can run multiple NFS proxies and use a network
+load balancer or any failover method you want.
+
+vitastor-nfs usage:
+
+```
+vitastor-nfs [--etcd_address ADDR] [OTHER OPTIONS]
+
+--subdir <DIR>    export images prefixed <DIR>/ (default empty - export all images)
+--portmap 0       do not listen on port 111 (portmap/rpcbind, requires root)
+--bind <IP>       bind service to <IP> address (default 0.0.0.0)
+--nfspath <PATH>  set NFS export path to <PATH> (default is /)
+--port <PORT>     use port <PORT> for NFS services (default is 2049)
+--pool <POOL>     use <POOL> as default pool for new files (images)
+--foreground 1    stay in foreground, do not daemonize
+```
+
+Example start and mount commands:
+
+```
+vitastor-nfs --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool
+```
+
+```
+mount localhost:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
+```
+
 ### Kubernetes
 
-Vitastor has a CSI plugin for Kubernetes which supports RWO volumes.
+Vitastor has a CSI plugin for Kubernetes which supports RWO (and block RWX) volumes.
 
 To deploy it, take manifests from the [csi/deploy/](csi/deploy/) directory, put your
 Vitastor configuration in [csi/deploy/001-csi-config-map.yaml](001-csi-config-map.yaml),
Submodule cpp-btree updated: 6e20146406...45e6d1f131
@@ -1,4 +1,4 @@
-VERSION ?= v0.6.14
+VERSION ?= v0.6.17
 
 all: build push
 
@@ -49,7 +49,7 @@ spec:
           capabilities:
             add: ["SYS_ADMIN"]
           allowPrivilegeEscalation: true
-        image: vitalif/vitastor-csi:v0.6.14
+        image: vitalif/vitastor-csi:v0.6.17
         args:
           - "--node=$(NODE_ID)"
           - "--endpoint=$(CSI_ENDPOINT)"
@@ -116,7 +116,7 @@ spec:
          privileged: true
          capabilities:
            add: ["SYS_ADMIN"]
-        image: vitalif/vitastor-csi:v0.6.14
+        image: vitalif/vitastor-csi:v0.6.17
        args:
          - "--node=$(NODE_ID)"
          - "--endpoint=$(CSI_ENDPOINT)"
csi/deploy/example-pvc-block.yaml (new file, 13 lines)
@@ -0,0 +1,13 @@
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: test-vitastor-pvc-block
spec:
  storageClassName: vitastor
  volumeMode: Block
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 10Gi
csi/deploy/example-test-pod-block.yaml (new file, 17 lines)
@@ -0,0 +1,17 @@
apiVersion: v1
kind: Pod
metadata:
  name: vitastor-test-block-pvc
  namespace: default
spec:
  containers:
    - name: vitastor-test-block-pvc
      image: nginx
      volumeDevices:
        - name: data
          devicePath: /dev/xvda
  volumes:
    - name: data
      persistentVolumeClaim:
        claimName: test-vitastor-pvc-block
        readOnly: false
csi/deploy/example-test-pod.yaml (new file, 17 lines)
@@ -0,0 +1,17 @@
apiVersion: v1
kind: Pod
metadata:
  name: vitastor-test-nginx
  namespace: default
spec:
  containers:
    - name: vitastor-test-nginx
      image: nginx
      volumeMounts:
        - mountPath: /usr/share/nginx/html/s3
          name: data
  volumes:
    - name: data
      persistentVolumeClaim:
        claimName: test-vitastor-pvc
        readOnly: false
@@ -5,7 +5,7 @@ package vitastor
 
 const (
     vitastorCSIDriverName    = "csi.vitastor.io"
-    vitastorCSIDriverVersion = "0.6.14"
+    vitastorCSIDriverVersion = "0.6.17"
 )
 
 // Config struct fills the parameters of request or user input
@@ -67,29 +67,44 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
     klog.Infof("received node publish volume request %+v", protosanitizer.StripSecrets(req))
 
     targetPath := req.GetTargetPath()
+    isBlock := req.GetVolumeCapability().GetBlock() != nil
 
     // Check that it's not already mounted
-    free, error := mount.IsNotMountPoint(ns.mounter, targetPath)
+    _, error := mount.IsNotMountPoint(ns.mounter, targetPath)
     if (error != nil)
     {
         if (os.IsNotExist(error))
         {
-            error := os.MkdirAll(targetPath, 0777)
-            if (error != nil)
+            if (isBlock)
             {
-                return nil, status.Error(codes.Internal, error.Error())
+                pathFile, err := os.OpenFile(targetPath, os.O_CREATE|os.O_RDWR, 0o600)
+                if (err != nil)
+                {
+                    klog.Errorf("failed to create block device mount target %s with error: %v", targetPath, err)
+                    return nil, status.Error(codes.Internal, err.Error())
+                }
+                err = pathFile.Close()
+                if (err != nil)
+                {
+                    klog.Errorf("failed to close %s with error: %v", targetPath, err)
+                    return nil, status.Error(codes.Internal, err.Error())
+                }
+            }
+            else
+            {
+                err := os.MkdirAll(targetPath, 0777)
+                if (err != nil)
+                {
+                    klog.Errorf("failed to create fs mount target %s with error: %v", targetPath, err)
+                    return nil, status.Error(codes.Internal, err.Error())
+                }
             }
-            free = true
         }
         else
         {
             return nil, status.Error(codes.Internal, error.Error())
         }
     }
-    if (!free)
-    {
-        return &csi.NodePublishVolumeResponse{}, nil
-    }
 
     ctxVars := make(map[string]string)
     err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)

@@ -149,7 +164,6 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
 
     // Format the device (ext4 or xfs)
     fsType := req.GetVolumeCapability().GetMount().GetFsType()
-    isBlock := req.GetVolumeCapability().GetBlock() != nil
     opt := req.GetVolumeCapability().GetMount().GetMountFlags()
     opt = append(opt, "_netdev")
     if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
debian/changelog (vendored, 2 lines changed)
@@ -1,4 +1,4 @@
-vitastor (0.6.14-1) unstable; urgency=medium
+vitastor (0.6.17-1) unstable; urgency=medium
 
   * RDMA support
   * Bugfixes
debian/vitastor-client.install (vendored, 1 line changed)
@@ -2,5 +2,6 @@ usr/bin/vita
 usr/bin/vitastor-cli
 usr/bin/vitastor-rm
 usr/bin/vitastor-nbd
+usr/bin/vitastor-nfs
 usr/lib/*/libvitastor*.so*
 mon/make-osd.sh /usr/lib/vitastor
debian/vitastor.Dockerfile (vendored, 8 lines changed)
@@ -33,8 +33,8 @@ RUN set -e -x; \
     mkdir -p /root/packages/vitastor-$REL; \
     rm -rf /root/packages/vitastor-$REL/*; \
     cd /root/packages/vitastor-$REL; \
-    cp -r /root/vitastor vitastor-0.6.14; \
-    cd vitastor-0.6.14; \
+    cp -r /root/vitastor vitastor-0.6.17; \
+    cd vitastor-0.6.17; \
     ln -s /root/fio-build/fio-*/ ./fio; \
     FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
     ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \

@@ -47,8 +47,8 @@ RUN set -e -x; \
     rm -rf a b; \
     echo "dep:fio=$FIO" > debian/fio_version; \
     cd /root/packages/vitastor-$REL; \
-    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.14.orig.tar.xz vitastor-0.6.14; \
-    cd vitastor-0.6.14; \
+    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.17.orig.tar.xz vitastor-0.6.17; \
+    cd vitastor-0.6.17; \
     V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
     DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
     DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
docs/gen-docs.js (new executable file, 55 lines)
@@ -0,0 +1,55 @@
#!/usr/bin/nodejs

const fs = require('fs');
const yaml = require('yaml');

const L = {
    en: {},
    ru: {
        Type: 'Тип',
        Default: 'Значение по умолчанию',
        Minimum: 'Минимальное значение',
    },
};
const types = {
    en: {
        string: 'string',
        bool: 'boolean',
        int: 'integer',
        sec: 'seconds',
        ms: 'milliseconds',
        us: 'microseconds',
    },
    ru: {
        string: 'строка',
        bool: 'булево (да/нет)',
        int: 'целое число',
        sec: 'секунды',
        ms: 'миллисекунды',
        us: 'микросекунды',
    },
};
const params_files = fs.readdirSync(__dirname+'/params')
    .filter(f => f.substr(-4) == '.yml')
    .map(f => f.substr(0, f.length-4));

for (const file of params_files)
{
    const cfg = yaml.parse(fs.readFileSync(__dirname+'/params/'+file+'.yml', { encoding: 'utf-8' }));
    for (const lang in types)
    {
        let out = '\n\n{{< toc >}}';
        for (const c of cfg)
        {
            out += `\n\n## ${c.name}\n\n`;
            out += `- ${L[lang]['Type'] || 'Type'}: ${c["type_"+lang] || types[lang][c.type] || c.type}\n`;
            if (c.default !== undefined)
                out += `- ${L[lang]['Default'] || 'Default'}: ${c.default}\n`;
            if (c.min !== undefined)
                out += `- ${L[lang]['Minimum'] || 'Minimum'}: ${c.min}\n`;
            out += `\n`+(c["info_"+lang] || c["info"]).replace(/\s+$/, '');
        }
        const head = fs.readFileSync(__dirname+'/params/head/'+file+'.'+lang+'.md', { encoding: 'utf-8' });
        fs.writeFileSync(__dirname+'/hugo/content/config/'+file+'.'+lang+'.md', head.replace(/\s+$/, '')+out+"\n");
    }
}
docs/hugo/archetypes/default.md (new file, 6 lines)
@@ -0,0 +1,6 @@
---
title: "{{ replace .Name "-" " " | title }}"
date: {{ .Date }}
draft: true
---
docs/hugo/config.yaml (new file, 35 lines)
@@ -0,0 +1,35 @@
baseURL: http://localhost
title: Vitastor
theme: hugo-geekdoc
#languageCode: en-us

pluralizeListTitles: false

# Geekdoc required configuration
pygmentsUseClasses: true
pygmentsCodeFences: true
disablePathToLower: true

# Required if you want to render robots.txt template
enableRobotsTXT: true

defaultContentLanguage: en
languages:
  en:
    weight: 1
    languageName: English
  ru:
    weight: 1
    languageName: Русский

markup:
  goldmark:
    renderer:
      # Needed for mermaid shortcode
      unsafe: true
  tableOfContents:
    startLevel: 1
    endLevel: 9

taxonomies:
  tag: tags
docs/hugo/content/_index.md (new file, 6 lines)
@@ -0,0 +1,6 @@
## The Idea

Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
architecturally similar to Ceph, which means strong consistency, primary replication,
symmetric clustering and automatic data distribution over any number of drives
of any size with configurable redundancy (replication or erasure codes/XOR).
docs/hugo/content/config/_index.en.md (new file, 61 lines)
@@ -0,0 +1,61 @@
---
title: Parameter Reference
weight: 1
---

Vitastor configuration consists of:
- Configuration parameters (key-value), described here
- [Pool configuration]({{< ref "config/pool" >}})
- OSD placement tree configuration
- Inode configuration, i.e. image metadata like name, size and parent reference

Configuration parameters can be set in 3 places:
- Configuration file (`/etc/vitastor/vitastor.conf` or other path)
- etcd key `/vitastor/config/global`. Most variables can be set there, but etcd
  connection parameters should obviously be set in the configuration file.
- Command line of Vitastor components: OSD, mon, fio and QEMU options,
  OpenStack/Proxmox/etc configuration. The latter doesn't allow setting all
  variables directly, but it allows you to override the configuration file and
  set everything you need inside it.

In the future, additional configuration methods may be added:
- OSD superblock which will, by design, contain parameters related to the disk
  layout and to one specific OSD.
- OSD-specific keys in etcd like `/vitastor/config/osd/<number>`.

## Common Parameters

These are the most common parameters which apply to all components of Vitastor.

[See the list]({{< ref "common" >}})

## Cluster-Wide Disk Layout Parameters

These parameters apply to clients and OSDs and can't be changed after OSD
initialization.

[See the list]({{< ref "layout-cluster" >}})

## OSD Disk Layout Parameters

These parameters apply to OSDs and can't be changed after OSD initialization.

[See the list]({{< ref "layout-osd" >}})

## Network Protocol Parameters

These parameters apply to clients and OSDs and can be changed with a restart.

[See the list]({{< ref "network" >}})

## Runtime OSD Parameters

These parameters apply to OSDs and can be changed with an OSD restart.

[See the list]({{< ref "osd" >}})

## Monitor Parameters

These parameters only apply to Monitors.

[See the list]({{< ref "monitor" >}})
docs/hugo/content/config/_index.ru.md (new file, 63 lines)
@@ -0,0 +1,63 @@
---
title: Parameter Reference
weight: 1
---

Vitastor configuration consists of:
- Parameters (key-value), described on this page
- Pool settings
- OSD tree settings
- Inode settings, i.e. image metadata such as name, size and references to the
  parent image

Configuration parameters can be set in 3 places:
- The configuration file (`/etc/vitastor/vitastor.conf` or another path)
- The etcd key `/vitastor/config/global`. Most parameters can be set there,
  except, naturally, the etcd connection parameters themselves, which must be
  set in the configuration file
- The command line of Vitastor components: OSD, monitor, fio and QEMU options,
  OpenStack, Proxmox and similar settings. The latter usually don't expose the
  full set of parameters directly, but let you define the path to the
  configuration file and set any parameters inside it.

In the future, other configuration methods may also be added:
- An OSD superblock, which will store the OSD parameters related to the disk
  format and to that specific OSD.
- OSD-specific etcd keys like `/vitastor/config/osd/<number>`.

## Common parameters

These are the most general parameters, used by all components of Vitastor.

[See the list]({{< ref "common" >}})

## Cluster-wide disk layout parameters

These parameters are used by clients and OSDs and cannot be changed after OSD
initialization.

[See the list]({{< ref "layout-cluster" >}})

## OSD disk layout parameters

These parameters are used by OSDs and cannot be changed after OSD initialization.

[See the list]({{< ref "layout-osd" >}})

## Network protocol parameters

These parameters are used by clients and OSDs and can be changed with a restart.

[See the list]({{< ref "network" >}})

## Runtime OSD parameters

These parameters are used by OSDs and can be changed with an OSD restart.

[See the list]({{< ref "osd" >}})

## Monitor parameters

These parameters are used only by Vitastor monitors.

[See the list]({{< ref "monitor" >}})
docs/hugo/content/config/pool.en.md (new file, 178 lines)
@@ -0,0 +1,178 @@
---
title: Pool configuration
weight: 100
---

Pool configuration is set in the etcd key `/vitastor/config/pools` in the following
JSON format:

```
{
  "<Numeric ID>": {
    "name": "<name>",
    ...other parameters...
  }
}
```

{{< toc >}}

# Parameters

## name

- Type: string
- Required

Pool name.

## scheme

- Type: string
- Required
- One of: "replicated", "xor" or "jerasure"

Redundancy scheme used for data in this pool.

## pg_size

- Type: integer
- Required

Total number of disks for PGs of this pool - i.e., the number of replicas for
replicated pools and the number of data plus parity disks for EC/XOR pools.

## parity_chunks

- Type: integer

Number of parity chunks for EC/XOR pools. For such pools, data will be lost
if you lose more than parity_chunks disks at once, so this parameter can be
equally described as FTT (number of failures to tolerate).

Required for EC/XOR pools, ignored for replicated pools.

## pg_minsize

- Type: integer
- Required

Number of available live disks for PGs of this pool to remain active.
That is, if it becomes impossible to place PG data on at least (pg_minsize)
OSDs, the PG is deactivated for both reads and writes. So you know that a fresh
write always goes to at least (pg_minsize) OSDs (disks).

FIXME: pg_minsize behaviour may be changed in the future to only make PGs
read-only instead of deactivating them.

## pg_count

- Type: integer
- Required

Number of PGs for this pool. The value should be big enough for the monitor /
LP solver to be able to optimize data placement.

"Enough" is usually around 64-128 PGs per OSD, i.e. you set pg_count for the pool
to (total OSD count * 100 / pg_size). You can round it to the closest power of 2,
because that makes it easier to reduce or increase the PG count later by dividing or
multiplying it by 2.

In Vitastor, PGs are ephemeral, so you can change the pool PG count at any time just
by overwriting the pool configuration in etcd. The amount of data affected by a
rebalance will be smaller if the new PG count is a multiple of the old PG count
or vice versa.
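To make the 100-PGs-per-OSD rule of thumb above concrete, here is a small sketch in JavaScript (an editor's illustration, not code from the repository; the function and parameter names are invented):

```
// Suggest pg_count for a pool: ~100 PGs per OSD, each PG occupying pg_size OSDs,
// rounded to the closest power of 2 so it can later be halved or doubled cheaply.
function suggestPgCount(osdCount, pgSize, pgsPerOsd = 100)
{
    const raw = osdCount * pgsPerOsd / pgSize;
    return Math.pow(2, Math.round(Math.log2(raw)));
}

console.log(suggestPgCount(12, 2)); // 12 OSDs, 2 replicas -> 600 -> 512
```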
## failure_domain

- Type: string
- Default: host

Failure domain specification. Must be "host" or "osd" or refer to one of the
placement tree levels, defined in [placement_levels]({{< ref "config/monitor#placement_levels" >}}).

Two replicas, or two parts in the case of EC/XOR, of the same block of data are
never put on OSDs in the same failure domain (for example, on the same host).
So the failure domain specifies the unit whose failure you are protecting yourself
from.

## max_osd_combinations

- Type: integer
- Default: 10000

The Vitastor data placement algorithm is based on the LP solver, and the OSD combinations
which are fed to it are generated randomly. This parameter specifies the maximum
number of combinations to generate when optimising PG placement.

This parameter usually doesn't need to be changed.

## pg_stripe_size

- Type: integer
- Default: 0

Specifies the stripe size for this pool according to which images are split into
different PGs. The stripe size can't be smaller than [block_size]({{< ref "config/layout-cluster#block_size" >}})
multiplied by (pg_size - parity_chunks) for EC/XOR pools, or 1 for replicated pools,
and the same value is used by default.

This means the first `pg_stripe_size = (block_size * (pg_size-parity_chunks))` bytes
of an image go to one PG, the next `pg_stripe_size` bytes go to another PG and so on.

Usually doesn't need to be changed separately from the block size.
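As a worked example (illustrative numbers only, using the 128 KB default block size): for a jerasure 2+1 pool, i.e. pg_size 3 with parity_chunks 1, the default stripe size is

$$\text{pg\_stripe\_size} = \text{block\_size} \times (\text{pg\_size} - \text{parity\_chunks}) = 128\,\text{KB} \times (3-1) = 256\,\text{KB},$$

so the first 256 KB of an image go to one PG and the next 256 KB to another.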
## root_node

- Type: string

Specifies the root node of the OSD tree to restrict this pool's OSDs to.
The referenced root node must exist in /vitastor/config/node_placement.

## osd_tags

- Type: string or array of strings

Specifies OSD tags to restrict this pool to. If multiple tags are specified,
only OSDs having all of these tags will be used for this pool.

## primary_affinity_tags

- Type: string or array of strings

Specifies OSD tags to prefer putting primary OSDs of this pool on.
Note that for EC/XOR pools Vitastor always prefers to put the primary OSD on one
of the OSDs containing a data chunk for a PG.

# Examples

## Replicated pool

```
{
  "1": {
    "name":"testpool",
    "scheme":"replicated",
    "pg_size":2,
    "pg_minsize":1,
    "pg_count":256,
    "failure_domain":"host"
  }
}
```

## Erasure-coded pool

```
{
  "2": {
    "name":"ecpool",
    "scheme":"jerasure",
    "pg_size":3,
    "parity_chunks":1,
    "pg_minsize":2,
    "pg_count":256,
    "failure_domain":"host"
  }
}
```
docs/hugo/content/installation/packages.md (new file, 41 lines)
@@ -0,0 +1,41 @@
---
title: Packages
weight: 2
---

## Debian

- Trust the Vitastor package signing key:
  `wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
- Add the Vitastor package repository to your /etc/apt/sources.list:
  - Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
  - Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
- For Debian 10 (Buster) also enable the backports repository:
  `deb http://deb.debian.org/debian buster-backports main`
- Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu`

## CentOS

- Add the Vitastor package repository:
  - CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm`
  - CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm`
- Enable EPEL: `yum/dnf install epel-release`
- Enable additional CentOS repositories:
  - CentOS 7: `yum install centos-release-scl`
  - CentOS 8: `dnf install centos-release-advanced-virtualization`
- Enable elrepo-kernel:
  - CentOS 7: `yum install https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm`
  - CentOS 8: `dnf install https://www.elrepo.org/elrepo-release-8.el8.elrepo.noarch.rpm`
- Install packages: `yum/dnf install vitastor lpsolve etcd kernel-ml qemu-kvm`

## Installation requirements

- Linux kernel 5.4 or newer, for io_uring support. 5.8 or later is highly
  recommended because io_uring is a relatively new technology and there is
  at least one bug which reproduces with io_uring and HP SmartArray
  controllers in 5.4
- liburing 0.4 or newer
- lp_solve
- etcd 3.4.15 or newer. Earlier versions won't work because of various bugs,
  for example [#12402](https://github.com/etcd-io/etcd/pull/12402).
- node.js 10 or newer
docs/hugo/content/installation/quickstart.md (new file, 72 lines)
@@ -0,0 +1,72 @@
---
title: Quick Start
weight: 1
---

Prepare:

- Get some SATA or NVMe SSDs with capacitors (server-grade drives). You can use desktop SSDs
  with lazy fsync, but prepare for inferior single-thread latency. Read more about capacitors
  [here]({{< ref "config/layout-cluster#immediate_commit" >}}).
- Get a fast network (at least 10 Gbit/s). Something like Mellanox ConnectX-4 with RoCEv2 is ideal.
- Disable CPU powersaving: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
- [Install Vitastor packages]({{< ref "installation/packages" >}}).

## Configure monitors

On the monitor hosts:
- Edit variables at the top of `/usr/lib/vitastor/mon/make-units.sh` to desired values.
- Create systemd units for the monitor and etcd: `/usr/lib/vitastor/mon/make-units.sh`
- Start etcd and monitors: `systemctl start etcd vitastor-mon`

## Configure OSDs

- Put etcd_address and osd_network into `/etc/vitastor/vitastor.conf`. Example:
  ```
  {
    "etcd_address": ["10.200.1.10:2379","10.200.1.11:2379","10.200.1.12:2379"],
    "osd_network": "10.200.1.0/24"
  }
  ```
- Initialize OSDs:
  - Simplest, SSD-only: `/usr/lib/vitastor/mon/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
  - Hybrid, HDD+SSD: `/usr/lib/vitastor/mon/make-osd-hybrid.js /dev/sda /dev/sdb ...` - pass all your
    devices (HDD and SSD) to this script - it will partition disks and initialize journals on its own.
    This script skips HDDs which are already partitioned, so if you want to use non-empty disks for
    Vitastor you should first wipe them with `wipefs -a`. SSDs with a GPT partition table are not skipped,
    but some free unpartitioned space must be available because the script creates new partitions for journals.
- You can change OSD configuration in units or in `vitastor.conf`.
  Check the [Configuration Reference]({{< ref "config" >}}) for parameter descriptions.
- `systemctl start vitastor.target` everywhere.
- If all your drives have capacitors, create the global configuration in etcd: \
  `etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`

## Create a pool

Create the pool configuration in etcd:

```
etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool",
  "scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'
```

For jerasure pools the configuration should look like the following:

```
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
  "scheme":"jerasure","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}}'
```

After you do this, one of the monitors will configure PGs and the OSDs will start them.

You can check PG states with `etcdctl --endpoints=... get --prefix /vitastor/pg/state`. All PGs should become 'active'.

## Create an image

Use vitastor-cli ([read the CLI documentation here]({{< ref "usage/cli" >}})):

```
vitastor-cli create -s 10G testimg
```

After that, you can run benchmarks or start QEMU manually with this image.
docs/hugo/content/installation/source.md (new file, 54 lines)
@@ -0,0 +1,54 @@
---
title: Building from Source
weight: 3
---

## Requirements

- gcc and g++ 8 or newer, clang 10 or newer, or another compiler with C++11 plus
  designated initializers support from C++20
- CMake
- liburing, jerasure headers

## Basic instructions

Download the source, for example using git: `git clone --recurse-submodules https://yourcmc.ru/git/vitalif/vitastor/`

Get the `fio` source and symlink it into `<vitastor>/fio`. If you don't want to build the fio engine,
you can disable it by passing `-DWITH_FIO=no` to cmake.

Build and install Vitastor:

```
cd vitastor
mkdir build
cd build
cmake .. && make -j8 install
```

## QEMU Driver

It's recommended to build the QEMU driver (qemu_driver.c) in-tree, as a part of
the QEMU build process. To do that:
- Install the vitastor client library headers (from source or from the vitastor-client-dev package)
- Take the corresponding patch from `patches/qemu-*-vitastor.patch` and apply it to the QEMU source
- Copy `src/qemu_driver.c` to the QEMU source directory as `block/block-vitastor.c`
- Build QEMU as usual

But it is also possible to build it out-of-tree. To do that:
- Get the QEMU source, begin to build it, stop the build and copy the headers:
  - `<qemu>/include` → `<vitastor>/qemu/include`
  - Debian:
    * Use qemu packages from the main repository
    * `<qemu>/b/qemu/config-host.h` → `<vitastor>/qemu/b/qemu/config-host.h`
    * `<qemu>/b/qemu/qapi` → `<vitastor>/qemu/b/qemu/qapi`
  - CentOS 8:
    * Use qemu packages from the Advanced-Virtualization repository. To enable it, run
      `yum install centos-release-advanced-virtualization.noarch` and then `yum install qemu`
    * `<qemu>/config-host.h` → `<vitastor>/qemu/b/qemu/config-host.h`
    * For QEMU 3.0+: `<qemu>/qapi` → `<vitastor>/qemu/b/qemu/qapi`
    * For QEMU 2.0+: `<qemu>/qapi-types.h` → `<vitastor>/qemu/b/qemu/qapi-types.h`
  - `config-host.h` and `qapi` are required because they contain generated headers
- Configure Vitastor with `WITH_QEMU=yes` and, if you're on RHEL, also with `QEMU_PLUGINDIR=qemu-kvm`:
  `cmake .. -DWITH_QEMU=yes`.
- After that, Vitastor will build `block-vitastor.so` during its build process.
docs/hugo/content/introduction/_index.md (new file, 4 lines)
@@ -0,0 +1,4 @@
---
title: Introduction
weight: -1
---
docs/hugo/content/introduction/architecture.md (new file, 73 lines)
@@ -0,0 +1,73 @@
---
title: Architecture
weight: 3
---

For people familiar with Ceph, Vitastor is quite similar:

- Vitastor also has Pools, PGs, OSDs, Monitors, Failure Domains and a Placement Tree:
  - OSD (Object Storage Daemon) is a process that stores data and serves read/write requests.
  - PG (Placement Group) is a container for data that (normally) shares the same replicas.
  - Pool is a container for data that has the same redundancy scheme and placement rules.
  - Monitor is a separate daemon that watches cluster state and controls data distribution.
  - Failure Domain is a group of OSDs that you allow to fail together. It's "host" by default.
  - Placement Tree groups OSDs in a hierarchy to later split them into Failure Domains.
- Vitastor also distributes every image's data across the whole cluster.
- Vitastor is also transactional (every write to the cluster is atomic).
- OSDs also have journal and metadata and they can also be put on separate drives.
- Just like in Ceph, the client library attempts to recover from any cluster failure, so
  you can basically reboot the whole cluster and only pause, but not crash, your clients
  (please report a bug if a client crashes in that case).

However, there are also differences:

- Vitastor's main focus is on SSDs. Hybrid SSD+HDD setups are also possible.
- Vitastor OSD is (and will always be) single-threaded. If you want to dedicate more than 1 core
  per drive you should run multiple OSDs, each on a different partition of the drive.
  Vitastor isn't CPU-hungry though (as opposed to Ceph), so 1 core is sufficient in a lot of cases.
- Metadata and journal are always kept in memory. Metadata size depends linearly on drive capacity
  and data store block size, which is 128 KB by default. With 128 KB blocks metadata should occupy
  around 512 MB per 1 TB (that is 64 bytes per block, as 1 TB / 128 KB = 2^23 blocks - and still
  less than Ceph wants). The journal doesn't have to be big; the example test below was conducted
  with only a 16 MB journal. A big journal is probably even harmful, as dirty write metadata also
  takes some memory.
- The Vitastor storage layer doesn't have internal copy-on-write or redirect-write. I know that maybe
  it's possible to create a good copy-on-write storage, but it's much harder and makes performance
  less deterministic, so CoW isn't used in Vitastor.
- The basic layer of Vitastor is block storage with fixed-size blocks, not object storage with
  rich semantics like in Ceph (RADOS).
- There's a "lazy fsync" mode which allows batching writes before flushing them to the disk.
  This allows using Vitastor with desktop SSDs, but still lowers performance due to additional
  network roundtrips, so use server SSDs with capacitor-based power loss protection
  ("Advanced Power Loss Protection") for best performance.
- PGs are ephemeral. This means that they aren't stored on data disks and only exist in memory
  while OSDs are running.
- The recovery process is per-object (per-block), not per-PG. Also there are no PGLOGs.
- Monitors don't store data. Cluster configuration and state is stored in etcd in simple human-readable
  JSON structures. Monitors only watch cluster state and handle data movement.
  Thus Vitastor's Monitor isn't a critical component of the system and is more similar to Ceph's Manager.
  Vitastor's Monitor is implemented in node.js.
- PG distribution isn't based on consistent hashes. All PG mappings are stored in etcd.
  Rebalancing PGs between OSDs is done by mathematical optimization: the data distribution problem
  is reduced to a linear programming problem and solved by lp_solve (a simplified sketch of this
  formulation is given after this list). This allows for almost
  perfect (96-99% uniformity compared to Ceph's 80-90%) data distribution in most cases, the ability
  to map PGs by hand without breaking rebalancing logic, reduced OSD peer-to-peer communication
  (on average, OSDs have fewer peers) and less data movement. It also probably has a drawback:
  this method may fail in very large clusters, but up to several hundred OSDs it's perfectly fine.
  It's also easy to add consistent hashes in the future if something proves their necessity.
- There's no separate CRUSH layer. You select the pool redundancy scheme, placement root, failure domain
  and so on directly in the pool configuration.
- Images are global, i.e. you can't create multiple images with the same name in different pools.
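To give a feel for the optimization mentioned above, here is a deliberately simplified sketch of one possible LP formulation (an editor's illustration only; the actual model used by the monitor is more elaborate). Let $x_c \ge 0$ be the number of PGs assigned to a randomly generated candidate OSD combination $c$, and $w_o$ the weight (size) of OSD $o$:

$$\max \sum_c x_c \quad \text{s.t.} \quad \sum_{c \,\ni\, o} x_c \;\le\; \frac{w_o}{\sum_{o'} w_{o'}} \cdot \text{pg\_count} \cdot \text{pg\_size} \;\;\forall o, \qquad \sum_c x_c \le \text{pg\_count}.$$

Maximizing the number of placed PGs under per-OSD caps proportional to drive sizes is what pushes the resulting distribution towards uniformity.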
## Implementation Principles

- I like architecturally simple solutions. Vitastor is and will always be designed
  exactly like that.
- I also like reinventing the wheel to some extent, like writing my own HTTP client
  for etcd interaction instead of using prebuilt libraries, because in this case
  I'm confident about what my code does and what it doesn't do.
- I don't care about C++ "best practices" like RAII or proper inheritance or usage of
  smart pointers or whatever, and I don't intend to change my mind, so if you're here
  looking for ideal reference C++ code, this probably isn't the right place.
- I like node.js better than any other dynamically-typed language interpreter
  because it's faster than any other interpreter in the world, has a neutral C-like
  syntax and a built-in event loop. That's why the Monitor is implemented in node.js.
docs/hugo/content/introduction/author.md (new file, 34 lines)
@@ -0,0 +1,34 @@
---
title: Author and License
weight: 3
---

Copyright (c) Vitaliy Filippov (vitalif [at] yourcmc.ru), 2019+

Join the Vitastor Telegram chat: https://t.me/vitastor

All server-side code (OSD, Monitor and so on) is licensed under the terms of the
Vitastor Network Public License 1.1 (VNPL 1.1), a copyleft license based on
GNU GPLv3.0 with the additional "Network Interaction" clause which requires
opensourcing all programs directly or indirectly interacting with Vitastor
through a computer network and expressly designed to be used in conjunction
with it ("Proxy Programs"). Proxy Programs may be made public not only under
the terms of the same license, but also under the terms of any GPL-Compatible
Free Software License, as listed by the Free Software Foundation.
This is a stricter copyleft license than the Affero GPL.

Please note that VNPL doesn't require you to open the code of proprietary
software running inside a VM if it's not specially designed to be used with
Vitastor.

Basically, you can't use the software in a proprietary environment to provide
its functionality to users without opensourcing all intermediary components
standing between the user and Vitastor or purchasing a commercial license
from the author 😀.

Client libraries (cluster_client and so on) are dual-licensed under the same
VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
software like QEMU and fio.

You can find the full text of VNPL-1.1 in the file [VNPL-1.1.txt](VNPL-1.1.txt).
GPL 2.0 is also included in this repository as [GPL-2.0.txt](GPL-2.0.txt).
docs/hugo/content/introduction/features.md (new file, 60 lines)
@@ -0,0 +1,60 @@
---
title: Features
weight: 1
---

Vitastor is currently a pre-release and it still lacks some important features.
However, the following is implemented:

- Basic part: highly-available block storage with symmetric clustering and no SPOF
- Performance ;-D
- Multiple redundancy schemes: Replication, XOR n+1, Reed-Solomon erasure codes
  based on the jerasure library with any number of data and parity drives in a group
- Configuration via simple JSON data structures in etcd (parameters, pools and images)
- Automatic data distribution over OSDs, with support for:
  - Mathematical optimization for better uniformity and less data movement
  - Multiple pools
  - Placement tree, OSD selection by tags (device classes) and placement root
  - Configurable failure domains
- Recovery of degraded blocks
- Rebalancing (data movement between OSDs)
- Lazy fsync support
- Per-OSD and per-image I/O and space usage statistics in etcd
- Snapshots and copy-on-write image clones
- Write throttling to smooth random write workloads in SSD+HDD configurations
- RDMA/RoCEv2 support via libibverbs

CLI (vitastor-cli):
- Pool listing and space stats (df)
- Image listing, space and I/O stats (ls)
- Image and snapshot creation (create, modify)
- Image removal and snapshot merge (rm, flatten, merge, rm-data)

Plugins and packaging:
- Debian and CentOS packages
- Generic user-space client library
- Native QEMU driver
- Loadable fio engine for benchmarks
- NBD proxy for kernel mounts
- CSI plugin for Kubernetes
- OpenStack support: Cinder driver, Nova and libvirt patches
- Proxmox storage plugin and packages

## Roadmap

The following features are planned for the future:

- Better OSD creation and auto-start tools
- Other administrative tools
- Web GUI
- OpenNebula plugin
- iSCSI proxy
- Simplified NFS proxy
- Multi-threaded client
- Faster failover
- Scrubbing without checksums (verification of replicas)
- Checksums
- Tiered storage (SSD caching)
- NVDIMM support
- Compression (possibly)
- Read caching using system page cache (possibly)
docs/hugo/content/performance/comparison1.md (new file, 93 lines)
@@ -0,0 +1,93 @@
---
title: Example Comparison with Ceph
weight: 4
---

Hardware configuration: 4 nodes, each with:
- 6x SATA SSD Intel D3-S4510 3.84 TB
- 2x Xeon Gold 6242 (16 cores @ 2.8 GHz)
- 384 GB RAM
- 1x 25 GbE network interface (Mellanox ConnectX-4 LX), connected to a Juniper QFX5200 switch

CPU powersaving was disabled. Both Vitastor and Ceph were configured with 2 OSDs per 1 SSD.

All of the results below apply to 4 KB blocks and random access (unless indicated otherwise).

T8Q64 tests were conducted over 8 400GB RBD images from all hosts (every host was running 2 instances of fio).
This is because Ceph has performance penalties related to running multiple clients over a single RBD image.

cephx_sign_messages was set to false during tests; RocksDB and Bluestore settings were left at defaults.

The T8Q64 read test was conducted over 1 larger inode (3.2T) from all hosts (every host was running 2 instances of fio).
Vitastor has no performance penalties related to running multiple clients over a single inode.
If conducted from one node with all primary OSDs moved to other nodes, the result was slightly lower (689000 iops);
this is because all operations resulted in network roundtrips between the client and the primary OSD.
When fio was colocated with OSDs (like in the Ceph benchmarks above), 1/4 of the read workload actually
used the loopback network.

Vitastor was configured with: `--disable_data_fsync true --immediate_commit all --flusher_count 8
--disk_alignment 4096 --journal_block_size 4096 --meta_block_size 4096
--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024
--journal_size 16777216`.

## Raw drive performance

- T1Q1 write ~27000 iops (~0.037ms latency)
- T1Q1 read ~9800 iops (~0.101ms latency)
- T1Q32 write ~60000 iops
- T1Q32 read ~81700 iops

## 2 replicas

### Ceph 15.2.4 (Bluestore)

- T1Q1 write ~1000 iops (~1ms latency)
- T1Q1 read ~1750 iops (~0.57ms latency)
- T8Q64 write ~100000 iops, total CPU usage by OSDs about 40 virtual cores on each node
- T8Q64 read ~480000 iops, total CPU usage by OSDs about 40 virtual cores on each node

In fact, not that bad for Ceph. These servers are an example of well-balanced Ceph nodes.
However, CPU usage and I/O latency were through the roof, as usual.

### Vitastor 0.4.0 (native)

- T1Q1 write: 7087 iops (0.14ms latency)
- T1Q1 read: 6838 iops (0.145ms latency)
- T2Q64 write: 162000 iops, total CPU usage by OSDs about 3 virtual cores on each node
- T8Q64 read: 895000 iops, total CPU usage by OSDs about 4 virtual cores on each node
- Linear write (4M T1Q32): 2800 MB/s
- Linear read (4M T1Q32): 1500 MB/s

### Vitastor 0.4.0 (NBD)

NBD is currently required to mount Vitastor via the kernel, but it imposes additional overhead
due to additional copying between the kernel and userspace. This mostly hurts linear
bandwidth, not iops.

Vitastor with single-threaded NBD on the same hardware:
- T1Q1 write: 6000 iops (0.166ms latency)
- T1Q1 read: 5518 iops (0.18ms latency)
- T1Q128 write: 94400 iops
- T1Q128 read: 103000 iops
- Linear write (4M T1Q128): 1266 MB/s (compared to 2800 MB/s via fio)
- Linear read (4M T1Q128): 975 MB/s (compared to 1500 MB/s via fio)

## EC/XOR 2+1

### Ceph 15.2.4

- T1Q1 write: 730 iops (~1.37ms latency)
- T1Q1 read: 1500 iops with cold cache (~0.66ms latency), 2300 iops after 2 minute metadata cache warmup (~0.435ms latency)
- T4Q128 write (4 RBD images): 45300 iops, total CPU usage by OSDs about 30 virtual cores on each node
- T8Q64 read (4 RBD images): 278600 iops, total CPU usage by OSDs about 40 virtual cores on each node
- Linear write (4M T1Q32): 1950 MB/s before preallocation, 2500 MB/s after preallocation
- Linear read (4M T1Q32): 2400 MB/s

### Vitastor 0.4.0

- T1Q1 write: 2808 iops (~0.355ms latency)
- T1Q1 read: 6190 iops (~0.16ms latency)
- T2Q64 write: 85500 iops, total CPU usage by OSDs about 3.4 virtual cores on each node
- T8Q64 read: 812000 iops, total CPU usage by OSDs about 4.7 virtual cores on each node
- Linear write (4M T1Q32): 3200 MB/s
- Linear read (4M T1Q32): 1800 MB/s
docs/hugo/content/performance/theoretical.md (new file, 46 lines)
@@ -0,0 +1,46 @@
|
||||
---
|
||||
title: Vitastor's Theoretical Maximum Performance
|
||||
weight: 3
|
||||
---
|
||||
|
||||
Replicated setups:
|
||||
- Single-threaded (T1Q1) read latency: 1 network roundtrip + 1 disk read.
|
||||
- Single-threaded write+fsync latency:
|
||||
- With immediate commit: 2 network roundtrips + 1 disk write.
|
||||
- With lazy commit: 4 network roundtrips + 1 disk write + 1 disk flush.
|
||||
- Saturated parallel read iops: min(network bandwidth, sum(disk read iops)).
|
||||
- Saturated parallel write iops: min(network bandwidth, sum(disk write iops / number of replicas / write amplification)).
|
||||
|
||||
EC/XOR setups:
|
||||
- Single-threaded (T1Q1) read latency: 1.5 network roundtrips + 1 disk read.
|
||||
- Single-threaded write+fsync latency:
|
||||
- With immediate commit: 3.5 network roundtrips + 1 disk read + 2 disk writes.
|
||||
- With lazy commit: 5.5 network roundtrips + 1 disk read + 2 disk writes + 2 disk fsyncs.
|
||||
- 0.5 in actually (k-1)/k which means that an additional roundtrip doesn't happen when
|
||||
the read sub-operation can be served locally.
|
||||
- Saturated parallel read iops: min(network bandwidth, sum(disk read iops)).
|
||||
- Saturated parallel write iops: min(network bandwidth, sum(disk write iops * number of data drives / (number of data + parity drives) / write amplification)).
|
||||
In fact, you should put disk write iops under the condition of ~10% reads / ~90% writes in this formula.
|
||||
|
||||
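To make the saturated-write formulas concrete, here is a worked example with purely hypothetical numbers (not taken from the benchmark pages): with 12 drives each delivering 60000 write iops under a ~90% write mix, write amplification 4 and a network able to carry 500000 iops, a 2-replica pool saturates at min(500000, 12 * 60000 / 2 / 4) = 90000 parallel write iops, while an EC 2+1 pool saturates at min(500000, 12 * 60000 * 2 / (2+1) / 4) = 120000.
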
Write amplification for 4 KB blocks is usually 3-5 in Vitastor:
1. Journal block write
2. Journal data write
3. Metadata block write
4. Another journal block write for EC/XOR setups
5. Data block write

If you manage to get an SSD which handles 512 byte blocks well (Optane?) you may
lower 1, 3 and 4 to 512 bytes (1/8 of data size) and get WA as low as 2.375.

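As a quick check of that 2.375 figure: writes 1, 3 and 4 shrink to 3 * 512/4096 = 0.375 of a 4 KB block, while writes 2 and 5 stay full-size, giving 0.375 + 2 = 2.375.
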
Lazy fsync also reduces WA for parallel workloads because journal blocks are only
written when they fill up or fsync is requested.

## In Practice

In practice, using the tests from [Understanding Performance]({{< ref "performance/understanding" >}})
and good server-grade SSD/NVMe drives, you should aim for:
- At least 5000 T1Q1 replicated read and write iops (maximum 0.2ms latency)
- At least ~80k parallel read iops or ~30k write iops per 1 core (1 OSD)
- Disk-speed or wire-speed linear reads and writes, whichever is the bottleneck in your case

If your results are lower, that may mean you have bad drives, a bad network or some kind of misconfiguration.

6
docs/hugo/content/performance/tuning.md
Normal file
@@ -0,0 +1,6 @@
---
title: Tuning
weight: 2
---

- Disable CPU powersaving

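A minimal sketch of what disabling CPU powersaving usually means on a Linux host (the `cpupower` utility and the kernel parameters below are assumptions about your environment, not Vitastor requirements):

```
# Pin all cores to the "performance" cpufreq governor:
cpupower frequency-set -g performance

# Optionally also limit deep C-states via the kernel command line
# (add to GRUB_CMDLINE_LINUX and reboot), e.g.:
#   intel_idle.max_cstate=0 processor.max_cstate=1
```
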
52
docs/hugo/content/performance/understanding.md
Normal file
@@ -0,0 +1,52 @@
---
title: Understanding Storage Performance
weight: 1
---

The most important thing for fast storage is latency, not parallel iops.

The best possible latency is achieved with one thread and a queue depth of 1, which basically means
"client load as low as possible". In this case IOPS = 1/latency (for example, 0.1ms latency
corresponds to 10000 single-threaded iops), and this number doesn't scale with the number of
servers, drives, server processes, threads and so on.
Single-threaded IOPS and latency numbers only depend on *how fast a single daemon is*.

Why is it important? It's important because some applications *can't* use
a queue depth greater than 1 because their task isn't parallelizable. A notable example
is any ACID DBMS, because all of them write their WALs sequentially with fsync()s.

fsync, by the way, is another important thing often missing in benchmarks. The point is
that drives have cache buffers and don't guarantee that your data is actually persisted
until you call fsync(), which is translated to a FLUSH CACHE command by the OS.

Desktop SSDs are very fast without fsync - NVMes, for example, can process ~80000 write
operations per second with a queue depth of 1 without fsync - but they're really slow with
fsync because they have to actually write data to flash chips when you call fsync. The typical
number is around 1000-2000 iops with fsync.

Server SSDs often have supercapacitors that act as a built-in UPS and allow the drive
to flush its DRAM cache to the persistent flash storage when a power loss occurs.
This makes them perform equally well with and without fsync. This feature is called
"Advanced Power Loss Protection" by Intel; other vendors use similar terms or simply call it
"Full Capacitor-Based Power Loss Protection".

All software-defined storage systems that I currently know of are slow in terms of latency.
Notable examples are Ceph and the internal SDSes used by cloud providers like Amazon, Google,
Yandex and so on. They're all slow and can only reach ~0.3ms read and ~0.6ms 4 KB write latency
with best-in-slot hardware.

And that's in the SSD era, when you can buy an SSD with ~0.04ms latency for $100.

I use the following 6 commands with small variations to benchmark any storage:

- Linear write:
  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4M -iodepth=32 -rw=write -runtime=60 -filename=/dev/sdX`
- Linear read:
  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4M -iodepth=32 -rw=read -runtime=60 -filename=/dev/sdX`
- Random write latency (T1Q1, this hurts storages the most):
  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=1 -fsync=1 -rw=randwrite -runtime=60 -filename=/dev/sdX`
- Random read latency (T1Q1):
  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=1 -rw=randread -runtime=60 -filename=/dev/sdX`
- Parallel write iops (use numjobs if a single CPU core is insufficient to saturate the load):
  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=128 [-numjobs=4 -group_reporting] -rw=randwrite -runtime=60 -filename=/dev/sdX`
- Parallel read iops (use numjobs if a single CPU core is insufficient to saturate the load):
  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=128 [-numjobs=4 -group_reporting] -rw=randread -runtime=60 -filename=/dev/sdX`

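The same commands can also be pointed at a Vitastor volume once it is mapped as a kernel block device (see the NBD usage page). A minimal sketch, assuming an image named `testimg` already exists and the kernel assigns `/dev/nbd0`:

```
# Map the image; vitastor-nbd prints the actual device name.
vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
# Then run, for example, the T1Q1 random write latency test against it:
fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=1 -fsync=1 -rw=randwrite -runtime=60 -filename=/dev/nbd0
```
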
183
docs/hugo/content/usage/cli.md
Normal file
@@ -0,0 +1,183 @@
---
title: Vitastor CLI
weight: 1
---

vitastor-cli is a command-line tool for administrative tasks like image management.

It supports the following commands:

{{< toc >}}

Global options:

```
--etcd_address ADDR  Etcd connection address
--iodepth N          Send N operations in parallel to each OSD when possible (default 32)
--parallel_osds M    Work with M osds in parallel when possible (default 4)
--progress 1|0       Report progress (default 1)
--cas 1|0            Use online CAS writes when possible (default auto)
--no-color           Disable colored output
--json               JSON output
```

## status

`vitastor-cli status`

Show cluster status.

Example output:

```
  cluster:
    etcd: 1 / 1 up, 1.8 M database size
    mon: 1 up, master stump
    osd: 8 / 12 up

  data:
    raw: 498.5 G used, 301.2 G / 799.7 G available, 399.8 G down
    state: 156.6 G clean, 97.6 G misplaced
    pools: 2 / 3 active
    pgs: 30 active
         34 active+has_misplaced
         32 offline

  io:
    client: 0 B/s rd, 0 op/s rd, 0 B/s wr, 0 op/s wr
    rebalance: 989.8 M/s, 7.9 K op/s
```

## df

`vitastor-cli df`

Show pool space statistics.

Example output:

```
NAME      SCHEME  PGS  TOTAL    USED    AVAILABLE  USED%   EFFICIENCY
testpool  2/1     32   100 G    34.2 G  60.7 G     39.23%  100%
size1     1/1     32   199.9 G  10 G    121.5 G    39.23%  100%
kaveri    2/1     32   0 B      10 G    0 B        100%    0%
```

In the example above, the "kaveri" pool has "zero" efficiency because all its OSDs are down.

## ls

`vitastor-cli ls [-l] [-p POOL] [--sort FIELD] [-r] [-n N] [<glob> ...]`

List images (only those matching the `<glob>` pattern(s), if passed).

Options:

```
-p|--pool POOL  Filter images by pool ID or name
-l|--long       Also report allocated size and I/O statistics
--del           Also include delete operation statistics
--sort FIELD    Sort by the specified field (name, size, used_size, <read|write|delete>_<iops|bps|lat|queue>)
-r|--reverse    Sort in descending order
-n|--count N    Only list the first N items
```

Example output:

```
NAME                 POOL      SIZE  USED    READ   IOPS  QUEUE  LAT   WRITE  IOPS  QUEUE  LAT   FLAGS  PARENT
debian9              testpool  20 G  12.3 G  0 B/s  0     0      0 us  0 B/s  0     0      0 us  RO
pve/vm-100-disk-0    testpool  20 G  0 B     0 B/s  0     0      0 us  0 B/s  0     0      0 us  -      debian9
pve/base-101-disk-0  testpool  20 G  0 B     0 B/s  0     0      0 us  0 B/s  0     0      0 us  RO     debian9
pve/vm-102-disk-0    testpool  32 G  36.4 M  0 B/s  0     0      0 us  0 B/s  0     0      0 us  -      pve/base-101-disk-0
debian9-test         testpool  20 G  36.6 M  0 B/s  0     0      0 us  0 B/s  0     0      0 us  -      debian9
bench                testpool  10 G  10 G    0 B/s  0     0      0 us  0 B/s  0     0      0 us  -
bench-kaveri         kaveri    10 G  10 G    0 B/s  0     0      0 us  0 B/s  0     0      0 us  -
```

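For instance, to list the five largest images of a pool together with their I/O statistics, the options above can be combined like this (the pool name is an example): `vitastor-cli ls -l -p testpool --sort used_size -r -n 5`.
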
## create

`vitastor-cli create -s|--size <size> [-p|--pool <id|name>] [--parent <parent_name>[@<snapshot>]] <name>`

Create an image. You may use K/M/G/T suffixes for `<size>`. If `--parent` is specified,
a copy-on-write image clone is created. Parent must be a snapshot (readonly image).
Pool must be specified if there is more than one pool.

```
vitastor-cli create --snapshot <snapshot> [-p|--pool <id|name>] <image>
vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
```

Create a snapshot of image `<name>` (either form can be used). May be used live if only a single writer is active.

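A typical sequence combining the forms documented above (all names are examples):

```
# Create a 10 GB image, snapshot it, then clone the snapshot:
vitastor-cli create -s 10G -p testpool testimg
vitastor-cli snap-create -p testpool testimg@snap1
vitastor-cli create -s 10G -p testpool --parent testimg@snap1 testimg-clone
```
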
## modify

`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`

Rename or resize an image, or change its readonly status. Images with children can't be made read-write.
If the new size is smaller than the old size, the extra data will be purged.
You should resize the file system in the image, if present, before shrinking it.

```
-f|--force  Proceed with shrinking or setting the readwrite flag even if the image has children.
```

## rm

`vitastor-cli rm <from> [<to>] [--writers-stopped]`

Remove `<from>` or all layers between `<from>` and `<to>` (`<to>` must be a child of `<from>`),
rebasing all their children accordingly. --writers-stopped allows the merge to be a bit
more efficient in the case of a single 'slim' read-write child and a 'fat' removed parent:
the child is then merged into the parent and the parent is renamed to the child.
In all other cases parent layers are merged into children.

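For example, the snapshot created in the previous section could later be deleted with `vitastor-cli rm testimg@snap1` (a hypothetical invocation); its clone would then be rebased onto the remaining layers automatically.
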
## flatten

`vitastor-cli flatten <layer>`

Flatten a layer, i.e. merge its data and detach it from its parents.

## rm-data

`vitastor-cli rm-data --pool <pool> --inode <inode> [--wait-list] [--min-offset <offset>]`

Remove inode data without changing metadata.

```
--wait-list   Retrieve full object listings before starting to remove objects.
              Requires more memory, but makes it possible to show correct removal progress.
--min-offset  Purge only data starting with the specified offset.
```

## merge-data

`vitastor-cli merge-data <from> <to> [--target <target>]`

Merge layer data without changing metadata. Merges `<from>`..`<to>` into `<target>`.
`<to>` must be a child of `<from>` and `<target>` may be one of the layers between
`<from>` and `<to>`, including `<from>` and `<to>` themselves.

## alloc-osd

`vitastor-cli alloc-osd`

Allocate a new OSD number and reserve it by creating an empty `/osd/stats/<n>` key.

## simple-offsets

`vitastor-cli simple-offsets <device>`

Calculate offsets for a simple & stupid (no superblock) OSD deployment.

Options:

```
--object_size 128k       Set blockstore block size
--bitmap_granularity 4k  Set bitmap granularity
--journal_size 16M       Set journal size
--device_block_size 4k   Set device block size
--journal_offset 0       Set journal offset
--device_size 0          Set device size
--format text            Result format: json, options, env, or text
```

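As an illustration, the `make-osd.sh` script in this repository consumes this command's output in exactly this way (`$DEV` is the OSD device path):

```
# Turn the calculated offsets into ready-made OSD command-line options:
OPT=$(vitastor-cli simple-offsets --format options $DEV | tr '\n' ' ')
# Or extract a single field from the JSON output:
META=$(vitastor-cli simple-offsets --format json $DEV | jq .data_offset)
```
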
20
docs/hugo/content/usage/nbd.md
Normal file
@@ -0,0 +1,20 @@
---
title: NBD
weight: 6
---

To create a local block device for a Vitastor image, use NBD. For example:

```
vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
```

It will output the device name, like /dev/nbd0, which you can then format and mount as a normal block device.

You can also use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.

To unmap the device, run:

```
vitastor-nbd unmap /dev/nbd0
```

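A minimal end-to-end sketch (the device name and mount point are examples; use whatever `vitastor-nbd map` actually prints):

```
# Format the mapped device and mount it like any other disk:
mkfs.ext4 /dev/nbd0
mount /dev/nbd0 /mnt
```
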
39
docs/hugo/content/usage/qemu.md
Normal file
@@ -0,0 +1,39 @@
---
title: QEMU and qemu-img
weight: 2
---

You need a patched QEMU version to use the Vitastor driver.

To start a VM with a Vitastor disk using the plain QEMU command line, use the following commands:

Old syntax (-drive):

```
qemu-system-x86_64 -enable-kvm -m 1024 \
    -drive 'file=vitastor:etcd_host=192.168.7.2\:2379/v3:image=debian9',
        format=raw,if=none,id=drive-virtio-disk0,cache=none \
    -device 'virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,
        id=virtio-disk0,bootindex=1,write-cache=off' \
    -vnc 0.0.0.0:0
```

New syntax (-blockdev):

```
qemu-system-x86_64 -enable-kvm -m 1024 \
    -blockdev '{"node-name":"drive-virtio-disk0","driver":"vitastor","image":"debian9",
        "cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
    -device 'virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,
        id=virtio-disk0,bootindex=1,write-cache=off' \
    -vnc 0.0.0.0:0
```

For qemu-img, you should use `vitastor:etcd_host=<HOST>:image=<IMAGE>` as the filename. For example:

```
qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=debian10'
```

You can also specify `:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>`
if you don't want to use inode metadata.

37
docs/hugo/i18n/ru.yaml
Normal file
@@ -0,0 +1,37 @@
---
nav_navigation: Навигация
nav_tags: Теги
nav_more: Подробнее
nav_top: К началу

form_placeholder_search: Поиск

error_page_title: Открыта несуществующая страница
error_message_title: Потерялись?
error_message_code: Ошибка 404
error_message_text: >
  Похоже, страница, которую вы открыли, не существует. Попробуйте найти
  нужную информацию с <a class="gdoc-error__link" href="{{ . }}">главной страницы</a>.

button_toggle_dark: Переключить тёмный/светлый/авто режим
button_nav_open: Показать навигацию
button_nav_close: Скрыть навигацию
button_menu_open: Открыть меню
button_menu_close: Закрыть меню
button_homepage: На главную

title_anchor_prefix: "Ссылка на:"

posts_read_more: Читать подробнее
posts_read_time:
  one: "Одна минута на чтение"
  other: "{{ . }} минут(ы) на чтение"
posts_update_prefix: Обновлено

footer_build_with: >
  Сделано на <a href="https://gohugo.io/" class="gdoc-footer__link">Hugo</a> с
  <svg class="icon gdoc_heart"><use xlink:href="#gdoc_heart"></use></svg>
footer_legal_notice: Правовая информация
footer_privacy_policy: Приватность

language_switch_no_tranlation_prefix: "Страница не переведена:"

34
docs/hugo/layouts/partials/site-footer.html
Normal file
@@ -0,0 +1,34 @@
<footer class="gdoc-footer">
  <div class="container flex">
    <div class="flex flex-wrap" style="flex: 1">
      <span class="gdoc-footer__item gdoc-footer__item--row">
        © Vitaliy Filippov, 2021+
      </span>
    </div>
    <div class="flex flex-wrap">
      {{ with .Site.Params.GeekdocLegalNotice }}
        <span class="gdoc-footer__item gdoc-footer__item--row">
          <a href="{{ . | relURL }}" class="gdoc-footer__link">{{ i18n "footer_legal_notice" }}</a>
        </span>
      {{ end }}
      {{ with .Site.Params.GeekdocPrivacyPolicy }}
        <span class="gdoc-footer__item gdoc-footer__item--row">
          <a href="{{ . | relURL }}" class="gdoc-footer__link">{{ i18n "footer_privacy_policy" }}</a>
        </span>
      {{ end }}
    </div>
    {{ if (default true .Site.Params.GeekdocBackToTop) }}
      <div class="flex flex-25 justify-end">
        <span class="gdoc-footer__item gdoc-footer__item--row" style="margin-right: 50px">
          {{ i18n "footer_build_with" | safeHTML }}
        </span>
        <span class="gdoc-footer__item">
          <a class="gdoc-footer__link fake-link" href="#" aria-label="{{ i18n "nav_top" }}">
            <svg class="icon gdoc_keyboard_arrow_up"><use xlink:href="#gdoc_keyboard_arrow_up"></use></svg>
            <span class="hidden-mobile">{{ i18n "nav_top" }}</span>
          </a>
        </span>
      </div>
    {{ end }}
  </div>
</footer>

215
docs/hugo/static/brand.svg
Normal file
@@ -0,0 +1,215 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg
xmlns:osb="http://www.openswatchbook.org/uri/2009/osb"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
sodipodi:docname="logo_only2.svg"
inkscape:version="1.0.2 (e86c870879, 2021-01-15)"
id="svg1340"
version="1.1"
viewBox="0 0 100 86.80192"
height="86.801918mm"
width="100mm"
inkscape:export-filename="/var/home/vitali/SVN/vitastor/presentation/logos/logo_only.png"
inkscape:export-xdpi="92.889999"
inkscape:export-ydpi="92.889999">
<defs
id="defs1334">
<linearGradient
osb:paint="gradient"
id="linearGradient866">
<stop
id="stop862"
offset="0"
style="stop-color:#c0c0c0;stop-opacity:1" />
<stop
id="stop864"
offset="1"
style="stop-color:#000000;stop-opacity:0" />
</linearGradient>
<linearGradient
id="linearGradient846"
osb:paint="gradient">
<stop
style="stop-color:#ffd42a;stop-opacity:1"
offset="0"
id="stop842" />
<stop
style="stop-color:#ffa200;stop-opacity:1"
offset="1"
id="stop844" />
</linearGradient>
<radialGradient
r="50"
fy="159.11139"
fx="202.36813"
cy="159.11139"
cx="202.36813"
gradientTransform="matrix(1.2462942,-1.2279529,0.77712408,0.78873143,-190.96813,230.1331)"
gradientUnits="userSpaceOnUse"
id="radialGradient1530"
xlink:href="#linearGradient1352"
inkscape:collect="always" />
<linearGradient
inkscape:collect="always"
id="linearGradient1352">
<stop
style="stop-color:#00c9e6;stop-opacity:1"
offset="0"
id="stop1348" />
<stop
style="stop-color:#5240d3;stop-opacity:1"
offset="1"
id="stop1350" />
</linearGradient>
<linearGradient
y2="62.555599"
x2="51.484566"
y1="62.555599"
x1="38.105473"
gradientTransform="rotate(-16.930773,271.11609,-412.42594)"
gradientUnits="userSpaceOnUse"
id="linearGradient1508"
xlink:href="#linearGradient1323"
inkscape:collect="always" />
<linearGradient
inkscape:collect="always"
id="linearGradient1323">
<stop
style="stop-color:#000000;stop-opacity:0.47178105"
offset="0"
id="stop1319" />
<stop
style="stop-color:#eeaaff;stop-opacity:0;"
offset="1"
id="stop1321" />
</linearGradient>
<radialGradient
r="21.541935"
fy="24.614815"
fx="45.312912"
cy="24.614815"
cx="45.312912"
gradientTransform="matrix(1.0933447,0.13113705,-0.12664108,1.0558599,-1.082187,93.974708)"
gradientUnits="userSpaceOnUse"
id="radialGradient1504"
xlink:href="#linearGradient846"
inkscape:collect="always" />
<filter
style="color-interpolation-filters:sRGB"
inkscape:label="Drop Shadow"
id="filter1497"
width="2"
height="2"
x="-0.5"
y="-0.5">
<feFlood
flood-opacity="0.498039"
flood-color="rgb(0,0,0)"
result="flood"
id="feFlood1487" />
<feComposite
in="flood"
in2="SourceGraphic"
operator="in"
result="composite1"
id="feComposite1489" />
<feGaussianBlur
in="composite1"
stdDeviation="6"
result="blur"
id="feGaussianBlur1491" />
<feOffset
dx="0"
dy="6"
result="offset"
id="feOffset1493" />
<feComposite
in="offset"
in2="offset"
operator="atop"
result="composite2"
id="feComposite1495" />
</filter>
<radialGradient
r="21.541935"
fy="24.614815"
fx="45.312912"
cy="24.614815"
cx="45.312912"
gradientTransform="matrix(1.0933447,0.13113705,-0.12664108,1.0558599,-1.082187,93.974708)"
gradientUnits="userSpaceOnUse"
id="radialGradient1506"
xlink:href="#linearGradient846"
inkscape:collect="always" />
</defs>
<sodipodi:namedview
inkscape:window-maximized="1"
inkscape:window-y="0"
inkscape:window-x="0"
inkscape:window-height="992"
inkscape:window-width="1920"
fit-margin-bottom="0"
fit-margin-right="0"
fit-margin-left="0"
fit-margin-top="-30"
showgrid="false"
inkscape:document-rotation="0"
inkscape:current-layer="layer1"
inkscape:document-units="mm"
inkscape:cy="47.914558"
inkscape:cx="-103.69646"
inkscape:zoom="0.7"
inkscape:pageshadow="2"
inkscape:pageopacity="1"
borderopacity="1.0"
bordercolor="#666666"
pagecolor="#000000"
id="base" />
<metadata
id="metadata1337">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title></dc:title>
</cc:Work>
</rdf:RDF>
</metadata>
<g
transform="translate(-133.26969,-52.101187)"
id="layer1"
inkscape:groupmode="layer"
inkscape:label="Слой 1">
<path
style="fill:url(#radialGradient1530);fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="m 133.26969,59.089473 50,75.000087 50,-75.000087 z"
id="path1528"
sodipodi:nodetypes="cccc" />
<path
d="m 194.29572,89.403603 -8.41706,2.562119 -2.50682,7.49308 7.17785,23.579008 9.60097,-14.40173 z"
style="fill:url(#linearGradient1508);fill-opacity:1;stroke-width:0;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.501961"
id="path1459" />
<g
transform="translate(135.70225,-49.385894)"
id="g1465">
<path
id="path1461"
style="fill:url(#radialGradient1504);fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;filter:url(#filter1497)"
d="m 28.817436,101.36529 c 3.112699,10.74423 6.225077,21.48892 9.333984,32.23438 2.519532,0 5.039063,0 7.558594,0 -0.985406,8.09729 -2.085815,16.18202 -2.951172,24.29297 -0.06053,0.88723 1.098131,1.61652 1.76,0.9155 1.007514,-1.05482 1.676008,-2.3829 2.528566,-3.56053 7.51538,-11.37722 14.987447,-22.78299 22.482919,-34.17333 -3.239584,0 -6.479167,0 -9.71875,0 2.887267,-6.79562 5.775365,-13.59088 8.662109,-20.38672 -13.284505,0 -26.56901,0 -39.853516,0 0.06576,0.22591 0.131511,0.45182 0.197266,0.67773 z" />
<path
sodipodi:nodetypes="cccccccc"
id="path1463"
d="m 30.735882,102.2764 h 35.342242 l -8.662729,20.3854 h 9.173783 l -22.106472,33.62346 3.027029,-24.27377 H 39.34604 Z"
style="fill:url(#radialGradient1506);fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
</g>
</g>
</svg>

138
docs/hugo/static/custom.css
Normal file
@@ -0,0 +1,138 @@
/* Global customization */

:root {
  --code-max-height: 60rem;
}

/* Light mode theming */
:root,
:root[color-mode="light"] {
  --header-background: #404050;
  --header-font-color: #ffffff;

  --body-background: #ffffff;
  --body-font-color: #343a40;

  --button-background: #62cb97;
  --button-border-color: #4ec58a;

  --link-color: #c54e8a;
  --link-color-visited: #c54e8a;

  --code-background: #f5f6f8;
  --code-accent-color: #e3e7eb;
  --code-accent-color-lite: #eff1f3;

  --accent-color: #e9ecef;
  --accent-color-lite: #f8f9fa;

  --control-icons: #b2bac1;

  --footer-background: #606070;
  --footer-font-color: #ffffff;
  --footer-link-color: #ffcc5c;
  --footer-link-color-visited: #ffcc5c;
}
@media (prefers-color-scheme: light) {
  :root {
    --header-background: #404050;
    --header-font-color: #ffffff;

    --body-background: #ffffff;
    --body-font-color: #343a40;

    --button-background: #62cb97;
    --button-border-color: #4ec58a;

    --link-color: #c54e8a;
    --link-color-visited: #c54e8a;

    --code-background: #f5f6f8;
    --code-accent-color: #e3e7eb;
    --code-accent-color-lite: #eff1f3;

    --accent-color: #e9ecef;
    --accent-color-lite: #f8f9fa;

    --control-icons: #b2bac1;

    --footer-background: #606070;
    --footer-font-color: #ffffff;
    --footer-link-color: #ffcc5c;
    --footer-link-color-visited: #ffcc5c;
  }
}

/* Dark mode theming */
:root[color-mode="dark"] {
  --header-background: #202830;
  --header-font-color: #ffffff;

  --body-background: #343a44;
  --body-font-color: #ced3d8;

  --button-background: #62cb97;
  --button-border-color: #4ec58a;

  --link-color: #7ac29e;
  --link-color-visited: #7ac29e;

  --code-background: #2f353a;
  --code-accent-color: #262b2f;
  --code-accent-color-lite: #2b3035;

  --accent-color: #2b3035;
  --accent-color-lite: #2f353a;

  --control-icons: #b2bac1;

  --footer-background: #2f333e;
  --footer-font-color: #cccccc;
  --footer-link-color: #7ac29e;
  --footer-link-color-visited: #7ac29e;
}
@media (prefers-color-scheme: dark) {
  :root {
    --header-background: #404070;
    --header-font-color: #ffffff;

    --body-background: #343a40;
    --body-font-color: #ced3d8;

    --button-background: #62cb97;
    --button-border-color: #4ec58a;

    --link-color: #7ac29e;
    --link-color-visited: #7ac29e;

    --code-background: #2f353a;
    --code-accent-color: #262b2f;
    --code-accent-color-lite: #2b3035;

    --accent-color: #2b3035;
    --accent-color-lite: #2f353a;

    --control-icons: #b2bac1;

    --footer-background: #2f333e;
    --footer-font-color: #cccccc;
    --footer-link-color: #7ac29e;
    --footer-link-color-visited: #7ac29e;
  }
}

.gdoc-brand__img {
  width: 48px;
  height: auto;
  margin-top: -4px;
  margin-bottom: -4px;
}

.gdoc-menu-header > span {
  display: flex;
  flex-direction: row-reverse;
}

span.gdoc-language {
  margin-right: 20px;
}

BIN
docs/hugo/static/favicon/favicon-16x16.png
Normal file
BIN
docs/hugo/static/favicon/favicon-32x32.png
Normal file
196
docs/hugo/static/favicon/favicon.svg
Normal file
@@ -0,0 +1,196 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg
xmlns:osb="http://www.openswatchbook.org/uri/2009/osb"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
sodipodi:docname="favicon.svg"
inkscape:version="1.0.2 (e86c870879, 2021-01-15)"
id="svg1340"
version="1.1"
viewBox="0 0 100 100"
height="100mm"
width="100mm"
inkscape:export-filename="/var/home/vitali/SVN/vitastor/docs/static/favicon/favicon-64x64.png"
inkscape:export-xdpi="16.26"
inkscape:export-ydpi="16.26">
<defs
id="defs1334">
<linearGradient
osb:paint="gradient"
id="linearGradient866">
<stop
id="stop862"
offset="0"
style="stop-color:#c0c0c0;stop-opacity:1" />
<stop
id="stop864"
offset="1"
style="stop-color:#000000;stop-opacity:0" />
</linearGradient>
<linearGradient
id="linearGradient846"
osb:paint="gradient">
<stop
style="stop-color:#ffd42a;stop-opacity:1"
offset="0"
id="stop842" />
<stop
style="stop-color:#ffa200;stop-opacity:1"
offset="1"
id="stop844" />
</linearGradient>
<radialGradient
r="50"
fy="159.11139"
fx="202.36813"
cy="159.11139"
cx="202.36813"
gradientTransform="matrix(1.2462942,-1.2279529,0.77712408,0.78873143,-190.96813,230.1331)"
gradientUnits="userSpaceOnUse"
id="radialGradient1530"
xlink:href="#linearGradient1352"
inkscape:collect="always" />
<linearGradient
inkscape:collect="always"
id="linearGradient1352">
<stop
style="stop-color:#00c9e6;stop-opacity:1"
offset="0"
id="stop1348" />
<stop
style="stop-color:#5240d3;stop-opacity:1"
offset="1"
id="stop1350" />
</linearGradient>
<linearGradient
y2="62.555599"
x2="51.484566"
y1="62.555599"
x1="38.105473"
gradientTransform="rotate(-16.930773,271.11609,-412.42594)"
gradientUnits="userSpaceOnUse"
id="linearGradient1508"
xlink:href="#linearGradient1323"
inkscape:collect="always" />
<linearGradient
inkscape:collect="always"
id="linearGradient1323">
<stop
style="stop-color:#000000;stop-opacity:0.47178105"
offset="0"
id="stop1319" />
<stop
style="stop-color:#eeaaff;stop-opacity:0;"
offset="1"
id="stop1321" />
</linearGradient>
<filter
style="color-interpolation-filters:sRGB"
inkscape:label="Drop Shadow"
id="filter1497"
width="2"
height="2"
x="-0.5"
y="-0.5">
<feFlood
flood-opacity="0.498039"
flood-color="rgb(0,0,0)"
result="flood"
id="feFlood1487" />
<feComposite
in="flood"
in2="SourceGraphic"
operator="in"
result="composite1"
id="feComposite1489" />
<feGaussianBlur
in="composite1"
stdDeviation="6"
result="blur"
id="feGaussianBlur1491" />
<feOffset
dx="0"
dy="6"
result="offset"
id="feOffset1493" />
<feComposite
in="offset"
in2="offset"
operator="atop"
result="composite2"
id="feComposite1495" />
</filter>
<radialGradient
r="21.541935"
fy="24.614815"
fx="45.312912"
cy="24.614815"
cx="45.312912"
gradientTransform="matrix(1.6678615,0.20004527,-0.19318681,1.6106796,108.48083,22.966962)"
gradientUnits="userSpaceOnUse"
id="radialGradient1506"
xlink:href="#linearGradient846"
inkscape:collect="always" />
</defs>
<sodipodi:namedview
inkscape:window-maximized="1"
inkscape:window-y="0"
inkscape:window-x="0"
inkscape:window-height="992"
inkscape:window-width="1920"
fit-margin-bottom="0"
fit-margin-right="0"
fit-margin-left="0"
fit-margin-top="0"
showgrid="false"
inkscape:document-rotation="0"
inkscape:current-layer="layer1"
inkscape:document-units="mm"
inkscape:cy="83.752268"
inkscape:cx="-103.69645"
inkscape:zoom="0.7"
inkscape:pageshadow="2"
inkscape:pageopacity="0"
borderopacity="1.0"
bordercolor="#666666"
pagecolor="#000000"
id="base" />
<metadata
id="metadata1337">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title></dc:title>
</cc:Work>
</rdf:RDF>
</metadata>
<g
transform="translate(-133.26969,-35.630924)"
id="layer1"
inkscape:groupmode="layer"
inkscape:label="Слой 1">
<path
style="fill:url(#radialGradient1530);fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="m 133.26969,59.089473 50,75.000087 50,-75.000087 z"
id="path1528"
sodipodi:nodetypes="cccc" />
<path
d="m 194.29572,89.403603 -8.41706,2.562119 -2.50682,7.49308 7.17785,23.579008 9.60097,-14.40173 z"
style="fill:url(#linearGradient1508);fill-opacity:1;stroke-width:0;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.501961"
id="path1459" />
<path
sodipodi:nodetypes="cccccccc"
id="path1463"
d="m 157.01826,35.630924 h 53.91343 l -13.21471,31.09726 h 13.99432 l -33.7227,51.291496 4.61762,-37.02885 h -12.45344 z"
style="fill:url(#radialGradient1506);fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
</g>
</svg>

6
docs/params/head/common.en.md
Normal file
@@ -0,0 +1,6 @@
---
title: Common Parameters
weight: 1
---

These are the most common parameters which apply to all components of Vitastor.

6
docs/params/head/common.ru.md
Normal file
@@ -0,0 +1,6 @@
---
title: Общие параметры
weight: 1
---

Это наиболее общие параметры, используемые всеми компонентами Vitastor.

7
docs/params/head/layout-cluster.en.md
Normal file
@@ -0,0 +1,7 @@
---
title: Cluster-Wide Disk Layout Parameters
weight: 2
---

These parameters apply to clients and OSDs, are fixed at the moment of OSD drive
initialization and can't be changed after it without losing data.

7
docs/params/head/layout-cluster.ru.md
Normal file
@@ -0,0 +1,7 @@
---
title: Дисковые параметры уровня кластера
weight: 2
---

Данные параметры используются клиентами и OSD, задаются в момент инициализации
диска OSD и не могут быть изменены после этого без потери данных.

7
docs/params/head/layout-osd.en.md
Normal file
@@ -0,0 +1,7 @@
---
title: OSD Disk Layout Parameters
weight: 3
---

These parameters apply to OSDs, are fixed at the moment of OSD drive
initialization and can't be changed after it without losing data.

8
docs/params/head/layout-osd.ru.md
Normal file
@@ -0,0 +1,8 @@
---
title: Дисковые параметры OSD
weight: 3
---

Данные параметры используются только OSD и, также как и общекластерные
дисковые параметры, задаются в момент инициализации дисков OSD и не могут быть
изменены после этого без потери данных.

6
docs/params/head/monitor.en.md
Normal file
@@ -0,0 +1,6 @@
---
title: Monitor Parameters
weight: 6
---

These parameters only apply to Monitors.

6
docs/params/head/monitor.ru.md
Normal file
@@ -0,0 +1,6 @@
---
title: Параметры мониторов
weight: 6
---

Данные параметры используются только мониторами Vitastor.

7
docs/params/head/network.en.md
Normal file
@@ -0,0 +1,7 @@
---
title: Network Protocol Parameters
weight: 4
---

These parameters apply to clients and OSDs and affect network connection logic
between clients, OSDs and etcd.

7
docs/params/head/network.ru.md
Normal file
@@ -0,0 +1,7 @@
---
title: Параметры сетевого протокола
weight: 4
---

Данные параметры используются клиентами и OSD и влияют на логику сетевого
взаимодействия между клиентами, OSD, а также etcd.

7
docs/params/head/osd.en.md
Normal file
@@ -0,0 +1,7 @@
---
title: Runtime OSD Parameters
weight: 5
---

These parameters only apply to OSDs, are not fixed at the moment of OSD drive
initialization and can be changed with an OSD restart.

8
docs/params/head/osd.ru.md
Normal file
@@ -0,0 +1,8 @@
---
title: Изменяемые параметры OSD
weight: 5
---

Данные параметры используются только OSD, но, в отличие от дисковых параметров,
не фиксируются в момент инициализации дисков OSD и могут быть изменены в любой
момент с перезапуском OSD.

@@ -248,6 +248,8 @@
    row and slow down significantly (from 25000+ iops to ~3000 iops). When
    this option is set, Vitastor will always move to the next sector of the
    journal after writing it instead of possibly overwriting it the second time.

    Most (99%) other SSDs don't need this option.
  info_ru: |
    Включайте данную опцию для SSD вроде Intel D3-S4510 и D3-S4610, которые
    ОЧЕНЬ не любят, когда ПО перезаписывает один и тот же сектор несколько раз
@@ -256,6 +258,8 @@
    данная опция установлена, Vitastor всегда переходит к следующему сектору
    журнала после записи вместо потенциально повторной перезаписи того же
    самого сектора.

    Почти все другие SSD (99% моделей) не требуют данной опции.
- name: throttle_small_writes
  type: bool
  default: false

414
mon/make-osd-hybrid.js
Executable file
414
mon/make-osd-hybrid.js
Executable file
@@ -0,0 +1,414 @@
|
||||
#!/usr/bin/nodejs
|
||||
// systemd unit generator for hybrid (HDD+SSD) vitastor OSDs
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1
|
||||
|
||||
// USAGE: nodejs make-osd-hybrid.js [--disable_ssd_cache 0] [--disable_hdd_cache 0] /dev/sda /dev/sdb /dev/sdc /dev/sdd ...
|
||||
// I.e. - just pass all HDDs and SSDs mixed, the script will decide where
|
||||
// to put journals on its own
|
||||
|
||||
const fs = require('fs');
|
||||
const fsp = fs.promises;
|
||||
const child_process = require('child_process');
|
||||
|
||||
const options = {
|
||||
debug: 1,
|
||||
journal_size: 1024*1024*1024,
|
||||
min_meta_size: 1024*1024*1024,
|
||||
object_size: 1024*1024,
|
||||
bitmap_granularity: 4096,
|
||||
device_block_size: 4096,
|
||||
disable_ssd_cache: 1,
|
||||
disable_hdd_cache: 1,
|
||||
};
|
||||
|
||||
run().catch(console.fatal);
|
||||
|
||||
async function run()
|
||||
{
|
||||
const device_list = parse_options();
|
||||
await system_or_die("mkdir -p /var/log/vitastor; chown vitastor /var/log/vitastor");
|
||||
// Collect devices
|
||||
const all_devices = await collect_devices(device_list);
|
||||
const ssds = all_devices.filter(d => d.ssd);
|
||||
const hdds = all_devices.filter(d => !d.ssd);
|
||||
// Collect existing OSD units
|
||||
const osd_units = await collect_osd_units();
|
||||
// Count assigned HDD journals and unallocated space for each SSD
|
||||
await check_journal_count(ssds, osd_units);
|
||||
// Create new OSDs
|
||||
await create_new_hybrid_osds(hdds, ssds, osd_units);
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
function parse_options()
|
||||
{
|
||||
const devices = [];
|
||||
const opt = {};
|
||||
for (let i = 2; i < process.argv.length; i++)
|
||||
{
|
||||
const arg = process.argv[i];
|
||||
if (arg == '--help' || arg == '-h')
|
||||
{
|
||||
opt.help = true;
|
||||
break;
|
||||
}
|
||||
else if (arg.substr(0, 2) == '--')
|
||||
opt[arg.substr(2)] = process.argv[++i];
|
||||
else
|
||||
devices.push(arg);
|
||||
}
|
||||
if (opt.help || !devices.length)
|
||||
{
|
||||
console.log(
|
||||
'Prepare hybrid (HDD+SSD) Vitastor OSDs\n'+
|
||||
'(c) Vitaliy Filippov, 2019+, license: VNPL-1.1\n\n'+
|
||||
'USAGE: nodejs make-osd-hybrid.js [OPTIONS] /dev/sda /dev/sdb /dev/sdc ...\n'+
|
||||
'Just pass all your SSDs and HDDs in any order, the script will distribute OSDs for you.\n\n'+
|
||||
'OPTIONS (with defaults):\n'+
|
||||
Object.keys(options).map(k => ` --${k} ${options[k]}`).join('\n')
|
||||
);
|
||||
process.exit(0);
|
||||
}
|
||||
for (const k in opt)
|
||||
options[k] = opt[k];
|
||||
return devices;
|
||||
}
|
||||
|
||||
// Collect devices
|
||||
async function collect_devices(devices_to_check)
|
||||
{
|
||||
const devices = [];
|
||||
for (const dev of devices_to_check)
|
||||
{
|
||||
if (dev.substr(0, 5) != '/dev/')
|
||||
{
|
||||
console.log(`${dev} does not start with /dev/, skipping`);
|
||||
continue;
|
||||
}
|
||||
if (!await file_exists('/sys/block/'+dev.substr(5)))
|
||||
{
|
||||
console.log(`${dev} is a partition, skipping`);
|
||||
continue;
|
||||
}
|
||||
// Check if the device is an SSD
|
||||
const rot = '/sys/block/'+dev.substr(5)+'/queue/rotational';
|
||||
if (!await file_exists(rot))
|
||||
{
|
||||
console.log(`${dev} does not have ${rot} to check whether it's an SSD, skipping`);
|
||||
continue;
|
||||
}
|
||||
const ssd = !parseInt(await fsp.readFile(rot, { encoding: 'utf-8' }));
|
||||
// Check if the device has partition table
|
||||
let [ has_partition_table, parts ] = await system(`sfdisk --dump ${dev} --json`);
|
||||
if (has_partition_table != 0)
|
||||
{
|
||||
// Check if the device has any data
|
||||
const [ has_data, out ] = await system(`blkid ${dev}`);
|
||||
if (has_data == 0)
|
||||
{
|
||||
console.log(`${dev} contains data, skipping:\n ${out.trim().replace(/\n/g, '\n ')}`);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
parts = parts ? JSON.parse(parts).partitiontable : null;
|
||||
if (parts && parts.label != 'gpt')
|
||||
{
|
||||
console.log(`${dev} contains "${parts.label}" partition table, only GPT is supported, skipping`);
|
||||
continue;
|
||||
}
|
||||
devices.push({
|
||||
path: dev,
|
||||
ssd,
|
||||
parts,
|
||||
});
|
||||
}
|
||||
return devices;
|
||||
}
|
||||
|
||||
// Collect existing OSD units
|
||||
async function collect_osd_units()
|
||||
{
|
||||
const units = [];
|
||||
for (const unit of (await system("ls /etc/systemd/system/vitastor-osd*.service"))[1].trim().split('\n'))
|
||||
{
|
||||
if (!unit)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
let cmd = /^ExecStart\s*=\s*(([^\n]*\\\n)*[^\n]*)/.exec(await fsp.readFile(unit, { encoding: 'utf-8' }));
|
||||
if (!cmd)
|
||||
{
|
||||
console.log('ExecStart= not found in '+unit+', skipping')
|
||||
continue;
|
||||
}
|
||||
let kv = {}, key;
|
||||
cmd = cmd[1].replace(/^bash\s+-c\s+'/, '')
|
||||
.replace(/>>\s*\S+2>\s*&1\s*'$/, '')
|
||||
.replace(/\s*\\\n\s*/g, ' ')
|
||||
.replace(/([^\s']+)|'([^']+)'/g, (m, m1, m2) =>
|
||||
{
|
||||
m1 = m1||m2;
|
||||
if (key == null)
|
||||
{
|
||||
if (m1.substr(0, 2) != '--')
|
||||
{
|
||||
console.log('Strange command line in '+unit+', stopping');
|
||||
process.exit(1);
|
||||
}
|
||||
key = m1.substr(2);
|
||||
}
|
||||
else
|
||||
{
|
||||
kv[key] = m1;
|
||||
key = null;
|
||||
}
|
||||
});
|
||||
units.push(kv);
|
||||
}
|
||||
return units;
|
||||
}
|
||||
|
||||
// Count assigned HDD journals and unallocated space for each SSD
|
||||
async function check_journal_count(ssds, osd_units)
|
||||
{
|
||||
const units_by_journal = osd_units.reduce((a, c) =>
|
||||
{
|
||||
if (c.journal_device)
|
||||
a[c.journal_device] = c;
|
||||
return a;
|
||||
}, {});
|
||||
for (const dev of ssds)
|
||||
{
|
||||
dev.journals = 0;
|
||||
if (dev.parts)
|
||||
{
|
||||
for (const part of dev.parts.partitions)
|
||||
{
|
||||
if (part.uuid && units_by_journal['/dev/disk/by-partuuid/'+part.uuid.toLowerCase()])
|
||||
{
|
||||
dev.journals++;
|
||||
}
|
||||
}
|
||||
dev.free = free_from_parttable(dev.parts);
|
||||
}
|
||||
else
|
||||
{
|
||||
dev.free = parseInt(await system_or_die("blockdev --getsize64 "+dev.path));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function create_new_hybrid_osds(hdds, ssds, osd_units)
|
||||
{
|
||||
const units_by_disk = osd_units.reduce((a, c) => { a[c.data_device] = c; return a; }, {});
|
||||
for (const dev of hdds)
|
||||
{
|
||||
if (!dev.parts)
|
||||
{
|
||||
// HDD is not partitioned yet, create a single partition
|
||||
// + is the "default value" for sfdisk
|
||||
await system_or_die('sfdisk '+dev.path, 'label: gpt\n\n+ +\n');
|
||||
dev.parts = JSON.parse(await system_or_die('sfdisk --dump '+dev.path+' --json')).partitiontable;
|
||||
}
|
||||
if (dev.parts.partitions.length != 1)
|
||||
{
|
||||
console.log(dev.path+' has more than 1 partition, skipping');
|
||||
}
|
||||
else if ((dev.parts.partitions[0].start + dev.parts.partitions[0].size) != (1 + dev.parts.lastlba))
|
||||
{
|
||||
console.log(dev.path+'1 is not a whole-disk partition, skipping');
|
||||
}
|
||||
else if (!dev.parts.partitions[0].uuid)
|
||||
{
|
||||
console.log(dev.parts.partitions[0].node+' does not have UUID. Please repartition '+dev.path+' with GPT');
|
||||
}
|
||||
else if (!units_by_disk['/dev/disk/by-partuuid/'+dev.parts.partitions[0].uuid.toLowerCase()])
|
||||
{
|
||||
await create_hybrid_osd(dev, ssds);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function create_hybrid_osd(dev, ssds)
|
||||
{
|
||||
// Create a new OSD
|
||||
// Calculate metadata size
|
||||
const data_device = '/dev/disk/by-partuuid/'+dev.parts.partitions[0].uuid.toLowerCase();
|
||||
const data_size = dev.parts.partitions[0].size * dev.parts.sectorsize;
|
||||
const meta_entry_size = 24 + 2*options.object_size/options.bitmap_granularity/8;
|
||||
const entries_per_block = Math.floor(options.device_block_size / meta_entry_size);
|
||||
const object_count = Math.floor(data_size / options.object_size);
|
||||
let meta_size = Math.ceil(1 + object_count / entries_per_block) * options.device_block_size;
|
||||
// Leave some extra space for future metadata formats and round metadata area size to multiples of 1 MB
|
||||
meta_size = 2*meta_size;
|
||||
meta_size = Math.ceil(meta_size/1024/1024) * 1024*1024;
|
||||
if (meta_size < options.min_meta_size)
|
||||
meta_size = options.min_meta_size;
|
||||
let journal_size = Math.ceil(options.journal_size/1024/1024) * 1024*1024;
|
||||
// Pick an SSD for journal, balancing the number of journals across SSDs
|
||||
let selected_ssd;
|
||||
for (const ssd of ssds)
|
||||
if (ssd.free >= (meta_size+journal_size) && (!selected_ssd || selected_ssd.journals > ssd.journals))
|
||||
selected_ssd = ssd;
|
||||
if (!selected_ssd)
|
||||
{
|
||||
console.error('Could not find free space for SSD journal and metadata for '+dev.path);
|
||||
process.exit(1);
|
||||
}
|
||||
// Allocate an OSD number
|
||||
const osd_num = (await system_or_die("vitastor-cli alloc-osd")).trim();
|
||||
if (!osd_num)
|
||||
{
|
||||
console.error('Failed to run vitastor-cli alloc-osd');
|
||||
process.exit(1);
|
||||
}
|
||||
console.log('Creating OSD '+osd_num+' on '+dev.path+' (HDD) with journal and metadata on '+selected_ssd.path+' (SSD)');
|
||||
// Add two partitions: journal and metadata
|
||||
const new_parts = await add_partitions(selected_ssd, [ journal_size, meta_size ]);
|
||||
selected_ssd.journals++;
|
||||
const journal_device = '/dev/disk/by-partuuid/'+new_parts[0].uuid.toLowerCase();
|
||||
const meta_device = '/dev/disk/by-partuuid/'+new_parts[1].uuid.toLowerCase();
|
||||
// Wait until the device symlinks appear
|
||||
while (!await file_exists(journal_device))
|
||||
{
|
||||
await new Promise(ok => setTimeout(ok, 100));
|
||||
}
|
||||
while (!await file_exists(meta_device))
|
||||
{
|
||||
await new Promise(ok => setTimeout(ok, 100));
|
||||
}
|
||||
// Zero out metadata and journal
|
||||
await system_or_die("dd if=/dev/zero of="+journal_device+" bs=1M count="+(journal_size/1024/1024)+" oflag=direct");
|
||||
await system_or_die("dd if=/dev/zero of="+meta_device+" bs=1M count="+(meta_size/1024/1024)+" oflag=direct");
|
||||
// Create unit file for the OSD
|
||||
const has_scsi_cache_type = options.disable_ssd_cache &&
|
||||
(await system("ls /sys/block/"+selected_ssd.path.substr(5)+"/device/scsi_disk/*/cache_type"))[0] == 0;
|
||||
const write_through = options.disable_ssd_cache && (
|
||||
has_scsi_cache_type || selected_ssd.path.substr(5, 4) == 'nvme'
|
||||
&& (await system_or_die("/sys/block/"+selected_ssd.path.substr(5)+"/queue/write_cache")).trim() == "write through");
|
||||
await fsp.writeFile('/etc/systemd/system/vitastor-osd'+osd_num+'.service',
|
||||
`[Unit]
|
||||
Description=Vitastor object storage daemon osd.${osd_num}
|
||||
After=network-online.target local-fs.target time-sync.target
|
||||
Wants=network-online.target local-fs.target time-sync.target
|
||||
PartOf=vitastor.target
|
||||
|
||||
[Service]
|
||||
LimitNOFILE=1048576
|
||||
LimitNPROC=1048576
|
||||
LimitMEMLOCK=infinity
|
||||
ExecStart=bash -c '/usr/bin/vitastor-osd \\
|
||||
--osd_num ${osd_num} ${write_through
|
||||
? "--disable_meta_fsync 1 --disable_journal_fsync 1 --immediate_commit "+(options.disable_hdd_cache ? "all" : "small")
|
||||
: ""} \\
|
||||
--throttle_small_writes 1 \\
|
||||
--disk_alignment ${options.device_block_size} \\
|
||||
--journal_block_size ${options.device_block_size} \\
|
||||
--meta_block_size ${options.device_block_size} \\
|
||||
--journal_no_same_sector_overwrites true \\
|
||||
    --journal_sector_buffer_count 1024 \\
    --block_size ${options.object_size} \\
    --data_device ${data_device} \\
    --journal_device ${journal_device} \\
    --meta_device ${meta_device} >>/var/log/vitastor/osd${osd_num}.log 2>&1'
WorkingDirectory=/
ExecStartPre=+chown vitastor:vitastor ${data_device}
ExecStartPre=+chown vitastor:vitastor ${journal_device}
ExecStartPre=+chown vitastor:vitastor ${meta_device}${
    has_scsi_cache_type
    ? "\nExecStartPre=+bash -c 'D=$$$(readlink "+journal_device+"); echo write through > $$$(dirname /sys/block/*/$$\${D##*/})/device/scsi_disk/*/cache_type'"
    : ""}${
    options.disable_hdd_cache
    ? "\nExecStartPre=+bash -c 'D=$$$(readlink "+data_device+"); echo write through > $$$(dirname /sys/block/*/$$\${D##*/})/device/scsi_disk/*/cache_type'"
    : ""}
User=vitastor
PrivateTmp=false
TasksMax=infinity
Restart=always
StartLimitInterval=0
RestartSec=10

[Install]
WantedBy=vitastor.target
`);
    await system_or_die("systemctl enable vitastor-osd"+osd_num);
}

async function add_partitions(dev, sizes)
{
    let script = 'label: gpt\n\n';
    if (dev.parts)
    {
        // Old partitions
        for (const part of dev.parts.partitions)
        {
            script += part.node+': '+Object.keys(part).map(k => k == 'node' ? '' : k+'='+part[k]).filter(k => k).join(', ')+'\n';
        }
    }
    // New partitions
    for (const size of sizes)
    {
        script += '+ '+Math.ceil(size/1024)+'KiB\n';
    }
    await system_or_die('sfdisk '+dev.path, script);
    // Get new partition table and find the new partition
    const newpt = JSON.parse(await system_or_die('sfdisk --dump '+dev.path+' --json')).partitiontable;
    const old_nodes = dev.parts ? dev.parts.partitions.reduce((a, c) => { a[c.uuid] = true; return a; }, {}) : {};
    const new_nodes = newpt.partitions.filter(part => !old_nodes[part.uuid]);
    if (new_nodes.length != sizes.length)
    {
        console.error('Failed to partition '+dev.path+': new partitions not found in table');
        process.exit(1);
    }
    dev.parts = newpt;
    dev.free = free_from_parttable(newpt);
    return new_nodes;
}

function free_from_parttable(pt)
{
    let free = pt.lastlba + 1 - pt.firstlba;
    for (const part of pt.partitions)
    {
        free -= part.size;
    }
    free *= pt.sectorsize;
    return free;
}

async function system_or_die(cmd, input = '')
{
    let [ exitcode, stdout, stderr ] = await system(cmd, input);
    if (exitcode != 0)
    {
        console.error(cmd+' failed: '+stderr);
        process.exit(1);
    }
    return stdout;
}

async function system(cmd, input = '')
{
    if (options.debug)
    {
        process.stderr.write('+ '+cmd+(input ? " <<EOF\n"+input.replace(/\s*$/, '\n')+"EOF" : '')+'\n');
    }
    const cp = child_process.spawn(cmd, { shell: true });
    let stdout = '', stderr = '', finish_cb;
    cp.stdout.on('data', buf => stdout += buf.toString());
    cp.stderr.on('data', buf => stderr += buf.toString());
    cp.on('exit', () => finish_cb && finish_cb());
    cp.stdin.write(input);
    cp.stdin.end();
    if (cp.exitCode == null)
    {
        await new Promise(ok => finish_cb = ok);
    }
    return [ cp.exitCode, stdout, stderr ];
}

async function file_exists(filename)
{
    return new Promise((ok, no) => fs.access(filename, fs.constants.R_OK, err => ok(!err)));
}
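The free-space math in `free_from_parttable()` is easy to sanity-check outside the script. Below is a minimal Python sketch of the same computation; the `firstlba`, `lastlba`, `sectorsize` and `size` fields match what `sfdisk --dump --json` actually emits, but the sample numbers are invented:

```python
# Rough Python equivalent of free_from_parttable(): usable sectors are
# lastlba+1-firstlba, minus every existing partition's size in sectors,
# converted to bytes at the end. The sample partition table is made up.
def free_from_parttable(pt):
    free = pt['lastlba'] + 1 - pt['firstlba']
    for part in pt.get('partitions', []):
        free -= part['size']
    return free * pt['sectorsize']

pt = {'firstlba': 2048, 'lastlba': 234441614, 'sectorsize': 512,
      'partitions': [{'size': 67108864}]}
print(free_from_parttable(pt))  # bytes left for new journal partitions
```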
@@ -25,6 +25,10 @@ OPT=$(vitastor-cli simple-offsets --format options $DEV | tr '\n' ' ')
META=$(vitastor-cli simple-offsets --format json $DEV | jq .data_offset)
dd if=/dev/zero of=$DEV bs=1048576 count=$(((META+1048575)/1048576)) oflag=direct

mkdir -p /var/log/vitastor
id vitastor &>/dev/null || useradd vitastor
chown vitastor /var/log/vitastor

cat >/etc/systemd/system/vitastor-osd$OSD_NUM.service <<EOF
[Unit]
Description=Vitastor object storage daemon osd.$OSD_NUM
@@ -36,14 +40,14 @@ PartOf=vitastor.target
LimitNOFILE=1048576
LimitNPROC=1048576
LimitMEMLOCK=infinity
ExecStart=/usr/bin/vitastor-osd \\
ExecStart=bash -c '/usr/bin/vitastor-osd \\
    --osd_num $OSD_NUM \\
    --disable_data_fsync 1 \\
    --immediate_commit all \\
    --disk_alignment 4096 --journal_block_size 4096 --meta_block_size 4096 \\
    --journal_no_same_sector_overwrites true \\
    --journal_sector_buffer_count 1024 \\
    $OPT
    $OPT >>/var/log/vitastor/osd$OSD_NUM.log 2>&1'
WorkingDirectory=/
ExecStartPre=+chown vitastor:vitastor $DEV
User=vitastor
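The `count=$(((META+1048575)/1048576))` expression above is plain integer ceiling division: zero out at least `data_offset` bytes, rounded up to whole 1 MiB blocks. The same arithmetic, sketched in Python (the offset value is only an example):

```python
# Ceiling division as used in the dd count above: wipe the journal and
# metadata area (everything before data_offset) in whole 1 MiB blocks.
MIB = 1048576
data_offset = 201326592  # example output of vitastor-cli simple-offsets
count = (data_offset + MIB - 1) // MIB
assert count * MIB >= data_offset
print(count)  # -> 192
```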
91
mon/mon.js
@@ -31,6 +31,7 @@ const etcd_allow = new RegExp('^'+[
    'osd/inodestats/[1-9]\\d*',
    'osd/space/[1-9]\\d*',
    'mon/master',
    'mon/member/[a-f0-9]+',
    'pg/state/[1-9]\\d*/[1-9]\\d*',
    'pg/stats/[1-9]\\d*/[1-9]\\d*',
    'pg/history/[1-9]\\d*/[1-9]\\d*',
@@ -159,6 +160,8 @@ const etcd_tree = {
            root_node?: 'rack1',
            // restrict pool to OSDs having all of these tags
            osd_tags?: 'nvme' | [ 'nvme', ... ],
            // prefer to put primary on OSD with these tags
            primary_affinity_tags?: 'nvme' | [ 'nvme', ... ],
        },
        ...
    }, */
@@ -223,21 +226,28 @@ const etcd_tree = {
            }, */
        },
        inodestats: {
            /* <inode_t>: {
                read: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
                write: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
                delete: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
            /* <pool_id>: {
                <inode_t>: {
                    read: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
                    write: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
                    delete: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
                },
            }, */
        },
        space: {
            /* <osd_num_t>: {
                <inode_t>: uint64_t, // bytes
                <pool_id>: {
                    <inode_t>: uint64_t, // bytes
                },
            }, */
        },
    },
    mon: {
        master: {
            /* ip: [ string ], */
            /* ip: [ string ], id: uint64_t */
        },
        standby: {
            /* <uint64_t>: { ip: [ string ] }, */
        },
    },
    pg: {
@@ -268,7 +278,7 @@ const etcd_tree = {
            <pg_id>: {
                osd_sets: osd_num_t[][],
                all_peers: osd_num_t[],
                epoch: uint32_t,
                epoch: uint64_t,
            },
        }, */
    },
@@ -673,11 +683,25 @@ class Mon
        }, this.etcd_start_timeout, 0);
    }

    get_mon_state()
    {
        return { ip: this.local_ips(), hostname: os.hostname() };
    }

    async get_lease()
    {
        const max_ttl = this.config.etcd_mon_ttl + this.config.etcd_mon_timeout/1000*this.config.etcd_mon_retries;
        const res = await this.etcd_call('/lease/grant', { TTL: max_ttl }, this.config.etcd_mon_timeout, -1);
        // Get lease
        let res = await this.etcd_call('/lease/grant', { TTL: max_ttl }, this.config.etcd_mon_timeout, -1);
        this.etcd_lease_id = res.ID;
        // Register in /mon/member, just for the information
        const state = this.get_mon_state();
        res = await this.etcd_call('/kv/put', {
            key: b64(this.etcd_prefix+'/mon/member/'+this.etcd_lease_id),
            value: b64(JSON.stringify(state)),
            lease: ''+this.etcd_lease_id
        }, this.etcd_start_timeout, 0);
        // Set refresh timer
        this.lease_timer = setInterval(async () =>
        {
            const res = await this.etcd_call('/lease/keepalive', { ID: this.etcd_lease_id }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
@@ -703,7 +727,7 @@ class Mon

    async become_master()
    {
        const state = { ip: this.local_ips() };
        const state = { ...this.get_mon_state(), id: ''+this.etcd_lease_id };
        while (1)
        {
            const res = await this.etcd_call('/kv/txn', {
@@ -881,27 +905,39 @@ class Mon
        return this.seed + 2147483648;
    }

    pick_primary(pool_id, osd_set, up_osds)
    pick_primary(pool_id, osd_set, up_osds, aff_osds)
    {
        let alive_set;
        if (this.state.config.pools[pool_id].scheme === 'replicated')
            alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
        {
            // Prefer "affinity" OSDs
            alive_set = osd_set.filter(osd_num => osd_num && aff_osds[osd_num]);
            if (!alive_set.length)
                alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
        }
        else
        {
            // Prefer data OSDs for EC because they can actually read something without an additional network hop
            const pg_data_size = (this.state.config.pools[pool_id].pg_size||0) -
                (this.state.config.pools[pool_id].parity_chunks||0);
            alive_set = osd_set.slice(0, pg_data_size).filter(osd_num => osd_num && up_osds[osd_num]);
            alive_set = osd_set.slice(0, pg_data_size).filter(osd_num => osd_num && aff_osds[osd_num]);
            if (!alive_set.length)
                alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
                alive_set = osd_set.filter(osd_num => osd_num && aff_osds[osd_num]);
            if (!alive_set.length)
            {
                alive_set = osd_set.slice(0, pg_data_size).filter(osd_num => osd_num && up_osds[osd_num]);
                if (!alive_set.length)
                    alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
            }
        }
        if (!alive_set.length)
            return 0;
        return alive_set[this.rng() % alive_set.length];
    }

    save_new_pgs_txn(request, pool_id, up_osds, prev_pgs, new_pgs, pg_history)
    save_new_pgs_txn(request, pool_id, up_osds, osd_tree, prev_pgs, new_pgs, pg_history)
    {
        const aff_osds = this.get_affinity_osds(this.state.config.pools[pool_id], up_osds, osd_tree);
        const pg_items = {};
        this.reset_rng();
        new_pgs.map((osd_set, i) =>
@@ -909,7 +945,7 @@ class Mon
            osd_set = osd_set.map(osd_num => osd_num === LPOptimizer.NO_OSD ? 0 : osd_num);
            pg_items[i+1] = {
                osd_set,
                primary: this.pick_primary(pool_id, osd_set, up_osds),
                primary: this.pick_primary(pool_id, osd_set, up_osds, aff_osds),
            };
            if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' ') &&
                prev_pgs[i].filter(osd_num => osd_num).length > 0)
@@ -1040,6 +1076,13 @@ class Mon
                console.log('Pool '+pool_id+' has invalid osd_tags (must be a string or array of strings)');
            return false;
        }
        if (pool_cfg.primary_affinity_tags && typeof(pool_cfg.primary_affinity_tags) != 'string' &&
            (!(pool_cfg.primary_affinity_tags instanceof Array) || pool_cfg.primary_affinity_tags.filter(t => typeof t != 'string').length > 0))
        {
            if (warn)
                console.log('Pool '+pool_id+' has invalid primary_affinity_tags (must be a string or array of strings)');
            return false;
        }
        return true;
    }
@@ -1069,6 +1112,17 @@ class Mon
        }
    }

    get_affinity_osds(pool_cfg, up_osds, osd_tree)
    {
        let aff_osds = up_osds;
        if (pool_cfg.primary_affinity_tags)
        {
            aff_osds = { ...up_osds };
            this.filter_osds_by_tags(osd_tree, { x: aff_osds }, pool_cfg.primary_affinity_tags);
        }
        return aff_osds;
    }

    async recheck_pgs()
    {
        // Take configuration and state, check it against the stored configuration hash
@@ -1099,7 +1153,7 @@ class Mon
                {
                    prev_pgs[pg-1] = this.state.config.pgs.items[pool_id][pg].osd_set;
                }
                this.save_new_pgs_txn(etcd_request, pool_id, up_osds, prev_pgs, [], []);
                this.save_new_pgs_txn(etcd_request, pool_id, up_osds, osd_tree, prev_pgs, [], []);
            }
        }
        for (const pool_id in this.state.config.pools)
@@ -1206,7 +1260,7 @@ class Mon
                key: b64(this.etcd_prefix+'/pool/stats/'+pool_id),
                value: b64(JSON.stringify(this.state.pool.stats[pool_id])),
            } });
            this.save_new_pgs_txn(etcd_request, pool_id, up_osds, real_prev_pgs, optimize_result.int_pgs, pg_history);
            this.save_new_pgs_txn(etcd_request, pool_id, up_osds, osd_tree, real_prev_pgs, optimize_result.int_pgs, pg_history);
        }
        this.state.config.pgs.hash = tree_hash;
        await this.save_pg_config(etcd_request);
@@ -1223,13 +1277,14 @@ class Mon
                continue;
            }
            const replicated = pool_cfg.scheme === 'replicated';
            const aff_osds = this.get_affinity_osds(pool_cfg, up_osds, osd_tree);
            this.reset_rng();
            for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++)
            {
                const pg_cfg = this.state.config.pgs.items[pool_id][pg_num];
                if (pg_cfg)
                {
                    const new_primary = this.pick_primary(pool_id, pg_cfg.osd_set, up_osds);
                    const new_primary = this.pick_primary(pool_id, pg_cfg.osd_set, up_osds, aff_osds);
                    if (pg_cfg.primary != new_primary)
                    {
                        console.log(
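To show what the new `primary_affinity_tags` option looks like from the user side, here is a hypothetical pool definition (field values are examples, not taken from this diff). With it, pick_primary() only falls back to plain up OSDs when no tagged OSD in the PG is alive:

```python
# Hypothetical /config/pools entry using the new option: primaries are
# preferred on OSDs tagged 'nvme'. All names and sizes are examples.
pool_config = {
    '1': {
        'name': 'testpool',
        'scheme': 'replicated',
        'pg_size': 2,
        'pg_minsize': 1,
        'pg_count': 256,
        'failure_domain': 'host',
        'primary_affinity_tags': ['nvme'],  # prefer primary on these OSDs
    },
}
```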
@@ -49,7 +49,8 @@ async function run()
    }
    options.journal_offset = Math.ceil(options.journal_offset/options.device_block_size)*options.device_block_size;
    const meta_offset = options.journal_offset + Math.ceil(options.journal_size/options.device_block_size)*options.device_block_size;
    const entries_per_block = Math.floor(options.device_block_size / (24 + 2*options.object_size/options.bitmap_granularity/8));
    const meta_entry_size = 24 + 2*options.object_size/options.bitmap_granularity/8;
    const entries_per_block = Math.floor(options.device_block_size / meta_entry_size);
    const object_count = Math.floor((device_size-meta_offset)/options.object_size);
    const meta_size = Math.ceil(1 + object_count / entries_per_block) * options.device_block_size;
    const data_offset = meta_offset + meta_size;
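The refactored formula is easier to check with concrete numbers. A sketch with assumed defaults (128 KiB objects, 4 KiB metadata blocks, 4 KiB bitmap granularity, a 100 GiB device and a 16 MiB journal; these values are illustrative, not stated in the diff):

```python
import math

# Metadata size estimate mirroring run() above, for assumed parameters.
device_block_size = 4096
object_size = 128 * 1024
bitmap_granularity = 4096
device_size = 100 * 2**30
journal_offset = 0
journal_size = 16 * 2**20

meta_offset = journal_offset + journal_size  # already block-aligned here
meta_entry_size = 24 + 2 * object_size // bitmap_granularity // 8  # -> 32
entries_per_block = device_block_size // meta_entry_size           # -> 128
object_count = (device_size - meta_offset) // object_size
meta_size = math.ceil(1 + object_count / entries_per_block) * device_block_size
data_offset = meta_offset + meta_size
print(meta_entry_size, entries_per_block, meta_size, data_offset)
```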
@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils

VERSION = '0.6.14'
VERSION = '0.6.17'

LOG = logging.getLogger(__name__)

@@ -355,7 +355,25 @@ class VitastorDriver(driver.CloneableImageVD,
    def revert_to_snapshot(self, context, volume, snapshot):
        """Revert a volume to a given snapshot."""

        # FIXME Delete the image, then recreate it from the snapshot
        vol_name = utils.convert_str(snapshot.volume_name)
        snap_name = utils.convert_str(snapshot.name)

        # Delete the image and recreate it from the snapshot
        args = [ 'vitastor-cli', 'rm', vol_name, *(self._vitastor_args()) ]
        try:
            self._execute(*args)
        except processutils.ProcessExecutionError as exc:
            LOG.error("Failed to delete image "+vol_name+": "+exc)
            raise exception.VolumeBackendAPIException(data = exc.stderr)
        args = [
            'vitastor-cli', 'create', '--parent', vol_name+'@'+snap_name,
            vol_name, *(self._vitastor_args())
        ]
        try:
            self._execute(*args)
        except processutils.ProcessExecutionError as exc:
            LOG.error("Failed to recreate image "+vol_name+" from "+vol_name+"@"+snap_name+": "+exc)
            raise exception.VolumeBackendAPIException(data = exc.stderr)

    def delete_snapshot(self, snapshot):
        """Deletes a snapshot."""
@@ -363,24 +381,15 @@ class VitastorDriver(driver.CloneableImageVD,
        vol_name = utils.convert_str(snapshot.volume_name)
        snap_name = utils.convert_str(snapshot.name)

        # Find the snapshot
        resp = self._etcd_txn({ 'success': [
            { 'request_range': { 'key': 'index/image/'+vol_name+'@'+snap_name } },
        ] })
        if len(resp['responses'][0]['kvs']) == 0:
            raise exception.SnapshotNotFound(snapshot_id = snap_name)
        inode_id = int(resp['responses'][0]['kvs'][0]['value']['id'])
        pool_id = int(resp['responses'][0]['kvs'][0]['value']['pool_id'])
        parents = {}
        parents[(pool_id << 48) | (inode_id & 0xffffffffffff)] = True

        # Check if there are child volumes
        children = self._child_count(parents)
        if children > 0:
            raise exception.SnapshotIsBusy(snapshot_name = snap_name)

        # FIXME: We can't delete snapshots because we can't merge layers yet
        raise exception.VolumeBackendAPIException(data = 'Snapshot delete (layer merge) is not implemented yet')
        args = [
            'vitastor-cli', 'rm', vol_name+'@'+snap_name,
            *(self._vitastor_args())
        ]
        try:
            self._execute(*args)
        except processutils.ProcessExecutionError as exc:
            LOG.error("Failed to remove snapshot "+vol_name+'@'+snap_name+": "+exc)
            raise exception.VolumeBackendAPIException(data = exc.stderr)

    def _child_count(self, parents):
        children = 0
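Both rewritten code paths now shell out to vitastor-cli instead of editing etcd keys directly. Outside of Cinder, the same revert amounts to two commands; a sketch of what the driver effectively runs (image and snapshot names are examples, and the single `--etcd_address` flag stands in for whatever `_vitastor_args()` returns):

```python
# What revert_to_snapshot() now effectively does, sketched with subprocess.
import subprocess

vol, snap = 'testimg', 'snap1'               # example names
etcd = ['--etcd_address', '10.115.0.10:2379/v3']
# Drop the current volume contents, then recreate it as a child of the snapshot:
subprocess.run(['vitastor-cli', 'rm', vol] + etcd, check=True)
subprocess.run(['vitastor-cli', 'create', '--parent', f'{vol}@{snap}', vol] + etcd,
               check=True)
```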
@@ -25,4 +25,4 @@ rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-0.6.14/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.14$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-0.6.17/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.17$(rpm --eval '%dist').tar.gz *
@@ -34,7 +34,7 @@ ADD . /root/vitastor
RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
    cp /root/vitastor-0.6.14.el7.tar.gz ~/rpmbuild/SOURCES; \
    cp /root/vitastor-0.6.17.el7.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.6.14
Version: 0.6.17
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage

License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.6.14.el7.tar.gz
Source0: vitastor-0.6.17.el7.tar.gz

BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
@@ -119,6 +119,7 @@ cp -r mon %buildroot/usr/lib/vitastor

%files -n vitastor-client
%_bindir/vitastor-nbd
%_bindir/vitastor-nfs
%_bindir/vitastor-cli
%_bindir/vitastor-rm
%_bindir/vita
@@ -33,7 +33,7 @@ ADD . /root/vitastor
RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
    cp /root/vitastor-0.6.14.el8.tar.gz ~/rpmbuild/SOURCES; \
    cp /root/vitastor-0.6.17.el8.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.6.14
Version: 0.6.17
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage

License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.6.14.el8.tar.gz
Source0: vitastor-0.6.17.el8.tar.gz

BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
@@ -116,6 +116,7 @@ cp -r mon %buildroot/usr/lib/vitastor

%files -n vitastor-client
%_bindir/vitastor-nbd
%_bindir/vitastor-nfs
%_bindir/vitastor-cli
%_bindir/vitastor-rm
%_bindir/vita
@@ -15,7 +15,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||
endif()
|
||||
|
||||
add_definitions(-DVERSION="0.6.14")
|
||||
add_definitions(-DVERSION="0.6.17")
|
||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
|
||||
if (${WITH_ASAN})
|
||||
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
||||
@@ -124,6 +124,18 @@ add_library(vitastor_client SHARED
|
||||
cluster_client.cpp
|
||||
cluster_client_list.cpp
|
||||
vitastor_c.cpp
|
||||
cli_common.cpp
|
||||
cli_alloc_osd.cpp
|
||||
cli_simple_offsets.cpp
|
||||
cli_status.cpp
|
||||
cli_df.cpp
|
||||
cli_ls.cpp
|
||||
cli_create.cpp
|
||||
cli_modify.cpp
|
||||
cli_flatten.cpp
|
||||
cli_merge.cpp
|
||||
cli_rm_data.cpp
|
||||
cli_rm.cpp
|
||||
)
|
||||
set_target_properties(vitastor_client PROPERTIES PUBLIC_HEADER "vitastor_c.h")
|
||||
target_link_libraries(vitastor_client
|
||||
@@ -152,10 +164,24 @@ target_link_libraries(vitastor-nbd
|
||||
vitastor_client
|
||||
)
|
||||
|
||||
# vitastor-nfs
|
||||
add_executable(vitastor-nfs
|
||||
nfs_proxy.cpp
|
||||
nfs_conn.cpp
|
||||
nfs_portmap.cpp
|
||||
sha256.c
|
||||
nfs/xdr_impl.cpp
|
||||
nfs/rpc_xdr.cpp
|
||||
nfs/portmap_xdr.cpp
|
||||
nfs/nfs_xdr.cpp
|
||||
)
|
||||
target_link_libraries(vitastor-nfs
|
||||
vitastor_client
|
||||
)
|
||||
|
||||
# vitastor-cli
|
||||
add_executable(vitastor-cli
|
||||
cli.cpp cli_alloc_osd.cpp cli_simple_offsets.cpp cli_df.cpp
|
||||
cli_ls.cpp cli_create.cpp cli_modify.cpp cli_flatten.cpp cli_merge.cpp cli_rm.cpp cli_snap_rm.cpp
|
||||
cli.cpp
|
||||
)
|
||||
target_link_libraries(vitastor-cli
|
||||
vitastor_client
|
||||
@@ -244,7 +270,7 @@ target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mo
|
||||
|
||||
### Install
|
||||
|
||||
install(TARGETS vitastor-osd vitastor-dump-journal vitastor-nbd vitastor-cli RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||
install(TARGETS vitastor-osd vitastor-dump-journal vitastor-nbd vitastor-nfs vitastor-cli RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vitastor-rm)
|
||||
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vita)
|
||||
install(
|
||||
|
@@ -25,7 +25,7 @@ allocator::allocator(uint64_t blocks)
    size = free = blocks;
    last_one_mask = (blocks % 64) == 0
        ? UINT64_MAX
        : ((1l << (blocks % 64)) - 1);
        : (((uint64_t)1 << (blocks % 64)) - 1);
    for (uint64_t i = 0; i < total; i++)
    {
        mask[i] = 0;
@@ -79,7 +79,7 @@ void allocator::set(uint64_t addr, bool value)
    }
    if (value)
    {
        mask[last] = mask[last] | (1l << bit);
        mask[last] = mask[last] | ((uint64_t)1 << bit);
        if (mask[last] != (!is_last || cur_addr/64 < size/64
            ? UINT64_MAX : last_one_mask))
        {
@@ -88,7 +88,7 @@ void allocator::set(uint64_t addr, bool value)
    }
    else
    {
        mask[last] = mask[last] & ~(1l << bit);
        mask[last] = mask[last] & ~((uint64_t)1 << bit);
    }
    is_last = false;
    if (p2 > 1)
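The `1l` to `(uint64_t)1` change matters because `long` is only 32 bits wide on some targets (for example 32-bit builds), so `1l << bit` is undefined for bit >= 32. The intended mask is easy to state in Python, where integers are unbounded (the block counts below are just examples):

```python
# last_one_mask marks the valid bits in the final 64-bit word of the bitmap.
def last_one_mask(blocks):
    return (1 << 64) - 1 if blocks % 64 == 0 else (1 << (blocks % 64)) - 1

assert last_one_mask(100) == (1 << 36) - 1  # 100 % 64 == 36 low bits valid
assert last_one_mask(128) == (1 << 64) - 1  # exact multiple: all bits valid
```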
@@ -415,8 +415,11 @@ stop_flusher:
    flusher->active_flushers++;
resume_1:
    // Find it in clean_db
    clean_it = bs->clean_db.find(cur.oid);
    old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
    {
        auto & clean_db = bs->clean_db_shard(cur.oid);
        auto clean_it = clean_db.find(cur.oid);
        old_clean_loc = (clean_it != clean_db.end() ? clean_it->second.location : UINT64_MAX);
    }
    // Scan dirty versions of the object
    if (!scan_dirty(1))
    {
@@ -870,10 +873,11 @@ void journal_flusher_co::update_clean_db()
#endif
        bs->data_alloc->set(old_clean_loc >> bs->block_order, false);
    }
    auto & clean_db = bs->clean_db_shard(cur.oid);
    if (has_delete)
    {
        auto clean_it = bs->clean_db.find(cur.oid);
        bs->clean_db.erase(clean_it);
        auto clean_it = clean_db.find(cur.oid);
        clean_db.erase(clean_it);
#ifdef BLOCKSTORE_DEBUG
        printf("Free block %lu from %lx:%lx v%lu (delete)\n",
            clean_loc >> bs->block_order,
@@ -884,7 +888,7 @@ void journal_flusher_co::update_clean_db()
    }
    else
    {
        bs->clean_db[cur.oid] = {
        clean_db[cur.oid] = {
            .version = cur.version,
            .location = clean_loc,
        };
@@ -49,7 +49,6 @@ class journal_flusher_co
    std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;

    bool skip_copy, has_delete, has_writes;
    blockstore_clean_db_t::iterator clean_it;
    std::vector<copy_buffer_t> v;
    std::vector<copy_buffer_t>::iterator it;
    int copy_count;
@@ -118,7 +118,7 @@ void blockstore_impl_t::loop()
    // has_writes == 0 - no writes before the current queue item
    // has_writes == 1 - some writes in progress
    // has_writes == 2 - tried to submit some writes, but failed
    int has_writes = 0, op_idx = 0, new_idx = 0;
    int has_writes = 0, op_idx = 0, new_idx = 0, done_lists = 0;
    for (; op_idx < submit_queue.size(); op_idx++, new_idx++)
    {
        auto op = submit_queue[op_idx];
@@ -198,9 +198,14 @@ void blockstore_impl_t::loop()
        }
        else if (op->opcode == BS_OP_LIST)
        {
            // LIST doesn't need to be blocked by previous modifications
            process_list(op);
            wr_st = 2;
            // LIST doesn't have to be blocked by previous modifications
            // But don't do a lot of LISTs at once, because they're blocking and potentially slow
            if (single_tick_list_limit <= 0 || done_lists < single_tick_list_limit)
            {
                process_list(op);
                done_lists++;
                wr_st = 2;
            }
        }
        if (wr_st == 2)
        {
@@ -423,22 +428,104 @@ static bool replace_stable(object_id oid, uint64_t version, int search_start, in
    return false;
}

blockstore_clean_db_t& blockstore_impl_t::clean_db_shard(object_id oid)
{
    uint64_t pg_num = 0;
    uint64_t pool_id = (oid.inode >> (64-POOL_ID_BITS));
    auto sh_it = clean_db_settings.find(pool_id);
    if (sh_it != clean_db_settings.end())
    {
        // like map_to_pg()
        pg_num = (oid.stripe / sh_it->second.pg_stripe_size) % sh_it->second.pg_count + 1;
    }
    return clean_db_shards[(pool_id << (64-POOL_ID_BITS)) | pg_num];
}

void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint32_t pg_stripe_size)
{
    uint64_t pool_id = (uint64_t)pool;
    std::map<pool_pg_id_t, blockstore_clean_db_t> new_shards;
    auto sh_it = clean_db_shards.lower_bound((pool_id << (64-POOL_ID_BITS)));
    while (sh_it != clean_db_shards.end() &&
        (sh_it->first >> (64-POOL_ID_BITS)) == pool_id)
    {
        for (auto & pair: sh_it->second)
        {
            // like map_to_pg()
            uint64_t pg_num = (pair.first.stripe / pg_stripe_size) % pg_count + 1;
            uint64_t shard_id = (pool_id << (64-POOL_ID_BITS)) | pg_num;
            new_shards[shard_id][pair.first] = pair.second;
        }
        clean_db_shards.erase(sh_it++);
    }
    for (sh_it = new_shards.begin(); sh_it != new_shards.end(); sh_it++)
    {
        auto & to = clean_db_shards[sh_it->first];
        to.swap(sh_it->second);
    }
    clean_db_settings[pool_id] = (pool_shard_settings_t){
        .pg_count = pg_count,
        .pg_stripe_size = pg_stripe_size,
    };
}

void blockstore_impl_t::process_list(blockstore_op_t *op)
{
    uint32_t list_pg = op->offset;
    uint32_t list_pg = op->offset+1;
    uint32_t pg_count = op->len;
    uint64_t pg_stripe_size = op->oid.stripe;
    uint64_t min_inode = op->oid.inode;
    uint64_t max_inode = op->version;
    // Check PG
    if (pg_count != 0 && (pg_stripe_size < MIN_BLOCK_SIZE || list_pg >= pg_count))
    if (pg_count != 0 && (pg_stripe_size < MIN_BLOCK_SIZE || list_pg > pg_count))
    {
        op->retval = -EINVAL;
        FINISH_OP(op);
        return;
    }
    // Copy clean_db entries (sorted)
    int stable_count = 0, stable_alloc = clean_db.size() / (pg_count ? pg_count : 1);
    // Check if the DB needs resharding
    // (we don't know about PGs from the beginning, we only create "shards" here)
    uint64_t first_shard = 0, last_shard = UINT64_MAX;
    if (min_inode != 0 &&
        // Check if min_inode == max_inode == pool_id<<N, i.e. this is a pool listing
        (min_inode >> (64-POOL_ID_BITS)) == (max_inode >> (64-POOL_ID_BITS)))
    {
        pool_id_t pool_id = (min_inode >> (64-POOL_ID_BITS));
        if (pg_count > 1)
        {
            // Per-pg listing
            auto sh_it = clean_db_settings.find(pool_id);
            if (sh_it == clean_db_settings.end() ||
                sh_it->second.pg_count != pg_count ||
                sh_it->second.pg_stripe_size != pg_stripe_size)
            {
                reshard_clean_db(pool_id, pg_count, pg_stripe_size);
            }
            first_shard = last_shard = ((uint64_t)pool_id << (64-POOL_ID_BITS)) | list_pg;
        }
        else
        {
            // Per-pool listing
            first_shard = ((uint64_t)pool_id << (64-POOL_ID_BITS));
            last_shard = ((uint64_t)(pool_id+1) << (64-POOL_ID_BITS)) - 1;
        }
    }
    // Copy clean_db entries
    int stable_count = 0, stable_alloc = 0;
    if (min_inode != max_inode)
    {
        for (auto shard_it = clean_db_shards.lower_bound(first_shard);
            shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
            shard_it++)
        {
            auto & clean_db = shard_it->second;
            stable_alloc += clean_db.size();
        }
    }
    else
    {
        stable_alloc = 32768;
    }
    obj_ver_id *stable = (obj_ver_id*)malloc(sizeof(obj_ver_id) * stable_alloc);
    if (!stable)
    {
@@ -446,7 +533,11 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
        FINISH_OP(op);
        return;
    }
    for (auto shard_it = clean_db_shards.lower_bound(first_shard);
        shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
        shard_it++)
    {
        auto & clean_db = shard_it->second;
        auto clean_it = clean_db.begin(), clean_end = clean_db.end();
        if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
        {
@@ -461,26 +552,28 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
        }
        for (; clean_it != clean_end; clean_it++)
        {
            if (!pg_count || ((clean_it->first.stripe / pg_stripe_size) % pg_count) == list_pg) // like map_to_pg()
            if (stable_count >= stable_alloc)
            {
                if (stable_count >= stable_alloc)
                stable_alloc *= 2;
                stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
                if (!stable)
                {
                    stable_alloc += 32768;
                    stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
                    if (!stable)
                    {
                        op->retval = -ENOMEM;
                        FINISH_OP(op);
                        return;
                    }
                    op->retval = -ENOMEM;
                    FINISH_OP(op);
                    return;
                }
                stable[stable_count++] = {
                    .oid = clean_it->first,
                    .version = clean_it->second.version,
                };
            }
            stable[stable_count++] = {
                .oid = clean_it->first,
                .version = clean_it->second.version,
            };
        }
    }
    if (first_shard != last_shard)
    {
        // If that's not a per-PG listing, sort clean entries
        std::sort(stable, stable+stable_count);
    }
    int clean_stable_count = stable_count;
    // Copy dirty_db entries (sorted, too)
    int unstable_count = 0, unstable_alloc = 0;
@@ -506,7 +599,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
    }
    for (; dirty_it != dirty_end; dirty_it++)
    {
        if (!pg_count || ((dirty_it->first.oid.stripe / pg_stripe_size) % pg_count) == list_pg) // like map_to_pg()
        if (!pg_count || ((dirty_it->first.oid.stripe / pg_stripe_size) % pg_count + 1) == list_pg) // like map_to_pg()
        {
            if (IS_DELETE(dirty_it->second.state))
            {
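The shard map key packs the pool id into the top POOL_ID_BITS (16) bits and the 1-based PG number into the remaining bits, so all shards of one pool are contiguous in the `std::map` and a per-pool listing becomes a simple `lower_bound()` range scan. A sketch of the same packing with unbounded Python integers (all sample values are invented):

```python
# Shard-key packing used by clean_db_shard()/process_list(): pool id in the
# top 16 bits, 1-based PG number below; map_to_pg() maps a stripe to a PG.
POOL_ID_BITS = 16

def shard_id(pool_id, pg_num):
    return (pool_id << (64 - POOL_ID_BITS)) | pg_num

def map_to_pg(stripe, pg_stripe_size, pg_count):
    return (stripe // pg_stripe_size) % pg_count + 1  # 1-based; 0 = "no PG yet"

pool_id, pg_count, pg_stripe_size = 2, 256, 4 * 2**20
pg = map_to_pg(123 * 2**20, pg_stripe_size, pg_count)   # -> 31
first = shard_id(pool_id, 0)          # start of this pool's shard range
last = shard_id(pool_id + 1, 0) - 1   # end of this pool's shard range
assert first <= shard_id(pool_id, pg) <= last
```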
@@ -204,6 +204,17 @@ typedef std::map<obj_ver_id, dirty_entry> blockstore_dirty_db_t;

#include "blockstore_flush.h"

typedef uint32_t pool_id_t;
typedef uint64_t pool_pg_id_t;

#define POOL_ID_BITS 16

struct pool_shard_settings_t
{
    uint32_t pg_count;
    uint32_t pg_stripe_size;
};

class blockstore_impl_t
{
    /******* OPTIONS *******/
@@ -241,11 +252,14 @@ class blockstore_impl_t
    int throttle_target_parallelism = 1;
    // Minimum difference in microseconds between target and real execution times to throttle the response
    int throttle_threshold_us = 50;
    // Maximum number of LIST operations to be processed in one loop tick
    int single_tick_list_limit = 1;
    /******* END OF OPTIONS *******/

    struct ring_consumer_t ring_consumer;

    blockstore_clean_db_t clean_db;
    std::map<pool_id_t, pool_shard_settings_t> clean_db_settings;
    std::map<pool_pg_id_t, blockstore_clean_db_t> clean_db_shards;
    uint8_t *clean_bitmap = NULL;
    blockstore_dirty_db_t dirty_db;
    std::vector<blockstore_op_t*> submit_queue;
@@ -294,6 +308,9 @@ class blockstore_impl_t
    void open_journal();
    uint8_t* get_clean_entry_bitmap(uint64_t block_loc, int offset);

    blockstore_clean_db_t& clean_db_shard(object_id oid);
    void reshard_clean_db(pool_id_t pool_id, uint32_t pg_count, uint32_t pg_stripe_size);

    // Journaling
    void prepare_journal_sector_write(int sector, blockstore_op_t *op);
    void handle_journal_write(ring_data_t *data, uint64_t flush_id);
@@ -131,6 +131,7 @@ resume_1:
    }
    // Skip superblock
    bs->meta_offset += bs->meta_block_size;
    bs->meta_len -= bs->meta_block_size;
    prev_done = 0;
    done_len = 0;
    done_pos = 0;
@@ -222,10 +223,11 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
    }
    if (entry->oid.inode > 0)
    {
        auto clean_it = bs->clean_db.find(entry->oid);
        if (clean_it == bs->clean_db.end() || clean_it->second.version < entry->version)
        auto & clean_db = bs->clean_db_shard(entry->oid);
        auto clean_it = clean_db.find(entry->oid);
        if (clean_it == clean_db.end() || clean_it->second.version < entry->version)
        {
            if (clean_it != bs->clean_db.end())
            if (clean_it != clean_db.end())
            {
                // free the previous block
#ifdef BLOCKSTORE_DEBUG
@@ -245,7 +247,7 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
            printf("Allocate block (clean entry) %lu: %lx:%lx v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
#endif
            bs->data_alloc->set(done_cnt+i, true);
            bs->clean_db[entry->oid] = (struct clean_entry){
            clean_db[entry->oid] = (struct clean_entry){
                .version = entry->version,
                .location = (done_cnt+i) << block_order,
            };
@@ -656,8 +658,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
        init_write_sector = proc_pos;
        return 0;
    }
    auto clean_it = bs->clean_db.find(je->small_write.oid);
    if (clean_it == bs->clean_db.end() ||
    auto & clean_db = bs->clean_db_shard(je->small_write.oid);
    auto clean_it = clean_db.find(je->small_write.oid);
    if (clean_it == clean_db.end() ||
        clean_it->second.version < je->small_write.version)
    {
        obj_ver_id ov = {
@@ -735,8 +738,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
            erase_dirty_object(dirty_it);
        }
    }
    auto clean_it = bs->clean_db.find(je->big_write.oid);
    if (clean_it == bs->clean_db.end() ||
    auto & clean_db = bs->clean_db_shard(je->big_write.oid);
    auto clean_it = clean_db.find(je->big_write.oid);
    if (clean_it == clean_db.end() ||
        clean_it->second.version < je->big_write.version)
    {
        // oid, version, block
@@ -841,8 +845,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
        dirty_it--;
        dirty_exists = dirty_it->first.oid == je->del.oid;
    }
    auto clean_it = bs->clean_db.find(je->del.oid);
    bool clean_exists = (clean_it != bs->clean_db.end() &&
    auto & clean_db = bs->clean_db_shard(je->del.oid);
    auto clean_it = clean_db.find(je->del.oid);
    bool clean_exists = (clean_it != clean_db.end() &&
        clean_it->second.version < je->del.version);
    if (!clean_exists && dirty_exists)
    {
@@ -901,8 +906,9 @@ void blockstore_init_journal::erase_dirty_object(blockstore_dirty_db_t::iterator
            break;
        }
    }
    auto clean_it = bs->clean_db.find(oid);
    uint64_t clean_loc = clean_it != bs->clean_db.end()
    auto & clean_db = bs->clean_db_shard(oid);
    auto clean_it = clean_db.find(oid);
    uint64_t clean_loc = clean_it != clean_db.end()
        ? clean_it->second.location : UINT64_MAX;
    if (exists && clean_loc == UINT64_MAX)
    {
@@ -111,6 +111,7 @@ uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offse

int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
{
    auto & clean_db = clean_db_shard(read_op->oid);
    auto clean_it = clean_db.find(read_op->oid);
    auto dirty_it = dirty_db.upper_bound((obj_ver_id){
        .oid = read_op->oid,
@@ -297,6 +298,7 @@ int blockstore_impl_t::read_bitmap(object_id oid, uint64_t target_version, void
            dirty_it--;
        }
    }
    auto & clean_db = clean_db_shard(oid);
    auto clean_it = clean_db.find(oid);
    if (clean_it != clean_db.end())
    {
@@ -54,6 +54,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
    auto dirty_it = dirty_db.find(*v);
    if (dirty_it == dirty_db.end())
    {
        auto & clean_db = clean_db_shard(v->oid);
        auto clean_it = clean_db.find(v->oid);
        if (clean_it == clean_db.end() || clean_it->second.version < v->version)
        {
@@ -188,6 +189,7 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
    }
    if (exists == -1)
    {
        auto & clean_db = clean_db_shard(v.oid);
        auto clean_it = clean_db.find(v.oid);
        exists = clean_it != clean_db.end() ? 1 : 0;
    }
@@ -215,6 +217,7 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
            break;
        }
    }
    auto & clean_db = clean_db_shard(v.oid);
    auto clean_it = clean_db.find(v.oid);
    uint64_t clean_loc = clean_it != clean_db.end()
        ? clean_it->second.location : UINT64_MAX;
@@ -41,6 +41,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
    }
    if (!found)
    {
        auto & clean_db = clean_db_shard(op->oid);
        auto clean_it = clean_db.find(op->oid);
        if (clean_it != clean_db.end())
        {
@@ -543,12 +544,13 @@ resume_4:
    if (ref_us > exec_us + throttle_threshold_us)
    {
        // Pause reply
        PRIV(op)->op_state = 5;
        // Remember that the timer can in theory be called right here
        tfd->set_timer_us(ref_us-exec_us, false, [this, op](int timer_id)
        {
            PRIV(op)->op_state++;
            ringloop->wakeup();
        });
        PRIV(op)->op_state = 5;
        return 1;
    }
}
292
src/cli.cpp
@@ -2,8 +2,7 @@
// License: VNPL-1.1 (see README.md for details)

/**
 * CLI tool
 * Currently can (a) remove inodes and (b) merge snapshot/clone layers
 * CLI tool and also a library for administrative tasks
 */

#include <vector>
@@ -17,7 +16,9 @@

static const char *exe_name = NULL;

json11::Json::object cli_tool_t::parse_args(int narg, const char *args[])
static void help();

static json11::Json::object parse_args(int narg, const char *args[])
{
    json11::Json::object cfg;
    json11::Json::array cmd;
@@ -79,13 +80,16 @@ json11::Json::object cli_tool_t::parse_args(int narg, const char *args[])
    return cfg;
}

void cli_tool_t::help()
static void help()
{
    printf(
        "Vitastor command-line tool\n"
        "(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
        "\n"
        "USAGE:\n"
        "%s status\n"
        "  Show cluster status\n"
        "\n"
        "%s df\n"
        "  Show pool space statistics\n"
        "\n"
@@ -155,223 +159,177 @@ void cli_tool_t::help()
        "  --no-color  Disable colored output\n"
        "  --json      JSON output\n"
        ,
        exe_name, exe_name, exe_name, exe_name, exe_name, exe_name,
        exe_name, exe_name, exe_name, exe_name, exe_name, exe_name, exe_name,
        exe_name, exe_name, exe_name, exe_name, exe_name, exe_name
    );
    exit(0);
}

void cli_tool_t::change_parent(inode_t cur, inode_t new_parent)
{
    auto cur_cfg_it = cli->st_cli.inode_config.find(cur);
    if (cur_cfg_it == cli->st_cli.inode_config.end())
    {
        fprintf(stderr, "Inode 0x%lx disappeared\n", cur);
        exit(1);
    }
    inode_config_t new_cfg = cur_cfg_it->second;
    std::string cur_name = new_cfg.name;
    std::string cur_cfg_key = base64_encode(cli->st_cli.etcd_prefix+
        "/config/inode/"+std::to_string(INODE_POOL(cur))+
        "/"+std::to_string(INODE_NO_POOL(cur)));
    new_cfg.parent_id = new_parent;
    json11::Json::object cur_cfg_json = cli->st_cli.serialize_inode_cfg(&new_cfg);
    waiting++;
    cli->st_cli.etcd_txn_slow(json11::Json::object {
        { "compare", json11::Json::array {
            json11::Json::object {
                { "target", "MOD" },
                { "key", cur_cfg_key },
                { "result", "LESS" },
                { "mod_revision", new_cfg.mod_revision+1 },
            },
        } },
        { "success", json11::Json::array {
            json11::Json::object {
                { "request_put", json11::Json::object {
                    { "key", cur_cfg_key },
                    { "value", base64_encode(json11::Json(cur_cfg_json).dump()) },
                } }
            },
        } },
    }, [this, new_parent, cur, cur_name](std::string err, json11::Json res)
    {
        if (err != "")
        {
            fprintf(stderr, "Error changing parent of %s: %s\n", cur_name.c_str(), err.c_str());
            exit(1);
        }
        if (!res["succeeded"].bool_value())
        {
            fprintf(stderr, "Inode %s was modified during snapshot deletion\n", cur_name.c_str());
            exit(1);
        }
        if (new_parent)
        {
            auto new_parent_it = cli->st_cli.inode_config.find(new_parent);
            std::string new_parent_name = new_parent_it != cli->st_cli.inode_config.end()
                ? new_parent_it->second.name : "<unknown>";
            printf(
                "Parent of layer %s (inode %lu in pool %u) changed to %s (inode %lu in pool %u)\n",
                cur_name.c_str(), INODE_NO_POOL(cur), INODE_POOL(cur),
                new_parent_name.c_str(), INODE_NO_POOL(new_parent), INODE_POOL(new_parent)
            );
        }
        else
        {
            printf(
                "Parent of layer %s (inode %lu in pool %u) detached\n",
                cur_name.c_str(), INODE_NO_POOL(cur), INODE_POOL(cur)
            );
        }
        waiting--;
        ringloop->wakeup();
    });
}

void cli_tool_t::etcd_txn(json11::Json txn)
{
    waiting++;
    cli->st_cli.etcd_txn_slow(txn, [this](std::string err, json11::Json res)
    {
        waiting--;
        if (err != "")
        {
            fprintf(stderr, "Error reading from etcd: %s\n", err.c_str());
            exit(1);
        }
        etcd_result = res;
        ringloop->wakeup();
    });
}

inode_config_t* cli_tool_t::get_inode_cfg(const std::string & name)
{
    for (auto & ic: cli->st_cli.inode_config)
    {
        if (ic.second.name == name)
        {
            return &ic.second;
        }
    }
    fprintf(stderr, "Layer %s not found\n", name.c_str());
    exit(1);
}

void cli_tool_t::run(json11::Json cfg)
static int run(cli_tool_t *p, json11::Json::object cfg)
{
    cli_result_t result;
    p->parse_config(cfg);
    json11::Json::array cmd = cfg["command"].array_items();
    cfg.erase("command");
    std::function<bool(cli_result_t &)> action_cb;
    if (!cmd.size())
    {
        fprintf(stderr, "command is missing\n");
        exit(1);
        result = { .err = EINVAL, .text = "command is missing" };
    }
    else if (cmd[0] == "status")
    {
        // Show cluster status
        action_cb = p->start_status(cfg);
    }
    else if (cmd[0] == "df")
    {
        // Show pool space stats
        action_cb = start_df(cfg);
        action_cb = p->start_df(cfg);
    }
    else if (cmd[0] == "ls")
    {
        // List images
        action_cb = start_ls(cfg);
        if (cmd.size() > 1)
        {
            cmd.erase(cmd.begin(), cmd.begin()+1);
            cfg["names"] = cmd;
        }
        action_cb = p->start_ls(cfg);
    }
    else if (cmd[0] == "create" || cmd[0] == "snap-create")
    else if (cmd[0] == "snap-create")
    {
        // Create snapshot
        std::string name = cmd.size() > 1 ? cmd[1].string_value() : "";
        int pos = name.find('@');
        if (pos == std::string::npos || pos == name.length()-1)
        {
            result = (cli_result_t){ .err = EINVAL, .text = "Please specify new snapshot name after @" };
        }
        else
        {
            cfg["image"] = name.substr(0, pos);
            cfg["snapshot"] = name.substr(pos + 1);
            action_cb = p->start_create(cfg);
        }
    }
    else if (cmd[0] == "create")
    {
        // Create image/snapshot
        action_cb = start_create(cfg);
        if (cmd.size() > 1)
        {
            cfg["image"] = cmd[1];
        }
        action_cb = p->start_create(cfg);
    }
    else if (cmd[0] == "modify")
    {
        // Modify image
        action_cb = start_modify(cfg);
        if (cmd.size() > 1)
        {
            cfg["image"] = cmd[1];
        }
        action_cb = p->start_modify(cfg);
    }
    else if (cmd[0] == "rm-data")
    {
        // Delete inode data
        action_cb = start_rm(cfg);
        action_cb = p->start_rm_data(cfg);
    }
    else if (cmd[0] == "merge-data")
    {
        // Merge layer data without affecting metadata
        action_cb = start_merge(cfg);
        if (cmd.size() > 1)
        {
            cfg["from"] = cmd[1];
            if (cmd.size() > 2)
                cfg["to"] = cmd[2];
        }
        action_cb = p->start_merge(cfg);
    }
    else if (cmd[0] == "flatten")
    {
        // Merge all parent layers into an image and detach it from them
        action_cb = start_flatten(cfg);
        if (cmd.size() > 1)
        {
            cfg["image"] = cmd[1];
        }
        action_cb = p->start_flatten(cfg);
    }
    else if (cmd[0] == "rm")
    {
        // Remove multiple snapshots and rebase their children
        action_cb = start_snap_rm(cfg);
        if (cmd.size() > 1)
        {
            cfg["from"] = cmd[1];
            if (cmd.size() > 2)
                cfg["to"] = cmd[2];
        }
        action_cb = p->start_rm(cfg);
    }
    else if (cmd[0] == "alloc-osd")
    {
        // Allocate a new OSD number
        action_cb = start_alloc_osd(cfg);
        action_cb = p->start_alloc_osd(cfg);
    }
    else if (cmd[0] == "simple-offsets")
    {
        // Calculate offsets for simple & stupid OSD deployment without superblock
        action_cb = simple_offsets(cfg);
        if (cmd.size() > 1)
        {
            cfg["device"] = cmd[1];
        }
        action_cb = p->simple_offsets(cfg);
    }
    else
    {
        fprintf(stderr, "unknown command: %s\n", cmd[0].string_value().c_str());
        exit(1);
        result = { .err = EINVAL, .text = "unknown command: "+cmd[0].string_value() };
    }
    if (action_cb == NULL)
    if (action_cb != NULL)
    {
        return;
    }
    color = !cfg["no-color"].bool_value();
    json_output = cfg["json"].bool_value();
    iodepth = cfg["iodepth"].uint64_value();
    if (!iodepth)
        iodepth = 32;
    parallel_osds = cfg["parallel_osds"].uint64_value();
    if (!parallel_osds)
        parallel_osds = 4;
    log_level = cfg["log_level"].int64_value();
    progress = cfg["progress"].uint64_value() ? true : false;
    list_first = cfg["wait-list"].uint64_value() ? true : false;
    // Create client
    ringloop = new ring_loop_t(512);
    epmgr = new epoll_manager_t(ringloop);
    cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
    cli->on_ready([this]()
    {
        // Initialize job
        consumer.loop = [this]()
        // Create client
        json11::Json cfg_j = cfg;
        p->ringloop = new ring_loop_t(512);
        p->epmgr = new epoll_manager_t(p->ringloop);
        p->cli = new cluster_client_t(p->ringloop, p->epmgr->tfd, cfg_j);
        // Smaller timeout by default for more interactiveness
        p->cli->st_cli.etcd_slow_timeout = p->cli->st_cli.etcd_quick_timeout;
        p->loop_and_wait(action_cb, [&](const cli_result_t & r)
        {
            result = r;
            action_cb = NULL;
        });
        // Loop until it completes
        while (action_cb != NULL)
        {
            p->ringloop->loop();
            if (action_cb != NULL)
            {
                bool done = action_cb();
                if (done)
                {
                    action_cb = NULL;
                }
            }
            ringloop->submit();
        };
        ringloop->register_consumer(&consumer);
        consumer.loop();
    });
    // Loop until it completes
    while (action_cb != NULL)
    {
        ringloop->loop();
        if (action_cb != NULL)
            ringloop->wait();
            p->ringloop->wait();
    }
        // Destroy the client
        delete p->cli;
        delete p->epmgr;
        delete p->ringloop;
        p->cli = NULL;
        p->epmgr = NULL;
        p->ringloop = NULL;
    }
    // Destroy the client
    delete cli;
    delete epmgr;
    delete ringloop;
    cli = NULL;
    epmgr = NULL;
    ringloop = NULL;
    // Print result
    if (p->json_output && !result.data.is_null())
    {
        printf("%s\n", result.data.dump().c_str());
    }
    else if (p->json_output && result.err)
    {
        printf("%s\n", json11::Json(json11::Json::object {
            { "error_code", result.err },
            { "error_text", result.text },
        }).dump().c_str());
    }
    else if (result.text != "")
    {
        fprintf(result.err ? stderr : stdout, result.text[result.text.size()-1] == '\n' ? "%s" : "%s\n", result.text.c_str());
    }
    return result.err;
}

int main(int narg, const char *args[])
@@ -380,7 +338,7 @@ int main(int narg, const char *args[])
    setvbuf(stderr, NULL, _IONBF, 0);
    exe_name = args[0];
    cli_tool_t *p = new cli_tool_t();
    p->run(cli_tool_t::parse_args(narg, args));
    int r = run(p, parse_args(narg, args));
    delete p;
    return 0;
    return r;
}
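After this rewrite, positional arguments are folded into the config object before any action starts; for `snap-create` that means splitting `image@snapshot` on the `@` character. The split logic, sketched in Python with an example argument:

```python
# The snap-create argument split sketched in Python: everything before '@'
# is the image name, everything after is the new snapshot name.
def parse_snap_create(name):
    pos = name.find('@')
    if pos < 0 or pos == len(name) - 1:
        raise ValueError('Please specify new snapshot name after @')
    return {'image': name[:pos], 'snapshot': name[pos+1:]}

print(parse_snap_create('testimg@snap1'))
# -> {'image': 'testimg', 'snapshot': 'snap1'}
```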
47
src/cli.h
@@ -19,11 +19,18 @@ class epoll_manager_t;
class cluster_client_t;
struct inode_config_t;

struct cli_result_t
{
    int err;
    std::string text;
    json11::Json data;
};

class cli_tool_t
{
public:
    uint64_t iodepth = 0, parallel_osds = 0;
    bool progress = true;
    uint64_t iodepth = 4, parallel_osds = 32;
    bool progress = false;
    bool list_first = false;
    bool json_output = false;
    int log_level = 0;
@@ -34,33 +41,33 @@ public:
    cluster_client_t *cli = NULL;

    int waiting = 0;
    cli_result_t etcd_err;
    json11::Json etcd_result;
    ring_consumer_t consumer;
    std::function<bool(void)> action_cb;

    void run(json11::Json cfg);
    void parse_config(json11::Json cfg);

    void change_parent(inode_t cur, inode_t new_parent);
    void change_parent(inode_t cur, inode_t new_parent, cli_result_t *result);
    inode_config_t* get_inode_cfg(const std::string & name);

    static json11::Json::object parse_args(int narg, const char *args[]);
    static void help();

    friend struct rm_inode_t;
    friend struct snap_merger_t;
    friend struct snap_flattener_t;
    friend struct snap_remover_t;

    std::function<bool(void)> start_df(json11::Json);
    std::function<bool(void)> start_ls(json11::Json);
    std::function<bool(void)> start_create(json11::Json);
    std::function<bool(void)> start_modify(json11::Json);
    std::function<bool(void)> start_rm(json11::Json);
    std::function<bool(void)> start_merge(json11::Json);
    std::function<bool(void)> start_flatten(json11::Json);
    std::function<bool(void)> start_snap_rm(json11::Json);
    std::function<bool(void)> start_alloc_osd(json11::Json cfg, uint64_t *out = NULL);
    std::function<bool(void)> simple_offsets(json11::Json cfg);
    std::function<bool(cli_result_t &)> start_status(json11::Json);
    std::function<bool(cli_result_t &)> start_df(json11::Json);
    std::function<bool(cli_result_t &)> start_ls(json11::Json);
    std::function<bool(cli_result_t &)> start_create(json11::Json);
    std::function<bool(cli_result_t &)> start_modify(json11::Json);
    std::function<bool(cli_result_t &)> start_rm_data(json11::Json);
    std::function<bool(cli_result_t &)> start_merge(json11::Json);
    std::function<bool(cli_result_t &)> start_flatten(json11::Json);
    std::function<bool(cli_result_t &)> start_rm(json11::Json);
    std::function<bool(cli_result_t &)> start_alloc_osd(json11::Json cfg);
    std::function<bool(cli_result_t &)> simple_offsets(json11::Json cfg);

    // Should be called like loop_and_wait(start_status(), <completion callback>)
    void loop_and_wait(std::function<bool(cli_result_t &)> loop_cb, std::function<void(const cli_result_t &)> complete_cb);

    void etcd_txn(json11::Json txn);
};
@@ -69,7 +76,7 @@ uint64_t parse_size(std::string size_str);

std::string print_table(json11::Json items, json11::Json header, bool use_esc);

std::string format_size(uint64_t size);
std::string format_size(uint64_t size, bool nobytes = false);

std::string format_lat(uint64_t lat);
@@ -16,6 +16,7 @@ struct alloc_osd_t
    uint64_t new_id = 1;

    int state = 0;
    cli_result_t result;

    bool is_done()
    {
@@ -62,6 +63,12 @@ struct alloc_osd_t
        state = 1;
        if (parent->waiting > 0)
            return;
        if (parent->etcd_err.err)
        {
            result = parent->etcd_err;
            state = 100;
            return;
        }
        if (!parent->etcd_result["succeeded"].bool_value())
        {
            std::vector<osd_num_t> used;
@@ -99,23 +106,23 @@ struct alloc_osd_t
            }
        } while (!parent->etcd_result["succeeded"].bool_value());
        state = 100;
        result = (cli_result_t){
            .text = std::to_string(new_id),
            .data = json11::Json(new_id),
        };
    }
};

std::function<bool(void)> cli_tool_t::start_alloc_osd(json11::Json cfg, uint64_t *out)
std::function<bool(cli_result_t &)> cli_tool_t::start_alloc_osd(json11::Json cfg)
{
    json11::Json::array cmd = cfg["command"].array_items();
    auto alloc_osd = new alloc_osd_t();
    alloc_osd->parent = this;
    return [alloc_osd, out]()
    return [alloc_osd](cli_result_t & result)
    {
        alloc_osd->loop();
        if (alloc_osd->is_done())
        {
            if (out)
                *out = alloc_osd->new_id;
            else if (alloc_osd->new_id)
                printf("%lu\n", alloc_osd->new_id);
            result = alloc_osd->result;
            delete alloc_osd;
            return true;
        }
149
src/cli_common.cpp
Normal file
@@ -0,0 +1,149 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "base64.h"
|
||||
#include "cluster_client.h"
|
||||
#include "cli.h"
|
||||
|
||||
void cli_tool_t::change_parent(inode_t cur, inode_t new_parent, cli_result_t *result)
|
||||
{
|
||||
auto cur_cfg_it = cli->st_cli.inode_config.find(cur);
|
||||
if (cur_cfg_it == cli->st_cli.inode_config.end())
|
||||
{
|
||||
char buf[128];
|
||||
snprintf(buf, 128, "Inode 0x%lx disappeared", cur);
|
||||
*result = (cli_result_t){ .err = EIO, .text = buf };
|
||||
return;
|
||||
}
|
||||
inode_config_t new_cfg = cur_cfg_it->second;
|
||||
std::string cur_name = new_cfg.name;
|
||||
std::string cur_cfg_key = base64_encode(cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(cur))+
|
||||
"/"+std::to_string(INODE_NO_POOL(cur)));
|
||||
new_cfg.parent_id = new_parent;
|
||||
json11::Json::object cur_cfg_json = cli->st_cli.serialize_inode_cfg(&new_cfg);
|
||||
waiting++;
|
||||
cli->st_cli.etcd_txn_slow(json11::Json::object {
|
||||
{ "compare", json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", cur_cfg_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", new_cfg.mod_revision+1 },
|
||||
},
|
||||
} },
|
||||
{ "success", json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", cur_cfg_key },
|
||||
{ "value", base64_encode(json11::Json(cur_cfg_json).dump()) },
|
||||
} }
|
||||
},
|
||||
} },
|
||||
}, [this, result, new_parent, cur, cur_name](std::string err, json11::Json res)
|
||||
{
|
||||
if (err != "")
|
||||
{
|
||||
*result = (cli_result_t){ .err = EIO, .text = "Error changing parent of "+cur_name+": "+err };
|
||||
}
|
||||
else if (!res["succeeded"].bool_value())
|
||||
{
|
||||
*result = (cli_result_t){ .err = EAGAIN, .text = "Image "+cur_name+" was modified during change" };
|
||||
}
|
||||
else if (new_parent)
|
||||
{
|
||||
auto new_parent_it = cli->st_cli.inode_config.find(new_parent);
|
||||
std::string new_parent_name = new_parent_it != cli->st_cli.inode_config.end()
|
||||
? new_parent_it->second.name : "<unknown>";
|
||||
*result = (cli_result_t){
|
||||
.text = "Parent of layer "+cur_name+" (inode "+std::to_string(INODE_NO_POOL(cur))+
|
||||
" in pool "+std::to_string(INODE_POOL(cur))+") changed to "+new_parent_name+
|
||||
" (inode "+std::to_string(INODE_NO_POOL(new_parent))+" in pool "+std::to_string(INODE_POOL(new_parent))+")",
|
||||
};
|
||||
}
|
||||
else
|
||||
{
|
||||
*result = (cli_result_t){
|
||||
.text = "Parent of layer "+cur_name+" (inode "+std::to_string(INODE_NO_POOL(cur))+
|
||||
" in pool "+std::to_string(INODE_POOL(cur))+") detached",
|
||||
};
|
||||
}
|
||||
waiting--;
|
||||
ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
|
||||
void cli_tool_t::etcd_txn(json11::Json txn)
|
||||
{
|
||||
waiting++;
|
||||
cli->st_cli.etcd_txn_slow(txn, [this](std::string err, json11::Json res)
|
||||
{
|
||||
waiting--;
|
||||
if (err != "")
|
||||
etcd_err = (cli_result_t){ .err = EIO, .text = "Error communicating with etcd: "+err };
|
||||
else
|
||||
etcd_err = (cli_result_t){ .err = 0 };
|
||||
etcd_result = res;
|
||||
ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
|
||||
inode_config_t* cli_tool_t::get_inode_cfg(const std::string & name)
|
||||
{
|
||||
for (auto & ic: cli->st_cli.inode_config)
|
||||
{
|
||||
if (ic.second.name == name)
|
||||
{
|
||||
return &ic.second;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void cli_tool_t::parse_config(json11::Json cfg)
|
||||
{
|
||||
color = !cfg["no-color"].bool_value();
|
||||
json_output = cfg["json"].bool_value();
|
||||
iodepth = cfg["iodepth"].uint64_value();
|
||||
if (!iodepth)
|
||||
iodepth = 32;
|
||||
parallel_osds = cfg["parallel_osds"].uint64_value();
|
||||
if (!parallel_osds)
|
||||
parallel_osds = 4;
|
||||
log_level = cfg["log_level"].int64_value();
|
||||
progress = cfg["progress"].uint64_value() ? true : false;
|
||||
list_first = cfg["wait-list"].uint64_value() ? true : false;
|
||||
}
|
||||
|
||||
struct cli_result_looper_t
|
||||
{
|
||||
ring_consumer_t consumer;
|
||||
cli_result_t result;
|
||||
std::function<bool(cli_result_t &)> loop_cb;
|
||||
std::function<void(const cli_result_t &)> complete_cb;
|
||||
};
|
||||
|
||||
void cli_tool_t::loop_and_wait(std::function<bool(cli_result_t &)> loop_cb, std::function<void(const cli_result_t &)> complete_cb)
|
||||
{
|
||||
auto *looper = new cli_result_looper_t();
|
||||
looper->loop_cb = loop_cb;
|
||||
looper->complete_cb = complete_cb;
|
||||
looper->consumer.loop = [this, looper]()
|
||||
{
|
||||
bool done = looper->loop_cb(looper->result);
|
||||
if (done)
|
||||
{
|
||||
ringloop->unregister_consumer(&looper->consumer);
|
||||
looper->loop_cb = NULL;
|
||||
looper->complete_cb(looper->result);
|
||||
delete looper;
|
||||
return;
|
||||
}
|
||||
ringloop->submit();
|
||||
};
|
||||
cli->on_ready([this, looper]()
|
||||
{
|
||||
ringloop->register_consumer(&looper->consumer);
|
||||
ringloop->wakeup();
|
||||
});
|
||||
}
|
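All commands in this refactoring share one calling convention: each start_*() factory returns a std::function<bool(cli_result_t &)> that is polled until it returns true with the result filled in, and loop_and_wait() above adapts such a callback to the ring loop. A minimal caller sketch (hypothetical helper, not part of this diff, using only the cli_tool_t API visible above):

    // Hypothetical driver: runs one command and prints its result.
    void run_command(cli_tool_t *p, json11::Json cfg)
    {
        p->loop_and_wait(p->start_ls(cfg), [](const cli_result_t & res)
        {
            // err is an errno-style code; text is the human-readable output
            fprintf(res.err ? stderr : stdout, "%s\n", res.text.c_str());
        });
        // loop_and_wait() only registers a ring loop consumer, so this
        // returns immediately; polling continues from the event loop.
    }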
@@ -25,14 +25,18 @@ struct image_creator_t
    pool_id_t new_pool_id = 0;
    std::string new_pool_name;
    std::string image_name, new_snap, new_parent;
    json11::Json new_meta;
    uint64_t size;
+   bool force_size = false;

    pool_id_t old_pool_id = 0;
    inode_t new_parent_id = 0;
    inode_t new_id = 0, old_id = 0;
    uint64_t max_id_mod_rev = 0, cfg_mod_rev = 0, idx_mod_rev = 0;
+   inode_config_t new_cfg;

    int state = 0;
+   cli_result_t result;

    bool is_done()
    {
@@ -43,13 +47,27 @@ struct image_creator_t
    {
        if (state >= 1)
            goto resume_1;
+       if (image_name == "")
+       {
+           // FIXME: EINVAL -> specific codes for every error
+           result = (cli_result_t){ .err = EINVAL, .text = "Image name is missing" };
+           state = 100;
+           return;
+       }
+       if (image_name.find('@') != std::string::npos)
+       {
+           result = (cli_result_t){ .err = EINVAL, .text = "Image name can't contain @ character" };
+           state = 100;
+           return;
+       }
        if (new_pool_id)
        {
            auto & pools = parent->cli->st_cli.pool_config;
            if (pools.find(new_pool_id) == pools.end())
            {
-               fprintf(stderr, "Pool %u does not exist\n", new_pool_id);
-               exit(1);
+               result = (cli_result_t){ .err = ENOENT, .text = "Pool "+std::to_string(new_pool_id)+" does not exist" };
+               state = 100;
+               return;
            }
        }
        else if (new_pool_name != "")
@@ -64,8 +82,9 @@ struct image_creator_t
            }
            if (!new_pool_id)
            {
-               fprintf(stderr, "Pool %s does not exist\n", new_pool_name.c_str());
-               exit(1);
+               result = (cli_result_t){ .err = ENOENT, .text = "Pool "+new_pool_name+" does not exist" };
+               state = 100;
+               return;
            }
        }
        else if (parent->cli->st_cli.pool_config.size() == 1)
@@ -91,8 +110,9 @@ struct image_creator_t
        {
            if (ic.second.name == image_name)
            {
-               fprintf(stderr, "Image %s already exists\n", image_name.c_str());
-               exit(1);
+               result = (cli_result_t){ .err = EEXIST, .text = "Image "+image_name+" already exists" };
+               state = 100;
+               return;
            }
            if (ic.second.name == new_parent)
            {
@@ -109,18 +129,21 @@ struct image_creator_t
        }
        if (new_parent != "" && !new_parent_id)
        {
-           fprintf(stderr, "Parent image not found\n");
-           exit(1);
+           result = (cli_result_t){ .err = ENOENT, .text = "Parent image "+new_parent+" not found" };
+           state = 100;
+           return;
        }
        if (!new_pool_id)
        {
-           fprintf(stderr, "Pool name or ID is missing\n");
-           exit(1);
+           result = (cli_result_t){ .err = EINVAL, .text = "Pool name or ID is missing" };
+           state = 100;
+           return;
        }
-       if (!size)
+       if (!size && !force_size)
        {
-           fprintf(stderr, "Image size is missing\n");
-           exit(1);
+           result = (cli_result_t){ .err = EINVAL, .text = "Image size is missing" };
+           state = 100;
+           return;
        }
        do
        {
@@ -131,23 +154,36 @@ struct image_creator_t
    resume_2:
        if (parent->waiting > 0)
            return;
+       if (parent->etcd_err.err)
+       {
+           result = parent->etcd_err;
+           state = 100;
+           return;
+       }
        extract_next_id(parent->etcd_result["responses"][0]);
        attempt_create();
        state = 3;
    resume_3:
        if (parent->waiting > 0)
            return;
+       if (parent->etcd_err.err)
+       {
+           result = parent->etcd_err;
+           state = 100;
+           return;
+       }
        if (!parent->etcd_result["succeeded"].bool_value() &&
            parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items().size() > 0)
        {
-           fprintf(stderr, "Image %s already exists\n", image_name.c_str());
-           exit(1);
+           result = (cli_result_t){ .err = EEXIST, .text = "Image "+image_name+" already exists" };
+           state = 100;
+           return;
        }
        } while (!parent->etcd_result["succeeded"].bool_value());
        if (parent->progress)
        {
            printf("Image %s created\n", image_name.c_str());
        }
        // Save into inode_config for library users to be able to take it from there immediately
        new_cfg.mod_revision = parent->etcd_result["responses"][0]["response_put"]["header"]["revision"].uint64_value();
        parent->cli->st_cli.insert_inode_config(new_cfg);
        result = (cli_result_t){ .err = 0, .text = "Image "+image_name+" created" };
        state = 100;
    }

@@ -163,14 +199,16 @@ resume_3:
        {
            if (ic.second.name == image_name+"@"+new_snap)
            {
-               fprintf(stderr, "Snapshot %s@%s already exists\n", image_name.c_str(), new_snap.c_str());
-               exit(1);
+               result = (cli_result_t){ .err = EEXIST, .text = "Snapshot "+image_name+"@"+new_snap+" already exists" };
+               state = 100;
+               return;
            }
        }
        if (new_parent != "")
        {
-           fprintf(stderr, "--parent can't be used with snapshots\n");
-           exit(1);
+           result = (cli_result_t){ .err = EINVAL, .text = "Parent can't be specified for snapshots" };
+           state = 100;
+           return;
        }
        do
        {
@@ -182,8 +220,9 @@ resume_3:
            return;
        if (!old_id)
        {
-           fprintf(stderr, "Image %s does not exist\n", image_name.c_str());
-           exit(1);
+           result = (cli_result_t){ .err = ENOENT, .text = "Image "+image_name+" does not exist" };
+           state = 100;
+           return;
        }
        if (!new_pool_id)
        {
@@ -195,17 +234,24 @@ resume_3:
    resume_4:
        if (parent->waiting > 0)
            return;
+       if (parent->etcd_err.err)
+       {
+           result = parent->etcd_err;
+           state = 100;
+           return;
+       }
        if (!parent->etcd_result["succeeded"].bool_value() &&
            parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items().size() > 0)
        {
-           fprintf(stderr, "Snapshot %s@%s already exists\n", image_name.c_str(), new_snap.c_str());
-           exit(1);
+           result = (cli_result_t){ .err = EEXIST, .text = "Snapshot "+image_name+"@"+new_snap+" already exists" };
+           state = 100;
+           return;
        }
        } while (!parent->etcd_result["succeeded"].bool_value());
        if (parent->progress)
        {
            printf("Snapshot %s@%s created\n", image_name.c_str(), new_snap.c_str());
        }
        // Save into inode_config for library users to be able to take it from there immediately
        new_cfg.mod_revision = parent->etcd_result["responses"][0]["response_put"]["header"]["revision"].uint64_value();
        parent->cli->st_cli.insert_inode_config(new_cfg);
        result = (cli_result_t){ .err = 0, .text = "Snapshot "+image_name+"@"+new_snap+" created" };
        state = 100;
    }

@@ -259,6 +305,12 @@ resume_4:
    resume_2:
        if (parent->waiting > 0)
            return;
+       if (parent->etcd_err.err)
+       {
+           result = parent->etcd_err;
+           state = 100;
+           return;
+       }
        extract_next_id(parent->etcd_result["responses"][0]);
        old_id = 0;
        old_pool_id = 0;
@@ -288,8 +340,9 @@ resume_2:
            idx_mod_rev = kv.mod_revision;
            if (!old_id || !old_pool_id || old_pool_id >= POOL_ID_MAX)
            {
-               fprintf(stderr, "Invalid pool or inode ID in etcd key %s\n", kv.key.c_str());
-               exit(1);
+               result = (cli_result_t){ .err = ENOENT, .text = "Invalid pool or inode ID in etcd key "+kv.key };
+               state = 100;
+               return;
            }
        }
        parent->etcd_txn(json11::Json::object {
@@ -308,6 +361,12 @@ resume_2:
    resume_3:
        if (parent->waiting > 0)
            return;
+       if (parent->etcd_err.err)
+       {
+           result = parent->etcd_err;
+           state = 100;
+           return;
+       }
        {
            auto kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][0]["response_range"]["kvs"][0]);
            size = kv.value["size"].uint64_value();
@@ -324,12 +383,13 @@ resume_3:

    void attempt_create()
    {
-       inode_config_t new_cfg = {
+       new_cfg = {
            .num = INODE_WITH_POOL(new_pool_id, new_id),
            .name = image_name,
            .size = size,
            .parent_id = (new_snap != "" ? INODE_WITH_POOL(old_pool_id, old_id) : new_parent_id),
            .readonly = false,
+           .meta = new_meta,
        };
        json11::Json::array checks = json11::Json::array {
            json11::Json::object {
@@ -457,77 +517,76 @@ uint64_t parse_size(std::string size_str)
    if (type_char == 'k' || type_char == 'm' || type_char == 'g' || type_char == 't')
    {
        if (type_char == 'k')
-           mul = 1l<<10;
+           mul = (uint64_t)1<<10;
        else if (type_char == 'm')
-           mul = 1l<<20;
+           mul = (uint64_t)1<<20;
        else if (type_char == 'g')
-           mul = 1l<<30;
+           mul = (uint64_t)1<<30;
        else /*if (type_char == 't')*/
-           mul = 1l<<40;
+           mul = (uint64_t)1<<40;
        size_str = size_str.substr(0, size_str.length()-1);
    }
    uint64_t size = json11::Json(size_str).uint64_value() * mul;
    if (size == 0 && size_str != "0" && (size_str != "" || mul != 1))
    {
-       fprintf(stderr, "Invalid syntax for size: %s\n", size_str.c_str());
-       exit(1);
+       return UINT64_MAX;
    }
    return size;
}
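For reference, parse_size() above now reports bad input by returning UINT64_MAX instead of exiting. A few expected values, assuming (as the zero-check above implies) that the json11 fork bundled with vitastor converts numeric strings:

    assert(parse_size("128") == 128);                // plain bytes
    assert(parse_size("4g") == ((uint64_t)4 << 30)); // binary suffixes k/m/g/t
    assert(parse_size("foo") == UINT64_MAX);         // invalid syntax, reported to the caller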
-std::function<bool(void)> cli_tool_t::start_create(json11::Json cfg)
+std::function<bool(cli_result_t &)> cli_tool_t::start_create(json11::Json cfg)
{
    json11::Json::array cmd = cfg["command"].array_items();
    auto image_creator = new image_creator_t();
    image_creator->parent = this;
-   image_creator->image_name = cmd.size() > 1 ? cmd[1].string_value() : "";
+   image_creator->image_name = cfg["image"].string_value();
    image_creator->new_pool_id = cfg["pool"].uint64_value();
    image_creator->new_pool_name = cfg["pool"].string_value();
+   image_creator->force_size = cfg["force_size"].bool_value();
+   if (cfg["image_meta"].is_object())
+   {
+       image_creator->new_meta = cfg["image-meta"];
+   }
    if (cfg["snapshot"].string_value() != "")
    {
        image_creator->new_snap = cfg["snapshot"].string_value();
    }
    else if (cmd[0] == "snap-create")
    {
        int p = image_creator->image_name.find('@');
        if (p == std::string::npos || p == image_creator->image_name.length()-1)
        {
            fprintf(stderr, "Please specify new snapshot name after @\n");
            exit(1);
        }
        image_creator->new_snap = image_creator->image_name.substr(p + 1);
        image_creator->image_name = image_creator->image_name.substr(0, p);
    }
    image_creator->new_parent = cfg["parent"].string_value();
    if (cfg["size"].string_value() != "")
    {
        image_creator->size = parse_size(cfg["size"].string_value());
-       if (image_creator->size % 4096)
+       if (image_creator->size == UINT64_MAX)
        {
-           fprintf(stderr, "Size should be a multiple of 4096\n");
-           exit(1);
+           return [size = cfg["size"].string_value()](cli_result_t & result)
+           {
+               result = (cli_result_t){ .err = EINVAL, .text = "Invalid syntax for size: "+size };
+               return true;
+           };
        }
+       if ((image_creator->size % 4096) && !cfg["force_size"].bool_value())
+       {
+           delete image_creator;
+           return [](cli_result_t & result)
+           {
+               result = (cli_result_t){ .err = EINVAL, .text = "Size should be a multiple of 4096" };
+               return true;
+           };
+       }
        if (image_creator->new_snap != "")
        {
-           fprintf(stderr, "--size can't be specified for snapshots\n");
-           exit(1);
+           delete image_creator;
+           return [](cli_result_t & result)
+           {
+               result = (cli_result_t){ .err = EINVAL, .text = "Size can't be specified for snapshots" };
+               return true;
+           };
        }
    }
-   if (image_creator->image_name == "")
-   {
-       fprintf(stderr, "Image name is missing\n");
-       exit(1);
-   }
-   if (image_creator->image_name.find('@') != std::string::npos)
-   {
-       fprintf(stderr, "Image name can't contain @ character\n");
-       exit(1);
-   }
-   return [image_creator]()
+   return [image_creator](cli_result_t & result)
    {
        image_creator->loop();
        if (image_creator->is_done())
        {
+           result = image_creator->result;
            delete image_creator;
            return true;
        }
@@ -12,6 +12,7 @@ struct pool_lister_t

    int state = 0;
    json11::Json space_info;
+   cli_result_t result;
    std::map<pool_id_t, json11::Json::object> pool_stats;

    bool is_done()
@@ -52,6 +53,12 @@ struct pool_lister_t
    resume_1:
        if (parent->waiting > 0)
            return;
+       if (parent->etcd_err.err)
+       {
+           result = parent->etcd_err;
+           state = 100;
+           return;
+       }
        space_info = parent->etcd_result;
        std::map<pool_id_t, uint64_t> osd_free;
        for (auto & kv_item: space_info["responses"][0]["response_range"]["kvs"].array_items())
@@ -124,8 +131,8 @@ resume_1:
            { "scheme_name", pool_cfg.scheme == POOL_SCHEME_REPLICATED
                ? std::to_string(pool_cfg.pg_size)+"/"+std::to_string(pool_cfg.pg_minsize)
                : "EC "+std::to_string(pool_cfg.pg_size-pool_cfg.parity_chunks)+"+"+std::to_string(pool_cfg.parity_chunks) },
-           { "used_raw", (uint64_t)(pool_stats[pool_cfg.id]["used_raw_tb"].number_value() * (1l<<40)) },
-           { "total_raw", (uint64_t)(pool_stats[pool_cfg.id]["total_raw_tb"].number_value() * (1l<<40)) },
+           { "used_raw", (uint64_t)(pool_stats[pool_cfg.id]["used_raw_tb"].number_value() * ((uint64_t)1<<40)) },
+           { "total_raw", (uint64_t)(pool_stats[pool_cfg.id]["total_raw_tb"].number_value() * ((uint64_t)1<<40)) },
            { "max_available", pool_avail },
            { "raw_to_usable", pool_stats[pool_cfg.id]["raw_to_usable"].number_value() },
            { "space_efficiency", pool_stats[pool_cfg.id]["space_efficiency"].number_value() },
@@ -150,10 +157,12 @@ resume_1:
        get_stats();
        if (parent->waiting > 0)
            return;
+       if (state == 100)
+           return;
        if (parent->json_output)
        {
            // JSON output
-           printf("%s\n", json11::Json(to_list()).dump().c_str());
+           result.data = to_list();
            state = 100;
            return;
        }
@@ -206,21 +215,22 @@ resume_1:
                : 100)+"%";
            kv.second["eff_fmt"] = format_q(kv.second["space_efficiency"].number_value()*100)+"%";
        }
-       printf("%s", print_table(to_list(), cols, parent->color).c_str());
+       result.data = to_list();
+       result.text = print_table(result.data, cols, parent->color);
        state = 100;
    }
};

-std::function<bool(void)> cli_tool_t::start_df(json11::Json cfg)
+std::function<bool(cli_result_t &)> cli_tool_t::start_df(json11::Json cfg)
{
    json11::Json::array cmd = cfg["command"].array_items();
    auto lister = new pool_lister_t();
    lister->parent = this;
-   return [lister]()
+   return [lister](cli_result_t & result)
    {
        lister->loop();
        if (lister->is_done())
        {
+           result = lister->result;
            delete lister;
            return true;
        }
@@ -22,12 +22,19 @@ struct snap_flattener_t
    std::string top_parent_name;
    inode_t target_id = 0;
    int state = 0;
-   std::function<bool(void)> merger_cb;
+   std::function<bool(cli_result_t &)> merger_cb;
+   cli_result_t result;

    void get_merge_parents()
    {
        // Get all parents of target
        inode_config_t *target_cfg = parent->get_inode_cfg(target_name);
+       if (!target_cfg)
+       {
+           result = (cli_result_t){ .err = ENOENT, .text = "Layer "+target_name+" not found" };
+           state = 100;
+           return;
+       }
        target_id = target_cfg->num;
        std::vector<inode_t> chain_list;
        inode_config_t *cur = target_cfg;
@@ -37,23 +44,34 @@ struct snap_flattener_t
            auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
            if (it == parent->cli->st_cli.inode_config.end())
            {
-               fprintf(stderr, "Parent inode of layer %s (id %ld) not found\n", cur->name.c_str(), cur->parent_id);
-               exit(1);
+               result = (cli_result_t){
+                   .err = ENOENT,
+                   .text = "Parent inode of layer "+cur->name+" (id "+std::to_string(cur->parent_id)+") does not exist",
+                   .data = json11::Json::object {
+                       { "error", "parent-not-found" },
+                       { "inode_id", cur->num },
+                       { "inode_name", cur->name },
+                       { "parent_id", cur->parent_id },
+                   },
+               };
+               state = 100;
+               return;
            }
            cur = &it->second;
            chain_list.push_back(cur->num);
        }
        if (cur->parent_id != 0)
        {
-           fprintf(stderr, "Layer %s has a loop in parents\n", target_name.c_str());
-           exit(1);
+           result = (cli_result_t){ .err = EBADF, .text = "Layer "+target_name+" has a loop in parents" };
+           state = 100;
+           return;
        }
        top_parent_name = cur->name;
    }

    bool is_done()
    {
-       return state == 5;
+       return state == 100;
    }

    void loop()
@@ -64,11 +82,20 @@ struct snap_flattener_t
            goto resume_2;
        else if (state == 3)
            goto resume_3;
+       if (target_name == "")
+       {
+           result = (cli_result_t){ .err = EINVAL, .text = "Layer to flatten not specified" };
+           state = 100;
+           return;
+       }
        // Get parent layers
        get_merge_parents();
+       if (state == 100)
+           return;
        // Start merger
        merger_cb = parent->start_merge(json11::Json::object {
-           { "command", json11::Json::array{ "merge-data", top_parent_name, target_name } },
+           { "from", top_parent_name },
+           { "to", target_name },
            { "target", target_name },
            { "delete-source", false },
            { "cas", use_cas },
@@ -76,14 +103,19 @@ struct snap_flattener_t
        });
        // Wait for it
    resume_1:
-       while (!merger_cb())
+       while (!merger_cb(result))
        {
            state = 1;
            return;
        }
        merger_cb = NULL;
+       if (result.err)
+       {
+           state = 100;
+           return;
+       }
        // Change parent
-       parent->change_parent(target_id, 0);
+       parent->change_parent(target_id, 0, &result);
        // Wait for it to complete
        state = 2;
    resume_2:
@@ -92,31 +124,26 @@ resume_2:
        state = 3;
    resume_3:
        // Done
-       return;
+       state = 100;
    }
};

-std::function<bool(void)> cli_tool_t::start_flatten(json11::Json cfg)
+std::function<bool(cli_result_t &)> cli_tool_t::start_flatten(json11::Json cfg)
{
    json11::Json::array cmd = cfg["command"].array_items();
    auto flattener = new snap_flattener_t();
    flattener->parent = this;
-   flattener->target_name = cmd.size() > 1 ? cmd[1].string_value() : "";
-   if (flattener->target_name == "")
-   {
-       fprintf(stderr, "Layer to flatten argument is missing\n");
-       exit(1);
-   }
+   flattener->target_name = cfg["image"].string_value();
    flattener->fsync_interval = cfg["fsync-interval"].uint64_value();
    if (!flattener->fsync_interval)
        flattener->fsync_interval = 128;
    if (!cfg["cas"].is_null())
        flattener->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
-   return [flattener]()
+   return [flattener](cli_result_t & result)
    {
        flattener->loop();
        if (flattener->is_done())
        {
+           result = flattener->result;
            delete flattener;
            return true;
        }
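snap_flattener_t above, like every command in this series, is a hand-rolled coroutine: loop() is re-entered by the event loop, jumps to a resume_N label according to the saved state, and state 100 uniformly means "finished, result is filled in". A stripped-down sketch of the pattern (illustrative only; start_async_step() and still_waiting() are hypothetical stand-ins for the etcd_txn()/parent->waiting machinery):

    struct two_step_op_t
    {
        int state = 0;
        cli_result_t result;
        void start_async_step();   // would issue an async request
        bool still_waiting();      // would check parent->waiting > 0
        bool is_done() { return state == 100; }
        void loop()
        {
            if (state == 1)
                goto resume_1;
            start_async_step();
            state = 1;
        resume_1:
            if (still_waiting())
                return;            // the ring loop re-enters loop() later
            result = (cli_result_t){ .err = 0, .text = "done" };
            state = 100;           // shared terminal state
        }
    };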
@@ -24,6 +24,7 @@ struct image_lister_t
    int state = 0;
    std::map<inode_t, json11::Json::object> stats;
    json11::Json space_info;
+   cli_result_t result;

    bool is_done()
    {
@@ -44,8 +45,9 @@ struct image_lister_t
        }
        if (!list_pool_id)
        {
-           fprintf(stderr, "Pool %s does not exist\n", list_pool_name.c_str());
-           exit(1);
+           result = (cli_result_t){ .err = ENOENT, .text = "Pool "+list_pool_name+" does not exist" };
+           state = 100;
+           return;
        }
    }
    for (auto & ic: parent->cli->st_cli.inode_config)
@@ -116,6 +118,12 @@ struct image_lister_t
    resume_1:
        if (parent->waiting > 0)
            return;
+       if (parent->etcd_err.err)
+       {
+           result = parent->etcd_err;
+           state = 100;
+           return;
+       }
        space_info = parent->etcd_result;
        std::map<pool_id_t, uint64_t> pool_pg_real_size;
        for (auto & kv_item: space_info["responses"][0]["response_range"]["kvs"].array_items())
@@ -245,11 +253,13 @@ resume_1:
            get_stats();
            if (parent->waiting > 0)
                return;
+           if (state == 100)
+               return;
        }
+       result.data = to_list();
        if (parent->json_output)
        {
            // JSON output
-           printf("%s\n", json11::Json(to_list()).dump().c_str());
            state = 100;
            return;
        }
@@ -359,7 +369,7 @@ resume_1:
            kv.second["size_fmt"] = format_size(kv.second["size"].uint64_value());
            kv.second["ro"] = kv.second["readonly"].bool_value() ? "RO" : "-";
        }
-       printf("%s", print_table(to_list(), cols, parent->color).c_str());
+       result.text = print_table(to_list(), cols, parent->color);
        state = 100;
    }
};
@@ -436,23 +446,26 @@ std::string print_table(json11::Json items, json11::Json header, bool use_esc)
    return str;
}

-static uint64_t size_thresh[] = { 1024l*1024*1024*1024, 1024l*1024*1024, 1024l*1024, 1024, 0 };
+static uint64_t size_thresh[] = { (uint64_t)1024*1024*1024*1024, (uint64_t)1024*1024*1024, (uint64_t)1024*1024, 1024, 0 };
+static uint64_t size_thresh_d[] = { (uint64_t)1000000000000, (uint64_t)1000000000, (uint64_t)1000000, (uint64_t)1000, 0 };
+static const int size_thresh_n = sizeof(size_thresh)/sizeof(size_thresh[0]);
static const char *size_unit = "TGMKB";

-std::string format_size(uint64_t size)
+std::string format_size(uint64_t size, bool nobytes)
{
+   uint64_t *thr = nobytes ? size_thresh_d : size_thresh;
    char buf[256];
-   for (int i = 0; i < sizeof(size_thresh)/sizeof(size_thresh[0]); i++)
+   for (int i = 0; i < size_thresh_n; i++)
    {
-       if (size >= size_thresh[i] || i >= sizeof(size_thresh)/sizeof(size_thresh[0])-1)
+       if (size >= thr[i] || i >= size_thresh_n-1)
        {
-           double value = size_thresh[i] ? (double)size/size_thresh[i] : size;
+           double value = thr[i] ? (double)size/thr[i] : size;
            int l = snprintf(buf, sizeof(buf), "%.1f", value);
            assert(l < sizeof(buf)-2);
            if (buf[l-1] == '0')
                l -= 2;
-           buf[l] = ' ';
-           buf[l+1] = size_unit[i];
+           buf[l] = i == size_thresh_n-1 && nobytes ? 0 : ' ';
+           buf[l+1] = i == size_thresh_n-1 && nobytes ? 0 : size_unit[i];
            buf[l+2] = 0;
            break;
        }
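With the nobytes variant added above, format_size() can also render decimal, unit-less values. Expected outputs, derived from the thresholds above (and assuming the declaration defaults nobytes to false):

    // format_size(100)               -> "100 B"
    // format_size((uint64_t)3 << 30) -> "3 G"
    // format_size(1500, true)        -> "1.5 K"  (decimal thresholds)
    // format_size(999, true)         -> "999"    (nobytes at the last threshold: unit suppressed)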
@@ -543,9 +556,8 @@ back:
    return true;
}

-std::function<bool(void)> cli_tool_t::start_ls(json11::Json cfg)
+std::function<bool(cli_result_t &)> cli_tool_t::start_ls(json11::Json cfg)
{
-   json11::Json::array cmd = cfg["command"].array_items();
    auto lister = new image_lister_t();
    lister->parent = this;
    lister->list_pool_id = cfg["pool"].uint64_value();
@@ -555,15 +567,16 @@ std::function<bool(void)> cli_tool_t::start_ls(json11::Json cfg)
    lister->sort_field = cfg["sort"].string_value();
    lister->reverse = cfg["reverse"].bool_value();
    lister->max_count = cfg["count"].uint64_value();
-   for (int i = 1; i < cmd.size(); i++)
+   for (auto & item: cfg["names"].array_items())
    {
-       lister->only_names.insert(cmd[i].string_value());
+       lister->only_names.insert(item.string_value());
    }
-   return [lister]()
+   return [lister](cli_result_t & result)
    {
        lister->loop();
        if (lister->is_done())
        {
+           result = lister->result;
            delete lister;
            return true;
        }
@@ -12,6 +12,9 @@ struct snap_rw_op_t
    cluster_op_t op;
    int todo = 0;
    uint32_t start = 0, end = 0;
+   int error_code = 0;
+   uint64_t error_offset = 0;
+   bool error_read = false;
};

// Layer merge is the base for multiple operations:
@@ -54,17 +57,45 @@ struct snap_merger_t
    uint64_t last_written_offset = 0;
    int deleted_unsynced = 0;
    uint64_t processed = 0, to_process = 0;
+   std::string rwo_error;
+
+   cli_result_t result;

    void start_merge()
    {
+       if (from_name == "" || to_name == "")
+       {
+           result = (cli_result_t){ .err = EINVAL, .text = "Beginning or end of the merge sequence is missing" };
+           state = 100;
+           return;
+       }
        check_delete_source = delete_source || check_delete_source;
        inode_config_t *from_cfg = parent->get_inode_cfg(from_name);
+       if (!from_cfg)
+       {
+           result = (cli_result_t){ .err = ENOENT, .text = "Layer "+from_name+" not found" };
+           state = 100;
+           return;
+       }
        inode_config_t *to_cfg = parent->get_inode_cfg(to_name);
+       if (!to_cfg)
+       {
+           result = (cli_result_t){ .err = ENOENT, .text = "Layer "+to_name+" not found" };
+           state = 100;
+           return;
+       }
        inode_config_t *target_cfg = target_name == "" ? from_cfg : parent->get_inode_cfg(target_name);
+       if (!target_cfg)
+       {
+           result = (cli_result_t){ .err = ENOENT, .text = "Layer "+target_name+" not found" };
+           state = 100;
+           return;
+       }
        if (to_cfg->num == from_cfg->num)
        {
-           fprintf(stderr, "Only one layer specified, nothing to merge\n");
-           exit(1);
+           result = (cli_result_t){ .err = EINVAL, .text = "Only one layer specified, nothing to merge" };
+           state = 100;
+           return;
        }
        // Check that to_cfg is actually a child of from_cfg and target_cfg is somewhere between them
        std::vector<inode_t> chain_list;
@@ -78,8 +109,18 @@ struct snap_merger_t
            auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
            if (it == parent->cli->st_cli.inode_config.end())
            {
-               fprintf(stderr, "Parent inode of layer %s (id %ld) not found\n", cur->name.c_str(), cur->parent_id);
-               exit(1);
+               result = (cli_result_t){
+                   .err = ENOENT,
+                   .text = "Parent inode of layer "+cur->name+" (id "+std::to_string(cur->parent_id)+") does not exist",
+                   .data = json11::Json::object {
+                       { "error", "parent-not-found" },
+                       { "inode_id", cur->num },
+                       { "inode_name", cur->name },
+                       { "parent_id", cur->parent_id },
+                   },
+               };
+               state = 100;
+               return;
            }
            cur = &it->second;
            chain_list.push_back(cur->num);
@@ -87,8 +128,9 @@ struct snap_merger_t
        }
        if (cur->parent_id != from_cfg->num)
        {
-           fprintf(stderr, "Layer %s is not a child of %s\n", to_name.c_str(), from_name.c_str());
-           exit(1);
+           result = (cli_result_t){ .err = EINVAL, .text = "Layer "+to_name+" is not a child of "+from_name };
+           state = 100;
+           return;
        }
        chain_list.push_back(from_cfg->num);
        layer_block_size[from_cfg->num] = get_block_size(from_cfg->num);
@@ -99,8 +141,9 @@ struct snap_merger_t
        }
        if (sources.find(target_cfg->num) == sources.end())
        {
-           fprintf(stderr, "Layer %s is not between %s and %s\n", target_name.c_str(), to_name.c_str(), from_name.c_str());
-           exit(1);
+           result = (cli_result_t){ .err = EINVAL, .text = "Layer "+target_name+" is not between "+to_name+" and "+from_name };
+           state = 100;
+           return;
        }
        target = target_cfg->num;
        target_rank = sources.at(target);
@@ -130,14 +173,15 @@ struct snap_merger_t
            int parent_rank = it->second;
            if (parent_rank < to_rank && (parent_rank >= target_rank || check_delete_source))
            {
-               fprintf(
-                   stderr, "Layers at or above %s, but below %s are not allowed"
-                   " to have other children, but %s is a child of %s\n",
-                   (check_delete_source ? from_name.c_str() : target_name.c_str()),
-                   to_name.c_str(), ic.second.name.c_str(),
-                   parent->cli->st_cli.inode_config.at(ic.second.parent_id).name.c_str()
-               );
-               exit(1);
+               result = (cli_result_t){
+                   .err = EINVAL,
+                   .text = "Layers at or above "+(check_delete_source ? from_name : target_name)+
+                       ", but below "+to_name+" are not allowed to have other children, but "+
+                       ic.second.name+" is a child of "+
+                       parent->cli->st_cli.inode_config.at(ic.second.parent_id).name,
+               };
+               state = 100;
+               return;
            }
            if (parent_rank >= to_rank)
            {
@@ -152,11 +196,14 @@ struct snap_merger_t
            use_cas = 0;
        }
        sources.erase(target);
-       printf(
-           "Merging %ld layer(s) into target %s%s (inode %lu in pool %u)\n",
-           sources.size(), target_cfg->name.c_str(),
-           use_cas ? " online (with CAS)" : "", INODE_NO_POOL(target), INODE_POOL(target)
-       );
+       if (parent->progress)
+       {
+           printf(
+               "Merging %ld layer(s) into target %s%s (inode %lu in pool %u)\n",
+               sources.size(), target_cfg->name.c_str(),
+               use_cas ? " online (with CAS)" : "", INODE_NO_POOL(target), INODE_POOL(target)
+           );
+       }
        target_block_size = get_block_size(target);
    }

@@ -179,7 +226,7 @@ struct snap_merger_t

    bool is_done()
    {
-       return state == 6;
+       return state == 100;
    }

    void continue_merge()
@@ -194,8 +241,8 @@ struct snap_merger_t
            goto resume_4;
        else if (state == 5)
            goto resume_5;
-       else if (state == 6)
-           goto resume_6;
+       else if (state == 100)
+           goto resume_100;
        // Get parents and so on
        start_merge();
        // First list lower layers
@@ -253,7 +300,8 @@ struct snap_merger_t
        oit = merge_offsets.begin();
    resume_5:
        // Now read, overwrite and optionally delete offsets one by one
-       while (in_flight < parent->iodepth*parent->parallel_osds && oit != merge_offsets.end())
+       while (in_flight < parent->iodepth*parent->parallel_osds &&
+           oit != merge_offsets.end() && !rwo_error.size())
        {
            in_flight++;
            read_and_write(*oit);
@@ -264,6 +312,15 @@ struct snap_merger_t
                printf("\rOverwriting blocks: %lu/%lu", processed, to_process);
            }
        }
+       if (in_flight == 0 && rwo_error.size())
+       {
+           result = (cli_result_t){
+               .err = EIO,
+               .text = rwo_error,
+           };
+           state = 100;
+           return;
+       }
        if (in_flight > 0 || oit != merge_offsets.end())
        {
            // Wait until overwrites finish
@@ -274,9 +331,9 @@ struct snap_merger_t
            printf("\rOverwriting blocks: %lu/%lu\n", to_process, to_process);
        }
        // Done
-       printf("Done, layers from %s to %s merged into %s\n", from_name.c_str(), to_name.c_str(), target_name.c_str());
-       state = 6;
-   resume_6:
+       result = (cli_result_t){ .text = "Done, layers from "+from_name+" to "+to_name+" merged into "+target_name };
+       state = 100;
+   resume_100:
        return;
    }

@@ -314,7 +371,10 @@ struct snap_merger_t
        if (status & INODE_LIST_DONE)
        {
            auto & name = parent->cli->st_cli.inode_config.at(src).name;
-           printf("Got listing of layer %s (inode %lu in pool %u)\n", name.c_str(), INODE_NO_POOL(src), INODE_POOL(src));
+           if (parent->progress)
+           {
+               printf("Got listing of layer %s (inode %lu in pool %u)\n", name.c_str(), INODE_NO_POOL(src), INODE_POOL(src));
+           }
            if (delete_source)
            {
                // Sort the inode listing
@@ -396,8 +456,9 @@ struct snap_merger_t
        {
            if (op->retval != op->len)
            {
-               fprintf(stderr, "error reading target at offset %lx: %s\n", op->offset, strerror(-op->retval));
-               exit(1);
+               rwo->error_code = -op->retval;
+               rwo->error_offset = op->offset;
+               rwo->error_read = true;
            }
            next_write(rwo);
        };
@@ -410,7 +471,7 @@ struct snap_merger_t
        // FIXME: Allow to use single write with "holes" (OSDs don't allow it yet)
        uint32_t gran = parent->cli->get_bs_bitmap_granularity();
        uint64_t bitmap_size = target_block_size / gran;
-       while (rwo->end < bitmap_size)
+       while (rwo->end < bitmap_size && !rwo->error_code)
        {
            auto bit = ((*((uint8_t*)rwo->op.bitmap_buf + (rwo->end >> 3))) & (1 << (rwo->end & 0x7)));
            if (!bit)
@@ -434,7 +495,7 @@ struct snap_merger_t
                rwo->end++;
            }
        }
-       if (rwo->end > rwo->start)
+       if (rwo->end > rwo->start && !rwo->error_code)
        {
            // write start->end
            rwo->todo++;
@@ -473,8 +534,9 @@ struct snap_merger_t
                delete subop;
                return;
            }
-           fprintf(stderr, "error writing target at offset %lx: %s\n", subop->offset, strerror(-subop->retval));
-           exit(1);
+           rwo->error_code = -subop->retval;
+           rwo->error_offset = subop->offset;
+           rwo->error_read = false;
        }
        // Increment CAS version
        rwo->op.version++;
@@ -510,11 +572,12 @@ struct snap_merger_t
    {
        if (!rwo->todo)
        {
-           if (last_written_offset < rwo->op.offset+target_block_size)
+           if (!rwo->error_code &&
+               last_written_offset < rwo->op.offset+target_block_size)
            {
                last_written_offset = rwo->op.offset+target_block_size;
            }
-           if (delete_source)
+           if (!rwo->error_code && delete_source)
            {
                deleted_unsynced++;
                if (deleted_unsynced >= fsync_interval)
@@ -544,6 +607,13 @@ struct snap_merger_t
            }
        }
        free(rwo->buf);
+       if (rwo->error_code)
+       {
+           char buf[1024];
+           snprintf(buf, 1024, "Error %s target at offset %lx: %s",
+               rwo->error_read ? "reading" : "writing", rwo->error_offset, strerror(rwo->error_code));
+           rwo_error = std::string(buf);
+       }
        delete rwo;
        in_flight--;
        continue_merge_reent();
@@ -551,30 +621,25 @@ struct snap_merger_t
    }
};

-std::function<bool(void)> cli_tool_t::start_merge(json11::Json cfg)
+std::function<bool(cli_result_t &)> cli_tool_t::start_merge(json11::Json cfg)
{
-   json11::Json::array cmd = cfg["command"].array_items();
    auto merger = new snap_merger_t();
    merger->parent = this;
-   merger->from_name = cmd.size() > 1 ? cmd[1].string_value() : "";
-   merger->to_name = cmd.size() > 2 ? cmd[2].string_value() : "";
+   merger->from_name = cfg["from"].string_value();
+   merger->to_name = cfg["to"].string_value();
    merger->target_name = cfg["target"].string_value();
-   if (merger->from_name == "" || merger->to_name == "")
-   {
-       fprintf(stderr, "Beginning or end of the merge sequence is missing\n");
-       exit(1);
-   }
    merger->delete_source = cfg["delete-source"].string_value() != "";
    merger->fsync_interval = cfg["fsync-interval"].uint64_value();
    if (!merger->fsync_interval)
        merger->fsync_interval = 128;
    if (!cfg["cas"].is_null())
        merger->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
-   return [merger]()
+   return [merger](cli_result_t & result)
    {
        merger->continue_merge_reent();
        if (merger->is_done())
        {
+           result = merger->result;
            delete merger;
            return true;
        }
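The dirty-range scan in next_write() above walks rwo->op.bitmap_buf one bitmap-granularity unit at a time and turns each run of set bits into one write. The core loop, reduced to a free-standing sketch (hypothetical helper, same bit layout as the code above):

    #include <stdint.h>

    // Finds the next run of set bits; returns false when the scan is done.
    // Each [*start, *end) run maps to one write of (*end - *start) granules.
    static bool next_dirty_range(const uint8_t *bitmap, uint64_t bits,
        uint64_t *pos, uint64_t *start, uint64_t *end)
    {
        while (*pos < bits && !(bitmap[*pos >> 3] & (1 << (*pos & 0x7))))
            (*pos)++;              // skip clean granules
        if (*pos >= bits)
            return false;
        *start = *pos;
        while (*pos < bits && (bitmap[*pos >> 3] & (1 << (*pos & 0x7))))
            (*pos)++;              // extend over the dirty run
        *end = *pos;
        return true;
    }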
@@ -13,6 +13,7 @@ struct image_changer_t
    std::string image_name;
    std::string new_name;
    uint64_t new_size = 0;
+   bool force_size = false;
    bool set_readonly = false, set_readwrite = false, force = false;
    // interval between fsyncs
    int fsync_interval = 128;
@@ -23,7 +24,8 @@ struct image_changer_t
    bool has_children = false;

    int state = 0;
-   std::function<bool(void)> cb;
+   std::function<bool(cli_result_t &)> cb;
+   cli_result_t result;

    bool is_done()
    {
@@ -36,6 +38,18 @@ struct image_changer_t
            goto resume_1;
        else if (state == 2)
            goto resume_2;
+       if (image_name == "")
+       {
+           result = (cli_result_t){ .err = EINVAL, .text = "Image name is missing" };
+           state = 100;
+           return;
+       }
+       if (new_size != 0 && (new_size % 4096) && !force_size)
+       {
+           result = (cli_result_t){ .err = EINVAL, .text = "Image size should be a multiple of 4096" };
+           state = 100;
+           return;
+       }
        for (auto & ic: parent->cli->st_cli.inode_config)
        {
            if (ic.second.name == image_name)
@@ -46,14 +60,16 @@ struct image_changer_t
            }
            if (new_name != "" && ic.second.name == new_name)
            {
-               fprintf(stderr, "Image %s already exists\n", new_name.c_str());
-               exit(1);
+               result = (cli_result_t){ .err = EEXIST, .text = "Image "+new_name+" already exists" };
+               state = 100;
+               return;
            }
        }
        if (!inode_num)
        {
-           fprintf(stderr, "Image %s does not exist\n", image_name.c_str());
-           exit(1);
+           result = (cli_result_t){ .err = ENOENT, .text = "Image "+image_name+" does not exist" };
+           state = 100;
+           return;
        }
        for (auto & ic: parent->cli->st_cli.inode_config)
        {
@@ -65,37 +81,43 @@ struct image_changer_t
        }
        if ((!set_readwrite || !cfg.readonly) &&
            (!set_readonly || cfg.readonly) &&
-           (!new_size || cfg.size == new_size) &&
+           (!new_size && !force_size || cfg.size == new_size) &&
            (new_name == "" || new_name == image_name))
        {
-           printf("No change\n");
+           result = (cli_result_t){ .text = "No change" };
            state = 100;
            return;
        }
-       if (new_size != 0)
+       if (new_size != 0 || force_size)
        {
            if (cfg.size >= new_size)
            {
                // Check confirmation when trimming an image with children
                if (has_children && !force)
                {
-                   fprintf(stderr, "Image %s has children. Refusing to shrink it without --force\n", image_name.c_str());
-                   exit(1);
+                   result = (cli_result_t){ .err = EINVAL, .text = "Image "+image_name+" has children. Refusing to shrink it without --force" };
+                   state = 100;
+                   return;
                }
                // Shrink the image first
-               cb = parent->start_rm(json11::Json::object {
+               cb = parent->start_rm_data(json11::Json::object {
                    { "inode", INODE_NO_POOL(inode_num) },
                    { "pool", (uint64_t)INODE_POOL(inode_num) },
                    { "fsync-interval", fsync_interval },
-                   { "min-offset", new_size },
+                   { "min-offset", ((new_size+4095)/4096)*4096 },
                });
    resume_1:
-               while (!cb())
+               while (!cb(result))
                {
                    state = 1;
                    return;
                }
                cb = NULL;
+               if (result.err)
+               {
+                   state = 100;
+                   return;
+               }
            }
            cfg.size = new_size;
        }
@@ -109,8 +131,9 @@ resume_1:
            // Check confirmation when making an image with children read-write
            if (has_children && !force)
            {
-               fprintf(stderr, "Image %s has children. Refusing to make it read-write without --force\n", image_name.c_str());
-               exit(1);
+               result = (cli_result_t){ .err = EINVAL, .text = "Image "+image_name+" has children. Refusing to make it read-write without --force" };
+               state = 100;
+               return;
            }
        }
        if (new_name != "")
@@ -178,34 +201,38 @@ resume_1:
    resume_2:
        if (parent->waiting > 0)
            return;
+       if (parent->etcd_err.err)
+       {
+           result = parent->etcd_err;
+           state = 100;
+           return;
+       }
        if (!parent->etcd_result["succeeded"].bool_value())
        {
-           fprintf(stderr, "Image %s was modified by someone else, please repeat your request\n", image_name.c_str());
-           exit(1);
+           result = (cli_result_t){ .err = EAGAIN, .text = "Image "+image_name+" was modified by someone else, please repeat your request" };
+           state = 100;
+           return;
        }
-       printf("Image %s modified\n", image_name.c_str());
        // Save into inode_config for library users to be able to take it from there immediately
        cfg.mod_revision = parent->etcd_result["responses"][0]["response_put"]["header"]["revision"].uint64_value();
        if (new_name != "")
        {
            parent->cli->st_cli.inode_by_name.erase(image_name);
        }
        parent->cli->st_cli.insert_inode_config(cfg);
+       result = (cli_result_t){ .err = 0, .text = "Image "+image_name+" modified" };
        state = 100;
    }
};

-std::function<bool(void)> cli_tool_t::start_modify(json11::Json cfg)
+std::function<bool(cli_result_t &)> cli_tool_t::start_modify(json11::Json cfg)
{
    json11::Json::array cmd = cfg["command"].array_items();
    auto changer = new image_changer_t();
    changer->parent = this;
-   changer->image_name = cmd.size() > 1 ? cmd[1].string_value() : "";
-   if (changer->image_name == "")
-   {
-       fprintf(stderr, "Image name is missing\n");
-       exit(1);
-   }
+   changer->image_name = cfg["image"].string_value();
    changer->new_name = cfg["rename"].string_value();
-   changer->new_size = parse_size(cfg["resize"].string_value());
-   if (changer->new_size != 0 && (changer->new_size % 4096))
-   {
-       fprintf(stderr, "Image size should be a multiple of 4096\n");
-       exit(1);
-   }
+   changer->new_size = parse_size(cfg["resize"].as_string());
+   changer->force_size = cfg["force_size"].bool_value();
+   changer->force = cfg["force"].bool_value();
    changer->set_readonly = cfg["readonly"].bool_value();
    changer->set_readwrite = cfg["readwrite"].bool_value();
@@ -213,11 +240,12 @@ std::function<bool(void)> cli_tool_t::start_modify(json11::Json cfg)
    if (!changer->fsync_interval)
        changer->fsync_interval = 128;
    // FIXME Check that the image doesn't have children when shrinking
-   return [changer]()
+   return [changer](cli_result_t & result)
    {
        changer->loop();
        if (changer->is_done())
        {
+           result = changer->result;
            delete changer;
            return true;
        }
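One pattern repeats in cli_modify.cpp and in change_parent() earlier: every metadata update is an etcd transaction guarded by a MOD-revision compare, so a concurrent writer produces a clean EAGAIN ("was modified by someone else") instead of a lost update. The transaction shape, as a minimal sketch in the same json11 form that etcd_txn()/etcd_txn_slow() accept above (key and revision are placeholders; the key must already be base64-encoded):

    // Compare-and-set PUT of a single etcd key.
    json11::Json cas_put(const std::string & key, const std::string & b64_value, uint64_t mod_revision)
    {
        return json11::Json::object {
            { "compare", json11::Json::array {
                json11::Json::object {
                    { "target", "MOD" },
                    { "key", key },
                    { "result", "LESS" },
                    { "mod_revision", mod_revision+1 },  // i.e. the key is unmodified since we read it
                },
            } },
            { "success", json11::Json::array {
                json11::Json::object {
                    { "request_put", json11::Json::object {
                        { "key", key },
                        { "value", b64_value },
                    } },
                },
            } },
        };
    }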
818 src/cli_rm.cpp
@@ -1,212 +1,658 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)

#include <fcntl.h>
#include "cli.h"
#include "cluster_client.h"
#include "base64.h"

-#define RM_LISTING 1
-#define RM_REMOVING 2
-#define RM_END 3
-
-struct rm_pg_t
+// Remove layer(s): similar to merge, but alters metadata and processes multiple merge targets
+//
+// Exactly one child of the requested layers may be merged using the "inverted" workflow,
+// where we merge it "down" into one of the "to-be-removed" layers and then rename the
+// "to-be-removed" layer to the child. It may be done either if all writers are stopped
+// before trying to delete layers (which is signaled by --writers-stopped) or if that child
+// is a read-only layer (snapshot) itself.
+//
+// This "inverted" workflow trades copying data of one of the deleted layers for copying
+// data of one child of the chain which is also a child of the "traded" layer. So we
+// choose the (parent,child) pair which has the largest difference between "parent" and
+// "child" inode sizes.
+//
+// All other children of the chain are processed by iterating through them, merging removed
+// parents into them and rebasing them to the last layer which isn't a member of the removed
+// chain.
+//
+// Example:
+//
+// <parent> - <from> - <layer 2> - <to> - <child 1>
+//               \          \        \- <child 2>
+//                \          \- <child 3>
+//                 \- <child 4>
+//
+// 1) Find optimal pair for the "reverse" scenario
+//    Imagine that it's (<layer 2>, <child 1>) in this example
+// 2) Process all children except <child 1>:
+//    - Merge <from>..<to> to <child 2>
+//    - Set <child 2> parent to <parent>
+//    - Repeat for others
+// 3) Process <child 1>:
+//    - Merge <from>..<child 1> to <layer 2>
+//    - Set <layer 2> parent to <parent>
+//    - Rename <layer 2> to <child 1>
+// 4) Delete other layers of the chain (<from>, <to>)
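choose_inverse_candidate(), whose declaration closes this excerpt, implements step 1. A hedged sketch of the selection rule described in the comment above (illustrative only, not the function body from the commit): among the candidate children collected in inverse_candidates, pick the pair with the largest parent-minus-child difference of used bytes from inode_used:

    // Illustrative only; field names follow snap_remover_t below.
    void pick_inverse_pair_sketch(etcd_state_client_t & st_cli,
        const std::map<inode_t,int> & inverse_candidates,
        const std::map<inode_t,uint64_t> & inode_used,
        inode_t & inverse_parent, inode_t & inverse_child)
    {
        int64_t best_diff = -1;
        for (auto & cand: inverse_candidates)
        {
            inode_t child = cand.first;
            inode_t chain_parent = st_cli.inode_config.at(child).parent_id;
            int64_t diff = (int64_t)inode_used.at(chain_parent) - (int64_t)inode_used.at(child);
            if (diff > best_diff)
            {
                best_diff = diff;          // biggest saving for the inverted merge
                inverse_parent = chain_parent;
                inverse_child = child;
            }
        }
    }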
struct snap_remover_t
|
||||
{
|
||||
pg_num_t pg_num;
|
||||
osd_num_t rm_osd_num;
|
||||
std::set<object_id> objects;
|
||||
std::set<object_id>::iterator obj_pos;
|
||||
uint64_t obj_count = 0, obj_done = 0;
|
||||
cli_tool_t *parent;
|
||||
|
||||
// remove from..to
|
||||
std::string from_name, to_name;
|
||||
// writers are stopped, we can safely change writable layers
|
||||
bool writers_stopped = false;
|
||||
// use CAS writes (0 = never, 1 = auto, 2 = always)
|
||||
int use_cas = 1;
|
||||
// interval between fsyncs
|
||||
int fsync_interval = 128;
|
||||
|
||||
std::map<inode_t,int> sources;
|
||||
std::map<inode_t,uint64_t> inode_used;
|
||||
std::vector<inode_t> merge_children;
|
||||
std::vector<inode_t> chain_list;
|
||||
std::map<inode_t,int> inverse_candidates;
|
||||
inode_t inverse_parent = 0, inverse_child = 0;
|
||||
inode_t new_parent = 0;
|
||||
int state = 0;
|
||||
int in_flight = 0;
|
||||
};
|
||||
int current_child = 0;
|
||||
std::function<bool(cli_result_t &)> cb;
|
||||
|
||||
struct rm_inode_t
|
||||
{
|
||||
uint64_t inode = 0;
|
||||
pool_id_t pool_id = 0;
|
||||
uint64_t min_offset = 0;
|
||||
cli_result_t result;
|
||||
|
||||
cli_tool_t *parent = NULL;
|
||||
inode_list_t *lister = NULL;
|
||||
std::vector<rm_pg_t*> lists;
|
||||
uint64_t total_count = 0, total_done = 0, total_prev_pct = 0;
|
||||
uint64_t pgs_to_list = 0;
|
||||
bool lists_done = false;
|
||||
int state = 0;
|
||||
|
||||
void start_delete()
|
||||
bool is_done()
|
||||
{
|
||||
lister = parent->cli->list_inode_start(inode, [this](inode_list_t *lst,
|
||||
std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)
|
||||
{
|
||||
rm_pg_t *rm = new rm_pg_t((rm_pg_t){
|
||||
.pg_num = pg_num,
|
||||
.rm_osd_num = primary_osd,
|
||||
.objects = objects,
|
||||
.obj_count = objects.size(),
|
||||
.obj_done = 0,
|
||||
});
|
||||
if (min_offset == 0)
|
||||
{
|
||||
total_count += objects.size();
|
||||
}
|
||||
else
|
||||
{
|
||||
for (object_id oid: objects)
|
||||
{
|
||||
if (oid.stripe >= min_offset)
|
||||
{
|
||||
total_count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
rm->obj_pos = rm->objects.begin();
|
||||
lists.push_back(rm);
|
||||
if (parent->list_first)
|
||||
{
|
||||
parent->cli->list_inode_next(lister, 1);
|
||||
}
|
||||
if (status & INODE_LIST_DONE)
|
||||
{
|
||||
lists_done = true;
|
||||
}
|
||||
pgs_to_list--;
|
||||
continue_delete();
|
||||
});
|
||||
if (!lister)
|
||||
{
|
||||
fprintf(stderr, "Failed to list inode %lu from pool %u objects\n", INODE_NO_POOL(inode), INODE_POOL(inode));
|
||||
exit(1);
|
||||
}
|
||||
pgs_to_list = parent->cli->list_pg_count(lister);
|
||||
parent->cli->list_inode_next(lister, parent->parallel_osds);
|
||||
return state == 100;
|
||||
}
|
||||
|
||||
void send_ops(rm_pg_t *cur_list)
|
||||
void loop()
|
||||
{
|
||||
if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
|
||||
parent->cli->msgr.osd_peer_fds.end())
|
||||
{
|
||||
// Initiate connection
|
||||
parent->cli->msgr.connect_peer(cur_list->rm_osd_num, parent->cli->st_cli.peer_states[cur_list->rm_osd_num]);
|
||||
return;
|
||||
}
|
||||
while (cur_list->in_flight < parent->iodepth && cur_list->obj_pos != cur_list->objects.end())
|
||||
{
|
||||
if (cur_list->obj_pos->stripe >= min_offset)
|
||||
{
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
// Already checked that it exists above, but anyway
|
||||
op->peer_fd = parent->cli->msgr.osd_peer_fds.at(cur_list->rm_osd_num);
|
||||
op->req = (osd_any_op_t){
|
||||
.rw = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = parent->cli->next_op_id(),
|
||||
.opcode = OSD_OP_DELETE,
|
||||
},
|
||||
.inode = cur_list->obj_pos->inode,
|
||||
.offset = cur_list->obj_pos->stripe,
|
||||
.len = 0,
|
||||
},
|
||||
};
|
||||
op->callback = [this, cur_list](osd_op_t *op)
|
||||
{
|
||||
cur_list->in_flight--;
|
||||
if (op->reply.hdr.retval < 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to remove object %lx:%lx from PG %u (OSD %lu) (retval=%ld)\n",
|
||||
op->req.rw.inode, op->req.rw.offset,
|
||||
cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
|
||||
}
|
||||
delete op;
|
||||
cur_list->obj_done++;
|
||||
total_done++;
|
||||
continue_delete();
|
||||
};
|
||||
cur_list->in_flight++;
|
||||
parent->cli->msgr.outbox_push(op);
|
||||
}
|
||||
cur_list->obj_pos++;
|
||||
}
|
||||
}
|
||||
|
||||
void continue_delete()
|
||||
{
|
||||
if (parent->list_first && !lists_done)
|
||||
{
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < lists.size(); i++)
|
||||
{
|
||||
if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end())
|
||||
{
|
||||
delete lists[i];
|
||||
lists.erase(lists.begin()+i, lists.begin()+i+1);
|
||||
i--;
|
||||
if (!lists_done)
|
||||
{
|
||||
parent->cli->list_inode_next(lister, 1);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
send_ops(lists[i]);
|
||||
}
|
||||
}
|
||||
if (parent->progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
|
||||
{
|
||||
printf("\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
|
||||
total_prev_pct = total_done*1000/total_count;
|
||||
}
|
||||
if (lists_done && !lists.size())
|
||||
{
|
||||
printf("Done, inode %lu in pool %u data removed\n", INODE_NO_POOL(inode), pool_id);
|
||||
state = 2;
|
||||
}
|
||||
}
|
||||
|
||||
bool loop()
|
||||
{
|
||||
if (state == 0)
|
||||
{
|
||||
start_delete();
|
||||
state = 1;
|
||||
}
|
||||
else if (state == 1)
|
||||
{
|
||||
continue_delete();
|
||||
}
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
else if (state == 2)
|
||||
goto resume_2;
|
||||
else if (state == 3)
|
||||
goto resume_3;
|
||||
else if (state == 4)
|
||||
goto resume_4;
|
||||
else if (state == 5)
|
||||
goto resume_5;
|
||||
else if (state == 6)
|
||||
goto resume_6;
|
||||
else if (state == 7)
|
||||
goto resume_7;
|
||||
else if (state == 8)
|
||||
goto resume_8;
|
||||
else if (state == 100)
|
||||
goto resume_100;
|
||||
assert(!state);
|
||||
if (from_name == "")
|
||||
{
|
||||
return true;
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Layer to remove argument is missing" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
return false;
|
||||
if (to_name == "")
|
||||
{
|
||||
to_name = from_name;
|
||||
}
|
||||
// Get children to merge
|
||||
get_merge_children();
|
||||
if (state == 100)
|
||||
return;
|
||||
// Try to select an inode for the "inverse" optimized scenario
|
||||
// Read statistics from etcd to do it
|
||||
read_stats();
|
||||
if (state == 100)
|
||||
return;
|
||||
state = 1;
|
||||
resume_1:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
choose_inverse_candidate();
|
||||
// Merge children one by one, except our "inverse" child
|
||||
for (current_child = 0; current_child < merge_children.size(); current_child++)
|
||||
{
|
||||
if (merge_children[current_child] == inverse_child)
|
||||
continue;
|
||||
start_merge_child(merge_children[current_child], merge_children[current_child]);
|
||||
if (state == 100)
|
||||
return;
|
||||
resume_2:
|
||||
while (!cb(result))
|
||||
{
|
||||
state = 2;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
if (result.err)
|
||||
{
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
parent->change_parent(merge_children[current_child], new_parent, &result);
|
||||
state = 3;
|
||||
resume_3:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
if (result.err)
|
||||
{
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
else if (parent->progress)
|
||||
printf("%s\n", result.text.c_str());
|
||||
}
|
||||
// Merge our "inverse" child into our "inverse" parent
|
||||
if (inverse_child != 0)
|
||||
{
|
||||
start_merge_child(inverse_child, inverse_parent);
|
||||
if (state == 100)
|
||||
return;
|
||||
resume_4:
|
||||
while (!cb(result))
|
||||
{
|
||||
state = 4;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
if (result.err)
|
||||
{
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
// Delete "inverse" child data
|
||||
start_delete_source(inverse_child);
|
||||
if (state == 100)
|
||||
return;
|
||||
resume_5:
|
||||
while (!cb(result))
|
||||
{
|
||||
state = 5;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
if (result.err)
|
||||
{
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
// Delete "inverse" child metadata, rename parent over it,
|
||||
// and also change parent links of the previous "inverse" child
|
||||
rename_inverse_parent();
|
||||
if (state == 100)
|
||||
return;
|
||||
state = 6;
|
||||
resume_6:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
}
|
||||
// Delete parents, except the "inverse" one
|
||||
for (current_child = 0; current_child < chain_list.size(); current_child++)
|
||||
{
|
||||
if (chain_list[current_child] == inverse_parent)
|
||||
continue;
|
||||
start_delete_source(chain_list[current_child]);
|
||||
resume_7:
|
||||
while (!cb(result))
|
||||
{
|
||||
state = 7;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
if (result.err)
|
||||
{
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
delete_inode_config(chain_list[current_child]);
|
||||
if (state == 100)
|
||||
return;
|
||||
state = 8;
|
||||
resume_8:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
}
|
||||
state = 100;
|
||||
resume_100:
|
||||
// Done
|
||||
return;
|
||||
}

    void get_merge_children()
    {
        // Get all children of from..to
        inode_config_t *from_cfg = parent->get_inode_cfg(from_name);
        if (!from_cfg)
        {
            result = (cli_result_t){ .err = ENOENT, .text = "Layer "+from_name+" not found" };
            state = 100;
            return;
        }
        inode_config_t *to_cfg = parent->get_inode_cfg(to_name);
        if (!to_cfg)
        {
            result = (cli_result_t){ .err = ENOENT, .text = "Layer "+to_name+" not found" };
            state = 100;
            return;
        }
        // Check that to_cfg is actually a child of from_cfg
        // FIXME de-copypaste the following piece of code with snap_merger_t
        inode_config_t *cur = to_cfg;
        chain_list.push_back(cur->num);
        while (cur->num != from_cfg->num && cur->parent_id != 0)
        {
            auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
            if (it == parent->cli->st_cli.inode_config.end())
            {
                char buf[1024];
                snprintf(buf, 1024, "Parent inode of layer %s (id 0x%lx) not found", cur->name.c_str(), cur->parent_id);
                result = (cli_result_t){ .err = ENOENT, .text = std::string(buf) };
                state = 100;
                return;
            }
            cur = &it->second;
            chain_list.push_back(cur->num);
        }
        if (cur->num != from_cfg->num)
        {
            result = (cli_result_t){ .err = EINVAL, .text = "Layer "+to_name+" is not a child of "+from_name };
            state = 100;
            return;
        }
        new_parent = from_cfg->parent_id;
        // Calculate ranks
        int i = chain_list.size()-1;
        for (inode_t item: chain_list)
        {
            sources[item] = i--;
        }
        for (auto & ic: parent->cli->st_cli.inode_config)
        {
            if (!ic.second.parent_id)
            {
                continue;
            }
            auto it = sources.find(ic.second.parent_id);
            if (it != sources.end() && sources.find(ic.second.num) == sources.end())
            {
                merge_children.push_back(ic.second.num);
                if (ic.second.readonly || writers_stopped)
                {
                    inverse_candidates[ic.second.num] = it->second;
                }
            }
        }
    }

    void read_stats()
    {
        if (inverse_candidates.size() == 0)
        {
            return;
        }
        json11::Json::array reads;
        for (auto cp: inverse_candidates)
        {
            inode_t inode = cp.first;
            reads.push_back(json11::Json::object {
                { "request_range", json11::Json::object {
                    { "key", base64_encode(
                        parent->cli->st_cli.etcd_prefix+
                        "/inode/stats/"+std::to_string(INODE_POOL(inode))+
                        "/"+std::to_string(INODE_NO_POOL(inode))
                    ) },
                } }
            });
        }
        for (auto cp: sources)
        {
            inode_t inode = cp.first;
            reads.push_back(json11::Json::object {
                { "request_range", json11::Json::object {
                    { "key", base64_encode(
                        parent->cli->st_cli.etcd_prefix+
                        "/inode/stats/"+std::to_string(INODE_POOL(inode))+
                        "/"+std::to_string(INODE_NO_POOL(inode))
                    ) },
                } }
            });
        }
        parent->waiting++;
        parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
            { "success", reads },
        }, [this](std::string err, json11::Json data)
        {
            parent->waiting--;
            if (err != "")
            {
                result = (cli_result_t){ .err = EIO, .text = "Error reading layer statistics from etcd: "+err };
                state = 100;
                return;
            }
            for (auto inode_result: data["responses"].array_items())
            {
                auto kv = parent->cli->st_cli.parse_etcd_kv(inode_result["kvs"][0]);
                pool_id_t pool_id = 0;
                inode_t inode = 0;
                char null_byte = 0;
                sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode, &null_byte);
                if (!inode || null_byte != 0)
                {
                    result = (cli_result_t){ .err = EIO, .text = "Bad key returned from etcd: "+kv.key };
                    state = 100;
                    return;
                }
                auto pool_cfg_it = parent->cli->st_cli.pool_config.find(pool_id);
                if (pool_cfg_it == parent->cli->st_cli.pool_config.end())
                {
                    result = (cli_result_t){ .err = ENOENT, .text = "Pool "+std::to_string(pool_id)+" does not exist" };
                    state = 100;
                    return;
                }
                inode = INODE_WITH_POOL(pool_id, inode);
                auto & pool_cfg = pool_cfg_it->second;
                uint64_t used_bytes = kv.value["raw_used"].uint64_value() / pool_cfg.pg_size;
                if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
                {
                    used_bytes *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
                }
                inode_used[inode] = used_bytes;
            }
            parent->ringloop->wakeup();
        });
    }
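    // A note on the statistics math above: etcd reports "raw_used" (bytes summed
    // over all replicas or chunks), so dividing by pg_size yields logical bytes
    // for replicated pools; for EC/XOR pools the result is multiplied back by the
    // number of data chunks (pg_size - parity_chunks). Illustrative example (not
    // from the source): pg_size=4, parity_chunks=2, raw_used=8 GB gives
    // 8/4*2 = 4 GB of logical data.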

    void choose_inverse_candidate()
    {
        uint64_t max_diff = 0;
        for (auto cp: inverse_candidates)
        {
            inode_t child = cp.first;
            uint64_t child_used = inode_used[child];
            int rank = cp.second;
            for (int i = chain_list.size()-rank; i < chain_list.size(); i++)
            {
                inode_t parent = chain_list[i];
                uint64_t parent_used = inode_used[parent];
                if (parent_used > child_used && (!max_diff || max_diff < (parent_used-child_used)))
                {
                    max_diff = (parent_used-child_used);
                    inverse_parent = parent;
                    inverse_child = child;
                }
            }
        }
    }

    void rename_inverse_parent()
    {
        auto child_it = parent->cli->st_cli.inode_config.find(inverse_child);
        if (child_it == parent->cli->st_cli.inode_config.end())
        {
            char buf[1024];
            snprintf(buf, 1024, "Inode 0x%lx disappeared", inverse_child);
            result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
            state = 100;
            return;
        }
        auto target_it = parent->cli->st_cli.inode_config.find(inverse_parent);
        if (target_it == parent->cli->st_cli.inode_config.end())
        {
            char buf[1024];
            snprintf(buf, 1024, "Inode 0x%lx disappeared", inverse_parent);
            result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
            state = 100;
            return;
        }
        inode_config_t *child_cfg = &child_it->second;
        inode_config_t *target_cfg = &target_it->second;
        std::string child_name = child_cfg->name;
        std::string target_name = target_cfg->name;
        std::string child_cfg_key = base64_encode(
            parent->cli->st_cli.etcd_prefix+
            "/config/inode/"+std::to_string(INODE_POOL(inverse_child))+
            "/"+std::to_string(INODE_NO_POOL(inverse_child))
        );
        std::string target_cfg_key = base64_encode(
            parent->cli->st_cli.etcd_prefix+
            "/config/inode/"+std::to_string(INODE_POOL(inverse_parent))+
            "/"+std::to_string(INODE_NO_POOL(inverse_parent))
        );
        // Fill new configuration
        inode_config_t new_cfg = *child_cfg;
        new_cfg.num = target_cfg->num;
        new_cfg.parent_id = new_parent;
        json11::Json::array cmp = json11::Json::array {
            json11::Json::object {
                { "target", "MOD" },
                { "key", child_cfg_key },
                { "result", "LESS" },
                { "mod_revision", child_cfg->mod_revision+1 },
            },
            json11::Json::object {
                { "target", "MOD" },
                { "key", target_cfg_key },
                { "result", "LESS" },
                { "mod_revision", target_cfg->mod_revision+1 },
            },
        };
        json11::Json::array txn = json11::Json::array {
            json11::Json::object {
                { "request_delete_range", json11::Json::object {
                    { "key", child_cfg_key },
                } },
            },
            json11::Json::object {
                { "request_put", json11::Json::object {
                    { "key", target_cfg_key },
                    { "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&new_cfg)).dump()) },
                } },
            },
            json11::Json::object {
                { "request_put", json11::Json::object {
                    { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+child_cfg->name) },
                    { "value", base64_encode(json11::Json({
                        { "id", INODE_NO_POOL(inverse_parent) },
                        { "pool_id", (uint64_t)INODE_POOL(inverse_parent) },
                    }).dump()) },
                } },
            },
        };
        // Reparent children of inverse_child
        for (auto & cp: parent->cli->st_cli.inode_config)
        {
            if (cp.second.parent_id == child_cfg->num)
            {
                auto cp_cfg = cp.second;
                cp_cfg.parent_id = inverse_parent;
                auto cp_key = base64_encode(
                    parent->cli->st_cli.etcd_prefix+
                    "/config/inode/"+std::to_string(INODE_POOL(cp.second.num))+
                    "/"+std::to_string(INODE_NO_POOL(cp.second.num))
                );
                cmp.push_back(json11::Json::object {
                    { "target", "MOD" },
                    { "key", cp_key },
                    { "result", "LESS" },
                    { "mod_revision", cp.second.mod_revision+1 },
                });
                txn.push_back(json11::Json::object {
                    { "request_put", json11::Json::object {
                        { "key", cp_key },
                        { "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&cp_cfg)).dump()) },
                    } },
                });
            }
        }
        parent->waiting++;
        parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
            { "compare", cmp },
            { "success", txn },
        }, [this, target_name, child_name](std::string err, json11::Json res)
        {
            parent->waiting--;
            if (err != "")
            {
                result = (cli_result_t){ .err = EIO, .text = "Error renaming "+target_name+" to "+child_name+": "+err };
                state = 100;
                return;
            }
            if (!res["succeeded"].bool_value())
            {
                result = (cli_result_t){
                    .err = EAGAIN,
                    .text = "Parent ("+target_name+"), child ("+child_name+"), or one of its children's"
                        " configuration was modified during rename",
                };
                state = 100;
                return;
            }
            if (parent->progress)
                printf("Layer %s renamed to %s\n", target_name.c_str(), child_name.c_str());
            parent->ringloop->wakeup();
        });
    }
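    // The "compare" blocks above implement optimistic concurrency control: each
    // affected key is guarded by a MOD-revision check, so the transaction only
    // succeeds if nobody changed those inode configs since we read them.
    // Sketch of one such guard (values are illustrative):
    //
    //     { "target", "MOD" }, { "key", cfg_key },
    //     { "result", "LESS" }, { "mod_revision", last_seen_revision+1 }
    //
    // On a conflict the transaction fails, the caller receives EAGAIN and is
    // expected to retry the whole read-modify-write cycle.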

    void delete_inode_config(inode_t cur)
    {
        auto cur_cfg_it = parent->cli->st_cli.inode_config.find(cur);
        if (cur_cfg_it == parent->cli->st_cli.inode_config.end())
        {
            char buf[1024];
            snprintf(buf, 1024, "Inode 0x%lx disappeared", cur);
            result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
            state = 100;
            return;
        }
        inode_config_t *cur_cfg = &cur_cfg_it->second;
        std::string cur_name = cur_cfg->name;
        std::string cur_cfg_key = base64_encode(
            parent->cli->st_cli.etcd_prefix+
            "/config/inode/"+std::to_string(INODE_POOL(cur))+
            "/"+std::to_string(INODE_NO_POOL(cur))
        );
        parent->waiting++;
        parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
            { "compare", json11::Json::array {
                json11::Json::object {
                    { "target", "MOD" },
                    { "key", cur_cfg_key },
                    { "result", "LESS" },
                    { "mod_revision", cur_cfg->mod_revision+1 },
                },
            } },
            { "success", json11::Json::array {
                json11::Json::object {
                    { "request_delete_range", json11::Json::object {
                        { "key", cur_cfg_key },
                    } },
                },
                json11::Json::object {
                    { "request_delete_range", json11::Json::object {
                        { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+cur_name) },
                    } },
                },
            } },
        }, [this, cur, cur_name](std::string err, json11::Json res)
        {
            parent->waiting--;
            if (err != "")
            {
                result = (cli_result_t){ .err = EIO, .text = "Error deleting "+cur_name+": "+err };
                state = 100;
                return;
            }
            if (!res["succeeded"].bool_value())
            {
                result = (cli_result_t){ .err = EAGAIN, .text = "Layer "+cur_name+" was modified during deletion" };
                state = 100;
                return;
            }
            // Modify inode_config for library users to be able to take it from there immediately
            parent->cli->st_cli.inode_by_name.erase(cur_name);
            parent->cli->st_cli.inode_config.erase(cur);
            if (parent->progress)
                printf("Layer %s deleted\n", cur_name.c_str());
            parent->ringloop->wakeup();
        });
    }

    void start_merge_child(inode_t child_inode, inode_t target_inode)
    {
        auto child_it = parent->cli->st_cli.inode_config.find(child_inode);
        if (child_it == parent->cli->st_cli.inode_config.end())
        {
            char buf[1024];
            snprintf(buf, 1024, "Inode 0x%lx disappeared", child_inode);
            result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
            state = 100;
            return;
        }
        auto target_it = parent->cli->st_cli.inode_config.find(target_inode);
        if (target_it == parent->cli->st_cli.inode_config.end())
        {
            char buf[1024];
            snprintf(buf, 1024, "Inode 0x%lx disappeared", target_inode);
            result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
            state = 100;
            return;
        }
        cb = parent->start_merge(json11::Json::object {
            { "from", from_name },
            { "to", child_it->second.name },
            { "target", target_it->second.name },
            { "delete-source", false },
            { "cas", use_cas },
            { "fsync-interval", fsync_interval },
        });
    }

    void start_delete_source(inode_t inode)
    {
        auto source = parent->cli->st_cli.inode_config.find(inode);
        if (source == parent->cli->st_cli.inode_config.end())
        {
            char buf[1024];
            snprintf(buf, 1024, "Inode 0x%lx disappeared", inode);
            result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
            state = 100;
            return;
        }
        cb = parent->start_rm_data(json11::Json::object {
            { "inode", inode },
            { "pool", (uint64_t)INODE_POOL(inode) },
            { "fsync-interval", fsync_interval },
        });
    }
};

std::function<bool(void)> cli_tool_t::start_rm(json11::Json cfg)
std::function<bool(cli_result_t &)> cli_tool_t::start_rm(json11::Json cfg)
{
    auto remover = new rm_inode_t();
    remover->parent = this;
    remover->inode = cfg["inode"].uint64_value();
    remover->pool_id = cfg["pool"].uint64_value();
    if (remover->pool_id)
    auto snap_remover = new snap_remover_t();
    snap_remover->parent = this;
    snap_remover->from_name = cfg["from"].string_value();
    snap_remover->to_name = cfg["to"].string_value();
    snap_remover->fsync_interval = cfg["fsync-interval"].uint64_value();
    if (!snap_remover->fsync_interval)
        snap_remover->fsync_interval = 128;
    if (!cfg["cas"].is_null())
        snap_remover->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
    if (!cfg["writers_stopped"].is_null())
        snap_remover->writers_stopped = true;
    return [snap_remover](cli_result_t & result)
    {
        remover->inode = (remover->inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (((uint64_t)remover->pool_id) << (64-POOL_ID_BITS));
    }
    remover->pool_id = INODE_POOL(remover->inode);
    if (!remover->pool_id)
    {
        fprintf(stderr, "pool is missing\n");
        exit(1);
    }
    remover->min_offset = cfg["min-offset"].uint64_value();
    return [remover]()
    {
        if (remover->loop())
        snap_remover->loop();
        if (snap_remover->is_done())
        {
            delete remover;
            result = snap_remover->result;
            delete snap_remover;
            return true;
        }
        return false;
src/cli_rm_data.cpp (new file, 232 lines)
@@ -0,0 +1,232 @@

// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)

#include "cli.h"
#include "cluster_client.h"

#define RM_LISTING 1
#define RM_REMOVING 2
#define RM_END 3

struct rm_pg_t
{
    pg_num_t pg_num;
    osd_num_t rm_osd_num;
    std::set<object_id> objects;
    std::set<object_id>::iterator obj_pos;
    uint64_t obj_count = 0, obj_done = 0;
    int state = 0;
    int in_flight = 0;
};

struct rm_inode_t
{
    uint64_t inode = 0;
    pool_id_t pool_id = 0;
    uint64_t min_offset = 0;

    cli_tool_t *parent = NULL;
    inode_list_t *lister = NULL;
    std::vector<rm_pg_t*> lists;
    uint64_t total_count = 0, total_done = 0, total_prev_pct = 0;
    uint64_t pgs_to_list = 0;
    bool lists_done = false;
    int state = 0;
    int error_count = 0;

    cli_result_t result;

    void start_delete()
    {
        lister = parent->cli->list_inode_start(inode, [this](inode_list_t *lst,
            std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)
        {
            rm_pg_t *rm = new rm_pg_t((rm_pg_t){
                .pg_num = pg_num,
                .rm_osd_num = primary_osd,
                .objects = objects,
                .obj_count = objects.size(),
                .obj_done = 0,
            });
            if (min_offset == 0)
            {
                total_count += objects.size();
            }
            else
            {
                for (object_id oid: objects)
                {
                    if (oid.stripe >= min_offset)
                    {
                        total_count++;
                    }
                }
            }
            rm->obj_pos = rm->objects.begin();
            lists.push_back(rm);
            if (parent->list_first)
            {
                parent->cli->list_inode_next(lister, 1);
            }
            if (status & INODE_LIST_DONE)
            {
                lists_done = true;
            }
            pgs_to_list--;
            continue_delete();
        });
        if (!lister)
        {
            result = (cli_result_t){
                .err = EIO,
                .text = "Failed to list objects of inode "+std::to_string(INODE_NO_POOL(inode))+
                    " from pool "+std::to_string(INODE_POOL(inode)),
            };
            state = 100;
            return;
        }
        pgs_to_list = parent->cli->list_pg_count(lister);
        parent->cli->list_inode_next(lister, parent->parallel_osds);
    }

    void send_ops(rm_pg_t *cur_list)
    {
        if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
            parent->cli->msgr.osd_peer_fds.end())
        {
            // Initiate connection
            parent->cli->msgr.connect_peer(cur_list->rm_osd_num, parent->cli->st_cli.peer_states[cur_list->rm_osd_num]);
            return;
        }
        while (cur_list->in_flight < parent->iodepth && cur_list->obj_pos != cur_list->objects.end())
        {
            if (cur_list->obj_pos->stripe >= min_offset)
            {
                osd_op_t *op = new osd_op_t();
                op->op_type = OSD_OP_OUT;
                // Already checked that it exists above, but anyway
                op->peer_fd = parent->cli->msgr.osd_peer_fds.at(cur_list->rm_osd_num);
                op->req = (osd_any_op_t){
                    .rw = {
                        .header = {
                            .magic = SECONDARY_OSD_OP_MAGIC,
                            .id = parent->cli->next_op_id(),
                            .opcode = OSD_OP_DELETE,
                        },
                        .inode = cur_list->obj_pos->inode,
                        .offset = cur_list->obj_pos->stripe,
                        .len = 0,
                    },
                };
                op->callback = [this, cur_list](osd_op_t *op)
                {
                    cur_list->in_flight--;
                    if (op->reply.hdr.retval < 0)
                    {
                        fprintf(stderr, "Failed to remove object %lx:%lx from PG %u (OSD %lu) (retval=%ld)\n",
                            op->req.rw.inode, op->req.rw.offset,
                            cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
                        error_count++;
                    }
                    delete op;
                    cur_list->obj_done++;
                    total_done++;
                    continue_delete();
                };
                cur_list->in_flight++;
                parent->cli->msgr.outbox_push(op);
            }
            cur_list->obj_pos++;
        }
    }

    void continue_delete()
    {
        if (parent->list_first && !lists_done)
        {
            return;
        }
        for (int i = 0; i < lists.size(); i++)
        {
            if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end())
            {
                delete lists[i];
                lists.erase(lists.begin()+i, lists.begin()+i+1);
                i--;
                if (!lists_done)
                {
                    parent->cli->list_inode_next(lister, 1);
                }
            }
            else
            {
                send_ops(lists[i]);
            }
        }
        if (parent->progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
        {
            printf("\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
            total_prev_pct = total_done*1000/total_count;
        }
        if (lists_done && !lists.size())
        {
            result = (cli_result_t){
                .err = error_count > 0 ? EIO : 0,
                .text = error_count > 0 ? "Some blocks were not removed" : (
                    "Done, inode "+std::to_string(INODE_NO_POOL(inode))+" from pool "+
                    std::to_string(pool_id)+" removed"),
            };
            state = 100;
        }
    }

    bool is_done()
    {
        return state == 100;
    }

    void loop()
    {
        if (state == 1)
            goto resume_1;
        if (state == 100)
            return;
        if (!pool_id)
        {
            result = (cli_result_t){ .err = EINVAL, .text = "Pool is not specified" };
            state = 100;
            return;
        }
        start_delete();
        if (state == 100)
            return;
        state = 1;
    resume_1:
        continue_delete();
    }
};

std::function<bool(cli_result_t &)> cli_tool_t::start_rm_data(json11::Json cfg)
{
    auto remover = new rm_inode_t();
    remover->parent = this;
    remover->inode = cfg["inode"].uint64_value();
    remover->pool_id = cfg["pool"].uint64_value();
    if (remover->pool_id)
    {
        remover->inode = (remover->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (((uint64_t)remover->pool_id) << (64-POOL_ID_BITS));
    }
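    // Inode numbers carry the pool ID in their upper POOL_ID_BITS bits, so the
    // branch above masks any pool bits out of cfg["inode"] and ORs in the pool
    // from cfg["pool"]. Illustrative example with POOL_ID_BITS = 16:
    // pool 2, inode 5 -> 0x0002000000000005.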
    remover->pool_id = INODE_POOL(remover->inode);
    remover->min_offset = cfg["min-offset"].uint64_value();
    return [remover](cli_result_t & result)
    {
        remover->loop();
        if (remover->is_done())
        {
            result = remover->result;
            delete remover;
            return true;
        }
        return false;
    };
}
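
Like the other start_*() factories touched by this diff, start_rm_data() returns a std::function<bool(cli_result_t &)> that must be polled until it reports completion. A minimal driver sketch (illustrative only: `cli` stands for a cli_tool_t instance and the io_uring loop integration is simplified):

    cli_result_t result;
    auto action = cli->start_rm_data(json11::Json::object {
        { "inode", (uint64_t)5 },
        { "pool", (uint64_t)1 },
    });
    while (!action(result))
    {
        // let etcd/OSD callbacks make progress before polling again
        cli->ringloop->loop();
        cli->ringloop->wait();
    }
    if (result.err)
        fprintf(stderr, "%s\n", result.text.c_str());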

@@ -11,9 +11,9 @@
#include <sys/stat.h>

// Calculate offsets for a block device and print OSD command line parameters
std::function<bool(void)> cli_tool_t::simple_offsets(json11::Json cfg)
std::function<bool(cli_result_t &)> cli_tool_t::simple_offsets(json11::Json cfg)
{
    std::string device = cfg["command"][1].string_value();
    std::string device = cfg["device"].string_value();
    uint64_t object_size = parse_size(cfg["object_size"].string_value());
    uint64_t bitmap_granularity = parse_size(cfg["bitmap_granularity"].string_value());
    uint64_t journal_size = parse_size(cfg["journal_size"].string_value());

@@ -1,568 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)

#include <fcntl.h>
#include "cli.h"
#include "cluster_client.h"
#include "base64.h"

// Remove layer(s): similar to merge, but alters metadata and processes multiple merge targets
//
// Exactly one child of the requested layers may be merged using the "inverted" workflow,
// where we merge it "down" into one of the "to-be-removed" layers and then rename the
// "to-be-removed" layer to the child. It may be done either if all writers are stopped
// before trying to delete layers (which is signaled by --writers-stopped) or if that child
// is a read-only layer (snapshot) itself.
//
// This "inverted" workflow trades copying data of one of the deleted layers for copying
// data of one child of the chain which is also a child of the "traded" layer. So we
// choose the (parent,child) pair which has the largest difference between "parent" and
// "child" inode sizes.
//
// All other children of the chain are processed by iterating through them, merging removed
// parents into them and rebasing them to the last layer which isn't a member of the removed
// chain.
//
// Example:
//
// <parent> - <from> - <layer 2> - <to> - <child 1>
//                \          \        \- <child 2>
//                 \          \- <child 3>
//                  \- <child 4>
//
// 1) Find optimal pair for the "reverse" scenario
//    Imagine that it's (<layer 2>, <child 1>) in this example
// 2) Process all children except <child 1>:
//    - Merge <from>..<to> to <child 2>
//    - Set <child 2> parent to <parent>
//    - Repeat for others
// 3) Process <child 1>:
//    - Merge <from>..<child 1> to <layer 2>
//    - Set <layer 2> parent to <parent>
//    - Rename <layer 2> to <child 1>
// 4) Delete other layers of the chain (<from>, <to>)
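//
// To make the trade-off concrete (sizes are illustrative, not from the source):
// if <layer 2> holds 40 GB and <child 1> only 10 GB, the inverted workflow
// copies 10 GB of <child 1> into <layer 2> and renames it, instead of merging
// 40 GB of <layer 2> down into <child 1>. This is why choose_inverse_candidate()
// below maximizes parent_used - child_used.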
struct snap_remover_t
{
    cli_tool_t *parent;

    // remove from..to
    std::string from_name, to_name;
    // writers are stopped, we can safely change writable layers
    bool writers_stopped = false;
    // use CAS writes (0 = never, 1 = auto, 2 = always)
    int use_cas = 1;
    // interval between fsyncs
    int fsync_interval = 128;

    std::map<inode_t,int> sources;
    std::map<inode_t,uint64_t> inode_used;
    std::vector<inode_t> merge_children;
    std::vector<inode_t> chain_list;
    std::map<inode_t,int> inverse_candidates;
    inode_t inverse_parent = 0, inverse_child = 0;
    inode_t new_parent = 0;
    int state = 0;
    int current_child = 0;
    std::function<bool(void)> cb;

    bool is_done()
    {
        return state == 9;
    }

    void loop()
    {
        if (state == 1)
            goto resume_1;
        else if (state == 2)
            goto resume_2;
        else if (state == 3)
            goto resume_3;
        else if (state == 4)
            goto resume_4;
        else if (state == 5)
            goto resume_5;
        else if (state == 6)
            goto resume_6;
        else if (state == 7)
            goto resume_7;
        else if (state == 8)
            goto resume_8;
        else if (state == 9)
            goto resume_9;
        // Get children to merge
        get_merge_children();
        // Try to select an inode for the "inverse" optimized scenario
        // Read statistics from etcd to do it
        read_stats();
        state = 1;
    resume_1:
        if (parent->waiting > 0)
            return;
        choose_inverse_candidate();
        // Merge children one by one, except our "inverse" child
        for (current_child = 0; current_child < merge_children.size(); current_child++)
        {
            if (merge_children[current_child] == inverse_child)
                continue;
            start_merge_child(merge_children[current_child], merge_children[current_child]);
    resume_2:
            while (!cb())
            {
                state = 2;
                return;
            }
            cb = NULL;
            parent->change_parent(merge_children[current_child], new_parent);
            state = 3;
    resume_3:
            if (parent->waiting > 0)
                return;
        }
        // Merge our "inverse" child into our "inverse" parent
        if (inverse_child != 0)
        {
            start_merge_child(inverse_child, inverse_parent);
    resume_4:
            while (!cb())
            {
                state = 4;
                return;
            }
            cb = NULL;
            // Delete "inverse" child data
            start_delete_source(inverse_child);
    resume_5:
            while (!cb())
            {
                state = 5;
                return;
            }
            cb = NULL;
            // Delete "inverse" child metadata, rename parent over it,
            // and also change parent links of the previous "inverse" child
            rename_inverse_parent();
            state = 6;
    resume_6:
            if (parent->waiting > 0)
                return;
        }
        // Delete parents, except the "inverse" one
        for (current_child = 0; current_child < chain_list.size(); current_child++)
        {
            if (chain_list[current_child] == inverse_parent)
                continue;
            start_delete_source(chain_list[current_child]);
    resume_7:
            while (!cb())
            {
                state = 7;
                return;
            }
            cb = NULL;
            delete_inode_config(chain_list[current_child]);
            state = 8;
    resume_8:
            if (parent->waiting > 0)
                return;
        }
        state = 9;
    resume_9:
        // Done
        return;
    }

    void get_merge_children()
    {
        // Get all children of from..to
        inode_config_t *from_cfg = parent->get_inode_cfg(from_name);
        inode_config_t *to_cfg = parent->get_inode_cfg(to_name);
        // Check that to_cfg is actually a child of from_cfg
        // FIXME de-copypaste the following piece of code with snap_merger_t
        inode_config_t *cur = to_cfg;
        chain_list.push_back(cur->num);
        while (cur->num != from_cfg->num && cur->parent_id != 0)
        {
            auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
            if (it == parent->cli->st_cli.inode_config.end())
            {
                fprintf(stderr, "Parent inode of layer %s (id %ld) not found\n", cur->name.c_str(), cur->parent_id);
                exit(1);
            }
            cur = &it->second;
            chain_list.push_back(cur->num);
        }
        if (cur->num != from_cfg->num)
        {
            fprintf(stderr, "Layer %s is not a child of %s\n", to_name.c_str(), from_name.c_str());
            exit(1);
        }
        new_parent = from_cfg->parent_id;
        // Calculate ranks
        int i = chain_list.size()-1;
        for (inode_t item: chain_list)
        {
            sources[item] = i--;
        }
        for (auto & ic: parent->cli->st_cli.inode_config)
        {
            if (!ic.second.parent_id)
            {
                continue;
            }
            auto it = sources.find(ic.second.parent_id);
            if (it != sources.end() && sources.find(ic.second.num) == sources.end())
            {
                merge_children.push_back(ic.second.num);
                if (ic.second.readonly || writers_stopped)
                {
                    inverse_candidates[ic.second.num] = it->second;
                }
            }
        }
    }

    void read_stats()
    {
        if (inverse_candidates.size() == 0)
        {
            return;
        }
        json11::Json::array reads;
        for (auto cp: inverse_candidates)
        {
            inode_t inode = cp.first;
            reads.push_back(json11::Json::object {
                { "request_range", json11::Json::object {
                    { "key", base64_encode(
                        parent->cli->st_cli.etcd_prefix+
                        "/inode/stats/"+std::to_string(INODE_POOL(inode))+
                        "/"+std::to_string(INODE_NO_POOL(inode))
                    ) },
                } }
            });
        }
        for (auto cp: sources)
        {
            inode_t inode = cp.first;
            reads.push_back(json11::Json::object {
                { "request_range", json11::Json::object {
                    { "key", base64_encode(
                        parent->cli->st_cli.etcd_prefix+
                        "/inode/stats/"+std::to_string(INODE_POOL(inode))+
                        "/"+std::to_string(INODE_NO_POOL(inode))
                    ) },
                } }
            });
        }
        parent->waiting++;
        parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
            { "success", reads },
        }, [this](std::string err, json11::Json data)
        {
            parent->waiting--;
            if (err != "")
            {
                fprintf(stderr, "Error reading layer statistics from etcd: %s\n", err.c_str());
                exit(1);
            }
            for (auto inode_result: data["responses"].array_items())
            {
                auto kv = parent->cli->st_cli.parse_etcd_kv(inode_result["kvs"][0]);
                pool_id_t pool_id = 0;
                inode_t inode = 0;
                char null_byte = 0;
                sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode, &null_byte);
                if (!inode || null_byte != 0)
                {
                    fprintf(stderr, "Bad key returned from etcd: %s\n", kv.key.c_str());
                    exit(1);
                }
                auto pool_cfg_it = parent->cli->st_cli.pool_config.find(pool_id);
                if (pool_cfg_it == parent->cli->st_cli.pool_config.end())
                {
                    fprintf(stderr, "Pool %u does not exist\n", pool_id);
                    exit(1);
                }
                inode = INODE_WITH_POOL(pool_id, inode);
                auto & pool_cfg = pool_cfg_it->second;
                uint64_t used_bytes = kv.value["raw_used"].uint64_value() / pool_cfg.pg_size;
                if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
                {
                    used_bytes *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
                }
                inode_used[inode] = used_bytes;
            }
            parent->ringloop->wakeup();
        });
    }

    void choose_inverse_candidate()
    {
        uint64_t max_diff = 0;
        for (auto cp: inverse_candidates)
        {
            inode_t child = cp.first;
            uint64_t child_used = inode_used[child];
            int rank = cp.second;
            for (int i = chain_list.size()-rank; i < chain_list.size(); i++)
            {
                inode_t parent = chain_list[i];
                uint64_t parent_used = inode_used[parent];
                if (parent_used > child_used && (!max_diff || max_diff < (parent_used-child_used)))
                {
                    max_diff = (parent_used-child_used);
                    inverse_parent = parent;
                    inverse_child = child;
                }
            }
        }
    }

    void rename_inverse_parent()
    {
        auto child_it = parent->cli->st_cli.inode_config.find(inverse_child);
        if (child_it == parent->cli->st_cli.inode_config.end())
        {
            fprintf(stderr, "Inode %ld disappeared\n", inverse_child);
            exit(1);
        }
        auto target_it = parent->cli->st_cli.inode_config.find(inverse_parent);
        if (target_it == parent->cli->st_cli.inode_config.end())
        {
            fprintf(stderr, "Inode %ld disappeared\n", inverse_parent);
            exit(1);
        }
        inode_config_t *child_cfg = &child_it->second;
        inode_config_t *target_cfg = &target_it->second;
        std::string child_name = child_cfg->name;
        std::string target_name = target_cfg->name;
        std::string child_cfg_key = base64_encode(
            parent->cli->st_cli.etcd_prefix+
            "/config/inode/"+std::to_string(INODE_POOL(inverse_child))+
            "/"+std::to_string(INODE_NO_POOL(inverse_child))
        );
        std::string target_cfg_key = base64_encode(
            parent->cli->st_cli.etcd_prefix+
            "/config/inode/"+std::to_string(INODE_POOL(inverse_parent))+
            "/"+std::to_string(INODE_NO_POOL(inverse_parent))
        );
        // Fill new configuration
        inode_config_t new_cfg = *child_cfg;
        new_cfg.num = target_cfg->num;
        new_cfg.parent_id = new_parent;
        json11::Json::array cmp = json11::Json::array {
            json11::Json::object {
                { "target", "MOD" },
                { "key", child_cfg_key },
                { "result", "LESS" },
                { "mod_revision", child_cfg->mod_revision+1 },
            },
            json11::Json::object {
                { "target", "MOD" },
                { "key", target_cfg_key },
                { "result", "LESS" },
                { "mod_revision", target_cfg->mod_revision+1 },
            },
        };
        json11::Json::array txn = json11::Json::array {
            json11::Json::object {
                { "request_delete_range", json11::Json::object {
                    { "key", child_cfg_key },
                } },
            },
            json11::Json::object {
                { "request_put", json11::Json::object {
                    { "key", target_cfg_key },
                    { "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&new_cfg)).dump()) },
                } },
            },
            json11::Json::object {
                { "request_put", json11::Json::object {
                    { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+child_cfg->name) },
                    { "value", base64_encode(json11::Json({
                        { "id", INODE_NO_POOL(inverse_parent) },
                        { "pool_id", (uint64_t)INODE_POOL(inverse_parent) },
                    }).dump()) },
                } },
            },
        };
        // Reparent children of inverse_child
        for (auto & cp: parent->cli->st_cli.inode_config)
        {
            if (cp.second.parent_id == child_cfg->num)
            {
                auto cp_cfg = cp.second;
                cp_cfg.parent_id = inverse_parent;
                auto cp_key = base64_encode(
                    parent->cli->st_cli.etcd_prefix+
                    "/config/inode/"+std::to_string(INODE_POOL(cp.second.num))+
                    "/"+std::to_string(INODE_NO_POOL(cp.second.num))
                );
                cmp.push_back(json11::Json::object {
                    { "target", "MOD" },
                    { "key", cp_key },
                    { "result", "LESS" },
                    { "mod_revision", cp.second.mod_revision+1 },
                });
                txn.push_back(json11::Json::object {
                    { "request_put", json11::Json::object {
                        { "key", cp_key },
                        { "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&cp_cfg)).dump()) },
                    } },
                });
            }
        }
        parent->waiting++;
        parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
            { "compare", cmp },
            { "success", txn },
        }, [this, target_name, child_name](std::string err, json11::Json res)
        {
            parent->waiting--;
            if (err != "")
            {
                fprintf(stderr, "Error renaming %s to %s: %s\n", target_name.c_str(), child_name.c_str(), err.c_str());
                exit(1);
            }
            if (!res["succeeded"].bool_value())
            {
                fprintf(
                    stderr, "Parent (%s), child (%s), or one of its children"
                    " configuration was modified during rename\n", target_name.c_str(), child_name.c_str()
                );
                exit(1);
            }
            printf("Layer %s renamed to %s\n", target_name.c_str(), child_name.c_str());
            parent->ringloop->wakeup();
        });
    }

    void delete_inode_config(inode_t cur)
    {
        auto cur_cfg_it = parent->cli->st_cli.inode_config.find(cur);
        if (cur_cfg_it == parent->cli->st_cli.inode_config.end())
        {
            fprintf(stderr, "Inode 0x%lx disappeared\n", cur);
            exit(1);
        }
        inode_config_t *cur_cfg = &cur_cfg_it->second;
        std::string cur_name = cur_cfg->name;
        std::string cur_cfg_key = base64_encode(
            parent->cli->st_cli.etcd_prefix+
            "/config/inode/"+std::to_string(INODE_POOL(cur))+
            "/"+std::to_string(INODE_NO_POOL(cur))
        );
        parent->waiting++;
        parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
            { "compare", json11::Json::array {
                json11::Json::object {
                    { "target", "MOD" },
                    { "key", cur_cfg_key },
                    { "result", "LESS" },
                    { "mod_revision", cur_cfg->mod_revision+1 },
                },
            } },
            { "success", json11::Json::array {
                json11::Json::object {
                    { "request_delete_range", json11::Json::object {
                        { "key", cur_cfg_key },
                    } },
                },
                json11::Json::object {
                    { "request_delete_range", json11::Json::object {
                        { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+cur_name) },
                    } },
                },
            } },
        }, [this, cur_name](std::string err, json11::Json res)
        {
            parent->waiting--;
            if (err != "")
            {
                fprintf(stderr, "Error deleting %s: %s\n", cur_name.c_str(), err.c_str());
                exit(1);
            }
            if (!res["succeeded"].bool_value())
            {
                fprintf(stderr, "Layer %s configuration was modified during deletion\n", cur_name.c_str());
                exit(1);
            }
            printf("Layer %s deleted\n", cur_name.c_str());
            parent->ringloop->wakeup();
        });
    }

    void start_merge_child(inode_t child_inode, inode_t target_inode)
    {
        auto child_it = parent->cli->st_cli.inode_config.find(child_inode);
        if (child_it == parent->cli->st_cli.inode_config.end())
        {
            fprintf(stderr, "Inode %ld disappeared\n", child_inode);
            exit(1);
        }
        auto target_it = parent->cli->st_cli.inode_config.find(target_inode);
        if (target_it == parent->cli->st_cli.inode_config.end())
        {
            fprintf(stderr, "Inode %ld disappeared\n", target_inode);
            exit(1);
        }
        cb = parent->start_merge(json11::Json::object {
            { "command", json11::Json::array{ "merge-data", from_name, child_it->second.name } },
            { "target", target_it->second.name },
            { "delete-source", false },
            { "cas", use_cas },
            { "fsync-interval", fsync_interval },
        });
    }

    void start_delete_source(inode_t inode)
    {
        auto source = parent->cli->st_cli.inode_config.find(inode);
        if (source == parent->cli->st_cli.inode_config.end())
        {
            fprintf(stderr, "Inode %ld disappeared\n", inode);
            exit(1);
        }
        cb = parent->start_rm(json11::Json::object {
            { "inode", inode },
            { "pool", (uint64_t)INODE_POOL(inode) },
            { "fsync-interval", fsync_interval },
        });
    }
};

std::function<bool(void)> cli_tool_t::start_snap_rm(json11::Json cfg)
{
    json11::Json::array cmd = cfg["command"].array_items();
    auto snap_remover = new snap_remover_t();
    snap_remover->parent = this;
    snap_remover->from_name = cmd.size() > 1 ? cmd[1].string_value() : "";
    snap_remover->to_name = cmd.size() > 2 ? cmd[2].string_value() : "";
    if (snap_remover->from_name == "")
    {
        fprintf(stderr, "Layer to remove argument is missing\n");
        exit(1);
    }
    if (snap_remover->to_name == "")
    {
        snap_remover->to_name = snap_remover->from_name;
    }
    snap_remover->fsync_interval = cfg["fsync-interval"].uint64_value();
    if (!snap_remover->fsync_interval)
        snap_remover->fsync_interval = 128;
    if (!cfg["cas"].is_null())
        snap_remover->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
    if (!cfg["writers_stopped"].is_null())
        snap_remover->writers_stopped = true;
    return [snap_remover]()
    {
        snap_remover->loop();
        if (snap_remover->is_done())
        {
            delete snap_remover;
            return true;
        }
        return false;
    };
}

src/cli_status.cpp (new file, 301 lines)
@@ -0,0 +1,301 @@

// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)

#include "cli.h"
#include "cluster_client.h"
#include "base64.h"
#include "pg_states.h"

// Print cluster status:
// etcd, mon, osd states
// raw/used space, object states, pool states, pg states
// client io, recovery io, rebalance io
struct status_printer_t
{
    cli_tool_t *parent;

    int state = 0;
    json11::Json::array mon_members, osd_stats;
    json11::Json agg_stats;
    std::map<pool_id_t, json11::Json::object> pool_stats;
    json11::Json::array etcd_states;

    bool is_done()
    {
        return state == 100;
    }

    void loop()
    {
        if (state == 1)
            goto resume_1;
        else if (state == 2)
            goto resume_2;
        // etcd states
        {
            auto addrs = parent->cli->st_cli.get_addresses();
            etcd_states.resize(addrs.size());
            for (int i = 0; i < etcd_states.size(); i++)
            {
                parent->waiting++;
                parent->cli->st_cli.etcd_call_oneshot(
                    addrs[i], "/maintenance/status", json11::Json::object(),
                    parent->cli->st_cli.etcd_quick_timeout, [this, i](std::string err, json11::Json res)
                    {
                        parent->waiting--;
                        etcd_states[i] = err != "" ? json11::Json::object{ { "error", err } } : res;
                        parent->ringloop->wakeup();
                    }
                );
            }
        }
        state = 1;
    resume_1:
        if (parent->waiting > 0)
            return;
        // Monitors, OSD states
        parent->etcd_txn(json11::Json::object {
            { "success", json11::Json::array {
                json11::Json::object {
                    { "request_range", json11::Json::object {
                        { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/mon/") },
                        { "range_end", base64_encode(parent->cli->st_cli.etcd_prefix+"/mon0") },
                    } },
                },
                json11::Json::object {
                    { "request_range", json11::Json::object {
                        { "key", base64_encode(
                            parent->cli->st_cli.etcd_prefix+"/osd/stats/"
                        ) },
                        { "range_end", base64_encode(
                            parent->cli->st_cli.etcd_prefix+"/osd/stats0"
                        ) },
                    } },
                },
                json11::Json::object {
                    { "request_range", json11::Json::object {
                        { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/stats") },
                    } },
                },
            } },
        });
        state = 2;
    resume_2:
        if (parent->waiting > 0)
            return;
        if (parent->etcd_err.err)
        {
            fprintf(stderr, "%s\n", parent->etcd_err.text.c_str());
            state = 100;
            return;
        }
        mon_members = parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items();
        osd_stats = parent->etcd_result["responses"][1]["response_range"]["kvs"].array_items();
        if (parent->etcd_result["responses"][2]["response_range"]["kvs"].array_items().size() > 0)
        {
            agg_stats = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][2]["response_range"]["kvs"][0]).value;
        }
        int etcd_alive = 0;
        uint64_t etcd_db_size = 0;
        std::string etcd_detail;
        for (int i = 0; i < etcd_states.size(); i++)
        {
            if (etcd_states[i]["error"].is_null())
            {
                etcd_alive++;
                etcd_db_size = etcd_states[i]["dbSizeInUse"].uint64_value();
            }
        }
        int mon_count = 0;
        std::string mon_master;
        for (int i = 0; i < mon_members.size(); i++)
        {
            auto kv = parent->cli->st_cli.parse_etcd_kv(mon_members[i]);
            kv.key = kv.key.substr(parent->cli->st_cli.etcd_prefix.size());
            if (kv.key.substr(0, 12) == "/mon/member/")
                mon_count++;
            else if (kv.key == "/mon/master")
            {
                if (kv.value["hostname"].is_string())
                    mon_master = kv.value["hostname"].string_value();
                else
                    mon_master = kv.value["ip"][0].string_value();
            }
        }
        int osd_count = 0, osd_up = 0;
        uint64_t total_raw = 0, free_raw = 0, free_down_raw = 0, down_raw = 0;
        for (int i = 0; i < osd_stats.size(); i++)
        {
            auto kv = parent->cli->st_cli.parse_etcd_kv(osd_stats[i]);
            osd_num_t stat_osd_num = 0;
            char null_byte = 0;
            sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.size(), "/osd/stats/%lu%c", &stat_osd_num, &null_byte);
            if (!stat_osd_num || null_byte != 0)
            {
                fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
                continue;
            }
            osd_count++;
            total_raw += kv.value["size"].uint64_value();
            free_raw += kv.value["free"].uint64_value();
            auto peer_it = parent->cli->st_cli.peer_states.find(stat_osd_num);
            if (peer_it != parent->cli->st_cli.peer_states.end())
            {
                osd_up++;
            }
            else
            {
                down_raw += kv.value["size"].uint64_value();
                free_down_raw += kv.value["size"].uint64_value();
            }
        }
        int pool_count = 0, pools_active = 0;
        std::map<std::string, int> pgs_by_state;
        std::string pgs_by_state_str;
        for (auto & pool_pair: parent->cli->st_cli.pool_config)
        {
            auto & pool_cfg = pool_pair.second;
            bool active = true;
            if (pool_cfg.pg_config.size() != pool_cfg.pg_count)
            {
                active = false;
                pgs_by_state["offline"] += pool_cfg.pg_count-pool_cfg.pg_config.size();
            }
            pool_count++;
            for (auto pg_it = pool_cfg.pg_config.begin(); pg_it != pool_cfg.pg_config.end(); pg_it++)
            {
                if (!(pg_it->second.cur_state & PG_ACTIVE))
                {
                    active = false;
                }
                std::string pg_state_str;
                for (int i = 0; i < pg_state_bit_count; i++)
                {
                    if (pg_it->second.cur_state & pg_state_bits[i])
                    {
                        pg_state_str += "+";
                        pg_state_str += pg_state_names[i];
                    }
                }
                if (pg_state_str.size())
                    pgs_by_state[pg_state_str.substr(1)]++;
                else
                    pgs_by_state["offline"]++;
            }
            if (active)
            {
                pools_active++;
            }
        }
        for (auto & kv: pgs_by_state)
        {
            if (pgs_by_state_str.size())
            {
                pgs_by_state_str += "\n ";
            }
            pgs_by_state_str += std::to_string(kv.second)+" "+kv.first;
        }
        uint64_t object_size = parent->cli->get_bs_block_size();
        std::string more_states;
        uint64_t obj_n;
        obj_n = agg_stats["object_counts"]["misplaced"].uint64_value();
        if (obj_n > 0)
            more_states += ", "+format_size(obj_n*object_size)+" misplaced";
        obj_n = agg_stats["object_counts"]["degraded"].uint64_value();
        if (obj_n > 0)
            more_states += ", "+format_size(obj_n*object_size)+" degraded";
        obj_n = agg_stats["object_counts"]["incomplete"].uint64_value();
        if (obj_n > 0)
            more_states += ", "+format_size(obj_n*object_size)+" incomplete";
        std::string recovery_io;
        {
            uint64_t deg_bps = agg_stats["recovery_stats"]["degraded"]["bps"].uint64_value();
            uint64_t deg_iops = agg_stats["recovery_stats"]["degraded"]["iops"].uint64_value();
            uint64_t misp_bps = agg_stats["recovery_stats"]["misplaced"]["bps"].uint64_value();
            uint64_t misp_iops = agg_stats["recovery_stats"]["misplaced"]["iops"].uint64_value();
            if (deg_iops > 0 || deg_bps > 0)
                recovery_io += " recovery: "+format_size(deg_bps)+"/s, "+format_size(deg_iops, true)+" op/s\n";
            if (misp_iops > 0 || misp_bps > 0)
                recovery_io += " rebalance: "+format_size(misp_bps)+"/s, "+format_size(misp_iops, true)+" op/s\n";
        }
        if (parent->json_output)
        {
            // JSON output
            printf("%s\n", json11::Json(json11::Json::object {
                { "etcd_alive", etcd_alive },
                { "etcd_count", (uint64_t)etcd_states.size() },
                { "etcd_db_size", etcd_db_size },
                { "mon_count", mon_count },
                { "mon_master", mon_master },
                { "osd_up", osd_up },
                { "osd_count", osd_count },
                { "total_raw", total_raw },
                { "free_raw", free_raw },
                { "down_raw", down_raw },
                { "free_down_raw", free_down_raw },
                { "clean_data", agg_stats["object_counts"]["clean"].uint64_value() * object_size },
                { "misplaced_data", agg_stats["object_counts"]["misplaced"].uint64_value() * object_size },
                { "degraded_data", agg_stats["object_counts"]["degraded"].uint64_value() * object_size },
                { "incomplete_data", agg_stats["object_counts"]["incomplete"].uint64_value() * object_size },
                { "pool_count", pool_count },
                { "active_pool_count", pools_active },
                { "pg_states", pgs_by_state },
                { "op_stats", agg_stats["op_stats"] },
                { "recovery_stats", agg_stats["recovery_stats"] },
                { "object_counts", agg_stats["object_counts"] },
            }).dump().c_str());
            state = 100;
            return;
        }
        printf(
            " cluster:\n"
            " etcd: %d / %ld up, %s database size\n"
            " mon: %d up%s\n"
            " osd: %d / %d up\n"
            " \n"
            " data:\n"
            " raw: %s used, %s / %s available%s\n"
            " state: %s clean%s\n"
            " pools: %d / %d active\n"
            " pgs: %s\n"
            " \n"
            " io:\n"
            " client:%s %s/s rd, %s op/s rd, %s/s wr, %s op/s wr\n"
            "%s",
            etcd_alive, etcd_states.size(), format_size(etcd_db_size).c_str(),
            mon_count, mon_master == "" ? "" : (", master "+mon_master).c_str(),
            osd_up, osd_count,
            format_size(total_raw-free_raw).c_str(),
            format_size(free_raw-free_down_raw).c_str(),
            format_size(total_raw-down_raw).c_str(),
            (down_raw > 0 ? (", "+format_size(down_raw)+" down").c_str() : ""),
            format_size(agg_stats["object_counts"]["clean"].uint64_value() * object_size).c_str(), more_states.c_str(),
            pools_active, pool_count,
            pgs_by_state_str.c_str(),
            recovery_io.size() > 0 ? " " : "",
            format_size(agg_stats["op_stats"]["primary_read"]["bps"].uint64_value()).c_str(),
            format_size(agg_stats["op_stats"]["primary_read"]["iops"].uint64_value(), true).c_str(),
            format_size(agg_stats["op_stats"]["primary_write"]["bps"].uint64_value()).c_str(),
            format_size(agg_stats["op_stats"]["primary_write"]["iops"].uint64_value(), true).c_str(),
            recovery_io.c_str()
        );
        state = 100;
    }
};

std::function<bool(cli_result_t &)> cli_tool_t::start_status(json11::Json cfg)
{
    auto printer = new status_printer_t();
    printer->parent = this;
    return [printer](cli_result_t & result)
    {
        printer->loop();
        if (printer->is_done())
        {
            result = { .err = 0 };
            delete printer;
            return true;
        }
        return false;
    };
}

@@ -9,6 +9,7 @@
#define PART_SENT 1
#define PART_DONE 2
#define PART_ERROR 4
#define PART_RETRY 8
#define CACHE_DIRTY 1
#define CACHE_FLUSHING 2
#define CACHE_REPEATING 3
@@ -373,6 +374,11 @@ void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes)
    continue_ops();
}

bool cluster_client_t::get_immediate_commit()
{
    return immediate_commit;
}

void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd)
{
    if (msgr.wanted_peers.find(peer_osd) != msgr.wanted_peers.end())
@@ -670,14 +676,17 @@ resume_2:
        if (!try_send(op, i))
        {
            // We'll need to retry again
            op->up_wait = true;
            if (!retry_timeout_id)
            if (op->parts[i].flags & PART_RETRY)
            {
                retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
                op->up_wait = true;
                if (!retry_timeout_id)
                {
                    retry_timeout_id = 0;
                    continue_ops(true);
                });
                    retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
                    {
                        retry_timeout_id = 0;
                        continue_ops(true);
                    });
                }
            }
            op->state = 2;
        }
@@ -746,7 +755,7 @@ resume_3:
    {
        for (int i = 0; i < op->parts.size(); i++)
        {
            op->parts[i].flags = 0;
            op->parts[i].flags = PART_RETRY;
        }
        goto resume_2;
    }

@@ -118,6 +118,8 @@
    bool is_ready();
    void on_ready(std::function<void(void)> fn);

    bool get_immediate_commit();

    static void copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers);
    void continue_ops(bool up_retry = false);
    inode_list_t *list_inode_start(inode_t inode,
@@ -64,6 +64,42 @@ void etcd_state_client_t::etcd_txn_slow(json11::Json txn, std::function<void(std::string, json11::Json)> callback)
    etcd_call("/kv/txn", txn, etcd_slow_timeout, max_etcd_attempts, 0, callback);
}

std::vector<std::string> etcd_state_client_t::get_addresses()
{
    auto addrs = etcd_local;
    addrs.insert(addrs.end(), etcd_addresses.begin(), etcd_addresses.end());
    return addrs;
}

void etcd_state_client_t::etcd_call_oneshot(std::string etcd_address, std::string api, json11::Json payload,
    int timeout, std::function<void(std::string, json11::Json)> callback)
{
    std::string etcd_api_path;
    int pos = etcd_address.find('/');
    if (pos >= 0)
    {
        etcd_api_path = etcd_address.substr(pos);
        etcd_address = etcd_address.substr(0, pos);
    }
    std::string req = payload.dump();
    req = "POST "+etcd_api_path+api+" HTTP/1.1\r\n"
        "Host: "+etcd_address+"\r\n"
        "Content-Type: application/json\r\n"
        "Content-Length: "+std::to_string(req.size())+"\r\n"
        "Connection: close\r\n"
        "\r\n"+req;
    auto http_cli = http_init(tfd);
    auto cb = [http_cli, callback](const http_response_t *response)
    {
        std::string err;
        json11::Json data;
        response->parse_json_response(err, data);
        callback(err, data);
        http_close(http_cli);
    };
    http_request(http_cli, etcd_address, req, { .timeout = timeout }, cb);
}
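// For reference, the request assembled above looks roughly like this on the
// wire (host and payload are illustrative):
//
//     POST /v3/maintenance/status HTTP/1.1
//     Host: 192.168.1.10:2379
//     Content-Type: application/json
//     Content-Length: 2
//     Connection: close
//
//     {}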

void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int timeout,
    int retries, int interval, std::function<void(std::string, json11::Json)> callback)
{
@@ -302,9 +338,14 @@ void etcd_state_client_t::start_etcd_watcher()
    {
        if (data["result"]["created"].bool_value())
        {
            if (etcd_watches_initialised == 3 && this->log_level > 0)
            uint64_t watch_id = data["result"]["watch_id"].uint64_value();
            if (watch_id == ETCD_CONFIG_WATCH_ID ||
                watch_id == ETCD_PG_STATE_WATCH_ID ||
                watch_id == ETCD_PG_HISTORY_WATCH_ID ||
                watch_id == ETCD_OSD_STATE_WATCH_ID)
                etcd_watches_initialised++;
            if (etcd_watches_initialised == 4 && this->log_level > 0)
                fprintf(stderr, "Successfully subscribed to etcd at %s\n", selected_etcd_address.c_str());
            etcd_watches_initialised++;
        }
        if (data["result"]["canceled"].bool_value())
        {
@@ -433,6 +474,10 @@ void etcd_state_client_t::start_etcd_watcher()
                { "progress_notify", true },
            } }
        }).dump());
        if (on_start_watcher_hook)
        {
            on_start_watcher_hook(etcd_watch_ws);
        }
        if (ws_keepalive_timer < 0)
        {
            ws_keepalive_timer = tfd->set_timer(etcd_ws_keepalive_interval*1000, true, [this](int)
@@ -918,6 +963,10 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
            }
            if (!value.is_object())
            {
                if (on_inode_change_hook != NULL)
                {
                    on_inode_change_hook(inode_num, true);
                }
                this->inode_config.erase(inode_num);
            }
            else
@@ -932,38 +981,47 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
                    {
                        fprintf(
                            stderr, "Inode %lu/%lu parent_pool value is invalid, ignoring parent setting\n",
                            inode_num >> (64-POOL_ID_BITS), inode_num & ((1l << (64-POOL_ID_BITS)) - 1)
                            inode_num >> (64-POOL_ID_BITS), inode_num & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)
                        );
                        parent_inode_num = 0;
                    }
                    else
                        parent_inode_num |= parent_pool_id << (64-POOL_ID_BITS);
                }
                inode_config_t cfg = (inode_config_t){
                insert_inode_config((inode_config_t){
                    .num = inode_num,
                    .name = value["name"].string_value(),
                    .size = value["size"].uint64_value(),
                    .parent_id = parent_inode_num,
                    .readonly = value["readonly"].bool_value(),
                    .meta = value["meta"],
                    .mod_revision = kv.mod_revision,
                };
                this->inode_config[inode_num] = cfg;
                if (cfg.name != "")
                {
                    this->inode_by_name[cfg.name] = inode_num;
                    for (auto w: watches)
                    {
                        if (w->name == value["name"].string_value())
                        {
                            w->cfg = cfg;
                        }
                    }
                }
                });
            }
        }
    }
}

void etcd_state_client_t::insert_inode_config(const inode_config_t & cfg)
{
    this->inode_config[cfg.num] = cfg;
    if (cfg.name != "")
    {
        this->inode_by_name[cfg.name] = cfg.num;
        for (auto w: watches)
        {
            if (w->name == cfg.name)
            {
                w->cfg = cfg;
            }
        }
    }
    if (on_inode_change_hook != NULL)
    {
        on_inode_change_hook(cfg.num, false);
    }
}

inode_watch_t* etcd_state_client_t::watch_inode(std::string name)
{
    inode_watch_t *watch = new inode_watch_t;
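`insert_inode_config()` centralizes bookkeeping that `parse_state()` previously inlined: it updates `inode_config`, the `inode_by_name` index and every registered watch, then fires `on_inode_change_hook`. A sketch of how a consumer could rely on that, with `st_cli` and the image name being illustrative:

```
// Hypothetical consumer: follow one image by name. watch_inode() keeps
// watch->cfg current because insert_inode_config() refreshes every
// registered watch whenever the inode's etcd key changes.
inode_watch_t *watch = st_cli.watch_inode("testimg");
st_cli.on_inode_change_hook = [watch](inode_t changed, bool deleted)
{
    if (deleted)
        fprintf(stderr, "inode 0x%lx was deleted\n", changed);
    else if (changed == watch->cfg.num)
        fprintf(stderr, "testimg is now %lu bytes\n", watch->cfg.size);
};
```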
@@ -1006,6 +1064,10 @@ json11::Json::object etcd_state_client_t::serialize_inode_cfg(inode_config_t *cfg)
    {
        new_cfg["readonly"] = true;
    }
    if (cfg->meta.is_object())
    {
        new_cfg["meta"] = cfg->meta;
    }
    return new_cfg;
}

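The new `meta` field rides along through `serialize_inode_cfg()`, so tools that rewrite an image's etcd key no longer drop user-attached metadata. A sketch of the round-trip, assuming the `INODE_WITH_POOL` pool/inode packing helper; all values are illustrative:

```
// Sketch: arbitrary "meta" survives serialize_inode_cfg(), so a tool
// that rewrites the image config (e.g. a rename) keeps it intact.
inode_config_t cfg = {};
cfg.num = INODE_WITH_POOL(1, 5); // pool 1, inode 5
cfg.name = "testimg";
cfg.size = 10737418240;
cfg.meta = json11::Json(json11::Json::object { { "backup", "daily" } });
json11::Json::object serialized = st_cli.serialize_inode_cfg(&cfg);
// serialized now contains {"name":"testimg","size":10737418240,"meta":{"backup":"daily"}}
```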
src/etcd_state_client.h

@@ -56,6 +56,8 @@ struct inode_config_t
    uint64_t size;
    inode_t parent_id;
    bool readonly;
    // Arbitrary metadata
    json11::Json meta;
    // Change revision of the metadata in etcd
    uint64_t mod_revision;
};
@@ -109,9 +111,13 @@ public:
    std::function<void(pool_id_t, pg_num_t)> on_change_pg_history_hook;
    std::function<void(osd_num_t)> on_change_osd_state_hook;
    std::function<void()> on_reload_hook;
    std::function<void(inode_t, bool)> on_inode_change_hook;
    std::function<void(http_co_t *)> on_start_watcher_hook;

    json11::Json::object serialize_inode_cfg(inode_config_t *cfg);
    etcd_kv_t parse_etcd_kv(const json11::Json & kv_json);
    std::vector<std::string> get_addresses();
    void etcd_call_oneshot(std::string etcd_address, std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback);
    void etcd_call(std::string api, json11::Json payload, int timeout, int retries, int interval, std::function<void(std::string, json11::Json)> callback);
    void etcd_txn(json11::Json txn, int timeout, int retries, int interval, std::function<void(std::string, json11::Json)> callback);
    void etcd_txn_slow(json11::Json txn, std::function<void(std::string, json11::Json)> callback);

@@ -120,6 +126,7 @@ public:
    void load_pgs();
    void parse_state(const etcd_kv_t & kv);
    void parse_config(const json11::Json & config);
    void insert_inode_config(const inode_config_t & cfg);
    inode_watch_t* watch_inode(std::string name);
    void close_watch(inode_watch_t* watch);
    int address_count();
src/fio_cluster.cpp

@@ -214,14 +214,14 @@ static int sec_setup(struct thread_data *td)

    if (!o->image)
    {
        if (!(o->inode & ((1l << (64-POOL_ID_BITS)) - 1)))
        if (!(o->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)))
        {
            td_verror(td, EINVAL, "inode number is missing");
            return 1;
        }
        if (o->pool)
        {
            o->inode = (o->inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (o->pool << (64-POOL_ID_BITS));
            o->inode = (o->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (o->pool << (64-POOL_ID_BITS));
        }
        if (!(o->inode >> (64-POOL_ID_BITS)))
        {
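`1l` has the width of `long`, which is 32 bits on 32-bit targets, so shifting it by `64-POOL_ID_BITS` (48, assuming vitastor's `POOL_ID_BITS` of 16) is undefined behaviour there and silently truncates the mask. The `(uint64_t)1` cast fixes the constant's width. A small worked example of the intended packing:

```
#include <cstdint>
#include <cstdio>

#define POOL_ID_BITS 16 // assumed to match vitastor's definition

int main()
{
    // On a platform where long is 32 bits, (1l << 48) is undefined
    // behaviour and typically yields a garbage or zero mask. The explicit
    // 64-bit constant always produces the intended lower-48-bit mask.
    uint64_t mask = ((uint64_t)1 << (64-POOL_ID_BITS)) - 1;
    uint64_t inode = ((uint64_t)7 << (64-POOL_ID_BITS)) | 123; // pool 7, inode 123
    printf("pool=%lu inode=%lu mask=0x%lx\n",
        inode >> (64-POOL_ID_BITS), inode & mask, mask);
    return 0;
}
```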
src/http_client.cpp

@@ -62,9 +62,10 @@ struct http_co_t
    void run_cb_and_clear();
    void start_connection();
    void close_connection();
    void next_request();
    void handle_events();
    void handle_connect_result();
    void submit_read();
    void submit_read(bool check_timeout);
    void submit_send();
    bool handle_read();
    void post_message(int type, const std::string & msg);
@@ -128,6 +129,7 @@ void http_co_t::run_cb_and_clear()
    // Call callback after clearing it because otherwise we may hit reenterability problems
    if (cb != NULL)
        cb(&parsed);
    next_request();
}

void http_co_t::send_request(const std::string & host, const std::string & request,

@@ -161,17 +163,6 @@ void http_co_t::send_request(const std::string & host, const std::string & request,
    this->sent = 0;
    this->response_callback = response_callback;
    this->parsed = {};
    if (request_timeout > 0)
    {
        timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
        {
            stackin();
            close_connection();
            parsed = { .error = "HTTP request timed out" };
            run_cb_and_clear();
            stackout();
        });
    }
    if (state == HTTP_CO_KEEPALIVE)
    {
        state = HTTP_CO_SENDING_REQUEST;

@@ -181,6 +172,28 @@ void http_co_t::send_request(const std::string & host, const std::string & request,
    {
        start_connection();
    }
    // Do it _after_ state assignment because set_timer() can actually trigger
    // other timers and requests (reenterability is our friend)
    if (request_timeout > 0)
    {
        timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
        {
            stackin();
            if (state == HTTP_CO_REQUEST_SENT)
            {
                // In case of high CPU load, we may not handle etcd responses in time
                // For this case, first check the socket and only then terminate request with the timeout
                submit_read(true);
            }
            else
            {
                close_connection();
                parsed = { .error = "HTTP request timed out" };
                run_cb_and_clear();
            }
            stackout();
        });
    }
    stackout();
}

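The timeout timer is now armed only after the state transition is committed, because `set_timer()` may synchronously fire other callbacks that re-enter this object. A minimal sketch of that ordering rule, with hypothetical types standing in for vitastor's timerfd manager:

```
#include <cstdio>
#include <functional>
#include <vector>

// Sketch: commit the state transition BEFORE arming a timer whose
// registration may synchronously run other callbacks, so every callback
// observes a consistent state.
enum op_state_t { OP_IDLE, OP_IN_PROGRESS };

struct op_t
{
    op_state_t state = OP_IDLE;
    std::vector<std::function<void()>> timers; // stand-in for a timer manager

    void set_timer(std::function<void()> cb)
    {
        // A real manager may fire due timers right here, which is why
        // arming must happen after the state assignment below
        timers.push_back(cb);
    }

    void start()
    {
        state = OP_IN_PROGRESS; // 1. commit state first
        set_timer([this]()      // 2. only then arm the timeout
        {
            if (state == OP_IN_PROGRESS)
                printf("timed out in a consistent state\n");
        });
    }
};

int main()
{
    op_t op;
    op.start();
    for (auto & t: op.timers)
        t(); // simulate timer expiry
}
```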
@@ -274,6 +287,7 @@ void http_co_t::start_connection()
    struct sockaddr_storage addr;
    if (!string_to_addr(host.c_str(), 1, 80, &addr))
    {
        close_connection();
        parsed = { .error = "Invalid address: "+host };
        run_cb_and_clear();
        stackout();
@@ -282,6 +296,7 @@ void http_co_t::start_connection()
    peer_fd = socket(addr.ss_family, SOCK_STREAM, 0);
    if (peer_fd < 0)
    {
        close_connection();
        parsed = { .error = std::string("socket: ")+strerror(errno) };
        run_cb_and_clear();
        stackout();
@@ -323,10 +338,12 @@ void http_co_t::handle_events()
        epoll_events &= ~EPOLLOUT;
        if (epoll_events & EPOLLIN)
        {
            submit_read();
            submit_read(false);
        }
        else if (epoll_events & (EPOLLRDHUP|EPOLLERR))
        {
            if (state == HTTP_CO_HEADERS_RECEIVED)
                std::swap(parsed.body, response);
            close_connection();
            run_cb_and_clear();
            break;
@@ -410,10 +427,11 @@ again:
    stackout();
}

void http_co_t::submit_read()
void http_co_t::submit_read(bool check_timeout)
{
    stackin();
    int res;
again:
    if (rbuf.size() != READ_BUFFER_SIZE)
    {
        rbuf.resize(READ_BUFFER_SIZE);
@@ -428,12 +446,29 @@ void http_co_t::submit_read()
    }
    if (res == -EAGAIN || res == -EINTR)
    {
        epoll_events = epoll_events & ~EPOLLIN;
        if (check_timeout)
        {
            if (res == -EINTR)
                goto again;
            else
            {
                // Timeout happened and there is no data to read
                close_connection();
                parsed = { .error = "HTTP request timed out" };
                run_cb_and_clear();
            }
        }
        else
        {
            epoll_events = epoll_events & ~EPOLLIN;
        }
    }
    else if (res <= 0)
    {
        // < 0 means error, 0 means EOF
        epoll_events = epoll_events & ~EPOLLIN;
        if (state == HTTP_CO_HEADERS_RECEIVED)
            std::swap(parsed.body, response);
        close_connection();
        if (res < 0)
            parsed = { .error = std::string("recvmsg: ")+strerror(-res) };
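`submit_read(true)` implements a second chance for slow event loops: when the timeout fires while a request is in flight, the socket is drained first, and the request is only failed if there is truly nothing to read. A standalone sketch of the same pattern using plain non-blocking `recv()`; `handle_data` is a hypothetical callback:

```
#include <cerrno>
#include <cstdio>
#include <sys/socket.h>

// When a timeout fires, try a non-blocking read first; only report a
// timeout if the socket really has nothing for us (EAGAIN). Under high
// CPU load the response may already sit in the kernel buffer.
static bool read_or_timeout(int fd, void (*handle_data)(char*, ssize_t))
{
    char buf[8192];
    ssize_t res;
    do
    {
        res = recv(fd, buf, sizeof(buf), MSG_DONTWAIT);
    } while (res < 0 && errno == EINTR); // retry interrupted reads
    if (res > 0)
    {
        handle_data(buf, res); // data arrived late, but it is not a timeout
        return true;
    }
    if (res < 0 && errno == EAGAIN)
        fprintf(stderr, "request really timed out\n");
    return false;
}
```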
@@ -501,8 +536,11 @@ bool http_co_t::handle_read()
    if (state == HTTP_CO_HEADERS_RECEIVED && target_response_size > 0 && response.size() >= target_response_size)
    {
        std::swap(parsed.body, response);
        response_callback(&parsed);
        parsed.eof = true;
        if (!keepalive)
            close_connection();
        else
            state = HTTP_CO_KEEPALIVE;
        run_cb_and_clear();
    }
    else if (state == HTTP_CO_CHUNKED && response.size() > 0)
    {
@@ -533,10 +571,14 @@ bool http_co_t::handle_read()
            response_callback(&parsed);
            parsed.body = "";
        }
        if (parsed.eof && !want_streaming)
        else if (parsed.eof)
        {
            // Normal response
            response_callback(&parsed);
            if (!keepalive)
                close_connection();
            else
                state = HTTP_CO_KEEPALIVE;
            run_cb_and_clear();
        }
    }
    else if (state == HTTP_CO_WEBSOCKET && response.size() > 0)
@@ -547,29 +589,20 @@ bool http_co_t::handle_read()
        parsed.body = "";
    }
}
if (parsed.eof)
{
    response_callback = NULL;
    parsed = {};
    if (!keepalive)
    {
        close_connection();
    }
    else
    {
        state = HTTP_CO_KEEPALIVE;
        if (keepalive_queue.size() > 0)
        {
            auto next = keepalive_queue[0];
            keepalive_queue.erase(keepalive_queue.begin(), keepalive_queue.begin()+1);
            next();
        }
    }
}
stackout();
return true;
}

void http_co_t::next_request()
{
    if (keepalive_queue.size() > 0)
    {
        auto next = keepalive_queue[0];
        keepalive_queue.erase(keepalive_queue.begin(), keepalive_queue.begin()+1);
        next();
    }
}

uint64_t stoull_full(const std::string & str, int base)
{
    if (isspace(str[0]))
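Factoring the queue pop into `next_request()` means a queued keepalive request is kicked off from one place, after every completed response, instead of only from the `parsed.eof` path. A self-contained sketch of that FIFO scheduling with hypothetical types:

```
#include <functional>
#include <vector>
#include <cstdio>

// Requests submitted while the connection is busy wait in a FIFO and the
// next one is started exactly once, whenever a response completes.
struct keepalive_conn_t
{
    bool busy = false;
    std::vector<std::function<void()>> queue;

    void submit(std::function<void()> req)
    {
        if (busy)
            queue.push_back(req); // wait for the current request
        else
        {
            busy = true;
            req();
        }
    }

    // Called from exactly one place after every completed response,
    // which is what factoring out next_request() guarantees
    void next_request()
    {
        busy = false;
        if (queue.size() > 0)
        {
            auto next = queue[0];
            queue.erase(queue.begin());
            busy = true;
            next();
        }
    }
};

int main()
{
    keepalive_conn_t conn;
    conn.submit([]{ printf("request 1\n"); });
    conn.submit([]{ printf("request 2 (queued)\n"); });
    conn.next_request(); // response 1 arrived, so request 2 runs
}
```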
src/nbd_proxy.cpp

@@ -54,6 +54,8 @@ protected:
    msghdr read_msg = { 0 }, send_msg = { 0 };
    iovec read_iov = { 0 };

    std::string logfile = "/dev/null";

public:
    ~nbd_proxy()
    {
@@ -187,7 +189,7 @@ public:
        uint64_t pool = cfg["pool"].uint64_value();
        if (pool)
        {
            inode = (inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (pool << (64-POOL_ID_BITS));
            inode = (inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (pool << (64-POOL_ID_BITS));
        }
        if (!(inode >> (64-POOL_ID_BITS)))
        {
@@ -278,6 +280,10 @@ public:
                }
            }
        }
        if (cfg["logfile"].is_string())
        {
            logfile = cfg["logfile"].string_value();
        }
        if (bg)
        {
            daemonize();
@@ -363,14 +369,14 @@ public:
        setsid();
        if (fork())
            exit(0);
        if (chdir("/") != 0)
            fprintf(stderr, "Warning: Failed to chdir into /\n");
        close(0);
        close(1);
        close(2);
        open("/dev/null", O_RDONLY);
        open("/dev/null", O_WRONLY);
        open("/dev/null", O_WRONLY);
        open(logfile.c_str(), O_WRONLY|O_APPEND|O_CREAT, 0666);
        open(logfile.c_str(), O_WRONLY|O_APPEND|O_CREAT, 0666);
        if (chdir("/") != 0)
            fprintf(stderr, "Warning: Failed to chdir into /\n");
    }

    json11::Json::object list_mapped()
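The reordering in `daemonize()` matters twice: the log file must be opened before `chdir("/")` so a relative `logfile` path still resolves, and the two consecutive `open()` calls land on fds 1 and 2 (stdout and stderr) because `open()` returns the lowest free descriptor. A sketch of the same sequence in isolation:

```
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>

// Detach from the terminal while keeping stderr/stdout in a log file.
static void daemonize_with_log(const char *logfile)
{
    setsid();
    if (fork())
        _exit(0);
    close(0);
    close(1);
    close(2);
    open("/dev/null", O_RDONLY);                    // fd 0
    open(logfile, O_WRONLY|O_APPEND|O_CREAT, 0666); // fd 1 = stdout
    open(logfile, O_WRONLY|O_APPEND|O_CREAT, 0666); // fd 2 = stderr
    if (chdir("/") != 0)                            // after open(): relative log paths still worked
        fprintf(stderr, "Warning: Failed to chdir into /\n");
}
```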
@@ -684,6 +690,7 @@ protected:
        {
            assert(result <= cur_left);
            cur_left -= result;
            cur_buf = (uint8_t*)cur_buf + result;
            result = 0;
        }
        if (cur_left <= 0)
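The added line fixes partial reads: consuming `result` bytes must advance the destination pointer together with the remaining-byte counter, otherwise the next `read()` overwrites what was already received. A minimal sketch of the corrected loop:

```
#include <unistd.h>
#include <cstdint>

// Read exactly `left` bytes into buf, tolerating short reads.
static bool read_exactly(int fd, void *buf, ssize_t left)
{
    uint8_t *cur = (uint8_t*)buf;
    while (left > 0)
    {
        ssize_t result = read(fd, cur, left);
        if (result <= 0)
            return false;
        left -= result;
        cur += result; // the fix: advance the pointer along with the counter
    }
    return true;
}
```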
@@ -698,6 +705,12 @@ protected:
        if (read_state == CL_READ_HDR)
        {
            int req_type = be32toh(cur_req.type);
            if (be32toh(cur_req.magic) == NBD_REQUEST_MAGIC && req_type == NBD_CMD_DISC)
            {
                // Disconnect
                close(nbd_fd);
                exit(0);
            }
            if (be32toh(cur_req.magic) != NBD_REQUEST_MAGIC ||
                req_type != NBD_CMD_READ && req_type != NBD_CMD_WRITE && req_type != NBD_CMD_FLUSH)
            {
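The new branch recognizes `NBD_CMD_DISC` and shuts down cleanly instead of treating the kernel's disconnect request as an unknown command. A standalone sketch of the header classification; the struct mirrors the NBD wire format (cf. `<linux/nbd.h>`), all fields big-endian:

```
#include <endian.h>
#include <cstdint>

#define NBD_REQUEST_MAGIC 0x25609513
enum { NBD_CMD_READ = 0, NBD_CMD_WRITE = 1, NBD_CMD_DISC = 2, NBD_CMD_FLUSH = 3 };

struct __attribute__((packed)) nbd_request_hdr
{
    uint32_t magic;
    uint32_t type;
    uint8_t handle[8];
    uint64_t from;
    uint32_t len;
};

// Returns: -1 = protocol error, 0 = client asked to disconnect, 1 = I/O command
static int classify_nbd_request(const nbd_request_hdr & req)
{
    if (be32toh(req.magic) != NBD_REQUEST_MAGIC)
        return -1;
    uint32_t type = be32toh(req.type);
    if (type == NBD_CMD_DISC)
        return 0; // clean shutdown instead of a "bad command" error
    if (type == NBD_CMD_READ || type == NBD_CMD_WRITE || type == NBD_CMD_FLUSH)
        return 1;
    return -1;
}
```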
1690	src/nfs/nfs.h	Normal file (diff suppressed because it is too large)
1380	src/nfs/nfs.x	Normal file (diff suppressed because it is too large)
Some files were not shown because too many files have changed in this diff