forked from vitalif/vitastor
Compare commits
1 Commits
zerocopy-t
...
non-odp-rd
Author | SHA1 | Date | |
---|---|---|---|
043ed854f3 |
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8)
|
||||
|
||||
project(vitastor)
|
||||
|
||||
set(VERSION "0.6.17")
|
||||
set(VERSION "0.6.12")
|
||||
|
||||
add_subdirectory(src)
|
||||
|
54
README-ru.md
54
README-ru.md
@@ -52,7 +52,6 @@ Vitastor на данный момент находится в статусе п
|
||||
- Слияние снапшотов (vitastor-cli {snap-rm,flatten,merge})
|
||||
- Консольный интерфейс для управления образами (vitastor-cli {ls,create,modify})
|
||||
- Плагин для Proxmox
|
||||
- Упрощённая NFS-прокси для эмуляции файлового доступа к образам (подходит для VMWare)
|
||||
|
||||
## Планы развития
|
||||
|
||||
@@ -60,6 +59,7 @@ Vitastor на данный момент находится в статусе п
|
||||
- Другие инструменты администрирования
|
||||
- Плагины для OpenNebula и других облачных систем
|
||||
- iSCSI-прокси
|
||||
- Упрощённый NFS прокси
|
||||
- Более быстрое переключение при отказах
|
||||
- Фоновая проверка целостности без контрольных сумм (сверка реплик)
|
||||
- Контрольные суммы
|
||||
@@ -407,7 +407,6 @@ Vitastor с однопоточной NBD прокси на том же стен
|
||||
- На хостах мониторов:
|
||||
- Пропишите нужные вам значения в файле `/usr/lib/vitastor/mon/make-units.sh`
|
||||
- Создайте юниты systemd для etcd и мониторов: `/usr/lib/vitastor/mon/make-units.sh`
|
||||
- Запустите etcd и мониторы: `systemctl start etcd vitastor-mon`
|
||||
- Пропишите etcd_address и osd_network в `/etc/vitastor/vitastor.conf`. Например:
|
||||
```
|
||||
{
|
||||
@@ -415,14 +414,7 @@ Vitastor с однопоточной NBD прокси на том же стен
|
||||
"osd_network": "10.200.1.0/24"
|
||||
}
|
||||
```
|
||||
- Инициализуйте OSD:
|
||||
- SSD: `/usr/lib/vitastor/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
|
||||
- Гибридные, HDD+SSD: `/usr/lib/vitastor/mon/make-osd-hybrid.js /dev/sda /dev/sdb ...` - передайте
|
||||
все ваши SSD и HDD скрипту в командной строке подряд, скрипт автоматически выделит разделы под
|
||||
журналы на SSD и данные на HDD. Скрипт пропускает HDD, на которых уже есть разделы
|
||||
или вообще какие-то данные, поэтому если диски непустые, сначала очистите их с помощью
|
||||
`wipefs -a`. SSD с таблицей разделов не пропускаются, но так как скрипт создаёт новые разделы
|
||||
для журналов, на SSD должно быть доступно свободное нераспределённое место.
|
||||
- Создайте юниты systemd для OSD: `/usr/lib/vitastor/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
|
||||
- Вы можете менять параметры OSD в юнитах systemd или в `vitastor.conf`. Смысл некоторых параметров:
|
||||
- `disable_data_fsync 1` - отключает fsync, используется с SSD с конденсаторами.
|
||||
- `immediate_commit all` - используется с SSD с конденсаторами.
|
||||
@@ -438,6 +430,7 @@ Vitastor с однопоточной NBD прокси на том же стен
|
||||
диски, используемые на одном из тестовых стендов - Intel D3-S4510 - очень сильно не любят такую
|
||||
перезапись, и для них была добавлена эта опция. Когда данный режим включён, также нужно поднимать
|
||||
значение `journal_sector_buffer_count`, так как иначе Vitastor не хватит буферов для записи в журнал.
|
||||
- Запустите все etcd: `systemctl start etcd`
|
||||
- Создайте глобальную конфигурацию в etcd: `etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`
|
||||
(если все ваши диски - серверные с конденсаторами).
|
||||
- Создайте пулы: `etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'`.
|
||||
@@ -530,48 +523,9 @@ vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
|
||||
Для обращения по номеру инода, аналогично другим командам, можно использовать опции
|
||||
`--pool <POOL> --inode <INODE> --size <SIZE>` вместо `--image testimg`.
|
||||
|
||||
### NFS
|
||||
|
||||
В Vitastor реализована упрощённая NFS 3.0 прокси для эмуляции файлового доступа к образам.
|
||||
Это не полноценная файловая система, т.к. метаданные всех файлов (образов) сохраняются
|
||||
в etcd и всё время хранятся в оперативной памяти - то есть, положить туда много файлов
|
||||
не получится.
|
||||
|
||||
Однако в качестве способа доступа к образам виртуальных машин NFS прокси прекрасно подходит
|
||||
и позволяет подключить Vitastor, например, к VMWare.
|
||||
|
||||
При этом, если вы используете режим immediate_commit=all (для SSD с конденсаторами или HDD
|
||||
с отключённым кэшем), то NFS-сервер не имеет состояния и вы можете свободно поднять
|
||||
его в нескольких экземплярах и использовать поверх них сетевой балансировщик нагрузки или
|
||||
схему с отказоустойчивостью.
|
||||
|
||||
Использование vitastor-nfs:
|
||||
|
||||
```
|
||||
vitastor-nfs [--etcd_address ADDR] [ДРУГИЕ ОПЦИИ]
|
||||
|
||||
--subdir <DIR> экспортировать "поддиректорию" - образы с префиксом имени <DIR>/ (по умолчанию пусто - экспортировать все образы)
|
||||
--portmap 0 отключить сервис portmap/rpcbind на порту 111 (по умолчанию включён и требует root привилегий)
|
||||
--bind <IP> принимать соединения по адресу <IP> (по умолчанию 0.0.0.0 - на всех)
|
||||
--nfspath <PATH> установить путь NFS-экспорта в <PATH> (по умолчанию /)
|
||||
--port <PORT> использовать порт <PORT> для NFS-сервисов (по умолчанию 2049)
|
||||
--pool <POOL> исползовать пул <POOL> для новых образов (обязательно, если пул в кластере не один)
|
||||
--foreground 1 не уходить в фон после запуска
|
||||
```
|
||||
|
||||
Пример монтирования Vitastor через NFS:
|
||||
|
||||
```
|
||||
vitastor-nfs --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool
|
||||
```
|
||||
|
||||
```
|
||||
mount localhost:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
```
|
||||
|
||||
### Kubernetes
|
||||
|
||||
У Vitastor есть CSI-плагин для Kubernetes, поддерживающий RWO, а также блочные RWX, тома.
|
||||
У Vitastor есть CSI-плагин для Kubernetes, поддерживающий RWO-тома.
|
||||
|
||||
Для установки возьмите манифесты из директории [csi/deploy/](csi/deploy/), поместите
|
||||
вашу конфигурацию подключения к Vitastor в [csi/deploy/001-csi-config-map.yaml](001-csi-config-map.yaml),
|
||||
|
53
README.md
53
README.md
@@ -46,7 +46,6 @@ breaking changes in the future. However, the following is implemented:
|
||||
- Snapshot merge tool (vitastor-cli {snap-rm,flatten,merge})
|
||||
- Image management CLI (vitastor-cli {ls,create,modify})
|
||||
- Proxmox storage plugin
|
||||
- Simplified NFS proxy for file-based image access emulation (suitable for VMWare)
|
||||
|
||||
## Roadmap
|
||||
|
||||
@@ -54,6 +53,7 @@ breaking changes in the future. However, the following is implemented:
|
||||
- Other administrative tools
|
||||
- Plugins for OpenNebula and other cloud systems
|
||||
- iSCSI proxy
|
||||
- Simplified NFS proxy
|
||||
- Faster failover
|
||||
- Scrubbing without checksums (verification of replicas)
|
||||
- Checksums
|
||||
@@ -360,7 +360,6 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
|
||||
- On the monitor hosts:
|
||||
- Edit variables at the top of `/usr/lib/vitastor/mon/make-units.sh` to desired values.
|
||||
- Create systemd units for the monitor and etcd: `/usr/lib/vitastor/mon/make-units.sh`
|
||||
- Start etcd and monitors: `systemctl start etcd vitastor-mon`
|
||||
- Put etcd_address and osd_network into `/etc/vitastor/vitastor.conf`. Example:
|
||||
```
|
||||
{
|
||||
@@ -368,13 +367,7 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
|
||||
"osd_network": "10.200.1.0/24"
|
||||
}
|
||||
```
|
||||
- Initialize OSDs:
|
||||
- Simplest, SSD-only: `/usr/lib/vitastor/mon/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
|
||||
- Hybrid, HDD+SSD: `/usr/lib/vitastor/mon/make-osd-hybrid.js /dev/sda /dev/sdb ...` - pass all your
|
||||
devices (HDD and SSD) to this script - it will partition disks and initialize journals on its own.
|
||||
This script skips HDDs which are already partitioned so if you want to use non-empty disks for
|
||||
Vitastor you should first wipe them with `wipefs -a`. SSDs with GPT partition table are not skipped,
|
||||
but some free unpartitioned space must be available because the script creates new partitions for journals.
|
||||
- Create systemd units for your OSDs: `/usr/lib/vitastor/mon/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
|
||||
- You can change OSD configuration in units or in `vitastor.conf`. Notable configuration variables:
|
||||
- `disable_data_fsync 1` - only safe with server-grade drives with capacitors.
|
||||
- `immediate_commit all` - use this if all your drives are server-grade.
|
||||
@@ -479,49 +472,9 @@ It will output the device name, like /dev/nbd0 which you can then format and mou
|
||||
|
||||
Again, you can use `--pool <POOL> --inode <INODE> --size <SIZE>` insteaf of `--image <IMAGE>` if you want.
|
||||
|
||||
### NFS
|
||||
|
||||
Vitastor has a simplified NFS 3.0 proxy for file-based image access emulation. It's not
|
||||
suitable as a full-featured file system, at least because all file/image metadata is stored
|
||||
in etcd and kept in memory all the time - thus you can't put a lot of files in it.
|
||||
|
||||
However, NFS proxy is totally fine as a method to provide VM image access and allows to
|
||||
plug Vitastor into, for example, VMWare. It's important to note that for VMWare it's a much
|
||||
better access method than iSCSI, because with iSCSI we'd have to put all VM images into one
|
||||
Vitastor image exported as a LUN to VMWare and formatted with VMFS. VMWare doesn't use VMFS
|
||||
over NFS.
|
||||
|
||||
NFS proxy is stateless if you use immediate_commit=all mode (for SSD with capacitors or
|
||||
HDDs with disabled cache), so you can run multiple NFS proxies and use a network load
|
||||
balancer or any failover method you want to in that case.
|
||||
|
||||
vitastor-nfs usage:
|
||||
|
||||
```
|
||||
vitastor-nfs [--etcd_address ADDR] [OTHER OPTIONS]
|
||||
|
||||
--subdir <DIR> export images prefixed <DIR>/ (default empty - export all images)
|
||||
--portmap 0 do not listen on port 111 (portmap/rpcbind, requires root)
|
||||
--bind <IP> bind service to <IP> address (default 0.0.0.0)
|
||||
--nfspath <PATH> set NFS export path to <PATH> (default is /)
|
||||
--port <PORT> use port <PORT> for NFS services (default is 2049)
|
||||
--pool <POOL> use <POOL> as default pool for new files (images)
|
||||
--foreground 1 stay in foreground, do not daemonize
|
||||
```
|
||||
|
||||
Example start and mount commands:
|
||||
|
||||
```
|
||||
vitastor-nfs --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool
|
||||
```
|
||||
|
||||
```
|
||||
mount localhost:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
```
|
||||
|
||||
### Kubernetes
|
||||
|
||||
Vitastor has a CSI plugin for Kubernetes which supports RWO (and block RWX) volumes.
|
||||
Vitastor has a CSI plugin for Kubernetes which supports RWO volumes.
|
||||
|
||||
To deploy it, take manifests from [csi/deploy/](csi/deploy/) directory, put your
|
||||
Vitastor configuration in [csi/deploy/001-csi-config-map.yaml](001-csi-config-map.yaml),
|
||||
|
Submodule cpp-btree updated: 45e6d1f131...6e20146406
@@ -1,4 +1,4 @@
|
||||
VERSION ?= v0.6.17
|
||||
VERSION ?= v0.6.12
|
||||
|
||||
all: build push
|
||||
|
||||
|
@@ -49,7 +49,7 @@ spec:
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
allowPrivilegeEscalation: true
|
||||
image: vitalif/vitastor-csi:v0.6.17
|
||||
image: vitalif/vitastor-csi:v0.6.12
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -116,7 +116,7 @@ spec:
|
||||
privileged: true
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
image: vitalif/vitastor-csi:v0.6.17
|
||||
image: vitalif/vitastor-csi:v0.6.12
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -1,13 +0,0 @@
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: test-vitastor-pvc-block
|
||||
spec:
|
||||
storageClassName: vitastor
|
||||
volumeMode: Block
|
||||
accessModes:
|
||||
- ReadWriteMany
|
||||
resources:
|
||||
requests:
|
||||
storage: 10Gi
|
@@ -1,17 +0,0 @@
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: vitastor-test-block-pvc
|
||||
namespace: default
|
||||
spec:
|
||||
containers:
|
||||
- name: vitastor-test-block-pvc
|
||||
image: nginx
|
||||
volumeDevices:
|
||||
- name: data
|
||||
devicePath: /dev/xvda
|
||||
volumes:
|
||||
- name: data
|
||||
persistentVolumeClaim:
|
||||
claimName: test-vitastor-pvc-block
|
||||
readOnly: false
|
@@ -1,17 +0,0 @@
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: vitastor-test-nginx
|
||||
namespace: default
|
||||
spec:
|
||||
containers:
|
||||
- name: vitastor-test-nginx
|
||||
image: nginx
|
||||
volumeMounts:
|
||||
- mountPath: /usr/share/nginx/html/s3
|
||||
name: data
|
||||
volumes:
|
||||
- name: data
|
||||
persistentVolumeClaim:
|
||||
claimName: test-vitastor-pvc
|
||||
readOnly: false
|
@@ -5,7 +5,7 @@ package vitastor
|
||||
|
||||
const (
|
||||
vitastorCSIDriverName = "csi.vitastor.io"
|
||||
vitastorCSIDriverVersion = "0.6.17"
|
||||
vitastorCSIDriverVersion = "0.6.12"
|
||||
)
|
||||
|
||||
// Config struct fills the parameters of request or user input
|
||||
|
@@ -67,44 +67,29 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
|
||||
klog.Infof("received node publish volume request %+v", protosanitizer.StripSecrets(req))
|
||||
|
||||
targetPath := req.GetTargetPath()
|
||||
isBlock := req.GetVolumeCapability().GetBlock() != nil
|
||||
|
||||
// Check that it's not already mounted
|
||||
_, error := mount.IsNotMountPoint(ns.mounter, targetPath)
|
||||
free, error := mount.IsNotMountPoint(ns.mounter, targetPath)
|
||||
if (error != nil)
|
||||
{
|
||||
if (os.IsNotExist(error))
|
||||
{
|
||||
if (isBlock)
|
||||
error := os.MkdirAll(targetPath, 0777)
|
||||
if (error != nil)
|
||||
{
|
||||
pathFile, err := os.OpenFile(targetPath, os.O_CREATE|os.O_RDWR, 0o600)
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Errorf("failed to create block device mount target %s with error: %v", targetPath, err)
|
||||
return nil, status.Error(codes.Internal, err.Error())
|
||||
}
|
||||
err = pathFile.Close()
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Errorf("failed to close %s with error: %v", targetPath, err)
|
||||
return nil, status.Error(codes.Internal, err.Error())
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
err := os.MkdirAll(targetPath, 0777)
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Errorf("failed to create fs mount target %s with error: %v", targetPath, err)
|
||||
return nil, status.Error(codes.Internal, err.Error())
|
||||
}
|
||||
return nil, status.Error(codes.Internal, error.Error())
|
||||
}
|
||||
free = true
|
||||
}
|
||||
else
|
||||
{
|
||||
return nil, status.Error(codes.Internal, error.Error())
|
||||
}
|
||||
}
|
||||
if (!free)
|
||||
{
|
||||
return &csi.NodePublishVolumeResponse{}, nil
|
||||
}
|
||||
|
||||
ctxVars := make(map[string]string)
|
||||
err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
|
||||
@@ -164,6 +149,7 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
|
||||
|
||||
// Format the device (ext4 or xfs)
|
||||
fsType := req.GetVolumeCapability().GetMount().GetFsType()
|
||||
isBlock := req.GetVolumeCapability().GetBlock() != nil
|
||||
opt := req.GetVolumeCapability().GetMount().GetMountFlags()
|
||||
opt = append(opt, "_netdev")
|
||||
if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
|
||||
|
2
debian/changelog
vendored
2
debian/changelog
vendored
@@ -1,4 +1,4 @@
|
||||
vitastor (0.6.17-1) unstable; urgency=medium
|
||||
vitastor (0.6.12-1) unstable; urgency=medium
|
||||
|
||||
* RDMA support
|
||||
* Bugfixes
|
||||
|
1
debian/vitastor-client.install
vendored
1
debian/vitastor-client.install
vendored
@@ -2,6 +2,5 @@ usr/bin/vita
|
||||
usr/bin/vitastor-cli
|
||||
usr/bin/vitastor-rm
|
||||
usr/bin/vitastor-nbd
|
||||
usr/bin/vitastor-nfs
|
||||
usr/lib/*/libvitastor*.so*
|
||||
mon/make-osd.sh /usr/lib/vitastor
|
||||
|
8
debian/vitastor.Dockerfile
vendored
8
debian/vitastor.Dockerfile
vendored
@@ -33,8 +33,8 @@ RUN set -e -x; \
|
||||
mkdir -p /root/packages/vitastor-$REL; \
|
||||
rm -rf /root/packages/vitastor-$REL/*; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
cp -r /root/vitastor vitastor-0.6.17; \
|
||||
cd vitastor-0.6.17; \
|
||||
cp -r /root/vitastor vitastor-0.6.12; \
|
||||
cd vitastor-0.6.12; \
|
||||
ln -s /root/fio-build/fio-*/ ./fio; \
|
||||
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
||||
@@ -47,8 +47,8 @@ RUN set -e -x; \
|
||||
rm -rf a b; \
|
||||
echo "dep:fio=$FIO" > debian/fio_version; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.17.orig.tar.xz vitastor-0.6.17; \
|
||||
cd vitastor-0.6.17; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.12.orig.tar.xz vitastor-0.6.12; \
|
||||
cd vitastor-0.6.12; \
|
||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||
|
@@ -30,18 +30,6 @@
|
||||
будут использоваться обычные синхронные системные вызовы send/recv. Для OSD
|
||||
это бессмысленно, так как OSD в любом случае нуждается в io_uring, но, в
|
||||
принципе, это может применяться для клиентов со старыми версиями ядра.
|
||||
- name: use_zerocopy_send
|
||||
type: bool
|
||||
default: false
|
||||
info: |
|
||||
If true, OSDs and clients will attempt to use TCP zero-copy send
|
||||
(MSG_ZEROCOPY) for big buffers. It's recommended to raise net.ipv4.tcp_wmem
|
||||
and net.core.wmem_max sysctls when using this mode.
|
||||
info_ru: |
|
||||
Если установлено в true, то OSD и клиенты будут стараться использовать
|
||||
TCP-отправку без копирования (MSG_ZEROCOPY) для больших буферов данных.
|
||||
Рекомендуется поднять значения sysctl net.ipv4.tcp_wmem и net.core.wmem_max
|
||||
при использовании этого режима.
|
||||
- name: use_rdma
|
||||
type: bool
|
||||
default: true
|
||||
@@ -60,19 +48,28 @@
|
||||
type: string
|
||||
info: |
|
||||
RDMA device name to use for Vitastor OSD communications (for example,
|
||||
"rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
|
||||
Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
|
||||
to work. For example, Mellanox ConnectX-3 and older adapters don't have
|
||||
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
|
||||
root to list available RDMA devices and their features.
|
||||
"rocep5s0f0"). Please note that if your RDMA device doesn't support
|
||||
Implicit ODP (Implicit On-Demand Paging) then all Vitastor OSDs and clients
|
||||
will have to use mlockall() to lock all application memory to use RDMA.
|
||||
In case of the native Vitastor QEMU driver with RDMA, all virtual machine
|
||||
memory will be locked if your RDMA device doesn't support Implicit ODP.
|
||||
|
||||
Notably, Mellanox ConnectX-3 and older adapters don't support Implicit ODP,
|
||||
while ConnectX-4 and newer do. Run `ibv_devinfo -v` as root to list
|
||||
available RDMA devices and their features.
|
||||
info_ru: |
|
||||
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
|
||||
Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
|
||||
Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
|
||||
адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
|
||||
потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
|
||||
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
|
||||
параметры и возможности.
|
||||
Имейте в виду, что если ваше устройство не поддерживает Implicit ODP
|
||||
(Implicit On-Demand Paging), то все OSD и клиенты Vitastor будут вынуждены
|
||||
блокировать всю память приложения с помощью mlockall(), чтобы задействовать
|
||||
RDMA. В случае нативного QEMU-драйвера это будет означать, что при
|
||||
использовании RDMA на устройстве без поддержки Implicit ODP блокироваться
|
||||
от выгрузки будет вся память виртуальных машин.
|
||||
|
||||
В случае с адаптерами Mellanox Implicit ODP поддерживается начиная с
|
||||
ConnectX-4. ConnectX-3 и более старые адаптеры не поддерживают Implicit ODP.
|
||||
Чтобы посмотреть список своих RDMA-устройств и их возможностей, запустите
|
||||
`ibv_devinfo -v` от имени суперпользователя.
|
||||
- name: rdma_port_num
|
||||
type: int
|
||||
default: 1
|
||||
|
@@ -1,414 +0,0 @@
|
||||
#!/usr/bin/nodejs
|
||||
// systemd unit generator for hybrid (HDD+SSD) vitastor OSDs
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1
|
||||
|
||||
// USAGE: nodejs make-osd-hybrid.js [--disable_ssd_cache 0] [--disable_hdd_cache 0] /dev/sda /dev/sdb /dev/sdc /dev/sdd ...
|
||||
// I.e. - just pass all HDDs and SSDs mixed, the script will decide where
|
||||
// to put journals on its own
|
||||
|
||||
const fs = require('fs');
|
||||
const fsp = fs.promises;
|
||||
const child_process = require('child_process');
|
||||
|
||||
const options = {
|
||||
debug: 1,
|
||||
journal_size: 1024*1024*1024,
|
||||
min_meta_size: 1024*1024*1024,
|
||||
object_size: 1024*1024,
|
||||
bitmap_granularity: 4096,
|
||||
device_block_size: 4096,
|
||||
disable_ssd_cache: 1,
|
||||
disable_hdd_cache: 1,
|
||||
};
|
||||
|
||||
run().catch(console.fatal);
|
||||
|
||||
async function run()
|
||||
{
|
||||
const device_list = parse_options();
|
||||
await system_or_die("mkdir -p /var/log/vitastor; chown vitastor /var/log/vitastor");
|
||||
// Collect devices
|
||||
const all_devices = await collect_devices(device_list);
|
||||
const ssds = all_devices.filter(d => d.ssd);
|
||||
const hdds = all_devices.filter(d => !d.ssd);
|
||||
// Collect existing OSD units
|
||||
const osd_units = await collect_osd_units();
|
||||
// Count assigned HDD journals and unallocated space for each SSD
|
||||
await check_journal_count(ssds, osd_units);
|
||||
// Create new OSDs
|
||||
await create_new_hybrid_osds(hdds, ssds, osd_units);
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
function parse_options()
|
||||
{
|
||||
const devices = [];
|
||||
const opt = {};
|
||||
for (let i = 2; i < process.argv.length; i++)
|
||||
{
|
||||
const arg = process.argv[i];
|
||||
if (arg == '--help' || arg == '-h')
|
||||
{
|
||||
opt.help = true;
|
||||
break;
|
||||
}
|
||||
else if (arg.substr(0, 2) == '--')
|
||||
opt[arg.substr(2)] = process.argv[++i];
|
||||
else
|
||||
devices.push(arg);
|
||||
}
|
||||
if (opt.help || !devices.length)
|
||||
{
|
||||
console.log(
|
||||
'Prepare hybrid (HDD+SSD) Vitastor OSDs\n'+
|
||||
'(c) Vitaliy Filippov, 2019+, license: VNPL-1.1\n\n'+
|
||||
'USAGE: nodejs make-osd-hybrid.js [OPTIONS] /dev/sda /dev/sdb /dev/sdc ...\n'+
|
||||
'Just pass all your SSDs and HDDs in any order, the script will distribute OSDs for you.\n\n'+
|
||||
'OPTIONS (with defaults):\n'+
|
||||
Object.keys(options).map(k => ` --${k} ${options[k]}`).join('\n')
|
||||
);
|
||||
process.exit(0);
|
||||
}
|
||||
for (const k in opt)
|
||||
options[k] = opt[k];
|
||||
return devices;
|
||||
}
|
||||
|
||||
// Collect devices
|
||||
async function collect_devices(devices_to_check)
|
||||
{
|
||||
const devices = [];
|
||||
for (const dev of devices_to_check)
|
||||
{
|
||||
if (dev.substr(0, 5) != '/dev/')
|
||||
{
|
||||
console.log(`${dev} does not start with /dev/, skipping`);
|
||||
continue;
|
||||
}
|
||||
if (!await file_exists('/sys/block/'+dev.substr(5)))
|
||||
{
|
||||
console.log(`${dev} is a partition, skipping`);
|
||||
continue;
|
||||
}
|
||||
// Check if the device is an SSD
|
||||
const rot = '/sys/block/'+dev.substr(5)+'/queue/rotational';
|
||||
if (!await file_exists(rot))
|
||||
{
|
||||
console.log(`${dev} does not have ${rot} to check whether it's an SSD, skipping`);
|
||||
continue;
|
||||
}
|
||||
const ssd = !parseInt(await fsp.readFile(rot, { encoding: 'utf-8' }));
|
||||
// Check if the device has partition table
|
||||
let [ has_partition_table, parts ] = await system(`sfdisk --dump ${dev} --json`);
|
||||
if (has_partition_table != 0)
|
||||
{
|
||||
// Check if the device has any data
|
||||
const [ has_data, out ] = await system(`blkid ${dev}`);
|
||||
if (has_data == 0)
|
||||
{
|
||||
console.log(`${dev} contains data, skipping:\n ${out.trim().replace(/\n/g, '\n ')}`);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
parts = parts ? JSON.parse(parts).partitiontable : null;
|
||||
if (parts && parts.label != 'gpt')
|
||||
{
|
||||
console.log(`${dev} contains "${parts.label}" partition table, only GPT is supported, skipping`);
|
||||
continue;
|
||||
}
|
||||
devices.push({
|
||||
path: dev,
|
||||
ssd,
|
||||
parts,
|
||||
});
|
||||
}
|
||||
return devices;
|
||||
}
|
||||
|
||||
// Collect existing OSD units
|
||||
async function collect_osd_units()
|
||||
{
|
||||
const units = [];
|
||||
for (const unit of (await system("ls /etc/systemd/system/vitastor-osd*.service"))[1].trim().split('\n'))
|
||||
{
|
||||
if (!unit)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
let cmd = /^ExecStart\s*=\s*(([^\n]*\\\n)*[^\n]*)/.exec(await fsp.readFile(unit, { encoding: 'utf-8' }));
|
||||
if (!cmd)
|
||||
{
|
||||
console.log('ExecStart= not found in '+unit+', skipping')
|
||||
continue;
|
||||
}
|
||||
let kv = {}, key;
|
||||
cmd = cmd[1].replace(/^bash\s+-c\s+'/, '')
|
||||
.replace(/>>\s*\S+2>\s*&1\s*'$/, '')
|
||||
.replace(/\s*\\\n\s*/g, ' ')
|
||||
.replace(/([^\s']+)|'([^']+)'/g, (m, m1, m2) =>
|
||||
{
|
||||
m1 = m1||m2;
|
||||
if (key == null)
|
||||
{
|
||||
if (m1.substr(0, 2) != '--')
|
||||
{
|
||||
console.log('Strange command line in '+unit+', stopping');
|
||||
process.exit(1);
|
||||
}
|
||||
key = m1.substr(2);
|
||||
}
|
||||
else
|
||||
{
|
||||
kv[key] = m1;
|
||||
key = null;
|
||||
}
|
||||
});
|
||||
units.push(kv);
|
||||
}
|
||||
return units;
|
||||
}
|
||||
|
||||
// Count assigned HDD journals and unallocated space for each SSD
|
||||
async function check_journal_count(ssds, osd_units)
|
||||
{
|
||||
const units_by_journal = osd_units.reduce((a, c) =>
|
||||
{
|
||||
if (c.journal_device)
|
||||
a[c.journal_device] = c;
|
||||
return a;
|
||||
}, {});
|
||||
for (const dev of ssds)
|
||||
{
|
||||
dev.journals = 0;
|
||||
if (dev.parts)
|
||||
{
|
||||
for (const part of dev.parts.partitions)
|
||||
{
|
||||
if (part.uuid && units_by_journal['/dev/disk/by-partuuid/'+part.uuid.toLowerCase()])
|
||||
{
|
||||
dev.journals++;
|
||||
}
|
||||
}
|
||||
dev.free = free_from_parttable(dev.parts);
|
||||
}
|
||||
else
|
||||
{
|
||||
dev.free = parseInt(await system_or_die("blockdev --getsize64 "+dev.path));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function create_new_hybrid_osds(hdds, ssds, osd_units)
|
||||
{
|
||||
const units_by_disk = osd_units.reduce((a, c) => { a[c.data_device] = c; return a; }, {});
|
||||
for (const dev of hdds)
|
||||
{
|
||||
if (!dev.parts)
|
||||
{
|
||||
// HDD is not partitioned yet, create a single partition
|
||||
// + is the "default value" for sfdisk
|
||||
await system_or_die('sfdisk '+dev.path, 'label: gpt\n\n+ +\n');
|
||||
dev.parts = JSON.parse(await system_or_die('sfdisk --dump '+dev.path+' --json')).partitiontable;
|
||||
}
|
||||
if (dev.parts.partitions.length != 1)
|
||||
{
|
||||
console.log(dev.path+' has more than 1 partition, skipping');
|
||||
}
|
||||
else if ((dev.parts.partitions[0].start + dev.parts.partitions[0].size) != (1 + dev.parts.lastlba))
|
||||
{
|
||||
console.log(dev.path+'1 is not a whole-disk partition, skipping');
|
||||
}
|
||||
else if (!dev.parts.partitions[0].uuid)
|
||||
{
|
||||
console.log(dev.parts.partitions[0].node+' does not have UUID. Please repartition '+dev.path+' with GPT');
|
||||
}
|
||||
else if (!units_by_disk['/dev/disk/by-partuuid/'+dev.parts.partitions[0].uuid.toLowerCase()])
|
||||
{
|
||||
await create_hybrid_osd(dev, ssds);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function create_hybrid_osd(dev, ssds)
|
||||
{
|
||||
// Create a new OSD
|
||||
// Calculate metadata size
|
||||
const data_device = '/dev/disk/by-partuuid/'+dev.parts.partitions[0].uuid.toLowerCase();
|
||||
const data_size = dev.parts.partitions[0].size * dev.parts.sectorsize;
|
||||
const meta_entry_size = 24 + 2*options.object_size/options.bitmap_granularity/8;
|
||||
const entries_per_block = Math.floor(options.device_block_size / meta_entry_size);
|
||||
const object_count = Math.floor(data_size / options.object_size);
|
||||
let meta_size = Math.ceil(1 + object_count / entries_per_block) * options.device_block_size;
|
||||
// Leave some extra space for future metadata formats and round metadata area size to multiples of 1 MB
|
||||
meta_size = 2*meta_size;
|
||||
meta_size = Math.ceil(meta_size/1024/1024) * 1024*1024;
|
||||
if (meta_size < options.min_meta_size)
|
||||
meta_size = options.min_meta_size;
|
||||
let journal_size = Math.ceil(options.journal_size/1024/1024) * 1024*1024;
|
||||
// Pick an SSD for journal, balancing the number of journals across SSDs
|
||||
let selected_ssd;
|
||||
for (const ssd of ssds)
|
||||
if (ssd.free >= (meta_size+journal_size) && (!selected_ssd || selected_ssd.journals > ssd.journals))
|
||||
selected_ssd = ssd;
|
||||
if (!selected_ssd)
|
||||
{
|
||||
console.error('Could not find free space for SSD journal and metadata for '+dev.path);
|
||||
process.exit(1);
|
||||
}
|
||||
// Allocate an OSD number
|
||||
const osd_num = (await system_or_die("vitastor-cli alloc-osd")).trim();
|
||||
if (!osd_num)
|
||||
{
|
||||
console.error('Failed to run vitastor-cli alloc-osd');
|
||||
process.exit(1);
|
||||
}
|
||||
console.log('Creating OSD '+osd_num+' on '+dev.path+' (HDD) with journal and metadata on '+selected_ssd.path+' (SSD)');
|
||||
// Add two partitions: journal and metadata
|
||||
const new_parts = await add_partitions(selected_ssd, [ journal_size, meta_size ]);
|
||||
selected_ssd.journals++;
|
||||
const journal_device = '/dev/disk/by-partuuid/'+new_parts[0].uuid.toLowerCase();
|
||||
const meta_device = '/dev/disk/by-partuuid/'+new_parts[1].uuid.toLowerCase();
|
||||
// Wait until the device symlinks appear
|
||||
while (!await file_exists(journal_device))
|
||||
{
|
||||
await new Promise(ok => setTimeout(ok, 100));
|
||||
}
|
||||
while (!await file_exists(meta_device))
|
||||
{
|
||||
await new Promise(ok => setTimeout(ok, 100));
|
||||
}
|
||||
// Zero out metadata and journal
|
||||
await system_or_die("dd if=/dev/zero of="+journal_device+" bs=1M count="+(journal_size/1024/1024)+" oflag=direct");
|
||||
await system_or_die("dd if=/dev/zero of="+meta_device+" bs=1M count="+(meta_size/1024/1024)+" oflag=direct");
|
||||
// Create unit file for the OSD
|
||||
const has_scsi_cache_type = options.disable_ssd_cache &&
|
||||
(await system("ls /sys/block/"+selected_ssd.path.substr(5)+"/device/scsi_disk/*/cache_type"))[0] == 0;
|
||||
const write_through = options.disable_ssd_cache && (
|
||||
has_scsi_cache_type || selected_ssd.path.substr(5, 4) == 'nvme'
|
||||
&& (await system_or_die("/sys/block/"+selected_ssd.path.substr(5)+"/queue/write_cache")).trim() == "write through");
|
||||
await fsp.writeFile('/etc/systemd/system/vitastor-osd'+osd_num+'.service',
|
||||
`[Unit]
|
||||
Description=Vitastor object storage daemon osd.${osd_num}
|
||||
After=network-online.target local-fs.target time-sync.target
|
||||
Wants=network-online.target local-fs.target time-sync.target
|
||||
PartOf=vitastor.target
|
||||
|
||||
[Service]
|
||||
LimitNOFILE=1048576
|
||||
LimitNPROC=1048576
|
||||
LimitMEMLOCK=infinity
|
||||
ExecStart=bash -c '/usr/bin/vitastor-osd \\
|
||||
--osd_num ${osd_num} ${write_through
|
||||
? "--disable_meta_fsync 1 --disable_journal_fsync 1 --immediate_commit "+(options.disable_hdd_cache ? "all" : "small")
|
||||
: ""} \\
|
||||
--throttle_small_writes 1 \\
|
||||
--disk_alignment ${options.device_block_size} \\
|
||||
--journal_block_size ${options.device_block_size} \\
|
||||
--meta_block_size ${options.device_block_size} \\
|
||||
--journal_no_same_sector_overwrites true \\
|
||||
--journal_sector_buffer_count 1024 \\
|
||||
--block_size ${options.object_size} \\
|
||||
--data_device ${data_device} \\
|
||||
--journal_device ${journal_device} \\
|
||||
--meta_device ${meta_device} >>/var/log/vitastor/osd${osd_num}.log 2>&1'
|
||||
WorkingDirectory=/
|
||||
ExecStartPre=+chown vitastor:vitastor ${data_device}
|
||||
ExecStartPre=+chown vitastor:vitastor ${journal_device}
|
||||
ExecStartPre=+chown vitastor:vitastor ${meta_device}${
|
||||
has_scsi_cache_type
|
||||
? "\nExecStartPre=+bash -c 'D=$$$(readlink "+journal_device+"); echo write through > $$$(dirname /sys/block/*/$$\${D##*/})/device/scsi_disk/*/cache_type'"
|
||||
: ""}${
|
||||
options.disable_hdd_cache
|
||||
? "\nExecStartPre=+bash -c 'D=$$$(readlink "+data_device+"); echo write through > $$$(dirname /sys/block/*/$$\${D##*/})/device/scsi_disk/*/cache_type'"
|
||||
: ""}
|
||||
User=vitastor
|
||||
PrivateTmp=false
|
||||
TasksMax=infinity
|
||||
Restart=always
|
||||
StartLimitInterval=0
|
||||
RestartSec=10
|
||||
|
||||
[Install]
|
||||
WantedBy=vitastor.target
|
||||
`);
|
||||
await system_or_die("systemctl enable vitastor-osd"+osd_num);
|
||||
}
|
||||
|
||||
async function add_partitions(dev, sizes)
|
||||
{
|
||||
let script = 'label: gpt\n\n';
|
||||
if (dev.parts)
|
||||
{
|
||||
// Old partitions
|
||||
for (const part of dev.parts.partitions)
|
||||
{
|
||||
script += part.node+': '+Object.keys(part).map(k => k == 'node' ? '' : k+'='+part[k]).filter(k => k).join(', ')+'\n';
|
||||
}
|
||||
}
|
||||
// New partitions
|
||||
for (const size of sizes)
|
||||
{
|
||||
script += '+ '+Math.ceil(size/1024)+'KiB\n';
|
||||
}
|
||||
await system_or_die('sfdisk '+dev.path, script);
|
||||
// Get new partition table and find the new partition
|
||||
const newpt = JSON.parse(await system_or_die('sfdisk --dump '+dev.path+' --json')).partitiontable;
|
||||
const old_nodes = dev.parts ? dev.parts.partitions.reduce((a, c) => { a[c.uuid] = true; return a; }, {}) : {};
|
||||
const new_nodes = newpt.partitions.filter(part => !old_nodes[part.uuid]);
|
||||
if (new_nodes.length != sizes.length)
|
||||
{
|
||||
console.error('Failed to partition '+dev.path+': new partitions not found in table');
|
||||
process.exit(1);
|
||||
}
|
||||
dev.parts = newpt;
|
||||
dev.free = free_from_parttable(newpt);
|
||||
return new_nodes;
|
||||
}
|
||||
|
||||
function free_from_parttable(pt)
|
||||
{
|
||||
let free = pt.lastlba + 1 - pt.firstlba;
|
||||
for (const part of pt.partitions)
|
||||
{
|
||||
free -= part.size;
|
||||
}
|
||||
free *= pt.sectorsize;
|
||||
return free;
|
||||
}
|
||||
|
||||
async function system_or_die(cmd, input = '')
|
||||
{
|
||||
let [ exitcode, stdout, stderr ] = await system(cmd, input);
|
||||
if (exitcode != 0)
|
||||
{
|
||||
console.error(cmd+' failed: '+stderr);
|
||||
process.exit(1);
|
||||
}
|
||||
return stdout;
|
||||
}
|
||||
|
||||
async function system(cmd, input = '')
|
||||
{
|
||||
if (options.debug)
|
||||
{
|
||||
process.stderr.write('+ '+cmd+(input ? " <<EOF\n"+input.replace(/\s*$/, '\n')+"EOF" : '')+'\n');
|
||||
}
|
||||
const cp = child_process.spawn(cmd, { shell: true });
|
||||
let stdout = '', stderr = '', finish_cb;
|
||||
cp.stdout.on('data', buf => stdout += buf.toString());
|
||||
cp.stderr.on('data', buf => stderr += buf.toString());
|
||||
cp.on('exit', () => finish_cb && finish_cb());
|
||||
cp.stdin.write(input);
|
||||
cp.stdin.end();
|
||||
if (cp.exitCode == null)
|
||||
{
|
||||
await new Promise(ok => finish_cb = ok);
|
||||
}
|
||||
return [ cp.exitCode, stdout, stderr ];
|
||||
}
|
||||
|
||||
async function file_exists(filename)
|
||||
{
|
||||
return new Promise((ok, no) => fs.access(filename, fs.constants.R_OK, err => ok(!err)));
|
||||
}
|
@@ -25,10 +25,6 @@ OPT=$(vitastor-cli simple-offsets --format options $DEV | tr '\n' ' ')
|
||||
META=$(vitastor-cli simple-offsets --format json $DEV | jq .data_offset)
|
||||
dd if=/dev/zero of=$DEV bs=1048576 count=$(((META+1048575)/1048576)) oflag=direct
|
||||
|
||||
mkdir -p /var/log/vitastor
|
||||
id vitastor &>/dev/null || useradd vitastor
|
||||
chown vitastor /var/log/vitastor
|
||||
|
||||
cat >/etc/systemd/system/vitastor-osd$OSD_NUM.service <<EOF
|
||||
[Unit]
|
||||
Description=Vitastor object storage daemon osd.$OSD_NUM
|
||||
@@ -40,14 +36,14 @@ PartOf=vitastor.target
|
||||
LimitNOFILE=1048576
|
||||
LimitNPROC=1048576
|
||||
LimitMEMLOCK=infinity
|
||||
ExecStart=bash -c '/usr/bin/vitastor-osd \\
|
||||
ExecStart=/usr/bin/vitastor-osd \\
|
||||
--osd_num $OSD_NUM \\
|
||||
--disable_data_fsync 1 \\
|
||||
--immediate_commit all \\
|
||||
--disk_alignment 4096 --journal_block_size 4096 --meta_block_size 4096 \\
|
||||
--journal_no_same_sector_overwrites true \\
|
||||
--journal_sector_buffer_count 1024 \\
|
||||
$OPT >>/var/log/vitastor/osd$OSD_NUM.log 2>&1'
|
||||
$OPT
|
||||
WorkingDirectory=/
|
||||
ExecStartPre=+chown vitastor:vitastor $DEV
|
||||
User=vitastor
|
||||
|
119
mon/mon.js
119
mon/mon.js
@@ -31,7 +31,6 @@ const etcd_allow = new RegExp('^'+[
|
||||
'osd/inodestats/[1-9]\\d*',
|
||||
'osd/space/[1-9]\\d*',
|
||||
'mon/master',
|
||||
'mon/member/[a-f0-9]+',
|
||||
'pg/state/[1-9]\\d*/[1-9]\\d*',
|
||||
'pg/stats/[1-9]\\d*/[1-9]\\d*',
|
||||
'pg/history/[1-9]\\d*/[1-9]\\d*',
|
||||
@@ -64,7 +63,6 @@ const etcd_tree = {
|
||||
// client and osd
|
||||
tcp_header_buffer_size: 65536,
|
||||
use_sync_send_recv: false,
|
||||
use_zerocopy_send: false,
|
||||
use_rdma: true,
|
||||
rdma_device: null, // for example, "rocep5s0f0"
|
||||
rdma_port_num: 1,
|
||||
@@ -161,8 +159,6 @@ const etcd_tree = {
|
||||
root_node?: 'rack1',
|
||||
// restrict pool to OSDs having all of these tags
|
||||
osd_tags?: 'nvme' | [ 'nvme', ... ],
|
||||
// prefer to put primary on OSD with these tags
|
||||
primary_affinity_tags?: 'nvme' | [ 'nvme', ... ],
|
||||
},
|
||||
...
|
||||
}, */
|
||||
@@ -227,28 +223,21 @@ const etcd_tree = {
|
||||
}, */
|
||||
},
|
||||
inodestats: {
|
||||
/* <pool_id>: {
|
||||
<inode_t>: {
|
||||
read: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
|
||||
write: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
|
||||
delete: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
|
||||
},
|
||||
/* <inode_t>: {
|
||||
read: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
|
||||
write: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
|
||||
delete: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
|
||||
}, */
|
||||
},
|
||||
space: {
|
||||
/* <osd_num_t>: {
|
||||
<pool_id>: {
|
||||
<inode_t>: uint64_t, // bytes
|
||||
},
|
||||
<inode_t>: uint64_t, // bytes
|
||||
}, */
|
||||
},
|
||||
},
|
||||
mon: {
|
||||
master: {
|
||||
/* ip: [ string ], id: uint64_t */
|
||||
},
|
||||
standby: {
|
||||
/* <uint64_t>: { ip: [ string ] }, */
|
||||
/* ip: [ string ], */
|
||||
},
|
||||
},
|
||||
pg: {
|
||||
@@ -279,7 +268,7 @@ const etcd_tree = {
|
||||
<pg_id>: {
|
||||
osd_sets: osd_num_t[][],
|
||||
all_peers: osd_num_t[],
|
||||
epoch: uint64_t,
|
||||
epoch: uint32_t,
|
||||
},
|
||||
}, */
|
||||
},
|
||||
@@ -684,25 +673,11 @@ class Mon
|
||||
}, this.etcd_start_timeout, 0);
|
||||
}
|
||||
|
||||
get_mon_state()
|
||||
{
|
||||
return { ip: this.local_ips(), hostname: os.hostname() };
|
||||
}
|
||||
|
||||
async get_lease()
|
||||
{
|
||||
const max_ttl = this.config.etcd_mon_ttl + this.config.etcd_mon_timeout/1000*this.config.etcd_mon_retries;
|
||||
// Get lease
|
||||
let res = await this.etcd_call('/lease/grant', { TTL: max_ttl }, this.config.etcd_mon_timeout, -1);
|
||||
const res = await this.etcd_call('/lease/grant', { TTL: max_ttl }, this.config.etcd_mon_timeout, -1);
|
||||
this.etcd_lease_id = res.ID;
|
||||
// Register in /mon/member, just for the information
|
||||
const state = this.get_mon_state();
|
||||
res = await this.etcd_call('/kv/put', {
|
||||
key: b64(this.etcd_prefix+'/mon/member/'+this.etcd_lease_id),
|
||||
value: b64(JSON.stringify(state)),
|
||||
lease: ''+this.etcd_lease_id
|
||||
}, this.etcd_start_timeout, 0);
|
||||
// Set refresh timer
|
||||
this.lease_timer = setInterval(async () =>
|
||||
{
|
||||
const res = await this.etcd_call('/lease/keepalive', { ID: this.etcd_lease_id }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
|
||||
@@ -728,7 +703,7 @@ class Mon
|
||||
|
||||
async become_master()
|
||||
{
|
||||
const state = { ...this.get_mon_state(), id: ''+this.etcd_lease_id };
|
||||
const state = { ip: this.local_ips() };
|
||||
while (1)
|
||||
{
|
||||
const res = await this.etcd_call('/kv/txn', {
|
||||
@@ -906,39 +881,27 @@ class Mon
|
||||
return this.seed + 2147483648;
|
||||
}
|
||||
|
||||
pick_primary(pool_id, osd_set, up_osds, aff_osds)
|
||||
pick_primary(pool_id, osd_set, up_osds)
|
||||
{
|
||||
let alive_set;
|
||||
if (this.state.config.pools[pool_id].scheme === 'replicated')
|
||||
{
|
||||
// Prefer "affinity" OSDs
|
||||
alive_set = osd_set.filter(osd_num => osd_num && aff_osds[osd_num]);
|
||||
if (!alive_set.length)
|
||||
alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
|
||||
}
|
||||
alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
|
||||
else
|
||||
{
|
||||
// Prefer data OSDs for EC because they can actually read something without an additional network hop
|
||||
const pg_data_size = (this.state.config.pools[pool_id].pg_size||0) -
|
||||
(this.state.config.pools[pool_id].parity_chunks||0);
|
||||
alive_set = osd_set.slice(0, pg_data_size).filter(osd_num => osd_num && aff_osds[osd_num]);
|
||||
alive_set = osd_set.slice(0, pg_data_size).filter(osd_num => osd_num && up_osds[osd_num]);
|
||||
if (!alive_set.length)
|
||||
alive_set = osd_set.filter(osd_num => osd_num && aff_osds[osd_num]);
|
||||
if (!alive_set.length)
|
||||
{
|
||||
alive_set = osd_set.slice(0, pg_data_size).filter(osd_num => osd_num && up_osds[osd_num]);
|
||||
if (!alive_set.length)
|
||||
alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
|
||||
}
|
||||
alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
|
||||
}
|
||||
if (!alive_set.length)
|
||||
return 0;
|
||||
return alive_set[this.rng() % alive_set.length];
|
||||
}
|
||||
|
||||
save_new_pgs_txn(request, pool_id, up_osds, osd_tree, prev_pgs, new_pgs, pg_history)
|
||||
save_new_pgs_txn(request, pool_id, up_osds, prev_pgs, new_pgs, pg_history)
|
||||
{
|
||||
const aff_osds = this.get_affinity_osds(this.state.config.pools[pool_id], up_osds, osd_tree);
|
||||
const pg_items = {};
|
||||
this.reset_rng();
|
||||
new_pgs.map((osd_set, i) =>
|
||||
@@ -946,7 +909,7 @@ class Mon
|
||||
osd_set = osd_set.map(osd_num => osd_num === LPOptimizer.NO_OSD ? 0 : osd_num);
|
||||
pg_items[i+1] = {
|
||||
osd_set,
|
||||
primary: this.pick_primary(pool_id, osd_set, up_osds, aff_osds),
|
||||
primary: this.pick_primary(pool_id, osd_set, up_osds),
|
||||
};
|
||||
if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' ') &&
|
||||
prev_pgs[i].filter(osd_num => osd_num).length > 0)
|
||||
@@ -1077,13 +1040,6 @@ class Mon
|
||||
console.log('Pool '+pool_id+' has invalid osd_tags (must be a string or array of strings)');
|
||||
return false;
|
||||
}
|
||||
if (pool_cfg.primary_affinity_tags && typeof(pool_cfg.primary_affinity_tags) != 'string' &&
|
||||
(!(pool_cfg.primary_affinity_tags instanceof Array) || pool_cfg.primary_affinity_tags.filter(t => typeof t != 'string').length > 0))
|
||||
{
|
||||
if (warn)
|
||||
console.log('Pool '+pool_id+' has invalid primary_affinity_tags (must be a string or array of strings)');
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1113,17 +1069,6 @@ class Mon
|
||||
}
|
||||
}
|
||||
|
||||
get_affinity_osds(pool_cfg, up_osds, osd_tree)
|
||||
{
|
||||
let aff_osds = up_osds;
|
||||
if (pool_cfg.primary_affinity_tags)
|
||||
{
|
||||
aff_osds = { ...up_osds };
|
||||
this.filter_osds_by_tags(osd_tree, { x: aff_osds }, pool_cfg.primary_affinity_tags);
|
||||
}
|
||||
return aff_osds;
|
||||
}
|
||||
|
||||
async recheck_pgs()
|
||||
{
|
||||
// Take configuration and state, check it against the stored configuration hash
|
||||
@@ -1154,7 +1099,7 @@ class Mon
|
||||
{
|
||||
prev_pgs[pg-1] = this.state.config.pgs.items[pool_id][pg].osd_set;
|
||||
}
|
||||
this.save_new_pgs_txn(etcd_request, pool_id, up_osds, osd_tree, prev_pgs, [], []);
|
||||
this.save_new_pgs_txn(etcd_request, pool_id, up_osds, prev_pgs, [], []);
|
||||
}
|
||||
}
|
||||
for (const pool_id in this.state.config.pools)
|
||||
@@ -1261,7 +1206,7 @@ class Mon
|
||||
key: b64(this.etcd_prefix+'/pool/stats/'+pool_id),
|
||||
value: b64(JSON.stringify(this.state.pool.stats[pool_id])),
|
||||
} });
|
||||
this.save_new_pgs_txn(etcd_request, pool_id, up_osds, osd_tree, real_prev_pgs, optimize_result.int_pgs, pg_history);
|
||||
this.save_new_pgs_txn(etcd_request, pool_id, up_osds, real_prev_pgs, optimize_result.int_pgs, pg_history);
|
||||
}
|
||||
this.state.config.pgs.hash = tree_hash;
|
||||
await this.save_pg_config(etcd_request);
|
||||
@@ -1278,14 +1223,13 @@ class Mon
|
||||
continue;
|
||||
}
|
||||
const replicated = pool_cfg.scheme === 'replicated';
|
||||
const aff_osds = this.get_affinity_osds(pool_cfg, up_osds, osd_tree);
|
||||
this.reset_rng();
|
||||
for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++)
|
||||
{
|
||||
const pg_cfg = this.state.config.pgs.items[pool_id][pg_num];
|
||||
if (pg_cfg)
|
||||
{
|
||||
const new_primary = this.pick_primary(pool_id, pg_cfg.osd_set, up_osds, aff_osds);
|
||||
const new_primary = this.pick_primary(pool_id, pg_cfg.osd_set, up_osds);
|
||||
if (pg_cfg.primary != new_primary)
|
||||
{
|
||||
console.log(
|
||||
@@ -1401,30 +1345,21 @@ class Mon
|
||||
const tm = prev_stats ? BigInt(timestamp - prev_stats.timestamp) : 0;
|
||||
for (const op in op_stats)
|
||||
{
|
||||
if (prev_stats && prev_stats.op_stats && prev_stats.op_stats[op])
|
||||
{
|
||||
op_stats[op].bps = (op_stats[op].bytes - prev_stats.op_stats[op].bytes) * 1000n / tm;
|
||||
op_stats[op].iops = (op_stats[op].count - prev_stats.op_stats[op].count) * 1000n / tm;
|
||||
op_stats[op].lat = (op_stats[op].usec - prev_stats.op_stats[op].usec)
|
||||
/ ((op_stats[op].count - prev_stats.op_stats[op].count) || 1n);
|
||||
}
|
||||
op_stats[op].bps = prev_stats ? (op_stats[op].bytes - prev_stats.op_stats[op].bytes) * 1000n / tm : 0;
|
||||
op_stats[op].iops = prev_stats ? (op_stats[op].count - prev_stats.op_stats[op].count) * 1000n / tm : 0;
|
||||
op_stats[op].lat = prev_stats ? (op_stats[op].usec - prev_stats.op_stats[op].usec)
|
||||
/ ((op_stats[op].count - prev_stats.op_stats[op].count) || 1n) : 0;
|
||||
}
|
||||
for (const op in subop_stats)
|
||||
{
|
||||
if (prev_stats && prev_stats.subop_stats && prev_stats.subop_stats[op])
|
||||
{
|
||||
subop_stats[op].iops = (subop_stats[op].count - prev_stats.subop_stats[op].count) * 1000n / tm;
|
||||
subop_stats[op].lat = (subop_stats[op].usec - prev_stats.subop_stats[op].usec)
|
||||
/ ((subop_stats[op].count - prev_stats.subop_stats[op].count) || 1n);
|
||||
}
|
||||
subop_stats[op].iops = prev_stats ? (subop_stats[op].count - prev_stats.subop_stats[op].count) * 1000n / tm : 0;
|
||||
subop_stats[op].lat = prev_stats ? (subop_stats[op].usec - prev_stats.subop_stats[op].usec)
|
||||
/ ((subop_stats[op].count - prev_stats.subop_stats[op].count) || 1n) : 0;
|
||||
}
|
||||
for (const op in recovery_stats)
|
||||
{
|
||||
if (prev_stats && prev_stats.recovery_stats && prev_stats.recovery_stats[op])
|
||||
{
|
||||
recovery_stats[op].bps = (recovery_stats[op].bytes - prev_stats.recovery_stats[op].bytes) * 1000n / tm;
|
||||
recovery_stats[op].iops = (recovery_stats[op].count - prev_stats.recovery_stats[op].count) * 1000n / tm;
|
||||
}
|
||||
recovery_stats[op].bps = prev_stats ? (recovery_stats[op].bytes - prev_stats.recovery_stats[op].bytes) * 1000n / tm : 0;
|
||||
recovery_stats[op].iops = prev_stats ? (recovery_stats[op].count - prev_stats.recovery_stats[op].count) * 1000n / tm : 0;
|
||||
}
|
||||
return { op_stats, subop_stats, recovery_stats };
|
||||
}
|
||||
|
@@ -49,8 +49,7 @@ async function run()
|
||||
}
|
||||
options.journal_offset = Math.ceil(options.journal_offset/options.device_block_size)*options.device_block_size;
|
||||
const meta_offset = options.journal_offset + Math.ceil(options.journal_size/options.device_block_size)*options.device_block_size;
|
||||
const meta_entry_size = 24 + 2*options.object_size/options.bitmap_granularity/8;
|
||||
const entries_per_block = Math.floor(options.device_block_size / meta_entry_size);
|
||||
const entries_per_block = Math.floor(options.device_block_size / (24 + 2*options.object_size/options.bitmap_granularity/8));
|
||||
const object_count = Math.floor((device_size-meta_offset)/options.object_size);
|
||||
const meta_size = Math.ceil(1 + object_count / entries_per_block) * options.device_block_size;
|
||||
const data_offset = meta_offset + meta_size;
|
||||
|
@@ -50,7 +50,7 @@ from cinder.volume import configuration
|
||||
from cinder.volume import driver
|
||||
from cinder.volume import volume_utils
|
||||
|
||||
VERSION = '0.6.17'
|
||||
VERSION = '0.6.12'
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
@@ -355,25 +355,7 @@ class VitastorDriver(driver.CloneableImageVD,
|
||||
def revert_to_snapshot(self, context, volume, snapshot):
|
||||
"""Revert a volume to a given snapshot."""
|
||||
|
||||
vol_name = utils.convert_str(snapshot.volume_name)
|
||||
snap_name = utils.convert_str(snapshot.name)
|
||||
|
||||
# Delete the image and recreate it from the snapshot
|
||||
args = [ 'vitastor-cli', 'rm', vol_name, *(self._vitastor_args()) ]
|
||||
try:
|
||||
self._execute(*args)
|
||||
except processutils.ProcessExecutionError as exc:
|
||||
LOG.error("Failed to delete image "+vol_name+": "+exc)
|
||||
raise exception.VolumeBackendAPIException(data = exc.stderr)
|
||||
args = [
|
||||
'vitastor-cli', 'create', '--parent', vol_name+'@'+snap_name,
|
||||
vol_name, *(self._vitastor_args())
|
||||
]
|
||||
try:
|
||||
self._execute(*args)
|
||||
except processutils.ProcessExecutionError as exc:
|
||||
LOG.error("Failed to recreate image "+vol_name+" from "+vol_name+"@"+snap_name+": "+exc)
|
||||
raise exception.VolumeBackendAPIException(data = exc.stderr)
|
||||
# FIXME Delete the image, then recreate it from the snapshot
|
||||
|
||||
def delete_snapshot(self, snapshot):
|
||||
"""Deletes a snapshot."""
|
||||
@@ -381,15 +363,24 @@ class VitastorDriver(driver.CloneableImageVD,
|
||||
vol_name = utils.convert_str(snapshot.volume_name)
|
||||
snap_name = utils.convert_str(snapshot.name)
|
||||
|
||||
args = [
|
||||
'vitastor-cli', 'rm', vol_name+'@'+snap_name,
|
||||
*(self._vitastor_args())
|
||||
]
|
||||
try:
|
||||
self._execute(*args)
|
||||
except processutils.ProcessExecutionError as exc:
|
||||
LOG.error("Failed to remove snapshot "+vol_name+'@'+snap_name+": "+exc)
|
||||
raise exception.VolumeBackendAPIException(data = exc.stderr)
|
||||
# Find the snapshot
|
||||
resp = self._etcd_txn({ 'success': [
|
||||
{ 'request_range': { 'key': 'index/image/'+vol_name+'@'+snap_name } },
|
||||
] })
|
||||
if len(resp['responses'][0]['kvs']) == 0:
|
||||
raise exception.SnapshotNotFound(snapshot_id = snap_name)
|
||||
inode_id = int(resp['responses'][0]['kvs'][0]['value']['id'])
|
||||
pool_id = int(resp['responses'][0]['kvs'][0]['value']['pool_id'])
|
||||
parents = {}
|
||||
parents[(pool_id << 48) | (inode_id & 0xffffffffffff)] = True
|
||||
|
||||
# Check if there are child volumes
|
||||
children = self._child_count(parents)
|
||||
if children > 0:
|
||||
raise exception.SnapshotIsBusy(snapshot_name = snap_name)
|
||||
|
||||
# FIXME: We can't delete snapshots because we can't merge layers yet
|
||||
raise exception.VolumeBackendAPIException(data = 'Snapshot delete (layer merge) is not implemented yet')
|
||||
|
||||
def _child_count(self, parents):
|
||||
children = 0
|
||||
|
@@ -25,4 +25,4 @@ rm fio
|
||||
mv fio-copy fio
|
||||
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||
tar --transform 's#^#vitastor-0.6.17/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.17$(rpm --eval '%dist').tar.gz *
|
||||
tar --transform 's#^#vitastor-0.6.12/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.12$(rpm --eval '%dist').tar.gz *
|
||||
|
@@ -34,7 +34,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.6.17.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.6.12.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.6.17
|
||||
Version: 0.6.12
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.6.17.el7.tar.gz
|
||||
Source0: vitastor-0.6.12.el7.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
@@ -119,7 +119,6 @@ cp -r mon %buildroot/usr/lib/vitastor
|
||||
|
||||
%files -n vitastor-client
|
||||
%_bindir/vitastor-nbd
|
||||
%_bindir/vitastor-nfs
|
||||
%_bindir/vitastor-cli
|
||||
%_bindir/vitastor-rm
|
||||
%_bindir/vita
|
||||
|
@@ -33,7 +33,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.6.17.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.6.12.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.6.17
|
||||
Version: 0.6.12
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.6.17.el8.tar.gz
|
||||
Source0: vitastor-0.6.12.el8.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
@@ -116,7 +116,6 @@ cp -r mon %buildroot/usr/lib/vitastor
|
||||
|
||||
%files -n vitastor-client
|
||||
%_bindir/vitastor-nbd
|
||||
%_bindir/vitastor-nfs
|
||||
%_bindir/vitastor-cli
|
||||
%_bindir/vitastor-rm
|
||||
%_bindir/vita
|
||||
|
@@ -15,7 +15,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||
endif()
|
||||
|
||||
add_definitions(-DVERSION="0.6.17")
|
||||
add_definitions(-DVERSION="0.6.12")
|
||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
|
||||
if (${WITH_ASAN})
|
||||
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
||||
@@ -124,18 +124,6 @@ add_library(vitastor_client SHARED
|
||||
cluster_client.cpp
|
||||
cluster_client_list.cpp
|
||||
vitastor_c.cpp
|
||||
cli_common.cpp
|
||||
cli_alloc_osd.cpp
|
||||
cli_simple_offsets.cpp
|
||||
cli_status.cpp
|
||||
cli_df.cpp
|
||||
cli_ls.cpp
|
||||
cli_create.cpp
|
||||
cli_modify.cpp
|
||||
cli_flatten.cpp
|
||||
cli_merge.cpp
|
||||
cli_rm_data.cpp
|
||||
cli_rm.cpp
|
||||
)
|
||||
set_target_properties(vitastor_client PROPERTIES PUBLIC_HEADER "vitastor_c.h")
|
||||
target_link_libraries(vitastor_client
|
||||
@@ -164,24 +152,10 @@ target_link_libraries(vitastor-nbd
|
||||
vitastor_client
|
||||
)
|
||||
|
||||
# vitastor-nfs
|
||||
add_executable(vitastor-nfs
|
||||
nfs_proxy.cpp
|
||||
nfs_conn.cpp
|
||||
nfs_portmap.cpp
|
||||
sha256.c
|
||||
nfs/xdr_impl.cpp
|
||||
nfs/rpc_xdr.cpp
|
||||
nfs/portmap_xdr.cpp
|
||||
nfs/nfs_xdr.cpp
|
||||
)
|
||||
target_link_libraries(vitastor-nfs
|
||||
vitastor_client
|
||||
)
|
||||
|
||||
# vitastor-cli
|
||||
add_executable(vitastor-cli
|
||||
cli.cpp
|
||||
cli.cpp cli_alloc_osd.cpp cli_simple_offsets.cpp cli_df.cpp
|
||||
cli_ls.cpp cli_create.cpp cli_modify.cpp cli_flatten.cpp cli_merge.cpp cli_rm.cpp cli_snap_rm.cpp
|
||||
)
|
||||
target_link_libraries(vitastor-cli
|
||||
vitastor_client
|
||||
@@ -270,7 +244,7 @@ target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mo
|
||||
|
||||
### Install
|
||||
|
||||
install(TARGETS vitastor-osd vitastor-dump-journal vitastor-nbd vitastor-nfs vitastor-cli RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||
install(TARGETS vitastor-osd vitastor-dump-journal vitastor-nbd vitastor-cli RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vitastor-rm)
|
||||
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vita)
|
||||
install(
|
||||
|
@@ -1,5 +1,3 @@
|
||||
#include <sys/socket.h>
|
||||
#include <unistd.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <net/if.h>
|
||||
#include <sys/types.h>
|
||||
@@ -11,7 +9,7 @@
|
||||
|
||||
#include "addr_util.h"
|
||||
|
||||
bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr_storage *addr)
|
||||
bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr *addr)
|
||||
{
|
||||
if (parse_port)
|
||||
{
|
||||
@@ -27,7 +25,7 @@ bool string_to_addr(std::string str, bool parse_port, int default_port, struct s
|
||||
}
|
||||
if (inet_pton(AF_INET, str.c_str(), &((struct sockaddr_in*)addr)->sin_addr) == 1)
|
||||
{
|
||||
addr->ss_family = AF_INET;
|
||||
addr->sa_family = AF_INET;
|
||||
((struct sockaddr_in*)addr)->sin_port = htons(default_port);
|
||||
return true;
|
||||
}
|
||||
@@ -35,30 +33,30 @@ bool string_to_addr(std::string str, bool parse_port, int default_port, struct s
|
||||
str = str.substr(1, str.length()-2);
|
||||
if (inet_pton(AF_INET6, str.c_str(), &((struct sockaddr_in6*)addr)->sin6_addr) == 1)
|
||||
{
|
||||
addr->ss_family = AF_INET6;
|
||||
addr->sa_family = AF_INET6;
|
||||
((struct sockaddr_in6*)addr)->sin6_port = htons(default_port);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string addr_to_string(const sockaddr_storage &addr)
|
||||
std::string addr_to_string(const sockaddr &addr)
|
||||
{
|
||||
char peer_str[256];
|
||||
bool ok = false;
|
||||
int port;
|
||||
if (addr.ss_family == AF_INET)
|
||||
if (addr.sa_family == AF_INET)
|
||||
{
|
||||
ok = !!inet_ntop(AF_INET, &((sockaddr_in*)&addr)->sin_addr, peer_str, 256);
|
||||
port = ntohs(((sockaddr_in*)&addr)->sin_port);
|
||||
}
|
||||
else if (addr.ss_family == AF_INET6)
|
||||
else if (addr.sa_family == AF_INET6)
|
||||
{
|
||||
ok = !!inet_ntop(AF_INET6, &((sockaddr_in6*)&addr)->sin6_addr, peer_str, 256);
|
||||
port = ntohs(((sockaddr_in6*)&addr)->sin6_port);
|
||||
}
|
||||
else
|
||||
throw std::runtime_error("Unknown address family "+std::to_string(addr.ss_family));
|
||||
throw std::runtime_error("Unknown address family "+std::to_string(addr.sa_family));
|
||||
if (!ok)
|
||||
throw std::runtime_error(std::string("inet_ntop: ") + strerror(errno));
|
||||
return std::string(peer_str)+":"+std::to_string(port);
|
||||
@@ -188,51 +186,3 @@ std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg, bool
|
||||
freeifaddrs(list);
|
||||
return addresses;
|
||||
}
|
||||
|
||||
int create_and_bind_socket(std::string bind_address, int bind_port, int listen_backlog, int *listening_port)
|
||||
{
|
||||
sockaddr_storage addr;
|
||||
if (!string_to_addr(bind_address, 0, bind_port, &addr))
|
||||
{
|
||||
throw std::runtime_error("bind address "+bind_address+" is not valid");
|
||||
}
|
||||
|
||||
int listen_fd = socket(addr.ss_family, SOCK_STREAM, 0);
|
||||
if (listen_fd < 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("socket: ") + strerror(errno));
|
||||
}
|
||||
int enable = 1;
|
||||
setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
|
||||
|
||||
if (bind(listen_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
|
||||
{
|
||||
close(listen_fd);
|
||||
throw std::runtime_error(std::string("bind: ") + strerror(errno));
|
||||
}
|
||||
if (listening_port)
|
||||
{
|
||||
if (bind_port == 0)
|
||||
{
|
||||
socklen_t len = sizeof(addr);
|
||||
if (getsockname(listen_fd, (sockaddr *)&addr, &len) == -1)
|
||||
{
|
||||
close(listen_fd);
|
||||
throw std::runtime_error(std::string("getsockname: ") + strerror(errno));
|
||||
}
|
||||
*listening_port = ntohs(((sockaddr_in*)&addr)->sin_port);
|
||||
}
|
||||
else
|
||||
{
|
||||
*listening_port = bind_port;
|
||||
}
|
||||
}
|
||||
|
||||
if (listen(listen_fd, listen_backlog ? listen_backlog : 128) < 0)
|
||||
{
|
||||
close(listen_fd);
|
||||
throw std::runtime_error(std::string("listen: ") + strerror(errno));
|
||||
}
|
||||
|
||||
return listen_fd;
|
||||
}
|
||||
|
@@ -4,7 +4,6 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr_storage *addr);
|
||||
std::string addr_to_string(const sockaddr_storage &addr);
|
||||
bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr *addr);
|
||||
std::string addr_to_string(const sockaddr &addr);
|
||||
std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg = std::vector<std::string>(), bool include_v6 = false);
|
||||
int create_and_bind_socket(std::string bind_address, int bind_port, int listen_backlog, int *listening_port);
|
||||
|
@@ -25,7 +25,7 @@ allocator::allocator(uint64_t blocks)
|
||||
size = free = blocks;
|
||||
last_one_mask = (blocks % 64) == 0
|
||||
? UINT64_MAX
|
||||
: (((uint64_t)1 << (blocks % 64)) - 1);
|
||||
: ((1l << (blocks % 64)) - 1);
|
||||
for (uint64_t i = 0; i < total; i++)
|
||||
{
|
||||
mask[i] = 0;
|
||||
@@ -79,7 +79,7 @@ void allocator::set(uint64_t addr, bool value)
|
||||
}
|
||||
if (value)
|
||||
{
|
||||
mask[last] = mask[last] | ((uint64_t)1 << bit);
|
||||
mask[last] = mask[last] | (1l << bit);
|
||||
if (mask[last] != (!is_last || cur_addr/64 < size/64
|
||||
? UINT64_MAX : last_one_mask))
|
||||
{
|
||||
@@ -88,7 +88,7 @@ void allocator::set(uint64_t addr, bool value)
|
||||
}
|
||||
else
|
||||
{
|
||||
mask[last] = mask[last] & ~((uint64_t)1 << bit);
|
||||
mask[last] = mask[last] & ~(1l << bit);
|
||||
}
|
||||
is_last = false;
|
||||
if (p2 > 1)
|
||||
|
@@ -21,7 +21,7 @@
|
||||
// Memory alignment for direct I/O (usually 512 bytes)
|
||||
// All other alignments must be a multiple of this one
|
||||
#ifndef MEM_ALIGNMENT
|
||||
#define MEM_ALIGNMENT 4096
|
||||
#define MEM_ALIGNMENT 512
|
||||
#endif
|
||||
|
||||
// Default block size is 128 KB, current allowed range is 4K - 128M
|
||||
|
@@ -415,11 +415,8 @@ stop_flusher:
|
||||
flusher->active_flushers++;
|
||||
resume_1:
|
||||
// Find it in clean_db
|
||||
{
|
||||
auto & clean_db = bs->clean_db_shard(cur.oid);
|
||||
auto clean_it = clean_db.find(cur.oid);
|
||||
old_clean_loc = (clean_it != clean_db.end() ? clean_it->second.location : UINT64_MAX);
|
||||
}
|
||||
clean_it = bs->clean_db.find(cur.oid);
|
||||
old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
|
||||
// Scan dirty versions of the object
|
||||
if (!scan_dirty(1))
|
||||
{
|
||||
@@ -873,11 +870,10 @@ void journal_flusher_co::update_clean_db()
|
||||
#endif
|
||||
bs->data_alloc->set(old_clean_loc >> bs->block_order, false);
|
||||
}
|
||||
auto & clean_db = bs->clean_db_shard(cur.oid);
|
||||
if (has_delete)
|
||||
{
|
||||
auto clean_it = clean_db.find(cur.oid);
|
||||
clean_db.erase(clean_it);
|
||||
auto clean_it = bs->clean_db.find(cur.oid);
|
||||
bs->clean_db.erase(clean_it);
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Free block %lu from %lx:%lx v%lu (delete)\n",
|
||||
clean_loc >> bs->block_order,
|
||||
@@ -888,7 +884,7 @@ void journal_flusher_co::update_clean_db()
|
||||
}
|
||||
else
|
||||
{
|
||||
clean_db[cur.oid] = {
|
||||
bs->clean_db[cur.oid] = {
|
||||
.version = cur.version,
|
||||
.location = clean_loc,
|
||||
};
|
||||
|
@@ -49,6 +49,7 @@ class journal_flusher_co
|
||||
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
|
||||
|
||||
bool skip_copy, has_delete, has_writes;
|
||||
blockstore_clean_db_t::iterator clean_it;
|
||||
std::vector<copy_buffer_t> v;
|
||||
std::vector<copy_buffer_t>::iterator it;
|
||||
int copy_count;
|
||||
|
@@ -118,7 +118,7 @@ void blockstore_impl_t::loop()
|
||||
// has_writes == 0 - no writes before the current queue item
|
||||
// has_writes == 1 - some writes in progress
|
||||
// has_writes == 2 - tried to submit some writes, but failed
|
||||
int has_writes = 0, op_idx = 0, new_idx = 0, done_lists = 0;
|
||||
int has_writes = 0, op_idx = 0, new_idx = 0;
|
||||
for (; op_idx < submit_queue.size(); op_idx++, new_idx++)
|
||||
{
|
||||
auto op = submit_queue[op_idx];
|
||||
@@ -198,14 +198,9 @@ void blockstore_impl_t::loop()
|
||||
}
|
||||
else if (op->opcode == BS_OP_LIST)
|
||||
{
|
||||
// LIST doesn't have to be blocked by previous modifications
|
||||
// But don't do a lot of LISTs at once, because they're blocking and potentially slow
|
||||
if (single_tick_list_limit <= 0 || done_lists < single_tick_list_limit)
|
||||
{
|
||||
process_list(op);
|
||||
done_lists++;
|
||||
wr_st = 2;
|
||||
}
|
||||
// LIST doesn't need to be blocked by previous modifications
|
||||
process_list(op);
|
||||
wr_st = 2;
|
||||
}
|
||||
if (wr_st == 2)
|
||||
{
|
||||
@@ -428,104 +423,22 @@ static bool replace_stable(object_id oid, uint64_t version, int search_start, in
|
||||
return false;
|
||||
}
|
||||
|
||||
blockstore_clean_db_t& blockstore_impl_t::clean_db_shard(object_id oid)
|
||||
{
|
||||
uint64_t pg_num = 0;
|
||||
uint64_t pool_id = (oid.inode >> (64-POOL_ID_BITS));
|
||||
auto sh_it = clean_db_settings.find(pool_id);
|
||||
if (sh_it != clean_db_settings.end())
|
||||
{
|
||||
// like map_to_pg()
|
||||
pg_num = (oid.stripe / sh_it->second.pg_stripe_size) % sh_it->second.pg_count + 1;
|
||||
}
|
||||
return clean_db_shards[(pool_id << (64-POOL_ID_BITS)) | pg_num];
|
||||
}
|
||||
|
||||
void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint32_t pg_stripe_size)
|
||||
{
|
||||
uint64_t pool_id = (uint64_t)pool;
|
||||
std::map<pool_pg_id_t, blockstore_clean_db_t> new_shards;
|
||||
auto sh_it = clean_db_shards.lower_bound((pool_id << (64-POOL_ID_BITS)));
|
||||
while (sh_it != clean_db_shards.end() &&
|
||||
(sh_it->first >> (64-POOL_ID_BITS)) == pool_id)
|
||||
{
|
||||
for (auto & pair: sh_it->second)
|
||||
{
|
||||
// like map_to_pg()
|
||||
uint64_t pg_num = (pair.first.stripe / pg_stripe_size) % pg_count + 1;
|
||||
uint64_t shard_id = (pool_id << (64-POOL_ID_BITS)) | pg_num;
|
||||
new_shards[shard_id][pair.first] = pair.second;
|
||||
}
|
||||
clean_db_shards.erase(sh_it++);
|
||||
}
|
||||
for (sh_it = new_shards.begin(); sh_it != new_shards.end(); sh_it++)
|
||||
{
|
||||
auto & to = clean_db_shards[sh_it->first];
|
||||
to.swap(sh_it->second);
|
||||
}
|
||||
clean_db_settings[pool_id] = (pool_shard_settings_t){
|
||||
.pg_count = pg_count,
|
||||
.pg_stripe_size = pg_stripe_size,
|
||||
};
|
||||
}
|
||||
|
||||
void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
{
|
||||
uint32_t list_pg = op->offset+1;
|
||||
uint32_t list_pg = op->offset;
|
||||
uint32_t pg_count = op->len;
|
||||
uint64_t pg_stripe_size = op->oid.stripe;
|
||||
uint64_t min_inode = op->oid.inode;
|
||||
uint64_t max_inode = op->version;
|
||||
// Check PG
|
||||
if (pg_count != 0 && (pg_stripe_size < MIN_BLOCK_SIZE || list_pg > pg_count))
|
||||
if (pg_count != 0 && (pg_stripe_size < MIN_BLOCK_SIZE || list_pg >= pg_count))
|
||||
{
|
||||
op->retval = -EINVAL;
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
// Check if the DB needs resharding
|
||||
// (we don't know about PGs from the beginning, we only create "shards" here)
|
||||
uint64_t first_shard = 0, last_shard = UINT64_MAX;
|
||||
if (min_inode != 0 &&
|
||||
// Check if min_inode == max_inode == pool_id<<N, i.e. this is a pool listing
|
||||
(min_inode >> (64-POOL_ID_BITS)) == (max_inode >> (64-POOL_ID_BITS)))
|
||||
{
|
||||
pool_id_t pool_id = (min_inode >> (64-POOL_ID_BITS));
|
||||
if (pg_count > 1)
|
||||
{
|
||||
// Per-pg listing
|
||||
auto sh_it = clean_db_settings.find(pool_id);
|
||||
if (sh_it == clean_db_settings.end() ||
|
||||
sh_it->second.pg_count != pg_count ||
|
||||
sh_it->second.pg_stripe_size != pg_stripe_size)
|
||||
{
|
||||
reshard_clean_db(pool_id, pg_count, pg_stripe_size);
|
||||
}
|
||||
first_shard = last_shard = ((uint64_t)pool_id << (64-POOL_ID_BITS)) | list_pg;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Per-pool listing
|
||||
first_shard = ((uint64_t)pool_id << (64-POOL_ID_BITS));
|
||||
last_shard = ((uint64_t)(pool_id+1) << (64-POOL_ID_BITS)) - 1;
|
||||
}
|
||||
}
|
||||
// Copy clean_db entries
|
||||
int stable_count = 0, stable_alloc = 0;
|
||||
if (min_inode != max_inode)
|
||||
{
|
||||
for (auto shard_it = clean_db_shards.lower_bound(first_shard);
|
||||
shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
|
||||
shard_it++)
|
||||
{
|
||||
auto & clean_db = shard_it->second;
|
||||
stable_alloc += clean_db.size();
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
stable_alloc = 32768;
|
||||
}
|
||||
// Copy clean_db entries (sorted)
|
||||
int stable_count = 0, stable_alloc = clean_db.size() / (pg_count ? pg_count : 1);
|
||||
obj_ver_id *stable = (obj_ver_id*)malloc(sizeof(obj_ver_id) * stable_alloc);
|
||||
if (!stable)
|
||||
{
|
||||
@@ -533,11 +446,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
for (auto shard_it = clean_db_shards.lower_bound(first_shard);
|
||||
shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
|
||||
shard_it++)
|
||||
{
|
||||
auto & clean_db = shard_it->second;
|
||||
auto clean_it = clean_db.begin(), clean_end = clean_db.end();
|
||||
if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
|
||||
{
|
||||
@@ -552,28 +461,26 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
}
|
||||
for (; clean_it != clean_end; clean_it++)
|
||||
{
|
||||
if (stable_count >= stable_alloc)
|
||||
if (!pg_count || ((clean_it->first.stripe / pg_stripe_size) % pg_count) == list_pg) // like map_to_pg()
|
||||
{
|
||||
stable_alloc *= 2;
|
||||
stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
|
||||
if (!stable)
|
||||
if (stable_count >= stable_alloc)
|
||||
{
|
||||
op->retval = -ENOMEM;
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
stable_alloc += 32768;
|
||||
stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
|
||||
if (!stable)
|
||||
{
|
||||
op->retval = -ENOMEM;
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
}
|
||||
stable[stable_count++] = {
|
||||
.oid = clean_it->first,
|
||||
.version = clean_it->second.version,
|
||||
};
|
||||
}
|
||||
stable[stable_count++] = {
|
||||
.oid = clean_it->first,
|
||||
.version = clean_it->second.version,
|
||||
};
|
||||
}
|
||||
}
|
||||
if (first_shard != last_shard)
|
||||
{
|
||||
// If that's not a per-PG listing, sort clean entries
|
||||
std::sort(stable, stable+stable_count);
|
||||
}
|
||||
int clean_stable_count = stable_count;
|
||||
// Copy dirty_db entries (sorted, too)
|
||||
int unstable_count = 0, unstable_alloc = 0;
|
||||
@@ -599,7 +506,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
}
|
||||
for (; dirty_it != dirty_end; dirty_it++)
|
||||
{
|
||||
if (!pg_count || ((dirty_it->first.oid.stripe / pg_stripe_size) % pg_count + 1) == list_pg) // like map_to_pg()
|
||||
if (!pg_count || ((dirty_it->first.oid.stripe / pg_stripe_size) % pg_count) == list_pg) // like map_to_pg()
|
||||
{
|
||||
if (IS_DELETE(dirty_it->second.state))
|
||||
{
|
||||
|
@@ -204,17 +204,6 @@ typedef std::map<obj_ver_id, dirty_entry> blockstore_dirty_db_t;
|
||||
|
||||
#include "blockstore_flush.h"
|
||||
|
||||
typedef uint32_t pool_id_t;
|
||||
typedef uint64_t pool_pg_id_t;
|
||||
|
||||
#define POOL_ID_BITS 16
|
||||
|
||||
struct pool_shard_settings_t
|
||||
{
|
||||
uint32_t pg_count;
|
||||
uint32_t pg_stripe_size;
|
||||
};
|
||||
|
||||
class blockstore_impl_t
|
||||
{
|
||||
/******* OPTIONS *******/
|
||||
@@ -252,14 +241,11 @@ class blockstore_impl_t
|
||||
int throttle_target_parallelism = 1;
|
||||
// Minimum difference in microseconds between target and real execution times to throttle the response
|
||||
int throttle_threshold_us = 50;
|
||||
// Maximum number of LIST operations to be processed between
|
||||
int single_tick_list_limit = 1;
|
||||
/******* END OF OPTIONS *******/
|
||||
|
||||
struct ring_consumer_t ring_consumer;
|
||||
|
||||
std::map<pool_id_t, pool_shard_settings_t> clean_db_settings;
|
||||
std::map<pool_pg_id_t, blockstore_clean_db_t> clean_db_shards;
|
||||
blockstore_clean_db_t clean_db;
|
||||
uint8_t *clean_bitmap = NULL;
|
||||
blockstore_dirty_db_t dirty_db;
|
||||
std::vector<blockstore_op_t*> submit_queue;
|
||||
@@ -308,9 +294,6 @@ class blockstore_impl_t
|
||||
void open_journal();
|
||||
uint8_t* get_clean_entry_bitmap(uint64_t block_loc, int offset);
|
||||
|
||||
blockstore_clean_db_t& clean_db_shard(object_id oid);
|
||||
void reshard_clean_db(pool_id_t pool_id, uint32_t pg_count, uint32_t pg_stripe_size);
|
||||
|
||||
// Journaling
|
||||
void prepare_journal_sector_write(int sector, blockstore_op_t *op);
|
||||
void handle_journal_write(ring_data_t *data, uint64_t flush_id);
|
||||
|
@@ -131,7 +131,6 @@ resume_1:
|
||||
}
|
||||
// Skip superblock
|
||||
bs->meta_offset += bs->meta_block_size;
|
||||
bs->meta_len -= bs->meta_block_size;
|
||||
prev_done = 0;
|
||||
done_len = 0;
|
||||
done_pos = 0;
|
||||
@@ -223,11 +222,10 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
|
||||
}
|
||||
if (entry->oid.inode > 0)
|
||||
{
|
||||
auto & clean_db = bs->clean_db_shard(entry->oid);
|
||||
auto clean_it = clean_db.find(entry->oid);
|
||||
if (clean_it == clean_db.end() || clean_it->second.version < entry->version)
|
||||
auto clean_it = bs->clean_db.find(entry->oid);
|
||||
if (clean_it == bs->clean_db.end() || clean_it->second.version < entry->version)
|
||||
{
|
||||
if (clean_it != clean_db.end())
|
||||
if (clean_it != bs->clean_db.end())
|
||||
{
|
||||
// free the previous block
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
@@ -247,7 +245,7 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
|
||||
printf("Allocate block (clean entry) %lu: %lx:%lx v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
|
||||
#endif
|
||||
bs->data_alloc->set(done_cnt+i, true);
|
||||
clean_db[entry->oid] = (struct clean_entry){
|
||||
bs->clean_db[entry->oid] = (struct clean_entry){
|
||||
.version = entry->version,
|
||||
.location = (done_cnt+i) << block_order,
|
||||
};
|
||||
@@ -658,9 +656,8 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
init_write_sector = proc_pos;
|
||||
return 0;
|
||||
}
|
||||
auto & clean_db = bs->clean_db_shard(je->small_write.oid);
|
||||
auto clean_it = clean_db.find(je->small_write.oid);
|
||||
if (clean_it == clean_db.end() ||
|
||||
auto clean_it = bs->clean_db.find(je->small_write.oid);
|
||||
if (clean_it == bs->clean_db.end() ||
|
||||
clean_it->second.version < je->small_write.version)
|
||||
{
|
||||
obj_ver_id ov = {
|
||||
@@ -738,9 +735,8 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
erase_dirty_object(dirty_it);
|
||||
}
|
||||
}
|
||||
auto & clean_db = bs->clean_db_shard(je->big_write.oid);
|
||||
auto clean_it = clean_db.find(je->big_write.oid);
|
||||
if (clean_it == clean_db.end() ||
|
||||
auto clean_it = bs->clean_db.find(je->big_write.oid);
|
||||
if (clean_it == bs->clean_db.end() ||
|
||||
clean_it->second.version < je->big_write.version)
|
||||
{
|
||||
// oid, version, block
|
||||
@@ -845,9 +841,8 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||
dirty_it--;
|
||||
dirty_exists = dirty_it->first.oid == je->del.oid;
|
||||
}
|
||||
auto & clean_db = bs->clean_db_shard(je->del.oid);
|
||||
auto clean_it = clean_db.find(je->del.oid);
|
||||
bool clean_exists = (clean_it != clean_db.end() &&
|
||||
auto clean_it = bs->clean_db.find(je->del.oid);
|
||||
bool clean_exists = (clean_it != bs->clean_db.end() &&
|
||||
clean_it->second.version < je->del.version);
|
||||
if (!clean_exists && dirty_exists)
|
||||
{
|
||||
@@ -906,9 +901,8 @@ void blockstore_init_journal::erase_dirty_object(blockstore_dirty_db_t::iterator
|
||||
break;
|
||||
}
|
||||
}
|
||||
auto & clean_db = bs->clean_db_shard(oid);
|
||||
auto clean_it = clean_db.find(oid);
|
||||
uint64_t clean_loc = clean_it != clean_db.end()
|
||||
auto clean_it = bs->clean_db.find(oid);
|
||||
uint64_t clean_loc = clean_it != bs->clean_db.end()
|
||||
? clean_it->second.location : UINT64_MAX;
|
||||
if (exists && clean_loc == UINT64_MAX)
|
||||
{
|
||||
|
@@ -111,7 +111,6 @@ uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offse
|
||||
|
||||
int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||
{
|
||||
auto & clean_db = clean_db_shard(read_op->oid);
|
||||
auto clean_it = clean_db.find(read_op->oid);
|
||||
auto dirty_it = dirty_db.upper_bound((obj_ver_id){
|
||||
.oid = read_op->oid,
|
||||
@@ -298,7 +297,6 @@ int blockstore_impl_t::read_bitmap(object_id oid, uint64_t target_version, void
|
||||
dirty_it--;
|
||||
}
|
||||
}
|
||||
auto & clean_db = clean_db_shard(oid);
|
||||
auto clean_it = clean_db.find(oid);
|
||||
if (clean_it != clean_db.end())
|
||||
{
|
||||
|
@@ -54,7 +54,6 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
||||
auto dirty_it = dirty_db.find(*v);
|
||||
if (dirty_it == dirty_db.end())
|
||||
{
|
||||
auto & clean_db = clean_db_shard(v->oid);
|
||||
auto clean_it = clean_db.find(v->oid);
|
||||
if (clean_it == clean_db.end() || clean_it->second.version < v->version)
|
||||
{
|
||||
@@ -189,7 +188,6 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
|
||||
}
|
||||
if (exists == -1)
|
||||
{
|
||||
auto & clean_db = clean_db_shard(v.oid);
|
||||
auto clean_it = clean_db.find(v.oid);
|
||||
exists = clean_it != clean_db.end() ? 1 : 0;
|
||||
}
|
||||
@@ -217,7 +215,6 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
|
||||
break;
|
||||
}
|
||||
}
|
||||
auto & clean_db = clean_db_shard(v.oid);
|
||||
auto clean_it = clean_db.find(v.oid);
|
||||
uint64_t clean_loc = clean_it != clean_db.end()
|
||||
? clean_it->second.location : UINT64_MAX;
|
||||
|
@@ -41,7 +41,6 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||
}
|
||||
if (!found)
|
||||
{
|
||||
auto & clean_db = clean_db_shard(op->oid);
|
||||
auto clean_it = clean_db.find(op->oid);
|
||||
if (clean_it != clean_db.end())
|
||||
{
|
||||
@@ -544,13 +543,12 @@ resume_4:
|
||||
if (ref_us > exec_us + throttle_threshold_us)
|
||||
{
|
||||
// Pause reply
|
||||
PRIV(op)->op_state = 5;
|
||||
// Remember that the timer can in theory be called right here
|
||||
tfd->set_timer_us(ref_us-exec_us, false, [this, op](int timer_id)
|
||||
{
|
||||
PRIV(op)->op_state++;
|
||||
ringloop->wakeup();
|
||||
});
|
||||
PRIV(op)->op_state = 5;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
290
src/cli.cpp
290
src/cli.cpp
@@ -2,7 +2,8 @@
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
/**
|
||||
* CLI tool and also a library for administrative tasks
|
||||
* CLI tool
|
||||
* Currently can (a) remove inodes and (b) merge snapshot/clone layers
|
||||
*/
|
||||
|
||||
#include <vector>
|
||||
@@ -16,9 +17,7 @@
|
||||
|
||||
static const char *exe_name = NULL;
|
||||
|
||||
static void help();
|
||||
|
||||
static json11::Json::object parse_args(int narg, const char *args[])
|
||||
json11::Json::object cli_tool_t::parse_args(int narg, const char *args[])
|
||||
{
|
||||
json11::Json::object cfg;
|
||||
json11::Json::array cmd;
|
||||
@@ -80,16 +79,13 @@ static json11::Json::object parse_args(int narg, const char *args[])
|
||||
return cfg;
|
||||
}
|
||||
|
||||
static void help()
|
||||
void cli_tool_t::help()
|
||||
{
|
||||
printf(
|
||||
"Vitastor command-line tool\n"
|
||||
"(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
|
||||
"\n"
|
||||
"USAGE:\n"
|
||||
"%s status\n"
|
||||
" Show cluster status\n"
|
||||
"\n"
|
||||
"%s df\n"
|
||||
" Show pool space statistics\n"
|
||||
"\n"
|
||||
@@ -159,177 +155,223 @@ static void help()
|
||||
" --no-color Disable colored output\n"
|
||||
" --json JSON output\n"
|
||||
,
|
||||
exe_name, exe_name, exe_name, exe_name, exe_name, exe_name, exe_name,
|
||||
exe_name, exe_name, exe_name, exe_name, exe_name, exe_name,
|
||||
exe_name, exe_name, exe_name, exe_name, exe_name, exe_name
|
||||
);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
static int run(cli_tool_t *p, json11::Json::object cfg)
|
||||
void cli_tool_t::change_parent(inode_t cur, inode_t new_parent)
|
||||
{
|
||||
auto cur_cfg_it = cli->st_cli.inode_config.find(cur);
|
||||
if (cur_cfg_it == cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Inode 0x%lx disappeared\n", cur);
|
||||
exit(1);
|
||||
}
|
||||
inode_config_t new_cfg = cur_cfg_it->second;
|
||||
std::string cur_name = new_cfg.name;
|
||||
std::string cur_cfg_key = base64_encode(cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(cur))+
|
||||
"/"+std::to_string(INODE_NO_POOL(cur)));
|
||||
new_cfg.parent_id = new_parent;
|
||||
json11::Json::object cur_cfg_json = cli->st_cli.serialize_inode_cfg(&new_cfg);
|
||||
waiting++;
|
||||
cli->st_cli.etcd_txn_slow(json11::Json::object {
|
||||
{ "compare", json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", cur_cfg_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", new_cfg.mod_revision+1 },
|
||||
},
|
||||
} },
|
||||
{ "success", json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", cur_cfg_key },
|
||||
{ "value", base64_encode(json11::Json(cur_cfg_json).dump()) },
|
||||
} }
|
||||
},
|
||||
} },
|
||||
}, [this, new_parent, cur, cur_name](std::string err, json11::Json res)
|
||||
{
|
||||
if (err != "")
|
||||
{
|
||||
fprintf(stderr, "Error changing parent of %s: %s\n", cur_name.c_str(), err.c_str());
|
||||
exit(1);
|
||||
}
|
||||
if (!res["succeeded"].bool_value())
|
||||
{
|
||||
fprintf(stderr, "Inode %s was modified during snapshot deletion\n", cur_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
if (new_parent)
|
||||
{
|
||||
auto new_parent_it = cli->st_cli.inode_config.find(new_parent);
|
||||
std::string new_parent_name = new_parent_it != cli->st_cli.inode_config.end()
|
||||
? new_parent_it->second.name : "<unknown>";
|
||||
printf(
|
||||
"Parent of layer %s (inode %lu in pool %u) changed to %s (inode %lu in pool %u)\n",
|
||||
cur_name.c_str(), INODE_NO_POOL(cur), INODE_POOL(cur),
|
||||
new_parent_name.c_str(), INODE_NO_POOL(new_parent), INODE_POOL(new_parent)
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf(
|
||||
"Parent of layer %s (inode %lu in pool %u) detached\n",
|
||||
cur_name.c_str(), INODE_NO_POOL(cur), INODE_POOL(cur)
|
||||
);
|
||||
}
|
||||
waiting--;
|
||||
ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
|
||||
void cli_tool_t::etcd_txn(json11::Json txn)
|
||||
{
|
||||
waiting++;
|
||||
cli->st_cli.etcd_txn_slow(txn, [this](std::string err, json11::Json res)
|
||||
{
|
||||
waiting--;
|
||||
if (err != "")
|
||||
{
|
||||
fprintf(stderr, "Error reading from etcd: %s\n", err.c_str());
|
||||
exit(1);
|
||||
}
|
||||
etcd_result = res;
|
||||
ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
|
||||
inode_config_t* cli_tool_t::get_inode_cfg(const std::string & name)
|
||||
{
|
||||
for (auto & ic: cli->st_cli.inode_config)
|
||||
{
|
||||
if (ic.second.name == name)
|
||||
{
|
||||
return &ic.second;
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "Layer %s not found\n", name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
|
||||
void cli_tool_t::run(json11::Json cfg)
|
||||
{
|
||||
cli_result_t result;
|
||||
p->parse_config(cfg);
|
||||
json11::Json::array cmd = cfg["command"].array_items();
|
||||
cfg.erase("command");
|
||||
std::function<bool(cli_result_t &)> action_cb;
|
||||
if (!cmd.size())
|
||||
{
|
||||
result = { .err = EINVAL, .text = "command is missing" };
|
||||
}
|
||||
else if (cmd[0] == "status")
|
||||
{
|
||||
// Show cluster status
|
||||
action_cb = p->start_status(cfg);
|
||||
fprintf(stderr, "command is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
else if (cmd[0] == "df")
|
||||
{
|
||||
// Show pool space stats
|
||||
action_cb = p->start_df(cfg);
|
||||
action_cb = start_df(cfg);
|
||||
}
|
||||
else if (cmd[0] == "ls")
|
||||
{
|
||||
// List images
|
||||
if (cmd.size() > 1)
|
||||
{
|
||||
cmd.erase(cmd.begin(), cmd.begin()+1);
|
||||
cfg["names"] = cmd;
|
||||
}
|
||||
action_cb = p->start_ls(cfg);
|
||||
action_cb = start_ls(cfg);
|
||||
}
|
||||
else if (cmd[0] == "snap-create")
|
||||
{
|
||||
// Create snapshot
|
||||
std::string name = cmd.size() > 1 ? cmd[1].string_value() : "";
|
||||
int pos = name.find('@');
|
||||
if (pos == std::string::npos || pos == name.length()-1)
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Please specify new snapshot name after @" };
|
||||
}
|
||||
else
|
||||
{
|
||||
cfg["image"] = name.substr(0, pos);
|
||||
cfg["snapshot"] = name.substr(pos + 1);
|
||||
action_cb = p->start_create(cfg);
|
||||
}
|
||||
}
|
||||
else if (cmd[0] == "create")
|
||||
else if (cmd[0] == "create" || cmd[0] == "snap-create")
|
||||
{
|
||||
// Create image/snapshot
|
||||
if (cmd.size() > 1)
|
||||
{
|
||||
cfg["image"] = cmd[1];
|
||||
}
|
||||
action_cb = p->start_create(cfg);
|
||||
action_cb = start_create(cfg);
|
||||
}
|
||||
else if (cmd[0] == "modify")
|
||||
{
|
||||
// Modify image
|
||||
if (cmd.size() > 1)
|
||||
{
|
||||
cfg["image"] = cmd[1];
|
||||
}
|
||||
action_cb = p->start_modify(cfg);
|
||||
action_cb = start_modify(cfg);
|
||||
}
|
||||
else if (cmd[0] == "rm-data")
|
||||
{
|
||||
// Delete inode data
|
||||
action_cb = p->start_rm_data(cfg);
|
||||
action_cb = start_rm(cfg);
|
||||
}
|
||||
else if (cmd[0] == "merge-data")
|
||||
{
|
||||
// Merge layer data without affecting metadata
|
||||
if (cmd.size() > 1)
|
||||
{
|
||||
cfg["from"] = cmd[1];
|
||||
if (cmd.size() > 2)
|
||||
cfg["to"] = cmd[2];
|
||||
}
|
||||
action_cb = p->start_merge(cfg);
|
||||
action_cb = start_merge(cfg);
|
||||
}
|
||||
else if (cmd[0] == "flatten")
|
||||
{
|
||||
// Merge layer data without affecting metadata
|
||||
if (cmd.size() > 1)
|
||||
{
|
||||
cfg["image"] = cmd[1];
|
||||
}
|
||||
action_cb = p->start_flatten(cfg);
|
||||
action_cb = start_flatten(cfg);
|
||||
}
|
||||
else if (cmd[0] == "rm")
|
||||
{
|
||||
// Remove multiple snapshots and rebase their children
|
||||
if (cmd.size() > 1)
|
||||
{
|
||||
cfg["from"] = cmd[1];
|
||||
if (cmd.size() > 2)
|
||||
cfg["to"] = cmd[2];
|
||||
}
|
||||
action_cb = p->start_rm(cfg);
|
||||
action_cb = start_snap_rm(cfg);
|
||||
}
|
||||
else if (cmd[0] == "alloc-osd")
|
||||
{
|
||||
// Allocate a new OSD number
|
||||
action_cb = p->start_alloc_osd(cfg);
|
||||
action_cb = start_alloc_osd(cfg);
|
||||
}
|
||||
else if (cmd[0] == "simple-offsets")
|
||||
{
|
||||
// Calculate offsets for simple & stupid OSD deployment without superblock
|
||||
if (cmd.size() > 1)
|
||||
{
|
||||
cfg["device"] = cmd[1];
|
||||
}
|
||||
action_cb = p->simple_offsets(cfg);
|
||||
action_cb = simple_offsets(cfg);
|
||||
}
|
||||
else
|
||||
{
|
||||
result = { .err = EINVAL, .text = "unknown command: "+cmd[0].string_value() };
|
||||
fprintf(stderr, "unknown command: %s\n", cmd[0].string_value().c_str());
|
||||
exit(1);
|
||||
}
|
||||
if (action_cb != NULL)
|
||||
if (action_cb == NULL)
|
||||
{
|
||||
// Create client
|
||||
json11::Json cfg_j = cfg;
|
||||
p->ringloop = new ring_loop_t(512);
|
||||
p->epmgr = new epoll_manager_t(p->ringloop);
|
||||
p->cli = new cluster_client_t(p->ringloop, p->epmgr->tfd, cfg_j);
|
||||
// Smaller timeout by default for more interactiveness
|
||||
p->cli->st_cli.etcd_slow_timeout = p->cli->st_cli.etcd_quick_timeout;
|
||||
p->loop_and_wait(action_cb, [&](const cli_result_t & r)
|
||||
return;
|
||||
}
|
||||
color = !cfg["no-color"].bool_value();
|
||||
json_output = cfg["json"].bool_value();
|
||||
iodepth = cfg["iodepth"].uint64_value();
|
||||
if (!iodepth)
|
||||
iodepth = 32;
|
||||
parallel_osds = cfg["parallel_osds"].uint64_value();
|
||||
if (!parallel_osds)
|
||||
parallel_osds = 4;
|
||||
log_level = cfg["log_level"].int64_value();
|
||||
progress = cfg["progress"].uint64_value() ? true : false;
|
||||
list_first = cfg["wait-list"].uint64_value() ? true : false;
|
||||
// Create client
|
||||
ringloop = new ring_loop_t(512);
|
||||
epmgr = new epoll_manager_t(ringloop);
|
||||
cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
|
||||
cli->on_ready([this]()
|
||||
{
|
||||
// Initialize job
|
||||
consumer.loop = [this]()
|
||||
{
|
||||
result = r;
|
||||
action_cb = NULL;
|
||||
});
|
||||
// Loop until it completes
|
||||
while (action_cb != NULL)
|
||||
{
|
||||
p->ringloop->loop();
|
||||
if (action_cb != NULL)
|
||||
p->ringloop->wait();
|
||||
}
|
||||
// Destroy the client
|
||||
delete p->cli;
|
||||
delete p->epmgr;
|
||||
delete p->ringloop;
|
||||
p->cli = NULL;
|
||||
p->epmgr = NULL;
|
||||
p->ringloop = NULL;
|
||||
}
|
||||
// Print result
|
||||
if (p->json_output && !result.data.is_null())
|
||||
{
|
||||
bool done = action_cb();
|
||||
if (done)
|
||||
{
|
||||
action_cb = NULL;
|
||||
}
|
||||
}
|
||||
ringloop->submit();
|
||||
};
|
||||
ringloop->register_consumer(&consumer);
|
||||
consumer.loop();
|
||||
});
|
||||
// Loop until it completes
|
||||
while (action_cb != NULL)
|
||||
{
|
||||
printf("%s\n", result.data.dump().c_str());
|
||||
ringloop->loop();
|
||||
if (action_cb != NULL)
|
||||
ringloop->wait();
|
||||
}
|
||||
else if (p->json_output && result.err)
|
||||
{
|
||||
printf("%s\n", json11::Json(json11::Json::object {
|
||||
{ "error_code", result.err },
|
||||
{ "error_text", result.text },
|
||||
}).dump().c_str());
|
||||
}
|
||||
else if (result.text != "")
|
||||
{
|
||||
fprintf(result.err ? stderr : stdout, result.text[result.text.size()-1] == '\n' ? "%s" : "%s\n", result.text.c_str());
|
||||
}
|
||||
return result.err;
|
||||
// Destroy the client
|
||||
delete cli;
|
||||
delete epmgr;
|
||||
delete ringloop;
|
||||
cli = NULL;
|
||||
epmgr = NULL;
|
||||
ringloop = NULL;
|
||||
}
|
||||
|
||||
int main(int narg, const char *args[])
|
||||
@@ -338,7 +380,7 @@ int main(int narg, const char *args[])
|
||||
setvbuf(stderr, NULL, _IONBF, 0);
|
||||
exe_name = args[0];
|
||||
cli_tool_t *p = new cli_tool_t();
|
||||
int r = run(p, parse_args(narg, args));
|
||||
p->run(cli_tool_t::parse_args(narg, args));
|
||||
delete p;
|
||||
return r;
|
||||
return 0;
|
||||
}
|
||||
|
47
src/cli.h
47
src/cli.h
@@ -19,18 +19,11 @@ class epoll_manager_t;
|
||||
class cluster_client_t;
|
||||
struct inode_config_t;
|
||||
|
||||
struct cli_result_t
|
||||
{
|
||||
int err;
|
||||
std::string text;
|
||||
json11::Json data;
|
||||
};
|
||||
|
||||
class cli_tool_t
|
||||
{
|
||||
public:
|
||||
uint64_t iodepth = 4, parallel_osds = 32;
|
||||
bool progress = false;
|
||||
uint64_t iodepth = 0, parallel_osds = 0;
|
||||
bool progress = true;
|
||||
bool list_first = false;
|
||||
bool json_output = false;
|
||||
int log_level = 0;
|
||||
@@ -41,33 +34,33 @@ public:
|
||||
cluster_client_t *cli = NULL;
|
||||
|
||||
int waiting = 0;
|
||||
cli_result_t etcd_err;
|
||||
json11::Json etcd_result;
|
||||
ring_consumer_t consumer;
|
||||
std::function<bool(void)> action_cb;
|
||||
|
||||
void parse_config(json11::Json cfg);
|
||||
void run(json11::Json cfg);
|
||||
|
||||
void change_parent(inode_t cur, inode_t new_parent, cli_result_t *result);
|
||||
void change_parent(inode_t cur, inode_t new_parent);
|
||||
inode_config_t* get_inode_cfg(const std::string & name);
|
||||
|
||||
static json11::Json::object parse_args(int narg, const char *args[]);
|
||||
static void help();
|
||||
|
||||
friend struct rm_inode_t;
|
||||
friend struct snap_merger_t;
|
||||
friend struct snap_flattener_t;
|
||||
friend struct snap_remover_t;
|
||||
|
||||
std::function<bool(cli_result_t &)> start_status(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_df(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_ls(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_create(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_modify(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_rm_data(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_merge(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_flatten(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_rm(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_alloc_osd(json11::Json cfg);
|
||||
std::function<bool(cli_result_t &)> simple_offsets(json11::Json cfg);
|
||||
|
||||
// Should be called like loop_and_wait(start_status(), <completion callback>)
|
||||
void loop_and_wait(std::function<bool(cli_result_t &)> loop_cb, std::function<void(const cli_result_t &)> complete_cb);
|
||||
std::function<bool(void)> start_df(json11::Json);
|
||||
std::function<bool(void)> start_ls(json11::Json);
|
||||
std::function<bool(void)> start_create(json11::Json);
|
||||
std::function<bool(void)> start_modify(json11::Json);
|
||||
std::function<bool(void)> start_rm(json11::Json);
|
||||
std::function<bool(void)> start_merge(json11::Json);
|
||||
std::function<bool(void)> start_flatten(json11::Json);
|
||||
std::function<bool(void)> start_snap_rm(json11::Json);
|
||||
std::function<bool(void)> start_alloc_osd(json11::Json cfg, uint64_t *out = NULL);
|
||||
std::function<bool(void)> simple_offsets(json11::Json cfg);
|
||||
|
||||
void etcd_txn(json11::Json txn);
|
||||
};
|
||||
@@ -76,7 +69,7 @@ uint64_t parse_size(std::string size_str);
|
||||
|
||||
std::string print_table(json11::Json items, json11::Json header, bool use_esc);
|
||||
|
||||
std::string format_size(uint64_t size, bool nobytes = false);
|
||||
std::string format_size(uint64_t size);
|
||||
|
||||
std::string format_lat(uint64_t lat);
|
||||
|
||||
|
@@ -16,7 +16,6 @@ struct alloc_osd_t
|
||||
uint64_t new_id = 1;
|
||||
|
||||
int state = 0;
|
||||
cli_result_t result;
|
||||
|
||||
bool is_done()
|
||||
{
|
||||
@@ -63,12 +62,6 @@ struct alloc_osd_t
|
||||
state = 1;
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
if (parent->etcd_err.err)
|
||||
{
|
||||
result = parent->etcd_err;
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
if (!parent->etcd_result["succeeded"].bool_value())
|
||||
{
|
||||
std::vector<osd_num_t> used;
|
||||
@@ -106,23 +99,23 @@ struct alloc_osd_t
|
||||
}
|
||||
} while (!parent->etcd_result["succeeded"].bool_value());
|
||||
state = 100;
|
||||
result = (cli_result_t){
|
||||
.text = std::to_string(new_id),
|
||||
.data = json11::Json(new_id),
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
std::function<bool(cli_result_t &)> cli_tool_t::start_alloc_osd(json11::Json cfg)
|
||||
std::function<bool(void)> cli_tool_t::start_alloc_osd(json11::Json cfg, uint64_t *out)
|
||||
{
|
||||
json11::Json::array cmd = cfg["command"].array_items();
|
||||
auto alloc_osd = new alloc_osd_t();
|
||||
alloc_osd->parent = this;
|
||||
return [alloc_osd](cli_result_t & result)
|
||||
return [alloc_osd, out]()
|
||||
{
|
||||
alloc_osd->loop();
|
||||
if (alloc_osd->is_done())
|
||||
{
|
||||
result = alloc_osd->result;
|
||||
if (out)
|
||||
*out = alloc_osd->new_id;
|
||||
else if (alloc_osd->new_id)
|
||||
printf("%lu\n", alloc_osd->new_id);
|
||||
delete alloc_osd;
|
||||
return true;
|
||||
}
|
||||
|
@@ -1,149 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "base64.h"
|
||||
#include "cluster_client.h"
|
||||
#include "cli.h"
|
||||
|
||||
void cli_tool_t::change_parent(inode_t cur, inode_t new_parent, cli_result_t *result)
|
||||
{
|
||||
auto cur_cfg_it = cli->st_cli.inode_config.find(cur);
|
||||
if (cur_cfg_it == cli->st_cli.inode_config.end())
|
||||
{
|
||||
char buf[128];
|
||||
snprintf(buf, 128, "Inode 0x%lx disappeared", cur);
|
||||
*result = (cli_result_t){ .err = EIO, .text = buf };
|
||||
return;
|
||||
}
|
||||
inode_config_t new_cfg = cur_cfg_it->second;
|
||||
std::string cur_name = new_cfg.name;
|
||||
std::string cur_cfg_key = base64_encode(cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(cur))+
|
||||
"/"+std::to_string(INODE_NO_POOL(cur)));
|
||||
new_cfg.parent_id = new_parent;
|
||||
json11::Json::object cur_cfg_json = cli->st_cli.serialize_inode_cfg(&new_cfg);
|
||||
waiting++;
|
||||
cli->st_cli.etcd_txn_slow(json11::Json::object {
|
||||
{ "compare", json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", cur_cfg_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", new_cfg.mod_revision+1 },
|
||||
},
|
||||
} },
|
||||
{ "success", json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", cur_cfg_key },
|
||||
{ "value", base64_encode(json11::Json(cur_cfg_json).dump()) },
|
||||
} }
|
||||
},
|
||||
} },
|
||||
}, [this, result, new_parent, cur, cur_name](std::string err, json11::Json res)
|
||||
{
|
||||
if (err != "")
|
||||
{
|
||||
*result = (cli_result_t){ .err = EIO, .text = "Error changing parent of "+cur_name+": "+err };
|
||||
}
|
||||
else if (!res["succeeded"].bool_value())
|
||||
{
|
||||
*result = (cli_result_t){ .err = EAGAIN, .text = "Image "+cur_name+" was modified during change" };
|
||||
}
|
||||
else if (new_parent)
|
||||
{
|
||||
auto new_parent_it = cli->st_cli.inode_config.find(new_parent);
|
||||
std::string new_parent_name = new_parent_it != cli->st_cli.inode_config.end()
|
||||
? new_parent_it->second.name : "<unknown>";
|
||||
*result = (cli_result_t){
|
||||
.text = "Parent of layer "+cur_name+" (inode "+std::to_string(INODE_NO_POOL(cur))+
|
||||
" in pool "+std::to_string(INODE_POOL(cur))+") changed to "+new_parent_name+
|
||||
" (inode "+std::to_string(INODE_NO_POOL(new_parent))+" in pool "+std::to_string(INODE_POOL(new_parent))+")",
|
||||
};
|
||||
}
|
||||
else
|
||||
{
|
||||
*result = (cli_result_t){
|
||||
.text = "Parent of layer "+cur_name+" (inode "+std::to_string(INODE_NO_POOL(cur))+
|
||||
" in pool "+std::to_string(INODE_POOL(cur))+") detached",
|
||||
};
|
||||
}
|
||||
waiting--;
|
||||
ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
|
||||
void cli_tool_t::etcd_txn(json11::Json txn)
|
||||
{
|
||||
waiting++;
|
||||
cli->st_cli.etcd_txn_slow(txn, [this](std::string err, json11::Json res)
|
||||
{
|
||||
waiting--;
|
||||
if (err != "")
|
||||
etcd_err = (cli_result_t){ .err = EIO, .text = "Error communicating with etcd: "+err };
|
||||
else
|
||||
etcd_err = (cli_result_t){ .err = 0 };
|
||||
etcd_result = res;
|
||||
ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
|
||||
inode_config_t* cli_tool_t::get_inode_cfg(const std::string & name)
|
||||
{
|
||||
for (auto & ic: cli->st_cli.inode_config)
|
||||
{
|
||||
if (ic.second.name == name)
|
||||
{
|
||||
return &ic.second;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void cli_tool_t::parse_config(json11::Json cfg)
|
||||
{
|
||||
color = !cfg["no-color"].bool_value();
|
||||
json_output = cfg["json"].bool_value();
|
||||
iodepth = cfg["iodepth"].uint64_value();
|
||||
if (!iodepth)
|
||||
iodepth = 32;
|
||||
parallel_osds = cfg["parallel_osds"].uint64_value();
|
||||
if (!parallel_osds)
|
||||
parallel_osds = 4;
|
||||
log_level = cfg["log_level"].int64_value();
|
||||
progress = cfg["progress"].uint64_value() ? true : false;
|
||||
list_first = cfg["wait-list"].uint64_value() ? true : false;
|
||||
}
|
||||
|
||||
struct cli_result_looper_t
|
||||
{
|
||||
ring_consumer_t consumer;
|
||||
cli_result_t result;
|
||||
std::function<bool(cli_result_t &)> loop_cb;
|
||||
std::function<void(const cli_result_t &)> complete_cb;
|
||||
};
|
||||
|
||||
void cli_tool_t::loop_and_wait(std::function<bool(cli_result_t &)> loop_cb, std::function<void(const cli_result_t &)> complete_cb)
|
||||
{
|
||||
auto *looper = new cli_result_looper_t();
|
||||
looper->loop_cb = loop_cb;
|
||||
looper->complete_cb = complete_cb;
|
||||
looper->consumer.loop = [this, looper]()
|
||||
{
|
||||
bool done = looper->loop_cb(looper->result);
|
||||
if (done)
|
||||
{
|
||||
ringloop->unregister_consumer(&looper->consumer);
|
||||
looper->loop_cb = NULL;
|
||||
looper->complete_cb(looper->result);
|
||||
delete looper;
|
||||
return;
|
||||
}
|
||||
ringloop->submit();
|
||||
};
|
||||
cli->on_ready([this, looper]()
|
||||
{
|
||||
ringloop->register_consumer(&looper->consumer);
|
||||
ringloop->wakeup();
|
||||
});
|
||||
}
|
@@ -25,18 +25,14 @@ struct image_creator_t
|
||||
pool_id_t new_pool_id = 0;
|
||||
std::string new_pool_name;
|
||||
std::string image_name, new_snap, new_parent;
|
||||
json11::Json new_meta;
|
||||
uint64_t size;
|
||||
bool force_size = false;
|
||||
|
||||
pool_id_t old_pool_id = 0;
|
||||
inode_t new_parent_id = 0;
|
||||
inode_t new_id = 0, old_id = 0;
|
||||
uint64_t max_id_mod_rev = 0, cfg_mod_rev = 0, idx_mod_rev = 0;
|
||||
inode_config_t new_cfg;
|
||||
|
||||
int state = 0;
|
||||
cli_result_t result;
|
||||
|
||||
bool is_done()
|
||||
{
|
||||
@@ -47,27 +43,13 @@ struct image_creator_t
|
||||
{
|
||||
if (state >= 1)
|
||||
goto resume_1;
|
||||
if (image_name == "")
|
||||
{
|
||||
// FIXME: EINVAL -> specific codes for every error
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Image name is missing" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
if (image_name.find('@') != std::string::npos)
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Image name can't contain @ character" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
if (new_pool_id)
|
||||
{
|
||||
auto & pools = parent->cli->st_cli.pool_config;
|
||||
if (pools.find(new_pool_id) == pools.end())
|
||||
{
|
||||
result = (cli_result_t){ .err = ENOENT, .text = "Pool "+std::to_string(new_pool_id)+" does not exist" };
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Pool %u does not exist\n", new_pool_id);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
else if (new_pool_name != "")
|
||||
@@ -82,9 +64,8 @@ struct image_creator_t
|
||||
}
|
||||
if (!new_pool_id)
|
||||
{
|
||||
result = (cli_result_t){ .err = ENOENT, .text = "Pool "+new_pool_name+" does not exist" };
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Pool %s does not exist\n", new_pool_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
else if (parent->cli->st_cli.pool_config.size() == 1)
|
||||
@@ -110,9 +91,8 @@ struct image_creator_t
|
||||
{
|
||||
if (ic.second.name == image_name)
|
||||
{
|
||||
result = (cli_result_t){ .err = EEXIST, .text = "Image "+image_name+" already exists" };
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Image %s already exists\n", image_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
if (ic.second.name == new_parent)
|
||||
{
|
||||
@@ -129,21 +109,18 @@ struct image_creator_t
|
||||
}
|
||||
if (new_parent != "" && !new_parent_id)
|
||||
{
|
||||
result = (cli_result_t){ .err = ENOENT, .text = "Parent image "+new_parent+" not found" };
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Parent image not found\n");
|
||||
exit(1);
|
||||
}
|
||||
if (!new_pool_id)
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Pool name or ID is missing" };
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Pool name or ID is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
if (!size && !force_size)
|
||||
if (!size)
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Image size is missing" };
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Image size is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
do
|
||||
{
|
||||
@@ -154,36 +131,23 @@ struct image_creator_t
|
||||
resume_2:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
if (parent->etcd_err.err)
|
||||
{
|
||||
result = parent->etcd_err;
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
extract_next_id(parent->etcd_result["responses"][0]);
|
||||
attempt_create();
|
||||
state = 3;
|
||||
resume_3:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
if (parent->etcd_err.err)
|
||||
{
|
||||
result = parent->etcd_err;
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
if (!parent->etcd_result["succeeded"].bool_value() &&
|
||||
parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items().size() > 0)
|
||||
{
|
||||
result = (cli_result_t){ .err = EEXIST, .text = "Image "+image_name+" already exists" };
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Image %s already exists\n", image_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
} while (!parent->etcd_result["succeeded"].bool_value());
|
||||
// Save into inode_config for library users to be able to take it from there immediately
|
||||
new_cfg.mod_revision = parent->etcd_result["responses"][0]["response_put"]["header"]["revision"].uint64_value();
|
||||
parent->cli->st_cli.insert_inode_config(new_cfg);
|
||||
result = (cli_result_t){ .err = 0, .text = "Image "+image_name+" created" };
|
||||
if (parent->progress)
|
||||
{
|
||||
printf("Image %s created\n", image_name.c_str());
|
||||
}
|
||||
state = 100;
|
||||
}
|
||||
|
||||
@@ -199,16 +163,14 @@ resume_3:
|
||||
{
|
||||
if (ic.second.name == image_name+"@"+new_snap)
|
||||
{
|
||||
result = (cli_result_t){ .err = EEXIST, .text = "Snapshot "+image_name+"@"+new_snap+" already exists" };
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Snapshot %s@%s already exists\n", image_name.c_str(), new_snap.c_str());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
if (new_parent != "")
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Parent can't be specified for snapshots" };
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "--parent can't be used with snapshots\n");
|
||||
exit(1);
|
||||
}
|
||||
do
|
||||
{
|
||||
@@ -220,9 +182,8 @@ resume_3:
|
||||
return;
|
||||
if (!old_id)
|
||||
{
|
||||
result = (cli_result_t){ .err = ENOENT, .text = "Image "+image_name+" does not exist" };
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Image %s does not exist\n", image_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
if (!new_pool_id)
|
||||
{
|
||||
@@ -234,24 +195,17 @@ resume_3:
|
||||
resume_4:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
if (parent->etcd_err.err)
|
||||
{
|
||||
result = parent->etcd_err;
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
if (!parent->etcd_result["succeeded"].bool_value() &&
|
||||
parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items().size() > 0)
|
||||
{
|
||||
result = (cli_result_t){ .err = EEXIST, .text = "Snapshot "+image_name+"@"+new_snap+" already exists" };
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Snapshot %s@%s already exists\n", image_name.c_str(), new_snap.c_str());
|
||||
exit(1);
|
||||
}
|
||||
} while (!parent->etcd_result["succeeded"].bool_value());
|
||||
// Save into inode_config for library users to be able to take it from there immediately
|
||||
new_cfg.mod_revision = parent->etcd_result["responses"][0]["response_put"]["header"]["revision"].uint64_value();
|
||||
parent->cli->st_cli.insert_inode_config(new_cfg);
|
||||
result = (cli_result_t){ .err = 0, .text = "Snapshot "+image_name+"@"+new_snap+" created" };
|
||||
if (parent->progress)
|
||||
{
|
||||
printf("Snapshot %s@%s created\n", image_name.c_str(), new_snap.c_str());
|
||||
}
|
||||
state = 100;
|
||||
}
|
||||
|
||||
@@ -305,12 +259,6 @@ resume_4:
|
||||
resume_2:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
if (parent->etcd_err.err)
|
||||
{
|
||||
result = parent->etcd_err;
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
extract_next_id(parent->etcd_result["responses"][0]);
|
||||
old_id = 0;
|
||||
old_pool_id = 0;
|
||||
@@ -340,9 +288,8 @@ resume_2:
|
||||
idx_mod_rev = kv.mod_revision;
|
||||
if (!old_id || !old_pool_id || old_pool_id >= POOL_ID_MAX)
|
||||
{
|
||||
result = (cli_result_t){ .err = ENOENT, .text = "Invalid pool or inode ID in etcd key "+kv.key };
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Invalid pool or inode ID in etcd key %s\n", kv.key.c_str());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
parent->etcd_txn(json11::Json::object {
|
||||
@@ -361,12 +308,6 @@ resume_2:
|
||||
resume_3:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
if (parent->etcd_err.err)
|
||||
{
|
||||
result = parent->etcd_err;
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
{
|
||||
auto kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][0]["response_range"]["kvs"][0]);
|
||||
size = kv.value["size"].uint64_value();
|
||||
@@ -383,13 +324,12 @@ resume_3:
|
||||
|
||||
void attempt_create()
|
||||
{
|
||||
new_cfg = {
|
||||
inode_config_t new_cfg = {
|
||||
.num = INODE_WITH_POOL(new_pool_id, new_id),
|
||||
.name = image_name,
|
||||
.size = size,
|
||||
.parent_id = (new_snap != "" ? INODE_WITH_POOL(old_pool_id, old_id) : new_parent_id),
|
||||
.readonly = false,
|
||||
.meta = new_meta,
|
||||
};
|
||||
json11::Json::array checks = json11::Json::array {
|
||||
json11::Json::object {
|
||||
@@ -517,76 +457,77 @@ uint64_t parse_size(std::string size_str)
|
||||
if (type_char == 'k' || type_char == 'm' || type_char == 'g' || type_char == 't')
|
||||
{
|
||||
if (type_char == 'k')
|
||||
mul = (uint64_t)1<<10;
|
||||
mul = 1l<<10;
|
||||
else if (type_char == 'm')
|
||||
mul = (uint64_t)1<<20;
|
||||
mul = 1l<<20;
|
||||
else if (type_char == 'g')
|
||||
mul = (uint64_t)1<<30;
|
||||
mul = 1l<<30;
|
||||
else /*if (type_char == 't')*/
|
||||
mul = (uint64_t)1<<40;
|
||||
mul = 1l<<40;
|
||||
size_str = size_str.substr(0, size_str.length()-1);
|
||||
}
|
||||
uint64_t size = json11::Json(size_str).uint64_value() * mul;
|
||||
if (size == 0 && size_str != "0" && (size_str != "" || mul != 1))
|
||||
{
|
||||
return UINT64_MAX;
|
||||
fprintf(stderr, "Invalid syntax for size: %s\n", size_str.c_str());
|
||||
exit(1);
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
std::function<bool(cli_result_t &)> cli_tool_t::start_create(json11::Json cfg)
|
||||
std::function<bool(void)> cli_tool_t::start_create(json11::Json cfg)
|
||||
{
|
||||
json11::Json::array cmd = cfg["command"].array_items();
|
||||
auto image_creator = new image_creator_t();
|
||||
image_creator->parent = this;
|
||||
image_creator->image_name = cfg["image"].string_value();
|
||||
image_creator->image_name = cmd.size() > 1 ? cmd[1].string_value() : "";
|
||||
image_creator->new_pool_id = cfg["pool"].uint64_value();
|
||||
image_creator->new_pool_name = cfg["pool"].string_value();
|
||||
image_creator->force_size = cfg["force_size"].bool_value();
|
||||
if (cfg["image_meta"].is_object())
|
||||
{
|
||||
image_creator->new_meta = cfg["image-meta"];
|
||||
}
|
||||
if (cfg["snapshot"].string_value() != "")
|
||||
{
|
||||
image_creator->new_snap = cfg["snapshot"].string_value();
|
||||
}
|
||||
else if (cmd[0] == "snap-create")
|
||||
{
|
||||
int p = image_creator->image_name.find('@');
|
||||
if (p == std::string::npos || p == image_creator->image_name.length()-1)
|
||||
{
|
||||
fprintf(stderr, "Please specify new snapshot name after @\n");
|
||||
exit(1);
|
||||
}
|
||||
image_creator->new_snap = image_creator->image_name.substr(p + 1);
|
||||
image_creator->image_name = image_creator->image_name.substr(0, p);
|
||||
}
|
||||
image_creator->new_parent = cfg["parent"].string_value();
|
||||
if (cfg["size"].string_value() != "")
|
||||
{
|
||||
image_creator->size = parse_size(cfg["size"].string_value());
|
||||
if (image_creator->size == UINT64_MAX)
|
||||
if (image_creator->size % 4096)
|
||||
{
|
||||
return [size = cfg["size"].string_value()](cli_result_t & result)
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Invalid syntax for size: "+size };
|
||||
return true;
|
||||
};
|
||||
}
|
||||
if ((image_creator->size % 4096) && !cfg["force_size"].bool_value())
|
||||
{
|
||||
delete image_creator;
|
||||
return [](cli_result_t & result)
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Size should be a multiple of 4096" };
|
||||
return true;
|
||||
};
|
||||
fprintf(stderr, "Size should be a multiple of 4096\n");
|
||||
exit(1);
|
||||
}
|
||||
if (image_creator->new_snap != "")
|
||||
{
|
||||
delete image_creator;
|
||||
return [](cli_result_t & result)
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Size can't be specified for snapshots" };
|
||||
return true;
|
||||
};
|
||||
fprintf(stderr, "--size can't be specified for snapshots\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
return [image_creator](cli_result_t & result)
|
||||
if (image_creator->image_name == "")
|
||||
{
|
||||
fprintf(stderr, "Image name is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
if (image_creator->image_name.find('@') != std::string::npos)
|
||||
{
|
||||
fprintf(stderr, "Image name can't contain @ character\n");
|
||||
exit(1);
|
||||
}
|
||||
return [image_creator]()
|
||||
{
|
||||
image_creator->loop();
|
||||
if (image_creator->is_done())
|
||||
{
|
||||
result = image_creator->result;
|
||||
delete image_creator;
|
||||
return true;
|
||||
}
|
||||
|
@@ -12,7 +12,6 @@ struct pool_lister_t
|
||||
|
||||
int state = 0;
|
||||
json11::Json space_info;
|
||||
cli_result_t result;
|
||||
std::map<pool_id_t, json11::Json::object> pool_stats;
|
||||
|
||||
bool is_done()
|
||||
@@ -53,12 +52,6 @@ struct pool_lister_t
|
||||
resume_1:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
if (parent->etcd_err.err)
|
||||
{
|
||||
result = parent->etcd_err;
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
space_info = parent->etcd_result;
|
||||
std::map<pool_id_t, uint64_t> osd_free;
|
||||
for (auto & kv_item: space_info["responses"][0]["response_range"]["kvs"].array_items())
|
||||
@@ -131,8 +124,8 @@ resume_1:
|
||||
{ "scheme_name", pool_cfg.scheme == POOL_SCHEME_REPLICATED
|
||||
? std::to_string(pool_cfg.pg_size)+"/"+std::to_string(pool_cfg.pg_minsize)
|
||||
: "EC "+std::to_string(pool_cfg.pg_size-pool_cfg.parity_chunks)+"+"+std::to_string(pool_cfg.parity_chunks) },
|
||||
{ "used_raw", (uint64_t)(pool_stats[pool_cfg.id]["used_raw_tb"].number_value() * ((uint64_t)1<<40)) },
|
||||
{ "total_raw", (uint64_t)(pool_stats[pool_cfg.id]["total_raw_tb"].number_value() * ((uint64_t)1<<40)) },
|
||||
{ "used_raw", (uint64_t)(pool_stats[pool_cfg.id]["used_raw_tb"].number_value() * (1l<<40)) },
|
||||
{ "total_raw", (uint64_t)(pool_stats[pool_cfg.id]["total_raw_tb"].number_value() * (1l<<40)) },
|
||||
{ "max_available", pool_avail },
|
||||
{ "raw_to_usable", pool_stats[pool_cfg.id]["raw_to_usable"].number_value() },
|
||||
{ "space_efficiency", pool_stats[pool_cfg.id]["space_efficiency"].number_value() },
|
||||
@@ -157,12 +150,10 @@ resume_1:
|
||||
get_stats();
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
if (state == 100)
|
||||
return;
|
||||
if (parent->json_output)
|
||||
{
|
||||
// JSON output
|
||||
result.data = to_list();
|
||||
printf("%s\n", json11::Json(to_list()).dump().c_str());
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
@@ -215,22 +206,21 @@ resume_1:
|
||||
: 100)+"%";
|
||||
kv.second["eff_fmt"] = format_q(kv.second["space_efficiency"].number_value()*100)+"%";
|
||||
}
|
||||
result.data = to_list();
|
||||
result.text = print_table(result.data, cols, parent->color);
|
||||
printf("%s", print_table(to_list(), cols, parent->color).c_str());
|
||||
state = 100;
|
||||
}
|
||||
};
|
||||
|
||||
std::function<bool(cli_result_t &)> cli_tool_t::start_df(json11::Json cfg)
|
||||
std::function<bool(void)> cli_tool_t::start_df(json11::Json cfg)
|
||||
{
|
||||
json11::Json::array cmd = cfg["command"].array_items();
|
||||
auto lister = new pool_lister_t();
|
||||
lister->parent = this;
|
||||
return [lister](cli_result_t & result)
|
||||
return [lister]()
|
||||
{
|
||||
lister->loop();
|
||||
if (lister->is_done())
|
||||
{
|
||||
result = lister->result;
|
||||
delete lister;
|
||||
return true;
|
||||
}
|
||||
|
@@ -22,19 +22,12 @@ struct snap_flattener_t
|
||||
std::string top_parent_name;
|
||||
inode_t target_id = 0;
|
||||
int state = 0;
|
||||
std::function<bool(cli_result_t &)> merger_cb;
|
||||
cli_result_t result;
|
||||
std::function<bool(void)> merger_cb;
|
||||
|
||||
void get_merge_parents()
|
||||
{
|
||||
// Get all parents of target
|
||||
inode_config_t *target_cfg = parent->get_inode_cfg(target_name);
|
||||
if (!target_cfg)
|
||||
{
|
||||
result = (cli_result_t){ .err = ENOENT, .text = "Layer "+target_name+" not found" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
target_id = target_cfg->num;
|
||||
std::vector<inode_t> chain_list;
|
||||
inode_config_t *cur = target_cfg;
|
||||
@@ -44,34 +37,23 @@ struct snap_flattener_t
|
||||
auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
|
||||
if (it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
result = (cli_result_t){
|
||||
.err = ENOENT,
|
||||
.text = "Parent inode of layer "+cur->name+" (id "+std::to_string(cur->parent_id)+") does not exist",
|
||||
.data = json11::Json::object {
|
||||
{ "error", "parent-not-found" },
|
||||
{ "inode_id", cur->num },
|
||||
{ "inode_name", cur->name },
|
||||
{ "parent_id", cur->parent_id },
|
||||
},
|
||||
};
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Parent inode of layer %s (id %ld) not found\n", cur->name.c_str(), cur->parent_id);
|
||||
exit(1);
|
||||
}
|
||||
cur = &it->second;
|
||||
chain_list.push_back(cur->num);
|
||||
}
|
||||
if (cur->parent_id != 0)
|
||||
{
|
||||
result = (cli_result_t){ .err = EBADF, .text = "Layer "+target_name+" has a loop in parents" };
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Layer %s has a loop in parents\n", target_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
top_parent_name = cur->name;
|
||||
}
|
||||
|
||||
bool is_done()
|
||||
{
|
||||
return state == 100;
|
||||
return state == 5;
|
||||
}
|
||||
|
||||
void loop()
|
||||
@@ -82,20 +64,11 @@ struct snap_flattener_t
|
||||
goto resume_2;
|
||||
else if (state == 3)
|
||||
goto resume_3;
|
||||
if (target_name == "")
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Layer to flatten not specified" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
// Get parent layers
|
||||
get_merge_parents();
|
||||
if (state == 100)
|
||||
return;
|
||||
// Start merger
|
||||
merger_cb = parent->start_merge(json11::Json::object {
|
||||
{ "from", top_parent_name },
|
||||
{ "to", target_name },
|
||||
{ "command", json11::Json::array{ "merge-data", top_parent_name, target_name } },
|
||||
{ "target", target_name },
|
||||
{ "delete-source", false },
|
||||
{ "cas", use_cas },
|
||||
@@ -103,19 +76,14 @@ struct snap_flattener_t
|
||||
});
|
||||
// Wait for it
|
||||
resume_1:
|
||||
while (!merger_cb(result))
|
||||
while (!merger_cb())
|
||||
{
|
||||
state = 1;
|
||||
return;
|
||||
}
|
||||
merger_cb = NULL;
|
||||
if (result.err)
|
||||
{
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
// Change parent
|
||||
parent->change_parent(target_id, 0, &result);
|
||||
parent->change_parent(target_id, 0);
|
||||
// Wait for it to complete
|
||||
state = 2;
|
||||
resume_2:
|
||||
@@ -124,26 +92,31 @@ resume_2:
|
||||
state = 3;
|
||||
resume_3:
|
||||
// Done
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
std::function<bool(cli_result_t &)> cli_tool_t::start_flatten(json11::Json cfg)
|
||||
std::function<bool(void)> cli_tool_t::start_flatten(json11::Json cfg)
|
||||
{
|
||||
json11::Json::array cmd = cfg["command"].array_items();
|
||||
auto flattener = new snap_flattener_t();
|
||||
flattener->parent = this;
|
||||
flattener->target_name = cfg["image"].string_value();
|
||||
flattener->target_name = cmd.size() > 1 ? cmd[1].string_value() : "";
|
||||
if (flattener->target_name == "")
|
||||
{
|
||||
fprintf(stderr, "Layer to flatten argument is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
flattener->fsync_interval = cfg["fsync-interval"].uint64_value();
|
||||
if (!flattener->fsync_interval)
|
||||
flattener->fsync_interval = 128;
|
||||
if (!cfg["cas"].is_null())
|
||||
flattener->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
|
||||
return [flattener](cli_result_t & result)
|
||||
return [flattener]()
|
||||
{
|
||||
flattener->loop();
|
||||
if (flattener->is_done())
|
||||
{
|
||||
result = flattener->result;
|
||||
delete flattener;
|
||||
return true;
|
||||
}
|
||||
|
@@ -24,7 +24,6 @@ struct image_lister_t
|
||||
int state = 0;
|
||||
std::map<inode_t, json11::Json::object> stats;
|
||||
json11::Json space_info;
|
||||
cli_result_t result;
|
||||
|
||||
bool is_done()
|
||||
{
|
||||
@@ -45,9 +44,8 @@ struct image_lister_t
|
||||
}
|
||||
if (!list_pool_id)
|
||||
{
|
||||
result = (cli_result_t){ .err = ENOENT, .text = "Pool "+list_pool_name+" does not exist" };
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Pool %s does not exist\n", list_pool_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
for (auto & ic: parent->cli->st_cli.inode_config)
|
||||
@@ -118,12 +116,6 @@ struct image_lister_t
|
||||
resume_1:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
if (parent->etcd_err.err)
|
||||
{
|
||||
result = parent->etcd_err;
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
space_info = parent->etcd_result;
|
||||
std::map<pool_id_t, uint64_t> pool_pg_real_size;
|
||||
for (auto & kv_item: space_info["responses"][0]["response_range"]["kvs"].array_items())
|
||||
@@ -162,7 +154,7 @@ resume_1:
|
||||
if (pool_it != parent->cli->st_cli.pool_config.end())
|
||||
{
|
||||
auto & pool_cfg = pool_it->second;
|
||||
used_size = used_size / (pool_pg_real_size[pool_id] ? pool_pg_real_size[pool_id] : 1)
|
||||
used_size = used_size / pool_pg_real_size[pool_id]
|
||||
* (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
|
||||
}
|
||||
auto stat_it = stats.find(inode_num);
|
||||
@@ -253,13 +245,11 @@ resume_1:
|
||||
get_stats();
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
if (state == 100)
|
||||
return;
|
||||
}
|
||||
result.data = to_list();
|
||||
if (parent->json_output)
|
||||
{
|
||||
// JSON output
|
||||
printf("%s\n", json11::Json(to_list()).dump().c_str());
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
@@ -369,7 +359,7 @@ resume_1:
|
||||
kv.second["size_fmt"] = format_size(kv.second["size"].uint64_value());
|
||||
kv.second["ro"] = kv.second["readonly"].bool_value() ? "RO" : "-";
|
||||
}
|
||||
result.text = print_table(to_list(), cols, parent->color);
|
||||
printf("%s", print_table(to_list(), cols, parent->color).c_str());
|
||||
state = 100;
|
||||
}
|
||||
};
|
||||
@@ -446,26 +436,23 @@ std::string print_table(json11::Json items, json11::Json header, bool use_esc)
|
||||
return str;
|
||||
}
|
||||
|
||||
static uint64_t size_thresh[] = { (uint64_t)1024*1024*1024*1024, (uint64_t)1024*1024*1024, (uint64_t)1024*1024, 1024, 0 };
|
||||
static uint64_t size_thresh_d[] = { (uint64_t)1000000000000, (uint64_t)1000000000, (uint64_t)1000000, (uint64_t)1000, 0 };
|
||||
static const int size_thresh_n = sizeof(size_thresh)/sizeof(size_thresh[0]);
|
||||
static uint64_t size_thresh[] = { 1024l*1024*1024*1024, 1024l*1024*1024, 1024l*1024, 1024, 0 };
|
||||
static const char *size_unit = "TGMKB";
|
||||
|
||||
std::string format_size(uint64_t size, bool nobytes)
|
||||
std::string format_size(uint64_t size)
|
||||
{
|
||||
uint64_t *thr = nobytes ? size_thresh_d : size_thresh;
|
||||
char buf[256];
|
||||
for (int i = 0; i < size_thresh_n; i++)
|
||||
for (int i = 0; i < sizeof(size_thresh)/sizeof(size_thresh[0]); i++)
|
||||
{
|
||||
if (size >= thr[i] || i >= size_thresh_n-1)
|
||||
if (size >= size_thresh[i] || i >= sizeof(size_thresh)/sizeof(size_thresh[0])-1)
|
||||
{
|
||||
double value = thr[i] ? (double)size/thr[i] : size;
|
||||
double value = size_thresh[i] ? (double)size/size_thresh[i] : size;
|
||||
int l = snprintf(buf, sizeof(buf), "%.1f", value);
|
||||
assert(l < sizeof(buf)-2);
|
||||
if (buf[l-1] == '0')
|
||||
l -= 2;
|
||||
buf[l] = i == size_thresh_n-1 && nobytes ? 0 : ' ';
|
||||
buf[l+1] = i == size_thresh_n-1 && nobytes ? 0 : size_unit[i];
|
||||
buf[l] = ' ';
|
||||
buf[l+1] = size_unit[i];
|
||||
buf[l+2] = 0;
|
||||
break;
|
||||
}
|
||||
@@ -556,8 +543,9 @@ back:
|
||||
return true;
|
||||
}
|
||||
|
||||
std::function<bool(cli_result_t &)> cli_tool_t::start_ls(json11::Json cfg)
|
||||
std::function<bool(void)> cli_tool_t::start_ls(json11::Json cfg)
|
||||
{
|
||||
json11::Json::array cmd = cfg["command"].array_items();
|
||||
auto lister = new image_lister_t();
|
||||
lister->parent = this;
|
||||
lister->list_pool_id = cfg["pool"].uint64_value();
|
||||
@@ -567,16 +555,15 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_ls(json11::Json cfg)
|
||||
lister->sort_field = cfg["sort"].string_value();
|
||||
lister->reverse = cfg["reverse"].bool_value();
|
||||
lister->max_count = cfg["count"].uint64_value();
|
||||
for (auto & item: cfg["names"].array_items())
|
||||
for (int i = 1; i < cmd.size(); i++)
|
||||
{
|
||||
lister->only_names.insert(item.string_value());
|
||||
lister->only_names.insert(cmd[i].string_value());
|
||||
}
|
||||
return [lister](cli_result_t & result)
|
||||
return [lister]()
|
||||
{
|
||||
lister->loop();
|
||||
if (lister->is_done())
|
||||
{
|
||||
result = lister->result;
|
||||
delete lister;
|
||||
return true;
|
||||
}
|
||||
|
@@ -12,9 +12,6 @@ struct snap_rw_op_t
|
||||
cluster_op_t op;
|
||||
int todo = 0;
|
||||
uint32_t start = 0, end = 0;
|
||||
int error_code = 0;
|
||||
uint64_t error_offset = 0;
|
||||
bool error_read = false;
|
||||
};
|
||||
|
||||
// Layer merge is the base for multiple operations:
|
||||
@@ -57,45 +54,17 @@ struct snap_merger_t
|
||||
uint64_t last_written_offset = 0;
|
||||
int deleted_unsynced = 0;
|
||||
uint64_t processed = 0, to_process = 0;
|
||||
std::string rwo_error;
|
||||
|
||||
cli_result_t result;
|
||||
|
||||
void start_merge()
|
||||
{
|
||||
if (from_name == "" || to_name == "")
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Beginning or end of the merge sequence is missing" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
check_delete_source = delete_source || check_delete_source;
|
||||
inode_config_t *from_cfg = parent->get_inode_cfg(from_name);
|
||||
if (!from_cfg)
|
||||
{
|
||||
result = (cli_result_t){ .err = ENOENT, .text = "Layer "+from_name+" not found" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
inode_config_t *to_cfg = parent->get_inode_cfg(to_name);
|
||||
if (!to_cfg)
|
||||
{
|
||||
result = (cli_result_t){ .err = ENOENT, .text = "Layer "+to_name+" not found" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
inode_config_t *target_cfg = target_name == "" ? from_cfg : parent->get_inode_cfg(target_name);
|
||||
if (!target_cfg)
|
||||
{
|
||||
result = (cli_result_t){ .err = ENOENT, .text = "Layer "+target_name+" not found" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
if (to_cfg->num == from_cfg->num)
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Only one layer specified, nothing to merge" };
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Only one layer specified, nothing to merge\n");
|
||||
exit(1);
|
||||
}
|
||||
// Check that to_cfg is actually a child of from_cfg and target_cfg is somewhere between them
|
||||
std::vector<inode_t> chain_list;
|
||||
@@ -109,18 +78,8 @@ struct snap_merger_t
|
||||
auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
|
||||
if (it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
result = (cli_result_t){
|
||||
.err = ENOENT,
|
||||
.text = "Parent inode of layer "+cur->name+" (id "+std::to_string(cur->parent_id)+") does not exist",
|
||||
.data = json11::Json::object {
|
||||
{ "error", "parent-not-found" },
|
||||
{ "inode_id", cur->num },
|
||||
{ "inode_name", cur->name },
|
||||
{ "parent_id", cur->parent_id },
|
||||
},
|
||||
};
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Parent inode of layer %s (id %ld) not found\n", cur->name.c_str(), cur->parent_id);
|
||||
exit(1);
|
||||
}
|
||||
cur = &it->second;
|
||||
chain_list.push_back(cur->num);
|
||||
@@ -128,9 +87,8 @@ struct snap_merger_t
|
||||
}
|
||||
if (cur->parent_id != from_cfg->num)
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Layer "+to_name+" is not a child of "+from_name };
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Layer %s is not a child of %s\n", to_name.c_str(), from_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
chain_list.push_back(from_cfg->num);
|
||||
layer_block_size[from_cfg->num] = get_block_size(from_cfg->num);
|
||||
@@ -141,9 +99,8 @@ struct snap_merger_t
|
||||
}
|
||||
if (sources.find(target_cfg->num) == sources.end())
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Layer "+target_name+" is not between "+to_name+" and "+from_name };
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Layer %s is not between %s and %s\n", target_name.c_str(), to_name.c_str(), from_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
target = target_cfg->num;
|
||||
target_rank = sources.at(target);
|
||||
@@ -173,15 +130,14 @@ struct snap_merger_t
|
||||
int parent_rank = it->second;
|
||||
if (parent_rank < to_rank && (parent_rank >= target_rank || check_delete_source))
|
||||
{
|
||||
result = (cli_result_t){
|
||||
.err = EINVAL,
|
||||
.text = "Layers at or above "+(check_delete_source ? from_name : target_name)+
|
||||
", but below "+to_name+" are not allowed to have other children, but "+
|
||||
ic.second.name+" is a child of "+
|
||||
parent->cli->st_cli.inode_config.at(ic.second.parent_id).name,
|
||||
};
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(
|
||||
stderr, "Layers at or above %s, but below %s are not allowed"
|
||||
" to have other children, but %s is a child of %s\n",
|
||||
(check_delete_source ? from_name.c_str() : target_name.c_str()),
|
||||
to_name.c_str(), ic.second.name.c_str(),
|
||||
parent->cli->st_cli.inode_config.at(ic.second.parent_id).name.c_str()
|
||||
);
|
||||
exit(1);
|
||||
}
|
||||
if (parent_rank >= to_rank)
|
||||
{
|
||||
@@ -196,14 +152,11 @@ struct snap_merger_t
|
||||
use_cas = 0;
|
||||
}
|
||||
sources.erase(target);
|
||||
if (parent->progress)
|
||||
{
|
||||
printf(
|
||||
"Merging %ld layer(s) into target %s%s (inode %lu in pool %u)\n",
|
||||
sources.size(), target_cfg->name.c_str(),
|
||||
use_cas ? " online (with CAS)" : "", INODE_NO_POOL(target), INODE_POOL(target)
|
||||
);
|
||||
}
|
||||
printf(
|
||||
"Merging %ld layer(s) into target %s%s (inode %lu in pool %u)\n",
|
||||
sources.size(), target_cfg->name.c_str(),
|
||||
use_cas ? " online (with CAS)" : "", INODE_NO_POOL(target), INODE_POOL(target)
|
||||
);
|
||||
target_block_size = get_block_size(target);
|
||||
}
|
||||
|
||||
@@ -226,7 +179,7 @@ struct snap_merger_t
|
||||
|
||||
bool is_done()
|
||||
{
|
||||
return state == 100;
|
||||
return state == 6;
|
||||
}
|
||||
|
||||
void continue_merge()
|
||||
@@ -241,8 +194,8 @@ struct snap_merger_t
|
||||
goto resume_4;
|
||||
else if (state == 5)
|
||||
goto resume_5;
|
||||
else if (state == 100)
|
||||
goto resume_100;
|
||||
else if (state == 6)
|
||||
goto resume_6;
|
||||
// Get parents and so on
|
||||
start_merge();
|
||||
// First list lower layers
|
||||
@@ -300,8 +253,7 @@ struct snap_merger_t
|
||||
oit = merge_offsets.begin();
|
||||
resume_5:
|
||||
// Now read, overwrite and optionally delete offsets one by one
|
||||
while (in_flight < parent->iodepth*parent->parallel_osds &&
|
||||
oit != merge_offsets.end() && !rwo_error.size())
|
||||
while (in_flight < parent->iodepth*parent->parallel_osds && oit != merge_offsets.end())
|
||||
{
|
||||
in_flight++;
|
||||
read_and_write(*oit);
|
||||
@@ -312,15 +264,6 @@ struct snap_merger_t
|
||||
printf("\rOverwriting blocks: %lu/%lu", processed, to_process);
|
||||
}
|
||||
}
|
||||
if (in_flight == 0 && rwo_error.size())
|
||||
{
|
||||
result = (cli_result_t){
|
||||
.err = EIO,
|
||||
.text = rwo_error,
|
||||
};
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
if (in_flight > 0 || oit != merge_offsets.end())
|
||||
{
|
||||
// Wait until overwrites finish
|
||||
@@ -331,9 +274,9 @@ struct snap_merger_t
|
||||
printf("\rOverwriting blocks: %lu/%lu\n", to_process, to_process);
|
||||
}
|
||||
// Done
|
||||
result = (cli_result_t){ .text = "Done, layers from "+from_name+" to "+to_name+" merged into "+target_name };
|
||||
state = 100;
|
||||
resume_100:
|
||||
printf("Done, layers from %s to %s merged into %s\n", from_name.c_str(), to_name.c_str(), target_name.c_str());
|
||||
state = 6;
|
||||
resume_6:
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -371,10 +314,7 @@ struct snap_merger_t
|
||||
if (status & INODE_LIST_DONE)
|
||||
{
|
||||
auto & name = parent->cli->st_cli.inode_config.at(src).name;
|
||||
if (parent->progress)
|
||||
{
|
||||
printf("Got listing of layer %s (inode %lu in pool %u)\n", name.c_str(), INODE_NO_POOL(src), INODE_POOL(src));
|
||||
}
|
||||
printf("Got listing of layer %s (inode %lu in pool %u)\n", name.c_str(), INODE_NO_POOL(src), INODE_POOL(src));
|
||||
if (delete_source)
|
||||
{
|
||||
// Sort the inode listing
|
||||
@@ -456,9 +396,8 @@ struct snap_merger_t
|
||||
{
|
||||
if (op->retval != op->len)
|
||||
{
|
||||
rwo->error_code = -op->retval;
|
||||
rwo->error_offset = op->offset;
|
||||
rwo->error_read = true;
|
||||
fprintf(stderr, "error reading target at offset %lx: %s\n", op->offset, strerror(-op->retval));
|
||||
exit(1);
|
||||
}
|
||||
next_write(rwo);
|
||||
};
|
||||
@@ -471,7 +410,7 @@ struct snap_merger_t
|
||||
// FIXME: Allow to use single write with "holes" (OSDs don't allow it yet)
|
||||
uint32_t gran = parent->cli->get_bs_bitmap_granularity();
|
||||
uint64_t bitmap_size = target_block_size / gran;
|
||||
while (rwo->end < bitmap_size && !rwo->error_code)
|
||||
while (rwo->end < bitmap_size)
|
||||
{
|
||||
auto bit = ((*((uint8_t*)rwo->op.bitmap_buf + (rwo->end >> 3))) & (1 << (rwo->end & 0x7)));
|
||||
if (!bit)
|
||||
@@ -495,7 +434,7 @@ struct snap_merger_t
|
||||
rwo->end++;
|
||||
}
|
||||
}
|
||||
if (rwo->end > rwo->start && !rwo->error_code)
|
||||
if (rwo->end > rwo->start)
|
||||
{
|
||||
// write start->end
|
||||
rwo->todo++;
|
||||
@@ -534,9 +473,8 @@ struct snap_merger_t
|
||||
delete subop;
|
||||
return;
|
||||
}
|
||||
rwo->error_code = -subop->retval;
|
||||
rwo->error_offset = subop->offset;
|
||||
rwo->error_read = false;
|
||||
fprintf(stderr, "error writing target at offset %lx: %s\n", subop->offset, strerror(-subop->retval));
|
||||
exit(1);
|
||||
}
|
||||
// Increment CAS version
|
||||
rwo->op.version++;
|
||||
@@ -572,12 +510,11 @@ struct snap_merger_t
|
||||
{
|
||||
if (!rwo->todo)
|
||||
{
|
||||
if (!rwo->error_code &&
|
||||
last_written_offset < rwo->op.offset+target_block_size)
|
||||
if (last_written_offset < rwo->op.offset+target_block_size)
|
||||
{
|
||||
last_written_offset = rwo->op.offset+target_block_size;
|
||||
}
|
||||
if (!rwo->error_code && delete_source)
|
||||
if (delete_source)
|
||||
{
|
||||
deleted_unsynced++;
|
||||
if (deleted_unsynced >= fsync_interval)
|
||||
@@ -607,13 +544,6 @@ struct snap_merger_t
|
||||
}
|
||||
}
|
||||
free(rwo->buf);
|
||||
if (rwo->error_code)
|
||||
{
|
||||
char buf[1024];
|
||||
snprintf(buf, 1024, "Error %s target at offset %lx: %s",
|
||||
rwo->error_read ? "reading" : "writing", rwo->error_offset, strerror(rwo->error_code));
|
||||
rwo_error = std::string(buf);
|
||||
}
|
||||
delete rwo;
|
||||
in_flight--;
|
||||
continue_merge_reent();
|
||||
@@ -621,25 +551,30 @@ struct snap_merger_t
|
||||
}
|
||||
};
|
||||
|
||||
std::function<bool(cli_result_t &)> cli_tool_t::start_merge(json11::Json cfg)
|
||||
std::function<bool(void)> cli_tool_t::start_merge(json11::Json cfg)
|
||||
{
|
||||
json11::Json::array cmd = cfg["command"].array_items();
|
||||
auto merger = new snap_merger_t();
|
||||
merger->parent = this;
|
||||
merger->from_name = cfg["from"].string_value();
|
||||
merger->to_name = cfg["to"].string_value();
|
||||
merger->from_name = cmd.size() > 1 ? cmd[1].string_value() : "";
|
||||
merger->to_name = cmd.size() > 2 ? cmd[2].string_value() : "";
|
||||
merger->target_name = cfg["target"].string_value();
|
||||
if (merger->from_name == "" || merger->to_name == "")
|
||||
{
|
||||
fprintf(stderr, "Beginning or end of the merge sequence is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
merger->delete_source = cfg["delete-source"].string_value() != "";
|
||||
merger->fsync_interval = cfg["fsync-interval"].uint64_value();
|
||||
if (!merger->fsync_interval)
|
||||
merger->fsync_interval = 128;
|
||||
if (!cfg["cas"].is_null())
|
||||
merger->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
|
||||
return [merger](cli_result_t & result)
|
||||
return [merger]()
|
||||
{
|
||||
merger->continue_merge_reent();
|
||||
if (merger->is_done())
|
||||
{
|
||||
result = merger->result;
|
||||
delete merger;
|
||||
return true;
|
||||
}
|
||||
|
@@ -13,7 +13,6 @@ struct image_changer_t
|
||||
std::string image_name;
|
||||
std::string new_name;
|
||||
uint64_t new_size = 0;
|
||||
bool force_size = false;
|
||||
bool set_readonly = false, set_readwrite = false, force = false;
|
||||
// interval between fsyncs
|
||||
int fsync_interval = 128;
|
||||
@@ -24,8 +23,7 @@ struct image_changer_t
|
||||
bool has_children = false;
|
||||
|
||||
int state = 0;
|
||||
std::function<bool(cli_result_t &)> cb;
|
||||
cli_result_t result;
|
||||
std::function<bool(void)> cb;
|
||||
|
||||
bool is_done()
|
||||
{
|
||||
@@ -38,18 +36,6 @@ struct image_changer_t
|
||||
goto resume_1;
|
||||
else if (state == 2)
|
||||
goto resume_2;
|
||||
if (image_name == "")
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Image name is missing" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
if (new_size != 0 && (new_size % 4096) && !force_size)
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Image size should be a multiple of 4096" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
for (auto & ic: parent->cli->st_cli.inode_config)
|
||||
{
|
||||
if (ic.second.name == image_name)
|
||||
@@ -60,16 +46,14 @@ struct image_changer_t
|
||||
}
|
||||
if (new_name != "" && ic.second.name == new_name)
|
||||
{
|
||||
result = (cli_result_t){ .err = EEXIST, .text = "Image "+new_name+" already exists" };
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Image %s already exists\n", new_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
if (!inode_num)
|
||||
{
|
||||
result = (cli_result_t){ .err = ENOENT, .text = "Image "+image_name+" does not exist" };
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Image %s does not exist\n", image_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
for (auto & ic: parent->cli->st_cli.inode_config)
|
||||
{
|
||||
@@ -81,43 +65,37 @@ struct image_changer_t
|
||||
}
|
||||
if ((!set_readwrite || !cfg.readonly) &&
|
||||
(!set_readonly || cfg.readonly) &&
|
||||
(!new_size && !force_size || cfg.size == new_size) &&
|
||||
(!new_size || cfg.size == new_size) &&
|
||||
(new_name == "" || new_name == image_name))
|
||||
{
|
||||
result = (cli_result_t){ .text = "No change" };
|
||||
printf("No change\n");
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
if (new_size != 0 || force_size)
|
||||
if (new_size != 0)
|
||||
{
|
||||
if (cfg.size >= new_size)
|
||||
{
|
||||
// Check confirmation when trimming an image with children
|
||||
if (has_children && !force)
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Image "+image_name+" has children. Refusing to shrink it without --force" };
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Image %s has children. Refusing to shrink it without --force\n", image_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
// Shrink the image first
|
||||
cb = parent->start_rm_data(json11::Json::object {
|
||||
cb = parent->start_rm(json11::Json::object {
|
||||
{ "inode", INODE_NO_POOL(inode_num) },
|
||||
{ "pool", (uint64_t)INODE_POOL(inode_num) },
|
||||
{ "fsync-interval", fsync_interval },
|
||||
{ "min-offset", ((new_size+4095)/4096)*4096 },
|
||||
{ "min-offset", new_size },
|
||||
});
|
||||
resume_1:
|
||||
while (!cb(result))
|
||||
while (!cb())
|
||||
{
|
||||
state = 1;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
if (result.err)
|
||||
{
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
}
|
||||
cfg.size = new_size;
|
||||
}
|
||||
@@ -131,9 +109,8 @@ resume_1:
|
||||
// Check confirmation when making an image with children read-write
|
||||
if (has_children && !force)
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Image "+image_name+" has children. Refusing to make it read-write without --force" };
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Image %s has children. Refusing to make it read-write without --force\n", image_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
if (new_name != "")
|
||||
@@ -201,38 +178,34 @@ resume_1:
|
||||
resume_2:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
if (parent->etcd_err.err)
|
||||
{
|
||||
result = parent->etcd_err;
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
if (!parent->etcd_result["succeeded"].bool_value())
|
||||
{
|
||||
result = (cli_result_t){ .err = EAGAIN, .text = "Image "+image_name+" was modified by someone else, please repeat your request" };
|
||||
state = 100;
|
||||
return;
|
||||
fprintf(stderr, "Image %s was modified by someone else, please repeat your request\n", image_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
// Save into inode_config for library users to be able to take it from there immediately
|
||||
cfg.mod_revision = parent->etcd_result["responses"][0]["response_put"]["header"]["revision"].uint64_value();
|
||||
if (new_name != "")
|
||||
{
|
||||
parent->cli->st_cli.inode_by_name.erase(image_name);
|
||||
}
|
||||
parent->cli->st_cli.insert_inode_config(cfg);
|
||||
result = (cli_result_t){ .err = 0, .text = "Image "+image_name+" modified" };
|
||||
printf("Image %s modified\n", image_name.c_str());
|
||||
state = 100;
|
||||
}
|
||||
};
|
||||
|
||||
std::function<bool(cli_result_t &)> cli_tool_t::start_modify(json11::Json cfg)
|
||||
std::function<bool(void)> cli_tool_t::start_modify(json11::Json cfg)
|
||||
{
|
||||
json11::Json::array cmd = cfg["command"].array_items();
|
||||
auto changer = new image_changer_t();
|
||||
changer->parent = this;
|
||||
changer->image_name = cfg["image"].string_value();
|
||||
changer->image_name = cmd.size() > 1 ? cmd[1].string_value() : "";
|
||||
if (changer->image_name == "")
|
||||
{
|
||||
fprintf(stderr, "Image name is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
changer->new_name = cfg["rename"].string_value();
|
||||
changer->new_size = parse_size(cfg["resize"].as_string());
|
||||
changer->force_size = cfg["force_size"].bool_value();
|
||||
changer->new_size = parse_size(cfg["resize"].string_value());
|
||||
if (changer->new_size != 0 && (changer->new_size % 4096))
|
||||
{
|
||||
fprintf(stderr, "Image size should be a multiple of 4096\n");
|
||||
exit(1);
|
||||
}
|
||||
changer->force = cfg["force"].bool_value();
|
||||
changer->set_readonly = cfg["readonly"].bool_value();
|
||||
changer->set_readwrite = cfg["readwrite"].bool_value();
|
||||
@@ -240,12 +213,11 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_modify(json11::Json cfg)
|
||||
if (!changer->fsync_interval)
|
||||
changer->fsync_interval = 128;
|
||||
// FIXME Check that the image doesn't have children when shrinking
|
||||
return [changer](cli_result_t & result)
|
||||
return [changer]()
|
||||
{
|
||||
changer->loop();
|
||||
if (changer->is_done())
|
||||
{
|
||||
result = changer->result;
|
||||
delete changer;
|
||||
return true;
|
||||
}
|
||||
|
751
src/cli_rm.cpp
751
src/cli_rm.cpp
@@ -1,658 +1,211 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include <fcntl.h>
|
||||
#include "cli.h"
|
||||
#include "cluster_client.h"
|
||||
#include "base64.h"
|
||||
|
||||
// Remove layer(s): similar to merge, but alters metadata and processes multiple merge targets
|
||||
//
|
||||
// Exactly one child of the requested layers may be merged using the "inverted" workflow,
|
||||
// where we merge it "down" into one of the "to-be-removed" layers and then rename the
|
||||
// "to-be-removed" layer to the child. It may be done either if all writers are stopped
|
||||
// before trying to delete layers (which is signaled by --writers-stopped) or if that child
|
||||
// is a read-only layer (snapshot) itself.
|
||||
//
|
||||
// This "inverted" workflow trades copying data of one of the deleted layers for copying
|
||||
// data of one child of the chain which is also a child of the "traded" layer. So we
|
||||
// choose the (parent,child) pair which has the largest difference between "parent" and
|
||||
// "child" inode sizes.
|
||||
//
|
||||
// All other children of the chain are processed by iterating though them, merging removed
|
||||
// parents into them and rebasing them to the last layer which isn't a member of the removed
|
||||
// chain.
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// <parent> - <from> - <layer 2> - <to> - <child 1>
|
||||
// \ \ \- <child 2>
|
||||
// \ \- <child 3>
|
||||
// \-<child 4>
|
||||
//
|
||||
// 1) Find optimal pair for the "reverse" scenario
|
||||
// Imagine that it's (<layer 2>, <child 1>) in this example
|
||||
// 2) Process all children except <child 1>:
|
||||
// - Merge <from>..<to> to <child 2>
|
||||
// - Set <child 2> parent to <parent>
|
||||
// - Repeat for others
|
||||
// 3) Process <child 1>:
|
||||
// - Merge <from>..<child 1> to <layer 2>
|
||||
// - Set <layer 2> parent to <parent>
|
||||
// - Rename <layer 2> to <child 1>
|
||||
// 4) Delete other layers of the chain (<from>, <to>)
|
||||
struct snap_remover_t
|
||||
#define RM_LISTING 1
|
||||
#define RM_REMOVING 2
|
||||
#define RM_END 3
|
||||
|
||||
struct rm_pg_t
|
||||
{
|
||||
cli_tool_t *parent;
|
||||
|
||||
// remove from..to
|
||||
std::string from_name, to_name;
|
||||
// writers are stopped, we can safely change writable layers
|
||||
bool writers_stopped = false;
|
||||
// use CAS writes (0 = never, 1 = auto, 2 = always)
|
||||
int use_cas = 1;
|
||||
// interval between fsyncs
|
||||
int fsync_interval = 128;
|
||||
|
||||
std::map<inode_t,int> sources;
|
||||
std::map<inode_t,uint64_t> inode_used;
|
||||
std::vector<inode_t> merge_children;
|
||||
std::vector<inode_t> chain_list;
|
||||
std::map<inode_t,int> inverse_candidates;
|
||||
inode_t inverse_parent = 0, inverse_child = 0;
|
||||
inode_t new_parent = 0;
|
||||
pg_num_t pg_num;
|
||||
osd_num_t rm_osd_num;
|
||||
std::set<object_id> objects;
|
||||
std::set<object_id>::iterator obj_pos;
|
||||
uint64_t obj_count = 0, obj_done = 0;
|
||||
int state = 0;
|
||||
int current_child = 0;
|
||||
std::function<bool(cli_result_t &)> cb;
|
||||
int in_flight = 0;
|
||||
};
|
||||
|
||||
cli_result_t result;
|
||||
struct rm_inode_t
|
||||
{
|
||||
uint64_t inode = 0;
|
||||
pool_id_t pool_id = 0;
|
||||
uint64_t min_offset = 0;
|
||||
|
||||
bool is_done()
|
||||
cli_tool_t *parent = NULL;
|
||||
inode_list_t *lister = NULL;
|
||||
std::vector<rm_pg_t*> lists;
|
||||
uint64_t total_count = 0, total_done = 0, total_prev_pct = 0;
|
||||
uint64_t pgs_to_list = 0;
|
||||
bool lists_done = false;
|
||||
int state = 0;
|
||||
|
||||
void start_delete()
|
||||
{
|
||||
return state == 100;
|
||||
}
|
||||
|
||||
void loop()
|
||||
{
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
else if (state == 2)
|
||||
goto resume_2;
|
||||
else if (state == 3)
|
||||
goto resume_3;
|
||||
else if (state == 4)
|
||||
goto resume_4;
|
||||
else if (state == 5)
|
||||
goto resume_5;
|
||||
else if (state == 6)
|
||||
goto resume_6;
|
||||
else if (state == 7)
|
||||
goto resume_7;
|
||||
else if (state == 8)
|
||||
goto resume_8;
|
||||
else if (state == 100)
|
||||
goto resume_100;
|
||||
assert(!state);
|
||||
if (from_name == "")
|
||||
lister = parent->cli->list_inode_start(inode, [this](inode_list_t *lst,
|
||||
std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Layer to remove argument is missing" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
if (to_name == "")
|
||||
{
|
||||
to_name = from_name;
|
||||
}
|
||||
// Get children to merge
|
||||
get_merge_children();
|
||||
if (state == 100)
|
||||
return;
|
||||
// Try to select an inode for the "inverse" optimized scenario
|
||||
// Read statistics from etcd to do it
|
||||
read_stats();
|
||||
if (state == 100)
|
||||
return;
|
||||
state = 1;
|
||||
resume_1:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
choose_inverse_candidate();
|
||||
// Merge children one by one, except our "inverse" child
|
||||
for (current_child = 0; current_child < merge_children.size(); current_child++)
|
||||
{
|
||||
if (merge_children[current_child] == inverse_child)
|
||||
continue;
|
||||
start_merge_child(merge_children[current_child], merge_children[current_child]);
|
||||
if (state == 100)
|
||||
return;
|
||||
resume_2:
|
||||
while (!cb(result))
|
||||
{
|
||||
state = 2;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
if (result.err)
|
||||
{
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
parent->change_parent(merge_children[current_child], new_parent, &result);
|
||||
state = 3;
|
||||
resume_3:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
if (result.err)
|
||||
{
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
else if (parent->progress)
|
||||
printf("%s\n", result.text.c_str());
|
||||
}
|
||||
// Merge our "inverse" child into our "inverse" parent
|
||||
if (inverse_child != 0)
|
||||
{
|
||||
start_merge_child(inverse_child, inverse_parent);
|
||||
if (state == 100)
|
||||
return;
|
||||
resume_4:
|
||||
while (!cb(result))
|
||||
{
|
||||
state = 4;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
if (result.err)
|
||||
{
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
// Delete "inverse" child data
|
||||
start_delete_source(inverse_child);
|
||||
if (state == 100)
|
||||
return;
|
||||
resume_5:
|
||||
while (!cb(result))
|
||||
{
|
||||
state = 5;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
if (result.err)
|
||||
{
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
// Delete "inverse" child metadata, rename parent over it,
|
||||
// and also change parent links of the previous "inverse" child
|
||||
rename_inverse_parent();
|
||||
if (state == 100)
|
||||
return;
|
||||
state = 6;
|
||||
resume_6:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
}
|
||||
// Delete parents, except the "inverse" one
|
||||
for (current_child = 0; current_child < chain_list.size(); current_child++)
|
||||
{
|
||||
if (chain_list[current_child] == inverse_parent)
|
||||
continue;
|
||||
start_delete_source(chain_list[current_child]);
|
||||
resume_7:
|
||||
while (!cb(result))
|
||||
{
|
||||
state = 7;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
if (result.err)
|
||||
{
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
delete_inode_config(chain_list[current_child]);
|
||||
if (state == 100)
|
||||
return;
|
||||
state = 8;
|
||||
resume_8:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
}
|
||||
state = 100;
|
||||
resume_100:
|
||||
// Done
|
||||
return;
|
||||
}
|
||||
|
||||
void get_merge_children()
|
||||
{
|
||||
// Get all children of from..to
|
||||
inode_config_t *from_cfg = parent->get_inode_cfg(from_name);
|
||||
if (!from_cfg)
|
||||
{
|
||||
result = (cli_result_t){ .err = ENOENT, .text = "Layer "+from_name+" not found" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
inode_config_t *to_cfg = parent->get_inode_cfg(to_name);
|
||||
if (!to_cfg)
|
||||
{
|
||||
result = (cli_result_t){ .err = ENOENT, .text = "Layer "+to_name+" not found" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
// Check that to_cfg is actually a child of from_cfg
|
||||
// FIXME de-copypaste the following piece of code with snap_merger_t
|
||||
inode_config_t *cur = to_cfg;
|
||||
chain_list.push_back(cur->num);
|
||||
while (cur->num != from_cfg->num && cur->parent_id != 0)
|
||||
{
|
||||
auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
|
||||
if (it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
char buf[1024];
|
||||
snprintf(buf, 1024, "Parent inode of layer %s (id 0x%lx) not found", cur->name.c_str(), cur->parent_id);
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
cur = &it->second;
|
||||
chain_list.push_back(cur->num);
|
||||
}
|
||||
if (cur->num != from_cfg->num)
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Layer "+to_name+" is not a child of "+from_name };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
new_parent = from_cfg->parent_id;
|
||||
// Calculate ranks
|
||||
int i = chain_list.size()-1;
|
||||
for (inode_t item: chain_list)
|
||||
{
|
||||
sources[item] = i--;
|
||||
}
|
||||
for (auto & ic: parent->cli->st_cli.inode_config)
|
||||
{
|
||||
if (!ic.second.parent_id)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
auto it = sources.find(ic.second.parent_id);
|
||||
if (it != sources.end() && sources.find(ic.second.num) == sources.end())
|
||||
{
|
||||
merge_children.push_back(ic.second.num);
|
||||
if (ic.second.readonly || writers_stopped)
|
||||
{
|
||||
inverse_candidates[ic.second.num] = it->second;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void read_stats()
|
||||
{
|
||||
if (inverse_candidates.size() == 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
json11::Json::array reads;
|
||||
for (auto cp: inverse_candidates)
|
||||
{
|
||||
inode_t inode = cp.first;
|
||||
reads.push_back(json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/inode/stats/"+std::to_string(INODE_POOL(inode))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inode))
|
||||
) },
|
||||
} }
|
||||
rm_pg_t *rm = new rm_pg_t((rm_pg_t){
|
||||
.pg_num = pg_num,
|
||||
.rm_osd_num = primary_osd,
|
||||
.objects = objects,
|
||||
.obj_count = objects.size(),
|
||||
.obj_done = 0,
|
||||
});
|
||||
}
|
||||
for (auto cp: sources)
|
||||
{
|
||||
inode_t inode = cp.first;
|
||||
reads.push_back(json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/inode/stats/"+std::to_string(INODE_POOL(inode))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inode))
|
||||
) },
|
||||
} }
|
||||
});
|
||||
}
|
||||
parent->waiting++;
|
||||
parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
|
||||
{ "success", reads },
|
||||
}, [this](std::string err, json11::Json data)
|
||||
{
|
||||
parent->waiting--;
|
||||
if (err != "")
|
||||
if (min_offset == 0)
|
||||
{
|
||||
result = (cli_result_t){ .err = EIO, .text = "Error reading layer statistics from etcd: "+err };
|
||||
state = 100;
|
||||
return;
|
||||
total_count += objects.size();
|
||||
}
|
||||
for (auto inode_result: data["responses"].array_items())
|
||||
else
|
||||
{
|
||||
auto kv = parent->cli->st_cli.parse_etcd_kv(inode_result["kvs"][0]);
|
||||
pool_id_t pool_id = 0;
|
||||
inode_t inode = 0;
|
||||
char null_byte = 0;
|
||||
sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode, &null_byte);
|
||||
if (!inode || null_byte != 0)
|
||||
for (object_id oid: objects)
|
||||
{
|
||||
result = (cli_result_t){ .err = EIO, .text = "Bad key returned from etcd: "+kv.key };
|
||||
state = 100;
|
||||
return;
|
||||
if (oid.stripe >= min_offset)
|
||||
{
|
||||
total_count++;
|
||||
}
|
||||
}
|
||||
auto pool_cfg_it = parent->cli->st_cli.pool_config.find(pool_id);
|
||||
if (pool_cfg_it == parent->cli->st_cli.pool_config.end())
|
||||
{
|
||||
result = (cli_result_t){ .err = ENOENT, .text = "Pool "+std::to_string(pool_id)+" does not exist" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
inode = INODE_WITH_POOL(pool_id, inode);
|
||||
auto & pool_cfg = pool_cfg_it->second;
|
||||
uint64_t used_bytes = kv.value["raw_used"].uint64_value() / pool_cfg.pg_size;
|
||||
if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
used_bytes *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
|
||||
}
|
||||
inode_used[inode] = used_bytes;
|
||||
}
|
||||
parent->ringloop->wakeup();
|
||||
rm->obj_pos = rm->objects.begin();
|
||||
lists.push_back(rm);
|
||||
if (parent->list_first)
|
||||
{
|
||||
parent->cli->list_inode_next(lister, 1);
|
||||
}
|
||||
if (status & INODE_LIST_DONE)
|
||||
{
|
||||
lists_done = true;
|
||||
}
|
||||
pgs_to_list--;
|
||||
continue_delete();
|
||||
});
|
||||
if (!lister)
|
||||
{
|
||||
fprintf(stderr, "Failed to list inode %lu from pool %u objects\n", INODE_NO_POOL(inode), INODE_POOL(inode));
|
||||
exit(1);
|
||||
}
|
||||
pgs_to_list = parent->cli->list_pg_count(lister);
|
||||
parent->cli->list_inode_next(lister, parent->parallel_osds);
|
||||
}
|
||||
|
||||
void choose_inverse_candidate()
|
||||
void send_ops(rm_pg_t *cur_list)
|
||||
{
|
||||
uint64_t max_diff = 0;
|
||||
for (auto cp: inverse_candidates)
|
||||
if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
|
||||
parent->cli->msgr.osd_peer_fds.end())
|
||||
{
|
||||
inode_t child = cp.first;
|
||||
uint64_t child_used = inode_used[child];
|
||||
int rank = cp.second;
|
||||
for (int i = chain_list.size()-rank; i < chain_list.size(); i++)
|
||||
{
|
||||
inode_t parent = chain_list[i];
|
||||
uint64_t parent_used = inode_used[parent];
|
||||
if (parent_used > child_used && (!max_diff || max_diff < (parent_used-child_used)))
|
||||
{
|
||||
max_diff = (parent_used-child_used);
|
||||
inverse_parent = parent;
|
||||
inverse_child = child;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void rename_inverse_parent()
|
||||
{
|
||||
auto child_it = parent->cli->st_cli.inode_config.find(inverse_child);
|
||||
if (child_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
char buf[1024];
|
||||
snprintf(buf, 1024, "Inode 0x%lx disappeared", inverse_child);
|
||||
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
||||
state = 100;
|
||||
// Initiate connection
|
||||
parent->cli->msgr.connect_peer(cur_list->rm_osd_num, parent->cli->st_cli.peer_states[cur_list->rm_osd_num]);
|
||||
return;
|
||||
}
|
||||
auto target_it = parent->cli->st_cli.inode_config.find(inverse_parent);
|
||||
if (target_it == parent->cli->st_cli.inode_config.end())
|
||||
while (cur_list->in_flight < parent->iodepth && cur_list->obj_pos != cur_list->objects.end())
|
||||
{
|
||||
char buf[1024];
|
||||
snprintf(buf, 1024, "Inode 0x%lx disappeared", inverse_parent);
|
||||
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
inode_config_t *child_cfg = &child_it->second;
|
||||
inode_config_t *target_cfg = &target_it->second;
|
||||
std::string child_name = child_cfg->name;
|
||||
std::string target_name = target_cfg->name;
|
||||
std::string child_cfg_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(inverse_child))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inverse_child))
|
||||
);
|
||||
std::string target_cfg_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(inverse_parent))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inverse_parent))
|
||||
);
|
||||
// Fill new configuration
|
||||
inode_config_t new_cfg = *child_cfg;
|
||||
new_cfg.num = target_cfg->num;
|
||||
new_cfg.parent_id = new_parent;
|
||||
json11::Json::array cmp = json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", child_cfg_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", child_cfg->mod_revision+1 },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", target_cfg_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", target_cfg->mod_revision+1 },
|
||||
},
|
||||
};
|
||||
json11::Json::array txn = json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "request_delete_range", json11::Json::object {
|
||||
{ "key", child_cfg_key },
|
||||
} },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", target_cfg_key },
|
||||
{ "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&new_cfg)).dump()) },
|
||||
} },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+child_cfg->name) },
|
||||
{ "value", base64_encode(json11::Json({
|
||||
{ "id", INODE_NO_POOL(inverse_parent) },
|
||||
{ "pool_id", (uint64_t)INODE_POOL(inverse_parent) },
|
||||
}).dump()) },
|
||||
} },
|
||||
},
|
||||
};
|
||||
// Reparent children of inverse_child
|
||||
for (auto & cp: parent->cli->st_cli.inode_config)
|
||||
{
|
||||
if (cp.second.parent_id == child_cfg->num)
|
||||
if (cur_list->obj_pos->stripe >= min_offset)
|
||||
{
|
||||
auto cp_cfg = cp.second;
|
||||
cp_cfg.parent_id = inverse_parent;
|
||||
auto cp_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(cp.second.num))+
|
||||
"/"+std::to_string(INODE_NO_POOL(cp.second.num))
|
||||
);
|
||||
cmp.push_back(json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", cp_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", cp.second.mod_revision+1 },
|
||||
});
|
||||
txn.push_back(json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", cp_key },
|
||||
{ "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&cp_cfg)).dump()) },
|
||||
} },
|
||||
});
|
||||
}
|
||||
}
|
||||
parent->waiting++;
|
||||
parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
|
||||
{ "compare", cmp },
|
||||
{ "success", txn },
|
||||
}, [this, target_name, child_name](std::string err, json11::Json res)
|
||||
{
|
||||
parent->waiting--;
|
||||
if (err != "")
|
||||
{
|
||||
result = (cli_result_t){ .err = EIO, .text = "Error renaming "+target_name+" to "+child_name+": "+err };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
if (!res["succeeded"].bool_value())
|
||||
{
|
||||
result = (cli_result_t){
|
||||
.err = EAGAIN,
|
||||
.text = "Parent ("+target_name+"), child ("+child_name+"), or one of its children"
|
||||
" configuration was modified during rename",
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->peer_fd = parent->cli->msgr.osd_peer_fds[cur_list->rm_osd_num];
|
||||
op->req = (osd_any_op_t){
|
||||
.rw = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = parent->cli->next_op_id(),
|
||||
.opcode = OSD_OP_DELETE,
|
||||
},
|
||||
.inode = cur_list->obj_pos->inode,
|
||||
.offset = cur_list->obj_pos->stripe,
|
||||
.len = 0,
|
||||
},
|
||||
};
|
||||
state = 100;
|
||||
return;
|
||||
op->callback = [this, cur_list](osd_op_t *op)
|
||||
{
|
||||
cur_list->in_flight--;
|
||||
if (op->reply.hdr.retval < 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to remove object %lx:%lx from PG %u (OSD %lu) (retval=%ld)\n",
|
||||
op->req.rw.inode, op->req.rw.offset,
|
||||
cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
|
||||
}
|
||||
delete op;
|
||||
cur_list->obj_done++;
|
||||
total_done++;
|
||||
continue_delete();
|
||||
};
|
||||
cur_list->in_flight++;
|
||||
parent->cli->msgr.outbox_push(op);
|
||||
}
|
||||
if (parent->progress)
|
||||
printf("Layer %s renamed to %s\n", target_name.c_str(), child_name.c_str());
|
||||
parent->ringloop->wakeup();
|
||||
});
|
||||
cur_list->obj_pos++;
|
||||
}
|
||||
}
|
||||
|
||||
void delete_inode_config(inode_t cur)
|
||||
void continue_delete()
|
||||
{
|
||||
auto cur_cfg_it = parent->cli->st_cli.inode_config.find(cur);
|
||||
if (cur_cfg_it == parent->cli->st_cli.inode_config.end())
|
||||
if (parent->list_first && !lists_done)
|
||||
{
|
||||
char buf[1024];
|
||||
snprintf(buf, 1024, "Inode 0x%lx disappeared", cur);
|
||||
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
inode_config_t *cur_cfg = &cur_cfg_it->second;
|
||||
std::string cur_name = cur_cfg->name;
|
||||
std::string cur_cfg_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(cur))+
|
||||
"/"+std::to_string(INODE_NO_POOL(cur))
|
||||
);
|
||||
parent->waiting++;
|
||||
parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
|
||||
{ "compare", json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", cur_cfg_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", cur_cfg->mod_revision+1 },
|
||||
},
|
||||
} },
|
||||
{ "success", json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "request_delete_range", json11::Json::object {
|
||||
{ "key", cur_cfg_key },
|
||||
} },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "request_delete_range", json11::Json::object {
|
||||
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+cur_name) },
|
||||
} },
|
||||
},
|
||||
} },
|
||||
}, [this, cur, cur_name](std::string err, json11::Json res)
|
||||
for (int i = 0; i < lists.size(); i++)
|
||||
{
|
||||
parent->waiting--;
|
||||
if (err != "")
|
||||
if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end())
|
||||
{
|
||||
result = (cli_result_t){ .err = EIO, .text = "Error deleting "+cur_name+": "+err };
|
||||
state = 100;
|
||||
return;
|
||||
delete lists[i];
|
||||
lists.erase(lists.begin()+i, lists.begin()+i+1);
|
||||
i--;
|
||||
if (!lists_done)
|
||||
{
|
||||
parent->cli->list_inode_next(lister, 1);
|
||||
}
|
||||
}
|
||||
if (!res["succeeded"].bool_value())
|
||||
else
|
||||
{
|
||||
result = (cli_result_t){ .err = EAGAIN, .text = "Layer "+cur_name+" was modified during deletion" };
|
||||
state = 100;
|
||||
return;
|
||||
send_ops(lists[i]);
|
||||
}
|
||||
// Modify inode_config for library users to be able to take it from there immediately
|
||||
parent->cli->st_cli.inode_by_name.erase(cur_name);
|
||||
parent->cli->st_cli.inode_config.erase(cur);
|
||||
if (parent->progress)
|
||||
printf("Layer %s deleted\n", cur_name.c_str());
|
||||
parent->ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
if (parent->progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
|
||||
{
|
||||
printf("\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
|
||||
total_prev_pct = total_done*1000/total_count;
|
||||
}
|
||||
if (lists_done && !lists.size())
|
||||
{
|
||||
printf("Done, inode %lu in pool %u data removed\n", INODE_NO_POOL(inode), pool_id);
|
||||
state = 2;
|
||||
}
|
||||
}
|
||||
|
||||
void start_merge_child(inode_t child_inode, inode_t target_inode)
|
||||
bool loop()
|
||||
{
|
||||
auto child_it = parent->cli->st_cli.inode_config.find(child_inode);
|
||||
if (child_it == parent->cli->st_cli.inode_config.end())
|
||||
if (state == 0)
|
||||
{
|
||||
char buf[1024];
|
||||
snprintf(buf, 1024, "Inode 0x%lx disappeared", child_inode);
|
||||
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
||||
state = 100;
|
||||
return;
|
||||
start_delete();
|
||||
state = 1;
|
||||
}
|
||||
auto target_it = parent->cli->st_cli.inode_config.find(target_inode);
|
||||
if (target_it == parent->cli->st_cli.inode_config.end())
|
||||
else if (state == 1)
|
||||
{
|
||||
char buf[1024];
|
||||
snprintf(buf, 1024, "Inode 0x%lx disappeared", target_inode);
|
||||
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
||||
state = 100;
|
||||
return;
|
||||
continue_delete();
|
||||
}
|
||||
cb = parent->start_merge(json11::Json::object {
|
||||
{ "from", from_name },
|
||||
{ "to", child_it->second.name },
|
||||
{ "target", target_it->second.name },
|
||||
{ "delete-source", false },
|
||||
{ "cas", use_cas },
|
||||
{ "fsync-interval", fsync_interval },
|
||||
});
|
||||
}
|
||||
|
||||
void start_delete_source(inode_t inode)
|
||||
{
|
||||
auto source = parent->cli->st_cli.inode_config.find(inode);
|
||||
if (source == parent->cli->st_cli.inode_config.end())
|
||||
else if (state == 2)
|
||||
{
|
||||
char buf[1024];
|
||||
snprintf(buf, 1024, "Inode 0x%lx disappeared", inode);
|
||||
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
|
||||
state = 100;
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
cb = parent->start_rm_data(json11::Json::object {
|
||||
{ "inode", inode },
|
||||
{ "pool", (uint64_t)INODE_POOL(inode) },
|
||||
{ "fsync-interval", fsync_interval },
|
||||
});
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
std::function<bool(cli_result_t &)> cli_tool_t::start_rm(json11::Json cfg)
|
||||
std::function<bool(void)> cli_tool_t::start_rm(json11::Json cfg)
|
||||
{
|
||||
auto snap_remover = new snap_remover_t();
|
||||
snap_remover->parent = this;
|
||||
snap_remover->from_name = cfg["from"].string_value();
|
||||
snap_remover->to_name = cfg["to"].string_value();
|
||||
snap_remover->fsync_interval = cfg["fsync-interval"].uint64_value();
|
||||
if (!snap_remover->fsync_interval)
|
||||
snap_remover->fsync_interval = 128;
|
||||
if (!cfg["cas"].is_null())
|
||||
snap_remover->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
|
||||
if (!cfg["writers_stopped"].is_null())
|
||||
snap_remover->writers_stopped = true;
|
||||
return [snap_remover](cli_result_t & result)
|
||||
auto remover = new rm_inode_t();
|
||||
remover->parent = this;
|
||||
remover->inode = cfg["inode"].uint64_value();
|
||||
remover->pool_id = cfg["pool"].uint64_value();
|
||||
if (remover->pool_id)
|
||||
{
|
||||
snap_remover->loop();
|
||||
if (snap_remover->is_done())
|
||||
remover->inode = (remover->inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (((uint64_t)remover->pool_id) << (64-POOL_ID_BITS));
|
||||
}
|
||||
remover->pool_id = INODE_POOL(remover->inode);
|
||||
if (!remover->pool_id)
|
||||
{
|
||||
fprintf(stderr, "pool is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
remover->min_offset = cfg["min-offset"].uint64_value();
|
||||
return [remover]()
|
||||
{
|
||||
if (remover->loop())
|
||||
{
|
||||
result = snap_remover->result;
|
||||
delete snap_remover;
|
||||
delete remover;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
@@ -1,232 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "cli.h"
|
||||
#include "cluster_client.h"
|
||||
|
||||
#define RM_LISTING 1
|
||||
#define RM_REMOVING 2
|
||||
#define RM_END 3
|
||||
|
||||
struct rm_pg_t
|
||||
{
|
||||
pg_num_t pg_num;
|
||||
osd_num_t rm_osd_num;
|
||||
std::set<object_id> objects;
|
||||
std::set<object_id>::iterator obj_pos;
|
||||
uint64_t obj_count = 0, obj_done = 0;
|
||||
int state = 0;
|
||||
int in_flight = 0;
|
||||
};
|
||||
|
||||
struct rm_inode_t
|
||||
{
|
||||
uint64_t inode = 0;
|
||||
pool_id_t pool_id = 0;
|
||||
uint64_t min_offset = 0;
|
||||
|
||||
cli_tool_t *parent = NULL;
|
||||
inode_list_t *lister = NULL;
|
||||
std::vector<rm_pg_t*> lists;
|
||||
uint64_t total_count = 0, total_done = 0, total_prev_pct = 0;
|
||||
uint64_t pgs_to_list = 0;
|
||||
bool lists_done = false;
|
||||
int state = 0;
|
||||
int error_count = 0;
|
||||
|
||||
cli_result_t result;
|
||||
|
||||
void start_delete()
|
||||
{
|
||||
lister = parent->cli->list_inode_start(inode, [this](inode_list_t *lst,
|
||||
std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)
|
||||
{
|
||||
rm_pg_t *rm = new rm_pg_t((rm_pg_t){
|
||||
.pg_num = pg_num,
|
||||
.rm_osd_num = primary_osd,
|
||||
.objects = objects,
|
||||
.obj_count = objects.size(),
|
||||
.obj_done = 0,
|
||||
});
|
||||
if (min_offset == 0)
|
||||
{
|
||||
total_count += objects.size();
|
||||
}
|
||||
else
|
||||
{
|
||||
for (object_id oid: objects)
|
||||
{
|
||||
if (oid.stripe >= min_offset)
|
||||
{
|
||||
total_count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
rm->obj_pos = rm->objects.begin();
|
||||
lists.push_back(rm);
|
||||
if (parent->list_first)
|
||||
{
|
||||
parent->cli->list_inode_next(lister, 1);
|
||||
}
|
||||
if (status & INODE_LIST_DONE)
|
||||
{
|
||||
lists_done = true;
|
||||
}
|
||||
pgs_to_list--;
|
||||
continue_delete();
|
||||
});
|
||||
if (!lister)
|
||||
{
|
||||
result = (cli_result_t){
|
||||
.err = EIO,
|
||||
.text = "Failed to list objects of inode "+std::to_string(INODE_NO_POOL(inode))+
|
||||
" from pool "+std::to_string(INODE_POOL(inode)),
|
||||
};
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
pgs_to_list = parent->cli->list_pg_count(lister);
|
||||
parent->cli->list_inode_next(lister, parent->parallel_osds);
|
||||
}
|
||||
|
||||
void send_ops(rm_pg_t *cur_list)
|
||||
{
|
||||
if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
|
||||
parent->cli->msgr.osd_peer_fds.end())
|
||||
{
|
||||
// Initiate connection
|
||||
parent->cli->msgr.connect_peer(cur_list->rm_osd_num, parent->cli->st_cli.peer_states[cur_list->rm_osd_num]);
|
||||
return;
|
||||
}
|
||||
while (cur_list->in_flight < parent->iodepth && cur_list->obj_pos != cur_list->objects.end())
|
||||
{
|
||||
if (cur_list->obj_pos->stripe >= min_offset)
|
||||
{
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
// Already checked that it exists above, but anyway
|
||||
op->peer_fd = parent->cli->msgr.osd_peer_fds.at(cur_list->rm_osd_num);
|
||||
op->req = (osd_any_op_t){
|
||||
.rw = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = parent->cli->next_op_id(),
|
||||
.opcode = OSD_OP_DELETE,
|
||||
},
|
||||
.inode = cur_list->obj_pos->inode,
|
||||
.offset = cur_list->obj_pos->stripe,
|
||||
.len = 0,
|
||||
},
|
||||
};
|
||||
op->callback = [this, cur_list](osd_op_t *op)
|
||||
{
|
||||
cur_list->in_flight--;
|
||||
if (op->reply.hdr.retval < 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to remove object %lx:%lx from PG %u (OSD %lu) (retval=%ld)\n",
|
||||
op->req.rw.inode, op->req.rw.offset,
|
||||
cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
|
||||
error_count++;
|
||||
}
|
||||
delete op;
|
||||
cur_list->obj_done++;
|
||||
total_done++;
|
||||
continue_delete();
|
||||
};
|
||||
cur_list->in_flight++;
|
||||
parent->cli->msgr.outbox_push(op);
|
||||
}
|
||||
cur_list->obj_pos++;
|
||||
}
|
||||
}
|
||||
|
||||
void continue_delete()
|
||||
{
|
||||
if (parent->list_first && !lists_done)
|
||||
{
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < lists.size(); i++)
|
||||
{
|
||||
if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end())
|
||||
{
|
||||
delete lists[i];
|
||||
lists.erase(lists.begin()+i, lists.begin()+i+1);
|
||||
i--;
|
||||
if (!lists_done)
|
||||
{
|
||||
parent->cli->list_inode_next(lister, 1);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
send_ops(lists[i]);
|
||||
}
|
||||
}
|
||||
if (parent->progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
|
||||
{
|
||||
printf("\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
|
||||
total_prev_pct = total_done*1000/total_count;
|
||||
}
|
||||
if (lists_done && !lists.size())
|
||||
{
|
||||
result = (cli_result_t){
|
||||
.err = error_count > 0 ? EIO : 0,
|
||||
.text = error_count > 0 ? "Some blocks were not removed" : (
|
||||
"Done, inode "+std::to_string(INODE_NO_POOL(inode))+" from pool "+
|
||||
std::to_string(pool_id)+" removed"),
|
||||
};
|
||||
state = 100;
|
||||
}
|
||||
}
|
||||
|
||||
bool is_done()
|
||||
{
|
||||
return state == 100;
|
||||
}
|
||||
|
||||
void loop()
|
||||
{
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
if (state == 100)
|
||||
return;
|
||||
if (!pool_id)
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Pool is not specified" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
start_delete();
|
||||
if (state == 100)
|
||||
return;
|
||||
state = 1;
|
||||
resume_1:
|
||||
continue_delete();
|
||||
}
|
||||
};
|
||||
|
||||
std::function<bool(cli_result_t &)> cli_tool_t::start_rm_data(json11::Json cfg)
|
||||
{
|
||||
auto remover = new rm_inode_t();
|
||||
remover->parent = this;
|
||||
remover->inode = cfg["inode"].uint64_value();
|
||||
remover->pool_id = cfg["pool"].uint64_value();
|
||||
if (remover->pool_id)
|
||||
{
|
||||
remover->inode = (remover->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (((uint64_t)remover->pool_id) << (64-POOL_ID_BITS));
|
||||
}
|
||||
remover->pool_id = INODE_POOL(remover->inode);
|
||||
remover->min_offset = cfg["min-offset"].uint64_value();
|
||||
return [remover](cli_result_t & result)
|
||||
{
|
||||
remover->loop();
|
||||
if (remover->is_done())
|
||||
{
|
||||
result = remover->result;
|
||||
delete remover;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
}
|
@@ -11,9 +11,9 @@
|
||||
#include <sys/stat.h>
|
||||
|
||||
// Calculate offsets for a block device and print OSD command line parameters
|
||||
std::function<bool(cli_result_t &)> cli_tool_t::simple_offsets(json11::Json cfg)
|
||||
std::function<bool(void)> cli_tool_t::simple_offsets(json11::Json cfg)
|
||||
{
|
||||
std::string device = cfg["device"].string_value();
|
||||
std::string device = cfg["command"][1].string_value();
|
||||
uint64_t object_size = parse_size(cfg["object_size"].string_value());
|
||||
uint64_t bitmap_granularity = parse_size(cfg["bitmap_granularity"].string_value());
|
||||
uint64_t journal_size = parse_size(cfg["journal_size"].string_value());
|
||||
|
568
src/cli_snap_rm.cpp
Normal file
568
src/cli_snap_rm.cpp
Normal file
@@ -0,0 +1,568 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include <fcntl.h>
|
||||
#include "cli.h"
|
||||
#include "cluster_client.h"
|
||||
#include "base64.h"
|
||||
|
||||
// Remove layer(s): similar to merge, but alters metadata and processes multiple merge targets
|
||||
//
|
||||
// Exactly one child of the requested layers may be merged using the "inverted" workflow,
|
||||
// where we merge it "down" into one of the "to-be-removed" layers and then rename the
|
||||
// "to-be-removed" layer to the child. It may be done either if all writers are stopped
|
||||
// before trying to delete layers (which is signaled by --writers-stopped) or if that child
|
||||
// is a read-only layer (snapshot) itself.
|
||||
//
|
||||
// This "inverted" workflow trades copying data of one of the deleted layers for copying
|
||||
// data of one child of the chain which is also a child of the "traded" layer. So we
|
||||
// choose the (parent,child) pair which has the largest difference between "parent" and
|
||||
// "child" inode sizes.
|
||||
//
|
||||
// All other children of the chain are processed by iterating though them, merging removed
|
||||
// parents into them and rebasing them to the last layer which isn't a member of the removed
|
||||
// chain.
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// <parent> - <from> - <layer 2> - <to> - <child 1>
|
||||
// \ \ \- <child 2>
|
||||
// \ \- <child 3>
|
||||
// \-<child 4>
|
||||
//
|
||||
// 1) Find optimal pair for the "reverse" scenario
|
||||
// Imagine that it's (<layer 2>, <child 1>) in this example
|
||||
// 2) Process all children except <child 1>:
|
||||
// - Merge <from>..<to> to <child 2>
|
||||
// - Set <child 2> parent to <parent>
|
||||
// - Repeat for others
|
||||
// 3) Process <child 1>:
|
||||
// - Merge <from>..<child 1> to <layer 2>
|
||||
// - Set <layer 2> parent to <parent>
|
||||
// - Rename <layer 2> to <child 1>
|
||||
// 4) Delete other layers of the chain (<from>, <to>)
|
||||
struct snap_remover_t
|
||||
{
|
||||
cli_tool_t *parent;
|
||||
|
||||
// remove from..to
|
||||
std::string from_name, to_name;
|
||||
// writers are stopped, we can safely change writable layers
|
||||
bool writers_stopped = false;
|
||||
// use CAS writes (0 = never, 1 = auto, 2 = always)
|
||||
int use_cas = 1;
|
||||
// interval between fsyncs
|
||||
int fsync_interval = 128;
|
||||
|
||||
std::map<inode_t,int> sources;
|
||||
std::map<inode_t,uint64_t> inode_used;
|
||||
std::vector<inode_t> merge_children;
|
||||
std::vector<inode_t> chain_list;
|
||||
std::map<inode_t,int> inverse_candidates;
|
||||
inode_t inverse_parent = 0, inverse_child = 0;
|
||||
inode_t new_parent = 0;
|
||||
int state = 0;
|
||||
int current_child = 0;
|
||||
std::function<bool(void)> cb;
|
||||
|
||||
bool is_done()
|
||||
{
|
||||
return state == 9;
|
||||
}
|
||||
|
||||
void loop()
|
||||
{
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
else if (state == 2)
|
||||
goto resume_2;
|
||||
else if (state == 3)
|
||||
goto resume_3;
|
||||
else if (state == 4)
|
||||
goto resume_4;
|
||||
else if (state == 5)
|
||||
goto resume_5;
|
||||
else if (state == 6)
|
||||
goto resume_6;
|
||||
else if (state == 7)
|
||||
goto resume_7;
|
||||
else if (state == 8)
|
||||
goto resume_8;
|
||||
else if (state == 9)
|
||||
goto resume_9;
|
||||
// Get children to merge
|
||||
get_merge_children();
|
||||
// Try to select an inode for the "inverse" optimized scenario
|
||||
// Read statistics from etcd to do it
|
||||
read_stats();
|
||||
state = 1;
|
||||
resume_1:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
choose_inverse_candidate();
|
||||
// Merge children one by one, except our "inverse" child
|
||||
for (current_child = 0; current_child < merge_children.size(); current_child++)
|
||||
{
|
||||
if (merge_children[current_child] == inverse_child)
|
||||
continue;
|
||||
start_merge_child(merge_children[current_child], merge_children[current_child]);
|
||||
resume_2:
|
||||
while (!cb())
|
||||
{
|
||||
state = 2;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
parent->change_parent(merge_children[current_child], new_parent);
|
||||
state = 3;
|
||||
resume_3:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
}
|
||||
// Merge our "inverse" child into our "inverse" parent
|
||||
if (inverse_child != 0)
|
||||
{
|
||||
start_merge_child(inverse_child, inverse_parent);
|
||||
resume_4:
|
||||
while (!cb())
|
||||
{
|
||||
state = 4;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
// Delete "inverse" child data
|
||||
start_delete_source(inverse_child);
|
||||
resume_5:
|
||||
while (!cb())
|
||||
{
|
||||
state = 5;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
// Delete "inverse" child metadata, rename parent over it,
|
||||
// and also change parent links of the previous "inverse" child
|
||||
rename_inverse_parent();
|
||||
state = 6;
|
||||
resume_6:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
}
|
||||
// Delete parents, except the "inverse" one
|
||||
for (current_child = 0; current_child < chain_list.size(); current_child++)
|
||||
{
|
||||
if (chain_list[current_child] == inverse_parent)
|
||||
continue;
|
||||
start_delete_source(chain_list[current_child]);
|
||||
resume_7:
|
||||
while (!cb())
|
||||
{
|
||||
state = 7;
|
||||
return;
|
||||
}
|
||||
cb = NULL;
|
||||
delete_inode_config(chain_list[current_child]);
|
||||
state = 8;
|
||||
resume_8:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
}
|
||||
state = 9;
|
||||
resume_9:
|
||||
// Done
|
||||
return;
|
||||
}
|
||||
|
||||
void get_merge_children()
|
||||
{
|
||||
// Get all children of from..to
|
||||
inode_config_t *from_cfg = parent->get_inode_cfg(from_name);
|
||||
inode_config_t *to_cfg = parent->get_inode_cfg(to_name);
|
||||
// Check that to_cfg is actually a child of from_cfg
|
||||
// FIXME de-copypaste the following piece of code with snap_merger_t
|
||||
inode_config_t *cur = to_cfg;
|
||||
chain_list.push_back(cur->num);
|
||||
while (cur->num != from_cfg->num && cur->parent_id != 0)
|
||||
{
|
||||
auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
|
||||
if (it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Parent inode of layer %s (id %ld) not found\n", cur->name.c_str(), cur->parent_id);
|
||||
exit(1);
|
||||
}
|
||||
cur = &it->second;
|
||||
chain_list.push_back(cur->num);
|
||||
}
|
||||
if (cur->num != from_cfg->num)
|
||||
{
|
||||
fprintf(stderr, "Layer %s is not a child of %s\n", to_name.c_str(), from_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
new_parent = from_cfg->parent_id;
|
||||
// Calculate ranks
|
||||
int i = chain_list.size()-1;
|
||||
for (inode_t item: chain_list)
|
||||
{
|
||||
sources[item] = i--;
|
||||
}
|
||||
for (auto & ic: parent->cli->st_cli.inode_config)
|
||||
{
|
||||
if (!ic.second.parent_id)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
auto it = sources.find(ic.second.parent_id);
|
||||
if (it != sources.end() && sources.find(ic.second.num) == sources.end())
|
||||
{
|
||||
merge_children.push_back(ic.second.num);
|
||||
if (ic.second.readonly || writers_stopped)
|
||||
{
|
||||
inverse_candidates[ic.second.num] = it->second;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void read_stats()
|
||||
{
|
||||
if (inverse_candidates.size() == 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
json11::Json::array reads;
|
||||
for (auto cp: inverse_candidates)
|
||||
{
|
||||
inode_t inode = cp.first;
|
||||
reads.push_back(json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/inode/stats/"+std::to_string(INODE_POOL(inode))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inode))
|
||||
) },
|
||||
} }
|
||||
});
|
||||
}
|
||||
for (auto cp: sources)
|
||||
{
|
||||
inode_t inode = cp.first;
|
||||
reads.push_back(json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/inode/stats/"+std::to_string(INODE_POOL(inode))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inode))
|
||||
) },
|
||||
} }
|
||||
});
|
||||
}
|
||||
parent->waiting++;
|
||||
parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
|
||||
{ "success", reads },
|
||||
}, [this](std::string err, json11::Json data)
|
||||
{
|
||||
parent->waiting--;
|
||||
if (err != "")
|
||||
{
|
||||
fprintf(stderr, "Error reading layer statistics from etcd: %s\n", err.c_str());
|
||||
exit(1);
|
||||
}
|
||||
for (auto inode_result: data["responses"].array_items())
|
||||
{
|
||||
auto kv = parent->cli->st_cli.parse_etcd_kv(inode_result["kvs"][0]);
|
||||
pool_id_t pool_id = 0;
|
||||
inode_t inode = 0;
|
||||
char null_byte = 0;
|
||||
sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode, &null_byte);
|
||||
if (!inode || null_byte != 0)
|
||||
{
|
||||
fprintf(stderr, "Bad key returned from etcd: %s\n", kv.key.c_str());
|
||||
exit(1);
|
||||
}
|
||||
auto pool_cfg_it = parent->cli->st_cli.pool_config.find(pool_id);
|
||||
if (pool_cfg_it == parent->cli->st_cli.pool_config.end())
|
||||
{
|
||||
fprintf(stderr, "Pool %u does not exist\n", pool_id);
|
||||
exit(1);
|
||||
}
|
||||
inode = INODE_WITH_POOL(pool_id, inode);
|
||||
auto & pool_cfg = pool_cfg_it->second;
|
||||
uint64_t used_bytes = kv.value["raw_used"].uint64_value() / pool_cfg.pg_size;
|
||||
if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
used_bytes *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
|
||||
}
|
||||
inode_used[inode] = used_bytes;
|
||||
}
|
||||
parent->ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
|
||||
void choose_inverse_candidate()
|
||||
{
|
||||
uint64_t max_diff = 0;
|
||||
for (auto cp: inverse_candidates)
|
||||
{
|
||||
inode_t child = cp.first;
|
||||
uint64_t child_used = inode_used[child];
|
||||
int rank = cp.second;
|
||||
for (int i = chain_list.size()-rank; i < chain_list.size(); i++)
|
||||
{
|
||||
inode_t parent = chain_list[i];
|
||||
uint64_t parent_used = inode_used[parent];
|
||||
if (parent_used > child_used && (!max_diff || max_diff < (parent_used-child_used)))
|
||||
{
|
||||
max_diff = (parent_used-child_used);
|
||||
inverse_parent = parent;
|
||||
inverse_child = child;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void rename_inverse_parent()
|
||||
{
|
||||
auto child_it = parent->cli->st_cli.inode_config.find(inverse_child);
|
||||
if (child_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Inode %ld disappeared\n", inverse_child);
|
||||
exit(1);
|
||||
}
|
||||
auto target_it = parent->cli->st_cli.inode_config.find(inverse_parent);
|
||||
if (target_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Inode %ld disappeared\n", inverse_parent);
|
||||
exit(1);
|
||||
}
|
||||
inode_config_t *child_cfg = &child_it->second;
|
||||
inode_config_t *target_cfg = &target_it->second;
|
||||
std::string child_name = child_cfg->name;
|
||||
std::string target_name = target_cfg->name;
|
||||
std::string child_cfg_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(inverse_child))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inverse_child))
|
||||
);
|
||||
std::string target_cfg_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(inverse_parent))+
|
||||
"/"+std::to_string(INODE_NO_POOL(inverse_parent))
|
||||
);
|
||||
// Fill new configuration
|
||||
inode_config_t new_cfg = *child_cfg;
|
||||
new_cfg.num = target_cfg->num;
|
||||
new_cfg.parent_id = new_parent;
|
||||
json11::Json::array cmp = json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", child_cfg_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", child_cfg->mod_revision+1 },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", target_cfg_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", target_cfg->mod_revision+1 },
|
||||
},
|
||||
};
|
||||
json11::Json::array txn = json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "request_delete_range", json11::Json::object {
|
||||
{ "key", child_cfg_key },
|
||||
} },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", target_cfg_key },
|
||||
{ "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&new_cfg)).dump()) },
|
||||
} },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+child_cfg->name) },
|
||||
{ "value", base64_encode(json11::Json({
|
||||
{ "id", INODE_NO_POOL(inverse_parent) },
|
||||
{ "pool_id", (uint64_t)INODE_POOL(inverse_parent) },
|
||||
}).dump()) },
|
||||
} },
|
||||
},
|
||||
};
|
||||
// Reparent children of inverse_child
|
||||
for (auto & cp: parent->cli->st_cli.inode_config)
|
||||
{
|
||||
if (cp.second.parent_id == child_cfg->num)
|
||||
{
|
||||
auto cp_cfg = cp.second;
|
||||
cp_cfg.parent_id = inverse_parent;
|
||||
auto cp_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(cp.second.num))+
|
||||
"/"+std::to_string(INODE_NO_POOL(cp.second.num))
|
||||
);
|
||||
cmp.push_back(json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", cp_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", cp.second.mod_revision+1 },
|
||||
});
|
||||
txn.push_back(json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", cp_key },
|
||||
{ "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&cp_cfg)).dump()) },
|
||||
} },
|
||||
});
|
||||
}
|
||||
}
|
||||
parent->waiting++;
|
||||
parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
|
||||
{ "compare", cmp },
|
||||
{ "success", txn },
|
||||
}, [this, target_name, child_name](std::string err, json11::Json res)
|
||||
{
|
||||
parent->waiting--;
|
||||
if (err != "")
|
||||
{
|
||||
fprintf(stderr, "Error renaming %s to %s: %s\n", target_name.c_str(), child_name.c_str(), err.c_str());
|
||||
exit(1);
|
||||
}
|
||||
if (!res["succeeded"].bool_value())
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Parent (%s), child (%s), or one of its children"
|
||||
" configuration was modified during rename\n", target_name.c_str(), child_name.c_str()
|
||||
);
|
||||
exit(1);
|
||||
}
|
||||
printf("Layer %s renamed to %s\n", target_name.c_str(), child_name.c_str());
|
||||
parent->ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
|
||||
void delete_inode_config(inode_t cur)
|
||||
{
|
||||
auto cur_cfg_it = parent->cli->st_cli.inode_config.find(cur);
|
||||
if (cur_cfg_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Inode 0x%lx disappeared\n", cur);
|
||||
exit(1);
|
||||
}
|
||||
inode_config_t *cur_cfg = &cur_cfg_it->second;
|
||||
std::string cur_name = cur_cfg->name;
|
||||
std::string cur_cfg_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+
|
||||
"/config/inode/"+std::to_string(INODE_POOL(cur))+
|
||||
"/"+std::to_string(INODE_NO_POOL(cur))
|
||||
);
|
||||
parent->waiting++;
|
||||
parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
|
||||
{ "compare", json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", cur_cfg_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", cur_cfg->mod_revision+1 },
|
||||
},
|
||||
} },
|
||||
{ "success", json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "request_delete_range", json11::Json::object {
|
||||
{ "key", cur_cfg_key },
|
||||
} },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "request_delete_range", json11::Json::object {
|
||||
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+cur_name) },
|
||||
} },
|
||||
},
|
||||
} },
|
||||
}, [this, cur_name](std::string err, json11::Json res)
|
||||
{
|
||||
parent->waiting--;
|
||||
if (err != "")
|
||||
{
|
||||
fprintf(stderr, "Error deleting %s: %s\n", cur_name.c_str(), err.c_str());
|
||||
exit(1);
|
||||
}
|
||||
if (!res["succeeded"].bool_value())
|
||||
{
|
||||
fprintf(stderr, "Layer %s configuration was modified during deletion\n", cur_name.c_str());
|
||||
exit(1);
|
||||
}
|
||||
printf("Layer %s deleted\n", cur_name.c_str());
|
||||
parent->ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
|
||||
void start_merge_child(inode_t child_inode, inode_t target_inode)
|
||||
{
|
||||
auto child_it = parent->cli->st_cli.inode_config.find(child_inode);
|
||||
if (child_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Inode %ld disappeared\n", child_inode);
|
||||
exit(1);
|
||||
}
|
||||
auto target_it = parent->cli->st_cli.inode_config.find(target_inode);
|
||||
if (target_it == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Inode %ld disappeared\n", target_inode);
|
||||
exit(1);
|
||||
}
|
||||
cb = parent->start_merge(json11::Json::object {
|
||||
{ "command", json11::Json::array{ "merge-data", from_name, child_it->second.name } },
|
||||
{ "target", target_it->second.name },
|
||||
{ "delete-source", false },
|
||||
{ "cas", use_cas },
|
||||
{ "fsync-interval", fsync_interval },
|
||||
});
|
||||
}
|
||||
|
||||
void start_delete_source(inode_t inode)
|
||||
{
|
||||
auto source = parent->cli->st_cli.inode_config.find(inode);
|
||||
if (source == parent->cli->st_cli.inode_config.end())
|
||||
{
|
||||
fprintf(stderr, "Inode %ld disappeared\n", inode);
|
||||
exit(1);
|
||||
}
|
||||
cb = parent->start_rm(json11::Json::object {
|
||||
{ "inode", inode },
|
||||
{ "pool", (uint64_t)INODE_POOL(inode) },
|
||||
{ "fsync-interval", fsync_interval },
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
std::function<bool(void)> cli_tool_t::start_snap_rm(json11::Json cfg)
|
||||
{
|
||||
json11::Json::array cmd = cfg["command"].array_items();
|
||||
auto snap_remover = new snap_remover_t();
|
||||
snap_remover->parent = this;
|
||||
snap_remover->from_name = cmd.size() > 1 ? cmd[1].string_value() : "";
|
||||
snap_remover->to_name = cmd.size() > 2 ? cmd[2].string_value() : "";
|
||||
if (snap_remover->from_name == "")
|
||||
{
|
||||
fprintf(stderr, "Layer to remove argument is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
if (snap_remover->to_name == "")
|
||||
{
|
||||
snap_remover->to_name = snap_remover->from_name;
|
||||
}
|
||||
snap_remover->fsync_interval = cfg["fsync-interval"].uint64_value();
|
||||
if (!snap_remover->fsync_interval)
|
||||
snap_remover->fsync_interval = 128;
|
||||
if (!cfg["cas"].is_null())
|
||||
snap_remover->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
|
||||
if (!cfg["writers_stopped"].is_null())
|
||||
snap_remover->writers_stopped = true;
|
||||
return [snap_remover]()
|
||||
{
|
||||
snap_remover->loop();
|
||||
if (snap_remover->is_done())
|
||||
{
|
||||
delete snap_remover;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
}
|
@@ -1,301 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "cli.h"
|
||||
#include "cluster_client.h"
|
||||
#include "base64.h"
|
||||
#include "pg_states.h"
|
||||
|
||||
// Print cluster status:
|
||||
// etcd, mon, osd states
|
||||
// raw/used space, object states, pool states, pg states
|
||||
// client io, recovery io, rebalance io
|
||||
struct status_printer_t
|
||||
{
|
||||
cli_tool_t *parent;
|
||||
|
||||
int state = 0;
|
||||
json11::Json::array mon_members, osd_stats;
|
||||
json11::Json agg_stats;
|
||||
std::map<pool_id_t, json11::Json::object> pool_stats;
|
||||
json11::Json::array etcd_states;
|
||||
|
||||
bool is_done()
|
||||
{
|
||||
return state == 100;
|
||||
}
|
||||
|
||||
void loop()
|
||||
{
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
else if (state == 2)
|
||||
goto resume_2;
|
||||
// etcd states
|
||||
{
|
||||
auto addrs = parent->cli->st_cli.get_addresses();
|
||||
etcd_states.resize(addrs.size());
|
||||
for (int i = 0; i < etcd_states.size(); i++)
|
||||
{
|
||||
parent->waiting++;
|
||||
parent->cli->st_cli.etcd_call_oneshot(
|
||||
addrs[i], "/maintenance/status", json11::Json::object(),
|
||||
parent->cli->st_cli.etcd_quick_timeout, [this, i](std::string err, json11::Json res)
|
||||
{
|
||||
parent->waiting--;
|
||||
etcd_states[i] = err != "" ? json11::Json::object{ { "error", err } } : res;
|
||||
parent->ringloop->wakeup();
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
state = 1;
|
||||
resume_1:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
// Monitors, OSD states
|
||||
parent->etcd_txn(json11::Json::object {
|
||||
{ "success", json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/mon/") },
|
||||
{ "range_end", base64_encode(parent->cli->st_cli.etcd_prefix+"/mon0") },
|
||||
} },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+"/osd/stats/"
|
||||
) },
|
||||
{ "range_end", base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+"/osd/stats0"
|
||||
) },
|
||||
} },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/stats") },
|
||||
} },
|
||||
},
|
||||
} },
|
||||
});
|
||||
state = 2;
|
||||
resume_2:
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
if (parent->etcd_err.err)
|
||||
{
|
||||
fprintf(stderr, "%s\n", parent->etcd_err.text.c_str());
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
mon_members = parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items();
|
||||
osd_stats = parent->etcd_result["responses"][1]["response_range"]["kvs"].array_items();
|
||||
if (parent->etcd_result["responses"][2]["response_range"]["kvs"].array_items().size() > 0)
|
||||
{
|
||||
agg_stats = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][2]["response_range"]["kvs"][0]).value;
|
||||
}
|
||||
int etcd_alive = 0;
|
||||
uint64_t etcd_db_size = 0;
|
||||
std::string etcd_detail;
|
||||
for (int i = 0; i < etcd_states.size(); i++)
|
||||
{
|
||||
if (etcd_states[i]["error"].is_null())
|
||||
{
|
||||
etcd_alive++;
|
||||
etcd_db_size = etcd_states[i]["dbSizeInUse"].uint64_value();
|
||||
}
|
||||
}
|
||||
int mon_count = 0;
|
||||
std::string mon_master;
|
||||
for (int i = 0; i < mon_members.size(); i++)
|
||||
{
|
||||
auto kv = parent->cli->st_cli.parse_etcd_kv(mon_members[i]);
|
||||
kv.key = kv.key.substr(parent->cli->st_cli.etcd_prefix.size());
|
||||
if (kv.key.substr(0, 12) == "/mon/member/")
|
||||
mon_count++;
|
||||
else if (kv.key == "/mon/master")
|
||||
{
|
||||
if (kv.value["hostname"].is_string())
|
||||
mon_master = kv.value["hostname"].string_value();
|
||||
else
|
||||
mon_master = kv.value["ip"][0].string_value();
|
||||
}
|
||||
}
|
||||
int osd_count = 0, osd_up = 0;
|
||||
uint64_t total_raw = 0, free_raw = 0, free_down_raw = 0, down_raw = 0;
|
||||
for (int i = 0; i < osd_stats.size(); i++)
|
||||
{
|
||||
auto kv = parent->cli->st_cli.parse_etcd_kv(osd_stats[i]);
|
||||
osd_num_t stat_osd_num = 0;
|
||||
char null_byte = 0;
|
||||
sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.size(), "/osd/stats/%lu%c", &stat_osd_num, &null_byte);
|
||||
if (!stat_osd_num || null_byte != 0)
|
||||
{
|
||||
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
|
||||
continue;
|
||||
}
|
||||
osd_count++;
|
||||
total_raw += kv.value["size"].uint64_value();
|
||||
free_raw += kv.value["free"].uint64_value();
|
||||
auto peer_it = parent->cli->st_cli.peer_states.find(stat_osd_num);
|
||||
if (peer_it != parent->cli->st_cli.peer_states.end())
|
||||
{
|
||||
osd_up++;
|
||||
}
|
||||
else
|
||||
{
|
||||
down_raw += kv.value["size"].uint64_value();
|
||||
free_down_raw += kv.value["free"].uint64_value();
|
||||
}
|
||||
}
|
||||
int pool_count = 0, pools_active = 0;
|
||||
std::map<std::string, int> pgs_by_state;
|
||||
std::string pgs_by_state_str;
|
||||
for (auto & pool_pair: parent->cli->st_cli.pool_config)
|
||||
{
|
||||
auto & pool_cfg = pool_pair.second;
|
||||
bool active = true;
|
||||
if (pool_cfg.pg_config.size() != pool_cfg.pg_count)
|
||||
{
|
||||
active = false;
|
||||
pgs_by_state["offline"] += pool_cfg.pg_count-pool_cfg.pg_config.size();
|
||||
}
|
||||
pool_count++;
|
||||
for (auto pg_it = pool_cfg.pg_config.begin(); pg_it != pool_cfg.pg_config.end(); pg_it++)
|
||||
{
|
||||
if (!(pg_it->second.cur_state & PG_ACTIVE))
|
||||
{
|
||||
active = false;
|
||||
}
|
||||
std::string pg_state_str;
|
||||
for (int i = 0; i < pg_state_bit_count; i++)
|
||||
{
|
||||
if (pg_it->second.cur_state & pg_state_bits[i])
|
||||
{
|
||||
pg_state_str += "+";
|
||||
pg_state_str += pg_state_names[i];
|
||||
}
|
||||
}
|
||||
if (pg_state_str.size())
|
||||
pgs_by_state[pg_state_str.substr(1)]++;
|
||||
else
|
||||
pgs_by_state["offline"]++;
|
||||
}
|
||||
if (active)
|
||||
{
|
||||
pools_active++;
|
||||
}
|
||||
}
|
||||
for (auto & kv: pgs_by_state)
|
||||
{
|
||||
if (pgs_by_state_str.size())
|
||||
{
|
||||
pgs_by_state_str += "\n ";
|
||||
}
|
||||
pgs_by_state_str += std::to_string(kv.second)+" "+kv.first;
|
||||
}
|
||||
uint64_t object_size = parent->cli->get_bs_block_size();
|
||||
std::string more_states;
|
||||
uint64_t obj_n;
|
||||
obj_n = agg_stats["object_counts"]["misplaced"].uint64_value();
|
||||
if (obj_n > 0)
|
||||
more_states += ", "+format_size(obj_n*object_size)+" misplaced";
|
||||
obj_n = agg_stats["object_counts"]["degraded"].uint64_value();
|
||||
if (obj_n > 0)
|
||||
more_states += ", "+format_size(obj_n*object_size)+" degraded";
|
||||
obj_n = agg_stats["object_counts"]["incomplete"].uint64_value();
|
||||
if (obj_n > 0)
|
||||
more_states += ", "+format_size(obj_n*object_size)+" incomplete";
|
||||
std::string recovery_io;
|
||||
{
|
||||
uint64_t deg_bps = agg_stats["recovery_stats"]["degraded"]["bps"].uint64_value();
|
||||
uint64_t deg_iops = agg_stats["recovery_stats"]["degraded"]["iops"].uint64_value();
|
||||
uint64_t misp_bps = agg_stats["recovery_stats"]["misplaced"]["bps"].uint64_value();
|
||||
uint64_t misp_iops = agg_stats["recovery_stats"]["misplaced"]["iops"].uint64_value();
|
||||
if (deg_iops > 0 || deg_bps > 0)
|
||||
recovery_io += " recovery: "+format_size(deg_bps)+"/s, "+format_size(deg_iops, true)+" op/s\n";
|
||||
if (misp_iops > 0 || misp_bps > 0)
|
||||
recovery_io += " rebalance: "+format_size(misp_bps)+"/s, "+format_size(misp_iops, true)+" op/s\n";
|
||||
}
|
||||
if (parent->json_output)
|
||||
{
|
||||
// JSON output
|
||||
printf("%s\n", json11::Json(json11::Json::object {
|
||||
{ "etcd_alive", etcd_alive },
|
||||
{ "etcd_count", (uint64_t)etcd_states.size() },
|
||||
{ "etcd_db_size", etcd_db_size },
|
||||
{ "mon_count", mon_count },
|
||||
{ "mon_master", mon_master },
|
||||
{ "osd_up", osd_up },
|
||||
{ "osd_count", osd_count },
|
||||
{ "total_raw", total_raw },
|
||||
{ "free_raw", free_raw },
|
||||
{ "down_raw", down_raw },
|
||||
{ "free_down_raw", free_down_raw },
|
||||
{ "clean_data", agg_stats["object_counts"]["clean"].uint64_value() * object_size },
|
||||
{ "misplaced_data", agg_stats["object_counts"]["misplaced"].uint64_value() * object_size },
|
||||
{ "degraded_data", agg_stats["object_counts"]["degraded"].uint64_value() * object_size },
|
||||
{ "incomplete_data", agg_stats["object_counts"]["incomplete"].uint64_value() * object_size },
|
||||
{ "pool_count", pool_count },
|
||||
{ "active_pool_count", pools_active },
|
||||
{ "pg_states", pgs_by_state },
|
||||
{ "op_stats", agg_stats["op_stats"] },
|
||||
{ "recovery_stats", agg_stats["recovery_stats"] },
|
||||
{ "object_counts", agg_stats["object_counts"] },
|
||||
}).dump().c_str());
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
printf(
|
||||
" cluster:\n"
|
||||
" etcd: %d / %ld up, %s database size\n"
|
||||
" mon: %d up%s\n"
|
||||
" osd: %d / %d up\n"
|
||||
" \n"
|
||||
" data:\n"
|
||||
" raw: %s used, %s / %s available%s\n"
|
||||
" state: %s clean%s\n"
|
||||
" pools: %d / %d active\n"
|
||||
" pgs: %s\n"
|
||||
" \n"
|
||||
" io:\n"
|
||||
" client:%s %s/s rd, %s op/s rd, %s/s wr, %s op/s wr\n"
|
||||
"%s",
|
||||
etcd_alive, etcd_states.size(), format_size(etcd_db_size).c_str(),
|
||||
mon_count, mon_master == "" ? "" : (", master "+mon_master).c_str(),
|
||||
osd_up, osd_count,
|
||||
format_size(total_raw-free_raw).c_str(),
|
||||
format_size(free_raw-free_down_raw).c_str(),
|
||||
format_size(total_raw-down_raw).c_str(),
|
||||
(down_raw > 0 ? (", "+format_size(down_raw)+" down").c_str() : ""),
|
||||
format_size(agg_stats["object_counts"]["clean"].uint64_value() * object_size).c_str(), more_states.c_str(),
|
||||
pools_active, pool_count,
|
||||
pgs_by_state_str.c_str(),
|
||||
recovery_io.size() > 0 ? " " : "",
|
||||
format_size(agg_stats["op_stats"]["primary_read"]["bps"].uint64_value()).c_str(),
|
||||
format_size(agg_stats["op_stats"]["primary_read"]["iops"].uint64_value(), true).c_str(),
|
||||
format_size(agg_stats["op_stats"]["primary_write"]["bps"].uint64_value()).c_str(),
|
||||
format_size(agg_stats["op_stats"]["primary_write"]["iops"].uint64_value(), true).c_str(),
|
||||
recovery_io.c_str()
|
||||
);
|
||||
state = 100;
|
||||
}
|
||||
};
|
||||
|
||||
std::function<bool(cli_result_t &)> cli_tool_t::start_status(json11::Json cfg)
|
||||
{
|
||||
auto printer = new status_printer_t();
|
||||
printer->parent = this;
|
||||
return [printer](cli_result_t & result)
|
||||
{
|
||||
printer->loop();
|
||||
if (printer->is_done())
|
||||
{
|
||||
result = { .err = 0 };
|
||||
delete printer;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
}
|
@@ -9,7 +9,6 @@
|
||||
#define PART_SENT 1
|
||||
#define PART_DONE 2
|
||||
#define PART_ERROR 4
|
||||
#define PART_RETRY 8
|
||||
#define CACHE_DIRTY 1
|
||||
#define CACHE_FLUSHING 2
|
||||
#define CACHE_REPEATING 3
|
||||
@@ -144,7 +143,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
|
||||
}
|
||||
else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) */
|
||||
{
|
||||
for (auto prev = op_queue_head; prev && prev != op; prev = prev->next)
|
||||
for (auto prev = op->prev; prev; prev = prev->prev)
|
||||
{
|
||||
if (prev->opcode == OSD_OP_WRITE && prev->flags & OP_FLUSH_BUFFER)
|
||||
{
|
||||
@@ -152,7 +151,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
|
||||
}
|
||||
else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ || prev->opcode == OSD_OP_READ_BITMAP)
|
||||
{
|
||||
// Flushes are always in the beginning (we're scanning from the beginning of the queue)
|
||||
// Flushes are always in the beginning
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -173,7 +172,6 @@ void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *n
|
||||
(next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP) && (flags & OP_FLUSH_BUFFER))
|
||||
{
|
||||
next->prev_wait += inc;
|
||||
assert(next->prev_wait >= 0);
|
||||
if (!next->prev_wait)
|
||||
{
|
||||
if (next->opcode == OSD_OP_SYNC)
|
||||
@@ -193,7 +191,6 @@ void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *n
|
||||
if (next->opcode == OSD_OP_SYNC || next->opcode == OSD_OP_WRITE)
|
||||
{
|
||||
next->prev_wait += inc;
|
||||
assert(next->prev_wait >= 0);
|
||||
if (!next->prev_wait)
|
||||
{
|
||||
if (next->opcode == OSD_OP_SYNC)
|
||||
@@ -374,11 +371,6 @@ void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes
|
||||
continue_ops();
|
||||
}
|
||||
|
||||
bool cluster_client_t::get_immediate_commit()
|
||||
{
|
||||
return immediate_commit;
|
||||
}
|
||||
|
||||
void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd)
|
||||
{
|
||||
if (msgr.wanted_peers.find(peer_osd) != msgr.wanted_peers.end())
|
||||
@@ -676,17 +668,14 @@ resume_2:
|
||||
if (!try_send(op, i))
|
||||
{
|
||||
// We'll need to retry again
|
||||
if (op->parts[i].flags & PART_RETRY)
|
||||
op->up_wait = true;
|
||||
if (!retry_timeout_id)
|
||||
{
|
||||
op->up_wait = true;
|
||||
if (!retry_timeout_id)
|
||||
retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
|
||||
{
|
||||
retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
|
||||
{
|
||||
retry_timeout_id = 0;
|
||||
continue_ops(true);
|
||||
});
|
||||
}
|
||||
retry_timeout_id = 0;
|
||||
continue_ops(true);
|
||||
});
|
||||
}
|
||||
op->state = 2;
|
||||
}
|
||||
@@ -755,7 +744,7 @@ resume_3:
|
||||
{
|
||||
for (int i = 0; i < op->parts.size(); i++)
|
||||
{
|
||||
op->parts[i].flags = PART_RETRY;
|
||||
op->parts[i].flags = 0;
|
||||
}
|
||||
goto resume_2;
|
||||
}
|
||||
|
@@ -118,8 +118,6 @@ public:
|
||||
bool is_ready();
|
||||
void on_ready(std::function<void(void)> fn);
|
||||
|
||||
bool get_immediate_commit();
|
||||
|
||||
static void copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers);
|
||||
void continue_ops(bool up_retry = false);
|
||||
inode_list_t *list_inode_start(inode_t inode,
|
||||
|
@@ -200,8 +200,7 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
|
||||
auto & pool_cfg = st_cli.pool_config[cur_list->pg->lst->pool_id];
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
// Already checked that it exists above, but anyway
|
||||
op->peer_fd = msgr.osd_peer_fds.at(cur_list->osd_num);
|
||||
op->peer_fd = msgr.osd_peer_fds[cur_list->osd_num];
|
||||
op->req = (osd_any_op_t){
|
||||
.sec_list = {
|
||||
.header = {
|
||||
|
@@ -64,42 +64,6 @@ void etcd_state_client_t::etcd_txn_slow(json11::Json txn, std::function<void(std
|
||||
etcd_call("/kv/txn", txn, etcd_slow_timeout, max_etcd_attempts, 0, callback);
|
||||
}
|
||||
|
||||
std::vector<std::string> etcd_state_client_t::get_addresses()
|
||||
{
|
||||
auto addrs = etcd_local;
|
||||
addrs.insert(addrs.end(), etcd_addresses.begin(), etcd_addresses.end());
|
||||
return addrs;
|
||||
}
|
||||
|
||||
void etcd_state_client_t::etcd_call_oneshot(std::string etcd_address, std::string api, json11::Json payload,
|
||||
int timeout, std::function<void(std::string, json11::Json)> callback)
|
||||
{
|
||||
std::string etcd_api_path;
|
||||
int pos = etcd_address.find('/');
|
||||
if (pos >= 0)
|
||||
{
|
||||
etcd_api_path = etcd_address.substr(pos);
|
||||
etcd_address = etcd_address.substr(0, pos);
|
||||
}
|
||||
std::string req = payload.dump();
|
||||
req = "POST "+etcd_api_path+api+" HTTP/1.1\r\n"
|
||||
"Host: "+etcd_address+"\r\n"
|
||||
"Content-Type: application/json\r\n"
|
||||
"Content-Length: "+std::to_string(req.size())+"\r\n"
|
||||
"Connection: close\r\n"
|
||||
"\r\n"+req;
|
||||
auto http_cli = http_init(tfd);
|
||||
auto cb = [http_cli, callback](const http_response_t *response)
|
||||
{
|
||||
std::string err;
|
||||
json11::Json data;
|
||||
response->parse_json_response(err, data);
|
||||
callback(err, data);
|
||||
http_close(http_cli);
|
||||
};
|
||||
http_request(http_cli, etcd_address, req, { .timeout = timeout }, cb);
|
||||
}
|
||||
|
||||
void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int timeout,
|
||||
int retries, int interval, std::function<void(std::string, json11::Json)> callback)
|
||||
{
|
||||
@@ -338,14 +302,9 @@ void etcd_state_client_t::start_etcd_watcher()
|
||||
{
|
||||
if (data["result"]["created"].bool_value())
|
||||
{
|
||||
uint64_t watch_id = data["result"]["watch_id"].uint64_value();
|
||||
if (watch_id == ETCD_CONFIG_WATCH_ID ||
|
||||
watch_id == ETCD_PG_STATE_WATCH_ID ||
|
||||
watch_id == ETCD_PG_HISTORY_WATCH_ID ||
|
||||
watch_id == ETCD_OSD_STATE_WATCH_ID)
|
||||
etcd_watches_initialised++;
|
||||
if (etcd_watches_initialised == 4 && this->log_level > 0)
|
||||
if (etcd_watches_initialised == 3 && this->log_level > 0)
|
||||
fprintf(stderr, "Successfully subscribed to etcd at %s\n", selected_etcd_address.c_str());
|
||||
etcd_watches_initialised++;
|
||||
}
|
||||
if (data["result"]["canceled"].bool_value())
|
||||
{
|
||||
@@ -474,10 +433,6 @@ void etcd_state_client_t::start_etcd_watcher()
|
||||
{ "progress_notify", true },
|
||||
} }
|
||||
}).dump());
|
||||
if (on_start_watcher_hook)
|
||||
{
|
||||
on_start_watcher_hook(etcd_watch_ws);
|
||||
}
|
||||
if (ws_keepalive_timer < 0)
|
||||
{
|
||||
ws_keepalive_timer = tfd->set_timer(etcd_ws_keepalive_interval*1000, true, [this](int)
|
||||
@@ -963,10 +918,6 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||
}
|
||||
if (!value.is_object())
|
||||
{
|
||||
if (on_inode_change_hook != NULL)
|
||||
{
|
||||
on_inode_change_hook(inode_num, true);
|
||||
}
|
||||
this->inode_config.erase(inode_num);
|
||||
}
|
||||
else
|
||||
@@ -981,47 +932,38 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Inode %lu/%lu parent_pool value is invalid, ignoring parent setting\n",
|
||||
inode_num >> (64-POOL_ID_BITS), inode_num & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)
|
||||
inode_num >> (64-POOL_ID_BITS), inode_num & ((1l << (64-POOL_ID_BITS)) - 1)
|
||||
);
|
||||
parent_inode_num = 0;
|
||||
}
|
||||
else
|
||||
parent_inode_num |= parent_pool_id << (64-POOL_ID_BITS);
|
||||
}
|
||||
insert_inode_config((inode_config_t){
|
||||
inode_config_t cfg = (inode_config_t){
|
||||
.num = inode_num,
|
||||
.name = value["name"].string_value(),
|
||||
.size = value["size"].uint64_value(),
|
||||
.parent_id = parent_inode_num,
|
||||
.readonly = value["readonly"].bool_value(),
|
||||
.meta = value["meta"],
|
||||
.mod_revision = kv.mod_revision,
|
||||
});
|
||||
};
|
||||
this->inode_config[inode_num] = cfg;
|
||||
if (cfg.name != "")
|
||||
{
|
||||
this->inode_by_name[cfg.name] = inode_num;
|
||||
for (auto w: watches)
|
||||
{
|
||||
if (w->name == value["name"].string_value())
|
||||
{
|
||||
w->cfg = cfg;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void etcd_state_client_t::insert_inode_config(const inode_config_t & cfg)
|
||||
{
|
||||
this->inode_config[cfg.num] = cfg;
|
||||
if (cfg.name != "")
|
||||
{
|
||||
this->inode_by_name[cfg.name] = cfg.num;
|
||||
for (auto w: watches)
|
||||
{
|
||||
if (w->name == cfg.name)
|
||||
{
|
||||
w->cfg = cfg;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (on_inode_change_hook != NULL)
|
||||
{
|
||||
on_inode_change_hook(cfg.num, false);
|
||||
}
|
||||
}
|
||||
|
||||
inode_watch_t* etcd_state_client_t::watch_inode(std::string name)
|
||||
{
|
||||
inode_watch_t *watch = new inode_watch_t;
|
||||
@@ -1064,10 +1006,6 @@ json11::Json::object etcd_state_client_t::serialize_inode_cfg(inode_config_t *cf
|
||||
{
|
||||
new_cfg["readonly"] = true;
|
||||
}
|
||||
if (cfg->meta.is_object())
|
||||
{
|
||||
new_cfg["meta"] = cfg->meta;
|
||||
}
|
||||
return new_cfg;
|
||||
}
|
||||
|
||||
|
@@ -56,8 +56,6 @@ struct inode_config_t
|
||||
uint64_t size;
|
||||
inode_t parent_id;
|
||||
bool readonly;
|
||||
// Arbitrary metadata
|
||||
json11::Json meta;
|
||||
// Change revision of the metadata in etcd
|
||||
uint64_t mod_revision;
|
||||
};
|
||||
@@ -111,13 +109,9 @@ public:
|
||||
std::function<void(pool_id_t, pg_num_t)> on_change_pg_history_hook;
|
||||
std::function<void(osd_num_t)> on_change_osd_state_hook;
|
||||
std::function<void()> on_reload_hook;
|
||||
std::function<void(inode_t, bool)> on_inode_change_hook;
|
||||
std::function<void(http_co_t *)> on_start_watcher_hook;
|
||||
|
||||
json11::Json::object serialize_inode_cfg(inode_config_t *cfg);
|
||||
etcd_kv_t parse_etcd_kv(const json11::Json & kv_json);
|
||||
std::vector<std::string> get_addresses();
|
||||
void etcd_call_oneshot(std::string etcd_address, std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback);
|
||||
void etcd_call(std::string api, json11::Json payload, int timeout, int retries, int interval, std::function<void(std::string, json11::Json)> callback);
|
||||
void etcd_txn(json11::Json txn, int timeout, int retries, int interval, std::function<void(std::string, json11::Json)> callback);
|
||||
void etcd_txn_slow(json11::Json txn, std::function<void(std::string, json11::Json)> callback);
|
||||
@@ -126,7 +120,6 @@ public:
|
||||
void load_pgs();
|
||||
void parse_state(const etcd_kv_t & kv);
|
||||
void parse_config(const json11::Json & config);
|
||||
void insert_inode_config(const inode_config_t & cfg);
|
||||
inode_watch_t* watch_inode(std::string name);
|
||||
void close_watch(inode_watch_t* watch);
|
||||
int address_count();
|
||||
|
@@ -214,14 +214,14 @@ static int sec_setup(struct thread_data *td)
|
||||
|
||||
if (!o->image)
|
||||
{
|
||||
if (!(o->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)))
|
||||
if (!(o->inode & ((1l << (64-POOL_ID_BITS)) - 1)))
|
||||
{
|
||||
td_verror(td, EINVAL, "inode number is missing");
|
||||
return 1;
|
||||
}
|
||||
if (o->pool)
|
||||
{
|
||||
o->inode = (o->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (o->pool << (64-POOL_ID_BITS));
|
||||
o->inode = (o->inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (o->pool << (64-POOL_ID_BITS));
|
||||
}
|
||||
if (!(o->inode >> (64-POOL_ID_BITS)))
|
||||
{
|
||||
@@ -351,9 +351,9 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("+++ %s 0x%lx 0x%llx+%lx\n",
|
||||
printf("+++ %s 0x%lx 0x%llx+%llx\n",
|
||||
io->ddir == DDIR_READ ? "READ" : "WRITE",
|
||||
(uint64_t)io, io->offset, (uint64_t)io->xfer_buflen);
|
||||
(uint64_t)io, io->offset, io->xfer_buflen);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -170,14 +170,14 @@ static int sec_init(struct thread_data *td)
|
||||
bsd->block_order = o->block_order == 0 ? 17 : o->block_order;
|
||||
bsd->block_size = 1 << o->block_order;
|
||||
|
||||
sockaddr_storage addr;
|
||||
sockaddr addr;
|
||||
if (!string_to_addr(std::string(o->host ? o->host : "127.0.0.1"), false, o->port > 0 ? o->port : 11203, &addr))
|
||||
{
|
||||
fprintf(stderr, "server address: %s is not valid\n", o->host ? o->host : "127.0.0.1");
|
||||
return 1;
|
||||
}
|
||||
|
||||
bsd->connect_fd = socket(addr.ss_family, SOCK_STREAM, 0);
|
||||
bsd->connect_fd = socket(addr.sa_family, SOCK_STREAM, 0);
|
||||
if (bsd->connect_fd < 0)
|
||||
{
|
||||
perror("socket");
|
||||
@@ -355,7 +355,7 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
|
||||
{
|
||||
if (reply.hdr.retval != io->xfer_buflen)
|
||||
{
|
||||
fprintf(stderr, "Short read: retval = %ld instead of %lu\n", reply.hdr.retval, (uint64_t)io->xfer_buflen);
|
||||
fprintf(stderr, "Short read: retval = %ld instead of %llu\n", reply.hdr.retval, io->xfer_buflen);
|
||||
exit(1);
|
||||
}
|
||||
// Support bitmap
|
||||
@@ -380,7 +380,7 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
|
||||
{
|
||||
if (reply.hdr.retval != io->xfer_buflen)
|
||||
{
|
||||
fprintf(stderr, "Short write: retval = %ld instead of %lu\n", reply.hdr.retval, (uint64_t)io->xfer_buflen);
|
||||
fprintf(stderr, "Short write: retval = %ld instead of %llu\n", reply.hdr.retval, io->xfer_buflen);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
@@ -62,10 +62,9 @@ struct http_co_t
|
||||
void run_cb_and_clear();
|
||||
void start_connection();
|
||||
void close_connection();
|
||||
void next_request();
|
||||
void handle_events();
|
||||
void handle_connect_result();
|
||||
void submit_read(bool check_timeout);
|
||||
void submit_read();
|
||||
void submit_send();
|
||||
bool handle_read();
|
||||
void post_message(int type, const std::string & msg);
|
||||
@@ -129,7 +128,6 @@ void http_co_t::run_cb_and_clear()
|
||||
// Call callback after clearing it because otherwise we may hit reenterability problems
|
||||
if (cb != NULL)
|
||||
cb(&parsed);
|
||||
next_request();
|
||||
}
|
||||
|
||||
void http_co_t::send_request(const std::string & host, const std::string & request,
|
||||
@@ -163,6 +161,17 @@ void http_co_t::send_request(const std::string & host, const std::string & reque
|
||||
this->sent = 0;
|
||||
this->response_callback = response_callback;
|
||||
this->parsed = {};
|
||||
if (request_timeout > 0)
|
||||
{
|
||||
timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
|
||||
{
|
||||
stackin();
|
||||
close_connection();
|
||||
parsed = { .error = "HTTP request timed out" };
|
||||
run_cb_and_clear();
|
||||
stackout();
|
||||
});
|
||||
}
|
||||
if (state == HTTP_CO_KEEPALIVE)
|
||||
{
|
||||
state = HTTP_CO_SENDING_REQUEST;
|
||||
@@ -172,28 +181,6 @@ void http_co_t::send_request(const std::string & host, const std::string & reque
|
||||
{
|
||||
start_connection();
|
||||
}
|
||||
// Do it _after_ state assignment because set_timer() can actually trigger
|
||||
// other timers and requests (reenterability is our friend)
|
||||
if (request_timeout > 0)
|
||||
{
|
||||
timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
|
||||
{
|
||||
stackin();
|
||||
if (state == HTTP_CO_REQUEST_SENT)
|
||||
{
|
||||
// In case of high CPU load, we may not handle etcd responses in time
|
||||
// For this case, first check the socket and only then terminate request with the timeout
|
||||
submit_read(true);
|
||||
}
|
||||
else
|
||||
{
|
||||
close_connection();
|
||||
parsed = { .error = "HTTP request timed out" };
|
||||
run_cb_and_clear();
|
||||
}
|
||||
stackout();
|
||||
});
|
||||
}
|
||||
stackout();
|
||||
}
|
||||
|
||||
@@ -284,19 +271,17 @@ void http_co_t::close_connection()
|
||||
void http_co_t::start_connection()
|
||||
{
|
||||
stackin();
|
||||
struct sockaddr_storage addr;
|
||||
struct sockaddr addr;
|
||||
if (!string_to_addr(host.c_str(), 1, 80, &addr))
|
||||
{
|
||||
close_connection();
|
||||
parsed = { .error = "Invalid address: "+host };
|
||||
run_cb_and_clear();
|
||||
stackout();
|
||||
return;
|
||||
}
|
||||
peer_fd = socket(addr.ss_family, SOCK_STREAM, 0);
|
||||
peer_fd = socket(addr.sa_family, SOCK_STREAM, 0);
|
||||
if (peer_fd < 0)
|
||||
{
|
||||
close_connection();
|
||||
parsed = { .error = std::string("socket: ")+strerror(errno) };
|
||||
run_cb_and_clear();
|
||||
stackout();
|
||||
@@ -338,12 +323,10 @@ void http_co_t::handle_events()
|
||||
epoll_events &= ~EPOLLOUT;
|
||||
if (epoll_events & EPOLLIN)
|
||||
{
|
||||
submit_read(false);
|
||||
submit_read();
|
||||
}
|
||||
else if (epoll_events & (EPOLLRDHUP|EPOLLERR))
|
||||
{
|
||||
if (state == HTTP_CO_HEADERS_RECEIVED)
|
||||
std::swap(parsed.body, response);
|
||||
close_connection();
|
||||
run_cb_and_clear();
|
||||
break;
|
||||
@@ -427,11 +410,10 @@ again:
|
||||
stackout();
|
||||
}
|
||||
|
||||
void http_co_t::submit_read(bool check_timeout)
|
||||
void http_co_t::submit_read()
|
||||
{
|
||||
stackin();
|
||||
int res;
|
||||
again:
|
||||
if (rbuf.size() != READ_BUFFER_SIZE)
|
||||
{
|
||||
rbuf.resize(READ_BUFFER_SIZE);
|
||||
@@ -446,29 +428,12 @@ again:
|
||||
}
|
||||
if (res == -EAGAIN || res == -EINTR)
|
||||
{
|
||||
if (check_timeout)
|
||||
{
|
||||
if (res == -EINTR)
|
||||
goto again;
|
||||
else
|
||||
{
|
||||
// Timeout happened and there is no data to read
|
||||
close_connection();
|
||||
parsed = { .error = "HTTP request timed out" };
|
||||
run_cb_and_clear();
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
epoll_events = epoll_events & ~EPOLLIN;
|
||||
}
|
||||
epoll_events = epoll_events & ~EPOLLIN;
|
||||
}
|
||||
else if (res <= 0)
|
||||
{
|
||||
// < 0 means error, 0 means EOF
|
||||
epoll_events = epoll_events & ~EPOLLIN;
|
||||
if (state == HTTP_CO_HEADERS_RECEIVED)
|
||||
std::swap(parsed.body, response);
|
||||
close_connection();
|
||||
if (res < 0)
|
||||
parsed = { .error = std::string("recvmsg: ")+strerror(-res) };
|
||||
@@ -536,11 +501,8 @@ bool http_co_t::handle_read()
|
||||
if (state == HTTP_CO_HEADERS_RECEIVED && target_response_size > 0 && response.size() >= target_response_size)
|
||||
{
|
||||
std::swap(parsed.body, response);
|
||||
if (!keepalive)
|
||||
close_connection();
|
||||
else
|
||||
state = HTTP_CO_KEEPALIVE;
|
||||
run_cb_and_clear();
|
||||
response_callback(&parsed);
|
||||
parsed.eof = true;
|
||||
}
|
||||
else if (state == HTTP_CO_CHUNKED && response.size() > 0)
|
||||
{
|
||||
@@ -571,14 +533,10 @@ bool http_co_t::handle_read()
|
||||
response_callback(&parsed);
|
||||
parsed.body = "";
|
||||
}
|
||||
else if (parsed.eof)
|
||||
if (parsed.eof && !want_streaming)
|
||||
{
|
||||
// Normal response
|
||||
if (!keepalive)
|
||||
close_connection();
|
||||
else
|
||||
state = HTTP_CO_KEEPALIVE;
|
||||
run_cb_and_clear();
|
||||
response_callback(&parsed);
|
||||
}
|
||||
}
|
||||
else if (state == HTTP_CO_WEBSOCKET && response.size() > 0)
|
||||
@@ -589,20 +547,29 @@ bool http_co_t::handle_read()
|
||||
parsed.body = "";
|
||||
}
|
||||
}
|
||||
if (parsed.eof)
|
||||
{
|
||||
response_callback = NULL;
|
||||
parsed = {};
|
||||
if (!keepalive)
|
||||
{
|
||||
close_connection();
|
||||
}
|
||||
else
|
||||
{
|
||||
state = HTTP_CO_KEEPALIVE;
|
||||
if (keepalive_queue.size() > 0)
|
||||
{
|
||||
auto next = keepalive_queue[0];
|
||||
keepalive_queue.erase(keepalive_queue.begin(), keepalive_queue.begin()+1);
|
||||
next();
|
||||
}
|
||||
}
|
||||
}
|
||||
stackout();
|
||||
return true;
|
||||
}
|
||||
|
||||
void http_co_t::next_request()
|
||||
{
|
||||
if (keepalive_queue.size() > 0)
|
||||
{
|
||||
auto next = keepalive_queue[0];
|
||||
keepalive_queue.erase(keepalive_queue.begin(), keepalive_queue.begin()+1);
|
||||
next();
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t stoull_full(const std::string & str, int base)
|
||||
{
|
||||
if (isspace(str[0]))
|
||||
|
@@ -39,12 +39,6 @@ void osd_messenger_t::init()
|
||||
handle_rdma_events();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#ifndef SO_ZEROCOPY
|
||||
if (log_level > 0)
|
||||
{
|
||||
fprintf(stderr, "Zero-copy TCP send is not supported in this build, ignoring\n");
|
||||
}
|
||||
#endif
|
||||
keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
|
||||
{
|
||||
@@ -168,8 +162,6 @@ void osd_messenger_t::parse_config(const json11::Json & config)
|
||||
this->receive_buffer_size = 65536;
|
||||
this->use_sync_send_recv = config["use_sync_send_recv"].bool_value() ||
|
||||
config["use_sync_send_recv"].uint64_value();
|
||||
this->use_zerocopy_send = config["use_zerocopy_send"].bool_value() ||
|
||||
config["use_zerocopy_send"].uint64_value();
|
||||
this->peer_connect_interval = config["peer_connect_interval"].uint64_value();
|
||||
if (!this->peer_connect_interval)
|
||||
this->peer_connect_interval = 5;
|
||||
@@ -230,13 +222,13 @@ void osd_messenger_t::try_connect_peer(uint64_t peer_osd)
|
||||
void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port)
|
||||
{
|
||||
assert(peer_osd != this->osd_num);
|
||||
struct sockaddr_storage addr;
|
||||
struct sockaddr addr;
|
||||
if (!string_to_addr(peer_host, 0, peer_port, &addr))
|
||||
{
|
||||
on_connect_peer(peer_osd, -EINVAL);
|
||||
return;
|
||||
}
|
||||
int peer_fd = socket(addr.ss_family, SOCK_STREAM, 0);
|
||||
int peer_fd = socket(addr.sa_family, SOCK_STREAM, 0);
|
||||
if (peer_fd < 0)
|
||||
{
|
||||
on_connect_peer(peer_osd, -errno);
|
||||
@@ -296,7 +288,8 @@ void osd_messenger_t::handle_connect_epoll(int peer_fd)
|
||||
on_connect_peer(peer_osd, -result);
|
||||
return;
|
||||
}
|
||||
set_socket_options(cl);
|
||||
int one = 1;
|
||||
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
||||
cl->peer_state = PEER_CONNECTED;
|
||||
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
@@ -306,23 +299,6 @@ void osd_messenger_t::handle_connect_epoll(int peer_fd)
|
||||
check_peer_config(cl);
|
||||
}
|
||||
|
||||
void osd_messenger_t::set_socket_options(osd_client_t *cl)
|
||||
{
|
||||
int one = 1;
|
||||
setsockopt(cl->peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
||||
#ifdef SO_ZEROCOPY
|
||||
if (!use_zerocopy_send)
|
||||
cl->zerocopy_send = false;
|
||||
else if (setsockopt(cl->peer_fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)) != 0)
|
||||
{
|
||||
if (log_level > 0)
|
||||
fprintf(stderr, "[OSD %lu] Failed to enable zero-copy send for client %d: %s\n", this->osd_num, cl->peer_fd, strerror(errno));
|
||||
}
|
||||
else
|
||||
cl->zerocopy_send = true;
|
||||
#endif
|
||||
}
|
||||
|
||||
void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
|
||||
{
|
||||
// Mark client as ready (i.e. some data is available)
|
||||
@@ -508,22 +484,23 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
||||
void osd_messenger_t::accept_connections(int listen_fd)
|
||||
{
|
||||
// Accept new connections
|
||||
sockaddr_storage addr;
|
||||
sockaddr addr;
|
||||
socklen_t peer_addr_size = sizeof(addr);
|
||||
int peer_fd;
|
||||
while ((peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size)) >= 0)
|
||||
while ((peer_fd = accept(listen_fd, &addr, &peer_addr_size)) >= 0)
|
||||
{
|
||||
assert(peer_fd != 0);
|
||||
fprintf(stderr, "[OSD %lu] new client %d: connection from %s\n", this->osd_num, peer_fd,
|
||||
addr_to_string(addr).c_str());
|
||||
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
|
||||
auto cl = clients[peer_fd] = new osd_client_t();
|
||||
cl->peer_addr = addr;
|
||||
cl->peer_port = ntohs(((sockaddr_in*)&addr)->sin_port);
|
||||
cl->peer_fd = peer_fd;
|
||||
cl->peer_state = PEER_CONNECTED;
|
||||
cl->in_buf = malloc_or_die(receive_buffer_size);
|
||||
set_socket_options(cl);
|
||||
int one = 1;
|
||||
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
||||
clients[peer_fd] = new osd_client_t();
|
||||
clients[peer_fd]->peer_addr = addr;
|
||||
clients[peer_fd]->peer_port = ntohs(((sockaddr_in*)&addr)->sin_port);
|
||||
clients[peer_fd]->peer_fd = peer_fd;
|
||||
clients[peer_fd]->peer_state = PEER_CONNECTED;
|
||||
clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
|
||||
// Add FD to epoll
|
||||
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
|
@@ -45,17 +45,11 @@ struct msgr_sendp_t
|
||||
int flags;
|
||||
};
|
||||
|
||||
struct msgr_zc_not_t
|
||||
{
|
||||
osd_op_t *op;
|
||||
uint32_t nsend;
|
||||
};
|
||||
|
||||
struct osd_client_t
|
||||
{
|
||||
int refs = 0;
|
||||
|
||||
sockaddr_storage peer_addr;
|
||||
sockaddr peer_addr;
|
||||
int peer_port;
|
||||
int peer_fd;
|
||||
int peer_state;
|
||||
@@ -63,7 +57,6 @@ struct osd_client_t
|
||||
int ping_time_remaining = 0;
|
||||
int idle_time_remaining = 0;
|
||||
osd_num_t osd_num = 0;
|
||||
bool zerocopy_send = false;
|
||||
|
||||
void *in_buf = NULL;
|
||||
|
||||
@@ -94,12 +87,6 @@ struct osd_client_t
|
||||
int write_state = 0;
|
||||
std::vector<iovec> send_list, next_send_list;
|
||||
std::vector<msgr_sendp_t> outbox, next_outbox;
|
||||
std::vector<msgr_zc_not_t> zerocopy_sent;
|
||||
uint64_t outbox_size = 0, next_outbox_size = 0;
|
||||
uint32_t zerocopy_notification_idx = 0;
|
||||
uint32_t zerocopy_notification_prev = 0;
|
||||
uint8_t zerocopy_notification_buf[256];
|
||||
struct msghdr zerocopy_notification_msg;
|
||||
|
||||
~osd_client_t()
|
||||
{
|
||||
@@ -140,7 +127,6 @@ protected:
|
||||
int osd_ping_timeout = 0;
|
||||
int log_level = 0;
|
||||
bool use_sync_send_recv = false;
|
||||
bool use_zerocopy_send = false;
|
||||
|
||||
#ifdef WITH_RDMA
|
||||
bool use_rdma = true;
|
||||
@@ -195,12 +181,10 @@ protected:
|
||||
void check_peer_config(osd_client_t *cl);
|
||||
void cancel_osd_ops(osd_client_t *cl);
|
||||
void cancel_op(osd_op_t *op);
|
||||
void set_socket_options(osd_client_t *cl);
|
||||
|
||||
bool try_send(osd_client_t *cl);
|
||||
void measure_exec(osd_op_t *cur_op);
|
||||
void handle_send(int result, osd_client_t *cl);
|
||||
void handle_zerocopy_notification(osd_client_t *cl, int res);
|
||||
|
||||
bool handle_read(int result, osd_client_t *cl);
|
||||
bool handle_read_buffer(osd_client_t *cl, void *curbuf, int remain);
|
||||
|
@@ -3,6 +3,7 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/mman.h>
|
||||
#include "msgr_rdma.h"
|
||||
#include "messenger.h"
|
||||
|
||||
@@ -54,6 +55,7 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
|
||||
msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, int log_level)
|
||||
{
|
||||
int res;
|
||||
bool odp = true;
|
||||
ibv_device **dev_list = NULL;
|
||||
msgr_rdma_context_t *ctx = new msgr_rdma_context_t();
|
||||
ctx->mtu = mtu;
|
||||
@@ -117,9 +119,9 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
|
||||
fprintf(stderr, "RDMA device %s must have local LID because it's not Ethernet, but LID is zero\n", ibv_get_device_name(ctx->dev));
|
||||
goto cleanup;
|
||||
}
|
||||
if (ibv_query_gid(ctx->context, ib_port, gid_index, &ctx->my_gid))
|
||||
if ((res = ibv_query_gid(ctx->context, ib_port, gid_index, &ctx->my_gid)) != 0)
|
||||
{
|
||||
fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(ctx->dev), gid_index);
|
||||
fprintf(stderr, "Couldn't read RDMA device %s GID index %d: %s\n", ibv_get_device_name(ctx->dev), gid_index, strerror(res));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
@@ -131,9 +133,9 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
|
||||
}
|
||||
|
||||
{
|
||||
if (ibv_query_device_ex(ctx->context, NULL, &ctx->attrx))
|
||||
if ((res = ibv_query_device_ex(ctx->context, NULL, &ctx->attrx)) != 0)
|
||||
{
|
||||
fprintf(stderr, "Couldn't query RDMA device for its features\n");
|
||||
fprintf(stderr, "Couldn't query RDMA device %s for its features: %s\n", ibv_get_device_name(ctx->dev), strerror(res));
|
||||
goto cleanup;
|
||||
}
|
||||
if (!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
|
||||
@@ -141,15 +143,20 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
|
||||
!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_SEND) ||
|
||||
!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV))
|
||||
{
|
||||
fprintf(stderr, "The RDMA device isn't implicit ODP (On-Demand Paging) capable or does not support RC send and receive with ODP\n");
|
||||
goto cleanup;
|
||||
fprintf(stderr, "Warning: RDMA device isn't implicit ODP (On-Demand Paging) capable, trying to lock all application memory\n");
|
||||
if (mlockall(MCL_CURRENT|MCL_FUTURE|MCL_ONFAULT) != 0)
|
||||
{
|
||||
fprintf(stderr, "mlockall() failed: %s\n", strerror(errno));
|
||||
goto cleanup;
|
||||
}
|
||||
odp = false;
|
||||
}
|
||||
}
|
||||
|
||||
ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
|
||||
ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | (odp ? IBV_ACCESS_ON_DEMAND : 0));
|
||||
if (!ctx->mr)
|
||||
{
|
||||
fprintf(stderr, "Couldn't register RDMA memory region\n");
|
||||
fprintf(stderr, "Couldn't register RDMA memory region: %s\n", strerror(errno));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
|
@@ -6,12 +6,6 @@
|
||||
|
||||
#include "messenger.h"
|
||||
|
||||
#include <linux/errqueue.h>
|
||||
|
||||
#ifndef MSG_ZEROCOPY
|
||||
#define MSG_ZEROCOPY 0
|
||||
#endif
|
||||
|
||||
void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||
{
|
||||
assert(cur_op->peer_fd);
|
||||
@@ -42,7 +36,6 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||
}
|
||||
auto & to_send_list = cl->write_msg.msg_iovlen ? cl->next_send_list : cl->send_list;
|
||||
auto & to_outbox = cl->write_msg.msg_iovlen ? cl->next_outbox : cl->outbox;
|
||||
auto & to_size = cl->write_msg.msg_iovlen ? cl->next_outbox_size : cl->outbox_size;
|
||||
if (cur_op->op_type == OSD_OP_IN)
|
||||
{
|
||||
measure_exec(cur_op);
|
||||
@@ -53,7 +46,6 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||
to_send_list.push_back((iovec){ .iov_base = cur_op->req.buf, .iov_len = OSD_PACKET_SIZE });
|
||||
cl->sent_ops[cur_op->req.hdr.id] = cur_op;
|
||||
}
|
||||
to_size += OSD_PACKET_SIZE;
|
||||
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = MSGR_SENDP_HDR });
|
||||
// Bitmap
|
||||
if (cur_op->op_type == OSD_OP_IN &&
|
||||
@@ -65,7 +57,6 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||
.iov_len = cur_op->reply.sec_rw.attr_len,
|
||||
});
|
||||
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
|
||||
to_size += cur_op->reply.sec_rw.attr_len;
|
||||
}
|
||||
else if (cur_op->op_type == OSD_OP_OUT &&
|
||||
(cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE || cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) &&
|
||||
@@ -76,7 +67,6 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||
.iov_len = cur_op->req.sec_rw.attr_len,
|
||||
});
|
||||
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
|
||||
to_size += cur_op->req.sec_rw.attr_len;
|
||||
}
|
||||
// Operation data
|
||||
if ((cur_op->op_type == OSD_OP_IN
|
||||
@@ -96,21 +86,14 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||
assert(cur_op->iov.buf[i].iov_base);
|
||||
to_send_list.push_back(cur_op->iov.buf[i]);
|
||||
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
|
||||
to_size += cur_op->iov.buf[i].iov_len;
|
||||
}
|
||||
}
|
||||
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
|
||||
{
|
||||
if (cur_op->op_type == OSD_OP_IN && cur_op->reply.hdr.retval > 0)
|
||||
{
|
||||
to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->reply.hdr.retval });
|
||||
to_size += cur_op->reply.hdr.retval;
|
||||
}
|
||||
else if (cur_op->op_type == OSD_OP_OUT && cur_op->req.sec_read_bmp.len > 0)
|
||||
{
|
||||
to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->req.sec_read_bmp.len });
|
||||
to_size += cur_op->req.sec_read_bmp.len;
|
||||
}
|
||||
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
|
||||
}
|
||||
if (cur_op->op_type == OSD_OP_IN)
|
||||
@@ -194,19 +177,17 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
|
||||
}
|
||||
cl->write_msg.msg_iov = cl->send_list.data();
|
||||
cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
|
||||
cl->write_msg.msg_flags = (cl->zerocopy_send && (cl->outbox_size/cl->send_list.size()) >= 4096 ? MSG_ZEROCOPY : 0);
|
||||
cl->refs++;
|
||||
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||
data->callback = [this, cl](ring_data_t *data) { handle_send(data->res, cl); };
|
||||
my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, cl->write_msg.msg_flags);
|
||||
my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
cl->write_msg.msg_iov = cl->send_list.data();
|
||||
cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
|
||||
cl->write_msg.msg_flags = (cl->zerocopy_send && (cl->outbox_size/cl->send_list.size()) >= 4096 ? MSG_ZEROCOPY : 0);
|
||||
cl->refs++;
|
||||
int result = sendmsg(peer_fd, &cl->write_msg, MSG_NOSIGNAL | cl->write_msg.msg_flags);
|
||||
int result = sendmsg(peer_fd, &cl->write_msg, MSG_NOSIGNAL);
|
||||
if (result < 0)
|
||||
{
|
||||
result = -errno;
|
||||
@@ -216,62 +197,6 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
|
||||
return true;
|
||||
}
|
||||
|
||||
void osd_messenger_t::handle_zerocopy_notification(osd_client_t *cl, int res)
|
||||
{
|
||||
cl->refs--;
|
||||
if (cl->peer_state == PEER_STOPPED)
|
||||
{
|
||||
if (cl->refs <= 0)
|
||||
{
|
||||
delete cl;
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (res != 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
if (cl->zerocopy_notification_msg.msg_flags & MSG_CTRUNC)
|
||||
{
|
||||
fprintf(stderr, "zero-copy send notification truncated on client socket %d\n", cl->peer_fd);
|
||||
return;
|
||||
}
|
||||
for (struct cmsghdr *cm = CMSG_FIRSTHDR(&cl->zerocopy_notification_msg); cm; cm = CMSG_NXTHDR(&cl->zerocopy_notification_msg, cm))
|
||||
{
|
||||
if (cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR)
|
||||
{
|
||||
struct sock_extended_err *serr = (struct sock_extended_err*)CMSG_DATA(cm);
|
||||
if (serr->ee_errno == 0 && serr->ee_origin == SO_EE_ORIGIN_ZEROCOPY)
|
||||
{
|
||||
// completed sends numbered serr->ee_info .. serr->ee_data
|
||||
int start = 0;
|
||||
while (start < cl->zerocopy_sent.size() && cl->zerocopy_sent[start].nsend < serr->ee_info)
|
||||
start++;
|
||||
int end = start;
|
||||
if (serr->ee_data < serr->ee_info)
|
||||
{
|
||||
// counter has wrapped around
|
||||
while (end < cl->zerocopy_sent.size() && cl->zerocopy_sent[end].nsend >= cl->zerocopy_sent[start].nsend)
|
||||
end++;
|
||||
}
|
||||
while (end < cl->zerocopy_sent.size() && cl->zerocopy_sent[end].nsend <= serr->ee_data)
|
||||
end++;
|
||||
if (end > start)
|
||||
{
|
||||
for (int i = start; i < end; i++)
|
||||
{
|
||||
delete cl->zerocopy_sent[i].op;
|
||||
}
|
||||
cl->zerocopy_sent.erase(
|
||||
cl->zerocopy_sent.begin() + start,
|
||||
cl->zerocopy_sent.begin() + end
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void osd_messenger_t::send_replies()
|
||||
{
|
||||
for (int i = 0; i < write_ready_clients.size(); i++)
|
||||
@@ -299,19 +224,16 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (result < 0 && result != -EAGAIN && result != -EINTR && result != -ENOBUFS)
|
||||
if (result < 0 && result != -EAGAIN && result != -EINTR)
|
||||
{
|
||||
// this is a client socket, so don't panic. just disconnect it
|
||||
fprintf(stderr, "Client %d socket write error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
|
||||
stop_client(cl->peer_fd);
|
||||
return;
|
||||
}
|
||||
bool used_zerocopy = false;
|
||||
if (result >= 0)
|
||||
{
|
||||
used_zerocopy = (cl->write_msg.msg_flags & MSG_ZEROCOPY) ? true : false;
|
||||
int done = 0;
|
||||
int bytes_written = result;
|
||||
while (result > 0 && done < cl->send_list.size())
|
||||
{
|
||||
iovec & iov = cl->send_list[done];
|
||||
@@ -320,19 +242,7 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
||||
if (cl->outbox[done].flags & MSGR_SENDP_FREE)
|
||||
{
|
||||
// Reply fully sent
|
||||
if (!used_zerocopy)
|
||||
{
|
||||
delete cl->outbox[done].op;
|
||||
}
|
||||
else
|
||||
{
|
||||
// With zero-copy send the difference is that we must keep the buffer (i.e. the operation)
|
||||
// allocated until we get send notification from MSG_ERRQUEUE
|
||||
cl->zerocopy_sent.push_back((msgr_zc_not_t){
|
||||
.op = cl->outbox[done].op,
|
||||
.nsend = cl->zerocopy_notification_idx,
|
||||
});
|
||||
}
|
||||
delete cl->outbox[done].op;
|
||||
}
|
||||
result -= iov.iov_len;
|
||||
done++;
|
||||
@@ -344,11 +254,6 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (used_zerocopy)
|
||||
{
|
||||
cl->zerocopy_notification_idx++;
|
||||
}
|
||||
cl->outbox_size -= bytes_written;
|
||||
if (done > 0)
|
||||
{
|
||||
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+done);
|
||||
@@ -358,10 +263,8 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
||||
{
|
||||
cl->send_list.insert(cl->send_list.end(), cl->next_send_list.begin(), cl->next_send_list.end());
|
||||
cl->outbox.insert(cl->outbox.end(), cl->next_outbox.begin(), cl->next_outbox.end());
|
||||
cl->outbox_size += cl->next_outbox_size;
|
||||
cl->next_send_list.clear();
|
||||
cl->next_outbox.clear();
|
||||
cl->next_outbox_size = 0;
|
||||
}
|
||||
cl->write_state = cl->outbox.size() > 0 ? CL_WRITE_READY : 0;
|
||||
#ifdef WITH_RDMA
|
||||
@@ -384,34 +287,4 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
||||
{
|
||||
write_ready_clients.push_back(cl->peer_fd);
|
||||
}
|
||||
if (used_zerocopy && (cl->zerocopy_notification_idx-cl->zerocopy_notification_prev) >= 16 &&
|
||||
cl->zerocopy_sent.size() > 0)
|
||||
{
|
||||
cl->zerocopy_notification_prev = cl->zerocopy_notification_idx;
|
||||
cl->zerocopy_notification_msg = {
|
||||
.msg_control = cl->zerocopy_notification_buf,
|
||||
.msg_controllen = sizeof(cl->zerocopy_notification_buf),
|
||||
};
|
||||
cl->refs++;
|
||||
io_uring_sqe* sqe = NULL;
|
||||
if (ringloop && !use_sync_send_recv)
|
||||
{
|
||||
sqe = ringloop->get_sqe();
|
||||
}
|
||||
if (!sqe)
|
||||
{
|
||||
int res = recvmsg(cl->peer_fd, &cl->zerocopy_notification_msg, MSG_ERRQUEUE|MSG_DONTWAIT);
|
||||
if (res < 0)
|
||||
{
|
||||
res = -errno;
|
||||
}
|
||||
handle_zerocopy_notification(cl, res);
|
||||
}
|
||||
else
|
||||
{
|
||||
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||
data->callback = [this, cl](ring_data_t *data) { handle_zerocopy_notification(cl, data->res); };
|
||||
my_uring_prep_recvmsg(sqe, cl->peer_fd, &cl->zerocopy_notification_msg, MSG_ERRQUEUE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -54,8 +54,6 @@ protected:
|
||||
msghdr read_msg = { 0 }, send_msg = { 0 };
|
||||
iovec read_iov = { 0 };
|
||||
|
||||
std::string logfile = "/dev/null";
|
||||
|
||||
public:
|
||||
~nbd_proxy()
|
||||
{
|
||||
@@ -189,7 +187,7 @@ public:
|
||||
uint64_t pool = cfg["pool"].uint64_value();
|
||||
if (pool)
|
||||
{
|
||||
inode = (inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (pool << (64-POOL_ID_BITS));
|
||||
inode = (inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (pool << (64-POOL_ID_BITS));
|
||||
}
|
||||
if (!(inode >> (64-POOL_ID_BITS)))
|
||||
{
|
||||
@@ -280,10 +278,6 @@ public:
|
||||
}
|
||||
}
|
||||
}
|
||||
if (cfg["logfile"].is_string())
|
||||
{
|
||||
logfile = cfg["logfile"].string_value();
|
||||
}
|
||||
if (bg)
|
||||
{
|
||||
daemonize();
|
||||
@@ -369,14 +363,13 @@ public:
|
||||
setsid();
|
||||
if (fork())
|
||||
exit(0);
|
||||
chdir("/");
|
||||
close(0);
|
||||
close(1);
|
||||
close(2);
|
||||
open("/dev/null", O_RDONLY);
|
||||
open(logfile.c_str(), O_WRONLY|O_APPEND|O_CREAT, 0666);
|
||||
open(logfile.c_str(), O_WRONLY|O_APPEND|O_CREAT, 0666);
|
||||
if (chdir("/") != 0)
|
||||
fprintf(stderr, "Warning: Failed to chdir into /\n");
|
||||
open("/dev/null", O_WRONLY);
|
||||
open("/dev/null", O_WRONLY);
|
||||
}
|
||||
|
||||
json11::Json::object list_mapped()
|
||||
@@ -532,11 +525,7 @@ protected:
|
||||
{
|
||||
goto end_unmap;
|
||||
}
|
||||
r = write(qd_fd, "32768", 5);
|
||||
if (r != 5)
|
||||
{
|
||||
fprintf(stderr, "Warning: Failed to configure max_sectors_kb\n");
|
||||
}
|
||||
write(qd_fd, "32768", 5);
|
||||
close(qd_fd);
|
||||
if (!fork())
|
||||
{
|
||||
@@ -690,7 +679,6 @@ protected:
|
||||
{
|
||||
assert(result <= cur_left);
|
||||
cur_left -= result;
|
||||
cur_buf = (uint8_t*)cur_buf + result;
|
||||
result = 0;
|
||||
}
|
||||
if (cur_left <= 0)
|
||||
@@ -705,12 +693,6 @@ protected:
|
||||
if (read_state == CL_READ_HDR)
|
||||
{
|
||||
int req_type = be32toh(cur_req.type);
|
||||
if (be32toh(cur_req.magic) == NBD_REQUEST_MAGIC && req_type == NBD_CMD_DISC)
|
||||
{
|
||||
// Disconnect
|
||||
close(nbd_fd);
|
||||
exit(0);
|
||||
}
|
||||
if (be32toh(cur_req.magic) != NBD_REQUEST_MAGIC ||
|
||||
req_type != NBD_CMD_READ && req_type != NBD_CMD_WRITE && req_type != NBD_CMD_FLUSH)
|
||||
{
|
||||
|
1690
src/nfs/nfs.h
1690
src/nfs/nfs.h
File diff suppressed because it is too large
Load Diff
1380
src/nfs/nfs.x
1380
src/nfs/nfs.x
File diff suppressed because it is too large
Load Diff
2954
src/nfs/nfs_xdr.cpp
2954
src/nfs/nfs_xdr.cpp
File diff suppressed because it is too large
Load Diff
@@ -1,190 +0,0 @@
|
||||
/*
|
||||
* Please do not edit this file.
|
||||
* It was generated using rpcgen.
|
||||
*/
|
||||
|
||||
#ifndef _PORTMAP_H_RPCGEN
|
||||
#define _PORTMAP_H_RPCGEN
|
||||
|
||||
#include "xdr_impl.h"
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define PMAP_PORT 111
|
||||
|
||||
struct pmap2_mapping {
|
||||
u_int prog;
|
||||
u_int vers;
|
||||
u_int prot;
|
||||
u_int port;
|
||||
};
|
||||
typedef struct pmap2_mapping pmap2_mapping;
|
||||
|
||||
struct pmap2_call_args {
|
||||
u_int prog;
|
||||
u_int vers;
|
||||
u_int proc;
|
||||
xdr_string_t args;
|
||||
};
|
||||
typedef struct pmap2_call_args pmap2_call_args;
|
||||
|
||||
struct pmap2_call_result {
|
||||
u_int port;
|
||||
xdr_string_t res;
|
||||
};
|
||||
typedef struct pmap2_call_result pmap2_call_result;
|
||||
|
||||
struct pmap2_mapping_list {
|
||||
pmap2_mapping map;
|
||||
struct pmap2_mapping_list *next;
|
||||
};
|
||||
typedef struct pmap2_mapping_list pmap2_mapping_list;
|
||||
|
||||
struct pmap2_dump_result {
|
||||
struct pmap2_mapping_list *list;
|
||||
};
|
||||
typedef struct pmap2_dump_result pmap2_dump_result;
|
||||
|
||||
struct pmap3_string_result {
|
||||
xdr_string_t addr;
|
||||
};
|
||||
typedef struct pmap3_string_result pmap3_string_result;
|
||||
|
||||
struct pmap3_mapping {
|
||||
u_int prog;
|
||||
u_int vers;
|
||||
xdr_string_t netid;
|
||||
xdr_string_t addr;
|
||||
xdr_string_t owner;
|
||||
};
|
||||
typedef struct pmap3_mapping pmap3_mapping;
|
||||
|
||||
struct pmap3_mapping_list {
|
||||
pmap3_mapping map;
|
||||
struct pmap3_mapping_list *next;
|
||||
};
|
||||
typedef struct pmap3_mapping_list pmap3_mapping_list;
|
||||
|
||||
struct pmap3_dump_result {
|
||||
struct pmap3_mapping_list *list;
|
||||
};
|
||||
typedef struct pmap3_dump_result pmap3_dump_result;

/* PMAP v3 CALLIT request: target (prog, vers, proc) plus the opaque,
 * already-XDR-encoded argument bytes. */
struct pmap3_call_args {
	u_int prog;
	u_int vers;
	u_int proc;
	xdr_string_t args;
};
typedef struct pmap3_call_args pmap3_call_args;

/* PMAP v3 CALLIT reply: port of the called service plus the opaque,
 * XDR-encoded result bytes. */
struct pmap3_call_result {
	u_int port;
	xdr_string_t res;
};
typedef struct pmap3_call_result pmap3_call_result;

/* Transport address in netbuf form. Per the .x source, `buf` carries a
 * raw sockaddr-like blob — NOTE(review): endianness of its contents is
 * the caller's concern, verify at the use site. */
struct pmap3_netbuf {
	u_int maxlen;
	xdr_string_t buf;
};
typedef struct pmap3_netbuf pmap3_netbuf;

/* Per-procedure argument/result type aliases (one pair per portmapper
 * procedure declared below). */
typedef pmap2_mapping PMAP2SETargs;

typedef pmap2_mapping PMAP2UNSETargs;

typedef pmap2_mapping PMAP2GETPORTargs;

typedef pmap2_call_args PMAP2CALLITargs;

typedef pmap2_call_result PMAP2CALLITres;

typedef pmap2_dump_result PMAP2DUMPres;

typedef pmap3_mapping PMAP3SETargs;

typedef pmap3_mapping PMAP3UNSETargs;

typedef pmap3_mapping PMAP3GETADDRargs;

typedef pmap3_string_result PMAP3GETADDRres;

typedef pmap3_dump_result PMAP3DUMPres;

/* NOTE(review): CALLIT *args* aliased to pmap3_call_result rather than
 * pmap3_call_args — this mirrors portmap.x in this tree, but differs
 * from upstream libnfs; confirm before relying on PMAP3_CALLIT. */
typedef pmap3_call_result PMAP3CALLITargs;

typedef pmap3_call_result PMAP3CALLITres;

typedef pmap3_netbuf PMAP3UADDR2TADDRres;

typedef pmap3_netbuf PMAP3TADDR2UADDRargs;

typedef pmap3_string_result PMAP3TADDR2UADDRres;

/* Program number shared by both portmapper protocol versions. */
#define PMAP_PROGRAM 100000
#define PMAP_V2 2

/* PMAP v2 procedure numbers. */
#define PMAP2_NULL 0
#define PMAP2_SET 1
#define PMAP2_UNSET 2
#define PMAP2_GETPORT 3
#define PMAP2_DUMP 4
#define PMAP2_CALLIT 5

#define PMAP_V3 3

/* PMAP v3 (rpcbind) procedure numbers. */
#define PMAP3_NULL 0
#define PMAP3_SET 1
#define PMAP3_UNSET 2
#define PMAP3_GETADDR 3
#define PMAP3_DUMP 4
#define PMAP3_CALLIT 5
#define PMAP3_GETTIME 6
#define PMAP3_UADDR2TADDR 7
#define PMAP3_TADDR2UADDR 8

/* the xdr functions */

extern bool_t xdr_pmap2_mapping (XDR *, pmap2_mapping*);
extern bool_t xdr_pmap2_call_args (XDR *, pmap2_call_args*);
extern bool_t xdr_pmap2_call_result (XDR *, pmap2_call_result*);
extern bool_t xdr_pmap2_mapping_list (XDR *, pmap2_mapping_list*);
extern bool_t xdr_pmap2_dump_result (XDR *, pmap2_dump_result*);
extern bool_t xdr_pmap3_string_result (XDR *, pmap3_string_result*);
extern bool_t xdr_pmap3_mapping (XDR *, pmap3_mapping*);
extern bool_t xdr_pmap3_mapping_list (XDR *, pmap3_mapping_list*);
extern bool_t xdr_pmap3_dump_result (XDR *, pmap3_dump_result*);
extern bool_t xdr_pmap3_call_args (XDR *, pmap3_call_args*);
extern bool_t xdr_pmap3_call_result (XDR *, pmap3_call_result*);
extern bool_t xdr_pmap3_netbuf (XDR *, pmap3_netbuf*);
extern bool_t xdr_PMAP2SETargs (XDR *, PMAP2SETargs*);
extern bool_t xdr_PMAP2UNSETargs (XDR *, PMAP2UNSETargs*);
extern bool_t xdr_PMAP2GETPORTargs (XDR *, PMAP2GETPORTargs*);
extern bool_t xdr_PMAP2CALLITargs (XDR *, PMAP2CALLITargs*);
extern bool_t xdr_PMAP2CALLITres (XDR *, PMAP2CALLITres*);
extern bool_t xdr_PMAP2DUMPres (XDR *, PMAP2DUMPres*);
extern bool_t xdr_PMAP3SETargs (XDR *, PMAP3SETargs*);
extern bool_t xdr_PMAP3UNSETargs (XDR *, PMAP3UNSETargs*);
extern bool_t xdr_PMAP3GETADDRargs (XDR *, PMAP3GETADDRargs*);
extern bool_t xdr_PMAP3GETADDRres (XDR *, PMAP3GETADDRres*);
extern bool_t xdr_PMAP3DUMPres (XDR *, PMAP3DUMPres*);
extern bool_t xdr_PMAP3CALLITargs (XDR *, PMAP3CALLITargs*);
extern bool_t xdr_PMAP3CALLITres (XDR *, PMAP3CALLITres*);
extern bool_t xdr_PMAP3UADDR2TADDRres (XDR *, PMAP3UADDR2TADDRres*);
extern bool_t xdr_PMAP3TADDR2UADDRargs (XDR *, PMAP3TADDR2UADDRargs*);
extern bool_t xdr_PMAP3TADDR2UADDRres (XDR *, PMAP3TADDR2UADDRres*);

#ifdef __cplusplus
}
#endif

#endif /* !_PORTMAP_H_RPCGEN */
|
@@ -1,168 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2014, Ronnie Sahlberg
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
The views and conclusions contained in the software and documentation are those
|
||||
of the authors and should not be interpreted as representing official policies,
|
||||
either expressed or implied, of the FreeBSD Project.
|
||||
*/
|
||||
|
||||
const PMAP_PORT = 111; /* portmapper port number */

/* PMAP v2: a single (program, version, protocol) -> port registration. */
struct pmap2_mapping {
	unsigned int prog;
	unsigned int vers;
	unsigned int prot;
	unsigned int port;
};

/* PMAP v2 CALLIT request: target triple + pre-encoded argument bytes. */
struct pmap2_call_args {
	unsigned int prog;
	unsigned int vers;
	unsigned int proc;
	opaque args<>;
};

/* PMAP v2 CALLIT reply: service port + pre-encoded result bytes. */
struct pmap2_call_result {
	unsigned int port;
	opaque res<>;
};

/* Singly linked list node returned by PMAP2_DUMP. */
struct pmap2_mapping_list {
	pmap2_mapping map;
	pmap2_mapping_list *next;
};

struct pmap2_dump_result {
	struct pmap2_mapping_list *list;
};

struct pmap3_string_result {
	string addr<>;
};

/* PMAP v3 (rpcbind): registrations are keyed by netid/universal address
 * strings instead of protocol/port numbers. */
struct pmap3_mapping {
	unsigned int prog;
	unsigned int vers;
	string netid<>;
	string addr<>;
	string owner<>;
};

struct pmap3_mapping_list {
	pmap3_mapping map;
	pmap3_mapping_list *next;
};

struct pmap3_dump_result {
	struct pmap3_mapping_list *list;
};

struct pmap3_call_args {
	unsigned int prog;
	unsigned int vers;
	unsigned int proc;
	opaque args<>;
};

struct pmap3_call_result {
	unsigned int port;
	opaque res<>;
};

struct pmap3_netbuf {
	unsigned int maxlen;
	/* This pretty much contains a sockaddr_storage.
	 * Beware differences in endianess for ss_family
	 * and whether or not ss_len exists.
	 */
	opaque buf<>;
};

/* Per-procedure argument/result aliases. */
typedef pmap2_mapping PMAP2SETargs;
typedef pmap2_mapping PMAP2UNSETargs;
typedef pmap2_mapping PMAP2GETPORTargs;
typedef pmap2_call_args PMAP2CALLITargs;
typedef pmap2_call_result PMAP2CALLITres;
typedef pmap2_dump_result PMAP2DUMPres;

typedef pmap3_mapping PMAP3SETargs;
typedef pmap3_mapping PMAP3UNSETargs;
typedef pmap3_mapping PMAP3GETADDRargs;
typedef pmap3_string_result PMAP3GETADDRres;
typedef pmap3_dump_result PMAP3DUMPres;
/* NOTE(review): CALLIT args aliased to pmap3_call_result here (upstream
 * libnfs uses pmap3_call_args) — confirm before using PMAP3_CALLIT. */
typedef pmap3_call_result PMAP3CALLITargs;
typedef pmap3_call_result PMAP3CALLITres;
typedef pmap3_netbuf PMAP3UADDR2TADDRres;
typedef pmap3_netbuf PMAP3TADDR2UADDRargs;
typedef pmap3_string_result PMAP3TADDR2UADDRres;

/* Procedure table for both protocol versions (program 100000). */
program PMAP_PROGRAM {
	version PMAP_V2 {
		void
		PMAP2_NULL(void) = 0;

		uint32_t
		PMAP2_SET(PMAP2SETargs) = 1;

		uint32_t
		PMAP2_UNSET(PMAP2UNSETargs) = 2;

		uint32_t
		PMAP2_GETPORT(PMAP2GETPORTargs) = 3;

		PMAP2DUMPres
		PMAP2_DUMP(void) = 4;

		PMAP2CALLITres
		PMAP2_CALLIT(PMAP2CALLITargs) = 5;
	} = 2;
	version PMAP_V3 {
		void
		PMAP3_NULL(void) = 0;

		uint32_t
		PMAP3_SET(PMAP3SETargs) = 1;

		uint32_t
		PMAP3_UNSET(PMAP3UNSETargs) = 2;

		PMAP3GETADDRres
		PMAP3_GETADDR(PMAP3GETADDRargs) = 3;

		PMAP3DUMPres
		PMAP3_DUMP(void) = 4;

		PMAP3CALLITres
		PMAP3_CALLIT(PMAP3CALLITargs) = 5;

		uint32_t
		PMAP3_GETTIME(void) = 6;

		PMAP3UADDR2TADDRres
		PMAP3_UADDR2TADDR(string) = 7;

		PMAP3TADDR2UADDRres
		PMAP3_TADDR2UADDR(PMAP3TADDR2UADDRargs) = 8;
	} = 3;
} = 100000;
|
@@ -1,406 +0,0 @@
|
||||
/*
|
||||
* Please do not edit this file.
|
||||
* It was generated using rpcgen.
|
||||
*/
|
||||
|
||||
#include "portmap.h"
|
||||
#include "xdr_impl_inline.h"
|
||||
|
||||
bool_t
|
||||
xdr_pmap2_mapping (XDR *xdrs, pmap2_mapping *objp)
|
||||
{
|
||||
|
||||
|
||||
if (xdrs->x_op == XDR_ENCODE) {
|
||||
if (1) {
|
||||
if (!xdr_u_int (xdrs, &objp->prog))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->vers))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->prot))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->port))
|
||||
return FALSE;
|
||||
} else {
|
||||
IXDR_PUT_U_LONG(buf, objp->prog);
|
||||
IXDR_PUT_U_LONG(buf, objp->vers);
|
||||
IXDR_PUT_U_LONG(buf, objp->prot);
|
||||
IXDR_PUT_U_LONG(buf, objp->port);
|
||||
}
|
||||
return TRUE;
|
||||
} else if (xdrs->x_op == XDR_DECODE) {
|
||||
if (1) {
|
||||
if (!xdr_u_int (xdrs, &objp->prog))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->vers))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->prot))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->port))
|
||||
return FALSE;
|
||||
} else {
|
||||
objp->prog = IXDR_GET_U_LONG(buf);
|
||||
objp->vers = IXDR_GET_U_LONG(buf);
|
||||
objp->prot = IXDR_GET_U_LONG(buf);
|
||||
objp->port = IXDR_GET_U_LONG(buf);
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
if (!xdr_u_int (xdrs, &objp->prog))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->vers))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->prot))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->port))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_pmap2_call_args (XDR *xdrs, pmap2_call_args *objp)
|
||||
{
|
||||
|
||||
|
||||
if (xdrs->x_op == XDR_ENCODE) {
|
||||
if (1) {
|
||||
if (!xdr_u_int (xdrs, &objp->prog))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->vers))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->proc))
|
||||
return FALSE;
|
||||
|
||||
} else {
|
||||
IXDR_PUT_U_LONG(buf, objp->prog);
|
||||
IXDR_PUT_U_LONG(buf, objp->vers);
|
||||
IXDR_PUT_U_LONG(buf, objp->proc);
|
||||
}
|
||||
if (!xdr_bytes(xdrs, &objp->args, ~0))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
} else if (xdrs->x_op == XDR_DECODE) {
|
||||
if (1) {
|
||||
if (!xdr_u_int (xdrs, &objp->prog))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->vers))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->proc))
|
||||
return FALSE;
|
||||
|
||||
} else {
|
||||
objp->prog = IXDR_GET_U_LONG(buf);
|
||||
objp->vers = IXDR_GET_U_LONG(buf);
|
||||
objp->proc = IXDR_GET_U_LONG(buf);
|
||||
}
|
||||
if (!xdr_bytes(xdrs, &objp->args, ~0))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
if (!xdr_u_int (xdrs, &objp->prog))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->vers))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->proc))
|
||||
return FALSE;
|
||||
if (!xdr_bytes(xdrs, &objp->args, ~0))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_pmap2_call_result (XDR *xdrs, pmap2_call_result *objp)
|
||||
{
|
||||
|
||||
if (!xdr_u_int (xdrs, &objp->port))
|
||||
return FALSE;
|
||||
if (!xdr_bytes(xdrs, &objp->res, ~0))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_pmap2_mapping_list (XDR *xdrs, pmap2_mapping_list *objp)
|
||||
{
|
||||
|
||||
if (!xdr_pmap2_mapping (xdrs, &objp->map))
|
||||
return FALSE;
|
||||
if (!xdr_pointer (xdrs, (char **)&objp->next, sizeof (pmap2_mapping_list), (xdrproc_t) xdr_pmap2_mapping_list))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_pmap2_dump_result (XDR *xdrs, pmap2_dump_result *objp)
|
||||
{
|
||||
|
||||
if (!xdr_pointer (xdrs, (char **)&objp->list, sizeof (pmap2_mapping_list), (xdrproc_t) xdr_pmap2_mapping_list))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_pmap3_string_result (XDR *xdrs, pmap3_string_result *objp)
|
||||
{
|
||||
|
||||
if (!xdr_string (xdrs, &objp->addr, ~0))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_pmap3_mapping (XDR *xdrs, pmap3_mapping *objp)
|
||||
{
|
||||
|
||||
if (!xdr_u_int (xdrs, &objp->prog))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->vers))
|
||||
return FALSE;
|
||||
if (!xdr_string (xdrs, &objp->netid, ~0))
|
||||
return FALSE;
|
||||
if (!xdr_string (xdrs, &objp->addr, ~0))
|
||||
return FALSE;
|
||||
if (!xdr_string (xdrs, &objp->owner, ~0))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_pmap3_mapping_list (XDR *xdrs, pmap3_mapping_list *objp)
|
||||
{
|
||||
|
||||
if (!xdr_pmap3_mapping (xdrs, &objp->map))
|
||||
return FALSE;
|
||||
if (!xdr_pointer (xdrs, (char **)&objp->next, sizeof (pmap3_mapping_list), (xdrproc_t) xdr_pmap3_mapping_list))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_pmap3_dump_result (XDR *xdrs, pmap3_dump_result *objp)
|
||||
{
|
||||
|
||||
if (!xdr_pointer (xdrs, (char **)&objp->list, sizeof (pmap3_mapping_list), (xdrproc_t) xdr_pmap3_mapping_list))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_pmap3_call_args (XDR *xdrs, pmap3_call_args *objp)
|
||||
{
|
||||
|
||||
|
||||
if (xdrs->x_op == XDR_ENCODE) {
|
||||
if (1) {
|
||||
if (!xdr_u_int (xdrs, &objp->prog))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->vers))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->proc))
|
||||
return FALSE;
|
||||
|
||||
} else {
|
||||
IXDR_PUT_U_LONG(buf, objp->prog);
|
||||
IXDR_PUT_U_LONG(buf, objp->vers);
|
||||
IXDR_PUT_U_LONG(buf, objp->proc);
|
||||
}
|
||||
if (!xdr_bytes(xdrs, &objp->args, ~0))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
} else if (xdrs->x_op == XDR_DECODE) {
|
||||
if (1) {
|
||||
if (!xdr_u_int (xdrs, &objp->prog))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->vers))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->proc))
|
||||
return FALSE;
|
||||
|
||||
} else {
|
||||
objp->prog = IXDR_GET_U_LONG(buf);
|
||||
objp->vers = IXDR_GET_U_LONG(buf);
|
||||
objp->proc = IXDR_GET_U_LONG(buf);
|
||||
}
|
||||
if (!xdr_bytes(xdrs, &objp->args, ~0))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
if (!xdr_u_int (xdrs, &objp->prog))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->vers))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->proc))
|
||||
return FALSE;
|
||||
if (!xdr_bytes(xdrs, &objp->args, ~0))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_pmap3_call_result (XDR *xdrs, pmap3_call_result *objp)
|
||||
{
|
||||
|
||||
if (!xdr_u_int (xdrs, &objp->port))
|
||||
return FALSE;
|
||||
if (!xdr_bytes(xdrs, &objp->res, ~0))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_pmap3_netbuf (XDR *xdrs, pmap3_netbuf *objp)
|
||||
{
|
||||
|
||||
if (!xdr_u_int (xdrs, &objp->maxlen))
|
||||
return FALSE;
|
||||
if (!xdr_bytes(xdrs, &objp->buf, ~0))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_PMAP2SETargs (XDR *xdrs, PMAP2SETargs *objp)
|
||||
{
|
||||
|
||||
if (!xdr_pmap2_mapping (xdrs, objp))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_PMAP2UNSETargs (XDR *xdrs, PMAP2UNSETargs *objp)
|
||||
{
|
||||
|
||||
if (!xdr_pmap2_mapping (xdrs, objp))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_PMAP2GETPORTargs (XDR *xdrs, PMAP2GETPORTargs *objp)
|
||||
{
|
||||
|
||||
if (!xdr_pmap2_mapping (xdrs, objp))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_PMAP2CALLITargs (XDR *xdrs, PMAP2CALLITargs *objp)
|
||||
{
|
||||
|
||||
if (!xdr_pmap2_call_args (xdrs, objp))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_PMAP2CALLITres (XDR *xdrs, PMAP2CALLITres *objp)
|
||||
{
|
||||
|
||||
if (!xdr_pmap2_call_result (xdrs, objp))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_PMAP2DUMPres (XDR *xdrs, PMAP2DUMPres *objp)
|
||||
{
|
||||
|
||||
if (!xdr_pmap2_dump_result (xdrs, objp))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_PMAP3SETargs (XDR *xdrs, PMAP3SETargs *objp)
|
||||
{
|
||||
|
||||
if (!xdr_pmap3_mapping (xdrs, objp))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_PMAP3UNSETargs (XDR *xdrs, PMAP3UNSETargs *objp)
|
||||
{
|
||||
|
||||
if (!xdr_pmap3_mapping (xdrs, objp))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_PMAP3GETADDRargs (XDR *xdrs, PMAP3GETADDRargs *objp)
|
||||
{
|
||||
|
||||
if (!xdr_pmap3_mapping (xdrs, objp))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_PMAP3GETADDRres (XDR *xdrs, PMAP3GETADDRres *objp)
|
||||
{
|
||||
|
||||
if (!xdr_pmap3_string_result (xdrs, objp))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_PMAP3DUMPres (XDR *xdrs, PMAP3DUMPres *objp)
|
||||
{
|
||||
|
||||
if (!xdr_pmap3_dump_result (xdrs, objp))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_PMAP3CALLITargs (XDR *xdrs, PMAP3CALLITargs *objp)
|
||||
{
|
||||
|
||||
if (!xdr_pmap3_call_result (xdrs, objp))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_PMAP3CALLITres (XDR *xdrs, PMAP3CALLITres *objp)
|
||||
{
|
||||
|
||||
if (!xdr_pmap3_call_result (xdrs, objp))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_PMAP3UADDR2TADDRres (XDR *xdrs, PMAP3UADDR2TADDRres *objp)
|
||||
{
|
||||
|
||||
if (!xdr_pmap3_netbuf (xdrs, objp))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_PMAP3TADDR2UADDRargs (XDR *xdrs, PMAP3TADDR2UADDRargs *objp)
|
||||
{
|
||||
|
||||
if (!xdr_pmap3_netbuf (xdrs, objp))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_PMAP3TADDR2UADDRres (XDR *xdrs, PMAP3TADDR2UADDRres *objp)
|
||||
{
|
||||
|
||||
if (!xdr_pmap3_string_result (xdrs, objp))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
160
src/nfs/rpc.h
160
src/nfs/rpc.h
@@ -1,160 +0,0 @@
|
||||
/*
 * Please do not edit this file.
 * It was generated using rpcgen.
 */

/* ONC RPC message types per RFC 5531, post-processed for this project
 * (xdr_string_t replaces the raw len/val opaque pairs). */

#ifndef _RPC_H_RPCGEN
#define _RPC_H_RPCGEN

#include "xdr_impl.h"

#ifdef __cplusplus
extern "C" {
#endif

#define RPC_MSG_VERSION 2

/* Authentication flavors (RFC 5531 section 8.2). */
enum rpc_auth_flavor {
	RPC_AUTH_NONE = 0,
	RPC_AUTH_SYS = 1,
	RPC_AUTH_SHORT = 2,
	RPC_AUTH_DH = 3,
	RPC_RPCSEC_GSS = 6,
};
typedef enum rpc_auth_flavor rpc_auth_flavor;

/* Message direction: request vs. response. */
enum rpc_msg_type {
	RPC_CALL = 0,
	RPC_REPLY = 1,
};
typedef enum rpc_msg_type rpc_msg_type;

enum rpc_reply_stat {
	RPC_MSG_ACCEPTED = 0,
	RPC_MSG_DENIED = 1,
};
typedef enum rpc_reply_stat rpc_reply_stat;

/* Status of an accepted call. */
enum rpc_accept_stat {
	RPC_SUCCESS = 0,
	RPC_PROG_UNAVAIL = 1,
	RPC_PROG_MISMATCH = 2,
	RPC_PROC_UNAVAIL = 3,
	RPC_GARBAGE_ARGS = 4,
	RPC_SYSTEM_ERR = 5,
};
typedef enum rpc_accept_stat rpc_accept_stat;

/* Why a call was rejected outright. */
enum rpc_reject_stat {
	RPC_MISMATCH = 0,
	RPC_AUTH_ERROR = 1,
};
typedef enum rpc_reject_stat rpc_reject_stat;

/* Authentication failure detail codes. */
enum rpc_auth_stat {
	RPC_AUTH_OK = 0,
	RPC_AUTH_BADCRED = 1,
	RPC_AUTH_REJECTEDCRED = 2,
	RPC_AUTH_BADVERF = 3,
	RPC_AUTH_REJECTEDVERF = 4,
	RPC_AUTH_TOOWEAK = 5,
	RPC_AUTH_INVALIDRESP = 6,
	RPC_AUTH_FAILED = 7,
};
typedef enum rpc_auth_stat rpc_auth_stat;

/* Credential or verifier: flavor plus opaque body (bounded at 400 bytes
 * on the wire, see the matching xdr_bytes limit). */
struct rpc_opaque_auth {
	rpc_auth_flavor flavor;
	xdr_string_t body;
};
typedef struct rpc_opaque_auth rpc_opaque_auth;

/* Call header; procedure-specific parameters follow it on the wire. */
struct rpc_call_body {
	u_int rpcvers;
	u_int prog;
	u_int vers;
	u_int proc;
	rpc_opaque_auth cred;
	rpc_opaque_auth verf;
};
typedef struct rpc_call_body rpc_call_body;

/* Supported version range, reported on PROG_MISMATCH / RPC_MISMATCH. */
struct rpc_mismatch_info {
	u_int min_version;
	u_int max_version;
};
typedef struct rpc_mismatch_info rpc_mismatch_info;

/* Body of an accepted reply; the anonymous union arm is only present on
 * the wire when stat == RPC_PROG_MISMATCH. */
struct rpc_accepted_reply_body {
	rpc_accept_stat stat;
	union {
		rpc_mismatch_info mismatch_info;
	};
};
typedef struct rpc_accepted_reply_body rpc_accepted_reply_body;

struct rpc_accepted_reply {
	rpc_opaque_auth verf;
	rpc_accepted_reply_body reply_data;
};
typedef struct rpc_accepted_reply rpc_accepted_reply;

/* Rejected reply: union arm selected by stat. */
struct rpc_rejected_reply {
	rpc_reject_stat stat;
	union {
		rpc_mismatch_info mismatch_info;
		rpc_auth_stat auth_stat;
	};
};
typedef struct rpc_rejected_reply rpc_rejected_reply;

struct rpc_reply_body {
	rpc_reply_stat stat;
	union {
		rpc_accepted_reply areply;
		rpc_rejected_reply rreply;
	};
};
typedef struct rpc_reply_body rpc_reply_body;

struct rpc_msg_body {
	rpc_msg_type dir;
	union {
		rpc_call_body cbody;
		rpc_reply_body rbody;
	};
};
typedef struct rpc_msg_body rpc_msg_body;

/* Top-level RPC message: transaction id + call-or-reply body. */
struct rpc_msg {
	u_int xid;
	rpc_msg_body body;
};
typedef struct rpc_msg rpc_msg;

/* the xdr functions */

extern bool_t xdr_rpc_auth_flavor (XDR *, rpc_auth_flavor*);
extern bool_t xdr_rpc_msg_type (XDR *, rpc_msg_type*);
extern bool_t xdr_rpc_reply_stat (XDR *, rpc_reply_stat*);
extern bool_t xdr_rpc_accept_stat (XDR *, rpc_accept_stat*);
extern bool_t xdr_rpc_reject_stat (XDR *, rpc_reject_stat*);
extern bool_t xdr_rpc_auth_stat (XDR *, rpc_auth_stat*);
extern bool_t xdr_rpc_opaque_auth (XDR *, rpc_opaque_auth*);
extern bool_t xdr_rpc_call_body (XDR *, rpc_call_body*);
extern bool_t xdr_rpc_mismatch_info (XDR *, rpc_mismatch_info*);
extern bool_t xdr_rpc_accepted_reply_body (XDR *, rpc_accepted_reply_body*);
extern bool_t xdr_rpc_accepted_reply (XDR *, rpc_accepted_reply*);
extern bool_t xdr_rpc_rejected_reply (XDR *, rpc_rejected_reply*);
extern bool_t xdr_rpc_reply_body (XDR *, rpc_reply_body*);
extern bool_t xdr_rpc_msg_body (XDR *, rpc_msg_body*);
extern bool_t xdr_rpc_msg (XDR *, rpc_msg*);

#ifdef __cplusplus
}
#endif

#endif /* !_RPC_H_RPCGEN */
|
113
src/nfs/rpc.x
113
src/nfs/rpc.x
@@ -1,113 +0,0 @@
|
||||
/* Based on RFC 5531 - RPC: Remote Procedure Call Protocol Specification Version 2 */

const RPC_MSG_VERSION = 2;

/* Authentication flavors (RFC 5531 section 8.2). */
enum rpc_auth_flavor {
	RPC_AUTH_NONE = 0,
	RPC_AUTH_SYS = 1,
	RPC_AUTH_SHORT = 2,
	RPC_AUTH_DH = 3,
	RPC_RPCSEC_GSS = 6
};

enum rpc_msg_type {
	RPC_CALL = 0,
	RPC_REPLY = 1
};

enum rpc_reply_stat {
	RPC_MSG_ACCEPTED = 0,
	RPC_MSG_DENIED = 1
};

enum rpc_accept_stat {
	RPC_SUCCESS = 0,
	RPC_PROG_UNAVAIL = 1,
	RPC_PROG_MISMATCH = 2,
	RPC_PROC_UNAVAIL = 3,
	RPC_GARBAGE_ARGS = 4,
	RPC_SYSTEM_ERR = 5
};

enum rpc_reject_stat {
	RPC_MISMATCH = 0,
	RPC_AUTH_ERROR = 1
};

enum rpc_auth_stat {
	RPC_AUTH_OK = 0,
	/*
	 * failed at remote end
	 */
	RPC_AUTH_BADCRED = 1, /* bogus credentials (seal broken) */
	RPC_AUTH_REJECTEDCRED = 2, /* client should begin new session */
	RPC_AUTH_BADVERF = 3, /* bogus verifier (seal broken) */
	RPC_AUTH_REJECTEDVERF = 4, /* verifier expired or was replayed */
	RPC_AUTH_TOOWEAK = 5, /* rejected due to security reasons */
	/*
	 * failed locally
	 */
	RPC_AUTH_INVALIDRESP = 6, /* bogus response verifier */
	RPC_AUTH_FAILED = 7 /* some unknown reason */
};

/* Credential or verifier; body is capped at 400 bytes on the wire. */
struct rpc_opaque_auth {
	rpc_auth_flavor flavor;
	opaque body<400>;
};

struct rpc_call_body {
	u_int rpcvers;
	u_int prog;
	u_int vers;
	u_int proc;
	rpc_opaque_auth cred;
	rpc_opaque_auth verf;
	/* procedure-specific parameters start here */
};

struct rpc_mismatch_info {
	unsigned int min_version;
	unsigned int max_version;
};

/* Discriminated unions below mirror the reply structure of RFC 5531
 * section 9: accepted vs. denied, with per-status payloads. */
union rpc_accepted_reply_body switch (rpc_accept_stat stat) {
case RPC_SUCCESS:
	void;
	/* procedure-specific results start here */
case RPC_PROG_MISMATCH:
	rpc_mismatch_info mismatch_info;
default:
	void;
};

struct rpc_accepted_reply {
	rpc_opaque_auth verf;
	rpc_accepted_reply_body reply_data;
};

union rpc_rejected_reply switch (rpc_reject_stat stat) {
case RPC_MISMATCH:
	rpc_mismatch_info mismatch_info;
case RPC_AUTH_ERROR:
	rpc_auth_stat auth_stat;
};

union rpc_reply_body switch (rpc_reply_stat stat) {
case RPC_MSG_ACCEPTED:
	rpc_accepted_reply areply;
case RPC_MSG_DENIED:
	rpc_rejected_reply rreply;
};

union rpc_msg_body switch (rpc_msg_type dir) {
case RPC_CALL:
	rpc_call_body cbody;
case RPC_REPLY:
	rpc_reply_body rbody;
};

/* Top-level RPC message: transaction id + call-or-reply body. */
struct rpc_msg {
	u_int xid;
	rpc_msg_body body;
};
|
@@ -1,43 +0,0 @@
|
||||
#pragma once

#include "rpc.h"

struct rpc_op_t;

// Handler should return 1 if the request is processed asynchronously
// and requires the incoming message to not be freed until processing ends,
// 0 otherwise.
typedef int (*rpc_handler_t)(void *opaque, rpc_op_t *rop);

// One registered RPC procedure: the (prog, vers, proc) triple it serves,
// its handler, the XDR codecs for request/response, the in-memory sizes
// of the request/response structures, and an opaque pointer passed back
// to the handler.
struct rpc_service_proc_t
{
    uint32_t prog;
    uint32_t vers;
    uint32_t proc;
    rpc_handler_t handler_fn;
    xdrproc_t req_fn;
    uint32_t req_size;
    xdrproc_t resp_fn;
    uint32_t resp_size;
    void *opaque;
};

// Lexicographic ordering by (prog, vers, proc), used to look procedures up.
// Parentheses added around the && terms: behavior is unchanged (&& binds
// tighter than ||), but the explicit grouping silences -Wparentheses and
// makes the intent obvious.
inline bool operator < (const rpc_service_proc_t & a, const rpc_service_proc_t & b)
{
    return a.prog < b.prog ||
        (a.prog == b.prog && (a.vers < b.vers ||
            (a.vers == b.vers && a.proc < b.proc)));
}

// One in-flight RPC operation: the raw receive buffer, the XDR stream it
// is being decoded from, decoded request/reply message headers, pointers
// to the procedure-specific request/reply structures, and the codec used
// to serialize the reply.
struct rpc_op_t
{
    void *client;
    uint8_t *buffer;
    XDR *xdrs;
    rpc_msg in_msg, out_msg;
    void *request;
    void *reply;
    xdrproc_t reply_fn;
    uint32_t reply_marker;
    bool referenced;
};

void rpc_queue_reply(rpc_op_t *rop);
|
@@ -1,253 +0,0 @@
|
||||
/*
|
||||
* Please do not edit this file.
|
||||
* It was generated using rpcgen.
|
||||
*/
|
||||
|
||||
#include "rpc.h"
|
||||
#include "xdr_impl_inline.h"
|
||||
|
||||
bool_t
|
||||
xdr_rpc_auth_flavor (XDR *xdrs, rpc_auth_flavor *objp)
|
||||
{
|
||||
|
||||
if (!xdr_enum (xdrs, (enum_t *) objp))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_rpc_msg_type (XDR *xdrs, rpc_msg_type *objp)
|
||||
{
|
||||
|
||||
if (!xdr_enum (xdrs, (enum_t *) objp))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_rpc_reply_stat (XDR *xdrs, rpc_reply_stat *objp)
|
||||
{
|
||||
|
||||
if (!xdr_enum (xdrs, (enum_t *) objp))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_rpc_accept_stat (XDR *xdrs, rpc_accept_stat *objp)
|
||||
{
|
||||
|
||||
if (!xdr_enum (xdrs, (enum_t *) objp))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_rpc_reject_stat (XDR *xdrs, rpc_reject_stat *objp)
|
||||
{
|
||||
|
||||
if (!xdr_enum (xdrs, (enum_t *) objp))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_rpc_auth_stat (XDR *xdrs, rpc_auth_stat *objp)
|
||||
{
|
||||
|
||||
if (!xdr_enum (xdrs, (enum_t *) objp))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_rpc_opaque_auth (XDR *xdrs, rpc_opaque_auth *objp)
|
||||
{
|
||||
|
||||
if (!xdr_rpc_auth_flavor (xdrs, &objp->flavor))
|
||||
return FALSE;
|
||||
if (!xdr_bytes(xdrs, &objp->body, 400))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_rpc_call_body (XDR *xdrs, rpc_call_body *objp)
|
||||
{
|
||||
|
||||
|
||||
if (xdrs->x_op == XDR_ENCODE) {
|
||||
if (1) {
|
||||
if (!xdr_u_int (xdrs, &objp->rpcvers))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->prog))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->vers))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->proc))
|
||||
return FALSE;
|
||||
|
||||
} else {
|
||||
IXDR_PUT_U_LONG(buf, objp->rpcvers);
|
||||
IXDR_PUT_U_LONG(buf, objp->prog);
|
||||
IXDR_PUT_U_LONG(buf, objp->vers);
|
||||
IXDR_PUT_U_LONG(buf, objp->proc);
|
||||
}
|
||||
if (!xdr_rpc_opaque_auth (xdrs, &objp->cred))
|
||||
return FALSE;
|
||||
if (!xdr_rpc_opaque_auth (xdrs, &objp->verf))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
} else if (xdrs->x_op == XDR_DECODE) {
|
||||
if (1) {
|
||||
if (!xdr_u_int (xdrs, &objp->rpcvers))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->prog))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->vers))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->proc))
|
||||
return FALSE;
|
||||
|
||||
} else {
|
||||
objp->rpcvers = IXDR_GET_U_LONG(buf);
|
||||
objp->prog = IXDR_GET_U_LONG(buf);
|
||||
objp->vers = IXDR_GET_U_LONG(buf);
|
||||
objp->proc = IXDR_GET_U_LONG(buf);
|
||||
}
|
||||
if (!xdr_rpc_opaque_auth (xdrs, &objp->cred))
|
||||
return FALSE;
|
||||
if (!xdr_rpc_opaque_auth (xdrs, &objp->verf))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
if (!xdr_u_int (xdrs, &objp->rpcvers))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->prog))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->vers))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->proc))
|
||||
return FALSE;
|
||||
if (!xdr_rpc_opaque_auth (xdrs, &objp->cred))
|
||||
return FALSE;
|
||||
if (!xdr_rpc_opaque_auth (xdrs, &objp->verf))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_rpc_mismatch_info (XDR *xdrs, rpc_mismatch_info *objp)
|
||||
{
|
||||
|
||||
if (!xdr_u_int (xdrs, &objp->min_version))
|
||||
return FALSE;
|
||||
if (!xdr_u_int (xdrs, &objp->max_version))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_rpc_accepted_reply_body (XDR *xdrs, rpc_accepted_reply_body *objp)
|
||||
{
|
||||
|
||||
if (!xdr_rpc_accept_stat (xdrs, &objp->stat))
|
||||
return FALSE;
|
||||
switch (objp->stat) {
|
||||
case RPC_SUCCESS:
|
||||
break;
|
||||
case RPC_PROG_MISMATCH:
|
||||
if (!xdr_rpc_mismatch_info (xdrs, &objp->mismatch_info))
|
||||
return FALSE;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_rpc_accepted_reply (XDR *xdrs, rpc_accepted_reply *objp)
|
||||
{
|
||||
|
||||
if (!xdr_rpc_opaque_auth (xdrs, &objp->verf))
|
||||
return FALSE;
|
||||
if (!xdr_rpc_accepted_reply_body (xdrs, &objp->reply_data))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_rpc_rejected_reply (XDR *xdrs, rpc_rejected_reply *objp)
|
||||
{
|
||||
|
||||
if (!xdr_rpc_reject_stat (xdrs, &objp->stat))
|
||||
return FALSE;
|
||||
switch (objp->stat) {
|
||||
case RPC_MISMATCH:
|
||||
if (!xdr_rpc_mismatch_info (xdrs, &objp->mismatch_info))
|
||||
return FALSE;
|
||||
break;
|
||||
case RPC_AUTH_ERROR:
|
||||
if (!xdr_rpc_auth_stat (xdrs, &objp->auth_stat))
|
||||
return FALSE;
|
||||
break;
|
||||
default:
|
||||
return FALSE;
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_rpc_reply_body (XDR *xdrs, rpc_reply_body *objp)
|
||||
{
|
||||
|
||||
if (!xdr_rpc_reply_stat (xdrs, &objp->stat))
|
||||
return FALSE;
|
||||
switch (objp->stat) {
|
||||
case RPC_MSG_ACCEPTED:
|
||||
if (!xdr_rpc_accepted_reply (xdrs, &objp->areply))
|
||||
return FALSE;
|
||||
break;
|
||||
case RPC_MSG_DENIED:
|
||||
if (!xdr_rpc_rejected_reply (xdrs, &objp->rreply))
|
||||
return FALSE;
|
||||
break;
|
||||
default:
|
||||
return FALSE;
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_rpc_msg_body (XDR *xdrs, rpc_msg_body *objp)
|
||||
{
|
||||
|
||||
if (!xdr_rpc_msg_type (xdrs, &objp->dir))
|
||||
return FALSE;
|
||||
switch (objp->dir) {
|
||||
case RPC_CALL:
|
||||
if (!xdr_rpc_call_body (xdrs, &objp->cbody))
|
||||
return FALSE;
|
||||
break;
|
||||
case RPC_REPLY:
|
||||
if (!xdr_rpc_reply_body (xdrs, &objp->rbody))
|
||||
return FALSE;
|
||||
break;
|
||||
default:
|
||||
return FALSE;
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t
|
||||
xdr_rpc_msg (XDR *xdrs, rpc_msg *objp)
|
||||
{
|
||||
|
||||
if (!xdr_u_int (xdrs, &objp->xid))
|
||||
return FALSE;
|
||||
if (!xdr_rpc_msg_body (xdrs, &objp->body))
|
||||
return FALSE;
|
||||
return TRUE;
|
||||
}
|
@@ -1,48 +0,0 @@
|
||||
#!/bin/bash
# Regenerate XDR (de)serialization code from .x protocol definitions with
# rpcgen, then rewrite the output with perl so it targets the custom XDR
# implementation (xdr_impl.h / xdr_impl_inline.h) instead of <rpc/rpc.h>.

set -e

# Transformations applied to rpcgen output:
# 1) remove all extern non-xdr functions (service, client)
# 2) use xdr_string_t for strings instead of char*
# 3) remove K&R #ifdefs
# 4) remove register int32_t* buf
# 5) remove union names
# 6) use xdr_string_t for opaques instead of u_int + char*
# 7) TODO: generate normal procedure stubs

# $1 = protocol basename; reads $1.x, writes $1.h and $1_xdr.cpp
run_rpcgen() {
    # Header: slurp whole file ($/ = undef), then rewrite declarations
    rpcgen -h $1.x | \
        perl -e '
        { local $/ = undef; $_ = <>; }
        s/^extern(?!.*"C"|.*bool_t xdr.*XDR).*\n//gm;
        s/#include <rpc\/rpc.h>/#include "xdr_impl.h"/;
        s/^typedef char \*/typedef xdr_string_t /gm;
        s/^(\s*)char \*(?!.*_val)/$1xdr_string_t /gm;
        # remove union names
        s/ \w+_u;/;/gs;
        # use xdr_string_t for opaques
        s/struct\s*\{\s*u_int\s+\w+_len;\s*char\s+\*\w+_val;\s*\}\s*/xdr_string_t /gs;
        # remove stdc/k&r
        s/^#if.*__STDC__.*//gm;
        s/\n#else[^\n]*K&R.*?\n#endif[^\n]*K&R[^\n]*//gs;
        print;' > $1.h
    # Implementation: strip the rpcgen inline-buffer optimization and adapt
    # opaque handling to the two-field xdr_string_t representation
    rpcgen -c $1.x | \
        perl -pe '
        s/register int32_t \*buf;\s*//g;
        s/\bbuf\s*=[^;]+;\s*//g;
        s/\bbuf\s*==\s*NULL/1/g;
        # remove union names
        s/(\.|->)\w+_u\./$1/g;
        # use xdr_string_t for opaques
        # xdr_bytes(xdrs, (char**)&objp->data.data_val, (char**)&objp->data.data_len, 400)
        # -> xdr_bytes(xdrs, &objp->data, 400)
        # xdr_bytes(xdrs, (char**)&objp->data_val, (char**)&objp->data_len, 400)
        # -> xdr_bytes(xdrs, objp, 400)
        s/xdr_bytes\s*\(\s*xdrs,\s*\(\s*char\s*\*\*\s*\)\s*([^()]+?)\.\w+_val\s*,\s*\(\s*u_int\s*\*\s*\)\s*\1\.\w+_len,/xdr_bytes(xdrs, $1,/gs;
        s/xdr_bytes\s*\(\s*xdrs,\s*\(\s*char\s*\*\*\s*\)\s*&\s*([^()]+?)->\w+_val\s*,\s*\(\s*u_int\s*\*\s*\)\s*&\s*\1->\w+_len,/xdr_bytes(xdrs, $1,/gs;
        # add include
        if (/#include/) { $_ .= "#include \"xdr_impl_inline.h\"\n"; }' > ${1}_xdr.cpp
}

run_rpcgen nfs
run_rpcgen rpc
run_rpcgen portmap
|
@@ -1,107 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// Efficient XDR implementation almost compatible with rpcgen (see run-rpcgen.sh)
|
||||
|
||||
#include "xdr_impl_inline.h"
|
||||
|
||||
// Allocate a fresh XDR (de)serialization context.
// The caller owns it and must release it with xdr_destroy().
XDR* xdr_create()
{
    return new XDR;
}
|
||||
|
||||
// Free an XDR context: release all tracked allocations first (xdr_reset),
// then the context itself.
void xdr_destroy(XDR* xdrs)
{
    xdr_reset(xdrs);
    delete xdrs;
}
|
||||
|
||||
// Release all memory and state accumulated by previous encode/decode calls,
// returning the context to a pristine, reusable state.
void xdr_reset(XDR *xdrs)
{
    for (auto ptr: xdrs->allocs)
        free(ptr);
    xdrs->allocs.clear();
    xdrs->in_linked_list.clear();
    xdrs->cur_out.clear();
    xdrs->buf_list.clear();
    xdrs->buf = NULL;
    xdrs->avail = 0;
    xdrs->last_end = 0;
}
|
||||
|
||||
// Decode <size> bytes from <buf> into <data> using decoder <fn>.
// Returns non-zero on success. Decoded structures may reference <buf> and
// memory tracked by the context, valid only until the next
// xdr_reset/xdr_decode/xdr_encode/xdr_destroy.
int xdr_decode(XDR *xdrs, void *buf, unsigned size, xdrproc_t fn, void *data)
{
    xdrs->x_op = XDR_DECODE;
    xdrs->buf = (uint8_t*)buf;
    xdrs->avail = size;
    return fn(xdrs, data);
}
|
||||
|
||||
// Encode <data> with <fn>, appending to the context's output buffers.
// May be called multiple times to build up one message; collect the result
// with xdr_encode_finish(). Returns non-zero on success.
int xdr_encode(XDR *xdrs, xdrproc_t fn, void *data)
{
    xdrs->x_op = XDR_ENCODE;
    return fn(xdrs, data);
}
|
||||
|
||||
// Finalize encoding and expose the message as an iovec list.
// During encoding, large strings/opaques were recorded as references to
// caller memory, while copied bytes (in cur_out) were recorded as placeholder
// iovecs with iov_base == NULL (see xdr_bytes). Resolving the placeholders to
// real addresses inside cur_out is deferred until now, because cur_out could
// still be reallocated (and its data moved) while encoding was in progress.
// The returned iovecs stay valid until the next xdr_reset/xdr_destroy.
void xdr_encode_finish(XDR *xdrs, iovec **iov_list, unsigned *iov_count)
{
    // Cover any trailing copied bytes not yet represented by an iovec
    if (xdrs->last_end < xdrs->cur_out.size())
    {
        xdrs->buf_list.push_back((iovec){
            .iov_base = 0,
            .iov_len = xdrs->cur_out.size() - xdrs->last_end,
        });
        xdrs->last_end = xdrs->cur_out.size();
    }
    // Walk cur_out sequentially, assigning each placeholder its final address
    uint8_t *cur_buf = xdrs->cur_out.data();
    for (auto & buf: xdrs->buf_list)
    {
        if (!buf.iov_base)
        {
            buf.iov_base = cur_buf;
            cur_buf += buf.iov_len;
        }
    }
    *iov_list = xdrs->buf_list.data();
    *iov_count = xdrs->buf_list.size();
}
|
||||
|
||||
// Debug helper: print the whole encoded message (all output iovecs) as one
// hex string followed by a newline.
void xdr_dump_encoded(XDR *xdrs)
{
    for (auto & buf: xdrs->buf_list)
    {
        // iov_len is size_t - use a matching unsigned index instead of the
        // original plain int (signed/unsigned comparison)
        for (size_t i = 0; i < buf.iov_len; i++)
            printf("%02x", ((uint8_t*)buf.iov_base)[i]);
    }
    printf("\n");
}
|
||||
|
||||
// Transfer ownership of a malloc'd buffer to the XDR context; it will be
// free()d on the next xdr_reset() or xdr_destroy().
void xdr_add_malloc(XDR *xdrs, void *buf)
{
    xdrs->allocs.push_back(buf);
}
|
||||
|
||||
// Copy an std::string into a NUL-terminated buffer owned by the XDR context
// (freed on xdr_reset/xdr_destroy).
// Delegates to the (const char*, size_t) overload so the
// allocate+copy+terminate+track logic exists in one place only.
xdr_string_t xdr_copy_string(XDR *xdrs, const std::string & str)
{
    return xdr_copy_string(xdrs, str.data(), str.size());
}
|
||||
|
||||
// Copy a NUL-terminated C string into a buffer owned by the XDR context.
xdr_string_t xdr_copy_string(XDR *xdrs, const char *str)
{
    return xdr_copy_string(xdrs, str, strlen(str));
}
|
||||
|
||||
// Copy <len> bytes of <str> into a freshly allocated NUL-terminated buffer
// owned by the XDR context (freed on xdr_reset/xdr_destroy).
// The returned xdr_string_t's size excludes the terminator.
xdr_string_t xdr_copy_string(XDR *xdrs, const char *str, size_t len)
{
    char *cp = (char*)malloc_or_die(len+1);
    memcpy(cp, str, len);
    cp[len] = 0;
    xdr_add_malloc(xdrs, cp);
    return (xdr_string_t){ len, cp };
}
|
@@ -1,83 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// Efficient XDR implementation almost compatible with rpcgen (see run-rpcgen.sh)
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <sys/uio.h>
|
||||
#include <stdint.h>
|
||||
#include <string>
|
||||
|
||||
#define XDR_COPY_LENGTH 128
|
||||
|
||||
// Length+pointer string/opaque reference. <data> is NOT necessarily
// NUL-terminated - on decode it points straight into the input buffer.
struct xdr_string_t
{
    size_t size;
    char *data;

    operator std::string()
    {
        return std::string(data, size);
    }

    // Compare with a NUL-terminated C string without copying.
    // NULL compares unequal to everything.
    bool operator == (const char *str)
    {
        if (!str)
            return false;
        // size_t index to match <size> - the original used int, a
        // signed/unsigned comparison that breaks for sizes > INT_MAX
        size_t i;
        for (i = 0; i < size; i++)
            if (!str[i] || str[i] != data[i])
                return false;
        if (str[i])
            return false;
        return true;
    }

    bool operator != (const char *str)
    {
        return !(*this == str);
    }
};
|
||||
|
||||
typedef uint32_t u_int;
|
||||
typedef uint32_t enum_t;
|
||||
typedef uint32_t bool_t;
|
||||
struct XDR;
|
||||
typedef int (*xdrproc_t)(XDR *xdrs, void *data);
|
||||
|
||||
// Create an empty XDR object
|
||||
XDR* xdr_create();
|
||||
|
||||
// Destroy the XDR object
|
||||
void xdr_destroy(XDR* xdrs);
|
||||
|
||||
// Free resources from any previous xdr_decode/xdr_encode calls
|
||||
void xdr_reset(XDR *xdrs);
|
||||
|
||||
// Try to decode <size> bytes from buffer <buf> using <fn>
|
||||
// Result may contain memory allocations that will be valid until the next call to xdr_{reset,destroy,decode,encode}
|
||||
int xdr_decode(XDR *xdrs, void *buf, unsigned size, xdrproc_t fn, void *data);
|
||||
|
||||
// Try to encode <data> using <fn>
|
||||
// May be mixed with xdr_decode
|
||||
// May be called multiple times to encode multiple parts of the same message
|
||||
int xdr_encode(XDR *xdrs, xdrproc_t fn, void *data);
|
||||
|
||||
// Get the result of previous xdr_encodes as a list of <struct iovec>'s
|
||||
// in <iov_list> (start) and <iov_count> (count).
|
||||
// The resulting iov_list is valid until the next call to xdr_{reset,destroy}.
|
||||
// It may contain references to the original data, so original data must not
|
||||
// be freed until the result is fully processed (sent).
|
||||
void xdr_encode_finish(XDR *xdrs, iovec **iov_list, unsigned *iov_count);
|
||||
|
||||
// Remember an allocated buffer to free it later on xdr_reset() or xdr_destroy()
|
||||
void xdr_add_malloc(XDR *xdrs, void *buf);
|
||||
|
||||
xdr_string_t xdr_copy_string(XDR *xdrs, const std::string & str);
|
||||
|
||||
xdr_string_t xdr_copy_string(XDR *xdrs, const char *str);
|
||||
|
||||
xdr_string_t xdr_copy_string(XDR *xdrs, const char *str, size_t len);
|
||||
|
||||
void xdr_dump_encoded(XDR *xdrs);
|
@@ -1,309 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// Efficient XDR implementation almost compatible with rpcgen (see run-rpcgen.sh)
|
||||
|
||||
// XDR in a nutshell:
|
||||
//
|
||||
// int: big endian 32bit
|
||||
// unsigned: BE 32bit
|
||||
// enum: BE 32bit
|
||||
// bool: BE 32bit 0/1
|
||||
// hyper: BE 64bit
|
||||
// unsigned hyper: BE 64bit
|
||||
// float: BE float
|
||||
// double: BE double
|
||||
// quadruple: BE long double
|
||||
// opaque[n] (fixed-length): bytes, padded to !(n%4)
|
||||
// opaque (variable-length): BE 32bit length, then n bytes, padded to !(n%4)
|
||||
// string: same as opaque
|
||||
// array<T>[n] (fixed-length): n items of type T
|
||||
// vector<T> (variable-length): BE 32bit length, then n items of type T
|
||||
// struct: components in the same order as specified
|
||||
// union: BE 32bit variant id, then variant of the union
|
||||
// void: nothing (empty, 0 byte data)
|
||||
// optional (XDR T*): BE 32bit 1/0, then T or nothing
|
||||
// linked list: sequence of optional entries
|
||||
//
|
||||
// RPC over TCP:
|
||||
//
|
||||
// BE 32bit length, then rpc_msg, then the procedure message itself
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "xdr_impl.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <endian.h>
|
||||
#include <vector>
|
||||
|
||||
#include "../malloc_or_die.h"
|
||||
|
||||
#define FALSE 0
|
||||
#define TRUE 1
|
||||
#define XDR_ENCODE 0
|
||||
#define XDR_DECODE 1
|
||||
#define BYTES_PER_XDR_UNIT 4
|
||||
#define IXDR_PUT_U_LONG(a, b)
|
||||
#define IXDR_GET_U_LONG(a) 0
|
||||
#define IXDR_PUT_BOOL(a, b)
|
||||
#define IXDR_GET_BOOL(a) 0
|
||||
#define XDR_INLINE(xdrs, len) NULL
|
||||
|
||||
// Bookkeeping for decoding one XDR "linked list" (a chain of optional
// entries) into a single contiguous array; see xdr_pointer().
struct xdr_linked_list_t
{
    xdrproc_t fn;                    // entry decoder - identifies which list a nested xdr_pointer call belongs to
    unsigned entry_size, size, cap;  // bytes per entry / entries decoded / allocated capacity
    void *base;                      // contiguous storage for the decoded entries
    unsigned has_next, link_offset;  // pending "next" flag and byte offset of the next-pointer field inside an entry
};
|
||||
|
||||
// (De)serialization context. One object can be reused for both decoding
// (reads from an external buffer) and encoding (appends to internal buffers);
// xdr_reset() frees everything accumulated by either.
struct XDR
{
    int x_op;  // XDR_ENCODE or XDR_DECODE

    // For decoding:
    uint8_t *buf = NULL;        // current read position in the input buffer
    unsigned avail = 0;         // bytes remaining at <buf>
    std::vector<void*> allocs;  // malloc'd blocks to free on xdr_reset()
    std::vector<xdr_linked_list_t> in_linked_list;  // stack of linked lists currently being decoded

    // For encoding:
    std::vector<uint8_t> cur_out;   // bytes copied into the message so far
    unsigned last_end = 0;          // length of cur_out already covered by buf_list entries
    std::vector<iovec> buf_list;    // output iovecs; iov_base == NULL marks a slice of cur_out (fixed up in xdr_encode_finish)
};
|
||||
|
||||
// Round <len> up to the next multiple of 4 (XDR items are 32-bit aligned).
uint32_t inline len_pad4(uint32_t len)
{
    return (len + 3) & ~(uint32_t)3;
}
|
||||
|
||||
// (De)serialize a fixed-length opaque: <len> raw bytes, padded with zeroes
// to a multiple of 4. Returns 0 on decode underrun, 1 otherwise.
inline int xdr_opaque(XDR *xdrs, void *data, uint32_t len)
{
    if (len <= 0)
    {
        return 1;
    }
    if (xdrs->x_op == XDR_DECODE)
    {
        uint32_t padded = len_pad4(len);
        if (xdrs->avail < padded)
            return 0;
        memcpy(data, xdrs->buf, len);
        xdrs->buf += padded;
        xdrs->avail -= padded;
    }
    else
    {
        unsigned old = xdrs->cur_out.size();
        uint32_t pad = (len & 3) ? (4 - (len & 3)) : 0;
        xdrs->cur_out.resize(old + len + pad);
        memcpy(xdrs->cur_out.data()+old, data, len);
        // Zero the padding AFTER the payload. The original wrote zeroes at
        // cur_out[old+i], clobbering the first <pad> bytes of the payload
        // and leaving the actual tail padding uninitialized.
        for (uint32_t i = 0; i < pad; i++)
            xdrs->cur_out[old+len+i] = 0;
    }
    return 1;
}
|
||||
|
||||
// (De)serialize a variable-length opaque: BE 32-bit length, then the bytes,
// padded to a multiple of 4. Decode is zero-copy: the result points into the
// input buffer. Encode copies small payloads (< XDR_COPY_LENGTH) into cur_out
// and references large ones in place via an extra iovec.
// NOTE(review): <maxlen> is not enforced on either path - confirm whether
// oversized input should be rejected here.
inline int xdr_bytes(XDR *xdrs, xdr_string_t *data, uint32_t maxlen)
{
    if (xdrs->x_op == XDR_DECODE)
    {
        if (xdrs->avail < 4)
            return 0;
        uint32_t len = be32toh(*((uint32_t*)xdrs->buf));
        uint32_t padded = len_pad4(len);
        if (xdrs->avail < 4+padded)
            return 0;
        // zero-copy: reference the payload inside the input buffer
        data->size = len;
        data->data = (char*)(xdrs->buf+4);
        xdrs->buf += 4+padded;
        xdrs->avail -= 4+padded;
    }
    else
    {
        if (data->size < XDR_COPY_LENGTH)
        {
            // small payload: cheaper to copy than to add two extra iovecs
            unsigned old = xdrs->cur_out.size();
            xdrs->cur_out.resize(old + 4+data->size);
            *(uint32_t*)(xdrs->cur_out.data() + old) = htobe32(data->size);
            memcpy(xdrs->cur_out.data()+old+4, data->data, data->size);
        }
        else
        {
            // large payload: write only the length into cur_out, then emit
            // a placeholder iovec (iov_base == NULL, resolved in
            // xdr_encode_finish) for the copied bytes so far, followed by a
            // reference to the caller's buffer - which must stay alive until
            // the encoded message has been sent
            unsigned old = xdrs->cur_out.size();
            xdrs->cur_out.resize(old + 4);
            *(uint32_t*)(xdrs->cur_out.data() + old) = htobe32(data->size);
            xdrs->buf_list.push_back((iovec){
                .iov_base = 0,
                .iov_len = xdrs->cur_out.size() - xdrs->last_end,
            });
            xdrs->last_end = xdrs->cur_out.size();
            xdrs->buf_list.push_back((iovec)
            {
                .iov_base = (void*)data->data,
                .iov_len = data->size,
            });
        }
        if (data->size & 3)
        {
            // zero padding goes into cur_out; for the zero-copy branch it
            // becomes part of the next placeholder iovec
            int pad = 4-(data->size & 3);
            unsigned old = xdrs->cur_out.size();
            xdrs->cur_out.resize(old+pad);
            for (int i = 0; i < pad; i++)
                xdrs->cur_out[old+i] = 0;
        }
    }
    return 1;
}
|
||||
|
||||
// XDR strings share the exact wire format of variable-length opaques.
inline int xdr_string(XDR *xdrs, xdr_string_t *data, uint32_t maxlen)
{
    return xdr_bytes(xdrs, data, maxlen);
}
|
||||
|
||||
// (De)serialize a big-endian 32-bit unsigned integer.
// Returns 0 on decode underrun, 1 otherwise.
inline int xdr_u_int(XDR *xdrs, void *data)
{
    if (xdrs->x_op == XDR_DECODE)
    {
        if (xdrs->avail < 4)
            return 0;
        *((uint32_t*)data) = be32toh(*((uint32_t*)xdrs->buf));
        xdrs->buf += 4;
        xdrs->avail -= 4;
    }
    else
    {
        unsigned old = xdrs->cur_out.size();
        xdrs->cur_out.resize(old + 4);
        *(uint32_t*)(xdrs->cur_out.data() + old) = htobe32(*(uint32_t*)data);
    }
    return 1;
}
|
||||
|
||||
// Enums are wire-encoded exactly like 32-bit unsigned integers.
inline int xdr_enum(XDR *xdrs, void *data)
{
    return xdr_u_int(xdrs, data);
}
|
||||
|
||||
// Booleans are wire-encoded as 32-bit unsigned integers (0/1).
inline int xdr_bool(XDR *xdrs, void *data)
{
    return xdr_u_int(xdrs, data);
}
|
||||
|
||||
// (De)serialize a big-endian 64-bit unsigned integer ("unsigned hyper").
// Returns 0 on decode underrun, 1 otherwise.
inline int xdr_uint64_t(XDR *xdrs, void *data)
{
    if (xdrs->x_op == XDR_DECODE)
    {
        if (xdrs->avail < 8)
            return 0;
        *((uint64_t*)data) = be64toh(*((uint64_t*)xdrs->buf));
        xdrs->buf += 8;
        xdrs->avail -= 8;
    }
    else
    {
        unsigned old = xdrs->cur_out.size();
        xdrs->cur_out.resize(old + 8);
        *(uint64_t*)(xdrs->cur_out.data() + old) = htobe64(*(uint64_t*)data);
    }
    return 1;
}
|
||||
|
||||
// Parse inconvenient shitty linked lists as arrays
|
||||
// (De)serialize an XDR optional/linked list. On the wire a linked list is a
// sequence of (bool has_next, entry) pairs. On decode the whole chain is
// materialized into ONE contiguous realloc-grown array, and the entries'
// next-pointers are fixed up afterwards to point at their array successors.
// The head call (not inside a matching in_linked_list frame) drives the loop;
// the nested call made by entry_fn for the entry's own "next" field only
// records has_next and the next-pointer's offset inside the entry.
inline int xdr_pointer(XDR *xdrs, char **data, unsigned entry_size, xdrproc_t entry_fn)
{
    if (xdrs->x_op == XDR_DECODE)
    {
        if (xdrs->avail < 4)
            return 0;
        uint32_t has_next = be32toh(*((uint32_t*)xdrs->buf));
        xdrs->buf += 4;
        xdrs->avail -= 4;
        *data = NULL;
        if (!xdrs->in_linked_list.size() ||
            xdrs->in_linked_list.back().fn != entry_fn)
        {
            // Head of a (possibly empty) list
            if (has_next)
            {
                unsigned cap = 2;
                void *base = malloc_or_die(entry_size * cap);
                xdrs->in_linked_list.push_back((xdr_linked_list_t){
                    .fn = entry_fn,
                    .entry_size = entry_size,
                    .size = 1,
                    .cap = cap,
                    .base = base,
                    .has_next = 0,
                    .link_offset = 0,
                });
                if (!entry_fn(xdrs, base))
                    return 0;
                auto & ll = xdrs->in_linked_list.back();
                // Each decoded entry's nested xdr_pointer call sets ll.has_next
                while (ll.has_next)
                {
                    ll.has_next = 0;
                    if (ll.size >= ll.cap)
                    {
                        ll.cap *= 2;
                        ll.base = realloc_or_die(ll.base, ll.entry_size * ll.cap);
                    }
                    if (!entry_fn(xdrs, (uint8_t*)ll.base + ll.entry_size*ll.size))
                        return 0;
                    ll.size++;
                }
                // realloc may have moved the array - refresh the caller's
                // pointer (the original set *data to the initial allocation
                // up front, leaving it dangling after the first realloc)
                *data = (char*)ll.base;
                // Rebuild the next-pointers so the result still reads as a
                // linked list
                for (unsigned i = 0; i < ll.size-1; i++)
                {
                    *(void**)((uint8_t*)ll.base + i*ll.entry_size + ll.link_offset) =
                        (uint8_t*)ll.base + (i+1)*ll.entry_size;
                }
                xdrs->allocs.push_back(ll.base);
                xdrs->in_linked_list.pop_back();
            }
        }
        else
        {
            // Nested call for an entry's "next" field: just record the flag
            // and where the next-pointer lives inside the entry
            auto & ll = xdrs->in_linked_list.back();
            xdrs->in_linked_list.back().has_next = has_next;
            xdrs->in_linked_list.back().link_offset = (uint8_t*)data - (uint8_t*)ll.base - ll.entry_size*ll.size;
        }
    }
    else
    {
        // Encode: presence flag, then the entry (which recursively encodes
        // the rest of the chain through its own next pointer)
        unsigned old = xdrs->cur_out.size();
        xdrs->cur_out.resize(old + 4);
        *(uint32_t*)(xdrs->cur_out.data() + old) = htobe32(*data ? 1 : 0);
        if (*data)
            entry_fn(xdrs, *data);
    }
    return 1;
}
|
||||
|
||||
// (De)serialize a variable-length array: BE 32-bit count, then <count> items
// handled by <fn>. On decode the array is allocated and ownership passes to
// the XDR context (freed on xdr_reset/xdr_destroy).
// Returns 0 on underrun, count > maxlen, or entry (de)serialization failure.
inline int xdr_array(XDR *xdrs, char **data, uint32_t* len, uint32_t maxlen, uint32_t entry_size, xdrproc_t fn)
{
    if (xdrs->x_op == XDR_DECODE)
    {
        if (xdrs->avail < 4)
            return 0;
        *len = be32toh(*((uint32_t*)xdrs->buf));
        if (*len > maxlen)
            return 0;
        xdrs->buf += 4;
        xdrs->avail -= 4;
        *data = (char*)malloc_or_die(entry_size * (*len));
        // Track the allocation before decoding entries so it is not leaked
        // when an entry fails mid-way
        xdrs->allocs.push_back(*data);
        for (uint32_t i = 0; i < *len; i++)
        {
            // propagate entry failures (the original ignored fn's result)
            if (!fn(xdrs, *data + entry_size*i))
                return 0;
        }
    }
    else
    {
        unsigned old = xdrs->cur_out.size();
        xdrs->cur_out.resize(old + 4);
        *(uint32_t*)(xdrs->cur_out.data() + old) = htobe32(*len);
        for (uint32_t i = 0; i < *len; i++)
        {
            if (!fn(xdrs, *data + entry_size*i))
                return 0;
        }
    }
    return 1;
}
|
1301
src/nfs_conn.cpp
1301
src/nfs_conn.cpp
File diff suppressed because it is too large
Load Diff
@@ -1,184 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// Portmap service for NFS proxy
|
||||
|
||||
#include <netinet/in.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "nfs/portmap.h"
|
||||
#include "nfs/xdr_impl_inline.h"
|
||||
|
||||
#include "malloc_or_die.h"
|
||||
#include "nfs_portmap.h"
|
||||
#include "sha256.h"
|
||||
#include "base64.h"
|
||||
|
||||
/*
|
||||
* The NULL procedure. All protocols/versions must provide a NULL procedure
|
||||
* as index 0.
|
||||
* It is used by clients, and rpcinfo, to "ping" a service and verify that
|
||||
* the service is available and that it does support the indicated version.
|
||||
*/
|
||||
/*
 * The NULL procedure. All protocols/versions must provide a NULL procedure
 * as index 0.
 * It is used by clients, and rpcinfo, to "ping" a service and verify that
 * the service is available and that it does support the indicated version.
 */
static int pmap2_null_proc(struct rpc_context *rpc, rpc_op_t *rop)
{
    // No arguments, no result - just acknowledge
    rpc_queue_reply(rop);
    return 0;
}
|
||||
|
||||
/*
|
||||
* v2 GETPORT.
|
||||
* This is the lookup function for portmapper version 2.
|
||||
* A client provides program, version and protocol (tcp or udp)
|
||||
* and portmapper returns which port that service is available on,
|
||||
* (or 0 if no such program is registered.)
|
||||
*/
|
||||
/*
 * v2 GETPORT.
 * This is the lookup function for portmapper version 2.
 * A client provides program, version and protocol (tcp or udp)
 * and portmapper returns which port that service is available on,
 * (or 0 if no such program is registered.)
 */
static int pmap2_getport_proc(portmap_service_t *self, rpc_op_t *rop)
{
    PMAP2GETPORTargs *args = (PMAP2GETPORTargs *)rop->request;
    uint32_t *reply = (uint32_t *)rop->reply;
    // reg_ports is ordered by (prog, vers, udp, ipv6); lower_bound with
    // ipv6=false lands on the first candidate for this (prog, vers, proto)
    auto it = self->reg_ports.lower_bound((portmap_id_t){
        .prog = args->prog,
        .vers = args->vers,
        .udp = args->prot == IPPROTO_UDP,
        .ipv6 = false,
    });
    if (it != self->reg_ports.end() &&
        it->prog == args->prog && it->vers == args->vers &&
        it->udp == (args->prot == IPPROTO_UDP))
    {
        *reply = it->port;
    }
    else
    {
        // not registered: the protocol's "not found" answer is port 0
        *reply = 0;
    }
    rpc_queue_reply(rop);
    return 0;
}
|
||||
|
||||
/*
|
||||
* v2 DUMP.
|
||||
* This RPC returns a list of all endpoints that are registered with
|
||||
* portmapper.
|
||||
*/
|
||||
/*
 * v2 DUMP.
 * This RPC returns a list of all endpoints that are registered with
 * portmapper. Portmap v2 is IPv4-only, so IPv6 registrations are skipped.
 */
static int pmap2_dump_proc(portmap_service_t *self, rpc_op_t *rop)
{
    // One contiguous allocation for the whole list, chained through .next;
    // ownership passes to the XDR context (freed with the reply)
    pmap2_mapping_list *list = (pmap2_mapping_list*)malloc_or_die(sizeof(pmap2_mapping_list) * self->reg_ports.size());
    xdr_add_malloc(rop->xdrs, list);
    PMAP2DUMPres *reply = (PMAP2DUMPres *)rop->reply;
    int i = 0;
    for (auto it = self->reg_ports.begin(); it != self->reg_ports.end(); it++)
    {
        if (it->ipv6)
            continue;
        list[i] = {
            .map = {
                .prog = it->prog,
                .vers = it->vers,
                .prot = it->udp ? IPPROTO_UDP : IPPROTO_TCP,
                .port = it->port,
            },
            .next = list+i+1,
        };
        i++;
    }
    // Terminate the list, guarding the empty case - the original wrote
    // list[-1] when no IPv4 endpoints were registered
    if (i > 0)
        list[i-1].next = NULL;
    // Send reply (an empty registration set is a NULL list)
    reply->list = i > 0 ? list : NULL;
    rpc_queue_reply(rop);
    return 0;
}
|
||||
|
||||
/*
|
||||
* v3 GETADDR.
|
||||
* This is the lookup function for portmapper version 3.
|
||||
*/
|
||||
/*
 * v3 GETADDR.
 * This is the lookup function for portmapper version 3.
 * The transport is identified by a netid string ("tcp", "udp", "tcp6",
 * "udp6"); on success the registered universal address is returned,
 * otherwise an empty string.
 */
static int pmap3_getaddr_proc(portmap_service_t *self, rpc_op_t *rop)
{
    PMAP3GETADDRargs *args = (PMAP3GETADDRargs *)rop->request;
    PMAP3GETADDRres *reply = (PMAP3GETADDRres *)rop->reply;
    portmap_id_t ref = (portmap_id_t){
        .prog = args->prog,
        .vers = args->vers,
        .udp = args->netid == "udp" || args->netid == "udp6",
        .ipv6 = args->netid == "tcp6" || args->netid == "udp6",
    };
    auto it = self->reg_ports.lower_bound(ref);
    if (it != self->reg_ports.end() &&
        it->prog == ref.prog && it->vers == ref.vers &&
        it->udp == ref.udp && it->ipv6 == ref.ipv6)
    {
        // copy into XDR-owned memory so it stays valid until the reply is sent
        reply->addr = xdr_copy_string(rop->xdrs, it->addr);
    }
    else
    {
        // not registered: empty address
        reply->addr = {};
    }
    rpc_queue_reply(rop);
    return 0;
}
|
||||
|
||||
/*
|
||||
* v3 DUMP.
|
||||
* This RPC returns a list of all endpoints that are registered with
|
||||
* portmapper.
|
||||
*/
|
||||
static std::string netid_udp = "udp";
|
||||
static std::string netid_udp6 = "udp6";
|
||||
static std::string netid_tcp = "tcp";
|
||||
static std::string netid_tcp6 = "tcp6";
|
||||
/*
 * v3 DUMP.
 * This RPC returns a list of all endpoints (both IPv4 and IPv6) that are
 * registered with portmapper.
 */
static int pmap3_dump_proc(portmap_service_t *self, rpc_op_t *rop)
{
    PMAP3DUMPres *reply = (PMAP3DUMPres *)rop->reply;
    // sizeof the ELEMENT, not the pointer: the original allocated
    // sizeof(pmap3_mapping_list*) per entry, overflowing the heap buffer as
    // soon as the struct is larger than a pointer (it is)
    pmap3_mapping_list *list = (pmap3_mapping_list*)malloc_or_die(sizeof(pmap3_mapping_list) * self->reg_ports.size());
    xdr_add_malloc(rop->xdrs, list);
    int i = 0;
    for (auto it = self->reg_ports.begin(); it != self->reg_ports.end(); it++)
    {
        list[i] = (pmap3_mapping_list){
            .map = (pmap3_mapping){
                .prog = it->prog,
                .vers = it->vers,
                .netid = xdr_copy_string(rop->xdrs, it->ipv6
                    ? (it->udp ? netid_udp6 : netid_tcp6)
                    : (it->udp ? netid_udp : netid_tcp)),
                .addr = xdr_copy_string(rop->xdrs, it->addr), // 0.0.0.0.port
                .owner = xdr_copy_string(rop->xdrs, it->owner),
            },
            .next = list+i+1,
        };
        i++;
    }
    // Terminate the list, guarding the empty case (the original wrote list[-1])
    if (i > 0)
        list[i-1].next = NULL;
    reply->list = i > 0 ? list : NULL;
    rpc_queue_reply(rop);
    return 0;
}
|
||||
|
||||
// Populate the RPC dispatch table with handlers for portmap v2
// (NULL/GETPORT/DUMP) and v3 (NULL/GETADDR/DUMP). Each row:
// program, version, procedure, handler, request decoder+size,
// reply encoder+size, opaque self pointer.
portmap_service_t::portmap_service_t()
{
    struct rpc_service_proc_t pt[] = {
        {PMAP_PROGRAM, PMAP_V2, PMAP2_NULL, (rpc_handler_t)pmap2_null_proc, NULL, 0, NULL, 0, this},
        {PMAP_PROGRAM, PMAP_V2, PMAP2_GETPORT, (rpc_handler_t)pmap2_getport_proc, (xdrproc_t)xdr_PMAP2GETPORTargs, sizeof(PMAP2GETPORTargs), (xdrproc_t)xdr_u_int, sizeof(u_int), this},
        {PMAP_PROGRAM, PMAP_V2, PMAP2_DUMP, (rpc_handler_t)pmap2_dump_proc, NULL, 0, (xdrproc_t)xdr_PMAP2DUMPres, sizeof(PMAP2DUMPres), this},
        {PMAP_PROGRAM, PMAP_V3, PMAP3_NULL, (rpc_handler_t)pmap2_null_proc, NULL, 0, NULL, 0, this},
        {PMAP_PROGRAM, PMAP_V3, PMAP3_GETADDR, (rpc_handler_t)pmap3_getaddr_proc, (xdrproc_t)xdr_PMAP3GETADDRargs, sizeof(PMAP3GETADDRargs), (xdrproc_t)xdr_string, sizeof(xdr_string_t), this},
        {PMAP_PROGRAM, PMAP_V3, PMAP3_DUMP, (rpc_handler_t)pmap3_dump_proc, NULL, 0, (xdrproc_t)xdr_PMAP3DUMPres, sizeof(PMAP3DUMPres), this},
    };
    for (int i = 0; i < sizeof(pt)/sizeof(pt[0]); i++)
    {
        proc_table.push_back(pt[i]);
    }
}
|
||||
|
||||
// Compute the raw (binary, 32-byte) SHA-256 digest of <str>.
std::string sha256(const std::string & str)
{
    std::string hash;
    hash.resize(32);
    SHA256_CTX ctx;
    sha256_init(&ctx);
    sha256_update(&ctx, (uint8_t*)str.data(), str.size());
    sha256_final(&ctx, (uint8_t*)hash.data());
    return hash;
}
|
@@ -1,39 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// Portmap service for NFS proxy
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
#include "nfs/rpc_impl.h"
|
||||
|
||||
// One portmap registration. Set ordering (see operator< below) uses only
// (prog, vers, udp, ipv6); port/owner/addr are payload.
struct portmap_id_t
{
    unsigned prog, vers;  // RPC program number and version
    bool udp;             // true = UDP, false = TCP
    bool ipv6;
    unsigned port;
    std::string owner;
    std::string addr;     // universal address, e.g. "0.0.0.0.0.111" (see nfs_proxy registration)
};
|
||||
|
||||
// In-process portmapper (rpcbind) service: holds the set of registered
// endpoints and the RPC procedure table serving v2 and v3 requests.
class portmap_service_t
{
public:
    // registrations, ordered by (prog, vers, udp, ipv6)
    std::set<portmap_id_t> reg_ports;
    // dispatch table filled by the constructor
    std::vector<rpc_service_proc_t> proc_table;
    portmap_service_t();
};
|
||||
|
||||
inline bool operator < (const portmap_id_t &a, const portmap_id_t &b)
|
||||
{
|
||||
return a.prog < b.prog || a.prog == b.prog && a.vers < b.vers ||
|
||||
a.prog == b.prog && a.vers == b.vers && a.udp < b.udp ||
|
||||
a.prog == b.prog && a.vers == b.vers && a.udp == b.udp && a.ipv6 < b.ipv6;
|
||||
}
|
||||
|
||||
std::string sha256(const std::string & str);
|
@@ -1,982 +0,0 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// Simplified NFS proxy
|
||||
// Presents all images as files
|
||||
// Keeps image/file list in memory and is thus unsuitable for a large number of files
|
||||
|
||||
#define _XOPEN_SOURCE
|
||||
#include <limits.h>
|
||||
|
||||
#include <netinet/tcp.h>
|
||||
#include <sys/epoll.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
//#include <signal.h>
|
||||
|
||||
#include "nfs/nfs.h"
|
||||
#include "nfs/rpc.h"
|
||||
#include "nfs/portmap.h"
|
||||
|
||||
#include "addr_util.h"
|
||||
#include "base64.h"
|
||||
#include "nfs_proxy.h"
|
||||
#include "http_client.h"
|
||||
#include "cli.h"
|
||||
|
||||
#define ETCD_INODE_STATS_WATCH_ID 101
|
||||
#define ETCD_POOL_STATS_WATCH_ID 102
|
||||
|
||||
const char *exe_name = NULL;
|
||||
|
||||
// Tear down owned subsystems in reverse dependency order.
// `delete` on a null pointer is a no-op, so the original `if (p) delete p;`
// guards were redundant.
nfs_proxy_t::~nfs_proxy_t()
{
    delete cmd;
    delete cli;
    delete epmgr;
    delete ringloop;
}
|
||||
|
||||
// Parse command-line arguments into a flat JSON object.
// "--opt value" becomes {"opt": "value"}; "--json" and a trailing "--opt"
// with no value become "1". "-h"/"--help" prints usage and exits.
json11::Json::object nfs_proxy_t::parse_args(int narg, const char *args[])
{
    json11::Json::object cfg;
    for (int i = 1; i < narg; i++)
    {
        if (!strcmp(args[i], "-h") || !strcmp(args[i], "--help"))
        {
            printf(
                "Vitastor NFS 3.0 proxy\n"
                "(c) Vitaliy Filippov, 2021-2022 (VNPL-1.1)\n"
                "\n"
                "USAGE:\n"
                "  %s [--etcd_address ADDR] [OTHER OPTIONS]\n"
                "  --subdir <DIR> export images prefixed <DIR>/ (default empty - export all images)\n"
                "  --portmap 0 do not listen on port 111 (portmap/rpcbind, requires root)\n"
                "  --bind <IP> bind service to <IP> address (default 0.0.0.0)\n"
                "  --nfspath <PATH> set NFS export path to <PATH> (default is /)\n"
                "  --port <PORT> use port <PORT> for NFS services (default is 2049)\n"
                "  --pool <POOL> use <POOL> as default pool for new files (images)\n"
                "  --foreground 1 stay in foreground, do not daemonize\n"
                "\n"
                "NFS proxy is stateless if you use immediate_commit=all in your cluster, so\n"
                "you can freely use multiple NFS proxies with L3 load balancing in this case.\n"
                "\n"
                "Example start and mount commands for a custom NFS port:\n"
                "  %s --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool\n"
                "  mount localhost:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp\n",
                exe_name, exe_name
            );
            exit(0);
        }
        else if (args[i][0] == '-' && args[i][1] == '-')
        {
            const char *opt = args[i]+2;
            // "--json" is a boolean flag; any other option at the end of the
            // argument list also defaults to "1"
            cfg[opt] = !strcmp(opt, "json") || i == narg-1 ? "1" : args[++i];
        }
    }
    return cfg;
}
|
||||
|
||||
// Main entry point: parse config, connect to the cluster, register RPC
// services, open listening sockets and run the event loop forever.
void nfs_proxy_t::run(json11::Json cfg)
{
    // Random server identifier (used as NFS server id); seeded from the clock
    timespec tv;
    clock_gettime(CLOCK_REALTIME, &tv);
    srand48(tv.tv_sec*1000000000 + tv.tv_nsec);
    server_id = (uint64_t)lrand48() | ((uint64_t)lrand48() << 31) | ((uint64_t)lrand48() << 62);
    // Parse options
    bind_address = cfg["bind"].string_value();
    if (bind_address == "")
        bind_address = "0.0.0.0";
    default_pool = cfg["pool"].as_string();
    // portmap defaults to enabled; "0"/"false"-ish values disable it
    portmap_enabled = cfg.object_items().find("portmap") == cfg.object_items().end() ||
        cfg["portmap"].uint64_value() ||
        cfg["portmap"].string_value() == "yes" ||
        cfg["portmap"].string_value() == "true";
    nfs_port = cfg["port"].uint64_value() & 0xffff;
    if (!nfs_port)
        nfs_port = 2049;
    export_root = cfg["nfspath"].string_value();
    if (!export_root.size())
        export_root = "/";
    // Normalize the subdir prefix: strip leading/trailing slashes, then
    // append exactly one trailing slash if non-empty
    name_prefix = cfg["subdir"].string_value();
    {
        int e = name_prefix.size();
        while (e > 0 && name_prefix[e-1] == '/')
            e--;
        int s = 0;
        while (s < e && name_prefix[s] == '/')
            s++;
        name_prefix = name_prefix.substr(s, e-s);
        if (name_prefix.size())
            name_prefix += "/";
    }
    // Create client
    ringloop = new ring_loop_t(512);
    epmgr = new epoll_manager_t(ringloop);
    cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
    cmd = new cli_tool_t();
    cmd->ringloop = ringloop;
    cmd->epmgr = epmgr;
    cmd->cli = cli;
    // We need inode name hashes for NFS handles to remain stateless and <= 64 bytes long
    // "" is the export root directory, id 1
    dir_info[""] = (nfs_dir_t){
        .id = 1,
        .mod_rev = 0,
    };
    clock_gettime(CLOCK_REALTIME, &dir_info[""].mtime);
    watch_stats();
    assert(cli->st_cli.on_inode_change_hook == NULL);
    // Keep the directory tree and hash<->inode maps in sync with etcd
    // inode configuration changes
    cli->st_cli.on_inode_change_hook = [this](inode_t changed_inode, bool removed)
    {
        auto inode_cfg_it = cli->st_cli.inode_config.find(changed_inode);
        if (inode_cfg_it == cli->st_cli.inode_config.end())
        {
            return;
        }
        auto & inode_cfg = inode_cfg_it->second;
        std::string full_name = inode_cfg.name;
        // ignore images outside the exported subdir
        if (name_prefix != "" && full_name.substr(0, name_prefix.size()) != name_prefix)
        {
            return;
        }
        // Calculate directory modification time and revision (used as "cookie verifier")
        timespec now;
        clock_gettime(CLOCK_REALTIME, &now);
        dir_info[""].mod_rev = dir_info[""].mod_rev < inode_cfg.mod_revision ? inode_cfg.mod_revision : dir_info[""].mod_rev;
        dir_info[""].mtime = now;
        // Create/update every intermediate directory along the image path
        int pos = full_name.find('/', name_prefix.size());
        while (pos >= 0)
        {
            std::string dir = full_name.substr(0, pos);
            auto & dinf = dir_info[dir];
            if (!dinf.id)
                dinf.id = next_dir_id++;
            dinf.mod_rev = dinf.mod_rev < inode_cfg.mod_revision ? inode_cfg.mod_revision : dinf.mod_rev;
            dinf.mtime = now;
            dir_by_hash["S"+base64_encode(sha256(dir))] = dir;
            pos = full_name.find('/', pos+1);
        }
        // Alter inode_by_hash
        if (removed)
        {
            auto ino_it = hash_by_inode.find(changed_inode);
            if (ino_it != hash_by_inode.end())
            {
                inode_by_hash.erase(ino_it->second);
                hash_by_inode.erase(ino_it);
            }
        }
        else
        {
            std::string hash = "S"+base64_encode(sha256(full_name));
            auto hbi_it = hash_by_inode.find(changed_inode);
            if (hbi_it != hash_by_inode.end() && hbi_it->second != hash)
            {
                // inode had a different name, remove old hash=>inode pointer
                inode_by_hash.erase(hbi_it->second);
            }
            inode_by_hash[hash] = changed_inode;
            hash_by_inode[changed_inode] = hash;
        }
    };
    // Load image metadata
    while (!cli->is_ready())
    {
        ringloop->loop();
        if (cli->is_ready())
            break;
        ringloop->wait();
    }
    // Check default pool
    check_default_pool();
    // Self-register portmap and NFS
    pmap.reg_ports.insert((portmap_id_t){
        .prog = PMAP_PROGRAM,
        .vers = PMAP_V2,
        .port = portmap_enabled ? 111 : nfs_port,
        .owner = "portmapper-service",
        .addr = portmap_enabled ? "0.0.0.0.0.111" : ("0.0.0.0.0."+std::to_string(nfs_port)),
    });
    pmap.reg_ports.insert((portmap_id_t){
        .prog = PMAP_PROGRAM,
        .vers = PMAP_V3,
        .port = portmap_enabled ? 111 : nfs_port,
        .owner = "portmapper-service",
        .addr = portmap_enabled ? "0.0.0.0.0.111" : ("0.0.0.0.0."+std::to_string(nfs_port)),
    });
    pmap.reg_ports.insert((portmap_id_t){
        .prog = NFS_PROGRAM,
        .vers = NFS_V3,
        .port = nfs_port,
        .owner = "nfs-server",
        .addr = "0.0.0.0.0."+std::to_string(nfs_port),
    });
    pmap.reg_ports.insert((portmap_id_t){
        .prog = MOUNT_PROGRAM,
        .vers = MOUNT_V3,
        .port = nfs_port,
        .owner = "rpc.mountd",
        .addr = "0.0.0.0.0."+std::to_string(nfs_port),
    });
    // Create NFS socket and add it to epoll
    int nfs_socket = create_and_bind_socket(bind_address, nfs_port, 128, NULL);
    fcntl(nfs_socket, F_SETFL, fcntl(nfs_socket, F_GETFL, 0) | O_NONBLOCK);
    epmgr->tfd->set_fd_handler(nfs_socket, false, [this](int nfs_socket, int epoll_events)
    {
        if (epoll_events & EPOLLRDHUP)
        {
            // NOTE(review): message says "portmap" but this is the NFS
            // listening socket - looks like a copy-paste of the text below
            fprintf(stderr, "Listening portmap socket disconnected, exiting\n");
            exit(1);
        }
        else
        {
            do_accept(nfs_socket);
        }
    });
    if (portmap_enabled)
    {
        // Create portmap socket and add it to epoll
        int portmap_socket = create_and_bind_socket(bind_address, 111, 128, NULL);
        fcntl(portmap_socket, F_SETFL, fcntl(portmap_socket, F_GETFL, 0) | O_NONBLOCK);
        epmgr->tfd->set_fd_handler(portmap_socket, false, [this](int portmap_socket, int epoll_events)
        {
            if (epoll_events & EPOLLRDHUP)
            {
                fprintf(stderr, "Listening portmap socket disconnected, exiting\n");
                exit(1);
            }
            else
            {
                do_accept(portmap_socket);
            }
        });
    }
    if (cfg["foreground"].is_null())
    {
        daemonize();
    }
    // Main event loop
    while (true)
    {
        ringloop->loop();
        ringloop->wait();
    }
    /*// Sync at the end
    cluster_op_t *close_sync = new cluster_op_t;
    close_sync->opcode = OSD_OP_SYNC;
    close_sync->callback = [&stop](cluster_op_t *op)
    {
        stop = true;
        delete op;
    };
    cli->execute(close_sync);*/
    // Destroy the client
    // NOTE(review): the while(true) loop above never exits, so this teardown
    // is unreachable dead code
    delete cli;
    delete epmgr;
    delete ringloop;
    cli = NULL;
    epmgr = NULL;
    ringloop = NULL;
}
|
||||
|
||||
// Subscribe to inode and pool statistics in etcd and keep the local
// inode_stats / pool_stats caches up to date.
// Installs an on_start_watcher_hook (asserted to be the only one) that, on
// every (re)connect of the etcd watch websocket, creates the two watches and
// then does a one-shot range read to load the current values.
void nfs_proxy_t::watch_stats()
{
    assert(cli->st_cli.on_start_watcher_hook == NULL);
    cli->st_cli.on_start_watcher_hook = [this](http_co_t *etcd_watch_ws)
    {
        // Watch inode stats under <prefix>/inode/stats/
        http_post_message(etcd_watch_ws, WS_TEXT, json11::Json(json11::Json::object {
            { "create_request", json11::Json::object {
                { "key", base64_encode(cli->st_cli.etcd_prefix+"/inode/stats/") },
                { "range_end", base64_encode(cli->st_cli.etcd_prefix+"/inode/stats0") },
                { "start_revision", cli->st_cli.etcd_watch_revision },
                { "watch_id", ETCD_INODE_STATS_WATCH_ID },
                { "progress_notify", true },
            } }
        }).dump());
        // Watch pool stats under <prefix>/pool/stats/
        http_post_message(etcd_watch_ws, WS_TEXT, json11::Json(json11::Json::object {
            { "create_request", json11::Json::object {
                { "key", base64_encode(cli->st_cli.etcd_prefix+"/pool/stats/") },
                { "range_end", base64_encode(cli->st_cli.etcd_prefix+"/pool/stats0") },
                { "start_revision", cli->st_cli.etcd_watch_revision },
                { "watch_id", ETCD_POOL_STATS_WATCH_ID },
                { "progress_notify", true },
            } }
        }).dump());
        // Initial load of both ranges in a single transaction
        cli->st_cli.etcd_txn_slow(json11::Json::object {
            { "success", json11::Json::array {
                json11::Json::object {
                    { "request_range", json11::Json::object {
                        { "key", base64_encode(cli->st_cli.etcd_prefix+"/inode/stats/") },
                        { "range_end", base64_encode(cli->st_cli.etcd_prefix+"/inode/stats0") },
                    } }
                },
                json11::Json::object {
                    { "request_range", json11::Json::object {
                        { "key", base64_encode(cli->st_cli.etcd_prefix+"/pool/stats/") },
                        { "range_end", base64_encode(cli->st_cli.etcd_prefix+"/pool/stats0") },
                    } }
                },
            } },
        }, [this](std::string err, json11::Json res)
        {
            // NOTE(review): err is ignored here - presumably etcd_txn_slow
            // already retries/logs failures; confirm against its contract
            for (auto & rsp: res["responses"].array_items())
            {
                for (auto & item: rsp["response_range"]["kvs"].array_items())
                {
                    etcd_kv_t kv = cli->st_cli.parse_etcd_kv(item);
                    parse_stats(kv);
                }
            }
        });
    };
    // NOTE(review): old_hook is captured but never invoked - verify no
    // previously installed on_change_hook needs to be chained here
    cli->st_cli.on_change_hook = [this, old_hook = cli->st_cli.on_change_hook](std::map<std::string, etcd_kv_t> & changes)
    {
        for (auto & p: changes)
        {
            parse_stats(p.second);
        }
    };
}
|
||||
|
||||
// Update the local statistics caches from one etcd key-value pair.
// Recognized keys: "<prefix>/inode/stats/<pool>/<inode>" and
// "<prefix>/pool/stats/<pool>". Malformed keys are logged and ignored.
void nfs_proxy_t::parse_stats(etcd_kv_t & kv)
{
    auto & key = kv.key;
    if (key.substr(0, cli->st_cli.etcd_prefix.length()+13) == cli->st_cli.etcd_prefix+"/inode/stats/")
    {
        pool_id_t pool_id = 0;
        inode_t inode_num = 0;
        char null_byte = 0;
        // null_byte stays 0 only if the string ends right after the numbers
        sscanf(key.c_str() + cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode_num, &null_byte);
        if (!pool_id || pool_id >= POOL_ID_MAX || !inode_num || null_byte != 0)
        {
            fprintf(stderr, "Bad etcd key %s, ignoring\n", key.c_str());
        }
        else
        {
            inode_stats[INODE_WITH_POOL(pool_id, inode_num)] = kv.value;
        }
    }
    else if (key.substr(0, cli->st_cli.etcd_prefix.length()+12) == cli->st_cli.etcd_prefix+"/pool/stats/")
    {
        pool_id_t pool_id = 0;
        char null_byte = 0;
        sscanf(key.c_str() + cli->st_cli.etcd_prefix.length()+12, "%u%c", &pool_id, &null_byte);
        // FIX: also reject keys with trailing garbage after the pool number
        // (null_byte != 0), consistent with the inode stats branch above;
        // previously "/pool/stats/1abc" was accepted as pool 1
        if (!pool_id || pool_id >= POOL_ID_MAX || null_byte != 0)
        {
            fprintf(stderr, "Bad etcd key %s, ignoring\n", key.c_str());
        }
        else
        {
            pool_stats[pool_id] = kv.value;
        }
    }
}
|
||||
|
||||
void nfs_proxy_t::check_default_pool()
|
||||
{
|
||||
if (default_pool == "")
|
||||
{
|
||||
if (cli->st_cli.pool_config.size() == 1)
|
||||
{
|
||||
default_pool = cli->st_cli.pool_config.begin()->second.name;
|
||||
default_pool_id = cli->st_cli.pool_config.begin()->first;
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "There are %lu pools. Please select default pool with --pool option\n", cli->st_cli.pool_config.size());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (auto & p: cli->st_cli.pool_config)
|
||||
{
|
||||
if (p.second.name == default_pool)
|
||||
{
|
||||
default_pool_id = p.first;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!default_pool_id)
|
||||
{
|
||||
fprintf(stderr, "Pool %s is not found\n", default_pool.c_str());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void nfs_proxy_t::do_accept(int listen_fd)
|
||||
{
|
||||
struct sockaddr_storage addr;
|
||||
socklen_t addr_size = sizeof(addr);
|
||||
int nfs_fd = 0;
|
||||
while ((nfs_fd = accept(listen_fd, (struct sockaddr *)&addr, &addr_size)) >= 0)
|
||||
{
|
||||
fprintf(stderr, "New client %d: connection from %s\n", nfs_fd, addr_to_string(addr).c_str());
|
||||
fcntl(nfs_fd, F_SETFL, fcntl(nfs_fd, F_GETFL, 0) | O_NONBLOCK);
|
||||
int one = 1;
|
||||
setsockopt(nfs_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
||||
auto cli = new nfs_client_t();
|
||||
cli->parent = this;
|
||||
cli->nfs_fd = nfs_fd;
|
||||
for (auto & fn: pmap.proc_table)
|
||||
{
|
||||
cli->proc_table.insert(fn);
|
||||
}
|
||||
epmgr->tfd->set_fd_handler(nfs_fd, true, [cli](int nfs_fd, int epoll_events)
|
||||
{
|
||||
// Handle incoming event
|
||||
if (epoll_events & EPOLLRDHUP)
|
||||
{
|
||||
fprintf(stderr, "Client %d disconnected\n", nfs_fd);
|
||||
cli->stop();
|
||||
return;
|
||||
}
|
||||
cli->epoll_events |= epoll_events;
|
||||
if (epoll_events & EPOLLIN)
|
||||
{
|
||||
// Something is available for reading
|
||||
cli->submit_read(0);
|
||||
}
|
||||
if (epoll_events & EPOLLOUT)
|
||||
{
|
||||
cli->submit_send();
|
||||
}
|
||||
});
|
||||
}
|
||||
if (nfs_fd < 0 && errno != EAGAIN)
|
||||
{
|
||||
fprintf(stderr, "Failed to accept connection: %s\n", strerror(errno));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME Move these functions to "rpc_context"
|
||||
void nfs_client_t::select_read_buffer(unsigned wanted_size)
|
||||
{
|
||||
if (free_buffers.size())
|
||||
{
|
||||
auto & b = free_buffers.back();
|
||||
if (b.size < wanted_size)
|
||||
{
|
||||
cur_buffer = {
|
||||
.buf = (uint8_t*)malloc_or_die(wanted_size),
|
||||
.size = wanted_size,
|
||||
};
|
||||
}
|
||||
else
|
||||
{
|
||||
cur_buffer = {
|
||||
.buf = b.buf,
|
||||
.size = b.size,
|
||||
};
|
||||
}
|
||||
free_buffers.pop_back();
|
||||
}
|
||||
else
|
||||
{
|
||||
unsigned sz = RPC_INIT_BUF_SIZE;
|
||||
if (sz < wanted_size)
|
||||
{
|
||||
sz = wanted_size;
|
||||
}
|
||||
cur_buffer = {
|
||||
.buf = (uint8_t*)malloc_or_die(sz),
|
||||
.size = sz,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// Submit a recvmsg into cur_buffer via io_uring.
// wanted_size == 0 means "fill whatever space is left in the buffer";
// a nonzero value is the number of missing bytes of a partially received
// RPC record (the caller guarantees the current buffer has room for them).
void nfs_client_t::submit_read(unsigned wanted_size)
{
    if (read_msg.msg_iovlen)
    {
        // A read is already in flight
        return;
    }
    io_uring_sqe* sqe = parent->ringloop->get_sqe();
    if (!sqe)
    {
        // No free SQE - wake the ring loop so we get called again later
        read_msg.msg_iovlen = 0;
        parent->ringloop->wakeup();
        return;
    }
    if (!cur_buffer.buf || cur_buffer.size <= cur_buffer.read_pos)
    {
        // No buffer yet, or the current one is full: retire it and pick a new one.
        // wanted_size must be 0 here - partial-message reads always have room.
        assert(!wanted_size);
        if (cur_buffer.buf)
        {
            if (cur_buffer.refs > 0)
            {
                // Still referenced by queued replies - park it in used_buffers
                // until handle_send() drops the last reference
                used_buffers[cur_buffer.buf] = (rpc_used_buffer_t){
                    .size = cur_buffer.size,
                    .refs = cur_buffer.refs,
                };
            }
            else
            {
                // Unreferenced - return it to the reuse pool
                free_buffers.push_back((rpc_free_buffer_t){
                    .buf = cur_buffer.buf,
                    .size = cur_buffer.size,
                });
            }
        }
        select_read_buffer(wanted_size);
    }
    assert(wanted_size <= cur_buffer.size-cur_buffer.read_pos);
    read_iov = {
        .iov_base = cur_buffer.buf+cur_buffer.read_pos,
        .iov_len = wanted_size ? wanted_size : cur_buffer.size-cur_buffer.read_pos,
    };
    read_msg.msg_iov = &read_iov;
    read_msg.msg_iovlen = 1;
    ring_data_t* data = ((ring_data_t*)sqe->user_data);
    data->callback = [this](ring_data_t *data) { handle_read(data->res); };
    my_uring_prep_recvmsg(sqe, nfs_fd, &read_msg, 0);
    // Keep the client alive while the operation is in flight (see deref())
    refs++;
}
|
||||
|
||||
// Completion handler for the recvmsg submitted by submit_read().
// result is the io_uring result: bytes received, 0 on EOF, or -errno.
// Parses as many complete RPC records (RFC 5531 record marking: 4-byte
// big-endian length per fragment, high bit set on the last fragment) as the
// buffer contains, dispatching each via handle_rpc_message().
void nfs_client_t::handle_read(int result)
{
    read_msg.msg_iovlen = 0;
    if (deref())
        return;
    // EOF (0) or a fatal error terminates the connection; EAGAIN/EINTR fall
    // through and are retried by the next epoll notification
    if (result <= 0 && result != -EAGAIN && result != -EINTR)
    {
        printf("Failed read from client %d: %d (%s)\n", nfs_fd, result, strerror(-result));
        stop();
        return;
    }
    if (result > 0)
    {
        cur_buffer.read_pos += result;
        assert(cur_buffer.read_pos <= cur_buffer.size);
        // Try to parse incoming RPC messages
        uint8_t *data = cur_buffer.buf + cur_buffer.parsed_pos;
        unsigned left = cur_buffer.read_pos - cur_buffer.parsed_pos;
        while (left > 0)
        {
            // Assemble all fragments: walk fragment headers until the last
            // fragment (high bit set) or until we run out of received bytes.
            // After the loop, "wanted" is the total byte count of the record
            // including all 4-byte fragment headers.
            unsigned fragments = 0;
            uint32_t wanted = 0;
            while (1)
            {
                fragments++;
                wanted += 4;
                if (left < wanted)
                {
                    break;
                }
                // FIXME: Limit message size
                uint32_t frag_size = be32toh(*(uint32_t*)(data + wanted - 4));
                wanted += (frag_size & 0x7FFFFFFF);
                if (left < wanted || (frag_size & 0x80000000))
                {
                    break;
                }
            }
            if (left >= wanted)
            {
                if (fragments > 1)
                {
                    // Merge fragments. Fragmented messages are probably not that common,
                    // so it's probably fine to do an additional memory copy
                    unsigned frag_offset = 8+be32toh(*(uint32_t*)(data));
                    unsigned dest_offset = 4+be32toh(*(uint32_t*)(data));
                    unsigned frag_num = 1;
                    while (frag_num < fragments)
                    {
                        uint32_t frag_size = be32toh(*(uint32_t*)(data + frag_offset - 4)) & 0x7FFFFFFF;
                        memmove(data + dest_offset, data + frag_offset, frag_size);
                        frag_offset += 4+frag_size;
                        dest_offset += frag_size;
                        frag_num++;
                    }
                }
                // Handle full message (payload only, headers stripped)
                int referenced = handle_rpc_message(cur_buffer.buf, data+4, wanted-4*fragments);
                cur_buffer.refs += referenced ? 1 : 0;
                // NOTE(review): for multi-fragment records parsed_pos advances
                // by 4+payload while data advances by the full "wanted" -
                // verify the two stay consistent for fragmented messages
                cur_buffer.parsed_pos += 4+wanted-4*fragments;
                data += wanted;
                left -= wanted;
            }
            else if (cur_buffer.size >= (data - cur_buffer.buf + wanted))
            {
                // Read the tail and come back
                submit_read(wanted-left);
                break;
            }
            else
            {
                // No place to put the whole tail
                if (cur_buffer.refs > 0)
                {
                    // Buffer is still referenced by queued replies - park it
                    // and copy the partial record into a fresh buffer
                    used_buffers[cur_buffer.buf] = (rpc_used_buffer_t){
                        .size = cur_buffer.size,
                        .refs = cur_buffer.refs,
                    };
                    select_read_buffer(wanted);
                    memcpy(cur_buffer.buf, data, left);
                }
                else if (cur_buffer.size < wanted)
                {
                    // Unreferenced but too small - replace it entirely
                    uint8_t *old_buf = cur_buffer.buf;
                    select_read_buffer(wanted);
                    memcpy(cur_buffer.buf, data, left);
                    free(old_buf);
                }
                else
                {
                    // Large enough, the tail just starts too far in - compact in place
                    memmove(cur_buffer.buf, data, left);
                }
                cur_buffer.read_pos = left;
                cur_buffer.parsed_pos = 0;
                // Restart from the beginning
                submit_read(wanted-left);
                break;
            }
        }
    }
}
|
||||
|
||||
void nfs_client_t::submit_send()
|
||||
{
|
||||
if (write_msg.msg_iovlen || !send_list.size())
|
||||
{
|
||||
return;
|
||||
}
|
||||
io_uring_sqe* sqe = parent->ringloop->get_sqe();
|
||||
if (!sqe)
|
||||
{
|
||||
write_msg.msg_iovlen = 0;
|
||||
parent->ringloop->wakeup();
|
||||
return;
|
||||
}
|
||||
write_msg.msg_iov = send_list.data();
|
||||
write_msg.msg_iovlen = send_list.size() < IOV_MAX ? send_list.size() : IOV_MAX;
|
||||
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||
data->callback = [this](ring_data_t *data) { handle_send(data->res); };
|
||||
my_uring_prep_sendmsg(sqe, nfs_fd, &write_msg, 0);
|
||||
refs++;
|
||||
}
|
||||
|
||||
bool nfs_client_t::deref()
|
||||
{
|
||||
refs--;
|
||||
if (stopped && refs <= 0)
|
||||
{
|
||||
stop();
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void nfs_client_t::stop()
|
||||
{
|
||||
stopped = true;
|
||||
if (refs <= 0)
|
||||
{
|
||||
parent->epmgr->tfd->set_fd_handler(nfs_fd, true, NULL);
|
||||
close(nfs_fd);
|
||||
delete this;
|
||||
}
|
||||
}
|
||||
|
||||
// Completion handler for the sendmsg submitted by submit_send().
// result is bytes written or -errno. Consumes fully-sent iovecs from the
// front of send_list, releases the corresponding replies (and their buffer
// references), merges in replies queued meanwhile, then resubmits if more
// output remains.
void nfs_client_t::handle_send(int result)
{
    write_msg.msg_iovlen = 0;
    if (deref())
        return;
    if (result <= 0 && result != -EAGAIN && result != -EINTR)
    {
        printf("Failed send to client %d: %d (%s)\n", nfs_fd, result, strerror(-result));
        stop();
        return;
    }
    if (result > 0)
    {
        // Walk send_list, subtracting sent bytes iovec by iovec
        int done = 0;
        while (result > 0 && done < send_list.size())
        {
            iovec & iov = send_list[done];
            if (iov.iov_len <= result)
            {
                // outbox[done] is non-NULL only for the last iovec of a reply
                // (see rpc_queue_reply), so each reply is released exactly once
                auto rop = outbox[done];
                if (rop)
                {
                    // Reply fully sent
                    xdr_reset(rop->xdrs);
                    parent->xdr_pool.push_back(rop->xdrs);
                    if (rop->buffer && rop->referenced)
                    {
                        // Dereference the buffer
                        if (rop->buffer == cur_buffer.buf)
                        {
                            cur_buffer.refs--;
                        }
                        else
                        {
                            auto & ub = used_buffers.at(rop->buffer);
                            assert(ub.refs > 0);
                            ub.refs--;
                            if (ub.refs == 0)
                            {
                                // FIXME Maybe put free_buffers into parent
                                free_buffers.push_back((rpc_free_buffer_t){
                                    .buf = rop->buffer,
                                    .size = ub.size,
                                });
                                used_buffers.erase(rop->buffer);
                            }
                        }
                    }
                    free(rop);
                }
                result -= iov.iov_len;
                done++;
            }
            else
            {
                // Partially sent iovec: shrink it in place and stop
                iov.iov_len -= result;
                iov.iov_base = (uint8_t*)iov.iov_base + result;
                break;
            }
        }
        if (done > 0)
        {
            send_list.erase(send_list.begin(), send_list.begin()+done);
            outbox.erase(outbox.begin(), outbox.begin()+done);
        }
        // Append replies that were queued while the sendmsg was in flight
        if (next_send_list.size())
        {
            send_list.insert(send_list.end(), next_send_list.begin(), next_send_list.end());
            outbox.insert(outbox.end(), next_outbox.begin(), next_outbox.end());
            next_send_list.clear();
            next_outbox.clear();
        }
        if (outbox.size() > 0)
        {
            submit_send();
        }
    }
}
|
||||
|
||||
// Serialize a prepared reply into XDR iovecs, prepend the 4-byte RPC record
// marker (total length with the "last fragment" bit set, RFC 5531 record
// marking) and queue everything on the client's send list.
// Ownership of rop passes to the send path: handle_send() frees it after the
// last of its bytes has been transmitted.
void rpc_queue_reply(rpc_op_t *rop)
{
    nfs_client_t *self = (nfs_client_t*)rop->client;
    iovec *iov_list = NULL;
    unsigned iov_count = 0;
    int r = xdr_encode(rop->xdrs, (xdrproc_t)xdr_rpc_msg, &rop->out_msg);
    assert(r);
    // reply_fn == NULL means the reply has no body beyond the RPC header
    if (rop->reply_fn != NULL)
    {
        r = xdr_encode(rop->xdrs, rop->reply_fn, rop->reply);
        assert(r);
    }
    xdr_encode_finish(rop->xdrs, &iov_list, &iov_count);
    assert(iov_count > 0);
    // Record marker: sum of all iovec lengths, high bit = last fragment
    rop->reply_marker = 0;
    for (unsigned i = 0; i < iov_count; i++)
    {
        rop->reply_marker += iov_list[i].iov_len;
    }
    rop->reply_marker = htobe32(rop->reply_marker | 0x80000000);
    // While a sendmsg is in flight, send_list backs the kernel's iovecs and
    // must not be mutated - queue into next_* instead
    auto & to_send_list = self->write_msg.msg_iovlen ? self->next_send_list : self->send_list;
    auto & to_outbox = self->write_msg.msg_iovlen ? self->next_outbox : self->outbox;
    to_send_list.push_back((iovec){ .iov_base = &rop->reply_marker, .iov_len = 4 });
    to_outbox.push_back(NULL);
    for (unsigned i = 0; i < iov_count; i++)
    {
        to_send_list.push_back(iov_list[i]);
        to_outbox.push_back(NULL);
    }
    // Only the reply's LAST iovec carries the rop pointer, so handle_send()
    // releases the reply exactly once, after its final bytes go out
    to_outbox[to_outbox.size()-1] = rop;
    self->submit_send();
}
|
||||
|
||||
// Decode and dispatch one complete RPC record.
// base_buf is the receive buffer the record lives in (kept for buffer
// reference counting); msg_buf/msg_len delimit the record payload.
// Returns nonzero when the registered handler keeps pointers into base_buf
// (the caller then bumps the buffer's refcount), 0 when the buffer may be
// reused immediately.
int nfs_client_t::handle_rpc_message(void *base_buf, void *msg_buf, uint32_t msg_len)
{
    // Take an XDR object from the pool
    XDR *xdrs;
    if (parent->xdr_pool.size())
    {
        xdrs = parent->xdr_pool.back();
        parent->xdr_pool.pop_back();
    }
    else
    {
        xdrs = xdr_create();
    }
    // Decode the RPC header
    char inmsg_data[sizeof(rpc_msg)];
    rpc_msg *inmsg = (rpc_msg*)&inmsg_data;
    if (!xdr_decode(xdrs, msg_buf, msg_len, (xdrproc_t)xdr_rpc_msg, inmsg))
    {
        // Invalid message, ignore it
        xdr_reset(xdrs);
        parent->xdr_pool.push_back(xdrs);
        return 0;
    }
    if (inmsg->body.dir != RPC_CALL)
    {
        // Reply sent to the server? Strange thing. Also ignore it
        xdr_reset(xdrs);
        parent->xdr_pool.push_back(xdrs);
        return 0;
    }
    if (inmsg->body.cbody.rpcvers != RPC_MSG_VERSION)
    {
        // Bad RPC version - reply RPC_MSG_DENIED / RPC_MISMATCH
        rpc_op_t *rop = (rpc_op_t*)malloc_or_die(sizeof(rpc_op_t));
        u_int x = RPC_MSG_VERSION;
        *rop = (rpc_op_t){
            .client = this,
            .xdrs = xdrs,
            .out_msg = (rpc_msg){
                .xid = inmsg->xid,
                .body = (rpc_msg_body){
                    .dir = RPC_REPLY,
                    .rbody = (rpc_reply_body){
                        .stat = RPC_MSG_DENIED,
                        .rreply = (rpc_rejected_reply){
                            .stat = RPC_MISMATCH,
                            .mismatch_info = (rpc_mismatch_info){
                                // Without at least one reference to a non-constant value (local variable or something else),
                                // with gcc 8 we get "internal compiler error: side-effects element in no-side-effects CONSTRUCTOR" here
                                // FIXME: get rid of this after raising compiler requirement
                                .min_version = x,
                                .max_version = RPC_MSG_VERSION,
                            },
                        },
                    },
                },
            },
        };
        rpc_queue_reply(rop);
        // Incoming buffer isn't needed to handle request, so return 0
        return 0;
    }
    // Find decoder for the request
    auto proc_it = proc_table.find((rpc_service_proc_t){
        .prog = inmsg->body.cbody.prog,
        .vers = inmsg->body.cbody.vers,
        .proc = inmsg->body.cbody.proc,
    });
    if (proc_it == proc_table.end())
    {
        // Procedure not implemented.
        // Determine the supported version range of the requested program to
        // distinguish PROG_UNAVAIL / PROC_UNAVAIL / PROG_MISMATCH.
        uint32_t min_vers = 0, max_vers = 0;
        auto prog_it = proc_table.lower_bound((rpc_service_proc_t){
            .prog = inmsg->body.cbody.prog,
        });
        if (prog_it != proc_table.end())
        {
            min_vers = prog_it->vers;
            auto max_vers_it = proc_table.lower_bound((rpc_service_proc_t){
                .prog = inmsg->body.cbody.prog+1,
            });
            assert(max_vers_it != proc_table.begin());
            max_vers_it--;
            assert(max_vers_it->prog == inmsg->body.cbody.prog);
            max_vers = max_vers_it->vers;
        }
        rpc_op_t *rop = (rpc_op_t*)malloc_or_die(sizeof(rpc_op_t));
        *rop = (rpc_op_t){
            .client = this,
            .xdrs = xdrs,
            .out_msg = (rpc_msg){
                .xid = inmsg->xid,
                .body = (rpc_msg_body){
                    .dir = RPC_REPLY,
                    .rbody = (rpc_reply_body){
                        .stat = RPC_MSG_ACCEPTED,
                        .areply = (rpc_accepted_reply){
                            .reply_data = (rpc_accepted_reply_body){
                                .stat = (min_vers == 0
                                    ? RPC_PROG_UNAVAIL
                                    : (min_vers <= inmsg->body.cbody.vers &&
                                        max_vers >= inmsg->body.cbody.vers
                                        ? RPC_PROC_UNAVAIL
                                        : RPC_PROG_MISMATCH)),
                                .mismatch_info = (rpc_mismatch_info){ .min_version = min_vers, .max_version = max_vers },
                            },
                        },
                    },
                },
            },
        };
        rpc_queue_reply(rop);
        // Incoming buffer isn't needed to handle request, so return 0
        return 0;
    }
    // Allocate memory: the rpc_op_t, the decoded request and the reply live
    // in one allocation, freed together in handle_send()
    rpc_op_t *rop = (rpc_op_t*)malloc_or_die(
        sizeof(rpc_op_t) + proc_it->req_size + proc_it->resp_size
    );
    rpc_reply_stat x = RPC_MSG_ACCEPTED;
    *rop = (rpc_op_t){
        .client = this,
        .buffer = (uint8_t*)base_buf,
        .xdrs = xdrs,
        .out_msg = (rpc_msg){
            .xid = inmsg->xid,
            .body = (rpc_msg_body){
                .dir = RPC_REPLY,
                .rbody = (rpc_reply_body){
                    // Without at least one reference to a non-constant value (local variable or something else),
                    // with gcc 8 we get "internal compiler error: side-effects element in no-side-effects CONSTRUCTOR" here
                    // FIXME: get rid of this after raising compiler requirement
                    .stat = x,
                },
            },
        },
        .request = ((uint8_t*)rop) + sizeof(rpc_op_t),
        .reply = ((uint8_t*)rop) + sizeof(rpc_op_t) + proc_it->req_size,
    };
    memcpy(&rop->in_msg, inmsg, sizeof(rpc_msg));
    // Try to decode the request
    // req_fn may be NULL, that means function has no arguments
    if (proc_it->req_fn && !proc_it->req_fn(xdrs, rop->request))
    {
        // Invalid request
        rop->out_msg.body.rbody.areply.reply_data.stat = RPC_GARBAGE_ARGS;
        rpc_queue_reply(rop);
        // Incoming buffer isn't needed to handle request, so return 0
        return 0;
    }
    rop->out_msg.body.rbody.areply.reply_data.stat = RPC_SUCCESS;
    rop->reply_fn = proc_it->resp_fn;
    // The handler's return value tells whether it keeps referencing base_buf
    int ref = proc_it->handler_fn(proc_it->opaque, rop);
    rop->referenced = ref ? 1 : 0;
    return ref;
}
|
||||
|
||||
// Detach from the controlling terminal: classic double-fork, become a
// session leader, chdir to / and redirect the standard streams to /dev/null.
void nfs_proxy_t::daemonize()
{
    if (fork())
        exit(0);
    setsid();
    if (fork())
        exit(0);
    if (chdir("/") != 0)
        fprintf(stderr, "Warning: Failed to chdir into /\n");
    // Close fds 0-2 and immediately reopen /dev/null three times: open()
    // returns the lowest free descriptors, so these become the new
    // stdin (read-only), stdout and stderr (write-only)
    close(0);
    close(1);
    close(2);
    open("/dev/null", O_RDONLY);
    open("/dev/null", O_WRONLY);
    open("/dev/null", O_WRONLY);
}
|
||||
|
||||
int main(int narg, const char *args[])
|
||||
{
|
||||
setvbuf(stdout, NULL, _IONBF, 0);
|
||||
setvbuf(stderr, NULL, _IONBF, 0);
|
||||
exe_name = args[0];
|
||||
nfs_proxy_t *p = new nfs_proxy_t();
|
||||
p->run(nfs_proxy_t::parse_args(narg, args));
|
||||
delete p;
|
||||
return 0;
|
||||
}
|
124
src/nfs_proxy.h
124
src/nfs_proxy.h
@@ -1,124 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "cluster_client.h"
|
||||
#include "epoll_manager.h"
|
||||
#include "nfs_portmap.h"
|
||||
#include "nfs/xdr_impl.h"
|
||||
|
||||
#define RPC_INIT_BUF_SIZE 32768
|
||||
|
||||
class cli_tool_t;
|
||||
|
||||
// Metadata of one exported directory (values of nfs_proxy_t::dir_info)
struct nfs_dir_t
{
    uint64_t id;
    // etcd modification revision - presumably of the key the entry was
    // loaded from; confirm against the code that fills dir_info
    uint64_t mod_rev;
    timespec mtime;
};
|
||||
|
||||
class nfs_proxy_t
|
||||
{
|
||||
public:
|
||||
std::string bind_address;
|
||||
std::string name_prefix;
|
||||
uint64_t fsid = 1;
|
||||
uint64_t server_id = 0;
|
||||
std::string default_pool;
|
||||
std::string export_root;
|
||||
bool portmap_enabled;
|
||||
unsigned nfs_port;
|
||||
|
||||
pool_id_t default_pool_id;
|
||||
|
||||
portmap_service_t pmap;
|
||||
ring_loop_t *ringloop = NULL;
|
||||
epoll_manager_t *epmgr = NULL;
|
||||
cluster_client_t *cli = NULL;
|
||||
cli_tool_t *cmd = NULL;
|
||||
|
||||
std::vector<XDR*> xdr_pool;
|
||||
|
||||
// filehandle = "S"+base64(sha256(full name with prefix)) or "roothandle" for mount root)
|
||||
|
||||
uint64_t next_dir_id = 2;
|
||||
// filehandle => dir with name_prefix
|
||||
std::map<std::string, std::string> dir_by_hash;
|
||||
// dir with name_prefix => dir info
|
||||
std::map<std::string, nfs_dir_t> dir_info;
|
||||
// filehandle => inode ID
|
||||
std::map<std::string, inode_t> inode_by_hash;
|
||||
// inode ID => filehandle
|
||||
std::map<inode_t, std::string> hash_by_inode;
|
||||
// inode ID => statistics
|
||||
std::map<inode_t, json11::Json> inode_stats;
|
||||
// pool ID => statistics
|
||||
std::map<pool_id_t, json11::Json> pool_stats;
|
||||
|
||||
~nfs_proxy_t();
|
||||
|
||||
static json11::Json::object parse_args(int narg, const char *args[]);
|
||||
void run(json11::Json cfg);
|
||||
void watch_stats();
|
||||
void parse_stats(etcd_kv_t & kv);
|
||||
void check_default_pool();
|
||||
void do_accept(int listen_fd);
|
||||
void daemonize();
|
||||
};
|
||||
|
||||
// The buffer currently being received into from the client socket
struct rpc_cur_buffer_t
{
    uint8_t *buf;
    unsigned size;
    // Bytes received so far
    unsigned read_pos;
    // Bytes already consumed by the RPC record parser (parsed_pos <= read_pos)
    unsigned parsed_pos;
    // Number of queued replies still pointing into this buffer
    int refs;
};
|
||||
|
||||
// A retired receive buffer that is still referenced by unsent replies;
// recycled into free_buffers once refs drops to zero (see handle_send)
struct rpc_used_buffer_t
{
    unsigned size;
    int refs;
};
|
||||
|
||||
// A receive buffer available for reuse by select_read_buffer()
struct rpc_free_buffer_t
{
    uint8_t *buf;
    unsigned size;
};
|
||||
|
||||
// State of one connected NFS/portmap TCP client.
// Lifetime: created in nfs_proxy_t::do_accept(); destroys itself in stop()
// once all in-flight io_uring operations (tracked via refs) have completed.
class nfs_client_t
{
public:
    nfs_proxy_t *parent = NULL;
    int nfs_fd;
    int epoll_events = 0;
    // Number of in-flight io_uring operations referencing this object
    int refs = 0;
    bool stopped = false;
    // RPC procedure dispatch table, copied from the parent's portmap service
    std::set<rpc_service_proc_t> proc_table;

    // Read state
    rpc_cur_buffer_t cur_buffer = { 0 };
    // Retired buffers still referenced by unsent replies
    std::map<uint8_t*, rpc_used_buffer_t> used_buffers;
    std::vector<rpc_free_buffer_t> free_buffers;

    iovec read_iov;
    msghdr read_msg = { 0 };

    // Write state
    msghdr write_msg = { 0 };
    // send_list/outbox back the in-flight sendmsg; next_send_list/next_outbox
    // collect replies queued while a sendmsg is in flight (see rpc_queue_reply)
    std::vector<iovec> send_list, next_send_list;
    std::vector<rpc_op_t*> outbox, next_outbox;

    nfs_client_t();
    ~nfs_client_t();

    void select_read_buffer(unsigned wanted_size);
    void submit_read(unsigned wanted_size);
    void handle_read(int result);
    void submit_send();
    void handle_send(int result);
    int handle_rpc_message(void *base_buf, void *msg_buf, uint32_t msg_len);

    bool deref();
    void stop();
};
|
47
src/osd.cpp
47
src/osd.cpp
@@ -57,11 +57,7 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
|
||||
if (this->config["osd_memlock"] == "true" || this->config["osd_memlock"] == "1" || this->config["osd_memlock"] == "yes")
|
||||
{
|
||||
// Lock all OSD memory if requested
|
||||
if (mlockall(MCL_CURRENT|MCL_FUTURE
|
||||
#ifdef MCL_ONFAULT
|
||||
| MCL_ONFAULT
|
||||
#endif
|
||||
) != 0)
|
||||
if (mlockall(MCL_CURRENT|MCL_FUTURE|MCL_ONFAULT) != 0)
|
||||
{
|
||||
fprintf(stderr, "osd_memlock is set to true, but mlockall() failed: %s\n", strerror(errno));
|
||||
exit(-1);
|
||||
@@ -200,7 +196,46 @@ void osd_t::bind_socket()
|
||||
|
||||
// FIXME Support multiple listening sockets
|
||||
|
||||
listen_fd = create_and_bind_socket(bind_address, bind_port, listen_backlog, &listening_port);
|
||||
sockaddr addr;
|
||||
if (!string_to_addr(bind_address, 0, bind_port, &addr))
|
||||
{
|
||||
throw std::runtime_error("bind address "+bind_address+" is not valid");
|
||||
}
|
||||
|
||||
listen_fd = socket(addr.sa_family, SOCK_STREAM, 0);
|
||||
if (listen_fd < 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("socket: ") + strerror(errno));
|
||||
}
|
||||
int enable = 1;
|
||||
setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
|
||||
|
||||
if (bind(listen_fd, &addr, sizeof(addr)) < 0)
|
||||
{
|
||||
close(listen_fd);
|
||||
throw std::runtime_error(std::string("bind: ") + strerror(errno));
|
||||
}
|
||||
if (bind_port == 0)
|
||||
{
|
||||
socklen_t len = sizeof(addr);
|
||||
if (getsockname(listen_fd, (sockaddr *)&addr, &len) == -1)
|
||||
{
|
||||
close(listen_fd);
|
||||
throw std::runtime_error(std::string("getsockname: ") + strerror(errno));
|
||||
}
|
||||
listening_port = ntohs(((sockaddr_in*)&addr)->sin_port);
|
||||
}
|
||||
else
|
||||
{
|
||||
listening_port = bind_port;
|
||||
}
|
||||
|
||||
if (listen(listen_fd, listen_backlog) < 0)
|
||||
{
|
||||
close(listen_fd);
|
||||
throw std::runtime_error(std::string("listen: ") + strerror(errno));
|
||||
}
|
||||
|
||||
fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
|
||||
|
||||
epmgr->set_fd_handler(listen_fd, false, [this](int fd, int events)
|
||||
|
@@ -211,7 +211,7 @@ class osd_t
|
||||
// flushing, recovery and backfill
|
||||
void submit_pg_flush_ops(pg_t & pg);
|
||||
void handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval);
|
||||
bool submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data);
|
||||
void submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data);
|
||||
bool pick_next_recovery(osd_recovery_op_t &op);
|
||||
void submit_recovery_op(osd_recovery_op_t *op);
|
||||
bool continue_recovery();
|
||||
|
@@ -189,7 +189,7 @@ void osd_t::report_statistics()
|
||||
for (auto kv: bs->get_inode_space_stats())
|
||||
{
|
||||
pool_id_t pool_id = INODE_POOL(kv.first);
|
||||
uint64_t only_inode_num = INODE_NO_POOL(kv.first);
|
||||
uint64_t only_inode_num = (kv.first & ((1l << (64-POOL_ID_BITS)) - 1));
|
||||
if (!last_pool || pool_id != last_pool)
|
||||
{
|
||||
if (last_pool)
|
||||
@@ -207,7 +207,7 @@ void osd_t::report_statistics()
|
||||
for (auto kv: inode_stats)
|
||||
{
|
||||
pool_id_t pool_id = INODE_POOL(kv.first);
|
||||
uint64_t only_inode_num = (kv.first & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1));
|
||||
uint64_t only_inode_num = (kv.first & ((1l << (64-POOL_ID_BITS)) - 1));
|
||||
if (!last_pool || pool_id != last_pool)
|
||||
{
|
||||
if (last_pool)
|
||||
@@ -457,8 +457,7 @@ void osd_t::renew_lease()
|
||||
if (err == "" && data["result"]["TTL"].string_value() == "")
|
||||
{
|
||||
// Die
|
||||
fprintf(stderr, "Error refreshing etcd lease\n");
|
||||
force_stop(1);
|
||||
throw std::runtime_error("etcd lease has expired");
|
||||
}
|
||||
if (err != "")
|
||||
{
|
||||
@@ -467,8 +466,7 @@ void osd_t::renew_lease()
|
||||
if (etcd_failed_attempts > st_cli.max_etcd_attempts)
|
||||
{
|
||||
// Die
|
||||
fprintf(stderr, "Cluster connection failed\n");
|
||||
force_stop(1);
|
||||
throw std::runtime_error("Cluster connection failed");
|
||||
}
|
||||
// Retry
|
||||
tfd->set_timer(st_cli.etcd_quick_timeout, false, [this](int timer_id)
|
||||
|
@@ -47,8 +47,7 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
|
||||
if (l.second.size() > 0)
|
||||
{
|
||||
fb->flush_ops++;
|
||||
if (!submit_flush_op(pg.pool_id, pg.pg_num, fb, true, l.first, l.second.size(), l.second.data()))
|
||||
return;
|
||||
submit_flush_op(pg.pool_id, pg.pg_num, fb, true, l.first, l.second.size(), l.second.data());
|
||||
}
|
||||
}
|
||||
for (auto & l: fb->stable_lists)
|
||||
@@ -56,8 +55,7 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
|
||||
if (l.second.size() > 0)
|
||||
{
|
||||
fb->flush_ops++;
|
||||
if (!submit_flush_op(pg.pool_id, pg.pg_num, fb, false, l.first, l.second.size(), l.second.data()))
|
||||
return;
|
||||
submit_flush_op(pg.pool_id, pg.pg_num, fb, false, l.first, l.second.size(), l.second.data());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -162,7 +160,7 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
|
||||
}
|
||||
}
|
||||
|
||||
bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data)
|
||||
void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data)
|
||||
{
|
||||
osd_op_t *op = new osd_op_t();
|
||||
// Copy buffer so it gets freed along with the operation
|
||||
@@ -190,8 +188,10 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
||||
else
|
||||
{
|
||||
// Peer
|
||||
int peer_fd = msgr.osd_peer_fds[peer_osd];
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->iov.push_back(op->buf, count * sizeof(obj_ver_id));
|
||||
op->peer_fd = peer_fd;
|
||||
op->req = (osd_any_op_t){
|
||||
.sec_stab = {
|
||||
.header = {
|
||||
@@ -207,21 +207,8 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
||||
handle_flush_op(op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK, pool_id, pg_num, fb, peer_osd, op->reply.hdr.retval);
|
||||
delete op;
|
||||
};
|
||||
auto peer_fd_it = msgr.osd_peer_fds.find(peer_osd);
|
||||
if (peer_fd_it != msgr.osd_peer_fds.end())
|
||||
{
|
||||
op->peer_fd = peer_fd_it->second;
|
||||
msgr.outbox_push(op);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fail it immediately
|
||||
op->reply.hdr.retval = -EPIPE;
|
||||
op->callback(op);
|
||||
return false;
|
||||
}
|
||||
msgr.outbox_push(op);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
|
||||
|
@@ -9,7 +9,7 @@
|
||||
#define POOL_ID_MAX 0x10000
|
||||
#define POOL_ID_BITS 16
|
||||
#define INODE_POOL(inode) (pool_id_t)((inode) >> (64 - POOL_ID_BITS))
|
||||
#define INODE_NO_POOL(inode) (inode_t)(inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1))
|
||||
#define INODE_NO_POOL(inode) (inode_t)(inode & ((1l << (64-POOL_ID_BITS)) - 1))
|
||||
#define INODE_WITH_POOL(pool_id, inode) (((inode_t)(pool_id) << (64-POOL_ID_BITS)) | INODE_NO_POOL(inode))
|
||||
|
||||
// Pool ID is 16 bits long
|
||||
|
@@ -29,10 +29,8 @@ void osd_t::handle_peers()
|
||||
degraded_objects += p.second.degraded_objects.size();
|
||||
if (p.second.state & PG_HAS_UNCLEAN)
|
||||
peering_state = peering_state | OSD_FLUSHING_PGS;
|
||||
else if (p.second.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED))
|
||||
else if (p.second.state & PG_HAS_DEGRADED)
|
||||
peering_state = peering_state | OSD_RECOVERING;
|
||||
ringloop->wakeup();
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -342,7 +340,7 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
|
||||
else
|
||||
{
|
||||
// Peer
|
||||
auto & cl = msgr.clients.at(msgr.osd_peer_fds.at(role_osd));
|
||||
auto & cl = msgr.clients.at(msgr.osd_peer_fds[role_osd]);
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->peer_fd = cl->peer_fd;
|
||||
@@ -396,9 +394,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
||||
{
|
||||
if (op->bs_op->retval < 0)
|
||||
{
|
||||
printf("Local OP_LIST failed: retval=%d\n", op->bs_op->retval);
|
||||
force_stop(1);
|
||||
return;
|
||||
throw std::runtime_error("local OP_LIST failed");
|
||||
}
|
||||
add_bs_subop_stats(op);
|
||||
printf(
|
||||
@@ -423,7 +419,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
||||
// Peer
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->peer_fd = msgr.osd_peer_fds.at(role_osd);
|
||||
op->peer_fd = msgr.osd_peer_fds[role_osd];
|
||||
op->req = (osd_any_op_t){
|
||||
.sec_list = {
|
||||
.header = {
|
||||
|
@@ -437,7 +437,7 @@ void pg_t::calc_object_states(int log_level)
|
||||
st.walk();
|
||||
if (this->state & (PG_DEGRADED|PG_LEFT_ON_DEAD))
|
||||
{
|
||||
assert(epoch != (((uint64_t)1 << PG_EPOCH_BITS)-1));
|
||||
assert(epoch != ((1ul << PG_EPOCH_BITS)-1));
|
||||
epoch++;
|
||||
}
|
||||
}
|
||||
|
@@ -194,22 +194,18 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
|
||||
// Determine version
|
||||
auto vo_it = pg.ver_override.find(op_data->oid);
|
||||
op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
|
||||
op_data->prev_set = pg.cur_set.data();
|
||||
if (pg.state != PG_ACTIVE)
|
||||
{
|
||||
// PG may be degraded or have misplaced objects
|
||||
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
|
||||
}
|
||||
if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
// Fast happy-path
|
||||
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0);
|
||||
submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);
|
||||
submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, pg.cur_set.data(), cur_op);
|
||||
op_data->st = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (extend_missing_stripes(op_data->stripes, op_data->prev_set, op_data->pg_data_size, pg.pg_size) < 0)
|
||||
// PG may be degraded or have misplaced objects
|
||||
uint64_t* cur_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
|
||||
if (extend_missing_stripes(op_data->stripes, cur_set, op_data->pg_data_size, pg.pg_size) < 0)
|
||||
{
|
||||
finish_op(cur_op, -EIO);
|
||||
return;
|
||||
@@ -219,7 +215,7 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
|
||||
op_data->scheme = pg.scheme;
|
||||
op_data->degraded = 1;
|
||||
cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_size, 0);
|
||||
submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);
|
||||
submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, cur_set, cur_op);
|
||||
op_data->st = 1;
|
||||
}
|
||||
}
|
||||
|
@@ -246,6 +246,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
|
||||
// Send to a remote OSD
|
||||
osd_op_t *subop = op_data->subops+subop_idx;
|
||||
subop->op_type = OSD_OP_OUT;
|
||||
subop->peer_fd = msgr.osd_peer_fds.at(subop_osd_num);
|
||||
// FIXME: Use the pre-allocated buffer
|
||||
subop->buf = malloc_or_die(sizeof(obj_ver_id)*(i+1-prev));
|
||||
subop->req = (osd_any_op_t){
|
||||
@@ -286,18 +287,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
|
||||
}
|
||||
handle_primary_subop(subop, cur_op);
|
||||
};
|
||||
auto peer_fd_it = msgr.osd_peer_fds.find(subop_osd_num);
|
||||
if (peer_fd_it != msgr.osd_peer_fds.end())
|
||||
{
|
||||
subop->peer_fd = peer_fd_it->second;
|
||||
msgr.outbox_push(subop);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fail it immediately
|
||||
subop->reply.hdr.retval = -EPIPE;
|
||||
subop->callback(subop);
|
||||
}
|
||||
msgr.outbox_push(subop);
|
||||
subop_idx++;
|
||||
}
|
||||
prev = i+1;
|
||||
@@ -400,21 +390,18 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
||||
stripes[role].read_end = stripes[role].req_end;
|
||||
}
|
||||
uint64_t *cur_set = pg.cur_set.data();
|
||||
if (pg.state != PG_ACTIVE)
|
||||
if (pg.state != PG_ACTIVE && op_data->scheme != POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
pg_osd_set_state_t *object_state;
|
||||
cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
|
||||
if (op_data->scheme != POOL_SCHEME_REPLICATED)
|
||||
if (extend_missing_stripes(stripes, cur_set, pg.pg_data_size, pg.pg_size) < 0)
|
||||
{
|
||||
if (extend_missing_stripes(stripes, cur_set, pg.pg_data_size, pg.pg_size) < 0)
|
||||
{
|
||||
free(op_data->chain_reads);
|
||||
op_data->chain_reads = NULL;
|
||||
finish_op(cur_op, -EIO);
|
||||
return -1;
|
||||
}
|
||||
op_data->degraded = 1;
|
||||
free(op_data->chain_reads);
|
||||
op_data->chain_reads = NULL;
|
||||
finish_op(cur_op, -EIO);
|
||||
return -1;
|
||||
}
|
||||
op_data->degraded = 1;
|
||||
}
|
||||
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
@@ -468,7 +455,7 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
||||
auto vo_it = pg.ver_override.find(cur_oid);
|
||||
uint64_t target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
|
||||
uint64_t *cur_set = pg.cur_set.data();
|
||||
if (pg.state != PG_ACTIVE)
|
||||
if (pg.state != PG_ACTIVE && op_data->scheme != POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
pg_osd_set_state_t *object_state;
|
||||
cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
|
||||
|
@@ -182,6 +182,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
||||
else
|
||||
{
|
||||
subop->op_type = OSD_OP_OUT;
|
||||
subop->peer_fd = msgr.osd_peer_fds.at(role_osd_num);
|
||||
subop->bitmap = stripes[stripe_num].bmp_buf;
|
||||
subop->bitmap_len = clean_entry_bitmap_size;
|
||||
subop->req.sec_rw = {
|
||||
@@ -224,18 +225,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
||||
{
|
||||
handle_primary_subop(subop, cur_op);
|
||||
};
|
||||
auto peer_fd_it = msgr.osd_peer_fds.find(role_osd_num);
|
||||
if (peer_fd_it != msgr.osd_peer_fds.end())
|
||||
{
|
||||
subop->peer_fd = peer_fd_it->second;
|
||||
msgr.outbox_push(subop);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fail it immediately
|
||||
subop->reply.hdr.retval = -EPIPE;
|
||||
subop->callback(subop);
|
||||
}
|
||||
msgr.outbox_push(subop);
|
||||
}
|
||||
i++;
|
||||
}
|
||||
@@ -473,6 +463,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
|
||||
else
|
||||
{
|
||||
subops[i].op_type = OSD_OP_OUT;
|
||||
subops[i].peer_fd = msgr.osd_peer_fds.at(chunk.osd_num);
|
||||
subops[i].req = (osd_any_op_t){ .sec_del = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
@@ -486,18 +477,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
|
||||
{
|
||||
handle_primary_subop(subop, cur_op);
|
||||
};
|
||||
auto peer_fd_it = msgr.osd_peer_fds.find(chunk.osd_num);
|
||||
if (peer_fd_it != msgr.osd_peer_fds.end())
|
||||
{
|
||||
subops[i].peer_fd = peer_fd_it->second;
|
||||
msgr.outbox_push(&subops[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fail it immediately
|
||||
subops[i].reply.hdr.retval = -EPIPE;
|
||||
subops[i].callback(&subops[i]);
|
||||
}
|
||||
msgr.outbox_push(&subops[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -587,6 +567,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
||||
else
|
||||
{
|
||||
subops[i].op_type = OSD_OP_OUT;
|
||||
subops[i].peer_fd = msgr.osd_peer_fds.at(stab_osd.osd_num);
|
||||
subops[i].req = (osd_any_op_t){ .sec_stab = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
@@ -600,18 +581,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
||||
{
|
||||
handle_primary_subop(subop, cur_op);
|
||||
};
|
||||
auto peer_fd_it = msgr.osd_peer_fds.find(stab_osd.osd_num);
|
||||
if (peer_fd_it != msgr.osd_peer_fds.end())
|
||||
{
|
||||
subops[i].peer_fd = peer_fd_it->second;
|
||||
msgr.outbox_push(&subops[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fail it immediately
|
||||
subops[i].reply.hdr.retval = -EPIPE;
|
||||
subops[i].callback(&subops[i]);
|
||||
}
|
||||
msgr.outbox_push(&subops[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -144,9 +144,9 @@ resume_3:
|
||||
}
|
||||
else
|
||||
{
|
||||
if ((op_data->fact_ver & ((uint64_t)1 << (64-PG_EPOCH_BITS) - 1)) == ((uint64_t)1 << (64-PG_EPOCH_BITS) - 1))
|
||||
if ((op_data->fact_ver & (1ul<<(64-PG_EPOCH_BITS) - 1)) == (1ul<<(64-PG_EPOCH_BITS) - 1))
|
||||
{
|
||||
assert(pg.epoch != (((uint64_t)1 << PG_EPOCH_BITS)-1));
|
||||
assert(pg.epoch != ((1ul << PG_EPOCH_BITS)-1));
|
||||
pg.epoch++;
|
||||
}
|
||||
op_data->target_ver = op_data->fact_ver + 1;
|
||||
|
@@ -8,7 +8,7 @@
|
||||
#include "osd_id.h"
|
||||
|
||||
#ifndef MEM_ALIGNMENT
|
||||
#define MEM_ALIGNMENT 4096
|
||||
#define MEM_ALIGNMENT 512
|
||||
#endif
|
||||
|
||||
struct buf_len_t
|
||||
|
@@ -134,14 +134,14 @@ int main(int narg, char *args[])
|
||||
|
||||
int connect_osd(const char *osd_address, int osd_port)
|
||||
{
|
||||
struct sockaddr_storage addr;
|
||||
struct sockaddr addr;
|
||||
if (!string_to_addr(osd_address, 0, osd_port, &addr))
|
||||
{
|
||||
fprintf(stderr, "server address: %s is not valid\n", osd_address);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int connect_fd = socket(addr.ss_family, SOCK_STREAM, 0);
|
||||
int connect_fd = socket(addr.sa_family, SOCK_STREAM, 0);
|
||||
if (connect_fd < 0)
|
||||
{
|
||||
perror("socket");
|
||||
|
@@ -262,7 +262,7 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
|
||||
client->pool = qdict_get_try_int(options, "pool", 0);
|
||||
if (client->pool)
|
||||
{
|
||||
client->inode = (client->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (client->pool << (64-POOL_ID_BITS));
|
||||
client->inode = (client->inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (client->pool << (64-POOL_ID_BITS));
|
||||
}
|
||||
client->size = qdict_get_try_int(options, "size", 0);
|
||||
}
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user