Compare commits

..

1 Commits

Author SHA1 Message Date
61ae4e903a WIP/experimental LRC matrix generator 2022-04-14 20:16:09 +03:00
126 changed files with 867 additions and 13883 deletions

View File

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8)
project(vitastor)
set(VERSION "0.6.17")
set(VERSION "0.6.16")
add_subdirectory(src)

View File

@@ -52,7 +52,6 @@ Vitastor на данный момент находится в статусе п
- Слияние снапшотов (vitastor-cli {snap-rm,flatten,merge})
- Консольный интерфейс для управления образами (vitastor-cli {ls,create,modify})
- Плагин для Proxmox
- Упрощённая NFS-прокси для эмуляции файлового доступа к образам (подходит для VMWare)
## Планы развития
@@ -60,6 +59,7 @@ Vitastor на данный момент находится в статусе п
- Другие инструменты администрирования
- Плагины для OpenNebula и других облачных систем
- iSCSI-прокси
- Упрощённый NFS прокси
- Более быстрое переключение при отказах
- Фоновая проверка целостности без контрольных сумм (сверка реплик)
- Контрольные суммы
@@ -530,48 +530,9 @@ vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
Для обращения по номеру инода, аналогично другим командам, можно использовать опции
`--pool <POOL> --inode <INODE> --size <SIZE>` вместо `--image testimg`.
### NFS
В Vitastor реализована упрощённая NFS 3.0 прокси для эмуляции файлового доступа к образам.
Это не полноценная файловая система, т.к. метаданные всех файлов (образов) сохраняются
в etcd и всё время хранятся в оперативной памяти - то есть, положить туда много файлов
не получится.
Однако в качестве способа доступа к образам виртуальных машин NFS прокси прекрасно подходит
и позволяет подключить Vitastor, например, к VMWare.
При этом, если вы используете режим immediate_commit=all (для SSD с конденсаторами или HDD
с отключённым кэшем), то NFS-сервер не имеет состояния и вы можете свободно поднять
его в нескольких экземплярах и использовать поверх них сетевой балансировщик нагрузки или
схему с отказоустойчивостью.
Использование vitastor-nfs:
```
vitastor-nfs [--etcd_address ADDR] [ДРУГИЕ ОПЦИИ]
--subdir <DIR> экспортировать "поддиректорию" - образы с префиксом имени <DIR>/ (по умолчанию пусто - экспортировать все образы)
--portmap 0 отключить сервис portmap/rpcbind на порту 111 (по умолчанию включён и требует root привилегий)
--bind <IP> принимать соединения по адресу <IP> (по умолчанию 0.0.0.0 - на всех)
--nfspath <PATH> установить путь NFS-экспорта в <PATH> (по умолчанию /)
--port <PORT> использовать порт <PORT> для NFS-сервисов (по умолчанию 2049)
--pool <POOL> использовать пул <POOL> для новых образов (обязательно, если пул в кластере не один)
--foreground 1 не уходить в фон после запуска
```
Пример монтирования Vitastor через NFS:
```
vitastor-nfs --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool
```
```
mount localhost:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
```
### Kubernetes
У Vitastor есть CSI-плагин для Kubernetes, поддерживающий RWO, а также блочные RWX, тома.
У Vitastor есть CSI-плагин для Kubernetes, поддерживающий RWO-тома.
Для установки возьмите манифесты из директории [csi/deploy/](csi/deploy/), поместите
вашу конфигурацию подключения к Vitastor в [csi/deploy/001-csi-config-map.yaml](001-csi-config-map.yaml),

View File

@@ -46,7 +46,6 @@ breaking changes in the future. However, the following is implemented:
- Snapshot merge tool (vitastor-cli {snap-rm,flatten,merge})
- Image management CLI (vitastor-cli {ls,create,modify})
- Proxmox storage plugin
- Simplified NFS proxy for file-based image access emulation (suitable for VMWare)
## Roadmap
@@ -54,6 +53,7 @@ breaking changes in the future. However, the following is implemented:
- Other administrative tools
- Plugins for OpenNebula and other cloud systems
- iSCSI proxy
- Simplified NFS proxy
- Faster failover
- Scrubbing without checksums (verification of replicas)
- Checksums
@@ -479,49 +479,9 @@ It will output the device name, like /dev/nbd0 which you can then format and mou
Again, you can use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.
### NFS
Vitastor has a simplified NFS 3.0 proxy for file-based image access emulation. It's not
suitable as a full-featured file system, at least because all file/image metadata is stored
in etcd and kept in memory all the time - thus you can't put a lot of files in it.
However, NFS proxy is totally fine as a method to provide VM image access and allows to
plug Vitastor into, for example, VMWare. It's important to note that for VMWare it's a much
better access method than iSCSI, because with iSCSI we'd have to put all VM images into one
Vitastor image exported as a LUN to VMWare and formatted with VMFS. VMWare doesn't use VMFS
over NFS.
NFS proxy is stateless if you use immediate_commit=all mode (for SSD with capacitors or
HDDs with disabled cache), so you can run multiple NFS proxies and use a network load
balancer or any failover method you want to in that case.
vitastor-nfs usage:
```
vitastor-nfs [--etcd_address ADDR] [OTHER OPTIONS]
--subdir <DIR> export images prefixed <DIR>/ (default empty - export all images)
--portmap 0 do not listen on port 111 (portmap/rpcbind, requires root)
--bind <IP> bind service to <IP> address (default 0.0.0.0)
--nfspath <PATH> set NFS export path to <PATH> (default is /)
--port <PORT> use port <PORT> for NFS services (default is 2049)
--pool <POOL> use <POOL> as default pool for new files (images)
--foreground 1 stay in foreground, do not daemonize
```
Example start and mount commands:
```
vitastor-nfs --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool
```
```
mount localhost:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
```
### Kubernetes
Vitastor has a CSI plugin for Kubernetes which supports RWO (and block RWX) volumes.
Vitastor has a CSI plugin for Kubernetes which supports RWO volumes.
To deploy it, take manifests from [csi/deploy/](csi/deploy/) directory, put your
Vitastor configuration in [csi/deploy/001-csi-config-map.yaml](001-csi-config-map.yaml),

View File

@@ -1,4 +1,4 @@
VERSION ?= v0.6.17
VERSION ?= v0.6.16
all: build push

View File

@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v0.6.17
image: vitalif/vitastor-csi:v0.6.16
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -116,7 +116,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v0.6.17
image: vitalif/vitastor-csi:v0.6.16
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -1,13 +0,0 @@
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: test-vitastor-pvc-block
spec:
storageClassName: vitastor
volumeMode: Block
accessModes:
- ReadWriteMany
resources:
requests:
storage: 10Gi

View File

@@ -1,17 +0,0 @@
apiVersion: v1
kind: Pod
metadata:
name: vitastor-test-block-pvc
namespace: default
spec:
containers:
- name: vitastor-test-block-pvc
image: nginx
volumeDevices:
- name: data
devicePath: /dev/xvda
volumes:
- name: data
persistentVolumeClaim:
claimName: test-vitastor-pvc-block
readOnly: false

View File

@@ -1,17 +0,0 @@
apiVersion: v1
kind: Pod
metadata:
name: vitastor-test-nginx
namespace: default
spec:
containers:
- name: vitastor-test-nginx
image: nginx
volumeMounts:
- mountPath: /usr/share/nginx/html/s3
name: data
volumes:
- name: data
persistentVolumeClaim:
claimName: test-vitastor-pvc
readOnly: false

View File

@@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "0.6.17"
vitastorCSIDriverVersion = "0.6.16"
)
// Config struct fills the parameters of request or user input

View File

@@ -67,44 +67,29 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
klog.Infof("received node publish volume request %+v", protosanitizer.StripSecrets(req))
targetPath := req.GetTargetPath()
isBlock := req.GetVolumeCapability().GetBlock() != nil
// Check that it's not already mounted
_, error := mount.IsNotMountPoint(ns.mounter, targetPath)
free, error := mount.IsNotMountPoint(ns.mounter, targetPath)
if (error != nil)
{
if (os.IsNotExist(error))
{
if (isBlock)
error := os.MkdirAll(targetPath, 0777)
if (error != nil)
{
pathFile, err := os.OpenFile(targetPath, os.O_CREATE|os.O_RDWR, 0o600)
if (err != nil)
{
klog.Errorf("failed to create block device mount target %s with error: %v", targetPath, err)
return nil, status.Error(codes.Internal, err.Error())
}
err = pathFile.Close()
if (err != nil)
{
klog.Errorf("failed to close %s with error: %v", targetPath, err)
return nil, status.Error(codes.Internal, err.Error())
}
}
else
{
err := os.MkdirAll(targetPath, 0777)
if (err != nil)
{
klog.Errorf("failed to create fs mount target %s with error: %v", targetPath, err)
return nil, status.Error(codes.Internal, err.Error())
}
return nil, status.Error(codes.Internal, error.Error())
}
free = true
}
else
{
return nil, status.Error(codes.Internal, error.Error())
}
}
if (!free)
{
return &csi.NodePublishVolumeResponse{}, nil
}
ctxVars := make(map[string]string)
err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
@@ -164,6 +149,7 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
// Format the device (ext4 or xfs)
fsType := req.GetVolumeCapability().GetMount().GetFsType()
isBlock := req.GetVolumeCapability().GetBlock() != nil
opt := req.GetVolumeCapability().GetMount().GetMountFlags()
opt = append(opt, "_netdev")
if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||

2
debian/changelog vendored
View File

@@ -1,4 +1,4 @@
vitastor (0.6.17-1) unstable; urgency=medium
vitastor (0.6.16-1) unstable; urgency=medium
* RDMA support
* Bugfixes

View File

@@ -2,6 +2,5 @@ usr/bin/vita
usr/bin/vitastor-cli
usr/bin/vitastor-rm
usr/bin/vitastor-nbd
usr/bin/vitastor-nfs
usr/lib/*/libvitastor*.so*
mon/make-osd.sh /usr/lib/vitastor

View File

@@ -33,8 +33,8 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-0.6.17; \
cd vitastor-0.6.17; \
cp -r /root/vitastor vitastor-0.6.16; \
cd vitastor-0.6.16; \
ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -47,8 +47,8 @@ RUN set -e -x; \
rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.17.orig.tar.xz vitastor-0.6.17; \
cd vitastor-0.6.17; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.16.orig.tar.xz vitastor-0.6.16; \
cd vitastor-0.6.16; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

View File

@@ -1,55 +0,0 @@
#!/usr/bin/nodejs

// Builds the Hugo configuration reference pages: for every params/<name>.yml file
// and every language, writes hugo/content/config/<name>.<lang>.md which consists of
// params/head/<name>.<lang>.md followed by one markdown section per described parameter.

const fs = require('fs');
const yaml = require('yaml');

// Translated field labels; a label missing here is printed under its English name
const L = {
    en: {},
    ru: {
        Type: 'Тип',
        Default: 'Значение по умолчанию',
        Minimum: 'Минимальное значение',
    },
};

// Human-readable names of the parameter types, per language
const types = {
    en: {
        string: 'string',
        bool: 'boolean',
        int: 'integer',
        sec: 'seconds',
        ms: 'milliseconds',
        us: 'microseconds',
    },
    ru: {
        string: 'строка',
        bool: 'булево (да/нет)',
        int: 'целое число',
        sec: 'секунды',
        ms: 'миллисекунды',
        us: 'микросекунды',
    },
};

const params_dir = __dirname+'/params';
const output_dir = __dirname+'/hugo/content/config/';

// Returns the translation of label <key> for <lang>, or <key> itself when there is none
function label(lang, key)
{
    return L[lang][key] || key;
}

// Renders the markdown section for one parameter description <c> in language <lang>
function render_param(c, lang)
{
    // Per-language type override first, then the known type name, then the raw type
    const type_name = c["type_"+lang] || types[lang][c.type] || c.type;
    let text = `\n\n## ${c.name}\n\n- ${label(lang, 'Type')}: ${type_name}\n`;
    if (c.default !== undefined)
        text += `- ${label(lang, 'Default')}: ${c.default}\n`;
    if (c.min !== undefined)
        text += `- ${label(lang, 'Minimum')}: ${c.min}\n`;
    // Localized description falls back to the generic one; trailing whitespace is dropped
    const info = c["info_"+lang] || c["info"];
    return text+`\n`+info.replace(/\s+$/, '');
}

// Base names (without the .yml extension) of all parameter description files
const params_files = fs.readdirSync(params_dir)
    .filter(f => f.endsWith('.yml'))
    .map(f => f.slice(0, -4));

for (const file of params_files)
{
    const cfg = yaml.parse(fs.readFileSync(params_dir+'/'+file+'.yml', { encoding: 'utf-8' }));
    for (const lang of Object.keys(types))
    {
        let body = '\n\n{{< toc >}}';
        for (const c of cfg)
            body += render_param(c, lang);
        // The hand-written page header is read only after the body has been rendered
        const head = fs.readFileSync(params_dir+'/head/'+file+'.'+lang+'.md', { encoding: 'utf-8' });
        fs.writeFileSync(output_dir+file+'.'+lang+'.md', head.replace(/\s+$/, '')+body+"\n");
    }
}

View File

@@ -1,6 +0,0 @@
---
title: "{{ replace .Name "-" " " | title }}"
date: {{ .Date }}
draft: true
---

View File

@@ -1,35 +0,0 @@
baseURL: http://localhost
title: Vitastor
theme: hugo-geekdoc
#languageCode: en-us
pluralizeListTitles: false
# Geekdoc required configuration
pygmentsUseClasses: true
pygmentsCodeFences: true
disablePathToLower: true
# Required if you want to render robots.txt template
enableRobotsTXT: true
defaultContentLanguage: en
languages:
en:
weight: 1
languageName: English
ru:
weight: 1
languageName: Русский
markup:
goldmark:
renderer:
# Needed for mermaid shortcode
unsafe: true
tableOfContents:
startLevel: 1
endLevel: 9
taxonomies:
tag: tags

View File

@@ -1,6 +0,0 @@
## The Idea
Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
architecturally similar to Ceph which means strong consistency, primary-replication,
symmetric clustering and automatic data distribution over any number of drives
of any size with configurable redundancy (replication or erasure codes/XOR).

View File

@@ -1,61 +0,0 @@
---
title: Parameter Reference
weight: 1
---
Vitastor configuration consists of:
- Configuration parameters (key-value), described here
- [Pool configuration]({{< ref "config/pool" >}})
- OSD placement tree configuration
- Inode configuration i.e. image metadata like name, size and parent reference
Configuration parameters can be set in 3 places:
- Configuration file (`/etc/vitastor/vitastor.conf` or other path)
- etcd key `/vitastor/config/global`. Most variables can be set there, but etcd
connection parameters should obviously be set in the configuration file.
- Command line of Vitastor components: OSD, mon, fio and QEMU options,
OpenStack/Proxmox/etc configuration. The latter doesn't allow to set all
variables directly, but it allows to override the configuration file and
set everything you need inside it.
In the future, additional configuration methods may be added:
- OSD superblock which will, by design, contain parameters related to the disk
layout and to one specific OSD.
- OSD-specific keys in etcd like `/vitastor/config/osd/<number>`.
## Common Parameters
These are the most common parameters which apply to all components of Vitastor.
[See the list]({{< ref "common" >}})
## Cluster-Wide Disk Layout Parameters
These parameters apply to clients and OSDs and can't be changed after OSD
initialization.
[See the list]({{< ref "layout-cluster" >}})
## OSD Disk Layout Parameters
These parameters apply to OSDs and can't be changed after OSD initialization.
[See the list]({{< ref "layout-osd" >}})
## Network Protocol Parameters
These parameters apply to clients and OSDs and can be changed with a restart.
[See the list]({{< ref "network" >}})
## Runtime OSD Parameters
These parameters apply to OSDs and can be changed with an OSD restart.
[See the list]({{< ref "osd" >}})
## Monitor Parameters
These parameters only apply to Monitors.
[See the list]({{< ref "monitor" >}})

View File

@@ -1,63 +0,0 @@
---
title: Перечень настроек
weight: 1
---
Конфигурация Vitastor состоит из:
- Параметров (ключ-значение), описанных на данной странице
- Настроек пулов
- Настроек дерева OSD
- Настроек инодов, т.е. метаданных образов, таких, как имя, размер и ссылки на
родительский образ
Параметры конфигурации могут задаваться в 3 местах:
- Файле конфигурации (`/etc/vitastor/vitastor.conf` или по другому пути)
- Ключе в etcd `/vitastor/config/global`. Большая часть параметров может
задаваться там, кроме, естественно, самих параметров соединения с etcd,
которые должны задаваться в файле конфигурации
- В командной строке компонентов Vitastor: OSD, монитора, опциях fio и QEMU,
настроек OpenStack, Proxmox и т.п. Последние, как правило, не включают полный
набор параметров напрямую, но разрешают определить путь к файлу конфигурации
и задать любые параметры в нём.
В будущем также могут быть добавлены другие способы конфигурации:
- Суперблок OSD, в котором будут храниться параметры OSD, связанные с дисковым
форматом и с этим конкретным OSD.
- OSD-специфичные ключи в etcd типа `/vitastor/config/osd/<номер>`.
## Общие параметры
Это наиболее общие параметры, используемые всеми компонентами Vitastor.
[Посмотреть список]({{< ref "common" >}})
## Дисковые параметры уровня кластера
Эти параметры используются клиентами и OSD и не могут быть изменены после
инициализации OSD.
[Посмотреть список]({{< ref "layout-cluster" >}})
## Дисковые параметры OSD
Эти параметры используются OSD и не могут быть изменены после инициализации OSD.
[Посмотреть список]({{< ref "layout-osd" >}})
## Параметры сетевого протокола
Эти параметры используются клиентами и OSD и могут быть изменены с перезапуском.
[Посмотреть список]({{< ref "network" >}})
## Изменяемые параметры OSD
Эти параметры используются OSD и могут быть изменены с перезапуском.
[Посмотреть список]({{< ref "osd" >}})
## Параметры мониторов
Данные параметры используются только мониторами Vitastor.
[Посмотреть список]({{< ref "monitor" >}})

View File

@@ -1,178 +0,0 @@
---
title: Pool configuration
weight: 100
---
Pool configuration is set in etcd key `/vitastor/config/pools` in the following
JSON format:
```
{
"<Numeric ID>": {
"name": "<name>",
...other parameters...
}
}
```
{{< toc >}}
# Parameters
## name
- Type: string
- Required
Pool name.
## scheme
- Type: string
- Required
- One of: "replicated", "xor" or "jerasure"
Redundancy scheme used for data in this pool.
## pg_size
- Type: integer
- Required
Total number of disks for PGs of this pool - i.e., number of replicas for
replicated pools and number of data plus parity disks for EC/XOR pools.
## parity_chunks
- Type: integer
Number of parity chunks for EC/XOR pools. For such pools, data will be lost
if you lose more than parity_chunks disks at once, so this parameter can be
equally described as FTT (number of failures to tolerate).
Required for EC/XOR pools, ignored for replicated pools.
## pg_minsize
- Type: integer
- Required
Number of available live disks for PGs of this pool to remain active.
That is, if it becomes impossible to place PG data on at least (pg_minsize)
OSDs, PG is deactivated for both read and write. So you know that a fresh
write always goes to at least (pg_minsize) OSDs (disks).
FIXME: pg_minsize behaviour may be changed in the future to only make PGs
read-only instead of deactivating them.
## pg_count
- Type: integer
- Required
Number of PGs for this pool. The value should be big enough for the monitor /
LP solver to be able to optimize data placement.
"Enough" is usually around 64-128 PGs per OSD, i.e. you set pg_count for pool
to (total OSD count * 100 / pg_size). You can round it to the closest power of 2,
because it makes it easier to reduce or increase PG count later by dividing or
multiplying it by 2.
In Vitastor, PGs are ephemeral, so you can change pool PG count anytime just
by overwriting pool configuration in etcd. Amount of the data affected by
rebalance will be smaller if the new PG count is a multiple of the old PG count
or vice versa.
## failure_domain
- Type: string
- Default: host
Failure domain specification. Must be "host" or "osd" or refer to one of the
placement tree levels, defined in [placement_levels]({{< ref "config/monitor#placement_levels" >}}).
Two replicas, or two parts in case of EC/XOR, of the same block of data are
never put on OSDs in the same failure domain (for example, on the same host).
So the failure domain specifies the unit whose failure you are protecting yourself
from.
## max_osd_combinations
- Type: integer
- Default: 10000
Vitastor data placement algorithm is based on the LP solver and OSD combinations
which are fed to it are generated randomly. This parameter specifies the maximum
number of combinations to generate when optimising PG placement.
This parameter usually doesn't require to be changed.
## pg_stripe_size
- Type: integer
- Default: 0
Specifies the stripe size for this pool according to which images are split into
different PGs. Stripe size can't be smaller than [block_size]({{< ref "config/layout-cluster#block_size" >}})
multiplied by (pg_size - parity_chunks) for EC/XOR pools, or 1 for replicated pools,
and the same value is used by default.
This means first `pg_stripe_size = (block_size * (pg_size-parity_chunks))` bytes
of an image go to one PG, next `pg_stripe_size` bytes go to another PG and so on.
Usually doesn't require to be changed separately from the block size.
## root_node
- Type: string
Specifies the root node of the OSD tree to restrict this pool OSDs to.
Referenced root node must exist in /vitastor/config/node_placement.
## osd_tags
- Type: string or array of strings
Specifies OSD tags to restrict this pool to. If multiple tags are specified,
only OSDs having all of these tags will be used for this pool.
## primary_affinity_tags
- Type: string or array of strings
Specifies OSD tags to prefer putting primary OSDs in this pool to.
Note that for EC/XOR pools Vitastor always prefers to put primary OSD on one
of the OSDs containing a data chunk for a PG.
# Examples
## Replicated pool
```
{
"1": {
"name":"testpool",
"scheme":"replicated",
"pg_size":2,
"pg_minsize":1,
"pg_count":256,
"failure_domain":"host"
}
}
```
## Erasure-coded pool
```
{
"2": {
"name":"ecpool",
"scheme":"jerasure",
"pg_size":3,
"parity_chunks":1,
"pg_minsize":2,
"pg_count":256,
"failure_domain":"host"
}
}
```

View File

@@ -1,41 +0,0 @@
---
title: Packages
weight: 2
---
## Debian
- Trust Vitastor package signing key:
`wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
- Add Vitastor package repository to your /etc/apt/sources.list:
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
- For Debian 10 (Buster) also enable backports repository:
`deb http://deb.debian.org/debian buster-backports main`
- Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu`
## CentOS
- Add Vitastor package repository:
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm`
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm`
- Enable EPEL: `yum/dnf install epel-release`
- Enable additional CentOS repositories:
- CentOS 7: `yum install centos-release-scl`
- CentOS 8: `dnf install centos-release-advanced-virtualization`
- Enable elrepo-kernel:
- CentOS 7: `yum install https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm`
- CentOS 8: `dnf install https://www.elrepo.org/elrepo-release-8.el8.elrepo.noarch.rpm`
- Install packages: `yum/dnf install vitastor lpsolve etcd kernel-ml qemu-kvm`
## Installation requirements
- Linux kernel 5.4 or newer, for io_uring support. 5.8 or later is highly
recommended because io_uring is a relatively new technology and there is
at least one bug which reproduces with io_uring and HP SmartArray
controllers in 5.4
- liburing 0.4 or newer
- lp_solve
- etcd 3.4.15 or newer. Earlier versions won't work because of various bugs,
for example [#12402](https://github.com/etcd-io/etcd/pull/12402).
- node.js 10 or newer

View File

@@ -1,72 +0,0 @@
---
title: Quick Start
weight: 1
---
Prepare:
- Get some SATA or NVMe SSDs with capacitors (server-grade drives). You can use desktop SSDs
with lazy fsync, but prepare for inferior single-thread latency. Read more about capacitors
[here]({{< ref "config/layout-cluster#immediate_commit" >}}).
- Get a fast network (at least 10 Gbit/s). Something like Mellanox ConnectX-4 with RoCEv2 is ideal.
- Disable CPU powersaving: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
- [Install Vitastor packages]({{< ref "installation/packages" >}}).
## Configure monitors
On the monitor hosts:
- Edit variables at the top of `/usr/lib/vitastor/mon/make-units.sh` to desired values.
- Create systemd units for the monitor and etcd: `/usr/lib/vitastor/mon/make-units.sh`
- Start etcd and monitors: `systemctl start etcd vitastor-mon`
## Configure OSDs
- Put etcd_address and osd_network into `/etc/vitastor/vitastor.conf`. Example:
```
{
"etcd_address": ["10.200.1.10:2379","10.200.1.11:2379","10.200.1.12:2379"],
"osd_network": "10.200.1.0/24"
}
```
- Initialize OSDs:
- Simplest, SSD-only: `/usr/lib/vitastor/mon/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
- Hybrid, HDD+SSD: `/usr/lib/vitastor/mon/make-osd-hybrid.js /dev/sda /dev/sdb ...` &mdash; pass all your
devices (HDD and SSD) to this script &mdash; it will partition disks and initialize journals on its own.
This script skips HDDs which are already partitioned so if you want to use non-empty disks for
Vitastor you should first wipe them with `wipefs -a`. SSDs with GPT partition table are not skipped,
but some free unpartitioned space must be available because the script creates new partitions for journals.
- You can change OSD configuration in units or in `vitastor.conf`.
Check [Configuration Reference]({{< ref "config" >}}) for parameter descriptions.
- `systemctl start vitastor.target` everywhere.
- If all your drives have capacitors, create global configuration in etcd: \
`etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`
## Create a pool
Create pool configuration in etcd:
```
etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool",
"scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'
```
For jerasure pools the configuration should look like the following:
```
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
"scheme":"jerasure","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}}'
```
After you do this, one of the monitors will configure PGs and OSDs will start them.
You can check PG states with `etcdctl --endpoints=... get --prefix /vitastor/pg/state`. All PGs should become 'active'.
## Create an image
Use vitastor-cli ([read CLI documentation here]({{< ref "usage/cli" >}})):
```
vitastor-cli create -s 10G testimg
```
After that, you can run benchmarks or start QEMU manually with this image.

View File

@@ -1,54 +0,0 @@
---
title: Building from Source
weight: 3
---
## Requirements
- gcc and g++ 8 or newer, clang 10 or newer, or other compiler with C++11 plus
designated initializers support from C++20
- CMake
- liburing, jerasure headers
## Basic instructions
Download source, for example using git: `git clone --recurse-submodules https://yourcmc.ru/git/vitalif/vitastor/`
Get `fio` source and symlink it into `<vitastor>/fio`. If you don't want to build fio engine,
you can disable it by passing `-DWITH_FIO=no` to cmake.
Build and install Vitastor:
```
cd vitastor
mkdir build
cd build
cmake .. && make -j8 install
```
## QEMU Driver
It's recommended to build the QEMU driver (qemu_driver.c) in-tree, as a part of
QEMU build process. To do that:
- Install vitastor client library headers (from source or from vitastor-client-dev package)
- Take a corresponding patch from `patches/qemu-*-vitastor.patch` and apply it to QEMU source
- Copy `src/qemu_driver.c` to QEMU source directory as `block/block-vitastor.c`
- Build QEMU as usual
But it is also possible to build it out-of-tree. To do that:
- Get QEMU source, begin to build it, stop the build and copy headers:
- `<qemu>/include` &rarr; `<vitastor>/qemu/include`
- Debian:
* Use qemu packages from the main repository
* `<qemu>/b/qemu/config-host.h` &rarr; `<vitastor>/qemu/b/qemu/config-host.h`
* `<qemu>/b/qemu/qapi` &rarr; `<vitastor>/qemu/b/qemu/qapi`
- CentOS 8:
* Use qemu packages from the Advanced-Virtualization repository. To enable it, run
`yum install centos-release-advanced-virtualization.noarch` and then `yum install qemu`
* `<qemu>/config-host.h` &rarr; `<vitastor>/qemu/b/qemu/config-host.h`
* For QEMU 3.0+: `<qemu>/qapi` &rarr; `<vitastor>/qemu/b/qemu/qapi`
* For QEMU 2.0+: `<qemu>/qapi-types.h` &rarr; `<vitastor>/qemu/b/qemu/qapi-types.h`
- `config-host.h` and `qapi` are required because they contain generated headers
- Configure Vitastor with `WITH_QEMU=yes` and, if you're on RHEL, also with `QEMU_PLUGINDIR=qemu-kvm`:
`cmake .. -DWITH_QEMU=yes`.
- After that, Vitastor will build `block-vitastor.so` during its build process.

View File

@@ -1,4 +0,0 @@
---
title: Introduction
weight: -1
---

View File

@@ -1,73 +0,0 @@
---
title: Architecture
weight: 3
---
For people familiar with Ceph, Vitastor is quite similar:
- Vitastor also has Pools, PGs, OSDs, Monitors, Failure Domains, Placement Tree:
- OSD (Object Storage Daemon) is a process that stores data and serves read/write requests.
- PG (Placement Group) is a container for data that (normally) shares the same replicas.
- Pool is a container for data that has the same redundancy scheme and placement rules.
- Monitor is a separate daemon that watches cluster state and controls data distribution.
- Failure Domain is a group of OSDs that you allow to fail. It's "host" by default.
- Placement Tree groups OSDs in a hierarchy to later split them into Failure Domains.
- Vitastor also distributes every image data across the whole cluster.
- Vitastor is also transactional (every write to the cluster is atomic).
- OSDs also have journal and metadata and they can also be put on separate drives.
- Just like in Ceph, client library attempts to recover from any cluster failure so
you can basically reboot the whole cluster and only pause, but not crash, your clients
(please report a bug if the client crashes in that case).
However, there are also differences:
- Vitastor's main focus is on SSDs. Hybrid SSD+HDD setups are also possible.
- Vitastor OSD is (and will always be) single-threaded. If you want to dedicate more than 1 core
per drive you should run multiple OSDs each on a different partition of the drive.
Vitastor isn't CPU-hungry though (as opposed to Ceph), so 1 core is sufficient in a lot of cases.
- Metadata and journal are always kept in memory. Metadata size depends linearly on drive capacity
and data store block size which is 128 KB by default. With 128 KB blocks metadata should occupy
around 512 MB per 1 TB (which is still less than Ceph wants). Journal doesn't have to be big,
the example test below was conducted with only 16 MB journal. A big journal is probably even
harmful as dirty write metadata also take some memory.
- Vitastor storage layer doesn't have internal copy-on-write or redirect-write. I know that maybe
it's possible to create a good copy-on-write storage, but it's much harder and makes performance
less deterministic, so CoW isn't used in Vitastor.
- The basic layer of Vitastor is block storage with fixed-size blocks, not object storage with
rich semantics like in Ceph (RADOS).
- There's a "lazy fsync" mode which allows to batch writes before flushing them to the disk.
This allows to use Vitastor with desktop SSDs, but still lowers performance due to additional
network roundtrips, so use server SSDs with capacitor-based power loss protection
("Advanced Power Loss Protection") for best performance.
- PGs are ephemeral. This means that they aren't stored on data disks and only exist in memory
while OSDs are running.
- Recovery process is per-object (per-block), not per-PG. Also there are no PGLOGs.
- Monitors don't store data. Cluster configuration and state is stored in etcd in simple human-readable
JSON structures. Monitors only watch cluster state and handle data movement.
Thus Vitastor's Monitor isn't a critical component of the system and is more similar to Ceph's Manager.
Vitastor's Monitor is implemented in node.js.
- PG distribution isn't based on consistent hashes. All PG mappings are stored in etcd.
Rebalancing PGs between OSDs is done by mathematical optimization - data distribution problem
is reduced to a linear programming problem and solved by lp_solve. This allows for almost
perfect (96-99% uniformity compared to Ceph's 80-90%) data distribution in most cases, ability
to map PGs by hand without breaking rebalancing logic, reduced OSD peer-to-peer communication
(on average, OSDs have fewer peers) and less data movement. It also probably has a drawback -
this method may fail in very large clusters, but up to several hundreds of OSDs it's perfectly fine.
It's also easy to add consistent hashes in the future if something proves their necessity.
- There's no separate CRUSH layer. You select pool redundancy scheme, placement root, failure domain
and so on directly in pool configuration.
- Images are global i.e. you can't create multiple images with the same name in different pools.
## Implementation Principles
- I like architecturally simple solutions. Vitastor is and will always be designed
exactly like that.
- I also like reinventing the wheel to some extent, like writing my own HTTP client
for etcd interaction instead of using prebuilt libraries, because in this case
I'm confident about what my code does and what it doesn't do.
- I don't care about C++ "best practices" like RAII or proper inheritance or usage of
smart pointers or whatever and I don't intend to change my mind, so if you're here
looking for ideal reference C++ code, this probably isn't the right place.
- I like node.js better than any other dynamically-typed language interpreter
because it's faster than any other interpreter in the world, has neutral C-like
syntax and built-in event loop. That's why Monitor is implemented in node.js.

View File

@@ -1,34 +0,0 @@
---
title: Author and License
weight: 3
---
Copyright (c) Vitaliy Filippov (vitalif [at] yourcmc.ru), 2019+
Join Vitastor Telegram Chat: https://t.me/vitastor
All server-side code (OSD, Monitor and so on) is licensed under the terms of
Vitastor Network Public License 1.1 (VNPL 1.1), a copyleft license based on
GNU GPLv3.0 with the additional "Network Interaction" clause which requires
opensourcing all programs directly or indirectly interacting with Vitastor
through a computer network and expressly designed to be used in conjunction
with it ("Proxy Programs"). Proxy Programs may be made public not only under
the terms of the same license, but also under the terms of any GPL-Compatible
Free Software License, as listed by the Free Software Foundation.
This is a stricter copyleft license than the Affero GPL.
Please note that VNPL doesn't require you to open the code of proprietary
software running inside a VM if it's not specially designed to be used with
Vitastor.
Basically, you can't use the software in a proprietary environment to provide
its functionality to users without opensourcing all intermediary components
standing between the user and Vitastor or purchasing a commercial license
from the author 😀.
Client libraries (cluster_client and so on) are dual-licensed under the same
VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
software like QEMU and fio.
You can find the full text of VNPL-1.1 in the file [VNPL-1.1.txt](VNPL-1.1.txt).
GPL 2.0 is also included in this repository as [GPL-2.0.txt](GPL-2.0.txt).

View File

@@ -1,60 +0,0 @@
---
title: Features
weight: 1
---
Vitastor is currently a pre-release and it still misses some important features.
However, the following is implemented:
- Basic part: highly-available block storage with symmetric clustering and no SPOF
- Performance ;-D
- Multiple redundancy schemes: Replication, XOR n+1, Reed-Solomon erasure codes
based on jerasure library with any number of data and parity drives in a group
- Configuration via simple JSON data structures in etcd (parameters, pools and images)
- Automatic data distribution over OSDs, with support for:
- Mathematical optimization for better uniformity and less data movement
- Multiple pools
- Placement tree, OSD selection by tags (device classes) and placement root
- Configurable failure domains
- Recovery of degraded blocks
- Rebalancing (data movement between OSDs)
- Lazy fsync support
- Per-OSD and per-image I/O and space usage statistics in etcd
- Snapshots and copy-on-write image clones
- Write throttling to smooth random write workloads in SSD+HDD configurations
- RDMA/RoCEv2 support via libibverbs
CLI (vitastor-cli):
- Pool listing and space stats (df)
- Image listing, space and I/O stats (ls)
- Image and snapshot creation (create, modify)
- Image removal and snapshot merge (rm, flatten, merge, rm-data)
Plugins and packaging:
- Debian and CentOS packages
- Generic user-space client library
- Native QEMU driver
- Loadable fio engine for benchmarks
- NBD proxy for kernel mounts
- CSI plugin for Kubernetes
- OpenStack support: Cinder driver, Nova and libvirt patches
- Proxmox storage plugin and packages
## Roadmap
The following features are planned for the future:
- Better OSD creation and auto-start tools
- Other administrative tools
- Web GUI
- OpenNebula plugin
- iSCSI proxy
- Simplified NFS proxy
- Multi-threaded client
- Faster failover
- Scrubbing without checksums (verification of replicas)
- Checksums
- Tiered storage (SSD caching)
- NVDIMM support
- Compression (possibly)
- Read caching using system page cache (possibly)

View File

@@ -1,93 +0,0 @@
---
title: Example Comparison with Ceph
weight: 4
---
Hardware configuration: 4 nodes, each with:
- 6x SATA SSD Intel D3-S4510 3.84 TB
- 2x Xeon Gold 6242 (16 cores @ 2.8 GHz)
- 384 GB RAM
- 1x 25 GbE network interface (Mellanox ConnectX-4 LX), connected to a Juniper QFX5200 switch
CPU powersaving was disabled. Both Vitastor and Ceph were configured with 2 OSDs per 1 SSD.
All of the results below apply to 4 KB blocks and random access (unless indicated otherwise).
T8Q64 tests were conducted over 8 400GB RBD images from all hosts (every host was running 2 instances of fio).
This is because Ceph has performance penalties related to running multiple clients over a single RBD image.
cephx_sign_messages was set to false during tests, RocksDB and Bluestore settings were left at defaults.
T8Q64 read test was conducted over 1 larger inode (3.2T) from all hosts (every host was running 2 instances of fio).
Vitastor has no performance penalties related to running multiple clients over a single inode.
If conducted from one node with all primary OSDs moved to other nodes the result was slightly lower (689000 iops),
this is because all operations resulted in network roundtrips between the client and the primary OSD.
When fio was colocated with OSDs (like in Ceph benchmarks above), 1/4 of the read workload actually
used the loopback network.
Vitastor was configured with: `--disable_data_fsync true --immediate_commit all --flusher_count 8
--disk_alignment 4096 --journal_block_size 4096 --meta_block_size 4096
--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024
--journal_size 16777216`.
## Raw drive performance
- T1Q1 write ~27000 iops (~0.037ms latency)
- T1Q1 read ~9800 iops (~0.101ms latency)
- T1Q32 write ~60000 iops
- T1Q32 read ~81700 iops
## 2 replicas
### Ceph 15.2.4 (Bluestore)
- T1Q1 write ~1000 iops (~1ms latency)
- T1Q1 read ~1750 iops (~0.57ms latency)
- T8Q64 write ~100000 iops, total CPU usage by OSDs about 40 virtual cores on each node
- T8Q64 read ~480000 iops, total CPU usage by OSDs about 40 virtual cores on each node
In fact, not that bad for Ceph. These servers are an example of well-balanced Ceph nodes.
However, CPU usage and I/O latency were through the roof, as usual.
### Vitastor 0.4.0 (native)
- T1Q1 write: 7087 iops (0.14ms latency)
- T1Q1 read: 6838 iops (0.145ms latency)
- T2Q64 write: 162000 iops, total CPU usage by OSDs about 3 virtual cores on each node
- T8Q64 read: 895000 iops, total CPU usage by OSDs about 4 virtual cores on each node
- Linear write (4M T1Q32): 2800 MB/s
- Linear read (4M T1Q32): 1500 MB/s
### Vitastor 0.4.0 (NBD)
NBD is currently required to mount Vitastor via kernel, but it imposes additional overhead
due to additional copying between the kernel and userspace. This mostly hurts linear
bandwidth, not iops.
Vitastor with single-threaded NBD on the same hardware:
- T1Q1 write: 6000 iops (0.166ms latency)
- T1Q1 read: 5518 iops (0.18ms latency)
- T1Q128 write: 94400 iops
- T1Q128 read: 103000 iops
- Linear write (4M T1Q128): 1266 MB/s (compared to 2800 MB/s via fio)
- Linear read (4M T1Q128): 975 MB/s (compared to 1500 MB/s via fio)
## EC/XOR 2+1
### Ceph 15.2.4
- T1Q1 write: 730 iops (~1.37ms latency)
- T1Q1 read: 1500 iops with cold cache (~0.66ms latency), 2300 iops after 2 minute metadata cache warmup (~0.435ms latency)
- T4Q128 write (4 RBD images): 45300 iops, total CPU usage by OSDs about 30 virtual cores on each node
- T8Q64 read (4 RBD images): 278600 iops, total CPU usage by OSDs about 40 virtual cores on each node
- Linear write (4M T1Q32): 1950 MB/s before preallocation, 2500 MB/s after preallocation
- Linear read (4M T1Q32): 2400 MB/s
### Vitastor 0.4.0
- T1Q1 write: 2808 iops (~0.355ms latency)
- T1Q1 read: 6190 iops (~0.16ms latency)
- T2Q64 write: 85500 iops, total CPU usage by OSDs about 3.4 virtual cores on each node
- T8Q64 read: 812000 iops, total CPU usage by OSDs about 4.7 virtual cores on each node
- Linear write (4M T1Q32): 3200 MB/s
- Linear read (4M T1Q32): 1800 MB/s

View File

@@ -1,46 +0,0 @@
---
title: Vitastor's Theoretical Maximum Performance
weight: 3
---
Replicated setups:
- Single-threaded (T1Q1) read latency: 1 network roundtrip + 1 disk read.
- Single-threaded write+fsync latency:
- With immediate commit: 2 network roundtrips + 1 disk write.
- With lazy commit: 4 network roundtrips + 1 disk write + 1 disk flush.
- Saturated parallel read iops: min(network bandwidth, sum(disk read iops)).
- Saturated parallel write iops: min(network bandwidth, sum(disk write iops / number of replicas / write amplification)).
EC/XOR setups:
- Single-threaded (T1Q1) read latency: 1.5 network roundtrips + 1 disk read.
- Single-threaded write+fsync latency:
- With immediate commit: 3.5 network roundtrips + 1 disk read + 2 disk writes.
- With lazy commit: 5.5 network roundtrips + 1 disk read + 2 disk writes + 2 disk fsyncs.
- 0.5 is actually (k-1)/k, which means that an additional roundtrip doesn't happen when
the read sub-operation can be served locally.
- Saturated parallel read iops: min(network bandwidth, sum(disk read iops)).
- Saturated parallel write iops: min(network bandwidth, sum(disk write iops * number of data drives / (number of data + parity drives) / write amplification)).
In fact, you should put disk write iops under the condition of ~10% reads / ~90% writes in this formula.
Write amplification for 4 KB blocks is usually 3-5 in Vitastor:
1. Journal block write
2. Journal data write
3. Metadata block write
4. Another journal block write for EC/XOR setups
5. Data block write
If you manage to get an SSD which handles 512 byte blocks well (Optane?) you may
lower 1, 3 and 4 to 512 bytes (1/8 of data size) and get WA as low as 2.375.
Lazy fsync also reduces WA for parallel workloads because journal blocks are only
written when they fill up or fsync is requested.
## In Practice
In practice, using tests from [Understanding Performance]({{< ref "performance/understanding" >}})
and good server-grade SSD/NVMe drives, you should aim for:
- At least 5000 T1Q1 replicated read and write iops (maximum 0.2ms latency)
- At least ~80k parallel read iops or ~30k write iops per 1 core (1 OSD)
- Disk-speed or wire-speed linear reads and writes, whichever is the bottleneck in your case
If your results are lower, that may mean you have bad drives, bad network or some kind of misconfiguration.

View File

@@ -1,6 +0,0 @@
---
title: Tuning
weight: 2
---
- Disable CPU powersaving

View File

@@ -1,52 +0,0 @@
---
title: Understanding Storage Performance
weight: 1
---
The most important thing for fast storage is latency, not parallel iops.
The best possible latency is achieved with one thread and queue depth of 1 which basically means
"client load as low as possible". In this case IOPS = 1/latency, and this number doesn't
scale with number of servers, drives, server processes or threads and so on.
Single-threaded IOPS and latency numbers only depend on *how fast a single daemon is*.
Why is it important? It's important because some of the applications *can't* use
queue depth greater than 1 because their task isn't parallelizable. A notable example
is any ACID DBMS because all of them write their WALs sequentially with fsync()s.
fsync, by the way, is another important thing often missing in benchmarks. The point is
that drives have cache buffers and don't guarantee that your data is actually persisted
until you call fsync() which is translated to a FLUSH CACHE command by the OS.
Desktop SSDs are very fast without fsync - NVMes, for example, can process ~80000 write
operations per second with queue depth of 1 without fsync - but they're really slow with
fsync because they have to actually write data to flash chips when you call fsync. Typical
number is around 1000-2000 iops with fsync.
Server SSDs often have supercapacitors that act as a built-in UPS and allow the drive
to flush its DRAM cache to the persistent flash storage when a power loss occurs.
This makes them perform equally well with and without fsync. This feature is called
"Advanced Power Loss Protection" by Intel; other vendors either use a similar name
or call it "Full Capacitor-Based Power Loss Protection" directly.
All software-defined storages that I currently know are slow in terms of latency.
Notable examples are Ceph and internal SDSes used by cloud providers like Amazon, Google,
Yandex and so on. They're all slow and can only reach ~0.3ms read and ~0.6ms 4 KB write latency
with best-in-slot hardware.
And that's in the SSD era when you can buy an SSD that has ~0.04ms latency for 100 $.
I use the following 6 commands with small variations to benchmark any storage:
- Linear write:
`fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4M -iodepth=32 -rw=write -runtime=60 -filename=/dev/sdX`
- Linear read:
`fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4M -iodepth=32 -rw=read -runtime=60 -filename=/dev/sdX`
- Random write latency (T1Q1, this hurts storages the most):
`fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=1 -fsync=1 -rw=randwrite -runtime=60 -filename=/dev/sdX`
- Random read latency (T1Q1):
`fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=1 -rw=randread -runtime=60 -filename=/dev/sdX`
- Parallel write iops (use numjobs if a single CPU core is insufficient to saturate the load):
`fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=128 [-numjobs=4 -group_reporting] -rw=randwrite -runtime=60 -filename=/dev/sdX`
- Parallel read iops (use numjobs if a single CPU core is insufficient to saturate the load):
`fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=128 [-numjobs=4 -group_reporting] -rw=randread -runtime=60 -filename=/dev/sdX`

View File

@@ -1,183 +0,0 @@
---
title: Vitastor CLI
weight: 1
---
vitastor-cli is a command-line tool for administrative tasks like image management.
It supports the following commands:
{{< toc >}}
Global options:
```
--etcd_address ADDR Etcd connection address
--iodepth N Send N operations in parallel to each OSD when possible (default 32)
--parallel_osds M Work with M osds in parallel when possible (default 4)
--progress 1|0 Report progress (default 1)
--cas 1|0 Use online CAS writes when possible (default auto)
--no-color Disable colored output
--json JSON output
```
## status
`vitastor-cli status`
Show cluster status.
Example output:
```
cluster:
etcd: 1 / 1 up, 1.8 M database size
mon: 1 up, master stump
osd: 8 / 12 up
data:
raw: 498.5 G used, 301.2 G / 799.7 G available, 399.8 G down
state: 156.6 G clean, 97.6 G misplaced
pools: 2 / 3 active
pgs: 30 active
34 active+has_misplaced
32 offline
io:
client: 0 B/s rd, 0 op/s rd, 0 B/s wr, 0 op/s wr
rebalance: 989.8 M/s, 7.9 K op/s
```
## df
`vitastor-cli df`
Show pool space statistics.
Example output:
```
NAME SCHEME PGS TOTAL USED AVAILABLE USED% EFFICIENCY
testpool 2/1 32 100 G 34.2 G 60.7 G 39.23% 100%
size1 1/1 32 199.9 G 10 G 121.5 G 39.23% 100%
kaveri 2/1 32 0 B 10 G 0 B 100% 0%
```
In the example above, the "kaveri" pool has "zero" efficiency because all its OSDs are down.
## ls
`vitastor-cli ls [-l] [-p POOL] [--sort FIELD] [-r] [-n N] [<glob> ...]`
List images (only matching `<glob>` pattern(s) if passed).
Options:
```
-p|--pool POOL Filter images by pool ID or name
-l|--long Also report allocated size and I/O statistics
--del Also include delete operation statistics
--sort FIELD Sort by specified field (name, size, used_size, <read|write|delete>_<iops|bps|lat|queue>)
-r|--reverse Sort in descending order
-n|--count N Only list first N items
```
Example output:
```
NAME POOL SIZE USED READ IOPS QUEUE LAT WRITE IOPS QUEUE LAT FLAGS PARENT
debian9 testpool 20 G 12.3 G 0 B/s 0 0 0 us 0 B/s 0 0 0 us RO
pve/vm-100-disk-0 testpool 20 G 0 B 0 B/s 0 0 0 us 0 B/s 0 0 0 us - debian9
pve/base-101-disk-0 testpool 20 G 0 B 0 B/s 0 0 0 us 0 B/s 0 0 0 us RO debian9
pve/vm-102-disk-0 testpool 32 G 36.4 M 0 B/s 0 0 0 us 0 B/s 0 0 0 us - pve/base-101-disk-0
debian9-test testpool 20 G 36.6 M 0 B/s 0 0 0 us 0 B/s 0 0 0 us - debian9
bench testpool 10 G 10 G 0 B/s 0 0 0 us 0 B/s 0 0 0 us -
bench-kaveri kaveri 10 G 10 G 0 B/s 0 0 0 us 0 B/s 0 0 0 us -
```
## create
`vitastor-cli create -s|--size <size> [-p|--pool <id|name>] [--parent <parent_name>[@<snapshot>]] <name>`
Create an image. You may use K/M/G/T suffixes for `<size>`. If `--parent` is specified,
a copy-on-write image clone is created. Parent must be a snapshot (readonly image).
Pool must be specified if there is more than one pool.
```
vitastor-cli create --snapshot <snapshot> [-p|--pool <id|name>] <image>
vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
```
Create a snapshot of image `<image>` (either form can be used). May be used live if only a single writer is active.
## modify
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
Rename, resize image or change its readonly status. Images with children can't be made read-write.
If the new size is smaller than the old size, extra data will be purged.
You should resize file system in the image, if present, before shrinking it.
```
-f|--force Proceed with shrinking or setting readwrite flag even if the image has children.
```
## rm
`vitastor-cli rm <from> [<to>] [--writers-stopped]`
Remove `<from>` or all layers between `<from>` and `<to>` (`<to>` must be a child of `<from>`),
rebasing all their children accordingly. --writers-stopped allows merging to be a bit
more effective in case of a single 'slim' read-write child and 'fat' removed parent:
the child is merged into parent and parent is renamed to child in that case.
In other cases parent layers are always merged into children.
## flatten
`vitastor-cli flatten <layer>`
Flatten a layer, i.e. merge data and detach it from parents.
## rm-data
`vitastor-cli rm-data --pool <pool> --inode <inode> [--wait-list] [--min-offset <offset>]`
Remove inode data without changing metadata.
```
--wait-list Retrieve full objects listings before starting to remove objects.
Requires more memory, but allows to show correct removal progress.
--min-offset Purge only data starting with specified offset.
```
## merge-data
`vitastor-cli merge-data <from> <to> [--target <target>]`
Merge layer data without changing metadata. Merge `<from>`..`<to>` to `<target>`.
`<to>` must be a child of `<from>` and `<target>` may be one of the layers between
`<from>` and `<to>`, including `<from>` and `<to>`.
## alloc-osd
`vitastor-cli alloc-osd`
Allocate a new OSD number and reserve it by creating empty `/osd/stats/<n>` key.
## simple-offsets
`vitastor-cli simple-offsets <device>`
Calculate offsets for simple&stupid (no superblock) OSD deployment.
Options:
```
--object_size 128k Set blockstore block size
--bitmap_granularity 4k Set bitmap granularity
--journal_size 16M Set journal size
--device_block_size 4k Set device block size
--journal_offset 0 Set journal offset
--device_size 0 Set device size
--format text Result format: json, options, env, or text
```

View File

@@ -1,20 +0,0 @@
---
title: NBD
weight: 6
---
To create a local block device for a Vitastor image, use NBD. For example:
```
vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
```
It will output the device name, like /dev/nbd0, which you can then format and mount as a normal block device.
You can also use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.
To unmap the device run:
```
vitastor-nbd unmap /dev/nbd0
```

View File

@@ -1,39 +0,0 @@
---
title: QEMU and qemu-img
weight: 2
---
You need a patched QEMU version to use the Vitastor driver.
To start a VM using plain QEMU command-line with Vitastor disk, use the following commands:
Old syntax (-drive):
```
qemu-system-x86_64 -enable-kvm -m 1024 \
-drive 'file=vitastor:etcd_host=192.168.7.2\:2379/v3:image=debian9',
format=raw,if=none,id=drive-virtio-disk0,cache=none \
-device 'virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,
id=virtio-disk0,bootindex=1,write-cache=off' \
-vnc 0.0.0.0:0
```
New syntax (-blockdev):
```
qemu-system-x86_64 -enable-kvm -m 1024 \
-blockdev '{"node-name":"drive-virtio-disk0","driver":"vitastor","image":"debian9",
"cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
-device 'virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,
id=virtio-disk0,bootindex=1,write-cache=off' \
-vnc 0.0.0.0:0
```
For qemu-img, you should use `vitastor:etcd_host=<HOST>:image=<IMAGE>` as filename. For example:
```
qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=debian10'
```
You can also specify `:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>`
if you don't want to use inode metadata.

View File

@@ -1,37 +0,0 @@
---
nav_navigation: Навигация
nav_tags: Теги
nav_more: Подробнее
nav_top: К началу
form_placeholder_search: Поиск
error_page_title: Открыта несуществующая страница
error_message_title: Потерялись?
error_message_code: Ошибка 404
error_message_text: >
Похоже, страница, которую вы открыли, не существует. Попробуйте найти
нужную информацию с <a class="gdoc-error__link" href="{{ . }}">главной страницы</a>.
button_toggle_dark: Переключить тёмный/светлый/авто режим
button_nav_open: Показать навигацию
button_nav_close: Скрыть навигацию
button_menu_open: Открыть меню
button_menu_close: Закрыть меню
button_homepage: На главную
title_anchor_prefix: "Ссылка на:"
posts_read_more: Читать подробнее
posts_read_time:
one: "Одна минута на чтение"
other: "{{ . }} минут(ы) на чтение"
posts_update_prefix: Обновлено
footer_build_with: >
Сделано на <a href="https://gohugo.io/" class="gdoc-footer__link">Hugo</a> с
<svg class="icon gdoc_heart"><use xlink:href="#gdoc_heart"></use></svg>
footer_legal_notice: Правовая информация
footer_privacy_policy: Приватность
language_switch_no_tranlation_prefix: "Страница не переведена:"

View File

@@ -1,34 +0,0 @@
<!--
Site footer partial.
Left:   static copyright notice.
Middle: links to the legal notice and privacy policy pages; each link is
        rendered only when the matching site parameter
        (GeekdocLegalNotice / GeekdocPrivacyPolicy) is set.
Right:  the translated "built with" note and a back-to-top link; this section
        is rendered unless the GeekdocBackToTop site parameter is set to a
        false value (it defaults to true).
-->
<footer class="gdoc-footer">
<div class="container flex">
<div class="flex flex-wrap" style="flex: 1">
<span class="gdoc-footer__item gdoc-footer__item--row">
&copy; Vitaliy Filippov, 2021+
</span>
</div>
<div class="flex flex-wrap">
{{ with .Site.Params.GeekdocLegalNotice }}
<span class="gdoc-footer__item gdoc-footer__item--row">
<a href="{{ . | relURL }}" class="gdoc-footer__link">{{ i18n "footer_legal_notice" }}</a>
</span>
{{ end }}
{{ with .Site.Params.GeekdocPrivacyPolicy }}
<span class="gdoc-footer__item gdoc-footer__item--row">
<a href="{{ . | relURL }}" class="gdoc-footer__link">{{ i18n "footer_privacy_policy" }}</a>
</span>
{{ end }}
</div>
{{ if (default true .Site.Params.GeekdocBackToTop) }}
<div class="flex flex-25 justify-end">
<span class="gdoc-footer__item gdoc-footer__item--row" style="margin-right: 50px">
{{ i18n "footer_build_with" | safeHTML }}
</span>
<span class="gdoc-footer__item">
<a class="gdoc-footer__link fake-link" href="#" aria-label="{{ i18n "nav_top" }}">
<svg class="icon gdoc_keyboard_arrow_up"><use xlink:href="#gdoc_keyboard_arrow_up"></use></svg>
<span class="hidden-mobile">{{ i18n "nav_top" }}</span>
</a>
</span>
</div>
{{ end }}
</div>
</footer>

View File

@@ -1,215 +0,0 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg
xmlns:osb="http://www.openswatchbook.org/uri/2009/osb"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
sodipodi:docname="logo_only2.svg"
inkscape:version="1.0.2 (e86c870879, 2021-01-15)"
id="svg1340"
version="1.1"
viewBox="0 0 100 86.80192"
height="86.801918mm"
width="100mm"
inkscape:export-filename="/var/home/vitali/SVN/vitastor/presentation/logos/logo_only.png"
inkscape:export-xdpi="92.889999"
inkscape:export-ydpi="92.889999">
<defs
id="defs1334">
<linearGradient
osb:paint="gradient"
id="linearGradient866">
<stop
id="stop862"
offset="0"
style="stop-color:#c0c0c0;stop-opacity:1" />
<stop
id="stop864"
offset="1"
style="stop-color:#000000;stop-opacity:0" />
</linearGradient>
<linearGradient
id="linearGradient846"
osb:paint="gradient">
<stop
style="stop-color:#ffd42a;stop-opacity:1"
offset="0"
id="stop842" />
<stop
style="stop-color:#ffa200;stop-opacity:1"
offset="1"
id="stop844" />
</linearGradient>
<radialGradient
r="50"
fy="159.11139"
fx="202.36813"
cy="159.11139"
cx="202.36813"
gradientTransform="matrix(1.2462942,-1.2279529,0.77712408,0.78873143,-190.96813,230.1331)"
gradientUnits="userSpaceOnUse"
id="radialGradient1530"
xlink:href="#linearGradient1352"
inkscape:collect="always" />
<linearGradient
inkscape:collect="always"
id="linearGradient1352">
<stop
style="stop-color:#00c9e6;stop-opacity:1"
offset="0"
id="stop1348" />
<stop
style="stop-color:#5240d3;stop-opacity:1"
offset="1"
id="stop1350" />
</linearGradient>
<linearGradient
y2="62.555599"
x2="51.484566"
y1="62.555599"
x1="38.105473"
gradientTransform="rotate(-16.930773,271.11609,-412.42594)"
gradientUnits="userSpaceOnUse"
id="linearGradient1508"
xlink:href="#linearGradient1323"
inkscape:collect="always" />
<linearGradient
inkscape:collect="always"
id="linearGradient1323">
<stop
style="stop-color:#000000;stop-opacity:0.47178105"
offset="0"
id="stop1319" />
<stop
style="stop-color:#eeaaff;stop-opacity:0;"
offset="1"
id="stop1321" />
</linearGradient>
<radialGradient
r="21.541935"
fy="24.614815"
fx="45.312912"
cy="24.614815"
cx="45.312912"
gradientTransform="matrix(1.0933447,0.13113705,-0.12664108,1.0558599,-1.082187,93.974708)"
gradientUnits="userSpaceOnUse"
id="radialGradient1504"
xlink:href="#linearGradient846"
inkscape:collect="always" />
<filter
style="color-interpolation-filters:sRGB"
inkscape:label="Drop Shadow"
id="filter1497"
width="2"
height="2"
x="-0.5"
y="-0.5">
<feFlood
flood-opacity="0.498039"
flood-color="rgb(0,0,0)"
result="flood"
id="feFlood1487" />
<feComposite
in="flood"
in2="SourceGraphic"
operator="in"
result="composite1"
id="feComposite1489" />
<feGaussianBlur
in="composite1"
stdDeviation="6"
result="blur"
id="feGaussianBlur1491" />
<feOffset
dx="0"
dy="6"
result="offset"
id="feOffset1493" />
<feComposite
in="offset"
in2="offset"
operator="atop"
result="composite2"
id="feComposite1495" />
</filter>
<radialGradient
r="21.541935"
fy="24.614815"
fx="45.312912"
cy="24.614815"
cx="45.312912"
gradientTransform="matrix(1.0933447,0.13113705,-0.12664108,1.0558599,-1.082187,93.974708)"
gradientUnits="userSpaceOnUse"
id="radialGradient1506"
xlink:href="#linearGradient846"
inkscape:collect="always" />
</defs>
<sodipodi:namedview
inkscape:window-maximized="1"
inkscape:window-y="0"
inkscape:window-x="0"
inkscape:window-height="992"
inkscape:window-width="1920"
fit-margin-bottom="0"
fit-margin-right="0"
fit-margin-left="0"
fit-margin-top="-30"
showgrid="false"
inkscape:document-rotation="0"
inkscape:current-layer="layer1"
inkscape:document-units="mm"
inkscape:cy="47.914558"
inkscape:cx="-103.69646"
inkscape:zoom="0.7"
inkscape:pageshadow="2"
inkscape:pageopacity="1"
borderopacity="1.0"
bordercolor="#666666"
pagecolor="#000000"
id="base" />
<metadata
id="metadata1337">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title></dc:title>
</cc:Work>
</rdf:RDF>
</metadata>
<g
transform="translate(-133.26969,-52.101187)"
id="layer1"
inkscape:groupmode="layer"
inkscape:label="Слой 1">
<path
style="fill:url(#radialGradient1530);fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="m 133.26969,59.089473 50,75.000087 50,-75.000087 z"
id="path1528"
sodipodi:nodetypes="cccc" />
<path
d="m 194.29572,89.403603 -8.41706,2.562119 -2.50682,7.49308 7.17785,23.579008 9.60097,-14.40173 z"
style="fill:url(#linearGradient1508);fill-opacity:1;stroke-width:0;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.501961"
id="path1459" />
<g
transform="translate(135.70225,-49.385894)"
id="g1465">
<path
id="path1461"
style="fill:url(#radialGradient1504);fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;filter:url(#filter1497)"
d="m 28.817436,101.36529 c 3.112699,10.74423 6.225077,21.48892 9.333984,32.23438 2.519532,0 5.039063,0 7.558594,0 -0.985406,8.09729 -2.085815,16.18202 -2.951172,24.29297 -0.06053,0.88723 1.098131,1.61652 1.76,0.9155 1.007514,-1.05482 1.676008,-2.3829 2.528566,-3.56053 7.51538,-11.37722 14.987447,-22.78299 22.482919,-34.17333 -3.239584,0 -6.479167,0 -9.71875,0 2.887267,-6.79562 5.775365,-13.59088 8.662109,-20.38672 -13.284505,0 -26.56901,0 -39.853516,0 0.06576,0.22591 0.131511,0.45182 0.197266,0.67773 z" />
<path
sodipodi:nodetypes="cccccccc"
id="path1463"
d="m 30.735882,102.2764 h 35.342242 l -8.662729,20.3854 h 9.173783 l -22.106472,33.62346 3.027029,-24.27377 H 39.34604 Z"
style="fill:url(#radialGradient1506);fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
</g>
</g>
</svg>

Before

Width:  |  Height:  |  Size: 7.4 KiB

View File

@@ -1,138 +0,0 @@
/* Global customization */
/* Applies regardless of the selected color mode. */
:root {
--code-max-height: 60rem;
}
/* Light mode theming */
/* Palette used by default (plain :root) and when color-mode="light" is set
   explicitly on the root element. */
:root,
:root[color-mode="light"] {
--header-background: #404050;
--header-font-color: #ffffff;
--body-background: #ffffff;
--body-font-color: #343a40;
--button-background: #62cb97;
--button-border-color: #4ec58a;
--link-color: #c54e8a;
--link-color-visited: #c54e8a;
--code-background: #f5f6f8;
--code-accent-color: #e3e7eb;
--code-accent-color-lite: #eff1f3;
--accent-color: #e9ecef;
--accent-color-lite: #f8f9fa;
--control-icons: #b2bac1;
--footer-background: #606070;
--footer-font-color: #ffffff;
--footer-link-color: #ffcc5c;
--footer-link-color-visited: #ffcc5c;
}
/* Same palette, value for value, applied when the browser reports a light
   color-scheme preference. Keep it in sync with the block above. */
@media (prefers-color-scheme: light) {
:root {
--header-background: #404050;
--header-font-color: #ffffff;
--body-background: #ffffff;
--body-font-color: #343a40;
--button-background: #62cb97;
--button-border-color: #4ec58a;
--link-color: #c54e8a;
--link-color-visited: #c54e8a;
--code-background: #f5f6f8;
--code-accent-color: #e3e7eb;
--code-accent-color-lite: #eff1f3;
--accent-color: #e9ecef;
--accent-color-lite: #f8f9fa;
--control-icons: #b2bac1;
--footer-background: #606070;
--footer-font-color: #ffffff;
--footer-link-color: #ffcc5c;
--footer-link-color-visited: #ffcc5c;
}
}
/* Dark mode theming */
/* Palette used when color-mode="dark" is set explicitly on the root element. */
:root[color-mode="dark"] {
--header-background: #202830;
--header-font-color: #ffffff;
--body-background: #343a44;
--body-font-color: #ced3d8;
--button-background: #62cb97;
--button-border-color: #4ec58a;
--link-color: #7ac29e;
--link-color-visited: #7ac29e;
--code-background: #2f353a;
--code-accent-color: #262b2f;
--code-accent-color-lite: #2b3035;
--accent-color: #2b3035;
--accent-color-lite: #2f353a;
--control-icons: #b2bac1;
--footer-background: #2f333e;
--footer-font-color: #cccccc;
--footer-link-color: #7ac29e;
--footer-link-color-visited: #7ac29e;
}
/* Palette applied when the browser reports a dark color-scheme preference.
   NOTE(review): --header-background (#404070 here vs #202830 above) and
   --body-background (#343a40 here vs #343a44 above) differ from the explicit
   dark block, while every other value matches - confirm whether the
   difference is intentional. */
@media (prefers-color-scheme: dark) {
:root {
--header-background: #404070;
--header-font-color: #ffffff;
--body-background: #343a40;
--body-font-color: #ced3d8;
--button-background: #62cb97;
--button-border-color: #4ec58a;
--link-color: #7ac29e;
--link-color-visited: #7ac29e;
--code-background: #2f353a;
--code-accent-color: #262b2f;
--code-accent-color-lite: #2b3035;
--accent-color: #2b3035;
--accent-color-lite: #2f353a;
--control-icons: #b2bac1;
--footer-background: #2f333e;
--footer-font-color: #cccccc;
--footer-link-color: #7ac29e;
--footer-link-color-visited: #7ac29e;
}
}
/* Brand image: fixed 48px width with proportional height; the negative
   vertical margins pull it 4px beyond its box at the top and bottom. */
.gdoc-brand__img {
width: 48px;
height: auto;
margin-top: -4px;
margin-bottom: -4px;
}
/* Lay out the menu header's child span as a reversed horizontal flex row. */
.gdoc-menu-header > span {
display: flex;
flex-direction: row-reverse;
}
/* Space to the right of the language element. */
span.gdoc-language {
margin-right: 20px;
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 709 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.5 KiB

View File

@@ -1,196 +0,0 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg
xmlns:osb="http://www.openswatchbook.org/uri/2009/osb"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
sodipodi:docname="favicon.svg"
inkscape:version="1.0.2 (e86c870879, 2021-01-15)"
id="svg1340"
version="1.1"
viewBox="0 0 100 100"
height="100mm"
width="100mm"
inkscape:export-filename="/var/home/vitali/SVN/vitastor/docs/static/favicon/favicon-64x64.png"
inkscape:export-xdpi="16.26"
inkscape:export-ydpi="16.26">
<defs
id="defs1334">
<linearGradient
osb:paint="gradient"
id="linearGradient866">
<stop
id="stop862"
offset="0"
style="stop-color:#c0c0c0;stop-opacity:1" />
<stop
id="stop864"
offset="1"
style="stop-color:#000000;stop-opacity:0" />
</linearGradient>
<linearGradient
id="linearGradient846"
osb:paint="gradient">
<stop
style="stop-color:#ffd42a;stop-opacity:1"
offset="0"
id="stop842" />
<stop
style="stop-color:#ffa200;stop-opacity:1"
offset="1"
id="stop844" />
</linearGradient>
<radialGradient
r="50"
fy="159.11139"
fx="202.36813"
cy="159.11139"
cx="202.36813"
gradientTransform="matrix(1.2462942,-1.2279529,0.77712408,0.78873143,-190.96813,230.1331)"
gradientUnits="userSpaceOnUse"
id="radialGradient1530"
xlink:href="#linearGradient1352"
inkscape:collect="always" />
<linearGradient
inkscape:collect="always"
id="linearGradient1352">
<stop
style="stop-color:#00c9e6;stop-opacity:1"
offset="0"
id="stop1348" />
<stop
style="stop-color:#5240d3;stop-opacity:1"
offset="1"
id="stop1350" />
</linearGradient>
<linearGradient
y2="62.555599"
x2="51.484566"
y1="62.555599"
x1="38.105473"
gradientTransform="rotate(-16.930773,271.11609,-412.42594)"
gradientUnits="userSpaceOnUse"
id="linearGradient1508"
xlink:href="#linearGradient1323"
inkscape:collect="always" />
<linearGradient
inkscape:collect="always"
id="linearGradient1323">
<stop
style="stop-color:#000000;stop-opacity:0.47178105"
offset="0"
id="stop1319" />
<stop
style="stop-color:#eeaaff;stop-opacity:0;"
offset="1"
id="stop1321" />
</linearGradient>
<filter
style="color-interpolation-filters:sRGB"
inkscape:label="Drop Shadow"
id="filter1497"
width="2"
height="2"
x="-0.5"
y="-0.5">
<feFlood
flood-opacity="0.498039"
flood-color="rgb(0,0,0)"
result="flood"
id="feFlood1487" />
<feComposite
in="flood"
in2="SourceGraphic"
operator="in"
result="composite1"
id="feComposite1489" />
<feGaussianBlur
in="composite1"
stdDeviation="6"
result="blur"
id="feGaussianBlur1491" />
<feOffset
dx="0"
dy="6"
result="offset"
id="feOffset1493" />
<feComposite
in="offset"
in2="offset"
operator="atop"
result="composite2"
id="feComposite1495" />
</filter>
<radialGradient
r="21.541935"
fy="24.614815"
fx="45.312912"
cy="24.614815"
cx="45.312912"
gradientTransform="matrix(1.6678615,0.20004527,-0.19318681,1.6106796,108.48083,22.966962)"
gradientUnits="userSpaceOnUse"
id="radialGradient1506"
xlink:href="#linearGradient846"
inkscape:collect="always" />
</defs>
<sodipodi:namedview
inkscape:window-maximized="1"
inkscape:window-y="0"
inkscape:window-x="0"
inkscape:window-height="992"
inkscape:window-width="1920"
fit-margin-bottom="0"
fit-margin-right="0"
fit-margin-left="0"
fit-margin-top="0"
showgrid="false"
inkscape:document-rotation="0"
inkscape:current-layer="layer1"
inkscape:document-units="mm"
inkscape:cy="83.752268"
inkscape:cx="-103.69645"
inkscape:zoom="0.7"
inkscape:pageshadow="2"
inkscape:pageopacity="0"
borderopacity="1.0"
bordercolor="#666666"
pagecolor="#000000"
id="base" />
<metadata
id="metadata1337">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title></dc:title>
</cc:Work>
</rdf:RDF>
</metadata>
<g
transform="translate(-133.26969,-35.630924)"
id="layer1"
inkscape:groupmode="layer"
inkscape:label="Слой 1">
<path
style="fill:url(#radialGradient1530);fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="m 133.26969,59.089473 50,75.000087 50,-75.000087 z"
id="path1528"
sodipodi:nodetypes="cccc" />
<path
d="m 194.29572,89.403603 -8.41706,2.562119 -2.50682,7.49308 7.17785,23.579008 9.60097,-14.40173 z"
style="fill:url(#linearGradient1508);fill-opacity:1;stroke-width:0;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.501961"
id="path1459" />
<path
sodipodi:nodetypes="cccccccc"
id="path1463"
d="m 157.01826,35.630924 h 53.91343 l -13.21471,31.09726 h 13.99432 l -33.7227,51.291496 4.61762,-37.02885 h -12.45344 z"
style="fill:url(#radialGradient1506);fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
</g>
</svg>

Before

Width:  |  Height:  |  Size: 6.2 KiB

View File

@@ -1,6 +0,0 @@
---
title: Common Parameters
weight: 1
---
These are the most common parameters which apply to all components of Vitastor.

View File

@@ -1,6 +0,0 @@
---
title: Общие параметры
weight: 1
---
Это наиболее общие параметры, используемые всеми компонентами Vitastor.

View File

@@ -1,7 +0,0 @@
---
title: Cluster-Wide Disk Layout Parameters
weight: 2
---
These parameters apply to both clients and OSDs. They are fixed at the moment of
OSD drive initialization and can't be changed afterwards without losing data.

View File

@@ -1,7 +0,0 @@
---
title: Дисковые параметры уровня кластера
weight: 2
---
Данные параметры используются клиентами и OSD, задаются в момент инициализации
диска OSD и не могут быть изменены после этого без потери данных.

View File

@@ -1,7 +0,0 @@
---
title: OSD Disk Layout Parameters
weight: 3
---
These parameters apply to OSDs. They are fixed at the moment of OSD drive
initialization and can't be changed afterwards without losing data.

View File

@@ -1,8 +0,0 @@
---
title: Дисковые параметры OSD
weight: 3
---
Данные параметры используются только OSD и, так же как и общекластерные
дисковые параметры, задаются в момент инициализации дисков OSD и не могут быть
изменены после этого без потери данных.

View File

@@ -1,6 +0,0 @@
---
title: Monitor Parameters
weight: 6
---
These parameters only apply to Monitors.

View File

@@ -1,6 +0,0 @@
---
title: Параметры мониторов
weight: 6
---
Данные параметры используются только мониторами Vitastor.

View File

@@ -1,7 +0,0 @@
---
title: Network Protocol Parameters
weight: 4
---
These parameters apply to clients and OSDs and affect network connection logic
between clients, OSDs and etcd.

View File

@@ -1,7 +0,0 @@
---
title: Параметры сетевого протокола
weight: 4
---
Данные параметры используются клиентами и OSD и влияют на логику сетевого
взаимодействия между клиентами, OSD, а также etcd.

View File

@@ -1,7 +0,0 @@
---
title: Runtime OSD Parameters
weight: 5
---
These parameters apply only to OSDs. They are not fixed at the moment of OSD
drive initialization and can be changed by restarting the OSD.

View File

@@ -1,8 +0,0 @@
---
title: Изменяемые параметры OSD
weight: 5
---
Данные параметры используются только OSD, но, в отличие от дисковых параметров,
не фиксируются в момент инициализации дисков OSD и могут быть изменены в любой
момент с перезапуском OSD.

View File

@@ -248,8 +248,6 @@
row and slow down significantly (from 25000+ iops to ~3000 iops). When
this option is set, Vitastor will always move to the next sector of the
journal after writing it instead of possibly overwriting it the second time.
Most (99%) other SSDs don't need this option.
info_ru: |
Включайте данную опцию для SSD вроде Intel D3-S4510 и D3-S4610, которые
ОЧЕНЬ не любят, когда ПО перезаписывает один и тот же сектор несколько раз
@@ -258,8 +256,6 @@
данная опция установлена, Vitastor всегда переходит к следующему сектору
журнала после записи вместо потенциально повторной перезаписи того же
самого сектора.
Почти все другие SSD (99% моделей) не требуют данной опции.
- name: throttle_small_writes
type: bool
default: false

View File

@@ -160,8 +160,6 @@ const etcd_tree = {
root_node?: 'rack1',
// restrict pool to OSDs having all of these tags
osd_tags?: 'nvme' | [ 'nvme', ... ],
// prefer to put primary on OSD with these tags
primary_affinity_tags?: 'nvme' | [ 'nvme', ... ],
},
...
}, */
@@ -226,19 +224,15 @@ const etcd_tree = {
}, */
},
inodestats: {
/* <pool_id>: {
<inode_t>: {
read: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
write: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
delete: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
},
/* <inode_t>: {
read: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
write: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
delete: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
}, */
},
space: {
/* <osd_num_t>: {
<pool_id>: {
<inode_t>: uint64_t, // bytes
},
<inode_t>: uint64_t, // bytes
}, */
},
},
@@ -905,39 +899,27 @@ class Mon
return this.seed + 2147483648;
}
pick_primary(pool_id, osd_set, up_osds, aff_osds)
pick_primary(pool_id, osd_set, up_osds)
{
let alive_set;
if (this.state.config.pools[pool_id].scheme === 'replicated')
{
// Prefer "affinity" OSDs
alive_set = osd_set.filter(osd_num => osd_num && aff_osds[osd_num]);
if (!alive_set.length)
alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
}
alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
else
{
// Prefer data OSDs for EC because they can actually read something without an additional network hop
const pg_data_size = (this.state.config.pools[pool_id].pg_size||0) -
(this.state.config.pools[pool_id].parity_chunks||0);
alive_set = osd_set.slice(0, pg_data_size).filter(osd_num => osd_num && aff_osds[osd_num]);
alive_set = osd_set.slice(0, pg_data_size).filter(osd_num => osd_num && up_osds[osd_num]);
if (!alive_set.length)
alive_set = osd_set.filter(osd_num => osd_num && aff_osds[osd_num]);
if (!alive_set.length)
{
alive_set = osd_set.slice(0, pg_data_size).filter(osd_num => osd_num && up_osds[osd_num]);
if (!alive_set.length)
alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
}
alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
}
if (!alive_set.length)
return 0;
return alive_set[this.rng() % alive_set.length];
}
save_new_pgs_txn(request, pool_id, up_osds, osd_tree, prev_pgs, new_pgs, pg_history)
save_new_pgs_txn(request, pool_id, up_osds, prev_pgs, new_pgs, pg_history)
{
const aff_osds = this.get_affinity_osds(this.state.config.pools[pool_id], up_osds, osd_tree);
const pg_items = {};
this.reset_rng();
new_pgs.map((osd_set, i) =>
@@ -945,7 +927,7 @@ class Mon
osd_set = osd_set.map(osd_num => osd_num === LPOptimizer.NO_OSD ? 0 : osd_num);
pg_items[i+1] = {
osd_set,
primary: this.pick_primary(pool_id, osd_set, up_osds, aff_osds),
primary: this.pick_primary(pool_id, osd_set, up_osds),
};
if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' ') &&
prev_pgs[i].filter(osd_num => osd_num).length > 0)
@@ -1076,13 +1058,6 @@ class Mon
console.log('Pool '+pool_id+' has invalid osd_tags (must be a string or array of strings)');
return false;
}
if (pool_cfg.primary_affinity_tags && typeof(pool_cfg.primary_affinity_tags) != 'string' &&
(!(pool_cfg.primary_affinity_tags instanceof Array) || pool_cfg.primary_affinity_tags.filter(t => typeof t != 'string').length > 0))
{
if (warn)
console.log('Pool '+pool_id+' has invalid primary_affinity_tags (must be a string or array of strings)');
return false;
}
return true;
}
@@ -1112,17 +1087,6 @@ class Mon
}
}
get_affinity_osds(pool_cfg, up_osds, osd_tree)
{
let aff_osds = up_osds;
if (pool_cfg.primary_affinity_tags)
{
aff_osds = { ...up_osds };
this.filter_osds_by_tags(osd_tree, { x: aff_osds }, pool_cfg.primary_affinity_tags);
}
return aff_osds;
}
async recheck_pgs()
{
// Take configuration and state, check it against the stored configuration hash
@@ -1153,7 +1117,7 @@ class Mon
{
prev_pgs[pg-1] = this.state.config.pgs.items[pool_id][pg].osd_set;
}
this.save_new_pgs_txn(etcd_request, pool_id, up_osds, osd_tree, prev_pgs, [], []);
this.save_new_pgs_txn(etcd_request, pool_id, up_osds, prev_pgs, [], []);
}
}
for (const pool_id in this.state.config.pools)
@@ -1260,7 +1224,7 @@ class Mon
key: b64(this.etcd_prefix+'/pool/stats/'+pool_id),
value: b64(JSON.stringify(this.state.pool.stats[pool_id])),
} });
this.save_new_pgs_txn(etcd_request, pool_id, up_osds, osd_tree, real_prev_pgs, optimize_result.int_pgs, pg_history);
this.save_new_pgs_txn(etcd_request, pool_id, up_osds, real_prev_pgs, optimize_result.int_pgs, pg_history);
}
this.state.config.pgs.hash = tree_hash;
await this.save_pg_config(etcd_request);
@@ -1277,14 +1241,13 @@ class Mon
continue;
}
const replicated = pool_cfg.scheme === 'replicated';
const aff_osds = this.get_affinity_osds(pool_cfg, up_osds, osd_tree);
this.reset_rng();
for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++)
{
const pg_cfg = this.state.config.pgs.items[pool_id][pg_num];
if (pg_cfg)
{
const new_primary = this.pick_primary(pool_id, pg_cfg.osd_set, up_osds, aff_osds);
const new_primary = this.pick_primary(pool_id, pg_cfg.osd_set, up_osds);
if (pg_cfg.primary != new_primary)
{
console.log(

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VERSION = '0.6.17'
VERSION = '0.6.16'
LOG = logging.getLogger(__name__)

View File

@@ -25,4 +25,4 @@ rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-0.6.17/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.17$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-0.6.16/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.16$(rpm --eval '%dist').tar.gz *

View File

@@ -34,7 +34,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.6.17.el7.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.6.16.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.6.17
Version: 0.6.16
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.6.17.el7.tar.gz
Source0: vitastor-0.6.16.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
@@ -119,7 +119,6 @@ cp -r mon %buildroot/usr/lib/vitastor
%files -n vitastor-client
%_bindir/vitastor-nbd
%_bindir/vitastor-nfs
%_bindir/vitastor-cli
%_bindir/vitastor-rm
%_bindir/vita

View File

@@ -33,7 +33,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.6.17.el8.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.6.16.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.6.17
Version: 0.6.16
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.6.17.el8.tar.gz
Source0: vitastor-0.6.16.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
@@ -116,7 +116,6 @@ cp -r mon %buildroot/usr/lib/vitastor
%files -n vitastor-client
%_bindir/vitastor-nbd
%_bindir/vitastor-nfs
%_bindir/vitastor-cli
%_bindir/vitastor-rm
%_bindir/vita

View File

@@ -15,7 +15,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVERSION="0.6.17")
add_definitions(-DVERSION="0.6.16")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
if (${WITH_ASAN})
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
@@ -124,18 +124,6 @@ add_library(vitastor_client SHARED
cluster_client.cpp
cluster_client_list.cpp
vitastor_c.cpp
cli_common.cpp
cli_alloc_osd.cpp
cli_simple_offsets.cpp
cli_status.cpp
cli_df.cpp
cli_ls.cpp
cli_create.cpp
cli_modify.cpp
cli_flatten.cpp
cli_merge.cpp
cli_rm_data.cpp
cli_rm.cpp
)
set_target_properties(vitastor_client PROPERTIES PUBLIC_HEADER "vitastor_c.h")
target_link_libraries(vitastor_client
@@ -164,24 +152,10 @@ target_link_libraries(vitastor-nbd
vitastor_client
)
# vitastor-nfs
add_executable(vitastor-nfs
nfs_proxy.cpp
nfs_conn.cpp
nfs_portmap.cpp
sha256.c
nfs/xdr_impl.cpp
nfs/rpc_xdr.cpp
nfs/portmap_xdr.cpp
nfs/nfs_xdr.cpp
)
target_link_libraries(vitastor-nfs
vitastor_client
)
# vitastor-cli
add_executable(vitastor-cli
cli.cpp
cli.cpp cli_alloc_osd.cpp cli_simple_offsets.cpp cli_status.cpp cli_df.cpp
cli_ls.cpp cli_create.cpp cli_modify.cpp cli_flatten.cpp cli_merge.cpp cli_rm_data.cpp cli_rm.cpp
)
target_link_libraries(vitastor-cli
vitastor_client
@@ -270,7 +244,7 @@ target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mo
### Install
install(TARGETS vitastor-osd vitastor-dump-journal vitastor-nbd vitastor-nfs vitastor-cli RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
install(TARGETS vitastor-osd vitastor-dump-journal vitastor-nbd vitastor-cli RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vitastor-rm)
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vita)
install(

View File

@@ -25,7 +25,7 @@ allocator::allocator(uint64_t blocks)
size = free = blocks;
last_one_mask = (blocks % 64) == 0
? UINT64_MAX
: (((uint64_t)1 << (blocks % 64)) - 1);
: ((1l << (blocks % 64)) - 1);
for (uint64_t i = 0; i < total; i++)
{
mask[i] = 0;
@@ -79,7 +79,7 @@ void allocator::set(uint64_t addr, bool value)
}
if (value)
{
mask[last] = mask[last] | ((uint64_t)1 << bit);
mask[last] = mask[last] | (1l << bit);
if (mask[last] != (!is_last || cur_addr/64 < size/64
? UINT64_MAX : last_one_mask))
{
@@ -88,7 +88,7 @@ void allocator::set(uint64_t addr, bool value)
}
else
{
mask[last] = mask[last] & ~((uint64_t)1 << bit);
mask[last] = mask[last] & ~(1l << bit);
}
is_last = false;
if (p2 > 1)

View File

@@ -131,7 +131,6 @@ resume_1:
}
// Skip superblock
bs->meta_offset += bs->meta_block_size;
bs->meta_len -= bs->meta_block_size;
prev_done = 0;
done_len = 0;
done_pos = 0;

View File

@@ -2,7 +2,8 @@
// License: VNPL-1.1 (see README.md for details)
/**
* CLI tool and also a library for administrative tasks
* CLI tool
* Currently can (a) remove inodes and (b) merge snapshot/clone layers
*/
#include <vector>
@@ -16,9 +17,7 @@
static const char *exe_name = NULL;
static void help();
static json11::Json::object parse_args(int narg, const char *args[])
json11::Json::object cli_tool_t::parse_args(int narg, const char *args[])
{
json11::Json::object cfg;
json11::Json::array cmd;
@@ -80,7 +79,7 @@ static json11::Json::object parse_args(int narg, const char *args[])
return cfg;
}
static void help()
void cli_tool_t::help()
{
printf(
"Vitastor command-line tool\n"
@@ -165,171 +164,224 @@ static void help()
exit(0);
}
static int run(cli_tool_t *p, json11::Json::object cfg)
// Rewrite the inode configuration of layer <cur> in etcd so that its parent
// becomes <new_parent>; a zero <new_parent> is reported as "detached".
//
// The update is sent as an etcd transaction guarded by a comparison on the
// key's modification revision (MOD LESS mod_revision+1). If the guard fails,
// or etcd returns an error, or <cur> is not present in the locally known inode
// configuration, a message is written to stderr and the process exits with 1.
//
// <waiting> is incremented before the request is sent and decremented in the
// completion callback, which also wakes up the ring loop.
void cli_tool_t::change_parent(inode_t cur, inode_t new_parent)
{
    auto & inode_map = cli->st_cli.inode_config;
    auto found = inode_map.find(cur);
    if (found == inode_map.end())
    {
        fprintf(stderr, "Inode 0x%lx disappeared\n", cur);
        exit(1);
    }
    // Modify a copy of the known configuration, not the cached entry itself
    inode_config_t updated = found->second;
    std::string layer_name = updated.name;
    updated.parent_id = new_parent;
    // etcd key of the inode: <prefix>/config/inode/<pool>/<inode number>
    std::string etcd_key = base64_encode(
        cli->st_cli.etcd_prefix+"/config/inode/"+
        std::to_string(INODE_POOL(cur))+"/"+std::to_string(INODE_NO_POOL(cur))
    );
    json11::Json::object serialized = cli->st_cli.serialize_inode_cfg(&updated);
    // Guard: only write if the key was not modified after the revision we know
    json11::Json::object guard {
        { "target", "MOD" },
        { "key", etcd_key },
        { "result", "LESS" },
        { "mod_revision", updated.mod_revision+1 },
    };
    json11::Json::object put {
        { "request_put", json11::Json::object {
            { "key", etcd_key },
            { "value", base64_encode(json11::Json(serialized).dump()) },
        } }
    };
    json11::Json::object txn {
        { "compare", json11::Json::array { guard } },
        { "success", json11::Json::array { put } },
    };
    auto on_reply = [this, new_parent, cur, layer_name](std::string err, json11::Json res)
    {
        if (err != "")
        {
            fprintf(stderr, "Error changing parent of %s: %s\n", layer_name.c_str(), err.c_str());
            exit(1);
        }
        if (!res["succeeded"].bool_value())
        {
            // The revision guard did not hold
            fprintf(stderr, "Inode %s was modified during snapshot deletion\n", layer_name.c_str());
            exit(1);
        }
        if (!new_parent)
        {
            printf(
                "Parent of layer %s (inode %lu in pool %u) detached\n",
                layer_name.c_str(), INODE_NO_POOL(cur), INODE_POOL(cur)
            );
        }
        else
        {
            // Look the parent up only for printing; fall back to a placeholder name
            auto parent_it = cli->st_cli.inode_config.find(new_parent);
            std::string parent_name = "<unknown>";
            if (parent_it != cli->st_cli.inode_config.end())
            {
                parent_name = parent_it->second.name;
            }
            printf(
                "Parent of layer %s (inode %lu in pool %u) changed to %s (inode %lu in pool %u)\n",
                layer_name.c_str(), INODE_NO_POOL(cur), INODE_POOL(cur),
                parent_name.c_str(), INODE_NO_POOL(new_parent), INODE_POOL(new_parent)
            );
        }
        waiting--;
        ringloop->wakeup();
    };
    waiting++;
    cli->st_cli.etcd_txn_slow(txn, on_reply);
}
// Send <txn> to etcd and store the reply in <etcd_result>.
//
// <waiting> is incremented for the duration of the request. On error a message
// is printed to stderr and the process exits with 1 (the message text says
// "reading" whatever the transaction contains). On success the ring loop is
// woken up after the result is stored.
void cli_tool_t::etcd_txn(json11::Json txn)
{
    auto on_reply = [this](std::string err, json11::Json res)
    {
        // The request is no longer pending, whatever its outcome
        waiting--;
        if (!err.empty())
        {
            fprintf(stderr, "Error reading from etcd: %s\n", err.c_str());
            exit(1);
        }
        etcd_result = res;
        ringloop->wakeup();
    };
    waiting++;
    cli->st_cli.etcd_txn_slow(txn, on_reply);
}
// Find the inode configuration whose name equals <name> by scanning the
// locally known inode configurations in iteration order.
// Returns a pointer to the first matching entry inside the client's map.
// If nothing matches, prints a message to stderr and exits with status 1.
inode_config_t* cli_tool_t::get_inode_cfg(const std::string & name)
{
    auto & inode_map = cli->st_cli.inode_config;
    for (auto it = inode_map.begin(); it != inode_map.end(); ++it)
    {
        if (it->second.name != name)
        {
            continue;
        }
        return &it->second;
    }
    fprintf(stderr, "Layer %s not found\n", name.c_str());
    exit(1);
}
void cli_tool_t::run(json11::Json cfg)
{
cli_result_t result;
p->parse_config(cfg);
json11::Json::array cmd = cfg["command"].array_items();
cfg.erase("command");
std::function<bool(cli_result_t &)> action_cb;
if (!cmd.size())
{
result = { .err = EINVAL, .text = "command is missing" };
fprintf(stderr, "command is missing\n");
exit(1);
}
else if (cmd[0] == "status")
{
// Show cluster status
action_cb = p->start_status(cfg);
action_cb = start_status(cfg);
}
else if (cmd[0] == "df")
{
// Show pool space stats
action_cb = p->start_df(cfg);
action_cb = start_df(cfg);
}
else if (cmd[0] == "ls")
{
// List images
if (cmd.size() > 1)
{
cmd.erase(cmd.begin(), cmd.begin()+1);
cfg["names"] = cmd;
}
action_cb = p->start_ls(cfg);
action_cb = start_ls(cfg);
}
else if (cmd[0] == "snap-create")
{
// Create snapshot
std::string name = cmd.size() > 1 ? cmd[1].string_value() : "";
int pos = name.find('@');
if (pos == std::string::npos || pos == name.length()-1)
{
result = (cli_result_t){ .err = EINVAL, .text = "Please specify new snapshot name after @" };
}
else
{
cfg["image"] = name.substr(0, pos);
cfg["snapshot"] = name.substr(pos + 1);
action_cb = p->start_create(cfg);
}
}
else if (cmd[0] == "create")
else if (cmd[0] == "create" || cmd[0] == "snap-create")
{
// Create image/snapshot
if (cmd.size() > 1)
{
cfg["image"] = cmd[1];
}
action_cb = p->start_create(cfg);
action_cb = start_create(cfg);
}
else if (cmd[0] == "modify")
{
// Modify image
if (cmd.size() > 1)
{
cfg["image"] = cmd[1];
}
action_cb = p->start_modify(cfg);
action_cb = start_modify(cfg);
}
else if (cmd[0] == "rm-data")
{
// Delete inode data
action_cb = p->start_rm_data(cfg);
action_cb = start_rm(cfg);
}
else if (cmd[0] == "merge-data")
{
// Merge layer data without affecting metadata
if (cmd.size() > 1)
{
cfg["from"] = cmd[1];
if (cmd.size() > 2)
cfg["to"] = cmd[2];
}
action_cb = p->start_merge(cfg);
action_cb = start_merge(cfg);
}
else if (cmd[0] == "flatten")
{
// Merge layer data without affecting metadata
if (cmd.size() > 1)
{
cfg["image"] = cmd[1];
}
action_cb = p->start_flatten(cfg);
action_cb = start_flatten(cfg);
}
else if (cmd[0] == "rm")
{
// Remove multiple snapshots and rebase their children
if (cmd.size() > 1)
{
cfg["from"] = cmd[1];
if (cmd.size() > 2)
cfg["to"] = cmd[2];
}
action_cb = p->start_rm(cfg);
action_cb = start_snap_rm(cfg);
}
else if (cmd[0] == "alloc-osd")
{
// Allocate a new OSD number
action_cb = p->start_alloc_osd(cfg);
action_cb = start_alloc_osd(cfg);
}
else if (cmd[0] == "simple-offsets")
{
// Calculate offsets for simple & stupid OSD deployment without superblock
if (cmd.size() > 1)
{
cfg["device"] = cmd[1];
}
action_cb = p->simple_offsets(cfg);
action_cb = simple_offsets(cfg);
}
else
{
result = { .err = EINVAL, .text = "unknown command: "+cmd[0].string_value() };
fprintf(stderr, "unknown command: %s\n", cmd[0].string_value().c_str());
exit(1);
}
if (action_cb != NULL)
if (action_cb == NULL)
{
// Create client
json11::Json cfg_j = cfg;
p->ringloop = new ring_loop_t(512);
p->epmgr = new epoll_manager_t(p->ringloop);
p->cli = new cluster_client_t(p->ringloop, p->epmgr->tfd, cfg_j);
// Smaller timeout by default for more interactiveness
p->cli->st_cli.etcd_slow_timeout = p->cli->st_cli.etcd_quick_timeout;
p->loop_and_wait(action_cb, [&](const cli_result_t & r)
return;
}
color = !cfg["no-color"].bool_value();
json_output = cfg["json"].bool_value();
iodepth = cfg["iodepth"].uint64_value();
if (!iodepth)
iodepth = 32;
parallel_osds = cfg["parallel_osds"].uint64_value();
if (!parallel_osds)
parallel_osds = 4;
log_level = cfg["log_level"].int64_value();
progress = cfg["progress"].uint64_value() ? true : false;
list_first = cfg["wait-list"].uint64_value() ? true : false;
// Create client
ringloop = new ring_loop_t(512);
epmgr = new epoll_manager_t(ringloop);
cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
// Smaller timeout by default for more interactiveness
cli->st_cli.etcd_slow_timeout = cli->st_cli.etcd_quick_timeout;
cli->on_ready([this]()
{
// Initialize job
consumer.loop = [this]()
{
result = r;
action_cb = NULL;
});
// Loop until it completes
while (action_cb != NULL)
{
p->ringloop->loop();
if (action_cb != NULL)
p->ringloop->wait();
}
// Destroy the client
delete p->cli;
delete p->epmgr;
delete p->ringloop;
p->cli = NULL;
p->epmgr = NULL;
p->ringloop = NULL;
}
// Print result
if (p->json_output && !result.data.is_null())
{
bool done = action_cb();
if (done)
{
action_cb = NULL;
}
}
ringloop->submit();
};
ringloop->register_consumer(&consumer);
consumer.loop();
});
// Loop until it completes
while (action_cb != NULL)
{
printf("%s\n", result.data.dump().c_str());
ringloop->loop();
if (action_cb != NULL)
ringloop->wait();
}
else if (p->json_output && result.err)
{
printf("%s\n", json11::Json(json11::Json::object {
{ "error_code", result.err },
{ "error_text", result.text },
}).dump().c_str());
}
else if (result.text != "")
{
fprintf(result.err ? stderr : stdout, result.text[result.text.size()-1] == '\n' ? "%s" : "%s\n", result.text.c_str());
}
return result.err;
// Destroy the client
delete cli;
delete epmgr;
delete ringloop;
cli = NULL;
epmgr = NULL;
ringloop = NULL;
}
int main(int narg, const char *args[])
@@ -338,7 +390,7 @@ int main(int narg, const char *args[])
setvbuf(stderr, NULL, _IONBF, 0);
exe_name = args[0];
cli_tool_t *p = new cli_tool_t();
int r = run(p, parse_args(narg, args));
p->run(cli_tool_t::parse_args(narg, args));
delete p;
return r;
return 0;
}

View File

@@ -19,18 +19,11 @@ class epoll_manager_t;
class cluster_client_t;
struct inode_config_t;
struct cli_result_t
{
int err;
std::string text;
json11::Json data;
};
class cli_tool_t
{
public:
uint64_t iodepth = 4, parallel_osds = 32;
bool progress = false;
uint64_t iodepth = 0, parallel_osds = 0;
bool progress = true;
bool list_first = false;
bool json_output = false;
int log_level = 0;
@@ -41,33 +34,34 @@ public:
cluster_client_t *cli = NULL;
int waiting = 0;
cli_result_t etcd_err;
json11::Json etcd_result;
ring_consumer_t consumer;
std::function<bool(void)> action_cb;
void parse_config(json11::Json cfg);
void run(json11::Json cfg);
void change_parent(inode_t cur, inode_t new_parent, cli_result_t *result);
void change_parent(inode_t cur, inode_t new_parent);
inode_config_t* get_inode_cfg(const std::string & name);
static json11::Json::object parse_args(int narg, const char *args[]);
static void help();
friend struct rm_inode_t;
friend struct snap_merger_t;
friend struct snap_flattener_t;
friend struct snap_remover_t;
std::function<bool(cli_result_t &)> start_status(json11::Json);
std::function<bool(cli_result_t &)> start_df(json11::Json);
std::function<bool(cli_result_t &)> start_ls(json11::Json);
std::function<bool(cli_result_t &)> start_create(json11::Json);
std::function<bool(cli_result_t &)> start_modify(json11::Json);
std::function<bool(cli_result_t &)> start_rm_data(json11::Json);
std::function<bool(cli_result_t &)> start_merge(json11::Json);
std::function<bool(cli_result_t &)> start_flatten(json11::Json);
std::function<bool(cli_result_t &)> start_rm(json11::Json);
std::function<bool(cli_result_t &)> start_alloc_osd(json11::Json cfg);
std::function<bool(cli_result_t &)> simple_offsets(json11::Json cfg);
// Should be called like loop_and_wait(start_status(), <completion callback>)
void loop_and_wait(std::function<bool(cli_result_t &)> loop_cb, std::function<void(const cli_result_t &)> complete_cb);
std::function<bool(void)> start_status(json11::Json cfg);
std::function<bool(void)> start_df(json11::Json);
std::function<bool(void)> start_ls(json11::Json);
std::function<bool(void)> start_create(json11::Json);
std::function<bool(void)> start_modify(json11::Json);
std::function<bool(void)> start_rm(json11::Json);
std::function<bool(void)> start_merge(json11::Json);
std::function<bool(void)> start_flatten(json11::Json);
std::function<bool(void)> start_snap_rm(json11::Json);
std::function<bool(void)> start_alloc_osd(json11::Json cfg, uint64_t *out = NULL);
std::function<bool(void)> simple_offsets(json11::Json cfg);
void etcd_txn(json11::Json txn);
};

View File

@@ -16,7 +16,6 @@ struct alloc_osd_t
uint64_t new_id = 1;
int state = 0;
cli_result_t result;
bool is_done()
{
@@ -63,12 +62,6 @@ struct alloc_osd_t
state = 1;
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
if (!parent->etcd_result["succeeded"].bool_value())
{
std::vector<osd_num_t> used;
@@ -106,23 +99,23 @@ struct alloc_osd_t
}
} while (!parent->etcd_result["succeeded"].bool_value());
state = 100;
result = (cli_result_t){
.text = std::to_string(new_id),
.data = json11::Json(new_id),
};
}
};
std::function<bool(cli_result_t &)> cli_tool_t::start_alloc_osd(json11::Json cfg)
std::function<bool(void)> cli_tool_t::start_alloc_osd(json11::Json cfg, uint64_t *out)
{
json11::Json::array cmd = cfg["command"].array_items();
auto alloc_osd = new alloc_osd_t();
alloc_osd->parent = this;
return [alloc_osd](cli_result_t & result)
return [alloc_osd, out]()
{
alloc_osd->loop();
if (alloc_osd->is_done())
{
result = alloc_osd->result;
if (out)
*out = alloc_osd->new_id;
else if (alloc_osd->new_id)
printf("%lu\n", alloc_osd->new_id);
delete alloc_osd;
return true;
}

View File

@@ -1,149 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "base64.h"
#include "cluster_client.h"
#include "cli.h"
// Rewrite the inode configuration of layer <cur> in etcd so that its parent
// becomes <new_parent>; a zero <new_parent> is reported as "detached".
//
// The outcome is stored into <*result>:
//  - EIO if <cur> is not in the locally known inode configuration (in this
//    case nothing is sent and <waiting> is not touched), or if etcd reports
//    an error;
//  - EAGAIN if the transaction's revision guard did not hold;
//  - otherwise only a descriptive text is set.
//
// When a request is sent, <waiting> is incremented before it and decremented
// in the completion callback, which then wakes up the ring loop. <result> is
// written from that callback, so it has to stay valid until it runs.
void cli_tool_t::change_parent(inode_t cur, inode_t new_parent, cli_result_t *result)
{
    auto & inode_map = cli->st_cli.inode_config;
    auto found = inode_map.find(cur);
    if (found == inode_map.end())
    {
        char msg[128];
        snprintf(msg, sizeof(msg), "Inode 0x%lx disappeared", cur);
        *result = (cli_result_t){ .err = EIO, .text = msg };
        return;
    }
    // Modify a copy of the known configuration, not the cached entry itself
    inode_config_t updated = found->second;
    std::string layer_name = updated.name;
    updated.parent_id = new_parent;
    // etcd key of the inode: <prefix>/config/inode/<pool>/<inode number>
    std::string etcd_key = base64_encode(
        cli->st_cli.etcd_prefix+"/config/inode/"+
        std::to_string(INODE_POOL(cur))+"/"+std::to_string(INODE_NO_POOL(cur))
    );
    json11::Json::object serialized = cli->st_cli.serialize_inode_cfg(&updated);
    // Guard: only write if the key was not modified after the revision we know
    json11::Json::object guard {
        { "target", "MOD" },
        { "key", etcd_key },
        { "result", "LESS" },
        { "mod_revision", updated.mod_revision+1 },
    };
    json11::Json::object put {
        { "request_put", json11::Json::object {
            { "key", etcd_key },
            { "value", base64_encode(json11::Json(serialized).dump()) },
        } }
    };
    json11::Json::object txn {
        { "compare", json11::Json::array { guard } },
        { "success", json11::Json::array { put } },
    };
    auto on_reply = [this, result, new_parent, cur, layer_name](std::string err, json11::Json res)
    {
        if (err != "")
        {
            *result = (cli_result_t){ .err = EIO, .text = "Error changing parent of "+layer_name+": "+err };
        }
        else if (!res["succeeded"].bool_value())
        {
            // The revision guard did not hold
            *result = (cli_result_t){ .err = EAGAIN, .text = "Image "+layer_name+" was modified during change" };
        }
        else if (!new_parent)
        {
            *result = (cli_result_t){
                .text = "Parent of layer "+layer_name+" (inode "+std::to_string(INODE_NO_POOL(cur))+
                    " in pool "+std::to_string(INODE_POOL(cur))+") detached",
            };
        }
        else
        {
            // Look the parent up only for the message; fall back to a placeholder name
            auto parent_it = cli->st_cli.inode_config.find(new_parent);
            std::string parent_name = "<unknown>";
            if (parent_it != cli->st_cli.inode_config.end())
            {
                parent_name = parent_it->second.name;
            }
            *result = (cli_result_t){
                .text = "Parent of layer "+layer_name+" (inode "+std::to_string(INODE_NO_POOL(cur))+
                    " in pool "+std::to_string(INODE_POOL(cur))+") changed to "+parent_name+
                    " (inode "+std::to_string(INODE_NO_POOL(new_parent))+" in pool "+std::to_string(INODE_POOL(new_parent))+")",
            };
        }
        waiting--;
        ringloop->wakeup();
    };
    waiting++;
    cli->st_cli.etcd_txn_slow(txn, on_reply);
}
// Submit transaction <txn> to etcd asynchronously.
// <waiting> is incremented for the duration of the request. When the reply arrives,
// the raw response is saved into <etcd_result>, the error status into <etcd_err>
// (EIO with a message on failure, err = 0 otherwise) and the ring loop is woken up.
void cli_tool_t::etcd_txn(json11::Json txn)
{
    waiting++;
    cli->st_cli.etcd_txn_slow(txn, [this](std::string err, json11::Json res)
    {
        waiting--;
        etcd_result = res;
        if (err.empty())
        {
            etcd_err = (cli_result_t){ .err = 0 };
        }
        else
        {
            etcd_err = (cli_result_t){ .err = EIO, .text = "Error communicating with etcd: "+err };
        }
        ringloop->wakeup();
    });
}
// Look up an inode configuration by image name.
// Scans the locally cached inode configurations in iteration order and returns
// a pointer to the first entry whose name equals <name>, or NULL if none matches.
inode_config_t* cli_tool_t::get_inode_cfg(const std::string & name)
{
    auto & known_inodes = cli->st_cli.inode_config;
    for (auto it = known_inodes.begin(); it != known_inodes.end(); it++)
    {
        if (it->second.name == name)
            return &it->second;
    }
    return NULL;
}
// Read the generic CLI options from <cfg> into the tool's members.
void cli_tool_t::parse_config(json11::Json cfg)
{
    // Output formatting
    json_output = cfg["json"].bool_value();
    color = !cfg["no-color"].bool_value();
    // Parallelism settings; a value that evaluates to zero selects the default (32 and 4)
    iodepth = cfg["iodepth"].uint64_value();
    iodepth = iodepth ? iodepth : 32;
    parallel_osds = cfg["parallel_osds"].uint64_value();
    parallel_osds = parallel_osds ? parallel_osds : 4;
    // Diagnostics and behaviour flags
    log_level = cfg["log_level"].int64_value();
    progress = cfg["progress"].uint64_value() != 0;
    list_first = cfg["wait-list"].uint64_value() != 0;
}
// Bookkeeping for one cli_tool_t::loop_and_wait() call: it is allocated there
// and deleted from within the consumer's loop callback once loop_cb reports completion.
struct cli_result_looper_t
{
// Ring loop consumer whose loop() callback drives loop_cb
ring_consumer_t consumer;
// Result object handed to loop_cb on every iteration and to complete_cb at the end
cli_result_t result;
// Step function; returns true when the operation is finished
std::function<bool(cli_result_t &)> loop_cb;
// Invoked once with the final result after loop_cb returns true
std::function<void(const cli_result_t &)> complete_cb;
};
// Drive <loop_cb> from the ring loop until it returns true, then pass the accumulated
// result to <complete_cb>.
// The consumer is registered only after the cluster client reports readiness (on_ready).
void cli_tool_t::loop_and_wait(std::function<bool(cli_result_t &)> loop_cb, std::function<void(const cli_result_t &)> complete_cb)
{
    auto *ctx = new cli_result_looper_t();
    ctx->loop_cb = loop_cb;
    ctx->complete_cb = complete_cb;
    ctx->consumer.loop = [this, ctx]()
    {
        if (!ctx->loop_cb(ctx->result))
        {
            // Not finished yet: submit and wait for the next iteration
            ringloop->submit();
            return;
        }
        // Finished: detach from the ring loop, drop the step callback and report the result
        ringloop->unregister_consumer(&ctx->consumer);
        ctx->loop_cb = NULL;
        ctx->complete_cb(ctx->result);
        // <ctx> holds this very closure (consumer.loop), so nothing may follow the delete
        delete ctx;
    };
    cli->on_ready([this, ctx]()
    {
        ringloop->register_consumer(&ctx->consumer);
        ringloop->wakeup();
    });
}

View File

@@ -25,18 +25,14 @@ struct image_creator_t
pool_id_t new_pool_id = 0;
std::string new_pool_name;
std::string image_name, new_snap, new_parent;
json11::Json new_meta;
uint64_t size;
bool force_size = false;
pool_id_t old_pool_id = 0;
inode_t new_parent_id = 0;
inode_t new_id = 0, old_id = 0;
uint64_t max_id_mod_rev = 0, cfg_mod_rev = 0, idx_mod_rev = 0;
inode_config_t new_cfg;
int state = 0;
cli_result_t result;
bool is_done()
{
@@ -47,27 +43,13 @@ struct image_creator_t
{
if (state >= 1)
goto resume_1;
if (image_name == "")
{
// FIXME: EINVAL -> specific codes for every error
result = (cli_result_t){ .err = EINVAL, .text = "Image name is missing" };
state = 100;
return;
}
if (image_name.find('@') != std::string::npos)
{
result = (cli_result_t){ .err = EINVAL, .text = "Image name can't contain @ character" };
state = 100;
return;
}
if (new_pool_id)
{
auto & pools = parent->cli->st_cli.pool_config;
if (pools.find(new_pool_id) == pools.end())
{
result = (cli_result_t){ .err = ENOENT, .text = "Pool "+std::to_string(new_pool_id)+" does not exist" };
state = 100;
return;
fprintf(stderr, "Pool %u does not exist\n", new_pool_id);
exit(1);
}
}
else if (new_pool_name != "")
@@ -82,9 +64,8 @@ struct image_creator_t
}
if (!new_pool_id)
{
result = (cli_result_t){ .err = ENOENT, .text = "Pool "+new_pool_name+" does not exist" };
state = 100;
return;
fprintf(stderr, "Pool %s does not exist\n", new_pool_name.c_str());
exit(1);
}
}
else if (parent->cli->st_cli.pool_config.size() == 1)
@@ -110,9 +91,8 @@ struct image_creator_t
{
if (ic.second.name == image_name)
{
result = (cli_result_t){ .err = EEXIST, .text = "Image "+image_name+" already exists" };
state = 100;
return;
fprintf(stderr, "Image %s already exists\n", image_name.c_str());
exit(1);
}
if (ic.second.name == new_parent)
{
@@ -129,21 +109,18 @@ struct image_creator_t
}
if (new_parent != "" && !new_parent_id)
{
result = (cli_result_t){ .err = ENOENT, .text = "Parent image "+new_parent+" not found" };
state = 100;
return;
fprintf(stderr, "Parent image not found\n");
exit(1);
}
if (!new_pool_id)
{
result = (cli_result_t){ .err = EINVAL, .text = "Pool name or ID is missing" };
state = 100;
return;
fprintf(stderr, "Pool name or ID is missing\n");
exit(1);
}
if (!size && !force_size)
if (!size)
{
result = (cli_result_t){ .err = EINVAL, .text = "Image size is missing" };
state = 100;
return;
fprintf(stderr, "Image size is missing\n");
exit(1);
}
do
{
@@ -154,36 +131,23 @@ struct image_creator_t
resume_2:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
extract_next_id(parent->etcd_result["responses"][0]);
attempt_create();
state = 3;
resume_3:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
if (!parent->etcd_result["succeeded"].bool_value() &&
parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items().size() > 0)
{
result = (cli_result_t){ .err = EEXIST, .text = "Image "+image_name+" already exists" };
state = 100;
return;
fprintf(stderr, "Image %s already exists\n", image_name.c_str());
exit(1);
}
} while (!parent->etcd_result["succeeded"].bool_value());
// Save into inode_config for library users to be able to take it from there immediately
new_cfg.mod_revision = parent->etcd_result["responses"][0]["response_put"]["header"]["revision"].uint64_value();
parent->cli->st_cli.insert_inode_config(new_cfg);
result = (cli_result_t){ .err = 0, .text = "Image "+image_name+" created" };
if (parent->progress)
{
printf("Image %s created\n", image_name.c_str());
}
state = 100;
}
@@ -199,16 +163,14 @@ resume_3:
{
if (ic.second.name == image_name+"@"+new_snap)
{
result = (cli_result_t){ .err = EEXIST, .text = "Snapshot "+image_name+"@"+new_snap+" already exists" };
state = 100;
return;
fprintf(stderr, "Snapshot %s@%s already exists\n", image_name.c_str(), new_snap.c_str());
exit(1);
}
}
if (new_parent != "")
{
result = (cli_result_t){ .err = EINVAL, .text = "Parent can't be specified for snapshots" };
state = 100;
return;
fprintf(stderr, "--parent can't be used with snapshots\n");
exit(1);
}
do
{
@@ -220,9 +182,8 @@ resume_3:
return;
if (!old_id)
{
result = (cli_result_t){ .err = ENOENT, .text = "Image "+image_name+" does not exist" };
state = 100;
return;
fprintf(stderr, "Image %s does not exist\n", image_name.c_str());
exit(1);
}
if (!new_pool_id)
{
@@ -234,24 +195,17 @@ resume_3:
resume_4:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
if (!parent->etcd_result["succeeded"].bool_value() &&
parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items().size() > 0)
{
result = (cli_result_t){ .err = EEXIST, .text = "Snapshot "+image_name+"@"+new_snap+" already exists" };
state = 100;
return;
fprintf(stderr, "Snapshot %s@%s already exists\n", image_name.c_str(), new_snap.c_str());
exit(1);
}
} while (!parent->etcd_result["succeeded"].bool_value());
// Save into inode_config for library users to be able to take it from there immediately
new_cfg.mod_revision = parent->etcd_result["responses"][0]["response_put"]["header"]["revision"].uint64_value();
parent->cli->st_cli.insert_inode_config(new_cfg);
result = (cli_result_t){ .err = 0, .text = "Snapshot "+image_name+"@"+new_snap+" created" };
if (parent->progress)
{
printf("Snapshot %s@%s created\n", image_name.c_str(), new_snap.c_str());
}
state = 100;
}
@@ -305,12 +259,6 @@ resume_4:
resume_2:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
extract_next_id(parent->etcd_result["responses"][0]);
old_id = 0;
old_pool_id = 0;
@@ -340,9 +288,8 @@ resume_2:
idx_mod_rev = kv.mod_revision;
if (!old_id || !old_pool_id || old_pool_id >= POOL_ID_MAX)
{
result = (cli_result_t){ .err = ENOENT, .text = "Invalid pool or inode ID in etcd key "+kv.key };
state = 100;
return;
fprintf(stderr, "Invalid pool or inode ID in etcd key %s\n", kv.key.c_str());
exit(1);
}
}
parent->etcd_txn(json11::Json::object {
@@ -361,12 +308,6 @@ resume_2:
resume_3:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
{
auto kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][0]["response_range"]["kvs"][0]);
size = kv.value["size"].uint64_value();
@@ -383,13 +324,12 @@ resume_3:
void attempt_create()
{
new_cfg = {
inode_config_t new_cfg = {
.num = INODE_WITH_POOL(new_pool_id, new_id),
.name = image_name,
.size = size,
.parent_id = (new_snap != "" ? INODE_WITH_POOL(old_pool_id, old_id) : new_parent_id),
.readonly = false,
.meta = new_meta,
};
json11::Json::array checks = json11::Json::array {
json11::Json::object {
@@ -517,76 +457,77 @@ uint64_t parse_size(std::string size_str)
if (type_char == 'k' || type_char == 'm' || type_char == 'g' || type_char == 't')
{
if (type_char == 'k')
mul = (uint64_t)1<<10;
mul = 1l<<10;
else if (type_char == 'm')
mul = (uint64_t)1<<20;
mul = 1l<<20;
else if (type_char == 'g')
mul = (uint64_t)1<<30;
mul = 1l<<30;
else /*if (type_char == 't')*/
mul = (uint64_t)1<<40;
mul = 1l<<40;
size_str = size_str.substr(0, size_str.length()-1);
}
uint64_t size = json11::Json(size_str).uint64_value() * mul;
if (size == 0 && size_str != "0" && (size_str != "" || mul != 1))
{
return UINT64_MAX;
fprintf(stderr, "Invalid syntax for size: %s\n", size_str.c_str());
exit(1);
}
return size;
}
std::function<bool(cli_result_t &)> cli_tool_t::start_create(json11::Json cfg)
std::function<bool(void)> cli_tool_t::start_create(json11::Json cfg)
{
json11::Json::array cmd = cfg["command"].array_items();
auto image_creator = new image_creator_t();
image_creator->parent = this;
image_creator->image_name = cfg["image"].string_value();
image_creator->image_name = cmd.size() > 1 ? cmd[1].string_value() : "";
image_creator->new_pool_id = cfg["pool"].uint64_value();
image_creator->new_pool_name = cfg["pool"].string_value();
image_creator->force_size = cfg["force_size"].bool_value();
if (cfg["image_meta"].is_object())
{
image_creator->new_meta = cfg["image-meta"];
}
if (cfg["snapshot"].string_value() != "")
{
image_creator->new_snap = cfg["snapshot"].string_value();
}
else if (cmd[0] == "snap-create")
{
int p = image_creator->image_name.find('@');
if (p == std::string::npos || p == image_creator->image_name.length()-1)
{
fprintf(stderr, "Please specify new snapshot name after @\n");
exit(1);
}
image_creator->new_snap = image_creator->image_name.substr(p + 1);
image_creator->image_name = image_creator->image_name.substr(0, p);
}
image_creator->new_parent = cfg["parent"].string_value();
if (cfg["size"].string_value() != "")
{
image_creator->size = parse_size(cfg["size"].string_value());
if (image_creator->size == UINT64_MAX)
if (image_creator->size % 4096)
{
return [size = cfg["size"].string_value()](cli_result_t & result)
{
result = (cli_result_t){ .err = EINVAL, .text = "Invalid syntax for size: "+size };
return true;
};
}
if ((image_creator->size % 4096) && !cfg["force_size"].bool_value())
{
delete image_creator;
return [](cli_result_t & result)
{
result = (cli_result_t){ .err = EINVAL, .text = "Size should be a multiple of 4096" };
return true;
};
fprintf(stderr, "Size should be a multiple of 4096\n");
exit(1);
}
if (image_creator->new_snap != "")
{
delete image_creator;
return [](cli_result_t & result)
{
result = (cli_result_t){ .err = EINVAL, .text = "Size can't be specified for snapshots" };
return true;
};
fprintf(stderr, "--size can't be specified for snapshots\n");
exit(1);
}
}
return [image_creator](cli_result_t & result)
if (image_creator->image_name == "")
{
fprintf(stderr, "Image name is missing\n");
exit(1);
}
if (image_creator->image_name.find('@') != std::string::npos)
{
fprintf(stderr, "Image name can't contain @ character\n");
exit(1);
}
return [image_creator]()
{
image_creator->loop();
if (image_creator->is_done())
{
result = image_creator->result;
delete image_creator;
return true;
}

View File

@@ -12,7 +12,6 @@ struct pool_lister_t
int state = 0;
json11::Json space_info;
cli_result_t result;
std::map<pool_id_t, json11::Json::object> pool_stats;
bool is_done()
@@ -53,12 +52,6 @@ struct pool_lister_t
resume_1:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
space_info = parent->etcd_result;
std::map<pool_id_t, uint64_t> osd_free;
for (auto & kv_item: space_info["responses"][0]["response_range"]["kvs"].array_items())
@@ -131,8 +124,8 @@ resume_1:
{ "scheme_name", pool_cfg.scheme == POOL_SCHEME_REPLICATED
? std::to_string(pool_cfg.pg_size)+"/"+std::to_string(pool_cfg.pg_minsize)
: "EC "+std::to_string(pool_cfg.pg_size-pool_cfg.parity_chunks)+"+"+std::to_string(pool_cfg.parity_chunks) },
{ "used_raw", (uint64_t)(pool_stats[pool_cfg.id]["used_raw_tb"].number_value() * ((uint64_t)1<<40)) },
{ "total_raw", (uint64_t)(pool_stats[pool_cfg.id]["total_raw_tb"].number_value() * ((uint64_t)1<<40)) },
{ "used_raw", (uint64_t)(pool_stats[pool_cfg.id]["used_raw_tb"].number_value() * (1l<<40)) },
{ "total_raw", (uint64_t)(pool_stats[pool_cfg.id]["total_raw_tb"].number_value() * (1l<<40)) },
{ "max_available", pool_avail },
{ "raw_to_usable", pool_stats[pool_cfg.id]["raw_to_usable"].number_value() },
{ "space_efficiency", pool_stats[pool_cfg.id]["space_efficiency"].number_value() },
@@ -157,12 +150,10 @@ resume_1:
get_stats();
if (parent->waiting > 0)
return;
if (state == 100)
return;
if (parent->json_output)
{
// JSON output
result.data = to_list();
printf("%s\n", json11::Json(to_list()).dump().c_str());
state = 100;
return;
}
@@ -215,22 +206,21 @@ resume_1:
: 100)+"%";
kv.second["eff_fmt"] = format_q(kv.second["space_efficiency"].number_value()*100)+"%";
}
result.data = to_list();
result.text = print_table(result.data, cols, parent->color);
printf("%s", print_table(to_list(), cols, parent->color).c_str());
state = 100;
}
};
std::function<bool(cli_result_t &)> cli_tool_t::start_df(json11::Json cfg)
std::function<bool(void)> cli_tool_t::start_df(json11::Json cfg)
{
json11::Json::array cmd = cfg["command"].array_items();
auto lister = new pool_lister_t();
lister->parent = this;
return [lister](cli_result_t & result)
return [lister]()
{
lister->loop();
if (lister->is_done())
{
result = lister->result;
delete lister;
return true;
}

View File

@@ -22,19 +22,12 @@ struct snap_flattener_t
std::string top_parent_name;
inode_t target_id = 0;
int state = 0;
std::function<bool(cli_result_t &)> merger_cb;
cli_result_t result;
std::function<bool(void)> merger_cb;
void get_merge_parents()
{
// Get all parents of target
inode_config_t *target_cfg = parent->get_inode_cfg(target_name);
if (!target_cfg)
{
result = (cli_result_t){ .err = ENOENT, .text = "Layer "+target_name+" not found" };
state = 100;
return;
}
target_id = target_cfg->num;
std::vector<inode_t> chain_list;
inode_config_t *cur = target_cfg;
@@ -44,34 +37,23 @@ struct snap_flattener_t
auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
if (it == parent->cli->st_cli.inode_config.end())
{
result = (cli_result_t){
.err = ENOENT,
.text = "Parent inode of layer "+cur->name+" (id "+std::to_string(cur->parent_id)+") does not exist",
.data = json11::Json::object {
{ "error", "parent-not-found" },
{ "inode_id", cur->num },
{ "inode_name", cur->name },
{ "parent_id", cur->parent_id },
},
};
state = 100;
return;
fprintf(stderr, "Parent inode of layer %s (id %ld) not found\n", cur->name.c_str(), cur->parent_id);
exit(1);
}
cur = &it->second;
chain_list.push_back(cur->num);
}
if (cur->parent_id != 0)
{
result = (cli_result_t){ .err = EBADF, .text = "Layer "+target_name+" has a loop in parents" };
state = 100;
return;
fprintf(stderr, "Layer %s has a loop in parents\n", target_name.c_str());
exit(1);
}
top_parent_name = cur->name;
}
bool is_done()
{
return state == 100;
return state == 5;
}
void loop()
@@ -82,20 +64,11 @@ struct snap_flattener_t
goto resume_2;
else if (state == 3)
goto resume_3;
if (target_name == "")
{
result = (cli_result_t){ .err = EINVAL, .text = "Layer to flatten not specified" };
state = 100;
return;
}
// Get parent layers
get_merge_parents();
if (state == 100)
return;
// Start merger
merger_cb = parent->start_merge(json11::Json::object {
{ "from", top_parent_name },
{ "to", target_name },
{ "command", json11::Json::array{ "merge-data", top_parent_name, target_name } },
{ "target", target_name },
{ "delete-source", false },
{ "cas", use_cas },
@@ -103,19 +76,14 @@ struct snap_flattener_t
});
// Wait for it
resume_1:
while (!merger_cb(result))
while (!merger_cb())
{
state = 1;
return;
}
merger_cb = NULL;
if (result.err)
{
state = 100;
return;
}
// Change parent
parent->change_parent(target_id, 0, &result);
parent->change_parent(target_id, 0);
// Wait for it to complete
state = 2;
resume_2:
@@ -124,26 +92,31 @@ resume_2:
state = 3;
resume_3:
// Done
state = 100;
return;
}
};
std::function<bool(cli_result_t &)> cli_tool_t::start_flatten(json11::Json cfg)
std::function<bool(void)> cli_tool_t::start_flatten(json11::Json cfg)
{
json11::Json::array cmd = cfg["command"].array_items();
auto flattener = new snap_flattener_t();
flattener->parent = this;
flattener->target_name = cfg["image"].string_value();
flattener->target_name = cmd.size() > 1 ? cmd[1].string_value() : "";
if (flattener->target_name == "")
{
fprintf(stderr, "Layer to flatten argument is missing\n");
exit(1);
}
flattener->fsync_interval = cfg["fsync-interval"].uint64_value();
if (!flattener->fsync_interval)
flattener->fsync_interval = 128;
if (!cfg["cas"].is_null())
flattener->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
return [flattener](cli_result_t & result)
return [flattener]()
{
flattener->loop();
if (flattener->is_done())
{
result = flattener->result;
delete flattener;
return true;
}

View File

@@ -24,7 +24,6 @@ struct image_lister_t
int state = 0;
std::map<inode_t, json11::Json::object> stats;
json11::Json space_info;
cli_result_t result;
bool is_done()
{
@@ -45,9 +44,8 @@ struct image_lister_t
}
if (!list_pool_id)
{
result = (cli_result_t){ .err = ENOENT, .text = "Pool "+list_pool_name+" does not exist" };
state = 100;
return;
fprintf(stderr, "Pool %s does not exist\n", list_pool_name.c_str());
exit(1);
}
}
for (auto & ic: parent->cli->st_cli.inode_config)
@@ -118,12 +116,6 @@ struct image_lister_t
resume_1:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
space_info = parent->etcd_result;
std::map<pool_id_t, uint64_t> pool_pg_real_size;
for (auto & kv_item: space_info["responses"][0]["response_range"]["kvs"].array_items())
@@ -253,13 +245,11 @@ resume_1:
get_stats();
if (parent->waiting > 0)
return;
if (state == 100)
return;
}
result.data = to_list();
if (parent->json_output)
{
// JSON output
printf("%s\n", json11::Json(to_list()).dump().c_str());
state = 100;
return;
}
@@ -369,7 +359,7 @@ resume_1:
kv.second["size_fmt"] = format_size(kv.second["size"].uint64_value());
kv.second["ro"] = kv.second["readonly"].bool_value() ? "RO" : "-";
}
result.text = print_table(to_list(), cols, parent->color);
printf("%s", print_table(to_list(), cols, parent->color).c_str());
state = 100;
}
};
@@ -446,8 +436,8 @@ std::string print_table(json11::Json items, json11::Json header, bool use_esc)
return str;
}
static uint64_t size_thresh[] = { (uint64_t)1024*1024*1024*1024, (uint64_t)1024*1024*1024, (uint64_t)1024*1024, 1024, 0 };
static uint64_t size_thresh_d[] = { (uint64_t)1000000000000, (uint64_t)1000000000, (uint64_t)1000000, (uint64_t)1000, 0 };
static uint64_t size_thresh[] = { 1024l*1024*1024*1024, 1024l*1024*1024, 1024l*1024, 1024, 0 };
static uint64_t size_thresh_d[] = { 1000000000000l, 1000000000l, 1000000l, 1000l, 0 };
static const int size_thresh_n = sizeof(size_thresh)/sizeof(size_thresh[0]);
static const char *size_unit = "TGMKB";
@@ -556,8 +546,9 @@ back:
return true;
}
std::function<bool(cli_result_t &)> cli_tool_t::start_ls(json11::Json cfg)
std::function<bool(void)> cli_tool_t::start_ls(json11::Json cfg)
{
json11::Json::array cmd = cfg["command"].array_items();
auto lister = new image_lister_t();
lister->parent = this;
lister->list_pool_id = cfg["pool"].uint64_value();
@@ -567,16 +558,15 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_ls(json11::Json cfg)
lister->sort_field = cfg["sort"].string_value();
lister->reverse = cfg["reverse"].bool_value();
lister->max_count = cfg["count"].uint64_value();
for (auto & item: cfg["names"].array_items())
for (int i = 1; i < cmd.size(); i++)
{
lister->only_names.insert(item.string_value());
lister->only_names.insert(cmd[i].string_value());
}
return [lister](cli_result_t & result)
return [lister]()
{
lister->loop();
if (lister->is_done())
{
result = lister->result;
delete lister;
return true;
}

View File

@@ -12,9 +12,6 @@ struct snap_rw_op_t
cluster_op_t op;
int todo = 0;
uint32_t start = 0, end = 0;
int error_code = 0;
uint64_t error_offset = 0;
bool error_read = false;
};
// Layer merge is the base for multiple operations:
@@ -57,45 +54,17 @@ struct snap_merger_t
uint64_t last_written_offset = 0;
int deleted_unsynced = 0;
uint64_t processed = 0, to_process = 0;
std::string rwo_error;
cli_result_t result;
void start_merge()
{
if (from_name == "" || to_name == "")
{
result = (cli_result_t){ .err = EINVAL, .text = "Beginning or end of the merge sequence is missing" };
state = 100;
return;
}
check_delete_source = delete_source || check_delete_source;
inode_config_t *from_cfg = parent->get_inode_cfg(from_name);
if (!from_cfg)
{
result = (cli_result_t){ .err = ENOENT, .text = "Layer "+from_name+" not found" };
state = 100;
return;
}
inode_config_t *to_cfg = parent->get_inode_cfg(to_name);
if (!to_cfg)
{
result = (cli_result_t){ .err = ENOENT, .text = "Layer "+to_name+" not found" };
state = 100;
return;
}
inode_config_t *target_cfg = target_name == "" ? from_cfg : parent->get_inode_cfg(target_name);
if (!target_cfg)
{
result = (cli_result_t){ .err = ENOENT, .text = "Layer "+target_name+" not found" };
state = 100;
return;
}
if (to_cfg->num == from_cfg->num)
{
result = (cli_result_t){ .err = EINVAL, .text = "Only one layer specified, nothing to merge" };
state = 100;
return;
fprintf(stderr, "Only one layer specified, nothing to merge\n");
exit(1);
}
// Check that to_cfg is actually a child of from_cfg and target_cfg is somewhere between them
std::vector<inode_t> chain_list;
@@ -109,18 +78,8 @@ struct snap_merger_t
auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
if (it == parent->cli->st_cli.inode_config.end())
{
result = (cli_result_t){
.err = ENOENT,
.text = "Parent inode of layer "+cur->name+" (id "+std::to_string(cur->parent_id)+") does not exist",
.data = json11::Json::object {
{ "error", "parent-not-found" },
{ "inode_id", cur->num },
{ "inode_name", cur->name },
{ "parent_id", cur->parent_id },
},
};
state = 100;
return;
fprintf(stderr, "Parent inode of layer %s (id %ld) not found\n", cur->name.c_str(), cur->parent_id);
exit(1);
}
cur = &it->second;
chain_list.push_back(cur->num);
@@ -128,9 +87,8 @@ struct snap_merger_t
}
if (cur->parent_id != from_cfg->num)
{
result = (cli_result_t){ .err = EINVAL, .text = "Layer "+to_name+" is not a child of "+from_name };
state = 100;
return;
fprintf(stderr, "Layer %s is not a child of %s\n", to_name.c_str(), from_name.c_str());
exit(1);
}
chain_list.push_back(from_cfg->num);
layer_block_size[from_cfg->num] = get_block_size(from_cfg->num);
@@ -141,9 +99,8 @@ struct snap_merger_t
}
if (sources.find(target_cfg->num) == sources.end())
{
result = (cli_result_t){ .err = EINVAL, .text = "Layer "+target_name+" is not between "+to_name+" and "+from_name };
state = 100;
return;
fprintf(stderr, "Layer %s is not between %s and %s\n", target_name.c_str(), to_name.c_str(), from_name.c_str());
exit(1);
}
target = target_cfg->num;
target_rank = sources.at(target);
@@ -173,15 +130,14 @@ struct snap_merger_t
int parent_rank = it->second;
if (parent_rank < to_rank && (parent_rank >= target_rank || check_delete_source))
{
result = (cli_result_t){
.err = EINVAL,
.text = "Layers at or above "+(check_delete_source ? from_name : target_name)+
", but below "+to_name+" are not allowed to have other children, but "+
ic.second.name+" is a child of "+
parent->cli->st_cli.inode_config.at(ic.second.parent_id).name,
};
state = 100;
return;
fprintf(
stderr, "Layers at or above %s, but below %s are not allowed"
" to have other children, but %s is a child of %s\n",
(check_delete_source ? from_name.c_str() : target_name.c_str()),
to_name.c_str(), ic.second.name.c_str(),
parent->cli->st_cli.inode_config.at(ic.second.parent_id).name.c_str()
);
exit(1);
}
if (parent_rank >= to_rank)
{
@@ -196,14 +152,11 @@ struct snap_merger_t
use_cas = 0;
}
sources.erase(target);
if (parent->progress)
{
printf(
"Merging %ld layer(s) into target %s%s (inode %lu in pool %u)\n",
sources.size(), target_cfg->name.c_str(),
use_cas ? " online (with CAS)" : "", INODE_NO_POOL(target), INODE_POOL(target)
);
}
printf(
"Merging %ld layer(s) into target %s%s (inode %lu in pool %u)\n",
sources.size(), target_cfg->name.c_str(),
use_cas ? " online (with CAS)" : "", INODE_NO_POOL(target), INODE_POOL(target)
);
target_block_size = get_block_size(target);
}
@@ -226,7 +179,7 @@ struct snap_merger_t
bool is_done()
{
return state == 100;
return state == 6;
}
void continue_merge()
@@ -241,8 +194,8 @@ struct snap_merger_t
goto resume_4;
else if (state == 5)
goto resume_5;
else if (state == 100)
goto resume_100;
else if (state == 6)
goto resume_6;
// Get parents and so on
start_merge();
// First list lower layers
@@ -300,8 +253,7 @@ struct snap_merger_t
oit = merge_offsets.begin();
resume_5:
// Now read, overwrite and optionally delete offsets one by one
while (in_flight < parent->iodepth*parent->parallel_osds &&
oit != merge_offsets.end() && !rwo_error.size())
while (in_flight < parent->iodepth*parent->parallel_osds && oit != merge_offsets.end())
{
in_flight++;
read_and_write(*oit);
@@ -312,15 +264,6 @@ struct snap_merger_t
printf("\rOverwriting blocks: %lu/%lu", processed, to_process);
}
}
if (in_flight == 0 && rwo_error.size())
{
result = (cli_result_t){
.err = EIO,
.text = rwo_error,
};
state = 100;
return;
}
if (in_flight > 0 || oit != merge_offsets.end())
{
// Wait until overwrites finish
@@ -331,9 +274,9 @@ struct snap_merger_t
printf("\rOverwriting blocks: %lu/%lu\n", to_process, to_process);
}
// Done
result = (cli_result_t){ .text = "Done, layers from "+from_name+" to "+to_name+" merged into "+target_name };
state = 100;
resume_100:
printf("Done, layers from %s to %s merged into %s\n", from_name.c_str(), to_name.c_str(), target_name.c_str());
state = 6;
resume_6:
return;
}
@@ -371,10 +314,7 @@ struct snap_merger_t
if (status & INODE_LIST_DONE)
{
auto & name = parent->cli->st_cli.inode_config.at(src).name;
if (parent->progress)
{
printf("Got listing of layer %s (inode %lu in pool %u)\n", name.c_str(), INODE_NO_POOL(src), INODE_POOL(src));
}
printf("Got listing of layer %s (inode %lu in pool %u)\n", name.c_str(), INODE_NO_POOL(src), INODE_POOL(src));
if (delete_source)
{
// Sort the inode listing
@@ -456,9 +396,8 @@ struct snap_merger_t
{
if (op->retval != op->len)
{
rwo->error_code = -op->retval;
rwo->error_offset = op->offset;
rwo->error_read = true;
fprintf(stderr, "error reading target at offset %lx: %s\n", op->offset, strerror(-op->retval));
exit(1);
}
next_write(rwo);
};
@@ -471,7 +410,7 @@ struct snap_merger_t
// FIXME: Allow to use single write with "holes" (OSDs don't allow it yet)
uint32_t gran = parent->cli->get_bs_bitmap_granularity();
uint64_t bitmap_size = target_block_size / gran;
while (rwo->end < bitmap_size && !rwo->error_code)
while (rwo->end < bitmap_size)
{
auto bit = ((*((uint8_t*)rwo->op.bitmap_buf + (rwo->end >> 3))) & (1 << (rwo->end & 0x7)));
if (!bit)
@@ -495,7 +434,7 @@ struct snap_merger_t
rwo->end++;
}
}
if (rwo->end > rwo->start && !rwo->error_code)
if (rwo->end > rwo->start)
{
// write start->end
rwo->todo++;
@@ -534,9 +473,8 @@ struct snap_merger_t
delete subop;
return;
}
rwo->error_code = -subop->retval;
rwo->error_offset = subop->offset;
rwo->error_read = false;
fprintf(stderr, "error writing target at offset %lx: %s\n", subop->offset, strerror(-subop->retval));
exit(1);
}
// Increment CAS version
rwo->op.version++;
@@ -572,12 +510,11 @@ struct snap_merger_t
{
if (!rwo->todo)
{
if (!rwo->error_code &&
last_written_offset < rwo->op.offset+target_block_size)
if (last_written_offset < rwo->op.offset+target_block_size)
{
last_written_offset = rwo->op.offset+target_block_size;
}
if (!rwo->error_code && delete_source)
if (delete_source)
{
deleted_unsynced++;
if (deleted_unsynced >= fsync_interval)
@@ -607,13 +544,6 @@ struct snap_merger_t
}
}
free(rwo->buf);
if (rwo->error_code)
{
char buf[1024];
snprintf(buf, 1024, "Error %s target at offset %lx: %s",
rwo->error_read ? "reading" : "writing", rwo->error_offset, strerror(rwo->error_code));
rwo_error = std::string(buf);
}
delete rwo;
in_flight--;
continue_merge_reent();
@@ -621,25 +551,30 @@ struct snap_merger_t
}
};
std::function<bool(cli_result_t &)> cli_tool_t::start_merge(json11::Json cfg)
std::function<bool(void)> cli_tool_t::start_merge(json11::Json cfg)
{
json11::Json::array cmd = cfg["command"].array_items();
auto merger = new snap_merger_t();
merger->parent = this;
merger->from_name = cfg["from"].string_value();
merger->to_name = cfg["to"].string_value();
merger->from_name = cmd.size() > 1 ? cmd[1].string_value() : "";
merger->to_name = cmd.size() > 2 ? cmd[2].string_value() : "";
merger->target_name = cfg["target"].string_value();
if (merger->from_name == "" || merger->to_name == "")
{
fprintf(stderr, "Beginning or end of the merge sequence is missing\n");
exit(1);
}
merger->delete_source = cfg["delete-source"].string_value() != "";
merger->fsync_interval = cfg["fsync-interval"].uint64_value();
if (!merger->fsync_interval)
merger->fsync_interval = 128;
if (!cfg["cas"].is_null())
merger->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
return [merger](cli_result_t & result)
return [merger]()
{
merger->continue_merge_reent();
if (merger->is_done())
{
result = merger->result;
delete merger;
return true;
}

View File

@@ -13,7 +13,6 @@ struct image_changer_t
std::string image_name;
std::string new_name;
uint64_t new_size = 0;
bool force_size = false;
bool set_readonly = false, set_readwrite = false, force = false;
// interval between fsyncs
int fsync_interval = 128;
@@ -24,8 +23,7 @@ struct image_changer_t
bool has_children = false;
int state = 0;
std::function<bool(cli_result_t &)> cb;
cli_result_t result;
std::function<bool(void)> cb;
bool is_done()
{
@@ -38,18 +36,6 @@ struct image_changer_t
goto resume_1;
else if (state == 2)
goto resume_2;
if (image_name == "")
{
result = (cli_result_t){ .err = EINVAL, .text = "Image name is missing" };
state = 100;
return;
}
if (new_size != 0 && (new_size % 4096) && !force_size)
{
result = (cli_result_t){ .err = EINVAL, .text = "Image size should be a multiple of 4096" };
state = 100;
return;
}
for (auto & ic: parent->cli->st_cli.inode_config)
{
if (ic.second.name == image_name)
@@ -60,16 +46,14 @@ struct image_changer_t
}
if (new_name != "" && ic.second.name == new_name)
{
result = (cli_result_t){ .err = EEXIST, .text = "Image "+new_name+" already exists" };
state = 100;
return;
fprintf(stderr, "Image %s already exists\n", new_name.c_str());
exit(1);
}
}
if (!inode_num)
{
result = (cli_result_t){ .err = ENOENT, .text = "Image "+image_name+" does not exist" };
state = 100;
return;
fprintf(stderr, "Image %s does not exist\n", image_name.c_str());
exit(1);
}
for (auto & ic: parent->cli->st_cli.inode_config)
{
@@ -81,43 +65,37 @@ struct image_changer_t
}
if ((!set_readwrite || !cfg.readonly) &&
(!set_readonly || cfg.readonly) &&
(!new_size && !force_size || cfg.size == new_size) &&
(!new_size || cfg.size == new_size) &&
(new_name == "" || new_name == image_name))
{
result = (cli_result_t){ .text = "No change" };
printf("No change\n");
state = 100;
return;
}
if (new_size != 0 || force_size)
if (new_size != 0)
{
if (cfg.size >= new_size)
{
// Check confirmation when trimming an image with children
if (has_children && !force)
{
result = (cli_result_t){ .err = EINVAL, .text = "Image "+image_name+" has children. Refusing to shrink it without --force" };
state = 100;
return;
fprintf(stderr, "Image %s has children. Refusing to shrink it without --force\n", image_name.c_str());
exit(1);
}
// Shrink the image first
cb = parent->start_rm_data(json11::Json::object {
cb = parent->start_rm(json11::Json::object {
{ "inode", INODE_NO_POOL(inode_num) },
{ "pool", (uint64_t)INODE_POOL(inode_num) },
{ "fsync-interval", fsync_interval },
{ "min-offset", ((new_size+4095)/4096)*4096 },
{ "min-offset", new_size },
});
resume_1:
while (!cb(result))
while (!cb())
{
state = 1;
return;
}
cb = NULL;
if (result.err)
{
state = 100;
return;
}
}
cfg.size = new_size;
}
@@ -131,9 +109,8 @@ resume_1:
// Check confirmation when making an image with children read-write
if (has_children && !force)
{
result = (cli_result_t){ .err = EINVAL, .text = "Image "+image_name+" has children. Refusing to make it read-write without --force" };
state = 100;
return;
fprintf(stderr, "Image %s has children. Refusing to make it read-write without --force\n", image_name.c_str());
exit(1);
}
}
if (new_name != "")
@@ -201,38 +178,34 @@ resume_1:
resume_2:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
if (!parent->etcd_result["succeeded"].bool_value())
{
result = (cli_result_t){ .err = EAGAIN, .text = "Image "+image_name+" was modified by someone else, please repeat your request" };
state = 100;
return;
fprintf(stderr, "Image %s was modified by someone else, please repeat your request\n", image_name.c_str());
exit(1);
}
// Save into inode_config for library users to be able to take it from there immediately
cfg.mod_revision = parent->etcd_result["responses"][0]["response_put"]["header"]["revision"].uint64_value();
if (new_name != "")
{
parent->cli->st_cli.inode_by_name.erase(image_name);
}
parent->cli->st_cli.insert_inode_config(cfg);
result = (cli_result_t){ .err = 0, .text = "Image "+image_name+" modified" };
printf("Image %s modified\n", image_name.c_str());
state = 100;
}
};
std::function<bool(cli_result_t &)> cli_tool_t::start_modify(json11::Json cfg)
std::function<bool(void)> cli_tool_t::start_modify(json11::Json cfg)
{
json11::Json::array cmd = cfg["command"].array_items();
auto changer = new image_changer_t();
changer->parent = this;
changer->image_name = cfg["image"].string_value();
changer->image_name = cmd.size() > 1 ? cmd[1].string_value() : "";
if (changer->image_name == "")
{
fprintf(stderr, "Image name is missing\n");
exit(1);
}
changer->new_name = cfg["rename"].string_value();
changer->new_size = parse_size(cfg["resize"].as_string());
changer->force_size = cfg["force_size"].bool_value();
changer->new_size = parse_size(cfg["resize"].string_value());
if (changer->new_size != 0 && (changer->new_size % 4096))
{
fprintf(stderr, "Image size should be a multiple of 4096\n");
exit(1);
}
changer->force = cfg["force"].bool_value();
changer->set_readonly = cfg["readonly"].bool_value();
changer->set_readwrite = cfg["readwrite"].bool_value();
@@ -240,12 +213,11 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_modify(json11::Json cfg)
if (!changer->fsync_interval)
changer->fsync_interval = 128;
// FIXME Check that the image doesn't have children when shrinking
return [changer](cli_result_t & result)
return [changer]()
{
changer->loop();
if (changer->is_done())
{
result = changer->result;
delete changer;
return true;
}

View File

@@ -63,13 +63,11 @@ struct snap_remover_t
inode_t new_parent = 0;
int state = 0;
int current_child = 0;
std::function<bool(cli_result_t &)> cb;
cli_result_t result;
std::function<bool(void)> cb;
bool is_done()
{
return state == 100;
return state == 9;
}
void loop()
@@ -90,28 +88,13 @@ struct snap_remover_t
goto resume_7;
else if (state == 8)
goto resume_8;
else if (state == 100)
goto resume_100;
assert(!state);
if (from_name == "")
{
result = (cli_result_t){ .err = EINVAL, .text = "Layer to remove argument is missing" };
state = 100;
return;
}
if (to_name == "")
{
to_name = from_name;
}
else if (state == 9)
goto resume_9;
// Get children to merge
get_merge_children();
if (state == 100)
return;
// Try to select an inode for the "inverse" optimized scenario
// Read statistics from etcd to do it
read_stats();
if (state == 100)
return;
state = 1;
resume_1:
if (parent->waiting > 0)
@@ -123,72 +106,42 @@ resume_1:
if (merge_children[current_child] == inverse_child)
continue;
start_merge_child(merge_children[current_child], merge_children[current_child]);
if (state == 100)
return;
resume_2:
while (!cb(result))
while (!cb())
{
state = 2;
return;
}
cb = NULL;
if (result.err)
{
state = 100;
return;
}
parent->change_parent(merge_children[current_child], new_parent, &result);
parent->change_parent(merge_children[current_child], new_parent);
state = 3;
resume_3:
if (parent->waiting > 0)
return;
if (result.err)
{
state = 100;
return;
}
else if (parent->progress)
printf("%s\n", result.text.c_str());
}
// Merge our "inverse" child into our "inverse" parent
if (inverse_child != 0)
{
start_merge_child(inverse_child, inverse_parent);
if (state == 100)
return;
resume_4:
while (!cb(result))
while (!cb())
{
state = 4;
return;
}
cb = NULL;
if (result.err)
{
state = 100;
return;
}
// Delete "inverse" child data
start_delete_source(inverse_child);
if (state == 100)
return;
resume_5:
while (!cb(result))
while (!cb())
{
state = 5;
return;
}
cb = NULL;
if (result.err)
{
state = 100;
return;
}
// Delete "inverse" child metadata, rename parent over it,
// and also change parent links of the previous "inverse" child
rename_inverse_parent();
if (state == 100)
return;
state = 6;
resume_6:
if (parent->waiting > 0)
@@ -201,27 +154,20 @@ resume_6:
continue;
start_delete_source(chain_list[current_child]);
resume_7:
while (!cb(result))
while (!cb())
{
state = 7;
return;
}
cb = NULL;
if (result.err)
{
state = 100;
return;
}
delete_inode_config(chain_list[current_child]);
if (state == 100)
return;
state = 8;
resume_8:
if (parent->waiting > 0)
return;
}
state = 100;
resume_100:
state = 9;
resume_9:
// Done
return;
}
@@ -230,19 +176,7 @@ resume_100:
{
// Get all children of from..to
inode_config_t *from_cfg = parent->get_inode_cfg(from_name);
if (!from_cfg)
{
result = (cli_result_t){ .err = ENOENT, .text = "Layer "+from_name+" not found" };
state = 100;
return;
}
inode_config_t *to_cfg = parent->get_inode_cfg(to_name);
if (!to_cfg)
{
result = (cli_result_t){ .err = ENOENT, .text = "Layer "+to_name+" not found" };
state = 100;
return;
}
// Check that to_cfg is actually a child of from_cfg
// FIXME de-copypaste the following piece of code with snap_merger_t
inode_config_t *cur = to_cfg;
@@ -252,19 +186,16 @@ resume_100:
auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
if (it == parent->cli->st_cli.inode_config.end())
{
char buf[1024];
snprintf(buf, 1024, "Parent inode of layer %s (id 0x%lx) not found", cur->name.c_str(), cur->parent_id);
state = 100;
return;
fprintf(stderr, "Parent inode of layer %s (id %ld) not found\n", cur->name.c_str(), cur->parent_id);
exit(1);
}
cur = &it->second;
chain_list.push_back(cur->num);
}
if (cur->num != from_cfg->num)
{
result = (cli_result_t){ .err = EINVAL, .text = "Layer "+to_name+" is not a child of "+from_name };
state = 100;
return;
fprintf(stderr, "Layer %s is not a child of %s\n", to_name.c_str(), from_name.c_str());
exit(1);
}
new_parent = from_cfg->parent_id;
// Calculate ranks
@@ -332,9 +263,8 @@ resume_100:
parent->waiting--;
if (err != "")
{
result = (cli_result_t){ .err = EIO, .text = "Error reading layer statistics from etcd: "+err };
state = 100;
return;
fprintf(stderr, "Error reading layer statistics from etcd: %s\n", err.c_str());
exit(1);
}
for (auto inode_result: data["responses"].array_items())
{
@@ -345,16 +275,14 @@ resume_100:
sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode, &null_byte);
if (!inode || null_byte != 0)
{
result = (cli_result_t){ .err = EIO, .text = "Bad key returned from etcd: "+kv.key };
state = 100;
return;
fprintf(stderr, "Bad key returned from etcd: %s\n", kv.key.c_str());
exit(1);
}
auto pool_cfg_it = parent->cli->st_cli.pool_config.find(pool_id);
if (pool_cfg_it == parent->cli->st_cli.pool_config.end())
{
result = (cli_result_t){ .err = ENOENT, .text = "Pool "+std::to_string(pool_id)+" does not exist" };
state = 100;
return;
fprintf(stderr, "Pool %u does not exist\n", pool_id);
exit(1);
}
inode = INODE_WITH_POOL(pool_id, inode);
auto & pool_cfg = pool_cfg_it->second;
@@ -396,20 +324,14 @@ resume_100:
auto child_it = parent->cli->st_cli.inode_config.find(inverse_child);
if (child_it == parent->cli->st_cli.inode_config.end())
{
char buf[1024];
snprintf(buf, 1024, "Inode 0x%lx disappeared", inverse_child);
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
state = 100;
return;
fprintf(stderr, "Inode %ld disappeared\n", inverse_child);
exit(1);
}
auto target_it = parent->cli->st_cli.inode_config.find(inverse_parent);
if (target_it == parent->cli->st_cli.inode_config.end())
{
char buf[1024];
snprintf(buf, 1024, "Inode 0x%lx disappeared", inverse_parent);
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
state = 100;
return;
fprintf(stderr, "Inode %ld disappeared\n", inverse_parent);
exit(1);
}
inode_config_t *child_cfg = &child_it->second;
inode_config_t *target_cfg = &target_it->second;
@@ -500,22 +422,18 @@ resume_100:
parent->waiting--;
if (err != "")
{
result = (cli_result_t){ .err = EIO, .text = "Error renaming "+target_name+" to "+child_name+": "+err };
state = 100;
return;
fprintf(stderr, "Error renaming %s to %s: %s\n", target_name.c_str(), child_name.c_str(), err.c_str());
exit(1);
}
if (!res["succeeded"].bool_value())
{
result = (cli_result_t){
.err = EAGAIN,
.text = "Parent ("+target_name+"), child ("+child_name+"), or one of its children"
" configuration was modified during rename",
};
state = 100;
return;
fprintf(
stderr, "Parent (%s), child (%s), or one of its children"
" configuration was modified during rename\n", target_name.c_str(), child_name.c_str()
);
exit(1);
}
if (parent->progress)
printf("Layer %s renamed to %s\n", target_name.c_str(), child_name.c_str());
printf("Layer %s renamed to %s\n", target_name.c_str(), child_name.c_str());
parent->ringloop->wakeup();
});
}
@@ -525,11 +443,8 @@ resume_100:
auto cur_cfg_it = parent->cli->st_cli.inode_config.find(cur);
if (cur_cfg_it == parent->cli->st_cli.inode_config.end())
{
char buf[1024];
snprintf(buf, 1024, "Inode 0x%lx disappeared", cur);
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
state = 100;
return;
fprintf(stderr, "Inode 0x%lx disappeared\n", cur);
exit(1);
}
inode_config_t *cur_cfg = &cur_cfg_it->second;
std::string cur_name = cur_cfg->name;
@@ -560,26 +475,20 @@ resume_100:
} },
},
} },
}, [this, cur, cur_name](std::string err, json11::Json res)
}, [this, cur_name](std::string err, json11::Json res)
{
parent->waiting--;
if (err != "")
{
result = (cli_result_t){ .err = EIO, .text = "Error deleting "+cur_name+": "+err };
state = 100;
return;
fprintf(stderr, "Error deleting %s: %s\n", cur_name.c_str(), err.c_str());
exit(1);
}
if (!res["succeeded"].bool_value())
{
result = (cli_result_t){ .err = EAGAIN, .text = "Layer "+cur_name+" was modified during deletion" };
state = 100;
return;
fprintf(stderr, "Layer %s configuration was modified during deletion\n", cur_name.c_str());
exit(1);
}
// Modify inode_config for library users to be able to take it from there immediately
parent->cli->st_cli.inode_by_name.erase(cur_name);
parent->cli->st_cli.inode_config.erase(cur);
if (parent->progress)
printf("Layer %s deleted\n", cur_name.c_str());
printf("Layer %s deleted\n", cur_name.c_str());
parent->ringloop->wakeup();
});
}
@@ -589,24 +498,17 @@ resume_100:
auto child_it = parent->cli->st_cli.inode_config.find(child_inode);
if (child_it == parent->cli->st_cli.inode_config.end())
{
char buf[1024];
snprintf(buf, 1024, "Inode 0x%lx disappeared", child_inode);
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
state = 100;
return;
fprintf(stderr, "Inode %ld disappeared\n", child_inode);
exit(1);
}
auto target_it = parent->cli->st_cli.inode_config.find(target_inode);
if (target_it == parent->cli->st_cli.inode_config.end())
{
char buf[1024];
snprintf(buf, 1024, "Inode 0x%lx disappeared", target_inode);
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
state = 100;
return;
fprintf(stderr, "Inode %ld disappeared\n", target_inode);
exit(1);
}
cb = parent->start_merge(json11::Json::object {
{ "from", from_name },
{ "to", child_it->second.name },
{ "command", json11::Json::array{ "merge-data", from_name, child_it->second.name } },
{ "target", target_it->second.name },
{ "delete-source", false },
{ "cas", use_cas },
@@ -619,13 +521,10 @@ resume_100:
auto source = parent->cli->st_cli.inode_config.find(inode);
if (source == parent->cli->st_cli.inode_config.end())
{
char buf[1024];
snprintf(buf, 1024, "Inode 0x%lx disappeared", inode);
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
state = 100;
return;
fprintf(stderr, "Inode %ld disappeared\n", inode);
exit(1);
}
cb = parent->start_rm_data(json11::Json::object {
cb = parent->start_rm(json11::Json::object {
{ "inode", inode },
{ "pool", (uint64_t)INODE_POOL(inode) },
{ "fsync-interval", fsync_interval },
@@ -633,12 +532,22 @@ resume_100:
}
};
std::function<bool(cli_result_t &)> cli_tool_t::start_rm(json11::Json cfg)
std::function<bool(void)> cli_tool_t::start_snap_rm(json11::Json cfg)
{
json11::Json::array cmd = cfg["command"].array_items();
auto snap_remover = new snap_remover_t();
snap_remover->parent = this;
snap_remover->from_name = cfg["from"].string_value();
snap_remover->to_name = cfg["to"].string_value();
snap_remover->from_name = cmd.size() > 1 ? cmd[1].string_value() : "";
snap_remover->to_name = cmd.size() > 2 ? cmd[2].string_value() : "";
if (snap_remover->from_name == "")
{
fprintf(stderr, "Layer to remove argument is missing\n");
exit(1);
}
if (snap_remover->to_name == "")
{
snap_remover->to_name = snap_remover->from_name;
}
snap_remover->fsync_interval = cfg["fsync-interval"].uint64_value();
if (!snap_remover->fsync_interval)
snap_remover->fsync_interval = 128;
@@ -646,12 +555,11 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_rm(json11::Json cfg)
snap_remover->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
if (!cfg["writers_stopped"].is_null())
snap_remover->writers_stopped = true;
return [snap_remover](cli_result_t & result)
return [snap_remover]()
{
snap_remover->loop();
if (snap_remover->is_done())
{
result = snap_remover->result;
delete snap_remover;
return true;
}

View File

@@ -32,9 +32,6 @@ struct rm_inode_t
uint64_t pgs_to_list = 0;
bool lists_done = false;
int state = 0;
int error_count = 0;
cli_result_t result;
void start_delete()
{
@@ -77,13 +74,8 @@ struct rm_inode_t
});
if (!lister)
{
result = (cli_result_t){
.err = EIO,
.text = "Failed to list objects of inode "+std::to_string(INODE_NO_POOL(inode))+
" from pool "+std::to_string(INODE_POOL(inode)),
};
state = 100;
return;
fprintf(stderr, "Failed to list inode %lu from pool %u objects\n", INODE_NO_POOL(inode), INODE_POOL(inode));
exit(1);
}
pgs_to_list = parent->cli->list_pg_count(lister);
parent->cli->list_inode_next(lister, parent->parallel_osds);
@@ -126,7 +118,6 @@ struct rm_inode_t
fprintf(stderr, "Failed to remove object %lx:%lx from PG %u (OSD %lu) (retval=%ld)\n",
op->req.rw.inode, op->req.rw.offset,
cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
error_count++;
}
delete op;
cur_list->obj_done++;
@@ -170,43 +161,31 @@ struct rm_inode_t
}
if (lists_done && !lists.size())
{
result = (cli_result_t){
.err = error_count > 0 ? EIO : 0,
.text = error_count > 0 ? "Some blocks were not removed" : (
"Done, inode "+std::to_string(INODE_NO_POOL(inode))+" from pool "+
std::to_string(pool_id)+" removed"),
};
state = 100;
printf("Done, inode %lu in pool %u data removed\n", INODE_NO_POOL(inode), pool_id);
state = 2;
}
}
bool is_done()
bool loop()
{
return state == 100;
}
void loop()
{
if (state == 1)
goto resume_1;
if (state == 100)
return;
if (!pool_id)
if (state == 0)
{
result = (cli_result_t){ .err = EINVAL, .text = "Pool is not specified" };
state = 100;
return;
start_delete();
state = 1;
}
start_delete();
if (state == 100)
return;
state = 1;
resume_1:
continue_delete();
else if (state == 1)
{
continue_delete();
}
else if (state == 2)
{
return true;
}
return false;
}
};
std::function<bool(cli_result_t &)> cli_tool_t::start_rm_data(json11::Json cfg)
std::function<bool(void)> cli_tool_t::start_rm(json11::Json cfg)
{
auto remover = new rm_inode_t();
remover->parent = this;
@@ -214,16 +193,19 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_rm_data(json11::Json cfg)
remover->pool_id = cfg["pool"].uint64_value();
if (remover->pool_id)
{
remover->inode = (remover->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (((uint64_t)remover->pool_id) << (64-POOL_ID_BITS));
remover->inode = (remover->inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (((uint64_t)remover->pool_id) << (64-POOL_ID_BITS));
}
remover->pool_id = INODE_POOL(remover->inode);
remover->min_offset = cfg["min-offset"].uint64_value();
return [remover](cli_result_t & result)
if (!remover->pool_id)
{
remover->loop();
if (remover->is_done())
fprintf(stderr, "pool is missing\n");
exit(1);
}
remover->min_offset = cfg["min-offset"].uint64_value();
return [remover]()
{
if (remover->loop())
{
result = remover->result;
delete remover;
return true;
}

View File

@@ -11,9 +11,9 @@
#include <sys/stat.h>
// Calculate offsets for a block device and print OSD command line parameters
std::function<bool(cli_result_t &)> cli_tool_t::simple_offsets(json11::Json cfg)
std::function<bool(void)> cli_tool_t::simple_offsets(json11::Json cfg)
{
std::string device = cfg["device"].string_value();
std::string device = cfg["command"][1].string_value();
uint64_t object_size = parse_size(cfg["object_size"].string_value());
uint64_t bitmap_granularity = parse_size(cfg["bitmap_granularity"].string_value());
uint64_t journal_size = parse_size(cfg["journal_size"].string_value());

View File

@@ -83,12 +83,6 @@ resume_1:
resume_2:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
fprintf(stderr, "%s\n", parent->etcd_err.text.c_str());
state = 100;
return;
}
mon_members = parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items();
osd_stats = parent->etcd_result["responses"][1]["response_range"]["kvs"].array_items();
if (parent->etcd_result["responses"][2]["response_range"]["kvs"].array_items().size() > 0)
@@ -223,7 +217,7 @@ resume_2:
// JSON output
printf("%s\n", json11::Json(json11::Json::object {
{ "etcd_alive", etcd_alive },
{ "etcd_count", (uint64_t)etcd_states.size() },
{ "etcd_count", etcd_states.size() },
{ "etcd_db_size", etcd_db_size },
{ "mon_count", mon_count },
{ "mon_master", mon_master },
@@ -283,16 +277,16 @@ resume_2:
}
};
std::function<bool(cli_result_t &)> cli_tool_t::start_status(json11::Json cfg)
std::function<bool(void)> cli_tool_t::start_status(json11::Json cfg)
{
json11::Json::array cmd = cfg["command"].array_items();
auto printer = new status_printer_t();
printer->parent = this;
return [printer](cli_result_t & result)
return [printer]()
{
printer->loop();
if (printer->is_done())
{
result = { .err = 0 };
delete printer;
return true;
}

View File

@@ -9,7 +9,6 @@
#define PART_SENT 1
#define PART_DONE 2
#define PART_ERROR 4
#define PART_RETRY 8
#define CACHE_DIRTY 1
#define CACHE_FLUSHING 2
#define CACHE_REPEATING 3
@@ -374,11 +373,6 @@ void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes
continue_ops();
}
bool cluster_client_t::get_immediate_commit()
{
return immediate_commit;
}
void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd)
{
if (msgr.wanted_peers.find(peer_osd) != msgr.wanted_peers.end())
@@ -676,17 +670,14 @@ resume_2:
if (!try_send(op, i))
{
// We'll need to retry again
if (op->parts[i].flags & PART_RETRY)
op->up_wait = true;
if (!retry_timeout_id)
{
op->up_wait = true;
if (!retry_timeout_id)
retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
{
retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
{
retry_timeout_id = 0;
continue_ops(true);
});
}
retry_timeout_id = 0;
continue_ops(true);
});
}
op->state = 2;
}
@@ -755,7 +746,7 @@ resume_3:
{
for (int i = 0; i < op->parts.size(); i++)
{
op->parts[i].flags = PART_RETRY;
op->parts[i].flags = 0;
}
goto resume_2;
}

View File

@@ -118,8 +118,6 @@ public:
bool is_ready();
void on_ready(std::function<void(void)> fn);
bool get_immediate_commit();
static void copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers);
void continue_ops(bool up_retry = false);
inode_list_t *list_inode_start(inode_t inode,

View File

@@ -89,7 +89,7 @@ void etcd_state_client_t::etcd_call_oneshot(std::string etcd_address, std::strin
"Connection: close\r\n"
"\r\n"+req;
auto http_cli = http_init(tfd);
auto cb = [http_cli, callback](const http_response_t *response)
auto cb = [this, http_cli, callback](const http_response_t *response)
{
std::string err;
json11::Json data;
@@ -338,14 +338,9 @@ void etcd_state_client_t::start_etcd_watcher()
{
if (data["result"]["created"].bool_value())
{
uint64_t watch_id = data["result"]["watch_id"].uint64_value();
if (watch_id == ETCD_CONFIG_WATCH_ID ||
watch_id == ETCD_PG_STATE_WATCH_ID ||
watch_id == ETCD_PG_HISTORY_WATCH_ID ||
watch_id == ETCD_OSD_STATE_WATCH_ID)
etcd_watches_initialised++;
if (etcd_watches_initialised == 4 && this->log_level > 0)
if (etcd_watches_initialised == 3 && this->log_level > 0)
fprintf(stderr, "Successfully subscribed to etcd at %s\n", selected_etcd_address.c_str());
etcd_watches_initialised++;
}
if (data["result"]["canceled"].bool_value())
{
@@ -474,10 +469,6 @@ void etcd_state_client_t::start_etcd_watcher()
{ "progress_notify", true },
} }
}).dump());
if (on_start_watcher_hook)
{
on_start_watcher_hook(etcd_watch_ws);
}
if (ws_keepalive_timer < 0)
{
ws_keepalive_timer = tfd->set_timer(etcd_ws_keepalive_interval*1000, true, [this](int)
@@ -963,10 +954,6 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
}
if (!value.is_object())
{
if (on_inode_change_hook != NULL)
{
on_inode_change_hook(inode_num, true);
}
this->inode_config.erase(inode_num);
}
else
@@ -981,47 +968,38 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
{
fprintf(
stderr, "Inode %lu/%lu parent_pool value is invalid, ignoring parent setting\n",
inode_num >> (64-POOL_ID_BITS), inode_num & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)
inode_num >> (64-POOL_ID_BITS), inode_num & ((1l << (64-POOL_ID_BITS)) - 1)
);
parent_inode_num = 0;
}
else
parent_inode_num |= parent_pool_id << (64-POOL_ID_BITS);
}
insert_inode_config((inode_config_t){
inode_config_t cfg = (inode_config_t){
.num = inode_num,
.name = value["name"].string_value(),
.size = value["size"].uint64_value(),
.parent_id = parent_inode_num,
.readonly = value["readonly"].bool_value(),
.meta = value["meta"],
.mod_revision = kv.mod_revision,
});
};
this->inode_config[inode_num] = cfg;
if (cfg.name != "")
{
this->inode_by_name[cfg.name] = inode_num;
for (auto w: watches)
{
if (w->name == value["name"].string_value())
{
w->cfg = cfg;
}
}
}
}
}
}
}
void etcd_state_client_t::insert_inode_config(const inode_config_t & cfg)
{
this->inode_config[cfg.num] = cfg;
if (cfg.name != "")
{
this->inode_by_name[cfg.name] = cfg.num;
for (auto w: watches)
{
if (w->name == cfg.name)
{
w->cfg = cfg;
}
}
}
if (on_inode_change_hook != NULL)
{
on_inode_change_hook(cfg.num, false);
}
}
inode_watch_t* etcd_state_client_t::watch_inode(std::string name)
{
inode_watch_t *watch = new inode_watch_t;
@@ -1064,10 +1042,6 @@ json11::Json::object etcd_state_client_t::serialize_inode_cfg(inode_config_t *cf
{
new_cfg["readonly"] = true;
}
if (cfg->meta.is_object())
{
new_cfg["meta"] = cfg->meta;
}
return new_cfg;
}

View File

@@ -56,8 +56,6 @@ struct inode_config_t
uint64_t size;
inode_t parent_id;
bool readonly;
// Arbitrary metadata
json11::Json meta;
// Change revision of the metadata in etcd
uint64_t mod_revision;
};
@@ -111,8 +109,6 @@ public:
std::function<void(pool_id_t, pg_num_t)> on_change_pg_history_hook;
std::function<void(osd_num_t)> on_change_osd_state_hook;
std::function<void()> on_reload_hook;
std::function<void(inode_t, bool)> on_inode_change_hook;
std::function<void(http_co_t *)> on_start_watcher_hook;
json11::Json::object serialize_inode_cfg(inode_config_t *cfg);
etcd_kv_t parse_etcd_kv(const json11::Json & kv_json);
@@ -126,7 +122,6 @@ public:
void load_pgs();
void parse_state(const etcd_kv_t & kv);
void parse_config(const json11::Json & config);
void insert_inode_config(const inode_config_t & cfg);
inode_watch_t* watch_inode(std::string name);
void close_watch(inode_watch_t* watch);
int address_count();

View File

@@ -214,14 +214,14 @@ static int sec_setup(struct thread_data *td)
if (!o->image)
{
if (!(o->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)))
if (!(o->inode & ((1l << (64-POOL_ID_BITS)) - 1)))
{
td_verror(td, EINVAL, "inode number is missing");
return 1;
}
if (o->pool)
{
o->inode = (o->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (o->pool << (64-POOL_ID_BITS));
o->inode = (o->inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (o->pool << (64-POOL_ID_BITS));
}
if (!(o->inode >> (64-POOL_ID_BITS)))
{

2
src/lrc/Makefile Normal file
View File

@@ -0,0 +1,2 @@
mat: mat.c
gcc -O3 -I/usr/include/jerasure -o mat mat.c -lJerasure

277
src/lrc/mat.c Normal file
View File

@@ -0,0 +1,277 @@
#include <jerasure/reed_sol.h>
#include <jerasure.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Generate LRC matrix: (groups*local + global) code rows with (data_drives) columns.
// w should be >= log2(data_drives + groups*local + global), but not necessarily 8/16/32.
//
// The first groups*local rows are local parity rows: row (gr*local + l) is a copy of row l
// of the Vandermonde coding matrix with every column outside of group gr zeroed.
// The last (global) rows are the remaining rows of the Vandermonde coding matrix copied as is.
//
// Returns a malloc()'ed row-major array of (groups*local + global) * data_drives ints which
// the caller must free(), or NULL if the parameters are invalid or an allocation fails.
int* reed_sol_vandermonde_lrc_matrix(int data_drives, int groups, int local, int global, int w)
{
    if (w < 0 || w > 32 || data_drives < 1 || groups < 0 || local < 0 || global < 0)
    {
        return NULL;
    }
    // Use 64-bit arithmetic: (1 << w) overflows int for w >= 31
    long long total_drives = (long long)data_drives + (long long)groups*local + global;
    if (total_drives > (1LL << w) || total_drives > 0x7fffffffLL)
    {
        return NULL;
    }
    // Number of data columns per local group. It is used as a divisor below,
    // so it must not be zero when there are local parity rows to fill
    int group_size = groups > 0 ? data_drives/groups : 0;
    if (groups > 0 && local > 0 && group_size == 0)
    {
        return NULL;
    }
    int code_rows = groups*local + global;
    int *matrix = reed_sol_vandermonde_coding_matrix(data_drives, local+global, w);
    if (!matrix)
    {
        return NULL;
    }
    // The result holds code_rows rows of data_drives ints each
    int *lrc_matrix = (int*)malloc(sizeof(int) * (size_t)code_rows * (size_t)data_drives);
    if (!lrc_matrix)
    {
        free(matrix);
        return NULL;
    }
    // Local parity rows
    for (int gr = 0; gr < groups; gr++)
    {
        for (int l = 0; l < local; l++)
        {
            for (int j = 0; j < data_drives; j++)
            {
                lrc_matrix[(gr*local+l)*data_drives + j] = (j / group_size) == gr ? matrix[l*data_drives + j] : 0;
            }
        }
    }
    // Global parity rows
    for (int i = 0; i < global; i++)
    {
        for (int j = 0; j < data_drives; j++)
        {
            lrc_matrix[(groups*local+i)*data_drives + j] = matrix[(local+i)*data_drives + j];
        }
    }
    free(matrix);
    return lrc_matrix;
}
// Intended to verify that the LRC built from the given parameters is
// Maximally Reconstructible (MR-LRC).
// Example of a MR-LRC: (8, 2, 1, 2, 6, 8)
// The body is currently empty: no check is performed and nothing is printed.
void check_mr_lrc(int data_drives, int groups, int local, int global, int matrix_w, int w, int print)
{
    // All parameters are unused for now
    (void)data_drives;
    (void)groups;
    (void)local;
    (void)global;
    (void)matrix_w;
    (void)w;
    (void)print;
}
// Brute-force check of the generated LRC matrix.
//
// Builds the full (identity + LRC) matrix for hard-coded parameters, prints it, and then
// for every loss count from (local+global+1) to (groups*local+global) enumerates
// combinations of surviving rows. For each combination it first estimates whether recovery
// should be possible by counting local/global parity chunks, and if so tries to invert
// an n x n submatrix of the surviving rows over GF(2^W).
// Prints OK / FAIL / IMPOSSIBLE for every combination and a summary at the end.
// Returns 0 on completion, 1 if the LRC matrix can't be generated.
int main()
{
    int W = 8, MATRIX_W = 6;
    int n = 8, groups = 2, local = 1, global = 2;
    //n = 4, groups = 2, local = 1, global = 1;
    int total_rows = n+groups*local+global;
    int *matrix = reed_sol_vandermonde_lrc_matrix(n, groups, local, global, MATRIX_W);
    if (!matrix)
    {
        // The generator returns NULL for invalid parameters; don't memcpy() from NULL
        fprintf(stderr, "Failed to generate LRC matrix\n");
        return 1;
    }
    int *lrc_matrix = (int*)malloc(sizeof(int) * total_rows*n);
    // Fill identity+LRC matrix
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            lrc_matrix[i*n + j] = j == i ? 1 : 0;
    memcpy(lrc_matrix + n*n, matrix, (total_rows-n)*n*sizeof(int));
    free(matrix);
    matrix = NULL;
    // Print LRC matrix
    for (int i = 0; i < total_rows; i++)
    {
        for (int j = 0; j < n; j++)
        {
            printf("%d ", lrc_matrix[i*n+j]);
        }
        printf("\n");
    }
    int impossible = 0, success = 0, failures = 0;
    int *lost_per_group = (int*)malloc(sizeof(int) * groups);
    for (int lost = local+global+1; lost <= groups*local+global; lost++)
    //int lost = groups*local+global;
    {
        int *erased_matrix = (int*)malloc(sizeof(int) * (total_rows-lost)*n);
        int *inverted_matrix = (int*)malloc(sizeof(int) * (total_rows-lost)*n);
        // p[] holds the (ascending) indexes of surviving rows
        int *p = (int*)malloc(sizeof(int) * (total_rows-lost));
        for (int i = 0; i < n; i++)
            p[i] = i;
        int *p2 = (int*)malloc(sizeof(int) * n);
        if (total_rows-lost > n)
        {
            p[n-1] = n; // skip combinations with all N data disks (0..n-1)
            for (int i = n; i < total_rows-lost; i++)
                p[i] = i+1;
            p[total_rows-lost-1]--; // will be incremented on the first step
        }
        int inc = total_rows-lost-1;
        while (1)
        {
            p[inc]++;
            if (p[inc] >= n+groups*local+global)
            {
                if (inc == 0)
                    break;
                inc--;
            }
            else if (inc+1 < total_rows-lost)
            {
                p[inc+1] = p[inc];
                inc++;
            }
            else
            {
                // Check if it should be recoverable
                for (int gr = 0; gr < groups; gr++)
                {
                    lost_per_group[gr] = ((gr+1)*(n/groups) > n ? (n - gr*(n/groups)) : n/groups);
                }
                // Calculate count of data chunks lost in each group
                // NOTE(review): the group index assumes n is divisible by groups - confirm
                for (int j = 0; j < total_rows-lost; j++)
                {
                    // Only surviving DATA rows belong to a data group. Checking the slot
                    // index (j) instead of the row index (p[j]) would also count parity
                    // rows and write beyond the end of lost_per_group[]
                    if (p[j] < n)
                    {
                        lost_per_group[(p[j] / (n/groups))]--;
                    }
                }
                // Every local parity chunk is supposed to restore 1 missing chunk inside its group
                // So, subtract local parity chunk counts from each group lost chunk count
                for (int j = 0; j < total_rows-lost; j++)
                {
                    if (p[j] >= n && p[j] < n+groups*local && lost_per_group[(p[j]-n)/local] > 0)
                    {
                        lost_per_group[(p[j]-n)/local]--;
                    }
                }
                // Every global parity chunk is supposed to restore 1 chunk of all that are still missing
                int still_missing = 0;
                for (int gr = 0; gr < groups; gr++)
                {
                    still_missing += lost_per_group[gr];
                }
                for (int j = 0; j < total_rows-lost; j++)
                {
                    if (p[j] >= n+groups*local && still_missing > 0)
                    {
                        still_missing--;
                    }
                }
                if (still_missing <= 0)
                {
                    // We hope it can be recoverable. Try to invert it
                    int invert_ok = -1;
                    if (total_rows-lost == n)
                    {
                        for (int i = 0; i < n; i++)
                            for (int j = 0; j < n; j++)
                                erased_matrix[i*n+j] = lrc_matrix[p[i]*n+j];
                        invert_ok = jerasure_invert_matrix(erased_matrix, inverted_matrix, n, W);
                    }
                    else
                    {
                        // Check submatrices: enumerate combinations of n rows out of the
                        // surviving ones and stop at the first invertible one
                        for (int i = 0; i < n; i++)
                            p2[i] = i;
                        p2[n-1]--; // will be incremented on the first step
                        int inc2 = n-1;
                        while (1)
                        {
                            p2[inc2]++;
                            if (p2[inc2] >= total_rows-lost)
                            {
                                if (inc2 == 0)
                                    break;
                                inc2--;
                            }
                            else if (inc2+1 < n)
                            {
                                p2[inc2+1] = p2[inc2];
                                inc2++;
                            }
                            else
                            {
                                for (int i = 0; i < n; i++)
                                    for (int j = 0; j < n; j++)
                                        erased_matrix[i*n+j] = lrc_matrix[p[p2[i]]*n+j];
                                invert_ok = jerasure_invert_matrix(erased_matrix, inverted_matrix, n, W);
                                if (invert_ok == 0)
                                    break;
                            }
                        }
                    }
                    if (invert_ok < 0)
                    {
                        failures++;
                        printf("\nFAIL: ");
                        for (int i = 0; i < total_rows-lost; i++)
                        {
                            printf("%d ", p[i]);
                        }
                        printf("\nDIRECT:\n");
                        for (int i = 0; i < total_rows-lost; i++)
                        {
                            for (int j = 0; j < n; j++)
                                printf("%d ", lrc_matrix[p[i]*n+j]);
                            printf("\n");
                        }
                        printf("INVERSE:\n");
                        // The inversion works on n x n matrices, so only the first n rows
                        // of inverted_matrix are ever written - don't print the rest
                        for (int i = 0; i < n; i++)
                        {
                            for (int j = 0; j < n; j++)
                                printf("%d ", inverted_matrix[i*n+j]);
                            printf("\n");
                        }
                    }
                    else
                    {
                        success++;
                        printf("OK: ");
                        for (int i = 0; i < total_rows-lost; i++)
                        {
                            printf("%d ", p[i]);
                        }
                        printf("\n");
                    }
                }
                else
                {
                    impossible++;
                    printf("IMPOSSIBLE: ");
                    for (int i = 0; i < total_rows-lost; i++)
                    {
                        printf("%d ", p[i]);
                    }
                    printf("\n");
                }
            }
        }
        free(p2);
        free(p);
        free(inverted_matrix);
        free(erased_matrix);
    }
    free(lost_per_group);
    free(lrc_matrix);
    printf("\n%d recovered, %d impossible, %d failures\n", success, impossible, failures);
    return 0;
}
// 1 1 1 1 0 0 0 0
// 0 0 0 0 1 1 1 1
// 1 55 39 73 84 181 225 217
// 1 172 70 235 143 34 200 101
//
// Can't recover
// 1 2 4 5 8 9 10 11 -1
// 2 3 4 6 8 9 10 11 -1
// FULL:
// 1 0 0 0 0 0 0 0
// 0 1 0 0 0 0 0 0
// 0 0 1 0 0 0 0 0
// 0 0 0 1 0 0 0 0
// 0 0 0 0 1 0 0 0
// 0 0 0 0 0 1 0 0
// 0 0 0 0 0 0 1 0
// 0 0 0 0 0 0 0 1
// 1 1 1 1 0 0 0 0
// 0 0 0 0 1 1 1 1
// 1 55 39 73 84 181 225 217
// 1 172 70 235 143 34 200 101
// FIRST UNRECOVERABLE:
// 0 1 0 0 0 0 0 0
// 0 0 1 0 0 0 0 0
// 0 0 0 0 1 0 0 0
// 0 0 0 0 0 1 0 0
// 1 1 1 1 0 0 0 0
// 0 0 0 0 1 1 1 1
// 1 55 39 73 84 181 225 217
// 1 172 70 235 143 34 200 101
// SECOND UNRECOVERABLE:
// 0 0 1 0 0 0 0 0
// 0 0 0 1 0 0 0 0
// 0 0 0 0 1 0 0 0
// 0 0 0 0 0 0 1 0
// 1 1 1 1 0 0 0 0
// 0 0 0 0 1 1 1 1
// 1 55 39 73 84 181 225 217
// 1 172 70 235 143 34 200 101
// Ho ho ho

View File

@@ -189,7 +189,7 @@ public:
uint64_t pool = cfg["pool"].uint64_value();
if (pool)
{
inode = (inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (pool << (64-POOL_ID_BITS));
inode = (inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (pool << (64-POOL_ID_BITS));
}
if (!(inode >> (64-POOL_ID_BITS)))
{

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,190 +0,0 @@
/*
* Please do not edit this file.
* It was generated using rpcgen.
*/
#ifndef _PORTMAP_H_RPCGEN
#define _PORTMAP_H_RPCGEN
#include "xdr_impl.h"
#ifdef __cplusplus
extern "C" {
#endif
#define PMAP_PORT 111
struct pmap2_mapping {
u_int prog;
u_int vers;
u_int prot;
u_int port;
};
typedef struct pmap2_mapping pmap2_mapping;
struct pmap2_call_args {
u_int prog;
u_int vers;
u_int proc;
xdr_string_t args;
};
typedef struct pmap2_call_args pmap2_call_args;
struct pmap2_call_result {
u_int port;
xdr_string_t res;
};
typedef struct pmap2_call_result pmap2_call_result;
struct pmap2_mapping_list {
pmap2_mapping map;
struct pmap2_mapping_list *next;
};
typedef struct pmap2_mapping_list pmap2_mapping_list;
struct pmap2_dump_result {
struct pmap2_mapping_list *list;
};
typedef struct pmap2_dump_result pmap2_dump_result;
struct pmap3_string_result {
xdr_string_t addr;
};
typedef struct pmap3_string_result pmap3_string_result;
struct pmap3_mapping {
u_int prog;
u_int vers;
xdr_string_t netid;
xdr_string_t addr;
xdr_string_t owner;
};
typedef struct pmap3_mapping pmap3_mapping;
struct pmap3_mapping_list {
pmap3_mapping map;
struct pmap3_mapping_list *next;
};
typedef struct pmap3_mapping_list pmap3_mapping_list;
struct pmap3_dump_result {
struct pmap3_mapping_list *list;
};
typedef struct pmap3_dump_result pmap3_dump_result;
struct pmap3_call_args {
u_int prog;
u_int vers;
u_int proc;
xdr_string_t args;
};
typedef struct pmap3_call_args pmap3_call_args;
struct pmap3_call_result {
u_int port;
xdr_string_t res;
};
typedef struct pmap3_call_result pmap3_call_result;
struct pmap3_netbuf {
u_int maxlen;
xdr_string_t buf;
};
typedef struct pmap3_netbuf pmap3_netbuf;
typedef pmap2_mapping PMAP2SETargs;
typedef pmap2_mapping PMAP2UNSETargs;
typedef pmap2_mapping PMAP2GETPORTargs;
typedef pmap2_call_args PMAP2CALLITargs;
typedef pmap2_call_result PMAP2CALLITres;
typedef pmap2_dump_result PMAP2DUMPres;
typedef pmap3_mapping PMAP3SETargs;
typedef pmap3_mapping PMAP3UNSETargs;
typedef pmap3_mapping PMAP3GETADDRargs;
typedef pmap3_string_result PMAP3GETADDRres;
typedef pmap3_dump_result PMAP3DUMPres;
typedef pmap3_call_result PMAP3CALLITargs;
typedef pmap3_call_result PMAP3CALLITres;
typedef pmap3_netbuf PMAP3UADDR2TADDRres;
typedef pmap3_netbuf PMAP3TADDR2UADDRargs;
typedef pmap3_string_result PMAP3TADDR2UADDRres;
#define PMAP_PROGRAM 100000
#define PMAP_V2 2
#define PMAP2_NULL 0
#define PMAP2_SET 1
#define PMAP2_UNSET 2
#define PMAP2_GETPORT 3
#define PMAP2_DUMP 4
#define PMAP2_CALLIT 5
#define PMAP_V3 3
#define PMAP3_NULL 0
#define PMAP3_SET 1
#define PMAP3_UNSET 2
#define PMAP3_GETADDR 3
#define PMAP3_DUMP 4
#define PMAP3_CALLIT 5
#define PMAP3_GETTIME 6
#define PMAP3_UADDR2TADDR 7
#define PMAP3_TADDR2UADDR 8
/* the xdr functions */
extern bool_t xdr_pmap2_mapping (XDR *, pmap2_mapping*);
extern bool_t xdr_pmap2_call_args (XDR *, pmap2_call_args*);
extern bool_t xdr_pmap2_call_result (XDR *, pmap2_call_result*);
extern bool_t xdr_pmap2_mapping_list (XDR *, pmap2_mapping_list*);
extern bool_t xdr_pmap2_dump_result (XDR *, pmap2_dump_result*);
extern bool_t xdr_pmap3_string_result (XDR *, pmap3_string_result*);
extern bool_t xdr_pmap3_mapping (XDR *, pmap3_mapping*);
extern bool_t xdr_pmap3_mapping_list (XDR *, pmap3_mapping_list*);
extern bool_t xdr_pmap3_dump_result (XDR *, pmap3_dump_result*);
extern bool_t xdr_pmap3_call_args (XDR *, pmap3_call_args*);
extern bool_t xdr_pmap3_call_result (XDR *, pmap3_call_result*);
extern bool_t xdr_pmap3_netbuf (XDR *, pmap3_netbuf*);
extern bool_t xdr_PMAP2SETargs (XDR *, PMAP2SETargs*);
extern bool_t xdr_PMAP2UNSETargs (XDR *, PMAP2UNSETargs*);
extern bool_t xdr_PMAP2GETPORTargs (XDR *, PMAP2GETPORTargs*);
extern bool_t xdr_PMAP2CALLITargs (XDR *, PMAP2CALLITargs*);
extern bool_t xdr_PMAP2CALLITres (XDR *, PMAP2CALLITres*);
extern bool_t xdr_PMAP2DUMPres (XDR *, PMAP2DUMPres*);
extern bool_t xdr_PMAP3SETargs (XDR *, PMAP3SETargs*);
extern bool_t xdr_PMAP3UNSETargs (XDR *, PMAP3UNSETargs*);
extern bool_t xdr_PMAP3GETADDRargs (XDR *, PMAP3GETADDRargs*);
extern bool_t xdr_PMAP3GETADDRres (XDR *, PMAP3GETADDRres*);
extern bool_t xdr_PMAP3DUMPres (XDR *, PMAP3DUMPres*);
extern bool_t xdr_PMAP3CALLITargs (XDR *, PMAP3CALLITargs*);
extern bool_t xdr_PMAP3CALLITres (XDR *, PMAP3CALLITres*);
extern bool_t xdr_PMAP3UADDR2TADDRres (XDR *, PMAP3UADDR2TADDRres*);
extern bool_t xdr_PMAP3TADDR2UADDRargs (XDR *, PMAP3TADDR2UADDRargs*);
extern bool_t xdr_PMAP3TADDR2UADDRres (XDR *, PMAP3TADDR2UADDRres*);
#ifdef __cplusplus
}
#endif
#endif /* !_PORTMAP_H_RPCGEN */

View File

@@ -1,168 +0,0 @@
/*
Copyright (c) 2014, Ronnie Sahlberg
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The views and conclusions contained in the software and documentation are those
of the authors and should not be interpreted as representing official policies,
either expressed or implied, of the FreeBSD Project.
*/
const PMAP_PORT = 111; /* portmapper port number */
struct pmap2_mapping {
unsigned int prog;
unsigned int vers;
unsigned int prot;
unsigned int port;
};
struct pmap2_call_args {
unsigned int prog;
unsigned int vers;
unsigned int proc;
opaque args<>;
};
struct pmap2_call_result {
unsigned int port;
opaque res<>;
};
struct pmap2_mapping_list {
pmap2_mapping map;
pmap2_mapping_list *next;
};
struct pmap2_dump_result {
struct pmap2_mapping_list *list;
};
struct pmap3_string_result {
string addr<>;
};
struct pmap3_mapping {
unsigned int prog;
unsigned int vers;
string netid<>;
string addr<>;
string owner<>;
};
struct pmap3_mapping_list {
pmap3_mapping map;
pmap3_mapping_list *next;
};
struct pmap3_dump_result {
struct pmap3_mapping_list *list;
};
struct pmap3_call_args {
unsigned int prog;
unsigned int vers;
unsigned int proc;
opaque args<>;
};
struct pmap3_call_result {
unsigned int port;
opaque res<>;
};
struct pmap3_netbuf {
unsigned int maxlen;
/* This pretty much contains a sockaddr_storage.
* Beware differences in endianess for ss_family
* and whether or not ss_len exists.
*/
opaque buf<>;
};
typedef pmap2_mapping PMAP2SETargs;
typedef pmap2_mapping PMAP2UNSETargs;
typedef pmap2_mapping PMAP2GETPORTargs;
typedef pmap2_call_args PMAP2CALLITargs;
typedef pmap2_call_result PMAP2CALLITres;
typedef pmap2_dump_result PMAP2DUMPres;
typedef pmap3_mapping PMAP3SETargs;
typedef pmap3_mapping PMAP3UNSETargs;
typedef pmap3_mapping PMAP3GETADDRargs;
typedef pmap3_string_result PMAP3GETADDRres;
typedef pmap3_dump_result PMAP3DUMPres;
/* CALLIT arguments are the call-args structure (prog/vers/proc/args), same as
 * PMAP2CALLITargs; only the reply (PMAP3CALLITres) is a pmap3_call_result.
 * Re-run run-rpcgen.sh after editing this file to refresh portmap.h and portmap_xdr.cpp. */
typedef pmap3_call_args PMAP3CALLITargs;
typedef pmap3_call_result PMAP3CALLITres;
typedef pmap3_netbuf PMAP3UADDR2TADDRres;
typedef pmap3_netbuf PMAP3TADDR2UADDRargs;
typedef pmap3_string_result PMAP3TADDR2UADDRres;
program PMAP_PROGRAM {
version PMAP_V2 {
void
PMAP2_NULL(void) = 0;
uint32_t
PMAP2_SET(PMAP2SETargs) = 1;
uint32_t
PMAP2_UNSET(PMAP2UNSETargs) = 2;
uint32_t
PMAP2_GETPORT(PMAP2GETPORTargs) = 3;
PMAP2DUMPres
PMAP2_DUMP(void) = 4;
PMAP2CALLITres
PMAP2_CALLIT(PMAP2CALLITargs) = 5;
} = 2;
version PMAP_V3 {
void
PMAP3_NULL(void) = 0;
uint32_t
PMAP3_SET(PMAP3SETargs) = 1;
uint32_t
PMAP3_UNSET(PMAP3UNSETargs) = 2;
PMAP3GETADDRres
PMAP3_GETADDR(PMAP3GETADDRargs) = 3;
PMAP3DUMPres
PMAP3_DUMP(void) = 4;
PMAP3CALLITres
PMAP3_CALLIT(PMAP3CALLITargs) = 5;
uint32_t
PMAP3_GETTIME(void) = 6;
PMAP3UADDR2TADDRres
PMAP3_UADDR2TADDR(string) = 7;
PMAP3TADDR2UADDRres
PMAP3_TADDR2UADDR(PMAP3TADDR2UADDRargs) = 8;
} = 3;
} = 100000;

View File

@@ -1,406 +0,0 @@
/*
* Please do not edit this file.
* It was generated using rpcgen.
*/
#include "portmap.h"
#include "xdr_impl_inline.h"
/*
 * Serialize or deserialize a pmap2_mapping: prog, vers, prot and port,
 * each as an unsigned int, in that order.
 *
 * Every x_op value takes the same element-by-element path, so no
 * per-operation branching is needed (the inline "buf" fast path sat behind
 * "if (1) ... else" and was never reachable).
 *
 * Returns TRUE on success, FALSE as soon as one field fails.
 */
bool_t
xdr_pmap2_mapping (XDR *xdrs, pmap2_mapping *objp)
{
    if (!xdr_u_int (xdrs, &objp->prog))
        return FALSE;
    if (!xdr_u_int (xdrs, &objp->vers))
        return FALSE;
    if (!xdr_u_int (xdrs, &objp->prot))
        return FALSE;
    if (!xdr_u_int (xdrs, &objp->port))
        return FALSE;
    return TRUE;
}
/*
 * Serialize or deserialize a pmap2_call_args: prog, vers and proc as
 * unsigned ints, followed by the opaque argument bytes (no length limit).
 *
 * Every x_op value takes the same element-by-element path, so no
 * per-operation branching is needed (the inline "buf" fast path sat behind
 * "if (1) ... else" and was never reachable).
 *
 * Returns TRUE on success, FALSE as soon as one field fails.
 */
bool_t
xdr_pmap2_call_args (XDR *xdrs, pmap2_call_args *objp)
{
    if (!xdr_u_int (xdrs, &objp->prog))
        return FALSE;
    if (!xdr_u_int (xdrs, &objp->vers))
        return FALSE;
    if (!xdr_u_int (xdrs, &objp->proc))
        return FALSE;
    if (!xdr_bytes(xdrs, &objp->args, ~0))
        return FALSE;
    return TRUE;
}
bool_t
xdr_pmap2_call_result (XDR *xdrs, pmap2_call_result *objp)
{
if (!xdr_u_int (xdrs, &objp->port))
return FALSE;
if (!xdr_bytes(xdrs, &objp->res, ~0))
return FALSE;
return TRUE;
}
bool_t
xdr_pmap2_mapping_list (XDR *xdrs, pmap2_mapping_list *objp)
{
if (!xdr_pmap2_mapping (xdrs, &objp->map))
return FALSE;
if (!xdr_pointer (xdrs, (char **)&objp->next, sizeof (pmap2_mapping_list), (xdrproc_t) xdr_pmap2_mapping_list))
return FALSE;
return TRUE;
}
bool_t
xdr_pmap2_dump_result (XDR *xdrs, pmap2_dump_result *objp)
{
if (!xdr_pointer (xdrs, (char **)&objp->list, sizeof (pmap2_mapping_list), (xdrproc_t) xdr_pmap2_mapping_list))
return FALSE;
return TRUE;
}
bool_t
xdr_pmap3_string_result (XDR *xdrs, pmap3_string_result *objp)
{
if (!xdr_string (xdrs, &objp->addr, ~0))
return FALSE;
return TRUE;
}
bool_t
xdr_pmap3_mapping (XDR *xdrs, pmap3_mapping *objp)
{
if (!xdr_u_int (xdrs, &objp->prog))
return FALSE;
if (!xdr_u_int (xdrs, &objp->vers))
return FALSE;
if (!xdr_string (xdrs, &objp->netid, ~0))
return FALSE;
if (!xdr_string (xdrs, &objp->addr, ~0))
return FALSE;
if (!xdr_string (xdrs, &objp->owner, ~0))
return FALSE;
return TRUE;
}
bool_t
xdr_pmap3_mapping_list (XDR *xdrs, pmap3_mapping_list *objp)
{
if (!xdr_pmap3_mapping (xdrs, &objp->map))
return FALSE;
if (!xdr_pointer (xdrs, (char **)&objp->next, sizeof (pmap3_mapping_list), (xdrproc_t) xdr_pmap3_mapping_list))
return FALSE;
return TRUE;
}
bool_t
xdr_pmap3_dump_result (XDR *xdrs, pmap3_dump_result *objp)
{
if (!xdr_pointer (xdrs, (char **)&objp->list, sizeof (pmap3_mapping_list), (xdrproc_t) xdr_pmap3_mapping_list))
return FALSE;
return TRUE;
}
/*
 * Serialize or deserialize a pmap3_call_args: prog, vers and proc as
 * unsigned ints, followed by the opaque argument bytes (no length limit).
 *
 * Every x_op value takes the same element-by-element path, so no
 * per-operation branching is needed (the inline "buf" fast path sat behind
 * "if (1) ... else" and was never reachable).
 *
 * Returns TRUE on success, FALSE as soon as one field fails.
 */
bool_t
xdr_pmap3_call_args (XDR *xdrs, pmap3_call_args *objp)
{
    if (!xdr_u_int (xdrs, &objp->prog))
        return FALSE;
    if (!xdr_u_int (xdrs, &objp->vers))
        return FALSE;
    if (!xdr_u_int (xdrs, &objp->proc))
        return FALSE;
    if (!xdr_bytes(xdrs, &objp->args, ~0))
        return FALSE;
    return TRUE;
}
bool_t
xdr_pmap3_call_result (XDR *xdrs, pmap3_call_result *objp)
{
if (!xdr_u_int (xdrs, &objp->port))
return FALSE;
if (!xdr_bytes(xdrs, &objp->res, ~0))
return FALSE;
return TRUE;
}
bool_t
xdr_pmap3_netbuf (XDR *xdrs, pmap3_netbuf *objp)
{
if (!xdr_u_int (xdrs, &objp->maxlen))
return FALSE;
if (!xdr_bytes(xdrs, &objp->buf, ~0))
return FALSE;
return TRUE;
}
bool_t
xdr_PMAP2SETargs (XDR *xdrs, PMAP2SETargs *objp)
{
if (!xdr_pmap2_mapping (xdrs, objp))
return FALSE;
return TRUE;
}
bool_t
xdr_PMAP2UNSETargs (XDR *xdrs, PMAP2UNSETargs *objp)
{
if (!xdr_pmap2_mapping (xdrs, objp))
return FALSE;
return TRUE;
}
bool_t
xdr_PMAP2GETPORTargs (XDR *xdrs, PMAP2GETPORTargs *objp)
{
if (!xdr_pmap2_mapping (xdrs, objp))
return FALSE;
return TRUE;
}
bool_t
xdr_PMAP2CALLITargs (XDR *xdrs, PMAP2CALLITargs *objp)
{
if (!xdr_pmap2_call_args (xdrs, objp))
return FALSE;
return TRUE;
}
bool_t
xdr_PMAP2CALLITres (XDR *xdrs, PMAP2CALLITres *objp)
{
if (!xdr_pmap2_call_result (xdrs, objp))
return FALSE;
return TRUE;
}
bool_t
xdr_PMAP2DUMPres (XDR *xdrs, PMAP2DUMPres *objp)
{
if (!xdr_pmap2_dump_result (xdrs, objp))
return FALSE;
return TRUE;
}
bool_t
xdr_PMAP3SETargs (XDR *xdrs, PMAP3SETargs *objp)
{
if (!xdr_pmap3_mapping (xdrs, objp))
return FALSE;
return TRUE;
}
bool_t
xdr_PMAP3UNSETargs (XDR *xdrs, PMAP3UNSETargs *objp)
{
if (!xdr_pmap3_mapping (xdrs, objp))
return FALSE;
return TRUE;
}
bool_t
xdr_PMAP3GETADDRargs (XDR *xdrs, PMAP3GETADDRargs *objp)
{
if (!xdr_pmap3_mapping (xdrs, objp))
return FALSE;
return TRUE;
}
bool_t
xdr_PMAP3GETADDRres (XDR *xdrs, PMAP3GETADDRres *objp)
{
if (!xdr_pmap3_string_result (xdrs, objp))
return FALSE;
return TRUE;
}
bool_t
xdr_PMAP3DUMPres (XDR *xdrs, PMAP3DUMPres *objp)
{
if (!xdr_pmap3_dump_result (xdrs, objp))
return FALSE;
return TRUE;
}
bool_t
xdr_PMAP3CALLITargs (XDR *xdrs, PMAP3CALLITargs *objp)
{
if (!xdr_pmap3_call_result (xdrs, objp))
return FALSE;
return TRUE;
}
bool_t
xdr_PMAP3CALLITres (XDR *xdrs, PMAP3CALLITres *objp)
{
if (!xdr_pmap3_call_result (xdrs, objp))
return FALSE;
return TRUE;
}
bool_t
xdr_PMAP3UADDR2TADDRres (XDR *xdrs, PMAP3UADDR2TADDRres *objp)
{
if (!xdr_pmap3_netbuf (xdrs, objp))
return FALSE;
return TRUE;
}
bool_t
xdr_PMAP3TADDR2UADDRargs (XDR *xdrs, PMAP3TADDR2UADDRargs *objp)
{
if (!xdr_pmap3_netbuf (xdrs, objp))
return FALSE;
return TRUE;
}
bool_t
xdr_PMAP3TADDR2UADDRres (XDR *xdrs, PMAP3TADDR2UADDRres *objp)
{
if (!xdr_pmap3_string_result (xdrs, objp))
return FALSE;
return TRUE;
}

View File

@@ -1,160 +0,0 @@
/*
* Please do not edit this file.
* It was generated using rpcgen.
*/
#ifndef _RPC_H_RPCGEN
#define _RPC_H_RPCGEN
#include "xdr_impl.h"
#ifdef __cplusplus
extern "C" {
#endif
#define RPC_MSG_VERSION 2
enum rpc_auth_flavor {
RPC_AUTH_NONE = 0,
RPC_AUTH_SYS = 1,
RPC_AUTH_SHORT = 2,
RPC_AUTH_DH = 3,
RPC_RPCSEC_GSS = 6,
};
typedef enum rpc_auth_flavor rpc_auth_flavor;
enum rpc_msg_type {
RPC_CALL = 0,
RPC_REPLY = 1,
};
typedef enum rpc_msg_type rpc_msg_type;
enum rpc_reply_stat {
RPC_MSG_ACCEPTED = 0,
RPC_MSG_DENIED = 1,
};
typedef enum rpc_reply_stat rpc_reply_stat;
enum rpc_accept_stat {
RPC_SUCCESS = 0,
RPC_PROG_UNAVAIL = 1,
RPC_PROG_MISMATCH = 2,
RPC_PROC_UNAVAIL = 3,
RPC_GARBAGE_ARGS = 4,
RPC_SYSTEM_ERR = 5,
};
typedef enum rpc_accept_stat rpc_accept_stat;
enum rpc_reject_stat {
RPC_MISMATCH = 0,
RPC_AUTH_ERROR = 1,
};
typedef enum rpc_reject_stat rpc_reject_stat;
enum rpc_auth_stat {
RPC_AUTH_OK = 0,
RPC_AUTH_BADCRED = 1,
RPC_AUTH_REJECTEDCRED = 2,
RPC_AUTH_BADVERF = 3,
RPC_AUTH_REJECTEDVERF = 4,
RPC_AUTH_TOOWEAK = 5,
RPC_AUTH_INVALIDRESP = 6,
RPC_AUTH_FAILED = 7,
};
typedef enum rpc_auth_stat rpc_auth_stat;
struct rpc_opaque_auth {
rpc_auth_flavor flavor;
xdr_string_t body;
};
typedef struct rpc_opaque_auth rpc_opaque_auth;
struct rpc_call_body {
u_int rpcvers;
u_int prog;
u_int vers;
u_int proc;
rpc_opaque_auth cred;
rpc_opaque_auth verf;
};
typedef struct rpc_call_body rpc_call_body;
struct rpc_mismatch_info {
u_int min_version;
u_int max_version;
};
typedef struct rpc_mismatch_info rpc_mismatch_info;
struct rpc_accepted_reply_body {
rpc_accept_stat stat;
union {
rpc_mismatch_info mismatch_info;
};
};
typedef struct rpc_accepted_reply_body rpc_accepted_reply_body;
struct rpc_accepted_reply {
rpc_opaque_auth verf;
rpc_accepted_reply_body reply_data;
};
typedef struct rpc_accepted_reply rpc_accepted_reply;
struct rpc_rejected_reply {
rpc_reject_stat stat;
union {
rpc_mismatch_info mismatch_info;
rpc_auth_stat auth_stat;
};
};
typedef struct rpc_rejected_reply rpc_rejected_reply;
struct rpc_reply_body {
rpc_reply_stat stat;
union {
rpc_accepted_reply areply;
rpc_rejected_reply rreply;
};
};
typedef struct rpc_reply_body rpc_reply_body;
struct rpc_msg_body {
rpc_msg_type dir;
union {
rpc_call_body cbody;
rpc_reply_body rbody;
};
};
typedef struct rpc_msg_body rpc_msg_body;
struct rpc_msg {
u_int xid;
rpc_msg_body body;
};
typedef struct rpc_msg rpc_msg;
/* the xdr functions */
extern bool_t xdr_rpc_auth_flavor (XDR *, rpc_auth_flavor*);
extern bool_t xdr_rpc_msg_type (XDR *, rpc_msg_type*);
extern bool_t xdr_rpc_reply_stat (XDR *, rpc_reply_stat*);
extern bool_t xdr_rpc_accept_stat (XDR *, rpc_accept_stat*);
extern bool_t xdr_rpc_reject_stat (XDR *, rpc_reject_stat*);
extern bool_t xdr_rpc_auth_stat (XDR *, rpc_auth_stat*);
extern bool_t xdr_rpc_opaque_auth (XDR *, rpc_opaque_auth*);
extern bool_t xdr_rpc_call_body (XDR *, rpc_call_body*);
extern bool_t xdr_rpc_mismatch_info (XDR *, rpc_mismatch_info*);
extern bool_t xdr_rpc_accepted_reply_body (XDR *, rpc_accepted_reply_body*);
extern bool_t xdr_rpc_accepted_reply (XDR *, rpc_accepted_reply*);
extern bool_t xdr_rpc_rejected_reply (XDR *, rpc_rejected_reply*);
extern bool_t xdr_rpc_reply_body (XDR *, rpc_reply_body*);
extern bool_t xdr_rpc_msg_body (XDR *, rpc_msg_body*);
extern bool_t xdr_rpc_msg (XDR *, rpc_msg*);
#ifdef __cplusplus
}
#endif
#endif /* !_RPC_H_RPCGEN */

View File

@@ -1,113 +0,0 @@
/* Based on RFC 5531 - RPC: Remote Procedure Call Protocol Specification Version 2 */
const RPC_MSG_VERSION = 2;
enum rpc_auth_flavor {
RPC_AUTH_NONE = 0,
RPC_AUTH_SYS = 1,
RPC_AUTH_SHORT = 2,
RPC_AUTH_DH = 3,
RPC_RPCSEC_GSS = 6
};
enum rpc_msg_type {
RPC_CALL = 0,
RPC_REPLY = 1
};
enum rpc_reply_stat {
RPC_MSG_ACCEPTED = 0,
RPC_MSG_DENIED = 1
};
enum rpc_accept_stat {
RPC_SUCCESS = 0,
RPC_PROG_UNAVAIL = 1,
RPC_PROG_MISMATCH = 2,
RPC_PROC_UNAVAIL = 3,
RPC_GARBAGE_ARGS = 4,
RPC_SYSTEM_ERR = 5
};
enum rpc_reject_stat {
RPC_MISMATCH = 0,
RPC_AUTH_ERROR = 1
};
enum rpc_auth_stat {
RPC_AUTH_OK = 0,
/*
* failed at remote end
*/
RPC_AUTH_BADCRED = 1, /* bogus credentials (seal broken) */
RPC_AUTH_REJECTEDCRED = 2, /* client should begin new session */
RPC_AUTH_BADVERF = 3, /* bogus verifier (seal broken) */
RPC_AUTH_REJECTEDVERF = 4, /* verifier expired or was replayed */
RPC_AUTH_TOOWEAK = 5, /* rejected due to security reasons */
/*
* failed locally
*/
RPC_AUTH_INVALIDRESP = 6, /* bogus response verifier */
RPC_AUTH_FAILED = 7 /* some unknown reason */
};
struct rpc_opaque_auth {
rpc_auth_flavor flavor;
opaque body<400>;
};
struct rpc_call_body {
u_int rpcvers;
u_int prog;
u_int vers;
u_int proc;
rpc_opaque_auth cred;
rpc_opaque_auth verf;
/* procedure-specific parameters start here */
};
struct rpc_mismatch_info {
unsigned int min_version;
unsigned int max_version;
};
union rpc_accepted_reply_body switch (rpc_accept_stat stat) {
case RPC_SUCCESS:
void;
/* procedure-specific results start here */
case RPC_PROG_MISMATCH:
rpc_mismatch_info mismatch_info;
default:
void;
};
struct rpc_accepted_reply {
rpc_opaque_auth verf;
rpc_accepted_reply_body reply_data;
};
union rpc_rejected_reply switch (rpc_reject_stat stat) {
case RPC_MISMATCH:
rpc_mismatch_info mismatch_info;
case RPC_AUTH_ERROR:
rpc_auth_stat auth_stat;
};
union rpc_reply_body switch (rpc_reply_stat stat) {
case RPC_MSG_ACCEPTED:
rpc_accepted_reply areply;
case RPC_MSG_DENIED:
rpc_rejected_reply rreply;
};
union rpc_msg_body switch (rpc_msg_type dir) {
case RPC_CALL:
rpc_call_body cbody;
case RPC_REPLY:
rpc_reply_body rbody;
};
struct rpc_msg {
u_int xid;
rpc_msg_body body;
};

View File

@@ -1,43 +0,0 @@
#pragma once
#include "rpc.h"
struct rpc_op_t;
// Handler should return 1 if the request is processed asynchronously
// and requires the incoming message to not be freed until processing ends,
// 0 otherwise.
typedef int (*rpc_handler_t)(void *opaque, rpc_op_t *rop);
// Describes one RPC procedure: its (prog, vers, proc) identity, the handler
// to call for it and the XDR routines/sizes associated with its request and reply.
struct rpc_service_proc_t
{
// RPC program number, program version and procedure number
// (same field names as in rpc_call_body)
uint32_t prog;
uint32_t vers;
uint32_t proc;
// Callback invoked for this procedure (see rpc_handler_t)
rpc_handler_t handler_fn;
// XDR routine for the request structure;
// req_size is presumably sizeof() of that structure - TODO confirm
xdrproc_t req_fn;
uint32_t req_size;
// XDR routine for the reply structure;
// resp_size is presumably sizeof() of that structure - TODO confirm
xdrproc_t resp_fn;
uint32_t resp_size;
// User pointer; presumably passed to handler_fn as its first argument - verify against the dispatcher
void *opaque;
};
// Orders service procedures lexicographically by (prog, vers, proc).
// The callback, XDR routines, sizes and opaque pointer do not take part in the comparison.
inline bool operator < (const rpc_service_proc_t & a, const rpc_service_proc_t & b)
{
    if (a.prog != b.prog)
        return a.prog < b.prog;
    if (a.vers != b.vers)
        return a.vers < b.vers;
    return a.proc < b.proc;
}
// State of one RPC request; this is what procedure handlers (rpc_handler_t)
// and rpc_queue_reply() receive.
struct rpc_op_t
{
// presumably the connection/client object the request arrived on - TODO confirm
void *client;
// presumably the raw incoming message buffer - TODO confirm
uint8_t *buffer;
// XDR context associated with this operation
XDR *xdrs;
// RPC message headers; going by the names, in_msg is the call and out_msg the reply
rpc_msg in_msg, out_msg;
// Procedure-specific request and reply structures (untyped at this level)
void *request;
void *reply;
// XDR routine for the reply structure - presumably used to encode <reply>; verify against rpc_queue_reply()
xdrproc_t reply_fn;
// NOTE(review): meaning not visible in this header - looks like the record-marking word of the reply; confirm
uint32_t reply_marker;
// NOTE(review): likely set when a handler returns 1 (asynchronous processing) so that
// the incoming message is not freed early - confirm against the dispatcher
bool referenced;
};
void rpc_queue_reply(rpc_op_t *rop);

View File

@@ -1,253 +0,0 @@
/*
* Please do not edit this file.
* It was generated using rpcgen.
*/
#include "rpc.h"
#include "xdr_impl_inline.h"
bool_t
xdr_rpc_auth_flavor (XDR *xdrs, rpc_auth_flavor *objp)
{
if (!xdr_enum (xdrs, (enum_t *) objp))
return FALSE;
return TRUE;
}
bool_t
xdr_rpc_msg_type (XDR *xdrs, rpc_msg_type *objp)
{
if (!xdr_enum (xdrs, (enum_t *) objp))
return FALSE;
return TRUE;
}
bool_t
xdr_rpc_reply_stat (XDR *xdrs, rpc_reply_stat *objp)
{
if (!xdr_enum (xdrs, (enum_t *) objp))
return FALSE;
return TRUE;
}
bool_t
xdr_rpc_accept_stat (XDR *xdrs, rpc_accept_stat *objp)
{
if (!xdr_enum (xdrs, (enum_t *) objp))
return FALSE;
return TRUE;
}
bool_t
xdr_rpc_reject_stat (XDR *xdrs, rpc_reject_stat *objp)
{
if (!xdr_enum (xdrs, (enum_t *) objp))
return FALSE;
return TRUE;
}
bool_t
xdr_rpc_auth_stat (XDR *xdrs, rpc_auth_stat *objp)
{
if (!xdr_enum (xdrs, (enum_t *) objp))
return FALSE;
return TRUE;
}
bool_t
xdr_rpc_opaque_auth (XDR *xdrs, rpc_opaque_auth *objp)
{
if (!xdr_rpc_auth_flavor (xdrs, &objp->flavor))
return FALSE;
if (!xdr_bytes(xdrs, &objp->body, 400))
return FALSE;
return TRUE;
}
/*
 * Serialize or deserialize an rpc_call_body: rpcvers, prog, vers and proc as
 * unsigned ints, followed by the credential and verifier (rpc_opaque_auth).
 *
 * Every x_op value takes the same element-by-element path, so no
 * per-operation branching is needed (the inline "buf" fast path sat behind
 * "if (1) ... else" and was never reachable).
 *
 * Returns TRUE on success, FALSE as soon as one field fails.
 */
bool_t
xdr_rpc_call_body (XDR *xdrs, rpc_call_body *objp)
{
    if (!xdr_u_int (xdrs, &objp->rpcvers))
        return FALSE;
    if (!xdr_u_int (xdrs, &objp->prog))
        return FALSE;
    if (!xdr_u_int (xdrs, &objp->vers))
        return FALSE;
    if (!xdr_u_int (xdrs, &objp->proc))
        return FALSE;
    if (!xdr_rpc_opaque_auth (xdrs, &objp->cred))
        return FALSE;
    if (!xdr_rpc_opaque_auth (xdrs, &objp->verf))
        return FALSE;
    return TRUE;
}
bool_t
xdr_rpc_mismatch_info (XDR *xdrs, rpc_mismatch_info *objp)
{
if (!xdr_u_int (xdrs, &objp->min_version))
return FALSE;
if (!xdr_u_int (xdrs, &objp->max_version))
return FALSE;
return TRUE;
}
bool_t
xdr_rpc_accepted_reply_body (XDR *xdrs, rpc_accepted_reply_body *objp)
{
if (!xdr_rpc_accept_stat (xdrs, &objp->stat))
return FALSE;
switch (objp->stat) {
case RPC_SUCCESS:
break;
case RPC_PROG_MISMATCH:
if (!xdr_rpc_mismatch_info (xdrs, &objp->mismatch_info))
return FALSE;
break;
default:
break;
}
return TRUE;
}
bool_t
xdr_rpc_accepted_reply (XDR *xdrs, rpc_accepted_reply *objp)
{
if (!xdr_rpc_opaque_auth (xdrs, &objp->verf))
return FALSE;
if (!xdr_rpc_accepted_reply_body (xdrs, &objp->reply_data))
return FALSE;
return TRUE;
}
bool_t
xdr_rpc_rejected_reply (XDR *xdrs, rpc_rejected_reply *objp)
{
if (!xdr_rpc_reject_stat (xdrs, &objp->stat))
return FALSE;
switch (objp->stat) {
case RPC_MISMATCH:
if (!xdr_rpc_mismatch_info (xdrs, &objp->mismatch_info))
return FALSE;
break;
case RPC_AUTH_ERROR:
if (!xdr_rpc_auth_stat (xdrs, &objp->auth_stat))
return FALSE;
break;
default:
return FALSE;
}
return TRUE;
}
bool_t
xdr_rpc_reply_body (XDR *xdrs, rpc_reply_body *objp)
{
if (!xdr_rpc_reply_stat (xdrs, &objp->stat))
return FALSE;
switch (objp->stat) {
case RPC_MSG_ACCEPTED:
if (!xdr_rpc_accepted_reply (xdrs, &objp->areply))
return FALSE;
break;
case RPC_MSG_DENIED:
if (!xdr_rpc_rejected_reply (xdrs, &objp->rreply))
return FALSE;
break;
default:
return FALSE;
}
return TRUE;
}
bool_t
xdr_rpc_msg_body (XDR *xdrs, rpc_msg_body *objp)
{
if (!xdr_rpc_msg_type (xdrs, &objp->dir))
return FALSE;
switch (objp->dir) {
case RPC_CALL:
if (!xdr_rpc_call_body (xdrs, &objp->cbody))
return FALSE;
break;
case RPC_REPLY:
if (!xdr_rpc_reply_body (xdrs, &objp->rbody))
return FALSE;
break;
default:
return FALSE;
}
return TRUE;
}
bool_t
xdr_rpc_msg (XDR *xdrs, rpc_msg *objp)
{
if (!xdr_u_int (xdrs, &objp->xid))
return FALSE;
if (!xdr_rpc_msg_body (xdrs, &objp->body))
return FALSE;
return TRUE;
}

View File

@@ -1,48 +0,0 @@
#!/bin/bash

# Regenerates <name>.h and <name>_xdr.cpp from <name>.x for the nfs, rpc and
# portmap protocol descriptions. The .x files are looked up in, and the
# results written to, the current directory.

# Stop on the first failing command. pipefail is required so that a failing
# rpcgen aborts the script instead of being masked by the exit status of perl.
set -e -o pipefail

# 1) remove all extern non-xdr functions (service, client)
# 2) use xdr_string_t for strings instead of char*
# 3) remove K&R #ifdefs
# 4) remove register int32_t* buf
# 5) remove union names
# 6) use xdr_string_t for opaques instead of u_int + char*
# 7) TODO: generate normal procedure stubs

# Usage: run_rpcgen <name>
run_rpcgen() {
    # Named copy of the argument: inside the perl programs below, $1 is a
    # perl capture group, not this shell parameter
    local name="$1"

    # Header: slurp the whole rpcgen output and patch it as one string
    rpcgen -h "$name.x" | \
    perl -e '
        { local $/ = undef; $_ = <>; }
        s/^extern(?!.*"C"|.*bool_t xdr.*XDR).*\n//gm;
        s/#include <rpc\/rpc.h>/#include "xdr_impl.h"/;
        s/^typedef char \*/typedef xdr_string_t /gm;
        s/^(\s*)char \*(?!.*_val)/$1xdr_string_t /gm;
        # remove union names
        s/ \w+_u;/;/gs;
        # use xdr_string_t for opaques
        s/struct\s*\{\s*u_int\s+\w+_len;\s*char\s+\*\w+_val;\s*\}\s*/xdr_string_t /gs;
        # remove stdc/k&r
        s/^#if.*__STDC__.*//gm;
        s/\n#else[^\n]*K&R.*?\n#endif[^\n]*K&R[^\n]*//gs;
        print;' > "$name.h"

    # XDR routines: patched line by line (perl -p)
    rpcgen -c "$name.x" | \
    perl -pe '
        s/register int32_t \*buf;\s*//g;
        s/\bbuf\s*=[^;]+;\s*//g;
        s/\bbuf\s*==\s*NULL/1/g;
        # remove union names
        s/(\.|->)\w+_u\./$1/g;
        # use xdr_string_t for opaques
        # xdr_bytes(xdrs, (char**)&objp->data.data_val, (u_int*)&objp->data.data_len, 400)
        # -> xdr_bytes(xdrs, &objp->data, 400)
        # xdr_bytes(xdrs, (char**)&objp->data_val, (u_int*)&objp->data_len, 400)
        # -> xdr_bytes(xdrs, objp, 400)
        s/xdr_bytes\s*\(\s*xdrs,\s*\(\s*char\s*\*\*\s*\)\s*([^()]+?)\.\w+_val\s*,\s*\(\s*u_int\s*\*\s*\)\s*\1\.\w+_len,/xdr_bytes(xdrs, $1,/gs;
        s/xdr_bytes\s*\(\s*xdrs,\s*\(\s*char\s*\*\*\s*\)\s*&\s*([^()]+?)->\w+_val\s*,\s*\(\s*u_int\s*\*\s*\)\s*&\s*\1->\w+_len,/xdr_bytes(xdrs, $1,/gs;
        # add include
        if (/#include/) { $_ .= "#include \"xdr_impl_inline.h\"\n"; }' > "${name}_xdr.cpp"
}

run_rpcgen nfs
run_rpcgen rpc
run_rpcgen portmap

View File

@@ -1,107 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
//
// Efficient XDR implementation almost compatible with rpcgen (see run-rpcgen.sh)
#include "xdr_impl_inline.h"
// Allocates a new XDR context. Release it with xdr_destroy().
XDR* xdr_create()
{
    XDR *xdrs = new XDR;
    return xdrs;
}
// Releases everything the context owns (via xdr_reset) and then deletes
// the context itself. <xdrs> must not be used afterwards.
void xdr_destroy(XDR* xdrs)
{
xdr_reset(xdrs);
delete xdrs;
}
// Returns the context to an empty state so that it can be reused:
// frees every buffer registered through xdr_add_malloc() and clears the
// decode input window and the accumulated encode output.
// The XDR object itself stays allocated.
void xdr_reset(XDR *xdrs)
{
    // Release buffers owned by the context
    for (auto owned_ptr: xdrs->allocs)
        free(owned_ptr);
    xdrs->allocs.resize(0);
    // Forget the decode input window set by xdr_decode()
    xdrs->buf = NULL;
    xdrs->avail = 0;
    // Clear the remaining bookkeeping, including encoded output and its segment list
    xdrs->in_linked_list.resize(0);
    xdrs->cur_out.resize(0);
    xdrs->last_end = 0;
    xdrs->buf_list.resize(0);
}
// Decodes <size> bytes starting at <buf> into <data> using the XDR routine <fn>.
// Switches the context to XDR_DECODE mode and points it at the input buffer
// (the buffer is referenced, not copied).
// Returns whatever <fn> returns.
int xdr_decode(XDR *xdrs, void *buf, unsigned size, xdrproc_t fn, void *data)
{
    xdrs->x_op = XDR_DECODE;
    xdrs->avail = size;
    xdrs->buf = static_cast<uint8_t*>(buf);
    int result = fn(xdrs, data);
    return result;
}
// Encodes <data> with the XDR routine <fn> after switching the context to
// XDR_ENCODE mode. Returns whatever <fn> returns.
// Use xdr_encode_finish() to obtain the encoded bytes.
int xdr_encode(XDR *xdrs, xdrproc_t fn, void *data)
{
    xdrs->x_op = XDR_ENCODE;
    int result = fn(xdrs, data);
    return result;
}
// Finalizes encoding and exposes the result as an iovec array.
//
// Segments in buf_list with a NULL iov_base are placeholders for consecutive
// slices of cur_out; here they are resolved to real pointers.
// On return *iov_list / *iov_count describe the segment array, which points
// into storage owned by <xdrs>.
void xdr_encode_finish(XDR *xdrs, iovec **iov_list, unsigned *iov_count)
{
    // Bytes appended to cur_out after the last recorded segment
    // become one more placeholder segment
    auto total = xdrs->cur_out.size();
    if (xdrs->last_end < total)
    {
        iovec tail;
        tail.iov_base = NULL;
        tail.iov_len = total - xdrs->last_end;
        xdrs->buf_list.push_back(tail);
        xdrs->last_end = total;
    }
    // Walk the segments in order, handing out slices of cur_out to placeholders
    uint8_t *next_slice = xdrs->cur_out.data();
    for (size_t i = 0; i < xdrs->buf_list.size(); i++)
    {
        iovec & seg = xdrs->buf_list[i];
        if (seg.iov_base)
            continue;
        seg.iov_base = next_slice;
        next_slice += seg.iov_len;
    }
    *iov_count = xdrs->buf_list.size();
    *iov_list = xdrs->buf_list.data();
}
// Debug helper: prints all encoded segments to stdout as one continuous
// lowercase hex string followed by a newline.
// Call it after xdr_encode_finish(): before that, segments that live in
// cur_out still have a NULL iov_base.
void xdr_dump_encoded(XDR *xdrs)
{
    for (const auto & buf: xdrs->buf_list)
    {
        const uint8_t *bytes = (const uint8_t*)buf.iov_base;
        // iov_len is a size_t, so index with size_t: no signed/unsigned
        // comparison and no int overflow on very large segments
        for (size_t i = 0; i < buf.iov_len; i++)
            printf("%02x", bytes[i]);
    }
    printf("\n");
}
// Hands ownership of a malloc'ed buffer to the context: xdr_reset()
// (and therefore xdr_destroy()) will free() it.
void xdr_add_malloc(XDR *xdrs, void *buf)
{
xdrs->allocs.push_back(buf);
}
// Copies <len> bytes from <str> into a freshly allocated, NUL-terminated
// buffer and registers it with <xdrs>, so it is freed on xdr_reset() /
// xdr_destroy(). The returned xdr_string_t carries <len> (terminator not
// counted) and the pointer to the copy.
// This is the single implementation; the other overloads delegate to it.
xdr_string_t xdr_copy_string(XDR *xdrs, const char *str, size_t len)
{
    char *cp = (char*)malloc_or_die(len+1);
    memcpy(cp, str, len);
    cp[len] = 0;
    xdr_add_malloc(xdrs, cp);
    return (xdr_string_t){ len, cp };
}

// Same for a std::string; the copy is size-based, so embedded NUL bytes are kept.
xdr_string_t xdr_copy_string(XDR *xdrs, const std::string & str)
{
    return xdr_copy_string(xdrs, str.data(), str.size());
}

// Same for a NUL-terminated C string (length taken with strlen).
xdr_string_t xdr_copy_string(XDR *xdrs, const char *str)
{
    return xdr_copy_string(xdrs, str, strlen(str));
}

Some files were not shown because too many files have changed in this diff Show More