Compare commits
74 Commits
aa79d1db1c
...
00eb5b3ba0
Author | SHA1 | Date |
---|---|---|
Vitaliy Filippov | 00eb5b3ba0 | |
Vitaliy Filippov | 88516ab4bd | |
Vitaliy Filippov | 6221126b4f | |
Vitaliy Filippov | 6783d4a13c | |
Vitaliy Filippov | dcbe1afac3 | |
Vitaliy Filippov | 0bde28c24a | |
Vitaliy Filippov | bb8ca6184e | |
Vitaliy Filippov | 87310ef7bb | |
Vitaliy Filippov | 4f4b2dab80 | |
Vitaliy Filippov | f70da82317 | |
Vitaliy Filippov | e42148f347 | |
Vitaliy Filippov | c289584469 | |
Vitaliy Filippov | 018e89f867 | |
Vitaliy Filippov | 603dc68f11 | |
Vitaliy Filippov | 7b12342933 | |
Vitaliy Filippov | 44bf0f16ee | |
Vitaliy Filippov | 8840c84572 | |
Vitaliy Filippov | 5b747c12ec | |
Vitaliy Filippov | 05f5f46162 | |
Vitaliy Filippov | b5604191c8 | |
Vitaliy Filippov | e871de27de | |
Vitaliy Filippov | f600ce98e2 | |
Vitaliy Filippov | 57605a5c13 | |
Vitaliy Filippov | 29bd4561bb | |
Vitaliy Filippov | 7142460ec8 | |
Vitaliy Filippov | d03f19ebe5 | |
Vitaliy Filippov | 88f9d18be3 | |
Vitaliy Filippov | 6213fbd8c6 | |
Vitaliy Filippov | 3aee37eadd | |
Vitaliy Filippov | ecfc753e93 | |
Vitaliy Filippov | a574f9ad71 | |
Vitaliy Filippov | 7c235c9103 | |
Vitaliy Filippov | e5bb986164 | |
Vitaliy Filippov | 181795d748 | |
Vitaliy Filippov | 8cdc38805b | |
Vitaliy Filippov | 0cd455d17f | |
Vitaliy Filippov | 32ba653ba6 | |
Vitaliy Filippov | 231d4b15fc | |
Vitaliy Filippov | 9dc4d5fd7b | |
Vitaliy Filippov | e58538fa47 | |
Vitaliy Filippov | 11ac9e7024 | |
Vitaliy Filippov | 511bc3df1c | |
Vitaliy Filippov | a64f0d1f73 | |
Vitaliy Filippov | ec5f7c6b87 | |
Vitaliy Filippov | 3ebed9a749 | |
Vitaliy Filippov | eab67a6e8f | |
Vitaliy Filippov | 20993d9b7a | |
Vitaliy Filippov | 5cf9b343c0 | |
Vitaliy Filippov | 79ae0aadcd | |
Vitaliy Filippov | 605afc3583 | |
Vitaliy Filippov | c0681d8242 | |
Vitaliy Filippov | 763e77b4f4 | |
Vitaliy Filippov | 19426aa4c5 | |
Vitaliy Filippov | 08f586bcec | |
Vitaliy Filippov | f1cd87473a | |
Vitaliy Filippov | 1bd8d2da56 | |
Vitaliy Filippov | a7396d2baf | |
Vitaliy Filippov | e98a38810d | |
Vitaliy Filippov | 28c4324c36 | |
Vitaliy Filippov | 31ec3fa8f5 | |
Vitaliy Filippov | e4fa26f60a | |
Vitaliy Filippov | 59ae27f9e5 | |
Vitaliy Filippov | 2c6a301d9b | |
Vitaliy Filippov | 01558349f8 | |
Vitaliy Filippov | 36f4717d0d | |
Vitaliy Filippov | babaf2a0ce | |
Vitaliy Filippov | 5773f1a375 | |
Vitaliy Filippov | 57222a9f79 | |
Vitaliy Filippov | 61ef000c6e | |
Vitaliy Filippov | 7d5e1cc393 | |
Vitaliy Filippov | 5e7f27a02d | |
Vitaliy Filippov | fd1d8a8520 | |
Vitaliy Filippov | c364e14c40 | |
Vitaliy Filippov | 3ebbfa0428 |
|
@ -22,7 +22,7 @@ RUN apt-get update
|
|||
RUN apt-get -y install etcd qemu-system-x86 qemu-block-extra qemu-utils fio libasan5 \
|
||||
liburing1 liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev
|
||||
RUN apt-get -y build-dep fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
|
||||
RUN apt-get -y install jq lp-solve sudo
|
||||
RUN apt-get -y install jq lp-solve sudo nfs-common
|
||||
RUN apt-get --download-only source fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
|
||||
|
||||
RUN set -ex; \
|
||||
|
|
|
@ -856,3 +856,21 @@ jobs:
|
|||
echo ""
|
||||
done
|
||||
|
||||
test_nfs:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
container: ${{env.TEST_IMAGE}}:${{github.sha}}
|
||||
steps:
|
||||
- name: Run test
|
||||
id: test
|
||||
timeout-minutes: 3
|
||||
run: /root/vitastor/tests/test_nfs.sh
|
||||
- name: Print logs
|
||||
if: always() && steps.test.outcome == 'failure'
|
||||
run: |
|
||||
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
|
||||
echo "-------- $i --------"
|
||||
cat $i
|
||||
echo ""
|
||||
done
|
||||
|
||||
|
|
|
@ -6,8 +6,8 @@
|
|||
|
||||
Вернём былую скорость кластерному блочному хранилищу!
|
||||
|
||||
Vitastor - распределённая блочная SDS (программная СХД), прямой аналог Ceph RBD и
|
||||
внутренних СХД популярных облачных провайдеров. Однако, в отличие от них, Vitastor
|
||||
Vitastor - распределённая блочная и файловая SDS (программная СХД), прямой аналог Ceph RBD и CephFS,
|
||||
а также внутренних СХД популярных облачных провайдеров. Однако, в отличие от них, Vitastor
|
||||
быстрый и при этом простой. Только пока маленький :-).
|
||||
|
||||
Vitastor архитектурно похож на Ceph, что означает атомарность и строгую консистентность,
|
||||
|
@ -63,7 +63,7 @@ Vitastor поддерживает QEMU-драйвер, протоколы NBD и
|
|||
- [fio](docs/usage/fio.ru.md) для тестов производительности
|
||||
- [NBD](docs/usage/nbd.ru.md) для монтирования ядром
|
||||
- [QEMU и qemu-img](docs/usage/qemu.ru.md)
|
||||
- [NFS](docs/usage/nfs.ru.md)-прокси для VMWare и подобных
|
||||
- [NFS](docs/usage/nfs.ru.md) кластерная файловая система и псевдо-ФС прокси
|
||||
- Производительность
|
||||
- [Понимание сути производительности](docs/performance/understanding.ru.md)
|
||||
- [Теоретический максимум](docs/performance/theoretical.ru.md)
|
||||
|
|
|
@ -6,9 +6,9 @@
|
|||
|
||||
Make Clustered Block Storage Fast Again.
|
||||
|
||||
Vitastor is a distributed block SDS, direct replacement of Ceph RBD and internal SDS's
|
||||
of public clouds. However, in contrast to them, Vitastor is fast and simple at the same time.
|
||||
The only thing is it's slightly young :-).
|
||||
Vitastor is a distributed block and file SDS, direct replacement of Ceph RBD and CephFS,
|
||||
and also internal SDS's of public clouds. However, in contrast to them, Vitastor is fast
|
||||
and simple at the same time. The only thing is it's slightly young :-).
|
||||
|
||||
Vitastor is architecturally similar to Ceph which means strong consistency,
|
||||
primary-replication, symmetric clustering and automatic data distribution over any
|
||||
|
@ -63,7 +63,7 @@ Read more details below in the documentation.
|
|||
- [fio](docs/usage/fio.en.md) for benchmarks
|
||||
- [NBD](docs/usage/nbd.en.md) for kernel mounts
|
||||
- [QEMU and qemu-img](docs/usage/qemu.en.md)
|
||||
- [NFS](docs/usage/nfs.en.md) emulator for VMWare and similar
|
||||
- [NFS](docs/usage/nfs.en.md) clustered file system and pseudo-FS proxy
|
||||
- Performance
|
||||
- [Understanding storage performance](docs/performance/understanding.en.md)
|
||||
- [Theoretical performance](docs/performance/theoretical.en.md)
|
||||
|
|
|
@ -41,6 +41,7 @@ Parameters:
|
|||
- [osd_tags](#osd_tags)
|
||||
- [primary_affinity_tags](#primary_affinity_tags)
|
||||
- [scrub_interval](#scrub_interval)
|
||||
- [used_for_fs](#used_for_fs)
|
||||
|
||||
Examples:
|
||||
|
||||
|
@ -299,6 +300,25 @@ of the OSDs containing a data chunk for a PG.
|
|||
Automatic scrubbing interval for this pool. Overrides
|
||||
[global scrub_interval setting](osd.en.md#scrub_interval).
|
||||
|
||||
## used_for_fs
|
||||
|
||||
- Type: string
|
||||
|
||||
If non-empty, the pool is marked as used for VitastorFS with metadata stored
|
||||
in block image (regular Vitastor volume) named as the value of this pool parameter.
|
||||
|
||||
When a pool is marked as used for VitastorFS, regular block volume creation in it
|
||||
is disabled (vitastor-cli refuses to create images without --force) to protect
|
||||
the user from block volume and FS file ID collisions and data loss.
|
||||
|
||||
[vitastor-nfs](../usage/nfs.en.md), in its turn, refuses to use pools not marked
|
||||
for the corresponding FS when starting. This also implies that you can use one
|
||||
pool only for one VitastorFS.
|
||||
|
||||
The second thing that is disabled for VitastorFS pools is reporting per-inode space
|
||||
usage statistics in etcd because a FS pool may store a very large number of files
|
||||
and statistics for them all would take a lot of space in etcd.
|
||||
|
||||
# Examples
|
||||
|
||||
## Replicated pool
|
||||
|
|
|
@ -40,6 +40,7 @@
|
|||
- [osd_tags](#osd_tags)
|
||||
- [primary_affinity_tags](#primary_affinity_tags)
|
||||
- [scrub_interval](#scrub_interval)
|
||||
- [used_for_fs](#used_for_fs)
|
||||
|
||||
Примеры:
|
||||
|
||||
|
@ -306,6 +307,27 @@ OSD с "all".
|
|||
Интервал скраба, то есть, автоматической фоновой проверки данных для данного пула.
|
||||
Переопределяет [глобальную настройку scrub_interval](osd.ru.md#scrub_interval).
|
||||
|
||||
## used_for_fs
|
||||
|
||||
- Type: string
|
||||
|
||||
Если непусто, пул помечается как используемый для файловой системы VitastorFS с
|
||||
метаданными, хранимыми в блочном образе Vitastor с именем, равным значению
|
||||
этого параметра.
|
||||
|
||||
Когда пул помечается как используемый для VitastorFS, создание обычных блочных
|
||||
образов в нём отключается (vitastor-cli отказывается создавать образы без --force),
|
||||
чтобы защитить пользователя от коллизий ID файлов и блочных образов и, таким
|
||||
образом, от потери данных.
|
||||
|
||||
[vitastor-nfs](../usage/nfs.ru.md), в свою очередь, при запуске отказывается
|
||||
использовать для ФС пулы, не выделенные для неё. Это также означает, что один
|
||||
пул может использоваться только для одной VitastorFS.
|
||||
|
||||
Также для ФС-пулов отключается передача статистики в etcd по отдельным инодам,
|
||||
так как ФС-пул может содержать очень много файлов и статистика по ним всем
|
||||
заняла бы очень много места в etcd.
|
||||
|
||||
# Примеры
|
||||
|
||||
## Реплицированный пул
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
|
||||
- Basic part: highly-available block storage with symmetric clustering and no SPOF
|
||||
- [Performance](../performance/comparison1.en.md) ;-D
|
||||
- [Cluster file system](../usage/nfs.en.md#vitastorfs)
|
||||
- [Multiple redundancy schemes](../config/pool.en.md#scheme): Replication, XOR n+1, Reed-Solomon erasure codes
|
||||
based on jerasure and ISA-L libraries with any number of data and parity drives in a group
|
||||
- Configuration via simple JSON data structures in etcd (parameters, pools and images)
|
||||
|
@ -46,13 +47,12 @@
|
|||
- [CSI plugin for Kubernetes](../installation/kubernetes.en.md)
|
||||
- [OpenStack support: Cinder driver, Nova and libvirt patches](../installation/openstack.en.md)
|
||||
- [Proxmox storage plugin and packages](../installation/proxmox.en.md)
|
||||
- [Simplified NFS proxy for file-based image access emulation (suitable for VMWare)](../usage/nfs.en.md)
|
||||
- [Simplified NFS proxy for file-based image access emulation (suitable for VMWare)](../usage/nfs.en.md#pseudo-fs)
|
||||
|
||||
## Roadmap
|
||||
|
||||
The following features are planned for the future:
|
||||
|
||||
- File system
|
||||
- Control plane optimisation
|
||||
- Other administrative tools
|
||||
- Web GUI
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
|
||||
- Базовая часть - надёжное кластерное блочное хранилище без единой точки отказа
|
||||
- [Производительность](../performance/comparison1.ru.md) ;-D
|
||||
- [Кластерная файловая система](../usage/nfs.ru.md#vitastorfs)
|
||||
- [Несколько схем отказоустойчивости](../config/pool.ru.md#scheme): репликация, XOR n+1 (1 диск чётности), коды коррекции ошибок
|
||||
Рида-Соломона на основе библиотек jerasure и ISA-L с любым числом дисков данных и чётности в группе
|
||||
- Конфигурация через простые человекочитаемые JSON-структуры в etcd
|
||||
|
@ -48,11 +49,10 @@
|
|||
- [CSI-плагин для Kubernetes](../installation/kubernetes.ru.md)
|
||||
- [Базовая поддержка OpenStack: драйвер Cinder, патчи для Nova и libvirt](../installation/openstack.ru.md)
|
||||
- [Плагин для Proxmox](../installation/proxmox.ru.md)
|
||||
- [Упрощённая NFS-прокси для эмуляции файлового доступа к образам (подходит для VMWare)](../usage/nfs.ru.md)
|
||||
- [Упрощённая NFS-прокси для эмуляции файлового доступа к образам (подходит для VMWare)](../usage/nfs.ru.md#псевдо-фс)
|
||||
|
||||
## Планы развития
|
||||
|
||||
- Файловая система
|
||||
- Оптимизация слоя управления
|
||||
- Другие инструменты администрирования
|
||||
- Web-интерфейс
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
- [Check cluster status](#check-cluster-status)
|
||||
- [Create an image](#create-an-image)
|
||||
- [Install plugins](#install-plugins)
|
||||
- [Create VitastorFS](#create-vitastorfs)
|
||||
|
||||
## Preparation
|
||||
|
||||
|
@ -114,3 +115,9 @@ After that, you can [run benchmarks](../usage/fio.en.md) or [start QEMU manually
|
|||
- [Proxmox](../installation/proxmox.en.md)
|
||||
- [OpenStack](../installation/openstack.en.md)
|
||||
- [Kubernetes CSI](../installation/kubernetes.en.md)
|
||||
|
||||
## Create VitastorFS
|
||||
|
||||
If you want to use clustered file system in addition to VM or container images:
|
||||
|
||||
- [Follow the instructions here](../usage/nfs.en.md#vitastorfs)
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
- [Проверьте состояние кластера](#проверьте-состояние-кластера)
|
||||
- [Создайте образ](#создайте-образ)
|
||||
- [Установите плагины](#установите-плагины)
|
||||
- [Создайте VitastorFS](#создайте-vitastorfs)
|
||||
|
||||
## Подготовка
|
||||
|
||||
|
@ -116,3 +117,10 @@ vitastor-cli create -s 10G testimg
|
|||
- [Proxmox](../installation/proxmox.ru.md)
|
||||
- [OpenStack](../installation/openstack.ru.md)
|
||||
- [Kubernetes CSI](../installation/kubernetes.ru.md)
|
||||
|
||||
## Создайте VitastorFS
|
||||
|
||||
Если вы хотите использовать не только блочные образы виртуальных машин или контейнеров,
|
||||
а также кластерную файловую систему, то:
|
||||
|
||||
- [Следуйте инструкциям](../usage/nfs.ru.md#vitastorfs)
|
||||
|
|
|
@ -267,6 +267,7 @@ Optional parameters:
|
|||
| `--immediate_commit none` | Put pool only on OSDs with this or larger immediate_commit (none < small < all) |
|
||||
| `--primary_affinity_tags tags` | Prefer to put primary copies on OSDs with all specified tags |
|
||||
| `--scrub_interval <time>` | Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y |
|
||||
| `--used_for_fs <name>` | Mark pool as used for VitastorFS with metadata in image <name> |
|
||||
| `--pg_stripe_size <number>` | Increase object grouping stripe |
|
||||
| `--max_osd_combinations 10000` | Maximum number of random combinations for LP solver input |
|
||||
| `--wait` | Wait for the new pool to come online |
|
||||
|
@ -288,7 +289,7 @@ Modify an existing pool. Modifiable parameters:
|
|||
|
||||
```
|
||||
[-s|--pg_size <number>] [--pg_minsize <number>] [-n|--pg_count <count>]
|
||||
[--failure_domain <level>] [--root_node <node>] [--osd_tags <tags>]
|
||||
[--failure_domain <level>] [--root_node <node>] [--osd_tags <tags>] [--no_inode_stats 0|1]
|
||||
[--max_osd_combinations <number>] [--primary_affinity_tags <tags>] [--scrub_interval <time>]
|
||||
```
|
||||
|
||||
|
|
|
@ -4,42 +4,146 @@
|
|||
|
||||
[Читать на русском](nfs.ru.md)
|
||||
|
||||
# NFS
|
||||
# VitastorFS and pseudo-FS
|
||||
|
||||
Vitastor has a simplified NFS 3.0 proxy for file-based image access emulation. It's not
|
||||
suitable as a full-featured file system, at least because all file/image metadata is stored
|
||||
in etcd and kept in memory all the time - thus you can't put a lot of files in it.
|
||||
Vitastor has two file system implementations. Both can be used via `vitastor-nfs`.
|
||||
|
||||
However, NFS proxy is totally fine as a method to provide VM image access and allows to
|
||||
plug Vitastor into, for example, VMWare. It's important to note that for VMWare it's a much
|
||||
better access method than iSCSI, because with iSCSI we'd have to put all VM images into one
|
||||
Vitastor image exported as a LUN to VMWare and formatted with VMFS. VMWare doesn't use VMFS
|
||||
over NFS.
|
||||
Commands:
|
||||
- [mount](#mount)
|
||||
- [start](#start)
|
||||
|
||||
NFS proxy is stateless if you use immediate_commit=all mode (for SSD with capacitors or
|
||||
HDDs with disabled cache), so you can run multiple NFS proxies and use a network load
|
||||
balancer or any failover method you want to in that case.
|
||||
## Pseudo-FS
|
||||
|
||||
vitastor-nfs usage:
|
||||
Simplified pseudo-FS proxy is used for file-based image access emulation. It's not
|
||||
suitable as a full-featured file system: it lacks a lot of FS features, it stores
|
||||
all file/image metadata in memory and in etcd. So it's fine for hundreds or thousands
|
||||
of large files/images, but not for millions.
|
||||
|
||||
Pseudo-FS proxy is intended for environments where other block volume access methods
|
||||
can't be used or impose additional restrictions - for example, VMWare. NFS is better
|
||||
for VMWare than, for example, iSCSI, because with iSCSI, VMWare puts all VM images
|
||||
into one large shared block image in its own VMFS file system, and with NFS, VMWare
|
||||
doesn't use VMFS and puts each VM disk in a regular file which is equal to one
|
||||
Vitastor block image, just as originally intended.
|
||||
|
||||
To use Vitastor pseudo-FS locally, run `vitastor-nfs mount --block /mnt/vita`.
|
||||
|
||||
Also you can start the network server:
|
||||
|
||||
```
|
||||
vitastor-nfs [STANDARD OPTIONS] [OTHER OPTIONS]
|
||||
|
||||
--subdir <DIR> export images prefixed <DIR>/ (default empty - export all images)
|
||||
--portmap 0 do not listen on port 111 (portmap/rpcbind, requires root)
|
||||
--bind <IP> bind service to <IP> address (default 0.0.0.0)
|
||||
--nfspath <PATH> set NFS export path to <PATH> (default is /)
|
||||
--port <PORT> use port <PORT> for NFS services (default is 2049)
|
||||
--pool <POOL> use <POOL> as default pool for new files (images)
|
||||
--foreground 1 stay in foreground, do not daemonize
|
||||
vitastor-nfs start --block --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool
|
||||
```
|
||||
|
||||
Example start and mount commands (etcd_address is optional):
|
||||
To mount the FS exported by this server, run:
|
||||
|
||||
```
|
||||
vitastor-nfs --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool
|
||||
mount server:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
```
|
||||
|
||||
```
|
||||
mount localhost:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
```
|
||||
## VitastorFS
|
||||
|
||||
VitastorFS is a full-featured clustered (Read-Write-Many) file system. It supports most POSIX
|
||||
features like hierarchical organization, symbolic links, hard links, quick renames and so on.
|
||||
|
||||
VitastorFS metadata is stored in a Parallel Optimistic B-Tree key-value database,
|
||||
implemented over a regular Vitastor block volume. Directory entries and inodes
|
||||
are stored in a simple human-readable JSON format in the B-Tree. `vitastor-kv` tool
|
||||
can be used to inspect the database.
|
||||
|
||||
To use VitastorFS:
|
||||
|
||||
1. Create a pool or choose an existing empty pool for FS data
|
||||
2. Create an image for FS metadata, preferably in a faster (SSD or replica-HDD) pool,
|
||||
but you can create it in the data pool too if you want (image size doesn't matter):
|
||||
`vitastor-cli create -s 10G -p fastpool testfs`
|
||||
3. Mark data pool as an FS pool: `vitastor-cli modify-pool --used-for-fs testfs data-pool`
|
||||
4. Either mount the FS: `vitastor-nfs mount --fs testfs --pool data-pool /mnt/vita`
|
||||
5. Or start the NFS server: `vitastor-nfs start --fs testfs --pool data-pool`
|
||||
|
||||
### Supported POSIX features
|
||||
|
||||
- Read-after-write semantics (read returns new data immediately after write)
|
||||
- Linear and random read and write
|
||||
- Writing outside current file size
|
||||
- Hierarchical structure, immediate rename of files and directories
|
||||
- File size change support (truncate)
|
||||
- Permissions (chmod/chown)
|
||||
- Flushing data to stable storage (if required) (fsync)
|
||||
- Symbolic links
|
||||
- Hard links
|
||||
- Special files (devices, sockets, named pipes)
|
||||
- File modification and attribute change time tracking (mtime and ctime)
|
||||
- Modification time (mtime) and last access time (atime) change support (utimes)
|
||||
- Correct handling of directory listing during file creation/deletion
|
||||
|
||||
### Limitations
|
||||
|
||||
POSIX features currently not implemented in VitastorFS:
|
||||
- File locking is not supported
|
||||
- Actually used space is not counted, so `du` always reports apparent file sizes
|
||||
instead of actually allocated space
|
||||
- Access times (`atime`) are not tracked (like `-o noatime`)
|
||||
- Modification time (`mtime`) is updated lazily every second (like `-o lazytime`)
|
||||
|
||||
Other notable missing features which should be addressed in the future:
|
||||
- Defragmentation of "shared" inodes. Files smaller than pool object size (block_size
|
||||
multiplied by data part count if pool is EC) are internally stored in large block
|
||||
volumes sequentially, one after another, and leave garbage after deleting or resizing.
|
||||
Defragmentator will be implemented to collect this garbage.
|
||||
- Inode ID reuse. Currently inode IDs always grow, the limit is 2^48 inodes, so
|
||||
in theory you may hit it if you create and delete a very large number of files
|
||||
- Compaction of the key-value B-Tree. Current implementation never merges or deletes
|
||||
B-Tree blocks, so B-Tree may become bloated over time. Currently you can
|
||||
use `vitastor-kv dumpjson` & `loadjson` commands to recreate the index in such
|
||||
situations.
|
||||
- Filesystem check tool. VitastorFS doesn't have journal because it would impose a
|
||||
severe performance hit, optimistic CAS-based transactions are used instead of it.
|
||||
So, again, in theory an abnormal shutdown of the FS server may leave some garbage
|
||||
in the DB. The FS is implemented in such a way that this garbage doesn't affect its
|
||||
function, but having a tool to clean it up still seems a right thing to do.
|
||||
|
||||
## Horizontal scaling
|
||||
|
||||
Linux NFS 3.0 client doesn't support built-in scaling or failover, i.e. you can't
|
||||
specify multiple server addresses when mounting the FS.
|
||||
|
||||
However, you can use any regular TCP load balancing over multiple NFS servers.
|
||||
It's absolutely safe with `immediate_commit=all` and `client_enable_writeback=false`
|
||||
settings, because Vitastor NFS proxy doesn't keep uncommitted data in memory
|
||||
with these settings. But it may even work without `immediate_commit=all` because
|
||||
the Linux NFS client repeats all uncommitted writes if it loses the connection.
|
||||
|
||||
## Commands
|
||||
|
||||
### mount
|
||||
|
||||
`vitastor-nfs (--fs <NAME> | --block) [-o <OPT>] mount <MOUNTPOINT>`
|
||||
|
||||
Start local filesystem server and mount file system to <MOUNTPOINT>.
|
||||
|
||||
Use regular `umount <MOUNTPOINT>` to unmount the FS.
|
||||
|
||||
The server will be automatically stopped when the FS is unmounted.
|
||||
|
||||
| `-o|--options <OPT>` | Pass additional NFS mount options (ex.: -o async). |
|
||||
|
||||
### start
|
||||
|
||||
`vitastor-nfs (--fs <NAME> | --block) start`
|
||||
|
||||
Start network NFS server. Options:
|
||||
|
||||
| `--bind <IP>` | bind service to <IP> address (default 0.0.0.0) |
|
||||
| `--port <PORT>` | use port <PORT> for NFS services (default is 2049) |
|
||||
| `--portmap 0` | do not listen on port 111 (portmap/rpcbind, requires root) |
|
||||
|
||||
## Common options
|
||||
|
||||
| `--fs <NAME>` | use VitastorFS with metadata in image <NAME> |
|
||||
| `--block` | use pseudo-FS presenting images as files |
|
||||
| `--pool <POOL>` | use <POOL> as default pool for new files |
|
||||
| `--subdir <DIR>` | export <DIR> instead of root directory |
|
||||
| `--nfspath <PATH>` | set NFS export path to <PATH> (default is /) |
|
||||
| `--pidfile <FILE>` | write process ID to the specified file |
|
||||
| `--logfile <FILE>` | log to the specified file |
|
||||
| `--foreground 1` | stay in foreground, do not daemonize |
|
||||
|
|
|
@ -4,41 +4,152 @@
|
|||
|
||||
[Read in English](nfs.en.md)
|
||||
|
||||
# NFS
|
||||
# VitastorFS и псевдо-ФС
|
||||
|
||||
В Vitastor реализована упрощённая NFS 3.0 прокси для эмуляции файлового доступа к образам.
|
||||
Это не полноценная файловая система, т.к. метаданные всех файлов (образов) сохраняются
|
||||
в etcd и всё время хранятся в оперативной памяти - то есть, положить туда много файлов
|
||||
не получится.
|
||||
В Vitastor есть две реализации файловой системы. Обе используются через `vitastor-nfs`.
|
||||
|
||||
Однако в качестве способа доступа к образам виртуальных машин NFS прокси прекрасно подходит
|
||||
и позволяет подключить Vitastor, например, к VMWare.
|
||||
Команды:
|
||||
- [mount](#mount)
|
||||
- [start](#start)
|
||||
|
||||
При этом, если вы используете режим immediate_commit=all (для SSD с конденсаторами или HDD
|
||||
с отключённым кэшем), то NFS-сервер не имеет состояния и вы можете свободно поднять
|
||||
его в нескольких экземплярах и использовать поверх них сетевой балансировщик нагрузки или
|
||||
схему с отказоустойчивостью.
|
||||
## Псевдо-ФС
|
||||
|
||||
Использование vitastor-nfs:
|
||||
Упрощённая реализация псевдо-ФС используется для эмуляции файлового доступа к блочным
|
||||
образам Vitastor. Это не полноценная файловая система - в ней отсутствуют многие функции
|
||||
POSIX ФС, а метаданные всех файлов (образов) сохраняются в etcd и всё время хранятся в
|
||||
оперативной памяти - то есть, псевдо-ФС подходит для сотен или тысяч файлов, но не миллионов.
|
||||
|
||||
Псевдо-ФС предназначена для доступа к образам виртуальных машин в средах, где другие
|
||||
способы невозможны или неудобны - например, в VMWare. Для VMWare это лучшая опция, чем
|
||||
iSCSI, так как при использовании iSCSI VMWare размещает все виртуальные машины в одном
|
||||
большом блочном образе внутри собственной ФС VMFS, а с NFS VMFS не используется и каждый
|
||||
диск ВМ представляется в виде одного файла, то есть, соответствует одному блочному образу
|
||||
Vitastor, как это и задумано изначально.
|
||||
|
||||
Чтобы подключить псевдо-ФС Vitastor, выполните команду `vitastor-nfs mount --block /mnt/vita`.
|
||||
|
||||
Либо же запустите сетевой вариант сервера:
|
||||
|
||||
```
|
||||
vitastor-nfs [СТАНДАРТНЫЕ ОПЦИИ] [ДРУГИЕ ОПЦИИ]
|
||||
|
||||
--subdir <DIR> экспортировать "поддиректорию" - образы с префиксом имени <DIR>/ (по умолчанию пусто - экспортировать все образы)
|
||||
--portmap 0 отключить сервис portmap/rpcbind на порту 111 (по умолчанию включён и требует root привилегий)
|
||||
--bind <IP> принимать соединения по адресу <IP> (по умолчанию 0.0.0.0 - на всех)
|
||||
--nfspath <PATH> установить путь NFS-экспорта в <PATH> (по умолчанию /)
|
||||
--port <PORT> использовать порт <PORT> для NFS-сервисов (по умолчанию 2049)
|
||||
--pool <POOL> использовать пул <POOL> для новых образов (обязательно, если пул в кластере не один)
|
||||
--foreground 1 не уходить в фон после запуска
|
||||
vitastor-nfs start --block --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool
|
||||
```
|
||||
|
||||
Пример монтирования Vitastor через NFS (etcd_address необязателен):
|
||||
Примонтировать ФС, запущенную с такими опциями, можно следующей командой:
|
||||
|
||||
```
|
||||
vitastor-nfs --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool
|
||||
mount server:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
```
|
||||
|
||||
```
|
||||
mount localhost:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
```
|
||||
## VitastorFS
|
||||
|
||||
VitastorFS - полноценная кластерная (Read-Write-Many) файловая система. Она поддерживает
|
||||
большую часть функций POSIX - иерархическую организацию, символические ссылки, жёсткие
|
||||
ссылки, быстрые переименования и так далее.
|
||||
|
||||
Метаданные VitastorFS хранятся в собственной реализации БД формата ключ-значения,
|
||||
основанной на Параллельном Оптимистичном Б-дереве поверх обычного блочного образа Vitastor.
|
||||
И записи каталогов, и иноды, как обычно в Vitastor, хранятся в простом человекочитаемом
|
||||
JSON-формате :-). Для инспекции содержимого БД можно использовать инструмент `vitastor-kv`.
|
||||
|
||||
Чтобы использовать VitastorFS:
|
||||
|
||||
1. Создайте пул для данных ФС или выберите существующий пустой пул
|
||||
2. Создайте блочный образ для метаданных ФС, желательно, в более быстром пуле (на SSD
|
||||
или по крайней мере на HDD, но без EC), но можно и в том же пуле, что данные
|
||||
(размер образа значения не имеет):
|
||||
`vitastor-cli create -s 10G -p fastpool testfs`
|
||||
3. Пометьте пул данных как ФС-пул: `vitastor-cli modify-pool --used-for-fs testfs data-pool`
|
||||
4. Либо примонтируйте ФС: `vitastor-nfs mount --fs testfs --pool data-pool /mnt/vita`
|
||||
5. Либо запустите сетевой NFS-сервер: `vitastor-nfs start --fs testfs --pool data-pool`
|
||||
|
||||
### Поддерживаемые функции POSIX
|
||||
|
||||
- Чтение актуальной версии данных сразу после записи
|
||||
- Последовательное и произвольное чтение и запись
|
||||
- Запись за пределами текущего размера файла
|
||||
- Иерархическая организация, мгновенное переименование файлов и каталогов
|
||||
- Изменение размера файла (truncate)
|
||||
- Права на файлы (chmod/chown)
|
||||
- Фиксация данных на диски (когда необходимо) (fsync)
|
||||
- Символические ссылки
|
||||
- Жёсткие ссылки
|
||||
- Специальные файлы (устройства, сокеты, каналы)
|
||||
- Отслеживание времён модификации (mtime), изменения атрибутов (ctime)
|
||||
- Ручное изменение времён модификации (mtime), последнего доступа (atime)
|
||||
- Корректная обработка изменений списка файлов во время листинга
|
||||
|
||||
### Ограничения
|
||||
|
||||
Отсутствующие на данный момент в VitastorFS функции POSIX:
|
||||
- Блокировки файлов не поддерживаются
|
||||
- Фактически занятое файлами место не подсчитывается и не возвращается вызовами
|
||||
stat(2), так что `du` всегда показывает сумму размеров файлов, а не фактически занятое место
|
||||
- Времена доступа (`atime`) не отслеживаются (как будто ФС смонтирована с `-o noatime`)
|
||||
- Времена модификации (`mtime`) отслеживаются асинхронно (как будто ФС смонтирована с `-o lazytime`)
|
||||
|
||||
Другие недостающие, которые нужно решить в будущем:
|
||||
- Дефрагментация "общих инодов". На уровне реализации ФС файлы, меньшие, чем размер
|
||||
объекта пула (block_size умножить на число частей данных, если пул EC),
|
||||
упаковываются друг за другом в большие "общие" иноды/тома. Если такие файлы удалять
|
||||
или увеличивать, они перемещаются и оставляют за собой "мусор", вот тут-то и нужен
|
||||
дефрагментатор.
|
||||
- Переиспользование номеров инодов. В текущей реализации номера инодов всё время
|
||||
увеличиваются, так что в теории вы можете упереться в лимит, если насоздаёте
|
||||
и наудаляете больше, чем 2^48 файлов.
|
||||
- Очистка места в Б-дереве метаданных. Текущая реализация никогда не сливает и не
|
||||
удаляет блоки Б-дерева, так что в теории дерево может разрастись и стать неоптимальным.
|
||||
Если вы столкнётесь с такой ситуацией сейчас, вы можете решить её с помощью
|
||||
команд `vitastor-kv dumpjson` и `loadjson` (т.е. пересоздав и загрузив обратно все метаданные ФС).
|
||||
- Инструмент проверки метаданных файловой системы. У VitastorFS нет журнала, так как
|
||||
журнал бы сильно замедлил реализацию, вместо него используются оптимистичные
|
||||
транзакции на основе CAS (сравнить-и-записать), и теоретически при нештатном
|
||||
завершении сервера ФС в БД также могут оставаться неконсистентные "мусорные"
|
||||
записи. ФС устроена так, что на работу они не влияют, но для порядка и их стоит
|
||||
уметь подчищать.
|
||||
|
||||
## Горизонтальное масштабирование
|
||||
|
||||
Клиент Linux NFS 3.0 не поддерживает встроенное масштабирование или отказоустойчивость.
|
||||
То есть, вы не можете задать несколько адресов серверов при монтировании ФС.
|
||||
|
||||
Однако вы можете использовать любые стандартные сетевые балансировщики нагрузки
|
||||
или схемы с отказоустойчивостью. Это точно безопасно при настройках `immediate_commit=all` и
|
||||
`client_enable_writeback=false`, так как с ними NFS-сервер Vitastor вообще не хранит
|
||||
в памяти ещё не зафиксированные на дисках данные; и вполне вероятно безопасно
|
||||
даже без `immediate_commit=all`, потому что NFS-клиент ядра Linux повторяет все
|
||||
незафиксированные запросы при потере соединения.
|
||||
|
||||
## Команды
|
||||
|
||||
### mount
|
||||
|
||||
`vitastor-nfs (--fs <NAME> | --block) mount [-o <OPT>] <MOUNTPOINT>`
|
||||
|
||||
Запустить локальный сервер и примонтировать ФС в директорию <MOUNTPOINT>.
|
||||
|
||||
Чтобы отмонтировать ФС, используйте обычную команду `umount <MOUNTPOINT>`.
|
||||
|
||||
Сервер автоматически останавливается при отмонтировании ФС.
|
||||
|
||||
| `-o|--options <OPT>` | Передать дополнительные опции монтирования NFS (пример: -o async). |
|
||||
|
||||
### start
|
||||
|
||||
`vitastor-nfs (--fs <NAME> | --block) start`
|
||||
|
||||
Запустить сетевой NFS-сервер. Опции:
|
||||
|
||||
| `--bind <IP>` | принимать соединения по адресу <IP> (по умолчанию 0.0.0.0 - на всех) |
|
||||
| `--port <PORT>` | использовать порт <PORT> для NFS-сервисов (по умолчанию 2049) |
|
||||
| `--portmap 0` | отключить сервис portmap/rpcbind на порту 111 (по умолчанию включён и требует root привилегий) |
|
||||
|
||||
## Общие опции
|
||||
|
||||
| `--fs <NAME>` | использовать VitastorFS с метаданными в образе <NAME> |
|
||||
| `--block` | использовать псевдо-ФС для доступа к блочным образам |
|
||||
| `--pool <POOL>` | использовать пул <POOL> для новых файлов (обязательно, если пул в кластере не один) |
|
||||
| `--subdir <DIR>` | экспортировать подкаталог <DIR>, а не корень ФС |
|
||||
| `--nfspath <PATH>` | установить путь NFS-экспорта в <PATH> (по умолчанию /) |
|
||||
| `--pidfile <FILE>` | записать ID процесса в заданный файл |
|
||||
| `--logfile <FILE>` | записывать логи в заданный файл |
|
||||
| `--foreground 1` | не уходить в фон после запуска |
|
||||
|
|
|
@ -37,7 +37,7 @@ const etcd_allow = new RegExp('^'+[
|
|||
'pg/history/[1-9]\\d*/[1-9]\\d*',
|
||||
'pool/stats/[1-9]\\d*',
|
||||
'history/last_clean_pgs',
|
||||
'inode/stats/[1-9]\\d*/[1-9]\\d*',
|
||||
'inode/stats/[1-9]\\d*/\\d+',
|
||||
'pool/stats/[1-9]\\d*',
|
||||
'stats',
|
||||
'index/image/.*',
|
||||
|
@ -1737,8 +1737,11 @@ class Mon
|
|||
for (const inode_num in this.state.osd.space[osd_num][pool_id])
|
||||
{
|
||||
const u = BigInt(this.state.osd.space[osd_num][pool_id][inode_num]||0);
|
||||
inode_stats[pool_id][inode_num] = inode_stats[pool_id][inode_num] || inode_stub();
|
||||
inode_stats[pool_id][inode_num].raw_used += u;
|
||||
if (inode_num)
|
||||
{
|
||||
inode_stats[pool_id][inode_num] = inode_stats[pool_id][inode_num] || inode_stub();
|
||||
inode_stats[pool_id][inode_num].raw_used += u;
|
||||
}
|
||||
this.state.pool.stats[pool_id].used_raw_tb += u;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -185,10 +185,48 @@ target_link_libraries(vitastor-nbd
|
|||
vitastor_client
|
||||
)
|
||||
|
||||
# libvitastor_kv.so
|
||||
add_library(vitastor_kv SHARED
|
||||
kv_db.cpp
|
||||
kv_db.h
|
||||
)
|
||||
target_link_libraries(vitastor_kv
|
||||
vitastor_client
|
||||
)
|
||||
set_target_properties(vitastor_kv PROPERTIES VERSION ${VERSION} SOVERSION 0)
|
||||
|
||||
# vitastor-kv
|
||||
add_executable(vitastor-kv
|
||||
kv_cli.cpp
|
||||
)
|
||||
target_link_libraries(vitastor-kv
|
||||
vitastor_kv
|
||||
)
|
||||
|
||||
add_executable(vitastor-kv-stress
|
||||
kv_stress.cpp
|
||||
)
|
||||
target_link_libraries(vitastor-kv-stress
|
||||
vitastor_kv
|
||||
)
|
||||
|
||||
# vitastor-nfs
|
||||
add_executable(vitastor-nfs
|
||||
nfs_proxy.cpp
|
||||
nfs_conn.cpp
|
||||
nfs_block.cpp
|
||||
nfs_kv.cpp
|
||||
nfs_kv_create.cpp
|
||||
nfs_kv_getattr.cpp
|
||||
nfs_kv_link.cpp
|
||||
nfs_kv_lookup.cpp
|
||||
nfs_kv_read.cpp
|
||||
nfs_kv_readdir.cpp
|
||||
nfs_kv_remove.cpp
|
||||
nfs_kv_rename.cpp
|
||||
nfs_kv_setattr.cpp
|
||||
nfs_kv_write.cpp
|
||||
nfs_fsstat.cpp
|
||||
nfs_mount.cpp
|
||||
nfs_portmap.cpp
|
||||
sha256.c
|
||||
nfs/xdr_impl.cpp
|
||||
|
@ -198,6 +236,7 @@ add_executable(vitastor-nfs
|
|||
)
|
||||
target_link_libraries(vitastor-nfs
|
||||
vitastor_client
|
||||
vitastor_kv
|
||||
)
|
||||
|
||||
# vitastor-cli
|
||||
|
|
|
@ -82,3 +82,8 @@ uint32_t blockstore_t::get_bitmap_granularity()
|
|||
{
|
||||
return impl->get_bitmap_granularity();
|
||||
}
|
||||
|
||||
void blockstore_t::set_no_inode_stats(const std::vector<uint64_t> & pool_ids)
|
||||
{
|
||||
impl->set_no_inode_stats(pool_ids);
|
||||
}
|
||||
|
|
|
@ -216,6 +216,9 @@ public:
|
|||
// Get per-inode space usage statistics
|
||||
std::map<uint64_t, uint64_t> & get_inode_space_stats();
|
||||
|
||||
// Set per-pool no_inode_stats
|
||||
void set_no_inode_stats(const std::vector<uint64_t> & pool_ids);
|
||||
|
||||
// Print diagnostics to stdout
|
||||
void dump_diagnostics();
|
||||
|
||||
|
|
|
@ -733,3 +733,86 @@ void blockstore_impl_t::disk_error_abort(const char *op, int retval, int expecte
|
|||
fprintf(stderr, "Disk %s failed: result is %d, expected %d. Can't continue, sorry :-(\n", op, retval, expected);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
void blockstore_impl_t::set_no_inode_stats(const std::vector<uint64_t> & pool_ids)
|
||||
{
|
||||
for (auto & np: no_inode_stats)
|
||||
{
|
||||
np.second = 2;
|
||||
}
|
||||
for (auto pool_id: pool_ids)
|
||||
{
|
||||
if (!no_inode_stats[pool_id])
|
||||
recalc_inode_space_stats(pool_id, false);
|
||||
no_inode_stats[pool_id] = 1;
|
||||
}
|
||||
for (auto np_it = no_inode_stats.begin(); np_it != no_inode_stats.end(); )
|
||||
{
|
||||
if (np_it->second == 2)
|
||||
{
|
||||
recalc_inode_space_stats(np_it->first, true);
|
||||
no_inode_stats.erase(np_it++);
|
||||
}
|
||||
else
|
||||
np_it++;
|
||||
}
|
||||
}
|
||||
|
||||
void blockstore_impl_t::recalc_inode_space_stats(uint64_t pool_id, bool per_inode)
|
||||
{
|
||||
auto sp_begin = inode_space_stats.lower_bound((pool_id << (64-POOL_ID_BITS)));
|
||||
auto sp_end = inode_space_stats.lower_bound(((pool_id+1) << (64-POOL_ID_BITS)));
|
||||
inode_space_stats.erase(sp_begin, sp_end);
|
||||
auto sh_it = clean_db_shards.lower_bound((pool_id << (64-POOL_ID_BITS)));
|
||||
while (sh_it != clean_db_shards.end() &&
|
||||
(sh_it->first >> (64-POOL_ID_BITS)) == pool_id)
|
||||
{
|
||||
for (auto & pair: sh_it->second)
|
||||
{
|
||||
uint64_t space_id = per_inode ? pair.first.inode : (pool_id << (64-POOL_ID_BITS));
|
||||
inode_space_stats[space_id] += dsk.data_block_size;
|
||||
}
|
||||
sh_it++;
|
||||
}
|
||||
object_id last_oid = {};
|
||||
bool last_exists = false;
|
||||
auto dirty_it = dirty_db.lower_bound((obj_ver_id){ .oid = { .inode = (pool_id << (64-POOL_ID_BITS)) } });
|
||||
while (dirty_it != dirty_db.end() && (dirty_it->first.oid.inode >> (64-POOL_ID_BITS)) == pool_id)
|
||||
{
|
||||
if (IS_STABLE(dirty_it->second.state) && (IS_BIG_WRITE(dirty_it->second.state) || IS_DELETE(dirty_it->second.state)))
|
||||
{
|
||||
bool exists = false;
|
||||
if (last_oid == dirty_it->first.oid)
|
||||
{
|
||||
exists = last_exists;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto & clean_db = clean_db_shard(dirty_it->first.oid);
|
||||
auto clean_it = clean_db.find(dirty_it->first.oid);
|
||||
exists = clean_it != clean_db.end();
|
||||
}
|
||||
uint64_t space_id = per_inode ? dirty_it->first.oid.inode : (pool_id << (64-POOL_ID_BITS));
|
||||
if (IS_BIG_WRITE(dirty_it->second.state))
|
||||
{
|
||||
if (!exists)
|
||||
inode_space_stats[space_id] += dsk.data_block_size;
|
||||
last_exists = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (exists)
|
||||
{
|
||||
auto & sp = inode_space_stats[space_id];
|
||||
if (sp > dsk.data_block_size)
|
||||
sp -= dsk.data_block_size;
|
||||
else
|
||||
inode_space_stats.erase(space_id);
|
||||
}
|
||||
last_exists = false;
|
||||
}
|
||||
last_oid = dirty_it->first.oid;
|
||||
}
|
||||
dirty_it++;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -272,6 +272,7 @@ class blockstore_impl_t
|
|||
|
||||
std::map<pool_id_t, pool_shard_settings_t> clean_db_settings;
|
||||
std::map<pool_pg_id_t, blockstore_clean_db_t> clean_db_shards;
|
||||
std::map<uint64_t, int> no_inode_stats;
|
||||
uint8_t *clean_bitmaps = NULL;
|
||||
blockstore_dirty_db_t dirty_db;
|
||||
std::vector<blockstore_op_t*> submit_queue;
|
||||
|
@ -318,6 +319,7 @@ class blockstore_impl_t
|
|||
|
||||
blockstore_clean_db_t& clean_db_shard(object_id oid);
|
||||
void reshard_clean_db(pool_id_t pool_id, uint32_t pg_count, uint32_t pg_stripe_size);
|
||||
void recalc_inode_space_stats(uint64_t pool_id, bool per_inode);
|
||||
|
||||
// Journaling
|
||||
void prepare_journal_sector_write(int sector, blockstore_op_t *op);
|
||||
|
@ -428,6 +430,9 @@ public:
|
|||
// Space usage statistics
|
||||
std::map<uint64_t, uint64_t> inode_space_stats;
|
||||
|
||||
// Set per-pool no_inode_stats
|
||||
void set_no_inode_stats(const std::vector<uint64_t> & pool_ids);
|
||||
|
||||
// Print diagnostics to stdout
|
||||
void dump_diagnostics();
|
||||
|
||||
|
|
|
@ -238,6 +238,7 @@ resume_2:
|
|||
data->iov = { bufs[i].buf, (size_t)bufs[i].size };
|
||||
data->callback = [this, i](ring_data_t *data) { handle_event(data, i); };
|
||||
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bufs[i].offset);
|
||||
bs->ringloop->submit();
|
||||
bufs[i].state = INIT_META_WRITING;
|
||||
submitted++;
|
||||
}
|
||||
|
|
|
@ -487,18 +487,24 @@ void blockstore_impl_t::mark_stable(obj_ver_id v, bool forget_dirty)
|
|||
}
|
||||
if (!exists)
|
||||
{
|
||||
inode_space_stats[dirty_it->first.oid.inode] += dsk.data_block_size;
|
||||
uint64_t space_id = dirty_it->first.oid.inode;
|
||||
if (no_inode_stats[dirty_it->first.oid.inode >> (64-POOL_ID_BITS)])
|
||||
space_id = space_id & ~(((uint64_t)1 << (64-POOL_ID_BITS)) - 1);
|
||||
inode_space_stats[space_id] += dsk.data_block_size;
|
||||
used_blocks++;
|
||||
}
|
||||
big_to_flush++;
|
||||
}
|
||||
else if (IS_DELETE(dirty_it->second.state))
|
||||
{
|
||||
auto & sp = inode_space_stats[dirty_it->first.oid.inode];
|
||||
uint64_t space_id = dirty_it->first.oid.inode;
|
||||
if (no_inode_stats[dirty_it->first.oid.inode >> (64-POOL_ID_BITS)])
|
||||
space_id = space_id & ~(((uint64_t)1 << (64-POOL_ID_BITS)) - 1);
|
||||
auto & sp = inode_space_stats[space_id];
|
||||
if (sp > dsk.data_block_size)
|
||||
sp -= dsk.data_block_size;
|
||||
else
|
||||
inode_space_stats.erase(dirty_it->first.oid.inode);
|
||||
inode_space_stats.erase(space_id);
|
||||
used_blocks--;
|
||||
big_to_flush++;
|
||||
}
|
||||
|
|
|
@ -131,6 +131,7 @@ static const char* help_text =
|
|||
" --immediate_commit none Put pool only on OSDs with this or larger immediate_commit (none < small < all)\n"
|
||||
" --primary_affinity_tags tags Prefer to put primary copies on OSDs with all specified tags\n"
|
||||
" --scrub_interval <time> Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y\n"
|
||||
" --used_for_fs <name> Mark pool as used for VitastorFS with metadata in image <name>\n"
|
||||
" --pg_stripe_size <number> Increase object grouping stripe\n"
|
||||
" --max_osd_combinations 10000 Maximum number of random combinations for LP solver input\n"
|
||||
" --wait Wait for the new pool to come online\n"
|
||||
|
@ -142,7 +143,7 @@ static const char* help_text =
|
|||
"vitastor-cli modify-pool|pool-modify <id|name> [--name <new_name>] [PARAMETERS...]\n"
|
||||
" Modify an existing pool. Modifiable parameters:\n"
|
||||
" [-s|--pg_size <number>] [--pg_minsize <number>] [-n|--pg_count <count>]\n"
|
||||
" [--failure_domain <level>] [--root_node <node>] [--osd_tags <tags>]\n"
|
||||
" [--failure_domain <level>] [--root_node <node>] [--osd_tags <tags>] [--used_for_fs <name>]\n"
|
||||
" [--max_osd_combinations <number>] [--primary_affinity_tags <tags>] [--scrub_interval <time>]\n"
|
||||
" Non-modifiable parameters (changing them WILL lead to data loss):\n"
|
||||
" [--block_size <size>] [--bitmap_granularity <size>]\n"
|
||||
|
@ -185,7 +186,6 @@ static json11::Json::object parse_args(int narg, const char *args[])
|
|||
for (int i = 1; i < narg; i++)
|
||||
{
|
||||
bool argHasValue = (!(i == narg-1) && (args[i+1][0] != '-'));
|
||||
|
||||
if (args[i][0] == '-' && args[i][1] == 'h' && args[i][2] == 0)
|
||||
{
|
||||
cfg["help"] = "1";
|
||||
|
|
|
@ -153,6 +153,7 @@ void cli_tool_t::loop_and_wait(std::function<bool(cli_result_t &)> loop_cb, std:
|
|||
ringloop->unregister_consumer(&looper->consumer);
|
||||
looper->loop_cb = NULL;
|
||||
looper->complete_cb(looper->result);
|
||||
ringloop->submit();
|
||||
delete looper;
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -27,6 +27,7 @@ struct image_creator_t
|
|||
std::string image_name, new_snap, new_parent;
|
||||
json11::Json new_meta;
|
||||
uint64_t size;
|
||||
bool force = false;
|
||||
bool force_size = false;
|
||||
|
||||
pool_id_t old_pool_id = 0;
|
||||
|
@ -45,6 +46,7 @@ struct image_creator_t
|
|||
|
||||
void loop()
|
||||
{
|
||||
auto & pools = parent->cli->st_cli.pool_config;
|
||||
if (state >= 1)
|
||||
goto resume_1;
|
||||
if (image_name == "")
|
||||
|
@ -62,7 +64,6 @@ struct image_creator_t
|
|||
}
|
||||
if (new_pool_id)
|
||||
{
|
||||
auto & pools = parent->cli->st_cli.pool_config;
|
||||
if (pools.find(new_pool_id) == pools.end())
|
||||
{
|
||||
result = (cli_result_t){ .err = ENOENT, .text = "Pool "+std::to_string(new_pool_id)+" does not exist" };
|
||||
|
@ -72,7 +73,7 @@ struct image_creator_t
|
|||
}
|
||||
else if (new_pool_name != "")
|
||||
{
|
||||
for (auto & ic: parent->cli->st_cli.pool_config)
|
||||
for (auto & ic: pools)
|
||||
{
|
||||
if (ic.second.name == new_pool_name)
|
||||
{
|
||||
|
@ -87,10 +88,20 @@ struct image_creator_t
|
|||
return;
|
||||
}
|
||||
}
|
||||
else if (parent->cli->st_cli.pool_config.size() == 1)
|
||||
else if (pools.size() == 1)
|
||||
{
|
||||
auto it = parent->cli->st_cli.pool_config.begin();
|
||||
new_pool_id = it->first;
|
||||
new_pool_id = pools.begin()->first;
|
||||
}
|
||||
if (new_pool_id && !pools.at(new_pool_id).used_for_fs.empty() && !force)
|
||||
{
|
||||
result = (cli_result_t){
|
||||
.err = EINVAL,
|
||||
.text = "Pool "+pools.at(new_pool_id).name+
|
||||
" is used for VitastorFS "+pools.at(new_pool_id).used_for_fs+
|
||||
". Use --force if you really know what you are doing",
|
||||
};
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
state = 1;
|
||||
resume_1:
|
||||
|
@ -532,6 +543,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_create(json11::Json cfg)
|
|||
image_creator->image_name = cfg["image"].string_value();
|
||||
image_creator->new_pool_id = cfg["pool"].uint64_value();
|
||||
image_creator->new_pool_name = cfg["pool"].string_value();
|
||||
image_creator->force = cfg["force"].bool_value();
|
||||
image_creator->force_size = cfg["force_size"].bool_value();
|
||||
if (cfg["image_meta"].is_object())
|
||||
{
|
||||
|
|
|
@ -82,7 +82,7 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
|
|||
value = value.uint64_value();
|
||||
}
|
||||
else if (key == "name" || key == "scheme" || key == "immediate_commit" ||
|
||||
key == "failure_domain" || key == "root_node" || key == "scrub_interval")
|
||||
key == "failure_domain" || key == "root_node" || key == "scrub_interval" || key == "used_for_fs")
|
||||
{
|
||||
// OK
|
||||
}
|
||||
|
@ -119,6 +119,10 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
|
|||
{
|
||||
new_cfg.erase("parity_chunks");
|
||||
}
|
||||
if (new_cfg.find("used_for_fs") != new_cfg.end() && new_cfg["used_for_fs"].string_value() == "")
|
||||
{
|
||||
new_cfg.erase("used_for_fs");
|
||||
}
|
||||
|
||||
// Prevent autovivification of object keys. Now we don't modify the config, we just check it
|
||||
json11::Json cfg = new_cfg;
|
||||
|
@ -248,7 +252,7 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
|
|||
// immediate_commit
|
||||
if (!cfg["immediate_commit"].is_null() && !etcd_state_client_t::parse_immediate_commit(cfg["immediate_commit"].string_value()))
|
||||
{
|
||||
return "immediate_commit must be one of \"all\", \"small\", or \"none\", but it is "+cfg["scrub_interval"].as_string();
|
||||
return "immediate_commit must be one of \"all\", \"small\", or \"none\", but it is "+cfg["immediate_commit"].as_string();
|
||||
}
|
||||
|
||||
// scrub_interval
|
||||
|
|
|
@ -536,6 +536,7 @@ resume_3:
|
|||
{ "name", "Name" },
|
||||
{ "id", "ID" },
|
||||
{ "scheme_name", "Scheme" },
|
||||
{ "used_for_fs", "Used for VitastorFS" },
|
||||
{ "status", "Status" },
|
||||
{ "pg_count_fmt", "PGs" },
|
||||
{ "pg_minsize", "PG minsize" },
|
||||
|
@ -547,6 +548,7 @@ resume_3:
|
|||
{ "bitmap_granularity_fmt", "Bitmap granularity" },
|
||||
{ "immediate_commit", "Immediate commit" },
|
||||
{ "scrub_interval", "Scrub interval" },
|
||||
{ "inode_stats_fmt", "Per-inode stats" },
|
||||
{ "pg_stripe_size", "PG stripe size" },
|
||||
{ "max_osd_combinations", "Max OSD combinations" },
|
||||
{ "total_fmt", "Total" },
|
||||
|
|
|
@ -112,6 +112,24 @@ resume_1:
|
|||
return;
|
||||
}
|
||||
|
||||
if (new_cfg.find("used_for_fs") != new_cfg.end() && !force)
|
||||
{
|
||||
// Check that pool doesn't have images
|
||||
auto img_it = parent->cli->st_cli.inode_config.lower_bound(INODE_WITH_POOL(pool_id, 0));
|
||||
if (img_it != parent->cli->st_cli.inode_config.end() && INODE_POOL(img_it->first) == pool_id &&
|
||||
img_it->second.name == new_cfg["used_for_fs"].string_value())
|
||||
{
|
||||
// Only allow metadata image to exist in the FS pool
|
||||
img_it++;
|
||||
}
|
||||
if (img_it != parent->cli->st_cli.inode_config.end() && INODE_POOL(img_it->first) == pool_id)
|
||||
{
|
||||
result = (cli_result_t){ .err = ENOENT, .text = "Pool "+pool_name+" has block images, delete them before using it for VitastorFS" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Update pool
|
||||
auto pls = kv.value.object_items();
|
||||
pls[std::to_string(pool_id)] = new_cfg;
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
#include "cluster_client_impl.h"
|
||||
#include "http_client.h" // json_is_true
|
||||
|
||||
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
|
||||
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json config)
|
||||
{
|
||||
wb = new writeback_cache_t();
|
||||
|
||||
|
@ -573,7 +573,7 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
|
|||
return;
|
||||
}
|
||||
if (op->opcode == OSD_OP_WRITE && enable_writeback && !(op->flags & OP_FLUSH_BUFFER) &&
|
||||
!op->version /* FIXME no CAS writeback */)
|
||||
!op->version /* no CAS writeback */)
|
||||
{
|
||||
if (wb->writebacks_active >= client_max_writeback_iodepth)
|
||||
{
|
||||
|
@ -595,7 +595,7 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
|
|||
}
|
||||
if (op->opcode == OSD_OP_WRITE && !(op->flags & OP_IMMEDIATE_COMMIT))
|
||||
{
|
||||
if (!(op->flags & OP_FLUSH_BUFFER))
|
||||
if (!(op->flags & OP_FLUSH_BUFFER) && !op->version /* no CAS write-repeat */)
|
||||
{
|
||||
wb->copy_write(op, CACHE_WRITTEN);
|
||||
}
|
||||
|
@ -673,7 +673,7 @@ bool cluster_client_t::check_rw(cluster_op_t *op)
|
|||
return false;
|
||||
}
|
||||
// Check alignment
|
||||
if (!op->len && (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP || op->opcode == OSD_OP_WRITE) ||
|
||||
if (!op->len && (op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP || op->opcode == OSD_OP_WRITE) ||
|
||||
op->offset % pool_it->second.bitmap_granularity || op->len % pool_it->second.bitmap_granularity)
|
||||
{
|
||||
op->retval = -EINVAL;
|
||||
|
@ -1174,7 +1174,6 @@ static inline void mem_or(void *res, const void *r2, unsigned int len)
|
|||
void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
||||
{
|
||||
cluster_op_t *op = part->parent;
|
||||
op->inflight_count--;
|
||||
int expected = part->op.req.hdr.opcode == OSD_OP_SYNC ? 0 : part->op.req.rw.len;
|
||||
if (part->op.reply.hdr.retval != expected)
|
||||
{
|
||||
|
@ -1197,7 +1196,7 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
|||
);
|
||||
}
|
||||
}
|
||||
else
|
||||
else if (log_level > 0)
|
||||
{
|
||||
fprintf(
|
||||
stderr, "%s operation failed on OSD %ju: retval=%jd (expected %d)\n",
|
||||
|
@ -1213,6 +1212,11 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
|||
op->retry_after = op->retval == -EIO ? client_eio_retry_interval : client_retry_interval;
|
||||
}
|
||||
reset_retry_timer(op->retry_after);
|
||||
if (stop_fd >= 0)
|
||||
{
|
||||
msgr.stop_client(stop_fd);
|
||||
}
|
||||
op->inflight_count--;
|
||||
if (op->inflight_count == 0)
|
||||
{
|
||||
if (op->opcode == OSD_OP_SYNC)
|
||||
|
@ -1220,14 +1224,11 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
|||
else
|
||||
continue_rw(op);
|
||||
}
|
||||
if (stop_fd >= 0)
|
||||
{
|
||||
msgr.stop_client(stop_fd);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// OK
|
||||
op->inflight_count--;
|
||||
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE) && !(op->flags & OP_IMMEDIATE_COMMIT))
|
||||
dirty_osds.insert(part->osd_num);
|
||||
part->flags |= PART_DONE;
|
||||
|
|
|
@ -123,7 +123,7 @@ public:
|
|||
json11::Json::object cli_config, file_config, etcd_global_config;
|
||||
json11::Json::object config;
|
||||
|
||||
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
|
||||
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json config);
|
||||
~cluster_client_t();
|
||||
void execute(cluster_op_t *op);
|
||||
void execute_raw(osd_num_t osd_num, osd_op_t *op);
|
||||
|
|
|
@ -16,11 +16,6 @@
|
|||
void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
|
||||
{
|
||||
std::string device = cfg["device"].string_value();
|
||||
if (device == "")
|
||||
{
|
||||
fprintf(stderr, "Device path is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
uint64_t data_block_size = parse_size(cfg["object_size"].string_value());
|
||||
uint64_t bitmap_granularity = parse_size(cfg["bitmap_granularity"].string_value());
|
||||
uint64_t journal_size = parse_size(cfg["journal_size"].string_value());
|
||||
|
@ -57,6 +52,11 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
|
|||
uint64_t orig_device_size = device_size;
|
||||
if (!device_size)
|
||||
{
|
||||
if (device == "")
|
||||
{
|
||||
fprintf(stderr, "Device path is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
struct stat st;
|
||||
if (stat(device.c_str(), &st) < 0)
|
||||
{
|
||||
|
|
|
@ -132,9 +132,6 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output);
|
|||
|
||||
uint64_t sscanf_json(const char *fmt, const json11::Json & str);
|
||||
void fromhexstr(const std::string & from, int bytes, uint8_t *to);
|
||||
std::string realpath_str(std::string path, bool nofail = true);
|
||||
std::string read_all_fd(int fd);
|
||||
std::string read_file(std::string file, bool allow_enoent = false);
|
||||
int disable_cache(std::string dev);
|
||||
std::string get_parent_device(std::string dev);
|
||||
bool json_is_true(const json11::Json & val);
|
||||
|
|
|
@ -42,36 +42,6 @@ void fromhexstr(const std::string & from, int bytes, uint8_t *to)
|
|||
}
|
||||
}
|
||||
|
||||
std::string realpath_str(std::string path, bool nofail)
|
||||
{
|
||||
char *p = realpath((char*)path.c_str(), NULL);
|
||||
if (!p)
|
||||
{
|
||||
fprintf(stderr, "Failed to resolve %s: %s\n", path.c_str(), strerror(errno));
|
||||
return nofail ? path : "";
|
||||
}
|
||||
std::string rp(p);
|
||||
free(p);
|
||||
return rp;
|
||||
}
|
||||
|
||||
std::string read_file(std::string file, bool allow_enoent)
|
||||
{
|
||||
std::string res;
|
||||
int fd = open(file.c_str(), O_RDONLY);
|
||||
if (fd < 0 || (res = read_all_fd(fd)) == "")
|
||||
{
|
||||
int err = errno;
|
||||
if (fd >= 0)
|
||||
close(fd);
|
||||
if (!allow_enoent || err != ENOENT)
|
||||
fprintf(stderr, "Can't read %s: %s\n", file.c_str(), strerror(err));
|
||||
return "";
|
||||
}
|
||||
close(fd);
|
||||
return res;
|
||||
}
|
||||
|
||||
// returns 1 = check error, 0 = write through, -1 = write back
|
||||
// (similar to 1 = warning, -1 = error, 0 = success in disable_cache)
|
||||
static int check_queue_cache(std::string dev, std::string parent_dev)
|
||||
|
|
|
@ -101,7 +101,7 @@ void epoll_manager_t::handle_uring_event()
|
|||
my_uring_prep_poll_add(sqe, epoll_fd, POLLIN);
|
||||
data->callback = [this](ring_data_t *data)
|
||||
{
|
||||
if (data->res < 0)
|
||||
if (data->res < 0 && data->res != -ECANCELED)
|
||||
{
|
||||
throw std::runtime_error(std::string("epoll failed: ") + strerror(-data->res));
|
||||
}
|
||||
|
|
|
@ -863,6 +863,8 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||
pc.scrub_interval = parse_time(pool_item.second["scrub_interval"].string_value());
|
||||
if (!pc.scrub_interval)
|
||||
pc.scrub_interval = 0;
|
||||
// Mark pool as VitastorFS pool (disable per-inode stats and block volume creation)
|
||||
pc.used_for_fs = pool_item.second["used_for_fs"].as_string();
|
||||
// Immediate Commit Mode
|
||||
pc.immediate_commit = pool_item.second["immediate_commit"].is_string()
|
||||
? parse_immediate_commit(pool_item.second["immediate_commit"].string_value())
|
||||
|
|
|
@ -60,6 +60,7 @@ struct pool_config_t
|
|||
uint64_t pg_stripe_size;
|
||||
std::map<pg_num_t, pg_config_t> pg_config;
|
||||
uint64_t scrub_interval;
|
||||
std::string used_for_fs;
|
||||
};
|
||||
|
||||
struct inode_config_t
|
||||
|
|
|
@ -0,0 +1,673 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// Vitastor shared key/value database test CLI
|
||||
|
||||
#define _XOPEN_SOURCE
|
||||
#include <limits.h>
|
||||
|
||||
#include <netinet/tcp.h>
|
||||
#include <sys/epoll.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
//#include <signal.h>
|
||||
|
||||
#include "epoll_manager.h"
|
||||
#include "str_util.h"
|
||||
#include "kv_db.h"
|
||||
|
||||
const char *exe_name = NULL;
|
||||
|
||||
class kv_cli_t
|
||||
{
|
||||
public:
|
||||
json11::Json::object cfg;
|
||||
std::vector<std::string> cli_cmd;
|
||||
|
||||
kv_dbw_t *db = NULL;
|
||||
ring_loop_t *ringloop = NULL;
|
||||
epoll_manager_t *epmgr = NULL;
|
||||
cluster_client_t *cli = NULL;
|
||||
int load_parallelism = 128;
|
||||
bool opened = false;
|
||||
bool interactive = false, is_file = false;
|
||||
int in_progress = 0;
|
||||
char *cur_cmd = NULL;
|
||||
int cur_cmd_size = 0, cur_cmd_alloc = 0;
|
||||
bool finished = false, eof = false;
|
||||
|
||||
std::function<void(int)> load_cb;
|
||||
bool loading_json = false, in_loadjson = false;
|
||||
int load_state = 0;
|
||||
std::string load_key;
|
||||
|
||||
~kv_cli_t();
|
||||
|
||||
void parse_args(int narg, const char *args[]);
|
||||
void run();
|
||||
void read_cmd();
|
||||
void next_cmd();
|
||||
std::vector<std::string> parse_cmd(const std::string & cmdstr);
|
||||
void handle_cmd(const std::vector<std::string> & cmd, std::function<void(int)> cb);
|
||||
void loadjson();
|
||||
};
|
||||
|
||||
kv_cli_t::~kv_cli_t()
|
||||
{
|
||||
if (cur_cmd)
|
||||
{
|
||||
free(cur_cmd);
|
||||
cur_cmd = NULL;
|
||||
}
|
||||
cur_cmd_alloc = 0;
|
||||
if (db)
|
||||
delete db;
|
||||
if (cli)
|
||||
{
|
||||
cli->flush();
|
||||
delete cli;
|
||||
}
|
||||
if (epmgr)
|
||||
delete epmgr;
|
||||
if (ringloop)
|
||||
delete ringloop;
|
||||
}
|
||||
|
||||
void kv_cli_t::parse_args(int narg, const char *args[])
|
||||
{
|
||||
bool db = false;
|
||||
for (int i = 1; i < narg; i++)
|
||||
{
|
||||
if (!strcmp(args[i], "-h") || !strcmp(args[i], "--help"))
|
||||
{
|
||||
printf(
|
||||
"Vitastor Key/Value CLI\n"
|
||||
"(c) Vitaliy Filippov, 2023+ (VNPL-1.1)\n"
|
||||
"\n"
|
||||
"USAGE: %s [OPTIONS] [<IMAGE> [<COMMAND>]]\n"
|
||||
"\n"
|
||||
"COMMANDS:\n"
|
||||
" get <key>\n"
|
||||
" set <key> <value>\n"
|
||||
" del <key>\n"
|
||||
" list [<start> [end]]\n"
|
||||
" dump [<start> [end]]\n"
|
||||
" dumpjson [<start> [end]]\n"
|
||||
" loadjson\n"
|
||||
"\n"
|
||||
"<IMAGE> should be the name of Vitastor image with the DB.\n"
|
||||
"Without <COMMAND>, you get an interactive DB shell.\n"
|
||||
"\n"
|
||||
"OPTIONS:\n"
|
||||
" --kv_block_size 4k\n"
|
||||
" Key-value B-Tree block size\n"
|
||||
" --kv_memory_limit 128M\n"
|
||||
" Maximum memory to use for vitastor-kv index cache\n"
|
||||
" --kv_allocate_blocks 4\n"
|
||||
" Number of PG blocks used for new tree block allocation in parallel\n"
|
||||
" --kv_evict_max_misses 10\n"
|
||||
" Eviction algorithm parameter: retry eviction from another random spot\n"
|
||||
" if this number of keys is used currently or was used recently\n"
|
||||
" --kv_evict_attempts_per_level 3\n"
|
||||
" Retry eviction at most this number of times per tree level, starting\n"
|
||||
" with bottom-most levels\n"
|
||||
" --kv_evict_unused_age 1000\n"
|
||||
" Evict only keys unused during this number of last operations\n"
|
||||
" --kv_log_level 1\n"
|
||||
" Log level. 0 = errors, 1 = warnings, 10 = trace operations\n"
|
||||
,
|
||||
exe_name
|
||||
);
|
||||
exit(0);
|
||||
}
|
||||
else if (args[i][0] == '-' && args[i][1] == '-')
|
||||
{
|
||||
const char *opt = args[i]+2;
|
||||
cfg[opt] = !strcmp(opt, "json") || i == narg-1 ? "1" : args[++i];
|
||||
}
|
||||
else if (!db)
|
||||
{
|
||||
cfg["db"] = args[i];
|
||||
db = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
cli_cmd.push_back(args[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void kv_cli_t::run()
|
||||
{
|
||||
// Create client
|
||||
ringloop = new ring_loop_t(512);
|
||||
epmgr = new epoll_manager_t(ringloop);
|
||||
cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
|
||||
db = new kv_dbw_t(cli);
|
||||
// Load image metadata
|
||||
while (!cli->is_ready())
|
||||
{
|
||||
ringloop->loop();
|
||||
if (cli->is_ready())
|
||||
break;
|
||||
ringloop->wait();
|
||||
}
|
||||
// Open if DB is set in options
|
||||
if (cfg.find("db") != cfg.end())
|
||||
{
|
||||
bool done = false;
|
||||
handle_cmd({ "open", cfg.at("db").string_value() }, [&done](int res) { if (res != 0) exit(1); done = true; });
|
||||
while (!done)
|
||||
{
|
||||
ringloop->loop();
|
||||
if (done)
|
||||
break;
|
||||
ringloop->wait();
|
||||
}
|
||||
}
|
||||
// Run single command from CLI
|
||||
if (cli_cmd.size())
|
||||
{
|
||||
bool done = false;
|
||||
handle_cmd(cli_cmd, [&done](int res) { if (res != 0) exit(1); done = true; });
|
||||
while (!done)
|
||||
{
|
||||
ringloop->loop();
|
||||
if (done)
|
||||
break;
|
||||
ringloop->wait();
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Run interactive shell
|
||||
fcntl(0, F_SETFL, fcntl(0, F_GETFL, 0) | O_NONBLOCK);
|
||||
try
|
||||
{
|
||||
epmgr->tfd->set_fd_handler(0, false, [this](int fd, int events)
|
||||
{
|
||||
if (events & EPOLLIN)
|
||||
{
|
||||
read_cmd();
|
||||
}
|
||||
if (events & EPOLLRDHUP)
|
||||
{
|
||||
epmgr->tfd->set_fd_handler(0, false, NULL);
|
||||
finished = true;
|
||||
}
|
||||
});
|
||||
interactive = isatty(0);
|
||||
if (interactive)
|
||||
printf("> ");
|
||||
}
|
||||
catch (std::exception & e)
|
||||
{
|
||||
// Can't add to epoll, STDIN is probably a file
|
||||
is_file = true;
|
||||
read_cmd();
|
||||
}
|
||||
while (!finished)
|
||||
{
|
||||
ringloop->loop();
|
||||
if (!finished)
|
||||
ringloop->wait();
|
||||
}
|
||||
}
|
||||
// Destroy the client
|
||||
delete db;
|
||||
db = NULL;
|
||||
cli->flush();
|
||||
delete cli;
|
||||
delete epmgr;
|
||||
delete ringloop;
|
||||
cli = NULL;
|
||||
epmgr = NULL;
|
||||
ringloop = NULL;
|
||||
}
|
||||
|
||||
void kv_cli_t::read_cmd()
|
||||
{
|
||||
if (!cur_cmd_alloc)
|
||||
{
|
||||
cur_cmd_alloc = 65536;
|
||||
cur_cmd = (char*)malloc_or_die(cur_cmd_alloc);
|
||||
}
|
||||
while (cur_cmd_size < cur_cmd_alloc)
|
||||
{
|
||||
int r = read(0, cur_cmd+cur_cmd_size, cur_cmd_alloc-cur_cmd_size);
|
||||
if (r < 0 && errno != EAGAIN)
|
||||
fprintf(stderr, "Error reading from stdin: %s\n", strerror(errno));
|
||||
if (r > 0)
|
||||
cur_cmd_size += r;
|
||||
if (r == 0)
|
||||
eof = true;
|
||||
if (r <= 0)
|
||||
break;
|
||||
}
|
||||
next_cmd();
|
||||
}
|
||||
|
||||
void kv_cli_t::next_cmd()
|
||||
{
|
||||
if (loading_json)
|
||||
{
|
||||
loadjson();
|
||||
return;
|
||||
}
|
||||
if (in_progress > 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
int pos = 0;
|
||||
for (; pos < cur_cmd_size; pos++)
|
||||
{
|
||||
if (cur_cmd[pos] == '\n' || cur_cmd[pos] == '\r')
|
||||
{
|
||||
auto cmd = trim(std::string(cur_cmd, pos));
|
||||
pos++;
|
||||
memmove(cur_cmd, cur_cmd+pos, cur_cmd_size-pos);
|
||||
cur_cmd_size -= pos;
|
||||
in_progress++;
|
||||
handle_cmd(parse_cmd(cmd), [this](int res)
|
||||
{
|
||||
in_progress--;
|
||||
if (interactive)
|
||||
printf("> ");
|
||||
next_cmd();
|
||||
if (!in_progress)
|
||||
read_cmd();
|
||||
});
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (eof && !in_progress)
|
||||
{
|
||||
finished = true;
|
||||
}
|
||||
}
|
||||
|
||||
struct kv_cli_list_t
|
||||
{
|
||||
kv_dbw_t *db = NULL;
|
||||
void *handle = NULL;
|
||||
int format = 0;
|
||||
int n = 0;
|
||||
std::function<void(int)> cb;
|
||||
};
|
||||
|
||||
std::vector<std::string> kv_cli_t::parse_cmd(const std::string & str)
|
||||
{
|
||||
std::vector<std::string> res;
|
||||
size_t pos = 0;
|
||||
auto cmd = scan_escaped(str, pos);
|
||||
if (cmd.empty())
|
||||
return res;
|
||||
res.push_back(cmd);
|
||||
int max_args = (cmd == "set" || cmd == "config" ||
|
||||
cmd == "list" || cmd == "dump" || cmd == "dumpjson" ? 3 :
|
||||
(cmd == "open" || cmd == "get" || cmd == "del" ? 2 : 1));
|
||||
while (pos < str.size() && res.size() < max_args)
|
||||
{
|
||||
if (res.size() == max_args-1)
|
||||
{
|
||||
// Allow unquoted last argument
|
||||
pos = str.find_first_not_of(" \t\r\n", pos);
|
||||
if (pos == std::string::npos)
|
||||
break;
|
||||
if (str[pos] != '"' && str[pos] != '\'')
|
||||
{
|
||||
res.push_back(trim(str.substr(pos)));
|
||||
break;
|
||||
}
|
||||
}
|
||||
auto arg = scan_escaped(str, pos);
|
||||
if (arg.size())
|
||||
res.push_back(arg);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
void kv_cli_t::loadjson()
|
||||
{
|
||||
// simple streaming json parser
|
||||
if (in_progress >= load_parallelism || in_loadjson)
|
||||
{
|
||||
return;
|
||||
}
|
||||
in_loadjson = true;
|
||||
if (load_state == 5)
|
||||
{
|
||||
st_5:
|
||||
if (!in_progress)
|
||||
{
|
||||
loading_json = false;
|
||||
auto cb = std::move(load_cb);
|
||||
cb(0);
|
||||
}
|
||||
in_loadjson = false;
|
||||
return;
|
||||
}
|
||||
do
|
||||
{
|
||||
read_cmd();
|
||||
size_t pos = 0;
|
||||
while (true)
|
||||
{
|
||||
while (pos < cur_cmd_size && is_white(cur_cmd[pos]))
|
||||
{
|
||||
pos++;
|
||||
}
|
||||
if (pos >= cur_cmd_size)
|
||||
{
|
||||
break;
|
||||
}
|
||||
if (load_state == 0 || load_state == 2)
|
||||
{
|
||||
char expected = "{ :"[load_state];
|
||||
if (cur_cmd[pos] != expected)
|
||||
{
|
||||
fprintf(stderr, "Unexpected %c, expected %c\n", cur_cmd[pos], expected);
|
||||
exit(1);
|
||||
}
|
||||
pos++;
|
||||
load_state++;
|
||||
}
|
||||
else if (load_state == 1 || load_state == 3)
|
||||
{
|
||||
if (cur_cmd[pos] != '"')
|
||||
{
|
||||
fprintf(stderr, "Unexpected %c, expected \"\n", cur_cmd[pos]);
|
||||
exit(1);
|
||||
}
|
||||
size_t prev = pos;
|
||||
auto str = scan_escaped(cur_cmd, cur_cmd_size, pos, false);
|
||||
if (pos == prev)
|
||||
{
|
||||
break;
|
||||
}
|
||||
load_state++;
|
||||
if (load_state == 2)
|
||||
{
|
||||
load_key = str;
|
||||
}
|
||||
else
|
||||
{
|
||||
in_progress++;
|
||||
handle_cmd({ "set", load_key, str }, [this](int res)
|
||||
{
|
||||
in_progress--;
|
||||
next_cmd();
|
||||
});
|
||||
if (in_progress >= load_parallelism)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (load_state == 4)
|
||||
{
|
||||
if (cur_cmd[pos] == ',')
|
||||
{
|
||||
pos++;
|
||||
load_state = 1;
|
||||
}
|
||||
else if (cur_cmd[pos] == '}')
|
||||
{
|
||||
pos++;
|
||||
load_state = 5;
|
||||
goto st_5;
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "Unexpected %c, expected , or }\n", cur_cmd[pos]);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (pos < cur_cmd_size)
|
||||
{
|
||||
memmove(cur_cmd, cur_cmd+pos, cur_cmd_size-pos);
|
||||
}
|
||||
cur_cmd_size -= pos;
|
||||
} while (loading_json && is_file);
|
||||
in_loadjson = false;
|
||||
}
|
||||
|
||||
void kv_cli_t::handle_cmd(const std::vector<std::string> & cmd, std::function<void(int)> cb)
|
||||
{
|
||||
if (!cmd.size())
|
||||
{
|
||||
cb(-EINVAL);
|
||||
return;
|
||||
}
|
||||
auto & opname = cmd[0];
|
||||
if (!opened && opname != "open" && opname != "config" && opname != "quit" && opname != "q")
|
||||
{
|
||||
fprintf(stderr, "Error: database not opened\n");
|
||||
cb(-EINVAL);
|
||||
return;
|
||||
}
|
||||
if (opname == "open")
|
||||
{
|
||||
auto name = cmd.size() > 1 ? cmd[1] : "";
|
||||
uint64_t pool_id = 0;
|
||||
inode_t inode_id = 0;
|
||||
int scanned = sscanf(name.c_str(), "%lu %lu", &pool_id, &inode_id);
|
||||
if (scanned < 2 || !pool_id || !inode_id)
|
||||
{
|
||||
inode_id = 0;
|
||||
name = trim(name);
|
||||
for (auto & ic: cli->st_cli.inode_config)
|
||||
{
|
||||
if (ic.second.name == name)
|
||||
{
|
||||
inode_id = ic.first;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!inode_id)
|
||||
{
|
||||
fprintf(stderr, "Usage: open <image> OR open <pool_id> <inode_id>\n");
|
||||
cb(-EINVAL);
|
||||
return;
|
||||
}
|
||||
}
|
||||
else
|
||||
inode_id = INODE_WITH_POOL(pool_id, inode_id);
|
||||
db->open(inode_id, cfg, [=](int res)
|
||||
{
|
||||
if (res < 0)
|
||||
{
|
||||
fprintf(stderr, "Error opening index: %s (code %d)\n", strerror(-res), res);
|
||||
}
|
||||
else
|
||||
{
|
||||
opened = true;
|
||||
fprintf(interactive ? stdout : stderr, "Index opened. Current size: %lu bytes\n", db->get_size());
|
||||
}
|
||||
cb(res);
|
||||
});
|
||||
}
|
||||
else if (opname == "config")
|
||||
{
|
||||
if (cmd.size() < 3)
|
||||
{
|
||||
fprintf(stderr, "Usage: config <property> <value>\n");
|
||||
cb(-EINVAL);
|
||||
return;
|
||||
}
|
||||
auto & key = cmd[1];
|
||||
auto & value = cmd[2];
|
||||
if (key != "kv_memory_limit" &&
|
||||
key != "kv_allocate_blocks" &&
|
||||
key != "kv_evict_max_misses" &&
|
||||
key != "kv_evict_attempts_per_level" &&
|
||||
key != "kv_evict_unused_age" &&
|
||||
key != "kv_log_level" &&
|
||||
key != "kv_block_size")
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Allowed properties: kv_block_size, kv_memory_limit, kv_allocate_blocks,"
|
||||
" kv_evict_max_misses, kv_evict_attempts_per_level, kv_evict_unused_age, kv_log_level\n"
|
||||
);
|
||||
cb(-EINVAL);
|
||||
}
|
||||
else if (key == "kv_block_size")
|
||||
{
|
||||
if (opened)
|
||||
{
|
||||
fprintf(stderr, "kv_block_size can't be set after opening DB\n");
|
||||
cb(-EINVAL);
|
||||
}
|
||||
else
|
||||
{
|
||||
cfg[key] = value;
|
||||
cb(0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
cfg[key] = value;
|
||||
db->set_config(cfg);
|
||||
cb(0);
|
||||
}
|
||||
}
|
||||
else if (opname == "get" || opname == "set" || opname == "del")
|
||||
{
|
||||
if (opname == "get" || opname == "del")
|
||||
{
|
||||
if (cmd.size() < 2)
|
||||
{
|
||||
fprintf(stderr, "Usage: %s <key>\n", opname.c_str());
|
||||
cb(-EINVAL);
|
||||
return;
|
||||
}
|
||||
auto & key = cmd[1];
|
||||
if (opname == "get")
|
||||
{
|
||||
db->get(key, [this, cb](int res, const std::string & value)
|
||||
{
|
||||
if (res < 0)
|
||||
fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
|
||||
else
|
||||
{
|
||||
write(1, value.c_str(), value.size());
|
||||
write(1, "\n", 1);
|
||||
}
|
||||
cb(res);
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
db->del(key, [this, cb](int res)
|
||||
{
|
||||
if (res < 0)
|
||||
fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
|
||||
else
|
||||
fprintf(interactive ? stdout : stderr, "OK\n");
|
||||
cb(res);
|
||||
});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (cmd.size() < 3)
|
||||
{
|
||||
fprintf(stderr, "Usage: set <key> <value>\n");
|
||||
cb(-EINVAL);
|
||||
return;
|
||||
}
|
||||
auto & key = cmd[1];
|
||||
auto & value = cmd[2];
|
||||
db->set(key, value, [this, cb, l = loading_json](int res)
|
||||
{
|
||||
if (res < 0)
|
||||
fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
|
||||
else if (!l)
|
||||
fprintf(interactive ? stdout : stderr, "OK\n");
|
||||
cb(res);
|
||||
});
|
||||
}
|
||||
}
|
||||
else if (opname == "list" || opname == "dump" || opname == "dumpjson")
|
||||
{
|
||||
kv_cli_list_t *lst = new kv_cli_list_t;
|
||||
std::string start = cmd.size() >= 2 ? cmd[1] : "";
|
||||
std::string end = cmd.size() >= 3 ? cmd[2] : "";
|
||||
lst->handle = db->list_start(start);
|
||||
lst->db = db;
|
||||
lst->format = opname == "dump" ? 1 : (opname == "dumpjson" ? 2 : 0);
|
||||
lst->cb = std::move(cb);
|
||||
db->list_next(lst->handle, [lst](int res, const std::string & key, const std::string & value)
|
||||
{
|
||||
if (res < 0)
|
||||
{
|
||||
if (res != -ENOENT)
|
||||
{
|
||||
fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
|
||||
}
|
||||
if (lst->format == 2)
|
||||
printf("\n}\n");
|
||||
lst->db->list_close(lst->handle);
|
||||
lst->cb(res == -ENOENT ? 0 : res);
|
||||
delete lst;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (lst->format == 2)
|
||||
printf(lst->n ? ",\n %s: %s" : "{\n %s: %s", addslashes(key).c_str(), addslashes(value).c_str());
|
||||
else if (lst->format == 1)
|
||||
printf("set %s %s\n", auto_addslashes(key).c_str(), value.c_str());
|
||||
else
|
||||
printf("%s = %s\n", key.c_str(), value.c_str());
|
||||
lst->n++;
|
||||
lst->db->list_next(lst->handle, NULL);
|
||||
}
|
||||
});
|
||||
}
|
||||
else if (opname == "loadjson")
|
||||
{
|
||||
loading_json = true;
|
||||
load_state = 0;
|
||||
load_cb = cb;
|
||||
loadjson();
|
||||
}
|
||||
else if (opname == "close")
|
||||
{
|
||||
db->close([=]()
|
||||
{
|
||||
fprintf(interactive ? stdout : stderr, "Index closed\n");
|
||||
opened = false;
|
||||
cb(0);
|
||||
});
|
||||
}
|
||||
else if (opname == "quit" || opname == "q")
|
||||
{
|
||||
::close(0);
|
||||
finished = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(
|
||||
stderr, "Unknown operation: %s. Supported operations:\n"
|
||||
"open <image>\nopen <pool_id> <inode_id>\n"
|
||||
"config <property> <value>\n"
|
||||
"get <key>\nset <key> <value>\ndel <key>\n"
|
||||
"list [<start> [end]]\ndump [<start> [end]]\ndumpjson [<start> [end]]\nloadjson\n"
|
||||
"close\nquit\n", opname.c_str()
|
||||
);
|
||||
cb(-EINVAL);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int narg, const char *args[])
|
||||
{
|
||||
setvbuf(stdout, NULL, _IONBF, 0);
|
||||
setvbuf(stderr, NULL, _IONBF, 0);
|
||||
exe_name = args[0];
|
||||
kv_cli_t *p = new kv_cli_t();
|
||||
p->parse_args(narg, args);
|
||||
p->run();
|
||||
delete p;
|
||||
return 0;
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// Vitastor shared key/value database
|
||||
// Parallel optimistic B-Tree O:-)
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "cluster_client.h"
|
||||
|
||||
struct kv_db_t;
|
||||
|
||||
// Public wrapper around the key/value database implementation (kv_db_t),
// bound to one cluster client.
struct kv_dbw_t
{
    kv_dbw_t(cluster_client_t *cli);
    ~kv_dbw_t();

    // Open the index stored in the given inode; cfg carries kv_* options
    void open(inode_t inode_id, json11::Json cfg, std::function<void(int)> cb);
    void set_config(json11::Json cfg);
    void close(std::function<void()> cb);

    // Current size of the index in bytes
    uint64_t get_size();

    // Point operations. For set/del, cas_compare (if provided) turns the
    // operation into a compare-and-swap against the current value.
    void get(const std::string & key, std::function<void(int res, const std::string & value)> cb,
        bool allow_old_cached = false);
    void set(const std::string & key, const std::string & value, std::function<void(int res)> cb,
        std::function<bool(int res, const std::string & value)> cas_compare = NULL);
    void del(const std::string & key, std::function<void(int res)> cb,
        std::function<bool(int res, const std::string & value)> cas_compare = NULL);

    // Iteration protocol: list_start() -> repeated list_next() -> list_close()
    void* list_start(const std::string & start);
    void list_next(void *handle, std::function<void(int res, const std::string & key, const std::string & value)> cb);
    void list_close(void *handle);

    kv_db_t *db; // underlying implementation
};
|
|
@ -0,0 +1,701 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// Vitastor shared key/value database stress tester / benchmark
|
||||
|
||||
#define _XOPEN_SOURCE
|
||||
#include <limits.h>
|
||||
|
||||
#include <netinet/tcp.h>
|
||||
#include <sys/epoll.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
//#include <signal.h>
|
||||
|
||||
#include "epoll_manager.h"
|
||||
#include "str_util.h"
|
||||
#include "kv_db.h"
|
||||
|
||||
const char *exe_name = NULL;
|
||||
|
||||
// State of one in-flight listing operation (used for consistency checking)
struct kv_test_listing_t
{
    uint64_t count = 0, done = 0;    // requested vs. already-verified key count
    void *handle = NULL;             // iterator handle from kv_dbw_t::list_start()
    std::string next_after;          // last key confirmed present; "" = from the beginning
    std::set<std::string> inflights; // keys being changed while this listing runs (not checked)
    timespec tv_begin;               // start time for latency accounting
    bool error = false;              // set on mismatch / lost key / extra key
};
|
||||
|
||||
// Accumulated latency counters for one operation type
struct kv_test_lat_t
{
    const char *name = NULL;      // operation name used in stats output
    uint64_t usec = 0, count = 0; // total latency in microseconds and op count
};
|
||||
|
||||
// Per-operation-type statistics of the whole test run
struct kv_test_stat_t
{
    kv_test_lat_t get, add, update, del, list;
    uint64_t list_keys = 0; // total number of keys returned by listings
};
|
||||
|
||||
// Key/value DB stress tester: issues a random mix of get/add/update/del/list
// and DB reopen operations, mirrors the expected DB contents in 'values',
// and verifies every result against that mirror.
class kv_test_t
{
public:
    // Config (set from command line via parse_config())
    json11::Json::object kv_cfg;          // kv_* options passed through to the DB
    std::string key_prefix, key_suffix;   // added to generated keys to avoid collisions
    uint64_t inode_id = 0;                // pool+inode the index lives in
    uint64_t op_count = 1000000;          // total operations; 0 = unlimited
    uint64_t runtime_sec = 0;             // wall-clock limit; 0 = unlimited
    uint64_t parallelism = 4;             // max operations in flight
    // Relative probabilities of each operation type (out of total_prob)
    uint64_t reopen_prob = 1;
    uint64_t get_prob = 30000;
    uint64_t add_prob = 20000;
    uint64_t update_prob = 20000;
    uint64_t del_prob = 5000;
    uint64_t list_prob = 300;
    uint64_t min_key_len = 10;
    uint64_t max_key_len = 70;
    uint64_t min_value_len = 50;
    uint64_t max_value_len = 300;
    uint64_t min_list_count = 10;         // 0 = list all keys
    uint64_t max_list_count = 1000;
    uint64_t print_stats_interval = 1;    // seconds between stat lines
    bool json_output = false;
    uint64_t log_level = 1;
    bool trace = false;                   // log_level >= 10
    bool stop_on_error = false;           // exit(1) on first verification failure
    // FIXME: Multiple clients
    kv_test_stat_t stat, prev_stat;
    timespec prev_stat_time, start_stat_time;

    // State
    kv_dbw_t *db = NULL;
    ring_loop_t *ringloop = NULL;
    epoll_manager_t *epmgr = NULL;
    cluster_client_t *cli = NULL;
    ring_consumer_t consumer;
    bool finished = false;
    uint64_t total_prob = 0;              // sum of all *_prob values
    uint64_t ops_sent = 0, ops_done = 0;
    int stat_timer_id = -1;
    int in_progress = 0;
    bool reopening = false;               // true while close+open is in flight
    std::set<kv_test_listing_t*> listings;     // listings currently running
    std::set<std::string> changing_keys;       // keys with a set/del in flight
    std::map<std::string, std::string> values; // mirror of expected DB contents

    ~kv_test_t();

    static json11::Json::object parse_args(int narg, const char *args[]);
    void parse_config(json11::Json cfg);
    void run(json11::Json cfg);
    void loop();
    void print_stats(kv_test_stat_t & prev_stat, timespec & prev_stat_time);
    void print_total_stats();
    void start_change(const std::string & key);
    void stop_change(const std::string & key);
    void add_stat(kv_test_lat_t & stat, timespec tv_begin);
};
|
||||
|
||||
// Tear down in reverse order of construction; 'delete NULL' is a no-op.
kv_test_t::~kv_test_t()
{
    delete db;
    if (cli)
    {
        // Push out buffered writes before destroying the client
        cli->flush();
        delete cli;
    }
    delete epmgr;
    delete ringloop;
}
|
||||
|
||||
// Parse command-line arguments into a flat JSON object of string values.
// "--flag value" pairs become cfg["flag"] = "value"; "--json" and a trailing
// option without a value become "1". Prints usage and exits on -h/--help.
json11::Json::object kv_test_t::parse_args(int narg, const char *args[])
{
    json11::Json::object cfg;
    for (int i = 1; i < narg; i++)
    {
        if (!strcmp(args[i], "-h") || !strcmp(args[i], "--help"))
        {
            printf(
                "Vitastor Key/Value DB stress tester / benchmark\n"
                "(c) Vitaliy Filippov, 2023+ (VNPL-1.1)\n"
                "\n"
                "USAGE: %s --pool_id POOL_ID --inode_id INODE_ID [OPTIONS]\n"
                " --op_count 1000000\n"
                " Total operations to run during test. 0 means unlimited\n"
                " --key_prefix \"\"\n"
                " Prefix for all keys read or written (to avoid collisions)\n"
                " --key_suffix \"\"\n"
                " Suffix for all keys read or written (to avoid collisions, but scan all DB)\n"
                " --runtime 0\n"
                " Run for this number of seconds. 0 means unlimited\n"
                " --parallelism 4\n"
                " Run this number of operations in parallel\n"
                " --get_prob 30000\n"
                " Fraction of key retrieve operations\n"
                " --add_prob 20000\n"
                " Fraction of key addition operations\n"
                " --update_prob 20000\n"
                " Fraction of key update operations\n"
                // FIX: help previously claimed 30000, but the actual default
                // of del_prob is 5000 (see the class definition)
                " --del_prob 5000\n"
                " Fraction of key delete operations\n"
                " --list_prob 300\n"
                " Fraction of listing operations\n"
                " --reopen_prob 1\n"
                " Fraction of database reopens\n"
                " --min_key_len 10\n"
                " Minimum key size in bytes\n"
                " --max_key_len 70\n"
                " Maximum key size in bytes\n"
                " --min_value_len 50\n"
                " Minimum value size in bytes\n"
                " --max_value_len 300\n"
                " Maximum value size in bytes\n"
                " --min_list_count 10\n"
                " Minimum number of keys read in listing (0 = all keys)\n"
                " --max_list_count 1000\n"
                " Maximum number of keys read in listing\n"
                " --print_stats 1\n"
                " Print operation statistics every this number of seconds\n"
                " --json\n"
                " JSON output\n"
                " --stop_on_error 0\n"
                " Stop on first execution error, mismatch, lost key or extra key during listing\n"
                " --kv_block_size 4k\n"
                " Key-value B-Tree block size\n"
                " --kv_memory_limit 128M\n"
                " Maximum memory to use for vitastor-kv index cache\n"
                " --kv_allocate_blocks 4\n"
                " Number of PG blocks used for new tree block allocation in parallel\n"
                " --kv_evict_max_misses 10\n"
                " Eviction algorithm parameter: retry eviction from another random spot\n"
                " if this number of keys is used currently or was used recently\n"
                " --kv_evict_attempts_per_level 3\n"
                " Retry eviction at most this number of times per tree level, starting\n"
                " with bottom-most levels\n"
                " --kv_evict_unused_age 1000\n"
                " Evict only keys unused during this number of last operations\n"
                " --kv_log_level 1\n"
                " Log level. 0 = errors, 1 = warnings, 10 = trace operations\n",
                exe_name
            );
            exit(0);
        }
        else if (args[i][0] == '-' && args[i][1] == '-')
        {
            const char *opt = args[i]+2;
            // "--json" is a flag; a trailing option without a value defaults to "1"
            cfg[opt] = !strcmp(opt, "json") || i == narg-1 ? "1" : args[++i];
        }
    }
    return cfg;
}
|
||||
|
||||
// Apply parsed command-line options to the tester configuration.
// Options with value 0 / absent keep their compiled-in defaults where the
// default is non-zero; explicit zero is honored for the *_prob/min_* options.
void kv_test_t::parse_config(json11::Json cfg)
{
    inode_id = INODE_WITH_POOL(cfg["pool_id"].uint64_value(), cfg["inode_id"].uint64_value());
    if (cfg["op_count"].uint64_value() > 0)
        op_count = cfg["op_count"].uint64_value();
    key_prefix = cfg["key_prefix"].string_value();
    key_suffix = cfg["key_suffix"].string_value();
    if (cfg["runtime"].uint64_value() > 0)
        runtime_sec = cfg["runtime"].uint64_value();
    if (cfg["parallelism"].uint64_value() > 0)
        parallelism = cfg["parallelism"].uint64_value();
    if (!cfg["reopen_prob"].is_null())
        reopen_prob = cfg["reopen_prob"].uint64_value();
    if (!cfg["get_prob"].is_null())
        get_prob = cfg["get_prob"].uint64_value();
    if (!cfg["add_prob"].is_null())
        add_prob = cfg["add_prob"].uint64_value();
    if (!cfg["update_prob"].is_null())
        update_prob = cfg["update_prob"].uint64_value();
    if (!cfg["del_prob"].is_null())
        del_prob = cfg["del_prob"].uint64_value();
    if (!cfg["list_prob"].is_null())
        list_prob = cfg["list_prob"].uint64_value();
    if (!cfg["min_key_len"].is_null())
        min_key_len = cfg["min_key_len"].uint64_value();
    if (cfg["max_key_len"].uint64_value() > 0)
        max_key_len = cfg["max_key_len"].uint64_value();
    if (!cfg["min_value_len"].is_null())
        min_value_len = cfg["min_value_len"].uint64_value();
    if (cfg["max_value_len"].uint64_value() > 0)
        max_value_len = cfg["max_value_len"].uint64_value();
    if (!cfg["min_list_count"].is_null())
        min_list_count = cfg["min_list_count"].uint64_value();
    if (!cfg["max_list_count"].is_null())
        max_list_count = cfg["max_list_count"].uint64_value();
    if (!cfg["print_stats"].is_null())
        print_stats_interval = cfg["print_stats"].uint64_value();
    if (!cfg["json"].is_null())
        json_output = true;
    if (!cfg["stop_on_error"].is_null())
        stop_on_error = cfg["stop_on_error"].bool_value();
    // kv_* tuning options are passed through to the DB verbatim
    auto pass_through = [&](const char *name)
    {
        if (!cfg[name].is_null())
            kv_cfg[name] = cfg[name];
    };
    pass_through("kv_block_size");
    pass_through("kv_memory_limit");
    pass_through("kv_allocate_blocks");
    pass_through("kv_evict_max_misses");
    pass_through("kv_evict_attempts_per_level");
    pass_through("kv_evict_unused_age");
    if (!cfg["kv_log_level"].is_null())
    {
        log_level = cfg["kv_log_level"].uint64_value();
        trace = log_level >= 10;
        kv_cfg["kv_log_level"] = cfg["kv_log_level"];
    }
    total_prob = reopen_prob+get_prob+add_prob+update_prob+del_prob+list_prob;
    stat.get.name = "get";
    stat.add.name = "add";
    stat.update.name = "update";
    stat.del.name = "del";
    stat.list.name = "list";
}
|
||||
|
||||
// Set up the cluster client, open the index, and drive the event loop until
// the configured number of operations has completed.
void kv_test_t::run(json11::Json cfg)
{
    srand48(time(NULL));
    parse_config(cfg);
    // Create the client stack
    ringloop = new ring_loop_t(512);
    epmgr = new epoll_manager_t(ringloop);
    cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
    db = new kv_dbw_t(cli);
    // Wait until cluster metadata is loaded
    while (!cli->is_ready())
    {
        ringloop->loop();
        if (cli->is_ready())
            break;
        ringloop->wait();
    }
    // Open the index; loop() stays idle while reopening is true
    reopening = true;
    db->open(inode_id, kv_cfg, [this](int res)
    {
        reopening = false;
        if (res < 0)
        {
            fprintf(stderr, "ERROR: Open index: %d (%s)\n", res, strerror(-res));
            exit(1);
        }
        if (trace)
            printf("Index opened\n");
        ringloop->wakeup();
    });
    consumer.loop = [this]() { loop(); };
    ringloop->register_consumer(&consumer);
    if (print_stats_interval)
        stat_timer_id = epmgr->tfd->set_timer(print_stats_interval*1000, true, [this](int) { print_stats(prev_stat, prev_stat_time); });
    clock_gettime(CLOCK_REALTIME, &start_stat_time);
    prev_stat_time = start_stat_time;
    // Main loop
    while (!finished)
    {
        ringloop->loop();
        if (!finished)
            ringloop->wait();
    }
    if (stat_timer_id >= 0)
        epmgr->tfd->clear_timer(stat_timer_id);
    ringloop->unregister_consumer(&consumer);
    // Print total stats
    print_total_stats();
    // Destroy the client stack
    delete db;
    db = NULL;
    cli->flush();
    delete cli;
    delete epmgr;
    delete ringloop;
    cli = NULL;
    epmgr = NULL;
    ringloop = NULL;
}
|
||||
|
||||
// Alphabet for random keys/values. Note: generation below uses lrand48() % 64,
// so the final '/' is never actually selected.
static const char *base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789@+/";

// Produce a random string of exactly 'len' characters from base64_chars.
// Deterministic for a fixed srand48() seed.
std::string random_str(int len)
{
    std::string out(len, 0);
    for (int i = 0; i < len; i++)
        out[i] = base64_chars[lrand48() % 64];
    return out;
}
|
||||
|
||||
void kv_test_t::loop()
|
||||
{
|
||||
if (reopening)
|
||||
{
|
||||
return;
|
||||
}
|
||||
if (ops_done >= op_count)
|
||||
{
|
||||
finished = true;
|
||||
}
|
||||
while (!finished && ops_sent < op_count && in_progress < parallelism)
|
||||
{
|
||||
uint64_t dice = (lrand48() % total_prob);
|
||||
if (dice < reopen_prob)
|
||||
{
|
||||
reopening = true;
|
||||
db->close([this]()
|
||||
{
|
||||
if (trace)
|
||||
printf("Index closed\n");
|
||||
db->open(inode_id, kv_cfg, [this](int res)
|
||||
{
|
||||
reopening = false;
|
||||
if (res < 0)
|
||||
{
|
||||
fprintf(stderr, "ERROR: Reopen index: %d (%s)\n", res, strerror(-res));
|
||||
finished = true;
|
||||
return;
|
||||
}
|
||||
if (trace)
|
||||
printf("Index reopened\n");
|
||||
ringloop->wakeup();
|
||||
});
|
||||
});
|
||||
return;
|
||||
}
|
||||
else if (dice < reopen_prob+get_prob)
|
||||
{
|
||||
// get existing
|
||||
auto key = random_str(max_key_len);
|
||||
auto k_it = values.lower_bound(key);
|
||||
if (k_it == values.end())
|
||||
continue;
|
||||
key = k_it->first;
|
||||
if (changing_keys.find(key) != changing_keys.end())
|
||||
continue;
|
||||
in_progress++;
|
||||
ops_sent++;
|
||||
if (trace)
|
||||
printf("get %s\n", key.c_str());
|
||||
timespec tv_begin;
|
||||
clock_gettime(CLOCK_REALTIME, &tv_begin);
|
||||
db->get(key, [this, key, tv_begin](int res, const std::string & value)
|
||||
{
|
||||
add_stat(stat.get, tv_begin);
|
||||
ops_done++;
|
||||
in_progress--;
|
||||
auto it = values.find(key);
|
||||
if (res != (it == values.end() ? -ENOENT : 0))
|
||||
{
|
||||
fprintf(stderr, "ERROR: get %s: %d (%s)\n", key.c_str(), res, strerror(-res));
|
||||
if (stop_on_error)
|
||||
exit(1);
|
||||
}
|
||||
else if (it != values.end() && value != it->second)
|
||||
{
|
||||
fprintf(stderr, "ERROR: get %s: mismatch: %s vs %s\n", key.c_str(), value.c_str(), it->second.c_str());
|
||||
if (stop_on_error)
|
||||
exit(1);
|
||||
}
|
||||
ringloop->wakeup();
|
||||
});
|
||||
}
|
||||
else if (dice < reopen_prob+get_prob+add_prob+update_prob)
|
||||
{
|
||||
bool is_add = false;
|
||||
std::string key;
|
||||
if (dice < reopen_prob+get_prob+add_prob)
|
||||
{
|
||||
// add
|
||||
is_add = true;
|
||||
uint64_t key_len = min_key_len + (max_key_len > min_key_len ? lrand48() % (max_key_len-min_key_len) : 0);
|
||||
key = key_prefix + random_str(key_len) + key_suffix;
|
||||
}
|
||||
else
|
||||
{
|
||||
// update
|
||||
key = random_str(max_key_len);
|
||||
auto k_it = values.lower_bound(key);
|
||||
if (k_it == values.end())
|
||||
continue;
|
||||
key = k_it->first;
|
||||
}
|
||||
if (changing_keys.find(key) != changing_keys.end())
|
||||
continue;
|
||||
uint64_t value_len = min_value_len + (max_value_len > min_value_len ? lrand48() % (max_value_len-min_value_len) : 0);
|
||||
auto value = random_str(value_len);
|
||||
start_change(key);
|
||||
ops_sent++;
|
||||
in_progress++;
|
||||
if (trace)
|
||||
printf("set %s = %s\n", key.c_str(), value.c_str());
|
||||
timespec tv_begin;
|
||||
clock_gettime(CLOCK_REALTIME, &tv_begin);
|
||||
db->set(key, value, [this, key, value, tv_begin, is_add](int res)
|
||||
{
|
||||
add_stat(is_add ? stat.add : stat.update, tv_begin);
|
||||
stop_change(key);
|
||||
ops_done++;
|
||||
in_progress--;
|
||||
if (res != 0)
|
||||
{
|
||||
fprintf(stderr, "ERROR: set %s = %s: %d (%s)\n", key.c_str(), value.c_str(), res, strerror(-res));
|
||||
if (stop_on_error)
|
||||
exit(1);
|
||||
}
|
||||
else
|
||||
{
|
||||
values[key] = value;
|
||||
}
|
||||
ringloop->wakeup();
|
||||
}, NULL);
|
||||
}
|
||||
else if (dice < reopen_prob+get_prob+add_prob+update_prob+del_prob)
|
||||
{
|
||||
// delete
|
||||
auto key = random_str(max_key_len);
|
||||
auto k_it = values.lower_bound(key);
|
||||
if (k_it == values.end())
|
||||
continue;
|
||||
key = k_it->first;
|
||||
if (changing_keys.find(key) != changing_keys.end())
|
||||
continue;
|
||||
start_change(key);
|
||||
ops_sent++;
|
||||
in_progress++;
|
||||
if (trace)
|
||||
printf("del %s\n", key.c_str());
|
||||
timespec tv_begin;
|
||||
clock_gettime(CLOCK_REALTIME, &tv_begin);
|
||||
db->del(key, [this, key, tv_begin](int res)
|
||||
{
|
||||
add_stat(stat.del, tv_begin);
|
||||
stop_change(key);
|
||||
ops_done++;
|
||||
in_progress--;
|
||||
if (res != 0)
|
||||
{
|
||||
fprintf(stderr, "ERROR: del %s: %d (%s)\n", key.c_str(), res, strerror(-res));
|
||||
if (stop_on_error)
|
||||
exit(1);
|
||||
}
|
||||
else
|
||||
{
|
||||
values.erase(key);
|
||||
}
|
||||
ringloop->wakeup();
|
||||
}, NULL);
|
||||
}
|
||||
else if (dice < reopen_prob+get_prob+add_prob+update_prob+del_prob+list_prob)
|
||||
{
|
||||
// list
|
||||
ops_sent++;
|
||||
in_progress++;
|
||||
auto key = random_str(max_key_len);
|
||||
auto lst = new kv_test_listing_t;
|
||||
auto k_it = values.lower_bound(key);
|
||||
lst->count = min_list_count + (max_list_count > min_list_count ? lrand48() % (max_list_count-min_list_count) : 0);
|
||||
lst->handle = db->list_start(k_it == values.begin() ? key_prefix : key);
|
||||
lst->next_after = k_it == values.begin() ? key_prefix : key;
|
||||
lst->inflights = changing_keys;
|
||||
listings.insert(lst);
|
||||
if (trace)
|
||||
printf("list from %s\n", key.c_str());
|
||||
clock_gettime(CLOCK_REALTIME, &lst->tv_begin);
|
||||
db->list_next(lst->handle, [this, lst](int res, const std::string & key, const std::string & value)
|
||||
{
|
||||
if (log_level >= 11)
|
||||
printf("list: %s = %s\n", key.c_str(), value.c_str());
|
||||
if (res >= 0 && key_prefix.size() && (key.size() < key_prefix.size() ||
|
||||
key.substr(0, key_prefix.size()) != key_prefix))
|
||||
{
|
||||
// stop at this key
|
||||
res = -ENOENT;
|
||||
}
|
||||
if (res < 0 || (lst->count > 0 && lst->done >= lst->count))
|
||||
{
|
||||
add_stat(stat.list, lst->tv_begin);
|
||||
if (res == 0)
|
||||
{
|
||||
// ok (done >= count)
|
||||
}
|
||||
else if (res != -ENOENT)
|
||||
{
|
||||
fprintf(stderr, "ERROR: list: %d (%s)\n", res, strerror(-res));
|
||||
lst->error = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto k_it = lst->next_after == "" ? values.begin() : values.upper_bound(lst->next_after);
|
||||
while (k_it != values.end())
|
||||
{
|
||||
while (k_it != values.end() && lst->inflights.find(k_it->first) != lst->inflights.end())
|
||||
k_it++;
|
||||
if (k_it != values.end())
|
||||
{
|
||||
fprintf(stderr, "ERROR: list: missing key %s\n", (k_it++)->first.c_str());
|
||||
lst->error = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (lst->error && stop_on_error)
|
||||
exit(1);
|
||||
ops_done++;
|
||||
in_progress--;
|
||||
db->list_close(lst->handle);
|
||||
delete lst;
|
||||
listings.erase(lst);
|
||||
ringloop->wakeup();
|
||||
}
|
||||
else
|
||||
{
|
||||
stat.list_keys++;
|
||||
// Do not check modified keys in listing
|
||||
// Listing may return their old or new state
|
||||
if ((!key_suffix.size() || key.size() >= key_suffix.size() &&
|
||||
key.substr(key.size()-key_suffix.size()) == key_suffix) &&
|
||||
lst->inflights.find(key) == lst->inflights.end())
|
||||
{
|
||||
lst->done++;
|
||||
auto k_it = lst->next_after == "" ? values.begin() : values.upper_bound(lst->next_after);
|
||||
while (true)
|
||||
{
|
||||
while (k_it != values.end() && lst->inflights.find(k_it->first) != lst->inflights.end())
|
||||
{
|
||||
k_it++;
|
||||
}
|
||||
if (k_it == values.end() || k_it->first > key)
|
||||
{
|
||||
fprintf(stderr, "ERROR: list: extra key %s\n", key.c_str());
|
||||
lst->error = true;
|
||||
break;
|
||||
}
|
||||
else if (k_it->first < key)
|
||||
{
|
||||
fprintf(stderr, "ERROR: list: missing key %s\n", k_it->first.c_str());
|
||||
lst->error = true;
|
||||
lst->next_after = k_it->first;
|
||||
k_it++;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (k_it->second != value)
|
||||
{
|
||||
fprintf(stderr, "ERROR: list: mismatch: %s = %s but should be %s\n",
|
||||
key.c_str(), value.c_str(), k_it->second.c_str());
|
||||
lst->error = true;
|
||||
}
|
||||
lst->next_after = k_it->first;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
db->list_next(lst->handle, NULL);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fold one operation's latency (measured from tv_begin to now) into 'stat'.
void kv_test_t::add_stat(kv_test_lat_t & stat, timespec tv_begin)
{
    timespec now;
    clock_gettime(CLOCK_REALTIME, &now);
    int64_t elapsed = (now.tv_sec - tv_begin.tv_sec)*1000000 +
        (now.tv_nsec - tv_begin.tv_nsec)/1000;
    // Clock may be stepped backwards (CLOCK_REALTIME) - ignore negative deltas
    if (elapsed > 0)
        stat.usec += elapsed;
    stat.count++;
}
|
||||
|
||||
// Print statistics accumulated since prev_stat/prev_stat_time, then roll the
// "previous" snapshot forward. Output is either aligned text columns or one
// JSON object per call, depending on json_output.
void kv_test_t::print_stats(kv_test_stat_t & prev_stat, timespec & prev_stat_time)
{
    timespec now;
    clock_gettime(CLOCK_REALTIME, &now);
    int64_t usec = (now.tv_sec - prev_stat_time.tv_sec)*1000000 +
        (now.tv_nsec - prev_stat_time.tv_nsec)/1000;
    if (usec > 0)
    {
        kv_test_lat_t *lats[] = { &stat.get, &stat.add, &stat.update, &stat.del, &stat.list };
        kv_test_lat_t *prev[] = { &prev_stat.get, &prev_stat.add, &prev_stat.update, &prev_stat.del, &prev_stat.list };
        if (!json_output)
        {
            char buf[128] = { 0 };
            for (size_t i = 0; i < sizeof(lats)/sizeof(lats[0]); i++)
            {
                uint64_t cnt = lats[i]->count - prev[i]->count;
                snprintf(buf, sizeof(buf)-1, "%.1f %s/s (%lu us)", cnt*1000000.0/usec,
                    lats[i]->name, (lats[i]->usec-prev[i]->usec)/(cnt > 0 ? cnt : 1));
                // Pad each cell with spaces to a fixed column width
                size_t k;
                for (k = strlen(buf); k < strlen(lats[i]->name)+21; k++)
                    buf[k] = ' ';
                buf[k] = 0;
                printf("%s", buf);
            }
            printf("\n");
        }
        else
        {
            int64_t runtime = (now.tv_sec - start_stat_time.tv_sec)*1000000 +
                (now.tv_nsec - start_stat_time.tv_nsec)/1000;
            printf("{\"runtime\":%.1f", (double)runtime/1000000.0);
            for (size_t i = 0; i < sizeof(lats)/sizeof(lats[0]); i++)
            {
                // Only operation types active in this interval are emitted
                if (lats[i]->count > prev[i]->count)
                {
                    printf(
                        ",\"%s\":{\"avg\":{\"iops\":%.1f,\"usec\":%lu},\"total\":{\"count\":%lu,\"usec\":%lu}}",
                        lats[i]->name, (lats[i]->count-prev[i]->count)*1000000.0/usec,
                        (lats[i]->usec-prev[i]->usec)/(lats[i]->count-prev[i]->count),
                        lats[i]->count, lats[i]->usec
                    );
                }
            }
            printf("}\n");
        }
    }
    prev_stat = stat;
    prev_stat_time = now;
}
|
||||
|
||||
void kv_test_t::print_total_stats()
|
||||
{
|
||||
if (!json_output)
|
||||
printf("Total:\n");
|
||||
kv_test_stat_t start_stats;
|
||||
timespec start_stat_time = this->start_stat_time;
|
||||
print_stats(start_stats, start_stat_time);
|
||||
}
|
||||
|
||||
void kv_test_t::start_change(const std::string & key)
|
||||
{
|
||||
changing_keys.insert(key);
|
||||
for (auto lst: listings)
|
||||
{
|
||||
lst->inflights.insert(key);
|
||||
}
|
||||
}
|
||||
|
||||
void kv_test_t::stop_change(const std::string & key)
|
||||
{
|
||||
changing_keys.erase(key);
|
||||
}
|
||||
|
||||
int main(int narg, const char *args[])
|
||||
{
|
||||
setvbuf(stdout, NULL, _IONBF, 0);
|
||||
setvbuf(stderr, NULL, _IONBF, 0);
|
||||
exe_name = args[0];
|
||||
kv_test_t *p = new kv_test_t();
|
||||
p->run(kv_test_t::parse_args(narg, args));
|
||||
delete p;
|
||||
return 0;
|
||||
}
|
|
@ -146,7 +146,7 @@ public:
|
|||
" Note that nbd_timeout, nbd_max_devices and nbd_max_part options may also be specified\n"
|
||||
" in /etc/vitastor/vitastor.conf or in other configuration file specified with --config_file.\n"
|
||||
" --logfile /path/to/log/file.txt\n"
|
||||
" Wite log messages to the specified file instead of dropping them (in background mode)\n"
|
||||
" Write log messages to the specified file instead of dropping them (in background mode)\n"
|
||||
" or printing them to the standard output (in foreground mode).\n"
|
||||
" --dev_num N\n"
|
||||
" Use the specified device /dev/nbdN instead of automatic selection.\n"
|
||||
|
@ -298,7 +298,7 @@ public:
|
|||
}
|
||||
}
|
||||
}
|
||||
if (cfg["logfile"].is_string())
|
||||
if (cfg["logfile"].string_value() != "")
|
||||
{
|
||||
logfile = cfg["logfile"].string_value();
|
||||
}
|
||||
|
|
|
@ -1,23 +1,18 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// NFS connection handler for NFS proxy
|
||||
// NFS proxy over Vitastor block images
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
#include "str_util.h"
|
||||
|
||||
#include "nfs_proxy.h"
|
||||
|
||||
#include "nfs_common.h"
|
||||
#include "nfs_block.h"
|
||||
#include "nfs/nfs.h"
|
||||
|
||||
#include "cli.h"
|
||||
|
||||
#define TRUE 1
|
||||
#define FALSE 0
|
||||
|
||||
#define MAX_REQUEST_SIZE 128*1024*1024
|
||||
|
||||
static unsigned len_pad4(unsigned len)
|
||||
{
|
||||
return len + (len&3 ? 4-(len&3) : 0);
|
||||
|
@ -28,10 +23,10 @@ static std::string get_inode_name(nfs_client_t *self, diropargs3 & what)
|
|||
// Get name
|
||||
std::string dirhash = what.dir;
|
||||
std::string dir;
|
||||
if (dirhash != "roothandle")
|
||||
if (dirhash != NFS_ROOT_HANDLE)
|
||||
{
|
||||
auto dir_it = self->parent->dir_by_hash.find(dirhash);
|
||||
if (dir_it != self->parent->dir_by_hash.end())
|
||||
auto dir_it = self->parent->blockfs->dir_by_hash.find(dirhash);
|
||||
if (dir_it != self->parent->blockfs->dir_by_hash.end())
|
||||
dir = dir_it->second;
|
||||
else
|
||||
return "";
|
||||
|
@ -39,27 +34,12 @@ static std::string get_inode_name(nfs_client_t *self, diropargs3 & what)
|
|||
std::string name = what.name;
|
||||
return (dir.size()
|
||||
? dir+"/"+name
|
||||
: self->parent->name_prefix+name);
|
||||
}
|
||||
|
||||
static nfsstat3 vitastor_nfs_map_err(int err)
|
||||
{
|
||||
return (err == EINVAL ? NFS3ERR_INVAL
|
||||
: (err == ENOENT ? NFS3ERR_NOENT
|
||||
: (err == ENOSPC ? NFS3ERR_NOSPC
|
||||
: (err == EEXIST ? NFS3ERR_EXIST
|
||||
: (err == EIO ? NFS3ERR_IO : (err ? NFS3ERR_IO : NFS3_OK))))));
|
||||
}
|
||||
|
||||
static int nfs3_null_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
rpc_queue_reply(rop);
|
||||
return 0;
|
||||
: self->parent->blockfs->name_prefix+name);
|
||||
}
|
||||
|
||||
static fattr3 get_dir_attributes(nfs_client_t *self, std::string dir)
|
||||
{
|
||||
auto & dinf = self->parent->dir_info.at(dir);
|
||||
auto & dinf = self->parent->blockfs->dir_info.at(dir);
|
||||
return (fattr3){
|
||||
.type = NF3DIR,
|
||||
.mode = 0755,
|
||||
|
@ -108,7 +88,7 @@ static fattr3 get_file_attributes(nfs_client_t *self, inode_t inode_num)
|
|||
};
|
||||
}
|
||||
|
||||
static int nfs3_getattr_proc(void *opaque, rpc_op_t *rop)
|
||||
static int block_nfs3_getattr_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
GETATTR3args *args = (GETATTR3args*)rop->request;
|
||||
|
@ -116,12 +96,12 @@ static int nfs3_getattr_proc(void *opaque, rpc_op_t *rop)
|
|||
bool is_dir = false;
|
||||
std::string dirhash = args->object;
|
||||
std::string dir;
|
||||
if (args->object == "roothandle")
|
||||
if (args->object == NFS_ROOT_HANDLE)
|
||||
is_dir = true;
|
||||
else
|
||||
{
|
||||
auto dir_it = self->parent->dir_by_hash.find(dirhash);
|
||||
if (dir_it != self->parent->dir_by_hash.end())
|
||||
auto dir_it = self->parent->blockfs->dir_by_hash.find(dirhash);
|
||||
if (dir_it != self->parent->blockfs->dir_by_hash.end())
|
||||
{
|
||||
is_dir = true;
|
||||
dir = dir_it->second;
|
||||
|
@ -140,8 +120,8 @@ static int nfs3_getattr_proc(void *opaque, rpc_op_t *rop)
|
|||
else
|
||||
{
|
||||
uint64_t inode_num = 0;
|
||||
auto inode_num_it = self->parent->inode_by_hash.find(dirhash);
|
||||
if (inode_num_it != self->parent->inode_by_hash.end())
|
||||
auto inode_num_it = self->parent->blockfs->inode_by_hash.find(dirhash);
|
||||
if (inode_num_it != self->parent->blockfs->inode_by_hash.end())
|
||||
inode_num = inode_num_it->second;
|
||||
auto inode_it = self->parent->cli->st_cli.inode_config.find(inode_num);
|
||||
if (inode_num && inode_it != self->parent->cli->st_cli.inode_config.end())
|
||||
|
@ -179,16 +159,16 @@ static int nfs3_getattr_proc(void *opaque, rpc_op_t *rop)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int nfs3_setattr_proc(void *opaque, rpc_op_t *rop)
|
||||
static int block_nfs3_setattr_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
SETATTR3args *args = (SETATTR3args*)rop->request;
|
||||
SETATTR3res *reply = (SETATTR3res*)rop->reply;
|
||||
std::string handle = args->object;
|
||||
auto ino_it = self->parent->inode_by_hash.find(handle);
|
||||
if (ino_it == self->parent->inode_by_hash.end())
|
||||
auto ino_it = self->parent->blockfs->inode_by_hash.find(handle);
|
||||
if (ino_it == self->parent->blockfs->inode_by_hash.end())
|
||||
{
|
||||
if (handle == "roothandle" || self->parent->dir_by_hash.find(handle) != self->parent->dir_by_hash.end())
|
||||
if (handle == NFS_ROOT_HANDLE || self->parent->blockfs->dir_by_hash.find(handle) != self->parent->blockfs->dir_by_hash.end())
|
||||
{
|
||||
if (args->new_attributes.size.set_it)
|
||||
{
|
||||
|
@ -228,7 +208,7 @@ static int nfs3_setattr_proc(void *opaque, rpc_op_t *rop)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int nfs3_lookup_proc(void *opaque, rpc_op_t *rop)
|
||||
static int block_nfs3_lookup_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
LOOKUP3args *args = (LOOKUP3args*)rop->request;
|
||||
|
@ -255,8 +235,8 @@ static int nfs3_lookup_proc(void *opaque, rpc_op_t *rop)
|
|||
return 0;
|
||||
}
|
||||
}
|
||||
auto dir_it = self->parent->dir_info.find(full_name);
|
||||
if (dir_it != self->parent->dir_info.end())
|
||||
auto dir_it = self->parent->blockfs->dir_info.find(full_name);
|
||||
if (dir_it != self->parent->blockfs->dir_info.end())
|
||||
{
|
||||
*reply = (LOOKUP3res){
|
||||
.status = NFS3_OK,
|
||||
|
@ -277,7 +257,7 @@ static int nfs3_lookup_proc(void *opaque, rpc_op_t *rop)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int nfs3_access_proc(void *opaque, rpc_op_t *rop)
|
||||
static int block_nfs3_access_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
//nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
ACCESS3args *args = (ACCESS3args*)rop->request;
|
||||
|
@ -292,7 +272,7 @@ static int nfs3_access_proc(void *opaque, rpc_op_t *rop)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int nfs3_readlink_proc(void *opaque, rpc_op_t *rop)
|
||||
static int block_nfs3_readlink_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
//nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
//READLINK3args *args = (READLINK3args*)rop->request;
|
||||
|
@ -303,14 +283,14 @@ static int nfs3_readlink_proc(void *opaque, rpc_op_t *rop)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int nfs3_read_proc(void *opaque, rpc_op_t *rop)
|
||||
static int block_nfs3_read_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
READ3args *args = (READ3args*)rop->request;
|
||||
READ3res *reply = (READ3res*)rop->reply;
|
||||
std::string handle = args->file;
|
||||
auto ino_it = self->parent->inode_by_hash.find(handle);
|
||||
if (ino_it == self->parent->inode_by_hash.end())
|
||||
auto ino_it = self->parent->blockfs->inode_by_hash.find(handle);
|
||||
if (ino_it == self->parent->blockfs->inode_by_hash.end())
|
||||
{
|
||||
*reply = (READ3res){ .status = NFS3ERR_NOENT };
|
||||
rpc_queue_reply(rop);
|
||||
|
@ -367,14 +347,14 @@ static int nfs3_read_proc(void *opaque, rpc_op_t *rop)
|
|||
|
||||
static void nfs_resize_write(nfs_client_t *self, rpc_op_t *rop, uint64_t inode, uint64_t new_size, uint64_t offset, uint64_t count, void *buf);
|
||||
|
||||
static int nfs3_write_proc(void *opaque, rpc_op_t *rop)
|
||||
static int block_nfs3_write_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
WRITE3args *args = (WRITE3args*)rop->request;
|
||||
WRITE3res *reply = (WRITE3res*)rop->reply;
|
||||
std::string handle = args->file;
|
||||
auto ino_it = self->parent->inode_by_hash.find(handle);
|
||||
if (ino_it == self->parent->inode_by_hash.end())
|
||||
auto ino_it = self->parent->blockfs->inode_by_hash.find(handle);
|
||||
if (ino_it == self->parent->blockfs->inode_by_hash.end())
|
||||
{
|
||||
*reply = (WRITE3res){ .status = NFS3ERR_NOENT };
|
||||
rpc_queue_reply(rop);
|
||||
|
@ -480,8 +460,8 @@ static void complete_extend_write(nfs_client_t *self, rpc_op_t *rop, inode_t ino
|
|||
|
||||
static void complete_extend_inode(nfs_client_t *self, uint64_t inode, uint64_t new_size, int err)
|
||||
{
|
||||
auto ext_it = self->extend_writes.lower_bound((extend_size_t){ .inode = inode, .new_size = 0 });
|
||||
while (ext_it != self->extend_writes.end() &&
|
||||
auto ext_it = self->parent->blockfs->extend_writes.lower_bound((extend_size_t){ .inode = inode, .new_size = 0 });
|
||||
while (ext_it != self->parent->blockfs->extend_writes.end() &&
|
||||
ext_it->first.inode == inode &&
|
||||
ext_it->first.new_size <= new_size)
|
||||
{
|
||||
|
@ -490,7 +470,7 @@ static void complete_extend_inode(nfs_client_t *self, uint64_t inode, uint64_t n
|
|||
{
|
||||
complete_extend_write(self, ext_it->second.rop, inode, ext_it->second.write_res < 0
|
||||
? ext_it->second.write_res : ext_it->second.resize_res);
|
||||
self->extend_writes.erase(ext_it++);
|
||||
self->parent->blockfs->extend_writes.erase(ext_it++);
|
||||
}
|
||||
else
|
||||
ext_it++;
|
||||
|
@ -500,7 +480,7 @@ static void complete_extend_inode(nfs_client_t *self, uint64_t inode, uint64_t n
|
|||
static void extend_inode(nfs_client_t *self, uint64_t inode, uint64_t new_size)
|
||||
{
|
||||
// Send an extend request
|
||||
auto & ext = self->extends[inode];
|
||||
auto & ext = self->parent->blockfs->extends[inode];
|
||||
ext.cur_extend = new_size;
|
||||
auto inode_it = self->parent->cli->st_cli.inode_config.find(inode);
|
||||
if (inode_it != self->parent->cli->st_cli.inode_config.end() &&
|
||||
|
@ -514,10 +494,10 @@ static void extend_inode(nfs_client_t *self, uint64_t inode, uint64_t new_size)
|
|||
{ "force_size", true },
|
||||
}), [=](const cli_result_t & r)
|
||||
{
|
||||
auto & ext = self->extends[inode];
|
||||
auto & ext = self->parent->blockfs->extends[inode];
|
||||
if (r.err)
|
||||
{
|
||||
fprintf(stderr, "Error extending inode %ju to %ju bytes: %s\n", inode, new_size, r.text.c_str());
|
||||
fprintf(stderr, "Error extending inode %lu to %lu bytes: %s\n", inode, new_size, r.text.c_str());
|
||||
}
|
||||
if (r.err == EAGAIN || ext.next_extend > ext.cur_extend)
|
||||
{
|
||||
|
@ -548,7 +528,7 @@ static void nfs_do_write(nfs_client_t *self, std::multimap<extend_size_t, extend
|
|||
{
|
||||
auto inode = op->inode;
|
||||
int write_res = op->retval < 0 ? op->retval : (op->retval != op->len ? -ERANGE : 0);
|
||||
if (ewr_it == self->extend_writes.end())
|
||||
if (ewr_it == self->parent->blockfs->extend_writes.end())
|
||||
{
|
||||
complete_extend_write(self, rop, inode, write_res);
|
||||
}
|
||||
|
@ -558,7 +538,7 @@ static void nfs_do_write(nfs_client_t *self, std::multimap<extend_size_t, extend
|
|||
if (ewr_it->second.resize_res <= 0)
|
||||
{
|
||||
complete_extend_write(self, rop, inode, write_res < 0 ? write_res : ewr_it->second.resize_res);
|
||||
self->extend_writes.erase(ewr_it);
|
||||
self->parent->blockfs->extend_writes.erase(ewr_it);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -572,7 +552,7 @@ static void nfs_resize_write(nfs_client_t *self, rpc_op_t *rop, uint64_t inode,
|
|||
if (inode_it != self->parent->cli->st_cli.inode_config.end() &&
|
||||
inode_it->second.size < new_size)
|
||||
{
|
||||
auto ewr_it = self->extend_writes.emplace((extend_size_t){
|
||||
auto ewr_it = self->parent->blockfs->extend_writes.emplace((extend_size_t){
|
||||
.inode = inode,
|
||||
.new_size = new_size,
|
||||
}, (extend_write_t){
|
||||
|
@ -580,7 +560,7 @@ static void nfs_resize_write(nfs_client_t *self, rpc_op_t *rop, uint64_t inode,
|
|||
.resize_res = 1,
|
||||
.write_res = 1,
|
||||
});
|
||||
auto & ext = self->extends[inode];
|
||||
auto & ext = self->parent->blockfs->extends[inode];
|
||||
if (ext.cur_extend > 0)
|
||||
{
|
||||
// Already resizing, just wait
|
||||
|
@ -595,11 +575,11 @@ static void nfs_resize_write(nfs_client_t *self, rpc_op_t *rop, uint64_t inode,
|
|||
}
|
||||
else
|
||||
{
|
||||
nfs_do_write(self, self->extend_writes.end(), rop, inode, offset, count, buf);
|
||||
nfs_do_write(self, self->parent->blockfs->extend_writes.end(), rop, inode, offset, count, buf);
|
||||
}
|
||||
}
|
||||
|
||||
static int nfs3_create_proc(void *opaque, rpc_op_t *rop)
|
||||
static int block_nfs3_create_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
CREATE3args *args = (CREATE3args*)rop->request;
|
||||
|
@ -650,7 +630,7 @@ static int nfs3_create_proc(void *opaque, rpc_op_t *rop)
|
|||
return 1;
|
||||
}
|
||||
|
||||
static int nfs3_mkdir_proc(void *opaque, rpc_op_t *rop)
|
||||
static int block_nfs3_mkdir_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
MKDIR3args *args = (MKDIR3args*)rop->request;
|
||||
|
@ -669,19 +649,19 @@ static int nfs3_mkdir_proc(void *opaque, rpc_op_t *rop)
|
|||
rpc_queue_reply(rop);
|
||||
return 0;
|
||||
}
|
||||
auto dir_id_it = self->parent->dir_info.find(full_name);
|
||||
if (dir_id_it != self->parent->dir_info.end())
|
||||
auto dir_id_it = self->parent->blockfs->dir_info.find(full_name);
|
||||
if (dir_id_it != self->parent->blockfs->dir_info.end())
|
||||
{
|
||||
*reply = (MKDIR3res){ .status = NFS3ERR_EXIST };
|
||||
rpc_queue_reply(rop);
|
||||
return 0;
|
||||
}
|
||||
// FIXME: Persist empty directories in some etcd keys, like /vitastor/dir/...
|
||||
self->parent->dir_info[full_name] = (nfs_dir_t){
|
||||
.id = self->parent->next_dir_id++,
|
||||
self->parent->blockfs->dir_info[full_name] = (nfs_dir_t){
|
||||
.id = self->parent->blockfs->next_dir_id++,
|
||||
.mod_rev = 0,
|
||||
};
|
||||
self->parent->dir_by_hash["S"+base64_encode(sha256(full_name))] = full_name;
|
||||
self->parent->blockfs->dir_by_hash["S"+base64_encode(sha256(full_name))] = full_name;
|
||||
*reply = (MKDIR3res){
|
||||
.status = NFS3_OK,
|
||||
.resok = (MKDIR3resok){
|
||||
|
@ -700,7 +680,7 @@ static int nfs3_mkdir_proc(void *opaque, rpc_op_t *rop)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int nfs3_symlink_proc(void *opaque, rpc_op_t *rop)
|
||||
static int block_nfs3_symlink_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
// nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
// SYMLINK3args *args = (SYMLINK3args*)rop->request;
|
||||
|
@ -711,7 +691,7 @@ static int nfs3_symlink_proc(void *opaque, rpc_op_t *rop)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int nfs3_mknod_proc(void *opaque, rpc_op_t *rop)
|
||||
static int block_nfs3_mknod_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
// nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
// MKNOD3args *args = (MKNOD3args*)rop->request;
|
||||
|
@ -722,7 +702,7 @@ static int nfs3_mknod_proc(void *opaque, rpc_op_t *rop)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int nfs3_remove_proc(void *opaque, rpc_op_t *rop)
|
||||
static int block_nfs3_remove_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
REMOVE3res *reply = (REMOVE3res*)rop->reply;
|
||||
|
@ -752,7 +732,7 @@ static int nfs3_remove_proc(void *opaque, rpc_op_t *rop)
|
|||
return 1;
|
||||
}
|
||||
|
||||
static int nfs3_rmdir_proc(void *opaque, rpc_op_t *rop)
|
||||
static int block_nfs3_rmdir_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
RMDIR3args *args = (RMDIR3args*)rop->request;
|
||||
|
@ -764,8 +744,8 @@ static int nfs3_rmdir_proc(void *opaque, rpc_op_t *rop)
|
|||
rpc_queue_reply(rop);
|
||||
return 0;
|
||||
}
|
||||
auto dir_it = self->parent->dir_info.find(full_name);
|
||||
if (dir_it == self->parent->dir_info.end())
|
||||
auto dir_it = self->parent->blockfs->dir_info.find(full_name);
|
||||
if (dir_it == self->parent->blockfs->dir_info.end())
|
||||
{
|
||||
*reply = (RMDIR3res){ .status = NFS3ERR_NOENT };
|
||||
rpc_queue_reply(rop);
|
||||
|
@ -781,8 +761,8 @@ static int nfs3_rmdir_proc(void *opaque, rpc_op_t *rop)
|
|||
return 0;
|
||||
}
|
||||
}
|
||||
self->parent->dir_by_hash.erase("S"+base64_encode(sha256(full_name)));
|
||||
self->parent->dir_info.erase(dir_it);
|
||||
self->parent->blockfs->dir_by_hash.erase("S"+base64_encode(sha256(full_name)));
|
||||
self->parent->blockfs->dir_info.erase(dir_it);
|
||||
*reply = (RMDIR3res){ .status = NFS3_OK };
|
||||
rpc_queue_reply(rop);
|
||||
return 0;
|
||||
|
@ -811,12 +791,12 @@ static int continue_dir_rename(nfs_dir_rename_state *rename_st)
|
|||
if (!rename_st->items.size())
|
||||
{
|
||||
// old dir
|
||||
auto old_info = self->parent->dir_info.at(rename_st->old_name);
|
||||
self->parent->dir_info.erase(rename_st->old_name);
|
||||
self->parent->dir_by_hash.erase("S"+base64_encode(sha256(rename_st->old_name)));
|
||||
auto old_info = self->parent->blockfs->dir_info.at(rename_st->old_name);
|
||||
self->parent->blockfs->dir_info.erase(rename_st->old_name);
|
||||
self->parent->blockfs->dir_by_hash.erase("S"+base64_encode(sha256(rename_st->old_name)));
|
||||
// new dir
|
||||
self->parent->dir_info[rename_st->new_name] = old_info;
|
||||
self->parent->dir_by_hash["S"+base64_encode(sha256(rename_st->new_name))] = rename_st->new_name;
|
||||
self->parent->blockfs->dir_info[rename_st->new_name] = old_info;
|
||||
self->parent->blockfs->dir_by_hash["S"+base64_encode(sha256(rename_st->new_name))] = rename_st->new_name;
|
||||
RENAME3res *reply = (RENAME3res*)rename_st->rop->reply;
|
||||
*reply = (RENAME3res){
|
||||
.status = NFS3_OK,
|
||||
|
@ -853,7 +833,7 @@ static int continue_dir_rename(nfs_dir_rename_state *rename_st)
|
|||
|
||||
static void nfs_do_rename(nfs_client_t *self, rpc_op_t *rop, std::string old_name, std::string new_name);
|
||||
|
||||
static int nfs3_rename_proc(void *opaque, rpc_op_t *rop)
|
||||
static int block_nfs3_rename_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
RENAME3args *args = (RENAME3args*)rop->request;
|
||||
|
@ -866,8 +846,8 @@ static int nfs3_rename_proc(void *opaque, rpc_op_t *rop)
|
|||
rpc_queue_reply(rop);
|
||||
return 0;
|
||||
}
|
||||
bool old_is_dir = self->parent->dir_info.find(old_name) != self->parent->dir_info.end();
|
||||
bool new_is_dir = self->parent->dir_info.find(new_name) != self->parent->dir_info.end();
|
||||
bool old_is_dir = self->parent->blockfs->dir_info.find(old_name) != self->parent->blockfs->dir_info.end();
|
||||
bool new_is_dir = self->parent->blockfs->dir_info.find(new_name) != self->parent->blockfs->dir_info.end();
|
||||
bool old_is_file = false, new_is_file = false;
|
||||
for (auto & ic: self->parent->cli->st_cli.inode_config)
|
||||
{
|
||||
|
@ -948,7 +928,7 @@ static void nfs_do_rename(nfs_client_t *self, rpc_op_t *rop, std::string old_nam
|
|||
});
|
||||
}
|
||||
|
||||
static int nfs3_link_proc(void *opaque, rpc_op_t *rop)
|
||||
static int block_nfs3_link_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
//nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
//LINK3args *args = (LINK3args*)rop->request;
|
||||
|
@ -962,7 +942,7 @@ static int nfs3_link_proc(void *opaque, rpc_op_t *rop)
|
|||
static void fill_dir_entry(nfs_client_t *self, rpc_op_t *rop,
|
||||
std::map<std::string, nfs_dir_t>::iterator dir_id_it, struct entryplus3 *entry, bool is_plus)
|
||||
{
|
||||
if (dir_id_it == self->parent->dir_info.end())
|
||||
if (dir_id_it == self->parent->blockfs->dir_info.end())
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
@ -980,7 +960,7 @@ static void fill_dir_entry(nfs_client_t *self, rpc_op_t *rop,
|
|||
}
|
||||
}
|
||||
|
||||
static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
|
||||
static void block_nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
READDIRPLUS3args plus_args;
|
||||
|
@ -999,13 +979,13 @@ static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
|
|||
}
|
||||
std::string dirhash = args->dir;
|
||||
std::string dir;
|
||||
if (dirhash != "roothandle")
|
||||
if (dirhash != NFS_ROOT_HANDLE)
|
||||
{
|
||||
auto dir_it = self->parent->dir_by_hash.find(dirhash);
|
||||
if (dir_it != self->parent->dir_by_hash.end())
|
||||
auto dir_it = self->parent->blockfs->dir_by_hash.find(dirhash);
|
||||
if (dir_it != self->parent->blockfs->dir_by_hash.end())
|
||||
dir = dir_it->second;
|
||||
}
|
||||
std::string prefix = dir.size() ? dir+"/" : self->parent->name_prefix;
|
||||
std::string prefix = dir.size() ? dir+"/" : self->parent->blockfs->name_prefix;
|
||||
std::map<std::string, struct entryplus3> entries;
|
||||
for (auto & ic: self->parent->cli->st_cli.inode_config)
|
||||
{
|
||||
|
@ -1043,12 +1023,12 @@ static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
|
|||
}
|
||||
else
|
||||
{
|
||||
// skip directories, they will be added from dir_info
|
||||
// skip directories, they will be added from blockfs->dir_info
|
||||
}
|
||||
}
|
||||
// Add directories from dir_info
|
||||
for (auto dir_id_it = self->parent->dir_info.lower_bound(prefix);
|
||||
dir_id_it != self->parent->dir_info.end(); dir_id_it++)
|
||||
// Add directories from blockfs->dir_info
|
||||
for (auto dir_id_it = self->parent->blockfs->dir_info.lower_bound(prefix);
|
||||
dir_id_it != self->parent->blockfs->dir_info.end(); dir_id_it++)
|
||||
{
|
||||
if (prefix != "" && dir_id_it->first.substr(0, prefix.size()) != prefix)
|
||||
break;
|
||||
|
@ -1061,12 +1041,12 @@ static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
|
|||
}
|
||||
// Add . and ..
|
||||
{
|
||||
auto dir_id_it = self->parent->dir_info.find(dir);
|
||||
auto dir_id_it = self->parent->blockfs->dir_info.find(dir);
|
||||
fill_dir_entry(self, rop, dir_id_it, &entries["."], is_plus);
|
||||
auto sl = dir.rfind("/");
|
||||
if (sl != std::string::npos)
|
||||
{
|
||||
auto dir_id_it = self->parent->dir_info.find(dir.substr(0, sl));
|
||||
auto dir_id_it = self->parent->blockfs->dir_info.find(dir.substr(0, sl));
|
||||
fill_dir_entry(self, rop, dir_id_it, &entries[".."], is_plus);
|
||||
}
|
||||
}
|
||||
|
@ -1147,7 +1127,7 @@ static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
|
|||
{
|
||||
READDIRPLUS3res *reply = (READDIRPLUS3res*)rop->reply;
|
||||
*reply = { .status = NFS3_OK };
|
||||
*(uint64_t*)(reply->resok.cookieverf) = self->parent->dir_info.at(dir).mod_rev;
|
||||
*(uint64_t*)(reply->resok.cookieverf) = self->parent->blockfs->dir_info.at(dir).mod_rev;
|
||||
reply->resok.reply.entries = entries.size() ? &entries.begin()->second : NULL;
|
||||
reply->resok.reply.eof = eof;
|
||||
}
|
||||
|
@ -1155,250 +1135,135 @@ static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
|
|||
{
|
||||
READDIR3res *reply = (READDIR3res*)rop->reply;
|
||||
*reply = { .status = NFS3_OK };
|
||||
*(uint64_t*)(reply->resok.cookieverf) = self->parent->dir_info.at(dir).mod_rev;
|
||||
*(uint64_t*)(reply->resok.cookieverf) = self->parent->blockfs->dir_info.at(dir).mod_rev;
|
||||
reply->resok.reply.entries = entries.size() ? (entry3*)&entries.begin()->second : NULL;
|
||||
reply->resok.reply.eof = eof;
|
||||
}
|
||||
rpc_queue_reply(rop);
|
||||
}
|
||||
|
||||
static int nfs3_readdir_proc(void *opaque, rpc_op_t *rop)
|
||||
static int block_nfs3_readdir_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs3_readdir_common(opaque, rop, false);
|
||||
block_nfs3_readdir_common(opaque, rop, false);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int nfs3_readdirplus_proc(void *opaque, rpc_op_t *rop)
|
||||
static int block_nfs3_readdirplus_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs3_readdir_common(opaque, rop, true);
|
||||
block_nfs3_readdir_common(opaque, rop, true);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Get file system statistics
|
||||
static int nfs3_fsstat_proc(void *opaque, rpc_op_t *rop)
|
||||
void block_fs_state_t::init(nfs_proxy_t *proxy, json11::Json cfg)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
//FSSTAT3args *args = (FSSTAT3args*)rop->request;
|
||||
FSSTAT3res *reply = (FSSTAT3res*)rop->reply;
|
||||
uint64_t tbytes = 0, fbytes = 0;
|
||||
auto pst_it = self->parent->pool_stats.find(self->parent->default_pool_id);
|
||||
if (pst_it != self->parent->pool_stats.end())
|
||||
name_prefix = cfg["subdir"].string_value();
|
||||
{
|
||||
auto ttb = pst_it->second["total_raw_tb"].number_value();
|
||||
auto ftb = (pst_it->second["total_raw_tb"].number_value() - pst_it->second["used_raw_tb"].number_value());
|
||||
tbytes = ttb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)2<<40);
|
||||
fbytes = ftb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)2<<40);
|
||||
int e = name_prefix.size();
|
||||
while (e > 0 && name_prefix[e-1] == '/')
|
||||
e--;
|
||||
int s = 0;
|
||||
while (s < e && name_prefix[s] == '/')
|
||||
s++;
|
||||
name_prefix = name_prefix.substr(s, e-s);
|
||||
if (name_prefix.size())
|
||||
name_prefix += "/";
|
||||
}
|
||||
*reply = (FSSTAT3res){
|
||||
.status = NFS3_OK,
|
||||
.resok = (FSSTAT3resok){
|
||||
.obj_attributes = {
|
||||
.attributes_follow = 1,
|
||||
.attributes = get_dir_attributes(self, ""),
|
||||
},
|
||||
.tbytes = tbytes, // total bytes
|
||||
.fbytes = fbytes, // free bytes
|
||||
.abytes = fbytes, // available bytes
|
||||
.tfiles = (size3)(1 << 31), // maximum total files
|
||||
.ffiles = (size3)(1 << 31), // free files
|
||||
.afiles = (size3)(1 << 31), // available files
|
||||
.invarsec = 0,
|
||||
},
|
||||
// We need inode name hashes for NFS handles to remain stateless and <= 64 bytes long
|
||||
dir_info[""] = (nfs_dir_t){
|
||||
.id = 1,
|
||||
.mod_rev = 0,
|
||||
};
|
||||
rpc_queue_reply(rop);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int nfs3_fsinfo_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
FSINFO3args *args = (FSINFO3args*)rop->request;
|
||||
FSINFO3res *reply = (FSINFO3res*)rop->reply;
|
||||
if (args->fsroot != "roothandle")
|
||||
clock_gettime(CLOCK_REALTIME, &dir_info[""].mtime);
|
||||
assert(proxy->cli->st_cli.on_inode_change_hook == NULL);
|
||||
proxy->cli->st_cli.on_inode_change_hook = [this, proxy](inode_t changed_inode, bool removed)
|
||||
{
|
||||
// Example error
|
||||
*reply = (FSINFO3res){ .status = NFS3ERR_INVAL };
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fill info
|
||||
*reply = (FSINFO3res){
|
||||
.status = NFS3_OK,
|
||||
.resok = (FSINFO3resok){
|
||||
.obj_attributes = {
|
||||
.attributes_follow = 1,
|
||||
.attributes = get_dir_attributes(self, ""),
|
||||
},
|
||||
.rtmax = 128*1024*1024,
|
||||
.rtpref = 128*1024*1024,
|
||||
.rtmult = 4096,
|
||||
.wtmax = 128*1024*1024,
|
||||
.wtpref = 128*1024*1024,
|
||||
.wtmult = 4096,
|
||||
.dtpref = 128,
|
||||
.maxfilesize = 0x7fffffffffffffff,
|
||||
.time_delta = {
|
||||
.seconds = 1,
|
||||
.nseconds = 0,
|
||||
},
|
||||
.properties = FSF3_SYMLINK | FSF3_HOMOGENEOUS,
|
||||
},
|
||||
};
|
||||
}
|
||||
rpc_queue_reply(rop);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int nfs3_pathconf_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
//nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
PATHCONF3args *args = (PATHCONF3args*)rop->request;
|
||||
PATHCONF3res *reply = (PATHCONF3res*)rop->reply;
|
||||
if (args->object != "roothandle")
|
||||
{
|
||||
// Example error
|
||||
*reply = (PATHCONF3res){ .status = NFS3ERR_INVAL };
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fill info
|
||||
bool_t x = FALSE;
|
||||
*reply = (PATHCONF3res){
|
||||
.status = NFS3_OK,
|
||||
.resok = (PATHCONF3resok){
|
||||
.obj_attributes = {
|
||||
// Without at least one reference to a non-constant value (local variable or something else),
|
||||
// with gcc 8 we get "internal compiler error: side-effects element in no-side-effects CONSTRUCTOR" here
|
||||
// FIXME: get rid of this after raising compiler requirement
|
||||
.attributes_follow = x,
|
||||
},
|
||||
.linkmax = 0,
|
||||
.name_max = 255,
|
||||
.no_trunc = TRUE,
|
||||
.chown_restricted = FALSE,
|
||||
.case_insensitive = FALSE,
|
||||
.case_preserving = TRUE,
|
||||
},
|
||||
};
|
||||
}
|
||||
rpc_queue_reply(rop);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int nfs3_commit_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
//COMMIT3args *args = (COMMIT3args*)rop->request;
|
||||
cluster_op_t *op = new cluster_op_t;
|
||||
// fsync. we don't know how to fsync a single inode, so just fsync everything
|
||||
op->opcode = OSD_OP_SYNC;
|
||||
op->callback = [self, rop](cluster_op_t *op)
|
||||
{
|
||||
COMMIT3res *reply = (COMMIT3res*)rop->reply;
|
||||
*reply = (COMMIT3res){ .status = vitastor_nfs_map_err(op->retval) };
|
||||
*(uint64_t*)reply->resok.verf = self->parent->server_id;
|
||||
rpc_queue_reply(rop);
|
||||
auto inode_cfg_it = proxy->cli->st_cli.inode_config.find(changed_inode);
|
||||
if (inode_cfg_it == proxy->cli->st_cli.inode_config.end())
|
||||
{
|
||||
return;
|
||||
}
|
||||
auto & inode_cfg = inode_cfg_it->second;
|
||||
std::string full_name = inode_cfg.name;
|
||||
if (proxy->blockfs->name_prefix != "" && full_name.substr(0, proxy->blockfs->name_prefix.size()) != proxy->blockfs->name_prefix)
|
||||
{
|
||||
return;
|
||||
}
|
||||
// Calculate directory modification time and revision (used as "cookie verifier")
|
||||
timespec now;
|
||||
clock_gettime(CLOCK_REALTIME, &now);
|
||||
dir_info[""].mod_rev = dir_info[""].mod_rev < inode_cfg.mod_revision ? inode_cfg.mod_revision : dir_info[""].mod_rev;
|
||||
dir_info[""].mtime = now;
|
||||
int pos = full_name.find('/', proxy->blockfs->name_prefix.size());
|
||||
while (pos >= 0)
|
||||
{
|
||||
std::string dir = full_name.substr(0, pos);
|
||||
auto & dinf = dir_info[dir];
|
||||
if (!dinf.id)
|
||||
dinf.id = next_dir_id++;
|
||||
dinf.mod_rev = dinf.mod_rev < inode_cfg.mod_revision ? inode_cfg.mod_revision : dinf.mod_rev;
|
||||
dinf.mtime = now;
|
||||
dir_by_hash["S"+base64_encode(sha256(dir))] = dir;
|
||||
pos = full_name.find('/', pos+1);
|
||||
}
|
||||
// Alter inode_by_hash
|
||||
if (removed)
|
||||
{
|
||||
auto ino_it = hash_by_inode.find(changed_inode);
|
||||
if (ino_it != hash_by_inode.end())
|
||||
{
|
||||
inode_by_hash.erase(ino_it->second);
|
||||
hash_by_inode.erase(ino_it);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::string hash = "S"+base64_encode(sha256(full_name));
|
||||
auto hbi_it = hash_by_inode.find(changed_inode);
|
||||
if (hbi_it != hash_by_inode.end() && hbi_it->second != hash)
|
||||
{
|
||||
// inode had a different name, remove old hash=>inode pointer
|
||||
inode_by_hash.erase(hbi_it->second);
|
||||
}
|
||||
inode_by_hash[hash] = changed_inode;
|
||||
hash_by_inode[changed_inode] = hash;
|
||||
}
|
||||
};
|
||||
self->parent->cli->execute(op);
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int mount3_mnt_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
//nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
//nfs_dirpath *args = (nfs_dirpath*)rop->request;
|
||||
nfs_mountres3 *reply = (nfs_mountres3*)rop->reply;
|
||||
u_int flavor = RPC_AUTH_NONE;
|
||||
reply->fhs_status = MNT3_OK;
|
||||
reply->mountinfo.fhandle = xdr_copy_string(rop->xdrs, "roothandle");
|
||||
reply->mountinfo.auth_flavors.auth_flavors_len = 1;
|
||||
reply->mountinfo.auth_flavors.auth_flavors_val = (u_int*)xdr_copy_string(rop->xdrs, (char*)&flavor, sizeof(u_int)).data;
|
||||
rpc_queue_reply(rop);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int mount3_dump_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
nfs_mountlist *reply = (nfs_mountlist*)rop->reply;
|
||||
*reply = (struct nfs_mountbody*)malloc_or_die(sizeof(struct nfs_mountbody));
|
||||
xdr_add_malloc(rop->xdrs, *reply);
|
||||
(*reply)->ml_hostname = xdr_copy_string(rop->xdrs, "127.0.0.1");
|
||||
(*reply)->ml_directory = xdr_copy_string(rop->xdrs, self->parent->export_root);
|
||||
(*reply)->ml_next = NULL;
|
||||
rpc_queue_reply(rop);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// MOUNT3 UMNT: unmount notification. The proxy keeps no per-mount state,
// so there is nothing to tear down - just acknowledge the call.
static int mount3_umnt_proc(void *opaque, rpc_op_t *rop)
{
    //nfs_client_t *self = (nfs_client_t*)opaque;
    //nfs_dirpath *arg = (nfs_dirpath*)rop->request;
    // do nothing
    rpc_queue_reply(rop);
    return 0;
}
|
||||
|
||||
// MOUNT3 UMNTALL: unmount-all notification. Stateless like UMNT -
// acknowledge and do nothing.
static int mount3_umntall_proc(void *opaque, rpc_op_t *rop)
{
    // do nothing
    rpc_queue_reply(rop);
    return 0;
}
|
||||
|
||||
static int mount3_export_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
nfs_exports *reply = (nfs_exports*)rop->reply;
|
||||
*reply = (struct nfs_exportnode*)calloc_or_die(1, sizeof(struct nfs_exportnode) + sizeof(struct nfs_groupnode));
|
||||
xdr_add_malloc(rop->xdrs, *reply);
|
||||
(*reply)->ex_dir = xdr_copy_string(rop->xdrs, self->parent->export_root);
|
||||
(*reply)->ex_groups = (struct nfs_groupnode*)(reply+1);
|
||||
(*reply)->ex_groups->gr_name = xdr_copy_string(rop->xdrs, "127.0.0.1");
|
||||
(*reply)->ex_groups->gr_next = NULL;
|
||||
(*reply)->ex_next = NULL;
|
||||
rpc_queue_reply(rop);
|
||||
return 0;
|
||||
}
|
||||
|
||||
nfs_client_t::nfs_client_t()
|
||||
void nfs_block_procs(nfs_client_t *self)
|
||||
{
|
||||
struct rpc_service_proc_t pt[] = {
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_NULL, nfs3_null_proc, NULL, 0, NULL, 0, this},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_GETATTR, nfs3_getattr_proc, (xdrproc_t)xdr_GETATTR3args, sizeof(GETATTR3args), (xdrproc_t)xdr_GETATTR3res, sizeof(GETATTR3res), this},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_SETATTR, nfs3_setattr_proc, (xdrproc_t)xdr_SETATTR3args, sizeof(SETATTR3args), (xdrproc_t)xdr_SETATTR3res, sizeof(SETATTR3res), this},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_LOOKUP, nfs3_lookup_proc, (xdrproc_t)xdr_LOOKUP3args, sizeof(LOOKUP3args), (xdrproc_t)xdr_LOOKUP3res, sizeof(LOOKUP3res), this},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_ACCESS, nfs3_access_proc, (xdrproc_t)xdr_ACCESS3args, sizeof(ACCESS3args), (xdrproc_t)xdr_ACCESS3res, sizeof(ACCESS3res), this},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_READLINK, nfs3_readlink_proc, (xdrproc_t)xdr_READLINK3args, sizeof(READLINK3args), (xdrproc_t)xdr_READLINK3res, sizeof(READLINK3res), this},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_READ, nfs3_read_proc, (xdrproc_t)xdr_READ3args, sizeof(READ3args), (xdrproc_t)xdr_READ3res, sizeof(READ3res), this},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_WRITE, nfs3_write_proc, (xdrproc_t)xdr_WRITE3args, sizeof(WRITE3args), (xdrproc_t)xdr_WRITE3res, sizeof(WRITE3res), this},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_CREATE, nfs3_create_proc, (xdrproc_t)xdr_CREATE3args, sizeof(CREATE3args), (xdrproc_t)xdr_CREATE3res, sizeof(CREATE3res), this},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_MKDIR, nfs3_mkdir_proc, (xdrproc_t)xdr_MKDIR3args, sizeof(MKDIR3args), (xdrproc_t)xdr_MKDIR3res, sizeof(MKDIR3res), this},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_SYMLINK, nfs3_symlink_proc, (xdrproc_t)xdr_SYMLINK3args, sizeof(SYMLINK3args), (xdrproc_t)xdr_SYMLINK3res, sizeof(SYMLINK3res), this},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_MKNOD, nfs3_mknod_proc, (xdrproc_t)xdr_MKNOD3args, sizeof(MKNOD3args), (xdrproc_t)xdr_MKNOD3res, sizeof(MKNOD3res), this},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_REMOVE, nfs3_remove_proc, (xdrproc_t)xdr_REMOVE3args, sizeof(REMOVE3args), (xdrproc_t)xdr_REMOVE3res, sizeof(REMOVE3res), this},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_RMDIR, nfs3_rmdir_proc, (xdrproc_t)xdr_RMDIR3args, sizeof(RMDIR3args), (xdrproc_t)xdr_RMDIR3res, sizeof(RMDIR3res), this},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_RENAME, nfs3_rename_proc, (xdrproc_t)xdr_RENAME3args, sizeof(RENAME3args), (xdrproc_t)xdr_RENAME3res, sizeof(RENAME3res), this},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_LINK, nfs3_link_proc, (xdrproc_t)xdr_LINK3args, sizeof(LINK3args), (xdrproc_t)xdr_LINK3res, sizeof(LINK3res), this},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_READDIR, nfs3_readdir_proc, (xdrproc_t)xdr_READDIR3args, sizeof(READDIR3args), (xdrproc_t)xdr_READDIR3res, sizeof(READDIR3res), this},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_READDIRPLUS, nfs3_readdirplus_proc, (xdrproc_t)xdr_READDIRPLUS3args, sizeof(READDIRPLUS3args), (xdrproc_t)xdr_READDIRPLUS3res, sizeof(READDIRPLUS3res), this},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_FSSTAT, nfs3_fsstat_proc, (xdrproc_t)xdr_FSSTAT3args, sizeof(FSSTAT3args), (xdrproc_t)xdr_FSSTAT3res, sizeof(FSSTAT3res), this},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_FSINFO, nfs3_fsinfo_proc, (xdrproc_t)xdr_FSINFO3args, sizeof(FSINFO3args), (xdrproc_t)xdr_FSINFO3res, sizeof(FSINFO3res), this},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_PATHCONF, nfs3_pathconf_proc, (xdrproc_t)xdr_PATHCONF3args, sizeof(PATHCONF3args), (xdrproc_t)xdr_PATHCONF3res, sizeof(PATHCONF3res), this},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_COMMIT, nfs3_commit_proc, (xdrproc_t)xdr_COMMIT3args, sizeof(COMMIT3args), (xdrproc_t)xdr_COMMIT3res, sizeof(COMMIT3res), this},
|
||||
{MOUNT_PROGRAM, MOUNT_V3, MOUNT3_NULL, nfs3_null_proc, NULL, 0, NULL, 0, this},
|
||||
{MOUNT_PROGRAM, MOUNT_V3, MOUNT3_MNT, mount3_mnt_proc, (xdrproc_t)xdr_nfs_dirpath, sizeof(nfs_dirpath), (xdrproc_t)xdr_nfs_mountres3, sizeof(nfs_mountres3), this},
|
||||
{MOUNT_PROGRAM, MOUNT_V3, MOUNT3_DUMP, mount3_dump_proc, NULL, 0, (xdrproc_t)xdr_nfs_mountlist, sizeof(nfs_mountlist), this},
|
||||
{MOUNT_PROGRAM, MOUNT_V3, MOUNT3_UMNT, mount3_umnt_proc, (xdrproc_t)xdr_nfs_dirpath, sizeof(nfs_dirpath), NULL, 0, this},
|
||||
{MOUNT_PROGRAM, MOUNT_V3, MOUNT3_UMNTALL, mount3_umntall_proc, NULL, 0, NULL, 0, this},
|
||||
{MOUNT_PROGRAM, MOUNT_V3, MOUNT3_EXPORT, mount3_export_proc, NULL, 0, (xdrproc_t)xdr_nfs_exports, sizeof(nfs_exports), this},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_NULL, nfs3_null_proc, NULL, 0, NULL, 0, self},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_GETATTR, block_nfs3_getattr_proc, (xdrproc_t)xdr_GETATTR3args, sizeof(GETATTR3args), (xdrproc_t)xdr_GETATTR3res, sizeof(GETATTR3res), self},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_SETATTR, block_nfs3_setattr_proc, (xdrproc_t)xdr_SETATTR3args, sizeof(SETATTR3args), (xdrproc_t)xdr_SETATTR3res, sizeof(SETATTR3res), self},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_LOOKUP, block_nfs3_lookup_proc, (xdrproc_t)xdr_LOOKUP3args, sizeof(LOOKUP3args), (xdrproc_t)xdr_LOOKUP3res, sizeof(LOOKUP3res), self},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_ACCESS, block_nfs3_access_proc, (xdrproc_t)xdr_ACCESS3args, sizeof(ACCESS3args), (xdrproc_t)xdr_ACCESS3res, sizeof(ACCESS3res), self},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_READLINK, block_nfs3_readlink_proc, (xdrproc_t)xdr_READLINK3args, sizeof(READLINK3args), (xdrproc_t)xdr_READLINK3res, sizeof(READLINK3res), self},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_READ, block_nfs3_read_proc, (xdrproc_t)xdr_READ3args, sizeof(READ3args), (xdrproc_t)xdr_READ3res, sizeof(READ3res), self},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_WRITE, block_nfs3_write_proc, (xdrproc_t)xdr_WRITE3args, sizeof(WRITE3args), (xdrproc_t)xdr_WRITE3res, sizeof(WRITE3res), self},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_CREATE, block_nfs3_create_proc, (xdrproc_t)xdr_CREATE3args, sizeof(CREATE3args), (xdrproc_t)xdr_CREATE3res, sizeof(CREATE3res), self},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_MKDIR, block_nfs3_mkdir_proc, (xdrproc_t)xdr_MKDIR3args, sizeof(MKDIR3args), (xdrproc_t)xdr_MKDIR3res, sizeof(MKDIR3res), self},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_SYMLINK, block_nfs3_symlink_proc, (xdrproc_t)xdr_SYMLINK3args, sizeof(SYMLINK3args), (xdrproc_t)xdr_SYMLINK3res, sizeof(SYMLINK3res), self},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_MKNOD, block_nfs3_mknod_proc, (xdrproc_t)xdr_MKNOD3args, sizeof(MKNOD3args), (xdrproc_t)xdr_MKNOD3res, sizeof(MKNOD3res), self},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_REMOVE, block_nfs3_remove_proc, (xdrproc_t)xdr_REMOVE3args, sizeof(REMOVE3args), (xdrproc_t)xdr_REMOVE3res, sizeof(REMOVE3res), self},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_RMDIR, block_nfs3_rmdir_proc, (xdrproc_t)xdr_RMDIR3args, sizeof(RMDIR3args), (xdrproc_t)xdr_RMDIR3res, sizeof(RMDIR3res), self},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_RENAME, block_nfs3_rename_proc, (xdrproc_t)xdr_RENAME3args, sizeof(RENAME3args), (xdrproc_t)xdr_RENAME3res, sizeof(RENAME3res), self},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_LINK, block_nfs3_link_proc, (xdrproc_t)xdr_LINK3args, sizeof(LINK3args), (xdrproc_t)xdr_LINK3res, sizeof(LINK3res), self},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_READDIR, block_nfs3_readdir_proc, (xdrproc_t)xdr_READDIR3args, sizeof(READDIR3args), (xdrproc_t)xdr_READDIR3res, sizeof(READDIR3res), self},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_READDIRPLUS, block_nfs3_readdirplus_proc, (xdrproc_t)xdr_READDIRPLUS3args, sizeof(READDIRPLUS3args), (xdrproc_t)xdr_READDIRPLUS3res, sizeof(READDIRPLUS3res), self},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_FSSTAT, nfs3_fsstat_proc, (xdrproc_t)xdr_FSSTAT3args, sizeof(FSSTAT3args), (xdrproc_t)xdr_FSSTAT3res, sizeof(FSSTAT3res), self},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_FSINFO, nfs3_fsinfo_proc, (xdrproc_t)xdr_FSINFO3args, sizeof(FSINFO3args), (xdrproc_t)xdr_FSINFO3res, sizeof(FSINFO3res), self},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_PATHCONF, nfs3_pathconf_proc, (xdrproc_t)xdr_PATHCONF3args, sizeof(PATHCONF3args), (xdrproc_t)xdr_PATHCONF3res, sizeof(PATHCONF3res), self},
|
||||
{NFS_PROGRAM, NFS_V3, NFS3_COMMIT, nfs3_commit_proc, (xdrproc_t)xdr_COMMIT3args, sizeof(COMMIT3args), (xdrproc_t)xdr_COMMIT3res, sizeof(COMMIT3res), self},
|
||||
{MOUNT_PROGRAM, MOUNT_V3, MOUNT3_NULL, nfs3_null_proc, NULL, 0, NULL, 0, self},
|
||||
{MOUNT_PROGRAM, MOUNT_V3, MOUNT3_MNT, mount3_mnt_proc, (xdrproc_t)xdr_nfs_dirpath, sizeof(nfs_dirpath), (xdrproc_t)xdr_nfs_mountres3, sizeof(nfs_mountres3), self},
|
||||
{MOUNT_PROGRAM, MOUNT_V3, MOUNT3_DUMP, mount3_dump_proc, NULL, 0, (xdrproc_t)xdr_nfs_mountlist, sizeof(nfs_mountlist), self},
|
||||
{MOUNT_PROGRAM, MOUNT_V3, MOUNT3_UMNT, mount3_umnt_proc, (xdrproc_t)xdr_nfs_dirpath, sizeof(nfs_dirpath), NULL, 0, self},
|
||||
{MOUNT_PROGRAM, MOUNT_V3, MOUNT3_UMNTALL, mount3_umntall_proc, NULL, 0, NULL, 0, self},
|
||||
{MOUNT_PROGRAM, MOUNT_V3, MOUNT3_EXPORT, mount3_export_proc, NULL, 0, (xdrproc_t)xdr_nfs_exports, sizeof(nfs_exports), self},
|
||||
};
|
||||
for (int i = 0; i < sizeof(pt)/sizeof(pt[0]); i++)
|
||||
{
|
||||
proc_table.insert(pt[i]);
|
||||
self->proc_table.insert(pt[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Destructor: no owned resources are released here.
// NOTE(review): presumably per-request XDR allocations are freed by the RPC
// layer - confirm nothing client-scoped needs cleanup.
nfs_client_t::~nfs_client_t()
{
}
|
|
@ -0,0 +1,59 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// NFS proxy over Vitastor block images - header
|
||||
|
||||
#pragma once
|
||||
|
||||
// Cached metadata of a synthesised directory (used for readdir cookie
// verification and directory attributes).
struct nfs_dir_t
{
    uint64_t id;      // locally generated directory ID
    uint64_t mod_rev; // highest known modification revision under this dir
    timespec mtime;   // last observed modification time
};
|
||||
|
||||
// Key type for queued inode-extend requests: identifies one
// "grow <inode> to <new_size>" operation.
struct extend_size_t
{
    inode_t inode;     // inode being resized
    uint64_t new_size; // requested new size in bytes
};
|
||||
|
||||
inline bool operator < (const extend_size_t &a, const extend_size_t &b)
|
||||
{
|
||||
return a.inode < b.inode || a.inode == b.inode && a.new_size < b.new_size;
|
||||
}
|
||||
|
||||
// A pending WRITE that waits for an inode resize to complete first.
struct extend_write_t
{
    rpc_op_t *rop; // the original RPC operation to reply to
    int resize_res, write_res; // 1 = started, 0 = completed OK, -errno = completed with error
};
|
||||
|
||||
// Per-inode resize state: the size currently being applied and the next
// queued target size (0 = none pending).
struct extend_inode_t
{
    uint64_t cur_extend = 0, next_extend = 0;
};
|
||||
|
||||
// State of the "block image" pseudo-filesystem: every Vitastor image becomes
// a file; directories are synthesised from '/'-separated image names.
struct block_fs_state_t
{
    // Only images whose names start with this prefix are exported
    std::string name_prefix;

    // filehandle = "S"+base64(sha256(full name with prefix)) or "roothandle" for mount root)
    uint64_t next_dir_id = 2;
    // filehandle => dir with name_prefix
    std::map<std::string, std::string> dir_by_hash;
    // dir with name_prefix => dir info
    std::map<std::string, nfs_dir_t> dir_info;
    // filehandle => inode ID
    std::map<std::string, inode_t> inode_by_hash;
    // inode ID => filehandle
    std::map<inode_t, std::string> hash_by_inode;

    // inode extend requests in progress
    std::map<inode_t, extend_inode_t> extends;
    std::multimap<extend_size_t, extend_write_t> extend_writes;

    void init(nfs_proxy_t *proxy, json11::Json cfg);
};
|
||||
|
||||
nfsstat3 vitastor_nfs_map_err(int err);
|
|
@ -0,0 +1,22 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// NFS proxy - common functions
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "nfs/nfs.h"
|
||||
|
||||
void nfs_block_procs(nfs_client_t *self);
|
||||
void nfs_kv_procs(nfs_client_t *self);
|
||||
int nfs3_fsstat_proc(void *opaque, rpc_op_t *rop);
|
||||
int nfs3_fsinfo_proc(void *opaque, rpc_op_t *rop);
|
||||
int nfs3_pathconf_proc(void *opaque, rpc_op_t *rop);
|
||||
int nfs3_access_proc(void *opaque, rpc_op_t *rop);
|
||||
int nfs3_null_proc(void *opaque, rpc_op_t *rop);
|
||||
int nfs3_commit_proc(void *opaque, rpc_op_t *rop);
|
||||
int mount3_mnt_proc(void *opaque, rpc_op_t *rop);
|
||||
int mount3_dump_proc(void *opaque, rpc_op_t *rop);
|
||||
int mount3_umnt_proc(void *opaque, rpc_op_t *rop);
|
||||
int mount3_umntall_proc(void *opaque, rpc_op_t *rop);
|
||||
int mount3_export_proc(void *opaque, rpc_op_t *rop);
|
|
@ -0,0 +1,124 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// NFS proxy - common FSSTAT, FSINFO, PATHCONF
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
#include "nfs_proxy.h"
|
||||
#include "nfs_kv.h"
|
||||
|
||||
// Get file system statistics
|
||||
int nfs3_fsstat_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
//FSSTAT3args *args = (FSSTAT3args*)rop->request;
|
||||
if (self->parent->trace)
|
||||
fprintf(stderr, "[%d] FSSTAT\n", self->nfs_fd);
|
||||
FSSTAT3res *reply = (FSSTAT3res*)rop->reply;
|
||||
uint64_t tbytes = 0, fbytes = 0;
|
||||
auto pst_it = self->parent->pool_stats.find(self->parent->default_pool_id);
|
||||
if (pst_it != self->parent->pool_stats.end())
|
||||
{
|
||||
auto ttb = pst_it->second["total_raw_tb"].number_value();
|
||||
auto ftb = (pst_it->second["total_raw_tb"].number_value() - pst_it->second["used_raw_tb"].number_value());
|
||||
tbytes = ttb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)2<<40);
|
||||
fbytes = ftb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)2<<40);
|
||||
}
|
||||
*reply = (FSSTAT3res){
|
||||
.status = NFS3_OK,
|
||||
.resok = (FSSTAT3resok){
|
||||
.obj_attributes = {
|
||||
.attributes_follow = 0,
|
||||
//.attributes = get_root_attributes(self),
|
||||
},
|
||||
.tbytes = tbytes, // total bytes
|
||||
.fbytes = fbytes, // free bytes
|
||||
.abytes = fbytes, // available bytes
|
||||
.tfiles = (size3)1 << (63-POOL_ID_BITS), // maximum total files
|
||||
.ffiles = (size3)1 << (63-POOL_ID_BITS), // free files
|
||||
.afiles = (size3)1 << (63-POOL_ID_BITS), // available files
|
||||
.invarsec = 0,
|
||||
},
|
||||
};
|
||||
rpc_queue_reply(rop);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int nfs3_fsinfo_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
FSINFO3args *args = (FSINFO3args*)rop->request;
|
||||
FSINFO3res *reply = (FSINFO3res*)rop->reply;
|
||||
if (self->parent->trace)
|
||||
fprintf(stderr, "[%d] FSINFO %s\n", self->nfs_fd, std::string(args->fsroot).c_str());
|
||||
if (args->fsroot != NFS_ROOT_HANDLE)
|
||||
{
|
||||
*reply = (FSINFO3res){ .status = NFS3ERR_INVAL };
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fill info
|
||||
*reply = (FSINFO3res){
|
||||
.status = NFS3_OK,
|
||||
.resok = (FSINFO3resok){
|
||||
.obj_attributes = {
|
||||
.attributes_follow = 0,
|
||||
//.attributes = get_root_attributes(self),
|
||||
},
|
||||
.rtmax = 128*1024*1024,
|
||||
.rtpref = 128*1024*1024,
|
||||
.rtmult = 4096,
|
||||
.wtmax = 128*1024*1024,
|
||||
.wtpref = 128*1024*1024,
|
||||
.wtmult = 4096,
|
||||
.dtpref = 128,
|
||||
.maxfilesize = 0x7fffffffffffffff,
|
||||
.time_delta = {
|
||||
.seconds = 1,
|
||||
.nseconds = 0,
|
||||
},
|
||||
.properties = FSF3_SYMLINK | FSF3_HOMOGENEOUS,
|
||||
},
|
||||
};
|
||||
}
|
||||
rpc_queue_reply(rop);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int nfs3_pathconf_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
PATHCONF3args *args = (PATHCONF3args*)rop->request;
|
||||
PATHCONF3res *reply = (PATHCONF3res*)rop->reply;
|
||||
if (self->parent->trace)
|
||||
fprintf(stderr, "[%d] PATHCONF %s\n", self->nfs_fd, std::string(args->object).c_str());
|
||||
if (args->object != NFS_ROOT_HANDLE)
|
||||
{
|
||||
*reply = (PATHCONF3res){ .status = NFS3ERR_INVAL };
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fill info
|
||||
*reply = (PATHCONF3res){
|
||||
.status = NFS3_OK,
|
||||
.resok = (PATHCONF3resok){
|
||||
.obj_attributes = {
|
||||
// Without at least one reference to a non-constant value (local variable or something else),
|
||||
// with gcc 8 we get "internal compiler error: side-effects element in no-side-effects CONSTRUCTOR" here
|
||||
// FIXME: get rid of this after raising compiler requirement
|
||||
.attributes_follow = 0,
|
||||
//.attributes = get_root_attributes(self),
|
||||
},
|
||||
.linkmax = 0,
|
||||
.name_max = 255,
|
||||
.no_trunc = TRUE,
|
||||
.chown_restricted = FALSE,
|
||||
.case_insensitive = FALSE,
|
||||
.case_preserving = TRUE,
|
||||
},
|
||||
};
|
||||
}
|
||||
rpc_queue_reply(rop);
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,332 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// NFS proxy over VitastorKV database - common functions
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
#include "str_util.h"
|
||||
#include "nfs_proxy.h"
|
||||
#include "nfs_common.h"
|
||||
#include "nfs_kv.h"
|
||||
|
||||
// Parse "<seconds>[.<fraction>]" into an nfstime3.
// The fractional part is scaled to nanoseconds regardless of how many digits
// were given ("1.5" => 500000000 ns; digits beyond 9 are truncated).
nfstime3 nfstime_from_str(const std::string & s)
{
    // BUGFIX: value-initialize - nseconds was left uninitialized when the
    // string had no fractional part
    nfstime3 t = {};
    auto p = s.find(".");
    if (p != std::string::npos)
    {
        t.seconds = stoull_full(s.substr(0, p), 10);
        t.nseconds = stoull_full(s.substr(p+1), 10);
        // Scale the fraction to exactly 9 digits (nanoseconds)
        p = s.size()-p-1;
        for (; p < 9; p++)
            t.nseconds *= 10;
        for (; p > 9; p--)
            t.nseconds /= 10;
    }
    else
        t.seconds = stoull_full(s, 10);
    return t;
}
|
||||
|
||||
// Format a timespec as "<sec>.<nsec>" with trailing zeros (and a trailing
// dot) stripped, e.g. {1, 500000000} => "1.5", {0, 0} => "0".
static std::string timespec_to_str(timespec t)
{
    char buf[64];
    // BUGFIX(portability): %ju expects uintmax_t; passing time_t/long
    // directly is formally undefined behavior - cast explicitly
    snprintf(buf, sizeof(buf), "%ju.%09ju", (uintmax_t)t.tv_sec, (uintmax_t)t.tv_nsec);
    int l = strlen(buf);
    // Strip trailing zeros of the fraction, then the dot itself if nothing remains
    while (l > 0 && buf[l-1] == '0')
        l--;
    if (l > 0 && buf[l-1] == '.')
        l--;
    buf[l] = 0;
    return buf;
}
|
||||
|
||||
std::string nfstime_to_str(nfstime3 t)
|
||||
{
|
||||
return timespec_to_str((timespec){ .tv_sec = t.seconds, .tv_nsec = t.nseconds });
|
||||
}
|
||||
|
||||
std::string nfstime_now_str()
|
||||
{
|
||||
timespec t;
|
||||
clock_gettime(CLOCK_REALTIME, &t);
|
||||
return timespec_to_str(t);
|
||||
}
|
||||
|
||||
int kv_map_type(const std::string & type)
|
||||
{
|
||||
return (type == "" || type == "file" ? NF3REG :
|
||||
(type == "dir" ? NF3DIR :
|
||||
(type == "blk" ? NF3BLK :
|
||||
(type == "chr" ? NF3CHR :
|
||||
(type == "link" ? NF3LNK :
|
||||
(type == "sock" ? NF3SOCK :
|
||||
(type == "fifo" ? NF3FIFO : -1)))))));
|
||||
}
|
||||
|
||||
// Build an NFSv3 fattr3 from the JSON inode attributes stored in the KV
// index. Missing fields get defaults: regular file, mode 0644 (0755 for
// dirs), nlink 1, atime/ctime falling back to mtime.
fattr3 get_kv_attributes(nfs_client_t *self, uint64_t ino, json11::Json attrs)
{
    auto type = kv_map_type(attrs["type"].string_value());
    auto mode = attrs["mode"].uint64_value();
    auto nlink = attrs["nlink"].uint64_value();
    nfstime3 mtime = nfstime_from_str(attrs["mtime"].string_value());
    nfstime3 atime = attrs["atime"].is_null() ? mtime : nfstime_from_str(attrs["atime"].string_value());
    nfstime3 ctime = attrs["ctime"].is_null() ? mtime : nfstime_from_str(attrs["ctime"].string_value());
    // In theory we could store the binary structure itself, but JSON is simpler :-)
    return (fattr3){
        // defensive: kv_map_type maps "" to NF3REG already, so 0 should not occur
        .type = (type == 0 ? NF3REG : (ftype3)type),
        .mode = (attrs["mode"].is_null() ? (type == NF3DIR ? 0755 : 0644) : (uint32_t)mode),
        .nlink = (nlink == 0 ? 1 : (uint32_t)nlink),
        .uid = (uint32_t)attrs["uid"].uint64_value(),
        .gid = (uint32_t)attrs["gid"].uint64_value(),
        .size = (type == NF3DIR ? 4096 : attrs["size"].uint64_value()),
        // FIXME Counting actual used file size would require reworking statistics
        .used = (type == NF3DIR ? 4096 : attrs["size"].uint64_value()),
        // device numbers only make sense for block/char special files
        .rdev = (type == NF3BLK || type == NF3CHR
            ? (specdata3){ (uint32_t)attrs["major"].uint64_value(), (uint32_t)attrs["minor"].uint64_value() }
            : (specdata3){}),
        .fsid = self->parent->fsid,
        .fileid = ino,
        .atime = atime,
        .mtime = mtime,
        .ctime = ctime,
    };
}
|
||||
|
||||
// Build the KV key of a directory entry:
//   'd' <length char> <hex dir_ino> '/' <filename>
// The length char ('1'..'9', then 'A'..) encodes the number of hex digits in
// dir_ino so that keys sort correctly by inode number.
std::string kv_direntry_key(uint64_t dir_ino, const std::string & filename)
{
    // encode as: d <length> <hex dir_ino> / <filename>
    char key[24] = { 0 };
    // BUGFIX(portability): %jx expects uintmax_t - cast explicitly
    snprintf(key, sizeof(key), "d-%jx/", (uintmax_t)dir_ino);
    int n = strnlen(key, sizeof(key)-1) - 3; // number of hex digits in dir_ino
    if (n < 10)
        key[1] = '0'+n;
    else
        key[1] = 'A'+(n-10);
    // char* + std::string concatenates into a new std::string
    return (char*)key + filename;
}
|
||||
|
||||
// Extract the filename part of a direntry key - everything after the first
// '/' (keys are "d<len><hex ino>/<filename>"). Keys without a '/' are
// returned unchanged.
std::string kv_direntry_filename(const std::string & key)
{
    size_t slash = key.find("/");
    return slash == std::string::npos ? key : key.substr(slash+1);
}
|
||||
|
||||
// Build the KV key of an inode:
//   'i' <hex pool id> <length char> <hex inode-without-pool>
// where the length char is 'G' + number of hex digits of the inode part,
// keeping keys ordered by inode number within a pool.
std::string kv_inode_key(uint64_t ino)
{
    char key[32] = { 0 };
    snprintf(key, sizeof(key), "i%x", INODE_POOL(ino));
    int n = strnlen(key, sizeof(key)-1);
    // Write the inode part one byte further, then backfill key[n] with the
    // length marker so the final string is contiguous
    snprintf(key+n+1, sizeof(key)-n-1, "%jx", INODE_NO_POOL(ino));
    int m = strnlen(key+n+1, sizeof(key)-n-2);
    key[n] = 'G'+m;
    return std::string(key);
}
|
||||
|
||||
// Build the NFS filehandle of an inode: "S" + hex inode number.
std::string kv_fh(uint64_t ino)
{
    char key[32] = { 0 };
    // BUGFIX(portability): %jx expects uintmax_t - cast explicitly
    snprintf(key, sizeof(key), "S%jx", (uintmax_t)ino);
    return key;
}
|
||||
|
||||
uint64_t kv_fh_inode(const std::string & fh)
|
||||
{
|
||||
if (fh == NFS_ROOT_HANDLE)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
else if (fh[0] == 'S')
|
||||
{
|
||||
uint64_t ino = 0;
|
||||
int r = sscanf(fh.c_str()+1, "%jx", &ino);
|
||||
if (r == 1)
|
||||
return ino;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool kv_fh_valid(const std::string & fh)
|
||||
{
|
||||
return fh == NFS_ROOT_HANDLE || fh[0] == 'S';
|
||||
}
|
||||
|
||||
// Register all NFSv3 + MOUNTv3 handlers of the KV (real file system) backend
// into the client's procedure table. FSSTAT/FSINFO/PATHCONF/ACCESS/COMMIT
// and the whole MOUNT program use the generic handlers shared with the block
// backend; everything else dispatches to the kv_nfs3_* implementations.
void nfs_kv_procs(nfs_client_t *self)
{
    struct rpc_service_proc_t pt[] = {
        {NFS_PROGRAM, NFS_V3, NFS3_NULL, nfs3_null_proc, NULL, 0, NULL, 0, self},
        {NFS_PROGRAM, NFS_V3, NFS3_GETATTR, kv_nfs3_getattr_proc, (xdrproc_t)xdr_GETATTR3args, sizeof(GETATTR3args), (xdrproc_t)xdr_GETATTR3res, sizeof(GETATTR3res), self},
        {NFS_PROGRAM, NFS_V3, NFS3_SETATTR, kv_nfs3_setattr_proc, (xdrproc_t)xdr_SETATTR3args, sizeof(SETATTR3args), (xdrproc_t)xdr_SETATTR3res, sizeof(SETATTR3res), self},
        {NFS_PROGRAM, NFS_V3, NFS3_LOOKUP, kv_nfs3_lookup_proc, (xdrproc_t)xdr_LOOKUP3args, sizeof(LOOKUP3args), (xdrproc_t)xdr_LOOKUP3res, sizeof(LOOKUP3res), self},
        {NFS_PROGRAM, NFS_V3, NFS3_ACCESS, nfs3_access_proc, (xdrproc_t)xdr_ACCESS3args, sizeof(ACCESS3args), (xdrproc_t)xdr_ACCESS3res, sizeof(ACCESS3res), self},
        {NFS_PROGRAM, NFS_V3, NFS3_READLINK, kv_nfs3_readlink_proc, (xdrproc_t)xdr_READLINK3args, sizeof(READLINK3args), (xdrproc_t)xdr_READLINK3res, sizeof(READLINK3res), self},
        {NFS_PROGRAM, NFS_V3, NFS3_READ, kv_nfs3_read_proc, (xdrproc_t)xdr_READ3args, sizeof(READ3args), (xdrproc_t)xdr_READ3res, sizeof(READ3res), self},
        {NFS_PROGRAM, NFS_V3, NFS3_WRITE, kv_nfs3_write_proc, (xdrproc_t)xdr_WRITE3args, sizeof(WRITE3args), (xdrproc_t)xdr_WRITE3res, sizeof(WRITE3res), self},
        {NFS_PROGRAM, NFS_V3, NFS3_CREATE, kv_nfs3_create_proc, (xdrproc_t)xdr_CREATE3args, sizeof(CREATE3args), (xdrproc_t)xdr_CREATE3res, sizeof(CREATE3res), self},
        {NFS_PROGRAM, NFS_V3, NFS3_MKDIR, kv_nfs3_mkdir_proc, (xdrproc_t)xdr_MKDIR3args, sizeof(MKDIR3args), (xdrproc_t)xdr_MKDIR3res, sizeof(MKDIR3res), self},
        {NFS_PROGRAM, NFS_V3, NFS3_SYMLINK, kv_nfs3_symlink_proc, (xdrproc_t)xdr_SYMLINK3args, sizeof(SYMLINK3args), (xdrproc_t)xdr_SYMLINK3res, sizeof(SYMLINK3res), self},
        {NFS_PROGRAM, NFS_V3, NFS3_MKNOD, kv_nfs3_mknod_proc, (xdrproc_t)xdr_MKNOD3args, sizeof(MKNOD3args), (xdrproc_t)xdr_MKNOD3res, sizeof(MKNOD3res), self},
        {NFS_PROGRAM, NFS_V3, NFS3_REMOVE, kv_nfs3_remove_proc, (xdrproc_t)xdr_REMOVE3args, sizeof(REMOVE3args), (xdrproc_t)xdr_REMOVE3res, sizeof(REMOVE3res), self},
        {NFS_PROGRAM, NFS_V3, NFS3_RMDIR, kv_nfs3_rmdir_proc, (xdrproc_t)xdr_RMDIR3args, sizeof(RMDIR3args), (xdrproc_t)xdr_RMDIR3res, sizeof(RMDIR3res), self},
        {NFS_PROGRAM, NFS_V3, NFS3_RENAME, kv_nfs3_rename_proc, (xdrproc_t)xdr_RENAME3args, sizeof(RENAME3args), (xdrproc_t)xdr_RENAME3res, sizeof(RENAME3res), self},
        {NFS_PROGRAM, NFS_V3, NFS3_LINK, kv_nfs3_link_proc, (xdrproc_t)xdr_LINK3args, sizeof(LINK3args), (xdrproc_t)xdr_LINK3res, sizeof(LINK3res), self},
        {NFS_PROGRAM, NFS_V3, NFS3_READDIR, kv_nfs3_readdir_proc, (xdrproc_t)xdr_READDIR3args, sizeof(READDIR3args), (xdrproc_t)xdr_READDIR3res, sizeof(READDIR3res), self},
        {NFS_PROGRAM, NFS_V3, NFS3_READDIRPLUS, kv_nfs3_readdirplus_proc, (xdrproc_t)xdr_READDIRPLUS3args, sizeof(READDIRPLUS3args), (xdrproc_t)xdr_READDIRPLUS3res, sizeof(READDIRPLUS3res), self},
        {NFS_PROGRAM, NFS_V3, NFS3_FSSTAT, nfs3_fsstat_proc, (xdrproc_t)xdr_FSSTAT3args, sizeof(FSSTAT3args), (xdrproc_t)xdr_FSSTAT3res, sizeof(FSSTAT3res), self},
        {NFS_PROGRAM, NFS_V3, NFS3_FSINFO, nfs3_fsinfo_proc, (xdrproc_t)xdr_FSINFO3args, sizeof(FSINFO3args), (xdrproc_t)xdr_FSINFO3res, sizeof(FSINFO3res), self},
        {NFS_PROGRAM, NFS_V3, NFS3_PATHCONF, nfs3_pathconf_proc, (xdrproc_t)xdr_PATHCONF3args, sizeof(PATHCONF3args), (xdrproc_t)xdr_PATHCONF3res, sizeof(PATHCONF3res), self},
        {NFS_PROGRAM, NFS_V3, NFS3_COMMIT, nfs3_commit_proc, (xdrproc_t)xdr_COMMIT3args, sizeof(COMMIT3args), (xdrproc_t)xdr_COMMIT3res, sizeof(COMMIT3res), self},
        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_NULL, nfs3_null_proc, NULL, 0, NULL, 0, self},
        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_MNT, mount3_mnt_proc, (xdrproc_t)xdr_nfs_dirpath, sizeof(nfs_dirpath), (xdrproc_t)xdr_nfs_mountres3, sizeof(nfs_mountres3), self},
        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_DUMP, mount3_dump_proc, NULL, 0, (xdrproc_t)xdr_nfs_mountlist, sizeof(nfs_mountlist), self},
        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_UMNT, mount3_umnt_proc, (xdrproc_t)xdr_nfs_dirpath, sizeof(nfs_dirpath), NULL, 0, self},
        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_UMNTALL, mount3_umntall_proc, NULL, 0, NULL, 0, self},
        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_EXPORT, mount3_export_proc, NULL, 0, (xdrproc_t)xdr_nfs_exports, sizeof(nfs_exports), self},
    };
    for (int i = 0; i < sizeof(pt)/sizeof(pt[0]); i++)
    {
        self->proc_table.insert(pt[i]);
    }
}
|
||||
|
||||
// Initialize the KV file system state: resolve the metadata image/inode from
// config, validate pool assignment, seed the inode ID generator, load tuning
// parameters, synchronously open the KV database (spinning the event loop
// until the open callback fires) and start the periodic inode-touch timer.
// Exits the process on any fatal configuration error.
void kv_fs_state_t::init(nfs_proxy_t *proxy, json11::Json cfg)
{
    this->proxy = proxy;
    auto & pool_cfg = proxy->cli->st_cli.pool_config.at(proxy->default_pool_id);
    // "fs" may be a numeric inode (must include the pool bits) ...
    fs_kv_inode = cfg["fs"].uint64_value();
    if (fs_kv_inode)
    {
        if (!INODE_POOL(fs_kv_inode))
        {
            fprintf(stderr, "FS metadata inode number must include pool\n");
            exit(1);
        }
    }
    else
    {
        // ... or an image name, resolved through the inode config
        for (auto & ic: proxy->cli->st_cli.inode_config)
        {
            if (ic.second.name == cfg["fs"].string_value())
            {
                fs_kv_inode = ic.first;
                break;
            }
        }
        if (!fs_kv_inode)
        {
            fprintf(stderr, "FS metadata image \"%s\" does not exist\n", cfg["fs"].string_value().c_str());
            exit(1);
        }
    }
    // The pool must be explicitly marked as dedicated to this FS
    if (proxy->cli->st_cli.inode_config.find(fs_kv_inode) != proxy->cli->st_cli.inode_config.end())
    {
        auto & name = proxy->cli->st_cli.inode_config.at(fs_kv_inode).name;
        if (pool_cfg.used_for_fs != name)
        {
            fprintf(stderr, "Please mark pool as used for this file system with `vitastor-cli modify-pool --used-for-fs %s %s`\n",
                name.c_str(), cfg["fs"].string_value().c_str());
            exit(1);
        }
    }
    // Start allocating inode IDs above the highest existing image ID in the
    // default pool (lower_bound on the next pool, then step back one entry)
    auto img_it = proxy->cli->st_cli.inode_config.lower_bound(INODE_WITH_POOL(proxy->default_pool_id+1, 0));
    if (img_it != proxy->cli->st_cli.inode_config.begin())
    {
        img_it--;
        if (img_it != proxy->cli->st_cli.inode_config.begin() && INODE_POOL(img_it->first) == proxy->default_pool_id)
        {
            idgen[proxy->default_pool_id].min_id = INODE_NO_POOL(img_it->first) + 1;
        }
    }
    // Tuning parameters with defaults
    readdir_getattr_parallel = cfg["readdir_getattr_parallel"].uint64_value();
    if (!readdir_getattr_parallel)
        readdir_getattr_parallel = 8;
    id_alloc_batch_size = cfg["id_alloc_batch_size"].uint64_value();
    if (!id_alloc_batch_size)
        id_alloc_batch_size = 200;
    touch_interval = cfg["touch_interval"].uint64_value();
    if (touch_interval < 100) // ms
        touch_interval = 100;
    pool_block_size = pool_cfg.pg_stripe_size;
    pool_alignment = pool_cfg.bitmap_granularity;
    // Open DB and wait
    int open_res = 0;
    bool open_done = false;
    proxy->db = new kv_dbw_t(proxy->cli);
    proxy->db->open(fs_kv_inode, cfg, [&](int res)
    {
        open_done = true;
        open_res = res;
    });
    // Spin the ring loop until the open callback reports completion
    while (!open_done)
    {
        proxy->ringloop->loop();
        if (open_done)
            break;
        proxy->ringloop->wait();
    }
    if (open_res < 0)
    {
        fprintf(stderr, "Failed to open key/value filesystem metadata index: %s (code %d)\n",
            strerror(-open_res), open_res);
        exit(1);
    }
    fs_inode_count = ((uint64_t)1 << (64-POOL_ID_BITS)) - 1;
    // Files below this size share an inode; defaults to one pool block
    shared_inode_threshold = pool_block_size;
    if (!cfg["shared_inode_threshold"].is_null())
    {
        shared_inode_threshold = cfg["shared_inode_threshold"].uint64_value();
    }
    // Scratch buffers, at least 1 MB each
    zero_block.resize(pool_block_size < 1048576 ? 1048576 : pool_block_size);
    scrap_block.resize(pool_block_size < 1048576 ? 1048576 : pool_block_size);
    // Periodic timer updating mtimes of recently modified inodes
    touch_timer_id = proxy->epmgr->tfd->set_timer(touch_interval, true, [this](int){ touch_inodes(); });
}
|
||||
|
||||
// Tear down FS state: disarm the periodic inode-touch timer if it was set.
kv_fs_state_t::~kv_fs_state_t()
{
    if (!proxy || touch_timer_id < 0)
        return;
    proxy->epmgr->tfd->clear_timer(touch_timer_id);
    touch_timer_id = -1;
}
|
||||
|
||||
// Refresh mtime/ctime of one inode with a CAS-guarded rewrite of its KV entry.
// The write only succeeds if the entry is still byte-identical to what was
// read; on a CAS conflict (-EAGAIN) the whole operation is retried with a
// fresh, uncached read.
static void touch_inode(nfs_proxy_t *proxy, inode_t ino, bool allow_cache)
{
    kv_read_inode(proxy, ino, [proxy, ino](int res, const std::string & value, json11::Json attrs)
    {
        if (!res)
        {
            auto ientry = attrs.object_items();
            ientry["mtime"] = ientry["ctime"] = nfstime_now_str();
            // Drop "verf" (written by exclusive CREATE) — the entry is rewritten anyway
            ientry.erase("verf");
            // FIXME: Use "update" query
            // Heap flag shared between the completion and the CAS callback:
            // the CAS callback records whether the key still existed
            bool *found = new bool;
            *found = true;
            proxy->db->set(kv_inode_key(ino), json11::Json(ientry).dump(), [proxy, ino, found](int res)
            {
                if (!*found)
                    res = -ENOENT;  // inode vanished between read and write — nothing to touch
                delete found;
                if (res == -EAGAIN)
                    touch_inode(proxy, ino, false);  // CAS conflict: retry without cache
            }, [value, found](int res, const std::string & old_value)
            {
                *found = res == 0;
                // Only overwrite if the inode still exists and is unchanged
                return res == 0 && old_value == value;
            });
        }
    }, allow_cache);
}
|
||||
|
||||
void kv_fs_state_t::touch_inodes()
|
||||
{
|
||||
std::set<inode_t> q = std::move(touch_queue);
|
||||
for (auto ino: q)
|
||||
{
|
||||
touch_inode(proxy, ino, true);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,134 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// NFS proxy over VitastorKV database - header
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "nfs/nfs.h"
|
||||
|
||||
#define KV_ROOT_INODE 1
|
||||
#define SHARED_FILE_MAGIC_V1 0x711A5158A6EDF17E
|
||||
|
||||
struct nfs_kv_write_state;
|
||||
|
||||
// Key for resuming a READDIR/READDIRPLUS listing: the directory inode plus
// the (cookieverf, cookie) pair previously handed out to the NFS client.
struct list_cookie_t
{
    uint64_t dir_ino, cookieverf, cookie;
};
|
||||
|
||||
inline bool operator < (const list_cookie_t & a, const list_cookie_t & b)
|
||||
{
|
||||
return a.dir_ino < b.dir_ino || a.dir_ino == b.dir_ino &&
|
||||
(a.cookieverf < b.cookieverf || a.cookieverf == b.cookieverf && a.cookie < b.cookie);
|
||||
};
|
||||
|
||||
// Value stored per listing cookie: the raw KV key to continue iteration from.
struct list_cookie_val_t
{
    std::string key;
};

// One queued request for space inside a shared inode: the write state
// machine to resume and the state number to resume it at.
struct shared_alloc_queue_t
{
    nfs_kv_write_state *st;
    int state;
};

// Tracks an in-progress size extension of an inode, shared by concurrent
// writers to the same file.
struct kv_inode_extend_t
{
    int refcnt = 0;
    // cur_extend — size currently being written, next_extend — requested
    // target size, done_extend — last size known to be persisted
    // NOTE(review): exact semantics inferred from names — confirm against the write path
    uint64_t cur_extend = 0, next_extend = 0, done_extend = 0;
    std::vector<std::function<void()>> waiters;  // resumed when the extension settles
};

// Per-pool inode number generator state (see allocate_new_id()).
struct kv_idgen_t
{
    // Locally reserved batch: next_id is the next free ID, allocated_id the
    // last ID covered by the batch reserved in the DB
    uint64_t next_id = 1, allocated_id = 0;
    // Lower bound for new IDs (init() raises it above existing image IDs in the pool)
    uint64_t min_id = 1;
    // IDs returned to the free list after rolled-back creates
    std::vector<uint64_t> unallocated_ids;
};
|
||||
|
||||
// Shared state of the KV-filesystem NFS proxy: configuration knobs, per-pool
// inode ID generators, listing cookies and in-flight operation queues.
struct kv_fs_state_t
{
    nfs_proxy_t *proxy = NULL;
    int touch_timer_id = -1;  // periodic mtime-flush timer, -1 = not armed

    uint64_t fs_kv_inode = 0;     // inode of the VitastorKV metadata image
    uint64_t fs_inode_count = 0;  // maximum inode number (without pool bits)
    int readdir_getattr_parallel = 8, id_alloc_batch_size = 200;
    uint64_t pool_block_size = 0;  // pg_stripe_size of the data pool
    uint64_t pool_alignment = 0;   // bitmap_granularity of the data pool
    // Size threshold: reads below it take the shared-inode path
    uint64_t shared_inode_threshold = 0;
    uint64_t touch_interval = 1000;  // ms between touch_inodes() runs (min 100)

    std::map<list_cookie_t, list_cookie_val_t> list_cookies;
    std::map<pool_id_t, kv_idgen_t> idgen;
    std::vector<shared_alloc_queue_t> allocating_shared;
    uint64_t cur_shared_inode = 0, cur_shared_offset = 0;
    std::map<inode_t, kv_inode_extend_t> extends;
    std::set<inode_t> touch_queue;  // inodes awaiting an mtime/ctime refresh

    std::vector<uint8_t> zero_block;   // preallocated zero buffer (>= 1 MB or pool block)
    std::vector<uint8_t> scrap_block;  // scratch buffer of the same size

    void init(nfs_proxy_t *proxy, json11::Json cfg);
    void touch_inodes();
    ~kv_fs_state_t();
};
|
||||
|
||||
// On-disk header preceding a small file's data inside a shared inode.
struct shared_file_header_t
{
    uint64_t magic = 0;  // SHARED_FILE_MAGIC_V1
    // NOTE(review): presumably the owning file inode — confirm with the write path
    uint64_t inode = 0;
    // NOTE(review): presumably the allocated (aligned) size of this slot — confirm
    uint64_t alloc = 0;
};

// State of one unaligned read-modify-write against a Vitastor inode.
struct nfs_rmw_t
{
    nfs_proxy_t *parent = NULL;
    uint64_t ino = 0;
    uint64_t offset = 0;       // byte offset of the unaligned write
    uint8_t *buf = NULL;       // caller's data
    uint64_t size = 0;
    uint8_t *part_buf = NULL;  // bounce buffer for the partial block
    uint64_t version = 0;      // expected object version (CAS-style write)
    nfs_rmw_t *other = NULL;   // paired RMW for the opposite edge, if any
    std::function<void(nfs_rmw_t *)> cb;
    int res = 0;
};
|
||||
|
||||
nfsstat3 vitastor_nfs_map_err(int err);
|
||||
nfstime3 nfstime_from_str(const std::string & s);
|
||||
std::string nfstime_to_str(nfstime3 t);
|
||||
std::string nfstime_now_str();
|
||||
int kv_map_type(const std::string & type);
|
||||
fattr3 get_kv_attributes(nfs_client_t *self, uint64_t ino, json11::Json attrs);
|
||||
std::string kv_direntry_key(uint64_t dir_ino, const std::string & filename);
|
||||
std::string kv_direntry_filename(const std::string & key);
|
||||
std::string kv_inode_key(uint64_t ino);
|
||||
std::string kv_fh(uint64_t ino);
|
||||
uint64_t kv_fh_inode(const std::string & fh);
|
||||
bool kv_fh_valid(const std::string & fh);
|
||||
void allocate_new_id(nfs_client_t *self, pool_id_t pool_id, std::function<void(int res, uint64_t new_id)> cb);
|
||||
void kv_read_inode(nfs_proxy_t *proxy, uint64_t ino,
|
||||
std::function<void(int res, const std::string & value, json11::Json ientry)> cb,
|
||||
bool allow_cache = false);
|
||||
uint64_t align_shared_size(nfs_client_t *self, uint64_t size);
|
||||
void nfs_do_rmw(nfs_rmw_t *rmw);
|
||||
|
||||
int kv_nfs3_getattr_proc(void *opaque, rpc_op_t *rop);
|
||||
int kv_nfs3_setattr_proc(void *opaque, rpc_op_t *rop);
|
||||
int kv_nfs3_lookup_proc(void *opaque, rpc_op_t *rop);
|
||||
int kv_nfs3_readlink_proc(void *opaque, rpc_op_t *rop);
|
||||
int kv_nfs3_read_proc(void *opaque, rpc_op_t *rop);
|
||||
int kv_nfs3_write_proc(void *opaque, rpc_op_t *rop);
|
||||
int kv_nfs3_create_proc(void *opaque, rpc_op_t *rop);
|
||||
int kv_nfs3_mkdir_proc(void *opaque, rpc_op_t *rop);
|
||||
int kv_nfs3_symlink_proc(void *opaque, rpc_op_t *rop);
|
||||
int kv_nfs3_mknod_proc(void *opaque, rpc_op_t *rop);
|
||||
int kv_nfs3_remove_proc(void *opaque, rpc_op_t *rop);
|
||||
int kv_nfs3_rmdir_proc(void *opaque, rpc_op_t *rop);
|
||||
int kv_nfs3_rename_proc(void *opaque, rpc_op_t *rop);
|
||||
int kv_nfs3_link_proc(void *opaque, rpc_op_t *rop);
|
||||
int kv_nfs3_readdir_proc(void *opaque, rpc_op_t *rop);
|
||||
int kv_nfs3_readdirplus_proc(void *opaque, rpc_op_t *rop);
|
|
@ -0,0 +1,365 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// NFS proxy over VitastorKV database - CREATE, MKDIR, SYMLINK, MKNOD
|
||||
|
||||
#include <string.h>
#include <sys/time.h>

#include "str_util.h"
#include "nfs_proxy.h"
#include "nfs_kv.h"
|
||||
|
||||
// Allocate a new inode number in the given pool (pool_id 0 = poolless, used
// for directories and special files). IDs come, in order of preference, from:
//   1) unallocated_ids — numbers returned after rolled-back creates,
//   2) the locally reserved batch [next_id .. allocated_id],
//   3) a new batch reserved in the KV DB via a CAS-protected "id<pool>" counter.
// cb receives (0, full inode number with pool bits) or (negative errno, 0).
void allocate_new_id(nfs_client_t *self, pool_id_t pool_id, std::function<void(int res, uint64_t new_id)> cb)
{
    auto & idgen = self->parent->kvfs->idgen[pool_id];
    if (idgen.unallocated_ids.size())
    {
        // Reuse a previously freed ID
        auto new_id = idgen.unallocated_ids.back();
        idgen.unallocated_ids.pop_back();
        cb(0, INODE_WITH_POOL(pool_id, new_id));
        return;
    }
    else if (idgen.next_id <= idgen.allocated_id)
    {
        // The local batch still has free IDs
        idgen.next_id++;
        cb(0, INODE_WITH_POOL(pool_id, idgen.next_id-1));
        return;
    }
    // FIXME: Maybe allow FS and block volumes to cohabitate in the same pool, but with different ID ranges
    else if (idgen.next_id >= ((uint64_t)1 << (64-POOL_ID_BITS)))
    {
        // Inode number space for this pool is exhausted
        cb(-ENOSPC, 0);
        return;
    }
    // Reserve a new batch: read the counter, then CAS it forward
    self->parent->db->get((pool_id ? "id"+std::to_string(pool_id) : "id"), [=](int res, const std::string & prev_str)
    {
        auto & idgen = self->parent->kvfs->idgen[pool_id];
        if (res < 0 && res != -ENOENT)
        {
            cb(res, 0);
            return;
        }
        // -ENOENT means the counter does not exist yet — treated as 0
        uint64_t prev_val = stoull_full(prev_str);
        if (prev_val >= ((uint64_t)1 << (64-POOL_ID_BITS)))
        {
            cb(-ENOSPC, 0);
            return;
        }
        if (prev_val < idgen.min_id)
        {
            // Skip the range occupied by pre-existing images in this pool
            prev_val = idgen.min_id;
        }
        uint64_t new_val = prev_val + self->parent->kvfs->id_alloc_batch_size;
        if (new_val >= self->parent->kvfs->fs_inode_count)
        {
            new_val = self->parent->kvfs->fs_inode_count;
        }
        self->parent->db->set((pool_id ? "id"+std::to_string(pool_id) : "id"), std::to_string(new_val), [=](int res)
        {
            if (res == -EAGAIN)
            {
                // CAS failure - retry
                allocate_new_id(self, pool_id, cb);
            }
            else if (res < 0)
            {
                cb(res, 0);
            }
            else
            {
                // prev_val+1 is handed out right away, so the next locally
                // free ID is prev_val+2; the batch ends at new_val
                auto & idgen = self->parent->kvfs->idgen[pool_id];
                idgen.next_id = prev_val+2;
                idgen.allocated_id = new_val;
                cb(0, INODE_WITH_POOL(pool_id, prev_val+1));
            }
        }, [prev_val](int res, const std::string & value)
        {
            // FIXME: Allow to modify value from CAS callback? ("update" query)
            return res < 0 || stoull_full(value) == prev_val;
        });
    });
}
|
||||
|
||||
// State of a CREATE/MKDIR/SYMLINK/MKNOD operation (driven by kv_continue_create).
struct kv_create_state
{
    nfs_client_t *self = NULL;
    rpc_op_t *rop = NULL;
    bool exclusive = false;  // NFS_EXCLUSIVE create mode
    uint64_t verf = 0;       // client verifier cookie for exclusive creates
    uint64_t dir_ino = 0;    // parent directory inode
    std::string filename;
    // state
    int res = 0;
    pool_id_t pool_id = 0;   // pool of the new inode (0 for dirs/special files)
    uint64_t new_id = 0;     // allocated inode number
    json11::Json::object attrobj;  // input attributes; moved into attrs on start
    json11::Json attrs;
    std::string direntry_text;     // serialized direntry JSON
    uint64_t dup_ino = 0;    // inode found for a duplicate exclusive create, if any
    std::function<void(int res)> cb;
};
|
||||
|
||||
// State machine for CREATE/MKDIR/SYMLINK/MKNOD:
//   resume_1: allocate an inode number
//   resume_2: write the inode entry (CAS: key must not exist)
//   resume_3: write the directory entry (CAS: direntry must not exist)
//   resume_4/resume_5: on a direntry conflict, delete the freshly created
//     inode, return its ID to the free list, and report -EEXIST — unless the
//     conflict is a retransmitted exclusive CREATE with the same verifier,
//     in which case the previously created inode is returned as success.
static void kv_continue_create(kv_create_state *st, int state)
{
    if (state == 0) {}
    else if (state == 1) goto resume_1;
    else if (state == 2) goto resume_2;
    else if (state == 3) goto resume_3;
    else if (state == 4) goto resume_4;
    else if (state == 5) goto resume_5;
    if (st->self->parent->trace)
        fprintf(stderr, "[%d] CREATE %ju/%s ATTRS %s\n", st->self->nfs_fd, st->dir_ino, st->filename.c_str(), json11::Json(st->attrobj).dump().c_str());
    if (st->filename == "" || st->filename.find("/") != std::string::npos)
    {
        // Empty names and names containing '/' are invalid
        auto cb = std::move(st->cb);
        cb(-EINVAL);
        return;
    }
    st->attrobj["ctime"] = nfstime_now_str();
    if (st->attrobj.find("mtime") == st->attrobj.end())
        st->attrobj["mtime"] = st->attrobj["ctime"];
    st->attrs = std::move(st->attrobj);
resume_1:
    // Generate inode ID
    // Directories and special files don't need pool
    st->pool_id = kv_map_type(st->attrs["type"].string_value()) == NF3REG
        ? st->self->parent->default_pool_id
        : 0;
    allocate_new_id(st->self, st->pool_id, [st](int res, uint64_t new_id)
    {
        st->res = res;
        st->new_id = new_id;
        kv_continue_create(st, 2);
    });
    return;
resume_2:
    if (st->res < 0)
    {
        auto cb = std::move(st->cb);
        cb(st->res);
        return;
    }
    // Write the inode entry; the CAS callback requires the key to be free
    st->self->parent->db->set(kv_inode_key(st->new_id), st->attrs.dump().c_str(), [st](int res)
    {
        st->res = res;
        kv_continue_create(st, 3);
    }, [st](int res, const std::string & value)
    {
        return res == -ENOENT;
    });
    return;
resume_3:
    if (st->res == -EAGAIN)
    {
        // Inode ID generator failure - retry
        goto resume_1;
    }
    if (st->res < 0)
    {
        auto cb = std::move(st->cb);
        cb(st->res);
        return;
    }
    {
        auto direntry = json11::Json::object{ { "ino", st->new_id } };
        if (st->attrs["type"].string_value() == "dir")
        {
            direntry["type"] = "dir";
        }
        st->direntry_text = json11::Json(direntry).dump().c_str();
    }
    // Set direntry
    st->dup_ino = 0;
    st->self->parent->db->set(kv_direntry_key(st->dir_ino, st->filename), st->direntry_text, [st](int res)
    {
        st->res = res;
        kv_continue_create(st, 4);
    }, [st](int res, const std::string & value)
    {
        // CAS compare - check that the key doesn't exist
        if (res == 0)
        {
            std::string err;
            auto direntry = json11::Json::parse(value, err);
            if (err != "")
            {
                fprintf(stderr, "Invalid JSON in direntry %s = %s: %s, overwriting\n",
                    kv_direntry_key(st->dir_ino, st->filename).c_str(), value.c_str(), err.c_str());
                return true;
            }
            if (st->exclusive && direntry["verf"].uint64_value() == st->verf)
            {
                // Same exclusive CREATE seen before — remember the old inode
                st->dup_ino = direntry["ino"].uint64_value();
                return false;
            }
            return false;
        }
        return true;
    });
    return;
resume_4:
    if (st->res == -EAGAIN)
    {
        // Direntry already exists
        st->self->parent->db->del(kv_inode_key(st->new_id), [st](int res)
        {
            st->res = res;
            kv_continue_create(st, 5);
        });
        // BUGFIX: wait for the asynchronous deletion to complete. Previously
        // control fell straight through into resume_5 with a stale st->res
        // and the callback was invoked a second time when del() finished.
        return;
resume_5:
        if (st->res < 0)
        {
            fprintf(stderr, "failed to delete duplicate inode %ju left from create %s (code %d)\n", st->new_id, strerror(-st->res), st->res);
        }
        else
        {
            // Return the rolled-back inode number to the free list
            auto & idgen = st->self->parent->kvfs->idgen[INODE_POOL(st->new_id)];
            idgen.unallocated_ids.push_back(INODE_NO_POOL(st->new_id));
        }
        if (st->dup_ino)
        {
            // Successfully created by the previous "exclusive" request
            st->new_id = st->dup_ino;
        }
        st->res = st->dup_ino ? 0 : -EEXIST;
    }
    if (!st->res)
    {
        // Schedule a parent directory mtime refresh
        st->self->parent->kvfs->touch_queue.insert(st->dir_ino);
    }
    auto cb = std::move(st->cb);
    cb(st->res);
}
|
||||
|
||||
static void kv_create_setattr(json11::Json::object & attrobj, sattr3 & sattr)
|
||||
{
|
||||
if (sattr.mode.set_it)
|
||||
attrobj["mode"] = (uint64_t)sattr.mode.mode;
|
||||
if (sattr.uid.set_it)
|
||||
attrobj["uid"] = (uint64_t)sattr.uid.uid;
|
||||
if (sattr.gid.set_it)
|
||||
attrobj["gid"] = (uint64_t)sattr.gid.gid;
|
||||
if (sattr.atime.set_it)
|
||||
attrobj["atime"] = nfstime_to_str(sattr.atime.atime);
|
||||
if (sattr.mtime.set_it)
|
||||
attrobj["mtime"] = nfstime_to_str(sattr.mtime.mtime);
|
||||
}
|
||||
|
||||
// Fill the typed NFS3 reply (T = the *3res type, Tok = the matching *3resok)
// from the create result, queue the RPC reply and free the state. On success
// the reply carries the new file handle and the created inode's attributes.
template<class T, class Tok> static void kv_create_reply(kv_create_state *st, int res)
{
    T *reply = (T*)st->rop->reply;
    if (res < 0)
    {
        *reply = (T){ .status = vitastor_nfs_map_err(-res) };
    }
    else
    {
        *reply = (T){
            .status = NFS3_OK,
            .resok = (Tok){
                .obj = {
                    .handle_follows = 1,
                    .handle = xdr_copy_string(st->rop->xdrs, kv_fh(st->new_id)),
                },
                .obj_attributes = {
                    .attributes_follow = 1,
                    .attributes = get_kv_attributes(st->self, st->new_id, st->attrs),
                },
            },
        };
    }
    rpc_queue_reply(st->rop);
    delete st;
}
|
||||
|
||||
int kv_nfs3_create_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
kv_create_state *st = new kv_create_state;
|
||||
st->self = (nfs_client_t*)opaque;
|
||||
st->rop = rop;
|
||||
auto args = (CREATE3args*)rop->request;
|
||||
st->exclusive = args->how.mode == NFS_EXCLUSIVE;
|
||||
st->verf = st->exclusive ? *(uint64_t*)&args->how.verf : 0;
|
||||
st->dir_ino = kv_fh_inode(args->where.dir);
|
||||
st->filename = args->where.name;
|
||||
if (args->how.mode == NFS_EXCLUSIVE)
|
||||
{
|
||||
st->attrobj["verf"] = *(uint64_t*)&args->how.verf;
|
||||
}
|
||||
else if (args->how.mode == NFS_UNCHECKED)
|
||||
{
|
||||
kv_create_setattr(st->attrobj, args->how.obj_attributes);
|
||||
if (args->how.obj_attributes.size.set_it)
|
||||
{
|
||||
st->attrobj["size"] = (uint64_t)args->how.obj_attributes.size.size;
|
||||
st->attrobj["empty"] = true;
|
||||
}
|
||||
}
|
||||
st->cb = [st](int res) { kv_create_reply<CREATE3res, CREATE3resok>(st, res); };
|
||||
kv_continue_create(st, 0);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int kv_nfs3_mkdir_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
kv_create_state *st = new kv_create_state;
|
||||
st->self = (nfs_client_t*)opaque;
|
||||
st->rop = rop;
|
||||
auto args = (MKDIR3args*)rop->request;
|
||||
st->dir_ino = kv_fh_inode(args->where.dir);
|
||||
st->filename = args->where.name;
|
||||
st->attrobj["type"] = "dir";
|
||||
st->attrobj["parent_ino"] = st->dir_ino;
|
||||
kv_create_setattr(st->attrobj, args->attributes);
|
||||
st->cb = [st](int res) { kv_create_reply<MKDIR3res, MKDIR3resok>(st, res); };
|
||||
kv_continue_create(st, 0);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int kv_nfs3_symlink_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
kv_create_state *st = new kv_create_state;
|
||||
st->self = (nfs_client_t*)opaque;
|
||||
st->rop = rop;
|
||||
auto args = (SYMLINK3args*)rop->request;
|
||||
st->dir_ino = kv_fh_inode(args->where.dir);
|
||||
st->filename = args->where.name;
|
||||
st->attrobj["type"] = "link";
|
||||
st->attrobj["symlink"] = (std::string)args->symlink.symlink_data;
|
||||
kv_create_setattr(st->attrobj, args->symlink.symlink_attributes);
|
||||
st->cb = [st](int res) { kv_create_reply<SYMLINK3res, SYMLINK3resok>(st, res); };
|
||||
kv_continue_create(st, 0);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// MKNOD: create a character/block device, socket or FIFO entry.
// Other node types are rejected with NFS3ERR_INVAL.
int kv_nfs3_mknod_proc(void *opaque, rpc_op_t *rop)
{
    kv_create_state *st = new kv_create_state;
    st->self = (nfs_client_t*)opaque;
    st->rop = rop;
    auto args = (MKNOD3args*)rop->request;
    st->dir_ino = kv_fh_inode(args->where.dir);
    st->filename = args->where.name;
    if (args->what.type == NF3CHR || args->what.type == NF3BLK)
    {
        st->attrobj["type"] = (args->what.type == NF3CHR ? "chr" : "blk");
        // chr_device is read for NF3BLK too — blk_device occupies the same
        // storage in the mknoddata3 XDR union, so this appears intentional.
        // NOTE(review): confirm against the generated XDR definitions.
        st->attrobj["major"] = (uint64_t)args->what.chr_device.spec.specdata1;
        st->attrobj["minor"] = (uint64_t)args->what.chr_device.spec.specdata2;
        kv_create_setattr(st->attrobj, args->what.chr_device.dev_attributes);
    }
    else if (args->what.type == NF3SOCK || args->what.type == NF3FIFO)
    {
        st->attrobj["type"] = (args->what.type == NF3SOCK ? "sock" : "fifo");
        kv_create_setattr(st->attrobj, args->what.sock_attributes);
    }
    else
    {
        // Unsupported node type — reply synchronously
        *(MKNOD3res*)rop->reply = (MKNOD3res){ .status = NFS3ERR_INVAL };
        rpc_queue_reply(rop);
        delete st;
        return 0;
    }
    st->cb = [st](int res) { kv_create_reply<MKNOD3res, MKNOD3resok>(st, res); };
    kv_continue_create(st, 0);
    return 1;
}
|
|
@ -0,0 +1,78 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// NFS proxy over VitastorKV database - GETATTR
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
#include "nfs_proxy.h"
|
||||
#include "nfs_kv.h"
|
||||
|
||||
// Attributes are always stored in the inode
|
||||
// Read and JSON-decode the inode entry of `ino` from the KV DB.
// cb(res, raw_value, parsed_attrs): res is 0 on success, -ENOENT if the
// entry is absent, -EIO if the stored JSON is corrupt. The root inode is
// special-cased: if missing, it is synthesized as an empty directory.
void kv_read_inode(nfs_proxy_t *proxy, uint64_t ino,
    std::function<void(int res, const std::string & value, json11::Json ientry)> cb,
    bool allow_cache)
{
    auto key = kv_inode_key(ino);
    proxy->db->get(key, [=](int res, const std::string & value)
    {
        if (ino == KV_ROOT_INODE && res == -ENOENT)
        {
            // Allow root inode to not exist
            cb(0, "", json11::Json(json11::Json::object{ { "type", "dir" } }));
            return;
        }
        if (res < 0)
        {
            // -ENOENT is a normal outcome (caller decides), anything else is logged
            if (res != -ENOENT)
                fprintf(stderr, "Error reading inode %s: %s (code %d)\n", kv_inode_key(ino).c_str(), strerror(-res), res);
            cb(res, "", json11::Json());
            return;
        }
        std::string err;
        auto attrs = json11::Json::parse(value, err);
        if (err != "")
        {
            fprintf(stderr, "Invalid JSON in inode %s = %s: %s\n", kv_inode_key(ino).c_str(), value.c_str(), err.c_str());
            res = -EIO;
        }
        cb(res, value, attrs);
    }, allow_cache);
}
|
||||
|
||||
// GETATTR: return the attributes stored in the inode's KV entry.
// Returns 0 when the reply was queued synchronously, 1 when it will be
// queued from the kv_read_inode callback.
int kv_nfs3_getattr_proc(void *opaque, rpc_op_t *rop)
{
    nfs_client_t *self = (nfs_client_t*)opaque;
    GETATTR3args *args = (GETATTR3args*)rop->request;
    GETATTR3res *reply = (GETATTR3res*)rop->reply;
    std::string fh = args->object;
    auto ino = kv_fh_inode(fh);
    if (self->parent->trace)
        fprintf(stderr, "[%d] GETATTR %ju\n", self->nfs_fd, ino);
    if (!kv_fh_valid(fh))
    {
        *reply = (GETATTR3res){ .status = NFS3ERR_INVAL };
        rpc_queue_reply(rop);
        return 0;
    }
    kv_read_inode(self->parent, ino, [=](int res, const std::string & value, json11::Json attrs)
    {
        if (self->parent->trace)
            fprintf(stderr, "[%d] GETATTR %ju -> %s\n", self->nfs_fd, ino, value.c_str());
        if (res < 0)
        {
            *reply = (GETATTR3res){ .status = vitastor_nfs_map_err(-res) };
        }
        else
        {
            *reply = (GETATTR3res){
                .status = NFS3_OK,
                .resok = (GETATTR3resok){
                    .obj_attributes = get_kv_attributes(self, ino, attrs),
                },
            };
        }
        // Reply is sent from this callback
        rpc_queue_reply(rop);
    });
    return 1;
}
|
|
@ -0,0 +1,193 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// NFS proxy over VitastorKV database - LINK
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
#include "nfs_proxy.h"
|
||||
#include "nfs_kv.h"
|
||||
|
||||
// State of a LINK (hard link) operation, driven by nfs_kv_continue_link().
struct nfs_kv_link_state
{
    nfs_client_t *self = NULL;
    rpc_op_t *rop = NULL;
    uint64_t ino = 0;        // source inode being linked
    uint64_t dir_ino = 0;    // target directory inode
    std::string filename;    // new name inside dir_ino
    std::string ientry_text; // raw inode JSON as read (used as the CAS "expected" value)
    json11::Json ientry;     // decoded inode entry
    bool retrying = false;   // true after a CAS conflict on the nlink update
    int wait = 0;            // count of outstanding parallel reads
    int res = 0, res2 = 0;   // primary / secondary result codes
    std::function<void(int)> cb;
};
|
||||
|
||||
static void nfs_kv_continue_link(nfs_kv_link_state *st, int state)
{
    // 1) Read the source inode
    // 2) If it's a directory - fail with -EISDIR
    // 3) Create the new direntry with the same inode reference
    // 4) Update the inode entry with refcount++
    // 5) Retry update if CAS failed but the inode exists
    // 6) Otherwise fail and remove the new direntry
    // Yeah we may leave a bad direntry if we crash
    // But the other option is to possibly leave an inode with too big refcount
    if (state == 0) {}
    else if (state == 1) goto resume_1;
    else if (state == 2) goto resume_2;
    else if (state == 3) goto resume_3;
    else if (state == 4) goto resume_4;
    else
    {
        fprintf(stderr, "BUG: invalid state in nfs_kv_continue_link()");
        abort();
    }
resume_0:
    // Check that the source inode exists and is not a directory
    // (on a retry only the source inode is re-read, hence wait = 1)
    st->wait = st->retrying ? 1 : 2;
    st->res2 = 0;
    kv_read_inode(st->self->parent, st->ino, [st](int res, const std::string & value, json11::Json attrs)
    {
        st->res = res == 0 ? (attrs["type"].string_value() == "dir" ? -EISDIR : 0) : res;
        st->ientry_text = value;
        st->ientry = attrs;
        if (!--st->wait)
            nfs_kv_continue_link(st, 1);
    });
    if (!st->retrying)
    {
        // Check that the new directory exists
        kv_read_inode(st->self->parent, st->dir_ino, [st](int res, const std::string & value, json11::Json attrs)
        {
            st->res2 = res == 0 ? (attrs["type"].string_value() == "dir" ? 0 : -ENOTDIR) : res;
            if (!--st->wait)
                nfs_kv_continue_link(st, 1);
        });
    }
    return;
resume_1:
    if (st->res < 0 || st->res2 < 0)
    {
        auto cb = std::move(st->cb);
        cb(st->res < 0 ? st->res : st->res2);
        return;
    }
    // Write the new direntry
    if (!st->retrying)
    {
        // CAS: the direntry must not already exist
        st->self->parent->db->set(kv_direntry_key(st->dir_ino, st->filename),
            json11::Json(json11::Json::object{ { "ino", st->ino } }).dump(), [st](int res)
        {
            st->res = res;
            nfs_kv_continue_link(st, 2);
        }, [st](int res, const std::string & old_value)
        {
            return res == -ENOENT;
        });
        return;
resume_2:
        if (st->res < 0)
        {
            auto cb = std::move(st->cb);
            cb(st->res);
            return;
        }
    }
    // Increase inode refcount
    {
        auto new_ientry = st->ientry.object_items();
        auto nlink = new_ientry["nlink"].uint64_value();
        // An absent/zero nlink means a single reference; linking makes it 2
        new_ientry["nlink"] = nlink ? nlink+1 : 2;
        new_ientry["ctime"] = nfstime_now_str();
        st->ientry = new_ientry;
    }
    // CAS: only overwrite if the inode is unchanged since we read it;
    // res2 records the CAS read result to distinguish "deleted" from "changed"
    st->self->parent->db->set(kv_inode_key(st->ino), st->ientry.dump(), [st](int res)
    {
        st->res = res;
        nfs_kv_continue_link(st, 3);
    }, [st](int res, const std::string & old_value)
    {
        st->res2 = res;
        return res == 0 && old_value == st->ientry_text;
    });
    return;
resume_3:
    if (st->res2 == -ENOENT)
    {
        // The inode disappeared while we were linking
        st->res = -ENOENT;
    }
    if (st->res == -EAGAIN)
    {
        // Re-read inode and retry
        st->retrying = true;
        goto resume_0;
    }
    if (st->res < 0)
    {
        // Maybe inode was deleted in the meantime, delete our direntry
        st->self->parent->db->del(kv_direntry_key(st->dir_ino, st->filename), [st](int res)
        {
            st->res2 = res;
            nfs_kv_continue_link(st, 4);
        });
        return;
resume_4:
        if (st->res2 < 0)
        {
            fprintf(stderr, "Warning: failed to delete new linked direntry %ju/%s: %s (code %d)\n",
                st->dir_ino, st->filename.c_str(), strerror(-st->res2), st->res2);
        }
    }
    if (!st->res)
    {
        // Schedule a target directory mtime refresh
        st->self->parent->kvfs->touch_queue.insert(st->dir_ino);
    }
    auto cb = std::move(st->cb);
    cb(st->res);
}
|
||||
|
||||
int kv_nfs3_link_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
auto st = new nfs_kv_link_state;
|
||||
st->self = (nfs_client_t*)opaque;
|
||||
st->rop = rop;
|
||||
LINK3args *args = (LINK3args*)rop->request;
|
||||
st->ino = kv_fh_inode(args->file);
|
||||
st->dir_ino = kv_fh_inode(args->link.dir);
|
||||
st->filename = args->link.name;
|
||||
if (st->self->parent->trace)
|
||||
fprintf(stderr, "[%d] LINK %ju -> %ju/%s\n", st->self->nfs_fd, st->ino, st->dir_ino, st->filename.c_str());
|
||||
if (!st->ino || !st->dir_ino || st->filename == "")
|
||||
{
|
||||
LINK3res *reply = (LINK3res*)rop->reply;
|
||||
*reply = (LINK3res){ .status = NFS3ERR_INVAL };
|
||||
rpc_queue_reply(rop);
|
||||
delete st;
|
||||
return 0;
|
||||
}
|
||||
st->cb = [st](int res)
|
||||
{
|
||||
LINK3res *reply = (LINK3res*)st->rop->reply;
|
||||
if (res < 0)
|
||||
{
|
||||
*reply = (LINK3res){ .status = vitastor_nfs_map_err(res) };
|
||||
}
|
||||
else
|
||||
{
|
||||
*reply = (LINK3res){
|
||||
.status = NFS3_OK,
|
||||
.resok = (LINK3resok){
|
||||
.file_attributes = (post_op_attr){
|
||||
.attributes_follow = 1,
|
||||
.attributes = get_kv_attributes(st->self, st->ino, st->ientry),
|
||||
},
|
||||
},
|
||||
};
|
||||
}
|
||||
rpc_queue_reply(st->rop);
|
||||
delete st;
|
||||
};
|
||||
nfs_kv_continue_link(st, 0);
|
||||
return 1;
|
||||
}
|
|
@ -0,0 +1,104 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// NFS proxy over VitastorKV database - LOOKUP, READLINK
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
#include "nfs_proxy.h"
|
||||
#include "nfs_kv.h"
|
||||
|
||||
// LOOKUP: resolve <dir_ino>/<filename> to a file handle plus attributes.
// Reads the direntry first, then the referenced inode.
int kv_nfs3_lookup_proc(void *opaque, rpc_op_t *rop)
{
    nfs_client_t *self = (nfs_client_t*)opaque;
    LOOKUP3args *args = (LOOKUP3args*)rop->request;
    LOOKUP3res *reply = (LOOKUP3res*)rop->reply;
    inode_t dir_ino = kv_fh_inode(args->what.dir);
    std::string filename = args->what.name;
    if (self->parent->trace)
        fprintf(stderr, "[%d] LOOKUP %ju/%s\n", self->nfs_fd, dir_ino, filename.c_str());
    if (!dir_ino || filename == "")
    {
        *reply = (LOOKUP3res){ .status = NFS3ERR_INVAL };
        rpc_queue_reply(rop);
        return 0;
    }
    self->parent->db->get(kv_direntry_key(dir_ino, filename), [=](int res, const std::string & value)
    {
        if (res < 0)
        {
            *reply = (LOOKUP3res){ .status = vitastor_nfs_map_err(-res) };
            rpc_queue_reply(rop);
            return;
        }
        std::string err;
        auto direntry = json11::Json::parse(value, err);
        if (err != "")
        {
            // Corrupt direntry JSON — report I/O error
            fprintf(stderr, "Invalid JSON in direntry %s = %s: %s\n", kv_direntry_key(dir_ino, filename).c_str(), value.c_str(), err.c_str());
            *reply = (LOOKUP3res){ .status = NFS3ERR_IO };
            rpc_queue_reply(rop);
            return;
        }
        uint64_t ino = direntry["ino"].uint64_value();
        kv_read_inode(self->parent, ino, [=](int res, const std::string & value, json11::Json ientry)
        {
            if (res < 0)
            {
                // A direntry pointing at a missing inode is inconsistency, not absence
                // NOTE(review): other call sites pass a positive errno (-res) to
                // vitastor_nfs_map_err(); here a negative value is passed —
                // verify that map_err normalizes the sign
                *reply = (LOOKUP3res){ .status = vitastor_nfs_map_err(res == -ENOENT ? -EIO : res) };
                rpc_queue_reply(rop);
                return;
            }
            *reply = (LOOKUP3res){
                .status = NFS3_OK,
                .resok = (LOOKUP3resok){
                    .object = xdr_copy_string(rop->xdrs, kv_fh(ino)),
                    .obj_attributes = {
                        .attributes_follow = 1,
                        .attributes = get_kv_attributes(self, ino, ientry),
                    },
                },
            };
            rpc_queue_reply(rop);
        });
    });
    return 1;
}
|
||||
|
||||
// READLINK: return the "symlink" attribute of a link-type inode.
// Non-link inodes and the root handle are rejected with NFS3ERR_INVAL.
int kv_nfs3_readlink_proc(void *opaque, rpc_op_t *rop)
{
    nfs_client_t *self = (nfs_client_t*)opaque;
    READLINK3args *args = (READLINK3args*)rop->request;
    if (self->parent->trace)
        fprintf(stderr, "[%d] READLINK %ju\n", self->nfs_fd, kv_fh_inode(args->symlink));
    READLINK3res *reply = (READLINK3res*)rop->reply;
    if (!kv_fh_valid(args->symlink) || args->symlink == NFS_ROOT_HANDLE)
    {
        // Invalid filehandle or trying to read symlink from root entry
        *reply = (READLINK3res){ .status = NFS3ERR_INVAL };
        rpc_queue_reply(rop);
        return 0;
    }
    kv_read_inode(self->parent, kv_fh_inode(args->symlink), [=](int res, const std::string & value, json11::Json attrs)
    {
        if (res < 0)
        {
            *reply = (READLINK3res){ .status = vitastor_nfs_map_err(-res) };
        }
        else if (attrs["type"] != "link")
        {
            // Not a symlink
            *reply = (READLINK3res){ .status = NFS3ERR_INVAL };
        }
        else
        {
            *reply = (READLINK3res){
                .status = NFS3_OK,
                .resok = (READLINK3resok){
                    .data = xdr_copy_string(rop->xdrs, attrs["symlink"].string_value()),
                },
            };
        }
        rpc_queue_reply(rop);
    });
    return 1;
}
|
|
@ -0,0 +1,198 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// NFS proxy over VitastorKV database - READ
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
#include "nfs_proxy.h"
|
||||
#include "nfs_kv.h"
|
||||
|
||||
// State of a READ operation (driven by nfs_kv_continue_read).
struct nfs_kv_read_state
{
    nfs_client_t *self = NULL;
    rpc_op_t *rop = NULL;
    bool allow_cache = true;  // allow a cached inode read on the first attempt
    inode_t ino = 0;
    uint64_t offset = 0, size = 0;  // requested byte range
    std::function<void(int)> cb;
    // state
    int res = 0;
    int eof = 0;  // set when the read reaches end of file
    json11::Json ientry;  // decoded inode entry
    // Requested range rounded out to pool_alignment
    uint64_t aligned_size = 0, aligned_offset = 0;
    uint8_t *aligned_buf = NULL;  // bounce buffer for the aligned read
    cluster_op_t *op = NULL;      // in-flight Vitastor read operation
    // NOTE(review): presumably points at the caller-visible data inside
    // aligned_buf — confirm against the rest of the read state machine
    uint8_t *buf = NULL;
};
|
||||
|
||||
// Round (size) down / up to the data pool alignment.
// NOTE: unhygienic macros — they expect a local variable `st` (a state struct
// with self->parent->kvfs) to be in scope at the expansion site, and the bit
// masks are only correct when pool_alignment is a power of two.
#define align_down(size) ((size) & ~(st->self->parent->kvfs->pool_alignment-1))
#define align_up(size) (((size) + st->self->parent->kvfs->pool_alignment-1) & ~(st->self->parent->kvfs->pool_alignment-1))
|
||||
|
||||
// Asynchronous state machine performing a READ.
// Two paths:
//  - "shared" path: small files packed into a shared inode. The inode entry is
//    read first; data is then read from the shared inode including the
//    shared_file_header_t, which is validated to detect the file having been
//    moved away (in which case the whole read restarts with caching disabled).
//  - plain path: data is read directly from the file's own data inode with
//    pool-aligned offset/length.
// Entered with state == 0; re-entered by callbacks with the state to resume at.
static void nfs_kv_continue_read(nfs_kv_read_state *st, int state)
{
    if (state == 0) {}
    else if (state == 1) goto resume_1;
    else if (state == 2) goto resume_2;
    else if (state == 3) goto resume_3;
    else
    {
        fprintf(stderr, "BUG: invalid state in nfs_kv_continue_read()");
        abort();
    }
resume_0:
    // Only reads that could fall inside a shared inode need the inode entry;
    // larger offsets always use the plain data-inode path below.
    if (st->offset + sizeof(shared_file_header_t) < st->self->parent->kvfs->shared_inode_threshold)
    {
        kv_read_inode(st->self->parent, st->ino, [st](int res, const std::string & value, json11::Json attrs)
        {
            st->res = res;
            st->ientry = attrs;
            nfs_kv_continue_read(st, 1);
        }, st->allow_cache);
        return;
resume_1:
        if (st->res < 0 || kv_map_type(st->ientry["type"].string_value()) != NF3REG)
        {
            // Inode read failed or target is not a regular file
            auto cb = std::move(st->cb);
            cb(st->res < 0 ? st->res : -EINVAL);
            return;
        }
        if (st->ientry["shared_ino"].uint64_value() != 0)
        {
            // File data lives inside a shared inode
            if (st->offset >= st->ientry["size"].uint64_value())
            {
                // Reading past EOF: empty result
                st->size = 0;
                st->eof = 1;
                auto cb = std::move(st->cb);
                cb(0);
                return;
            }
            st->op = new cluster_op_t;
            {
                st->op->opcode = OSD_OP_READ;
                st->op->inode = st->ientry["shared_ino"].uint64_value();
                // Always read including header to react if the file was possibly moved away
                auto read_offset = st->ientry["shared_offset"].uint64_value();
                st->op->offset = align_down(read_offset);
                if (st->op->offset < read_offset)
                {
                    // Leading alignment padding goes into the scrap block
                    st->op->iov.push_back(st->self->parent->kvfs->scrap_block.data(),
                        read_offset-st->op->offset);
                }
                auto read_size = st->offset+st->size;
                if (read_size > st->ientry["size"].uint64_value())
                {
                    // Clamp the request to the file size and report EOF
                    st->eof = 1;
                    st->size = st->ientry["size"].uint64_value()-st->offset;
                    read_size = st->ientry["size"].uint64_value();
                }
                read_size += sizeof(shared_file_header_t);
                assert(!st->aligned_buf);
                st->aligned_buf = (uint8_t*)malloc_or_die(read_size);
                st->buf = st->aligned_buf + sizeof(shared_file_header_t) + st->offset;
                st->op->iov.push_back(st->aligned_buf, read_size);
                st->op->len = align_up(read_offset+read_size) - st->op->offset;
                if (read_offset+read_size < st->op->offset+st->op->len)
                {
                    // Trailing alignment padding also goes into the scrap block
                    st->op->iov.push_back(st->self->parent->kvfs->scrap_block.data(),
                        st->op->offset+st->op->len - (read_offset+read_size));
                }
            }
            st->op->callback = [st, state](cluster_op_t *op)
            {
                // A short read is treated as an error; 0 means full success
                st->res = op->retval == op->len ? 0 : op->retval;
                delete op;
                nfs_kv_continue_read(st, 2);
            };
            st->self->parent->cli->execute(st->op);
            return;
resume_2:
            if (st->res < 0)
            {
                free(st->aligned_buf);
                st->aligned_buf = NULL;
                auto cb = std::move(st->cb);
                cb(st->res);
                return;
            }
            auto hdr = ((shared_file_header_t*)st->aligned_buf);
            if (hdr->magic != SHARED_FILE_MAGIC_V1 || hdr->inode != st->ino)
            {
                // Got unrelated data - retry from the beginning
                free(st->aligned_buf);
                st->aligned_buf = NULL;
                st->allow_cache = false;
                goto resume_0;
            }
            auto cb = std::move(st->cb);
            cb(0);
            return;
        }
    }
    // Plain path: read directly from the file's data inode, aligned to the pool
    st->aligned_offset = align_down(st->offset);
    st->aligned_size = align_up(st->offset+st->size) - st->aligned_offset;
    assert(!st->aligned_buf);
    st->aligned_buf = (uint8_t*)malloc_or_die(st->aligned_size);
    st->buf = st->aligned_buf + st->offset - st->aligned_offset;
    st->op = new cluster_op_t;
    st->op->opcode = OSD_OP_READ;
    st->op->inode = st->ino;
    st->op->offset = st->aligned_offset;
    st->op->len = st->aligned_size;
    st->op->iov.push_back(st->aligned_buf, st->aligned_size);
    st->op->callback = [st](cluster_op_t *op)
    {
        st->res = op->retval;
        delete op;
        nfs_kv_continue_read(st, 3);
    };
    st->self->parent->cli->execute(st->op);
    return;
resume_3:
    if (st->res < 0)
    {
        free(st->aligned_buf);
        st->aligned_buf = NULL;
    }
    // NOTE(review): unlike the shared path, a positive (short) retval here is
    // reported as success — presumably the cluster read is all-or-nothing; verify.
    auto cb = std::move(st->cb);
    cb(st->res < 0 ? st->res : 0);
    return;
}
|
||||
|
||||
int kv_nfs3_read_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
READ3args *args = (READ3args*)rop->request;
|
||||
READ3res *reply = (READ3res*)rop->reply;
|
||||
auto ino = kv_fh_inode(args->file);
|
||||
if (args->count > MAX_REQUEST_SIZE || !ino)
|
||||
{
|
||||
*reply = (READ3res){ .status = NFS3ERR_INVAL };
|
||||
rpc_queue_reply(rop);
|
||||
return 0;
|
||||
}
|
||||
auto st = new nfs_kv_read_state;
|
||||
st->self = (nfs_client_t*)opaque;
|
||||
st->rop = rop;
|
||||
st->ino = ino;
|
||||
st->offset = args->offset;
|
||||
st->size = args->count;
|
||||
st->cb = [st](int res)
|
||||
{
|
||||
READ3res *reply = (READ3res*)st->rop->reply;
|
||||
*reply = (READ3res){ .status = vitastor_nfs_map_err(res) };
|
||||
if (res == 0)
|
||||
{
|
||||
xdr_add_malloc(st->rop->xdrs, st->aligned_buf);
|
||||
reply->resok.data.data = (char*)st->buf;
|
||||
reply->resok.data.size = st->size;
|
||||
reply->resok.count = st->size;
|
||||
reply->resok.eof = st->eof;
|
||||
}
|
||||
rpc_queue_reply(st->rop);
|
||||
delete st;
|
||||
};
|
||||
if (st->self->parent->trace)
|
||||
fprintf(stderr, "[%d] READ %ju %ju+%ju\n", st->self->nfs_fd, st->ino, st->offset, st->size);
|
||||
nfs_kv_continue_read(st, 0);
|
||||
return 1;
|
||||
}
|
|
@ -0,0 +1,375 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// NFS proxy over VitastorKV database - READDIR, READDIRPLUS
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
#include "nfs_proxy.h"
|
||||
#include "nfs_kv.h"
|
||||
|
||||
// Round an XDR string/opaque length up to a multiple of 4 (XDR alignment).
static unsigned len_pad4(unsigned len)
{
    // Equivalent to len + ((4 - len % 4) % 4)
    return (len + 3) & ~3u;
}
|
||||
|
||||
struct nfs_kv_readdir_state
|
||||
{
|
||||
nfs_client_t *self = NULL;
|
||||
rpc_op_t *rop = NULL;
|
||||
// Request:
|
||||
bool is_plus = false;
|
||||
uint64_t cookie = 0;
|
||||
uint64_t cookieverf = 0;
|
||||
uint64_t dir_ino = 0;
|
||||
uint64_t maxcount = 0;
|
||||
std::function<void(int)> cb;
|
||||
// State:
|
||||
int res = 0;
|
||||
std::string prefix, start;
|
||||
void *list_handle;
|
||||
uint64_t parent_ino = 0;
|
||||
std::string ientry_text, parent_ientry_text;
|
||||
json11::Json ientry, parent_ientry;
|
||||
std::string cur_key, cur_value;
|
||||
int reply_size = 0;
|
||||
int to_skip = 0;
|
||||
uint64_t offset = 0;
|
||||
int getattr_running = 0, getattr_cur = 0;
|
||||
// Result:
|
||||
bool eof = false;
|
||||
//uint64_t cookieverf = 0; // same field
|
||||
std::vector<entryplus3> entries;
|
||||
};
|
||||
|
||||
static void nfs_kv_continue_readdir(nfs_kv_readdir_state *st, int state);
|
||||
|
||||
// Pump the parallel getattr pipeline for READDIRPLUS: start inode reads for
// collected entries until readdir_getattr_parallel reads are in flight.
// Each completion refills the pipeline; when the last read finishes after the
// listing itself is done (list_handle == NULL), the readdir state machine is
// resumed at state 4 to build and send the reply.
static void kv_getattr_next(nfs_kv_readdir_state *st)
{
    while (st->is_plus && st->getattr_cur < st->entries.size() && st->getattr_running < st->self->parent->kvfs->readdir_getattr_parallel)
    {
        auto idx = st->getattr_cur++;
        st->getattr_running++;
        kv_read_inode(st->self->parent, st->entries[idx].fileid, [st, idx](int res, const std::string & value, json11::Json ientry)
        {
            if (res == 0)
            {
                // On failure attributes_follow simply stays 0 for this entry
                st->entries[idx].name_attributes = (post_op_attr){
                    // FIXME: maybe do not read parent attributes and leave them to a GETATTR?
                    .attributes_follow = 1,
                    .attributes = get_kv_attributes(st->self, st->entries[idx].fileid, ientry),
                };
            }
            st->getattr_running--;
            kv_getattr_next(st);
            if (st->getattr_running == 0 && !st->list_handle)
            {
                // Listing finished and this was the last in-flight getattr
                nfs_kv_continue_readdir(st, 4);
            }
        });
    }
}
|
||||
|
||||
// Asynchronous state machine for READDIR / READDIRPLUS.
// Steps: (0) reserve header space in the reply budget, (1) add "." (cookie 0)
// and read the directory inode, (2) add ".." (cookie 1, reading the parent
// inode if any), then list KV direntries starting from the position encoded by
// (cookieverf, cookie), (3) per listed entry: budget-check, collect it and,
// for READDIRPLUS, schedule parallel getattrs, (4) once listing and getattrs
// are done, link entries into the XDR reply list and invoke the callback.
// Cookie positions are remembered in kvfs->list_cookies keyed by
// (dir_ino, cookieverf, cookie) so a follow-up request can resume by key.
static void nfs_kv_continue_readdir(nfs_kv_readdir_state *st, int state)
{
    if (state == 0) {}
    else if (state == 1) goto resume_1;
    else if (state == 2) goto resume_2;
    else if (state == 3) goto resume_3;
    else if (state == 4) goto resume_4;
    else
    {
        fprintf(stderr, "BUG: invalid state in nfs_kv_continue_readdir()");
        abort();
    }
    // Limit results based on maximum reply size
    // Sadly we have to calculate reply size by hand
    // reply without entries is 4+4+(dir_attributes ? sizeof(fattr3) : 0)+8+4 bytes
    st->reply_size = 20;
    if (st->reply_size > st->maxcount)
    {
        // Error, too small max reply size
        auto cb = std::move(st->cb);
        cb(-NFS3ERR_TOOSMALL);
        return;
    }
    // Add . and ..
    if (st->cookie <= 1)
    {
        kv_read_inode(st->self->parent, st->dir_ino, [st](int res, const std::string & value, json11::Json ientry)
        {
            st->res = res;
            st->ientry_text = value;
            st->ientry = ientry;
            nfs_kv_continue_readdir(st, 1);
        });
        return;
resume_1:
        if (st->res < 0)
        {
            auto cb = std::move(st->cb);
            cb(st->res);
            return;
        }
        if (st->cookie == 0)
        {
            // "." entry — only when listing from the very beginning
            auto fh = kv_fh(st->dir_ino);
            auto entry_size = 20 + 4/*len_pad4(".")*/ + (st->is_plus ? 8 + 88 + len_pad4(fh.size()) : 0);
            if (st->reply_size + entry_size > st->maxcount)
            {
                auto cb = std::move(st->cb);
                cb(-NFS3ERR_TOOSMALL);
                return;
            }
            entryplus3 dot = {};
            dot.name = xdr_copy_string(st->rop->xdrs, ".");
            dot.fileid = st->dir_ino;
            dot.name_attributes = (post_op_attr){
                .attributes_follow = 1,
                .attributes = get_kv_attributes(st->self, st->dir_ino, st->ientry),
            };
            dot.name_handle = (post_op_fh3){
                .handle_follows = 1,
                .handle = xdr_copy_string(st->rop->xdrs, fh),
            };
            st->entries.push_back(dot);
            st->reply_size += entry_size;
        }
        st->parent_ino = st->ientry["parent_ino"].uint64_value();
        if (st->parent_ino)
        {
            kv_read_inode(st->self->parent, st->ientry["parent_ino"].uint64_value(), [st](int res, const std::string & value, json11::Json ientry)
            {
                st->res = res;
                st->parent_ientry_text = value;
                st->parent_ientry = ientry;
                nfs_kv_continue_readdir(st, 2);
            });
            return;
resume_2:
            if (st->res < 0)
            {
                auto cb = std::move(st->cb);
                cb(st->res);
                return;
            }
        }
        // ".." entry (falls back to the directory itself when there is no parent)
        auto fh = kv_fh(st->parent_ino);
        auto entry_size = 20 + 4/*len_pad4("..")*/ + (st->is_plus ? 8 + 88 + len_pad4(fh.size()) : 0);
        if (st->reply_size + entry_size > st->maxcount)
        {
            st->eof = false;
            auto cb = std::move(st->cb);
            cb(0);
            return;
        }
        entryplus3 dotdot = {};
        dotdot.name = xdr_copy_string(st->rop->xdrs, "..");
        // NOTE(review): fileid of ".." is set to the directory itself, not the
        // parent — presumably should be st->parent_ino; verify against clients.
        dotdot.fileid = st->dir_ino;
        dotdot.name_attributes = (post_op_attr){
            // FIXME: maybe do not read parent attributes and leave them to a GETATTR?
            .attributes_follow = 1,
            .attributes = get_kv_attributes(st->self,
                st->parent_ino ? st->parent_ino : st->dir_ino,
                st->parent_ino ? st->parent_ientry : st->ientry),
        };
        dotdot.name_handle = (post_op_fh3){
            .handle_follows = 1,
            .handle = xdr_copy_string(st->rop->xdrs, fh),
        };
        st->entries.push_back(dotdot);
        st->reply_size += entry_size;
    }
    st->prefix = kv_direntry_key(st->dir_ino, "");
    st->eof = true;
    st->start = st->prefix;
    if (st->cookie > 1)
    {
        // Try to resume exactly after the entry the cookie refers to
        auto lc_it = st->self->parent->kvfs->list_cookies.find((list_cookie_t){ st->dir_ino, st->cookieverf, st->cookie });
        if (lc_it != st->self->parent->kvfs->list_cookies.end())
        {
            st->start = st->prefix+lc_it->second.key;
            st->to_skip = 1;
            st->offset = st->cookie;
        }
        else
        {
            // Unknown cookie: relist from the start and skip cookie-2 entries
            st->to_skip = st->cookie-2;
            st->offset = 2;
            st->cookieverf = ((uint64_t)lrand48() | ((uint64_t)lrand48() << 31) | ((uint64_t)lrand48() << 62));
        }
    }
    else
    {
        st->to_skip = 0;
        st->offset = 2;
        st->cookieverf = ((uint64_t)lrand48() | ((uint64_t)lrand48() << 31) | ((uint64_t)lrand48() << 62));
    }
    {
        // Garbage-collect remembered cookies of this verifier that precede the
        // current position — the client will not go backwards
        auto lc_it = st->self->parent->kvfs->list_cookies.lower_bound((list_cookie_t){ st->dir_ino, st->cookieverf, 0 });
        if (lc_it != st->self->parent->kvfs->list_cookies.end() &&
            lc_it->first.dir_ino == st->dir_ino &&
            lc_it->first.cookieverf == st->cookieverf &&
            lc_it->first.cookie < st->cookie)
        {
            auto lc_start = lc_it;
            while (lc_it != st->self->parent->kvfs->list_cookies.end() && lc_it->first.cookieverf == st->cookieverf)
            {
                lc_it++;
            }
            st->self->parent->kvfs->list_cookies.erase(lc_start, lc_it);
        }
    }
    st->getattr_cur = st->entries.size();
    st->list_handle = st->self->parent->db->list_start(st->start);
    // The callback set here is reused by every subsequent list_next(handle, NULL)
    st->self->parent->db->list_next(st->list_handle, [=](int res, const std::string & key, const std::string & value)
    {
        st->res = res;
        st->cur_key = key;
        st->cur_value = value;
        nfs_kv_continue_readdir(st, 3);
    });
    return;
    // The loop below is only ever entered through the resume_3 label
    while (st->list_handle)
    {
        st->self->parent->db->list_next(st->list_handle, NULL);
        return;
resume_3:
        if (st->res == -ENOENT || st->cur_key.size() < st->prefix.size() || st->cur_key.substr(0, st->prefix.size()) != st->prefix)
        {
            // End of listing or left the directory's key prefix
            st->self->parent->db->list_close(st->list_handle);
            st->list_handle = NULL;
            break;
        }
        if (st->to_skip > 0)
        {
            st->to_skip--;
            continue;
        }
        std::string err;
        auto direntry = json11::Json::parse(st->cur_value, err);
        if (err != "")
        {
            fprintf(stderr, "readdir: direntry %s contains invalid JSON: %s, skipping\n",
                st->cur_key.c_str(), st->cur_value.c_str());
            continue;
        }
        auto ino = direntry["ino"].uint64_value();
        auto name = kv_direntry_filename(st->cur_key);
        if (st->self->parent->trace)
        {
            fprintf(stderr, "[%d] READDIR %ju %lu %s\n",
                st->self->nfs_fd, st->dir_ino, st->offset, name.c_str());
        }
        auto fh = kv_fh(ino);
        // 1 entry3 is (8+4+(filename_len+3)/4*4+8) bytes
        // 1 entryplus3 is (8+4+(filename_len+3)/4*4+8
        // + 4+(name_attributes ? (sizeof(fattr3) = 84) : 0)
        // + 4+(name_handle ? 4+(handle_len+3)/4*4 : 0)) bytes
        auto entry_size = 20 + len_pad4(name.size()) + (st->is_plus ? 8 + 88 + len_pad4(fh.size()) : 0);
        if (st->reply_size + entry_size > st->maxcount)
        {
            // Reply is full — stop here, client will continue with the cookie
            st->eof = false;
            st->self->parent->db->list_close(st->list_handle);
            st->list_handle = NULL;
            break;
        }
        st->reply_size += entry_size;
        auto idx = st->entries.size();
        st->entries.push_back((entryplus3){});
        auto entry = &st->entries[idx];
        entry->name = xdr_copy_string(st->rop->xdrs, name);
        entry->fileid = ino;
        entry->cookie = st->offset++;
        // Remember the cookie -> filename mapping for resumption by key
        st->self->parent->kvfs->list_cookies[(list_cookie_t){ st->dir_ino, st->cookieverf, entry->cookie }] = { .key = name };
        if (st->is_plus)
        {
            entry->name_handle = (post_op_fh3){
                .handle_follows = 1,
                .handle = xdr_copy_string(st->rop->xdrs, fh),
            };
            kv_getattr_next(st);
        }
    }
resume_4:
    // Wait until all parallel getattrs are done (kv_getattr_next re-enters here)
    while (st->getattr_running > 0)
    {
        return;
    }
    // Link the collected entries into the singly-linked XDR reply list.
    // entry3 must be a binary prefix of entryplus3 for the casts to be valid.
    void *prev = NULL;
    for (int i = 0; i < st->entries.size(); i++)
    {
        entryplus3 *entry = &st->entries[i];
        if (prev)
        {
            if (st->is_plus)
                ((entryplus3*)prev)->nextentry = entry;
            else
                ((entry3*)prev)->nextentry = (entry3*)entry;
        }
        prev = entry;
    }
    // Send reply
    auto cb = std::move(st->cb);
    cb(0);
}
|
||||
|
||||
// Shared entry point for READDIR and READDIRPLUS: decode the request
// (argument layouts differ between the two), run the listing state machine
// and encode the reply from the collected entries.
static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
{
    auto st = new nfs_kv_readdir_state;
    st->self = (nfs_client_t*)opaque;
    st->rop = rop;
    st->is_plus = is_plus;
    if (st->is_plus)
    {
        READDIRPLUS3args *args = (READDIRPLUS3args*)rop->request;
        st->dir_ino = kv_fh_inode(args->dir);
        st->cookie = args->cookie;
        // cookieverf is an opaque 8-byte array on the wire; reinterpret as u64
        st->cookieverf = *((uint64_t*)args->cookieverf);
        st->maxcount = args->maxcount;
    }
    else
    {
        READDIR3args *args = ((READDIR3args*)rop->request);
        st->dir_ino = kv_fh_inode(args->dir);
        st->cookie = args->cookie;
        st->cookieverf = *((uint64_t*)args->cookieverf);
        // Plain READDIR calls the size limit "count" instead of "maxcount"
        st->maxcount = args->count;
    }
    if (st->self->parent->trace)
        fprintf(stderr, "[%d] READDIR %ju VERF %jx OFFSET %ju LIMIT %ju\n", st->self->nfs_fd, st->dir_ino, st->cookieverf, st->cookie, st->maxcount);
    st->cb = [st](int res)
    {
        // res may be a negative errno or a negated NFS status (e.g. -NFS3ERR_TOOSMALL)
        if (st->is_plus)
        {
            READDIRPLUS3res *reply = (READDIRPLUS3res*)st->rop->reply;
            *reply = (READDIRPLUS3res){ .status = vitastor_nfs_map_err(res) };
            *(uint64_t*)(reply->resok.cookieverf) = st->cookieverf;
            reply->resok.reply.entries = st->entries.size() ? &st->entries[0] : NULL;
            reply->resok.reply.eof = st->eof;
        }
        else
        {
            READDIR3res *reply = (READDIR3res*)st->rop->reply;
            *reply = (READDIR3res){ .status = vitastor_nfs_map_err(res) };
            *(uint64_t*)(reply->resok.cookieverf) = st->cookieverf;
            // entry3 is a binary prefix of entryplus3, so the cast is valid
            reply->resok.reply.entries = st->entries.size() ? (entry3*)&st->entries[0] : NULL;
            reply->resok.reply.eof = st->eof;
        }
        rpc_queue_reply(st->rop);
        delete st;
    };
    nfs_kv_continue_readdir(st, 0);
}
|
||||
|
||||
int kv_nfs3_readdir_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs3_readdir_common(opaque, rop, false);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int kv_nfs3_readdirplus_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs3_readdir_common(opaque, rop, true);
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,321 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// NFS proxy over VitastorKV database - REMOVE, RMDIR
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
#include "nfs_proxy.h"
|
||||
#include "nfs_kv.h"
|
||||
#include "cli.h"
|
||||
|
||||
// Per-request state of a REMOVE / RMDIR operation, kept alive across the
// asynchronous continuations of nfs_kv_continue_delete().
struct kv_del_state
{
    nfs_client_t *self = NULL;  // owning NFS client connection
    rpc_op_t *rop = NULL;       // RPC operation being served
    uint64_t dir_ino = 0;       // directory the entry is removed from
    std::string filename;       // name of the entry to remove
    uint64_t ino = 0;           // inode referenced by the direntry (0 if direntry JSON was invalid)
    void *list_handle = NULL;   // KV listing handle used for the RMDIR emptiness check
    std::string prefix, list_key, direntry_text, ientry_text; // raw KV values for CAS comparisons
    json11::Json direntry, ientry;
    int type = 0;               // NF3* type from the inode entry
    bool is_rmdir = false;      // RMDIR (true) vs REMOVE (false)
    bool rm_data = false;       // NOTE(review): never set in the visible code — possibly vestigial
    bool allow_cache = true;    // cleared to force fresh reads after a CAS conflict
    int res = 0, res2 = 0;      // results of the last async step(s)
    std::function<void(int)> cb; // completion callback: 0 on success, negative errno on failure
};
|
||||
|
||||
// Asynchronous state machine implementing REMOVE and RMDIR.
static void nfs_kv_continue_delete(kv_del_state *st, int state)
{
    // Overall algorithm:
    // 1) Get inode attributes and check that it's not a directory (REMOVE)
    // 2) Get inode attributes and check that it is a directory (RMDIR)
    // 3) Delete direntry with CAS
    // 4) Check that the directory didn't contain files (RMDIR) and restore it if it did
    // 5) Reduce inode refcount by 1 or delete inode
    // 6) If regular file and inode is deleted: delete data
    if (state == 0) {}
    else if (state == 1) goto resume_1;
    else if (state == 2) goto resume_2;
    else if (state == 3) goto resume_3;
    else if (state == 4) goto resume_4;
    else if (state == 5) goto resume_5;
    else if (state == 6) goto resume_6;
    else if (state == 7) goto resume_7;
    else
    {
        fprintf(stderr, "BUG: invalid state in nfs_kv_continue_delete()");
        abort();
    }
resume_0:
    // Read the direntry being removed; its raw text is kept for the CAS delete
    st->self->parent->db->get(kv_direntry_key(st->dir_ino, st->filename), [st](int res, const std::string & value)
    {
        st->res = res;
        st->direntry_text = value;
        nfs_kv_continue_delete(st, 1);
    }, st->allow_cache);
    return;
resume_1:
    if (st->res < 0)
    {
        auto cb = std::move(st->cb);
        cb(st->res);
        return;
    }
    {
        std::string err;
        st->direntry = json11::Json::parse(st->direntry_text, err);
        if (err != "")
        {
            fprintf(stderr, "Invalid JSON in direntry %s = %s: %s, deleting\n",
                kv_direntry_key(st->dir_ino, st->filename).c_str(), st->direntry_text.c_str(), err.c_str());
            // Just delete direntry and skip inode
        }
        else
        {
            st->ino = st->direntry["ino"].uint64_value();
        }
    }
    // Get inode
    st->self->parent->db->get(kv_inode_key(st->ino), [st](int res, const std::string & value)
    {
        st->res = res;
        st->ientry_text = value;
        nfs_kv_continue_delete(st, 2);
    }, st->allow_cache);
    return;
resume_2:
    if (st->res < 0)
    {
        fprintf(stderr, "error reading inode %s: %s (code %d)\n",
            kv_inode_key(st->ino).c_str(), strerror(-st->res), st->res);
        auto cb = std::move(st->cb);
        cb(st->res);
        return;
    }
    {
        std::string err;
        st->ientry = json11::Json::parse(st->ientry_text, err);
        if (err != "")
        {
            fprintf(stderr, "Invalid JSON in inode %s = %s: %s, treating as a regular file\n",
                kv_inode_key(st->ino).c_str(), st->ientry_text.c_str(), err.c_str());
        }
    }
    // (1-2) Check type
    st->type = kv_map_type(st->ientry["type"].string_value());
    if (st->type == -1 || st->is_rmdir != (st->type == NF3DIR))
    {
        // RMDIR on a non-directory or REMOVE on a directory
        auto cb = std::move(st->cb);
        cb(st->is_rmdir ? -ENOTDIR : -EISDIR);
        return;
    }
    // (3) Delete direntry with CAS
    st->self->parent->db->del(kv_direntry_key(st->dir_ino, st->filename), [st](int res)
    {
        st->res = res;
        nfs_kv_continue_delete(st, 3);
    }, [st](int res, const std::string & value)
    {
        // CAS condition: only delete if the direntry is still what we read
        return value == st->direntry_text;
    });
    return;
resume_3:
    if (st->res == -EAGAIN)
    {
        // CAS failure, restart from the beginning
        st->allow_cache = false;
        goto resume_0;
    }
    else if (st->res < 0 && st->res != -ENOENT)
    {
        fprintf(stderr, "failed to remove direntry %s: %s (code %d)\n",
            kv_direntry_key(st->dir_ino, st->filename).c_str(), strerror(-st->res), st->res);
        auto cb = std::move(st->cb);
        cb(st->res);
        return;
    }
    if (!st->ino)
    {
        // direntry contained invalid JSON and was deleted, finish
        auto cb = std::move(st->cb);
        cb(0);
        return;
    }
    if (st->is_rmdir)
    {
        // (4) Check if directory actually is not empty
        st->list_handle = st->self->parent->db->list_start(kv_direntry_key(st->ino, ""));
        st->self->parent->db->list_next(st->list_handle, [st](int res, const std::string & key, const std::string & value)
        {
            st->res = res;
            st->list_key = key;
            st->self->parent->db->list_close(st->list_handle);
            nfs_kv_continue_delete(st, 4);
        });
        return;
resume_4:
        st->prefix = kv_direntry_key(st->ino, "");
        if (st->res == -ENOENT || st->list_key.size() < st->prefix.size() || st->list_key.substr(0, st->prefix.size()) != st->prefix)
        {
            // OK, directory is empty
        }
        else
        {
            // Not OK, restore direntry
            // NOTE(review): this calls del(), but per the "failed to restore"
            // message below the intent is to put the direntry BACK — presumably
            // this should be set(key, direntry_text) with a not-exists CAS; verify.
            st->self->parent->db->del(kv_direntry_key(st->dir_ino, st->filename), [st](int res)
            {
                st->res2 = res;
                nfs_kv_continue_delete(st, 5);
            }, [st](int res, const std::string & value)
            {
                return res == -ENOENT;
            });
            return;
resume_5:
            if (st->res2 < 0)
            {
                fprintf(stderr, "failed to restore direntry %s (%s): %s (code %d)",
                    kv_direntry_key(st->dir_ino, st->filename).c_str(), st->direntry_text.c_str(), strerror(-st->res2), st->res2);
                fprintf(stderr, " - inode %ju may be left as garbage\n", st->ino);
            }
            if (st->res < 0)
            {
                fprintf(stderr, "failed to list entries from %s: %s (code %d)\n",
                    kv_direntry_key(st->ino, "").c_str(), strerror(-st->res), st->res);
            }
            auto cb = std::move(st->cb);
            cb(st->res < 0 ? st->res : -ENOTEMPTY);
            return;
        }
    }
    // (5) Reduce inode refcount by 1 or delete inode
    if (st->ientry["nlink"].uint64_value() > 1)
    {
        // Hard links remain: just decrement nlink (with CAS on the old entry)
        auto copy = st->ientry.object_items();
        copy["nlink"] = st->ientry["nlink"].uint64_value()-1;
        copy["ctime"] = nfstime_now_str();
        st->self->parent->db->set(kv_inode_key(st->ino), json11::Json(copy).dump(), [st](int res)
        {
            st->res = res;
            nfs_kv_continue_delete(st, 6);
        }, [st](int res, const std::string & old_value)
        {
            return old_value == st->ientry_text;
        });
    }
    else
    {
        // Last link: delete the inode entry itself (with CAS on the old entry)
        st->self->parent->kvfs->touch_queue.erase(st->ino);
        st->self->parent->db->del(kv_inode_key(st->ino), [st](int res)
        {
            st->res = res;
            nfs_kv_continue_delete(st, 6);
        }, [st](int res, const std::string & old_value)
        {
            return old_value == st->ientry_text;
        });
    }
    return;
resume_6:
    if (st->res < 0)
    {
        // Assume EAGAIN is OK, maybe someone created a hard link in the meantime
        auto cb = std::move(st->cb);
        cb(st->res == -EAGAIN ? 0 : st->res);
        return;
    }
    // (6) If regular file and inode is deleted: delete data
    if ((!st->type || st->type == NF3REG) && st->ientry["nlink"].uint64_value() <= 1 &&
        !st->ientry["shared_ino"].uint64_value())
    {
        // Remove data
        st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object {
            { "inode", INODE_NO_POOL(st->ino) },
            { "pool", (uint64_t)INODE_POOL(st->ino) },
        }), [st](const cli_result_t & r)
        {
            if (r.err)
            {
                fprintf(stderr, "Failed to remove inode %jx data: %s (code %d)\n",
                    st->ino, r.text.c_str(), r.err);
            }
            st->res = r.err;
            nfs_kv_continue_delete(st, 7);
        });
        return;
resume_7:
        auto cb = std::move(st->cb);
        cb(st->res);
        return;
    }
    if (!st->res)
    {
        // Schedule mtime/ctime update of the parent directory
        st->self->parent->kvfs->touch_queue.insert(st->dir_ino);
    }
    auto cb = std::move(st->cb);
    cb(0);
}
|
||||
|
||||
int kv_nfs3_remove_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
kv_del_state *st = new kv_del_state;
|
||||
st->self = (nfs_client_t*)opaque;
|
||||
st->rop = rop;
|
||||
REMOVE3res *reply = (REMOVE3res*)rop->reply;
|
||||
REMOVE3args *args = (REMOVE3args*)rop->request;
|
||||
st->dir_ino = kv_fh_inode(args->object.dir);
|
||||
st->filename = args->object.name;
|
||||
if (st->self->parent->trace)
|
||||
fprintf(stderr, "[%d] REMOVE %ju/%s\n", st->self->nfs_fd, st->dir_ino, st->filename.c_str());
|
||||
if (!st->dir_ino)
|
||||
{
|
||||
*reply = (REMOVE3res){ .status = NFS3ERR_INVAL };
|
||||
rpc_queue_reply(rop);
|
||||
delete st;
|
||||
return 0;
|
||||
}
|
||||
st->cb = [st](int res)
|
||||
{
|
||||
*((REMOVE3res*)st->rop->reply) = (REMOVE3res){
|
||||
.status = vitastor_nfs_map_err(res),
|
||||
};
|
||||
rpc_queue_reply(st->rop);
|
||||
delete st;
|
||||
};
|
||||
nfs_kv_continue_delete(st, 0);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int kv_nfs3_rmdir_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
kv_del_state *st = new kv_del_state;
|
||||
st->self = (nfs_client_t*)opaque;
|
||||
st->rop = rop;
|
||||
RMDIR3args *args = (RMDIR3args*)rop->request;
|
||||
RMDIR3res *reply = (RMDIR3res*)rop->reply;
|
||||
st->dir_ino = kv_fh_inode(args->object.dir);
|
||||
st->filename = args->object.name;
|
||||
st->is_rmdir = true;
|
||||
if (st->self->parent->trace)
|
||||
fprintf(stderr, "[%d] RMDIR %ju/%s\n", st->self->nfs_fd, st->dir_ino, st->filename.c_str());
|
||||
if (!st->dir_ino)
|
||||
{
|
||||
*reply = (RMDIR3res){ .status = NFS3ERR_INVAL };
|
||||
rpc_queue_reply(rop);
|
||||
delete st;
|
||||
return 0;
|
||||
}
|
||||
st->cb = [st](int res)
|
||||
{
|
||||
*((RMDIR3res*)st->rop->reply) = (RMDIR3res){
|
||||
.status = vitastor_nfs_map_err(res),
|
||||
};
|
||||
rpc_queue_reply(st->rop);
|
||||
delete st;
|
||||
};
|
||||
nfs_kv_continue_delete(st, 0);
|
||||
return 1;
|
||||
}
|
|
@ -0,0 +1,401 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// NFS proxy over VitastorKV database - RENAME
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
#include "nfs_proxy.h"
|
||||
#include "nfs_kv.h"
|
||||
#include "cli.h"
|
||||
|
||||
// Per-request state of a RENAME operation, kept alive across the
// asynchronous continuations of nfs_kv_continue_rename().
struct nfs_kv_rename_state
{
    nfs_client_t *self = NULL;  // owning NFS client connection
    rpc_op_t *rop = NULL;       // RPC operation being served
    // params:
    uint64_t old_dir_ino = 0, new_dir_ino = 0; // source / destination directory inodes
    std::string old_name, new_name;            // source / destination entry names
    // state:
    bool allow_cache = true;    // cleared to force fresh reads after a CAS conflict
    std::string old_direntry_text, old_ientry_text, new_direntry_text, new_ientry_text; // raw KV values kept for CAS
    json11::Json old_direntry, old_ientry, new_direntry, new_ientry;
    std::string new_dir_prefix; // direntry key prefix of the destination directory
    void *list_handle = NULL;   // KV listing handle (destination-directory emptiness check)
    bool new_exists = false;    // destination direntry already existed
    bool rm_dest_data = false;  // destination inode's data must be removed after overwrite
    int res = 0, res2 = 0;      // results of the last async step(s)
    std::function<void(int)> cb; // completion callback: 0 on success, negative errno on failure
};
|
||||
|
||||
// Coroutine-style state machine implementing RENAME over the KV metadata store.
// Each async DB call stores its result in st->res and re-enters this function
// with the next state number; `goto resume_N` jumps back to where it left off.
// Every write uses a CAS callback comparing against the previously read value;
// on CAS failure (-EAGAIN) the affected phase restarts with caching disabled.
// On any terminal path, st->cb is moved out and invoked exactly once; it owns
// the deletion of st (see kv_nfs3_rename_proc), so no code may touch st after.
static void nfs_kv_continue_rename(nfs_kv_rename_state *st, int state)
{
    // Algorithm (non-atomic of course):
    // 1) Read source direntry
    // 2) Read destination direntry
    // 3) If destination exists:
    // 3.1) Check file/folder compatibility (EISDIR/ENOTDIR)
    // 3.2) Check if destination is empty if it's a folder
    // 4) If not:
    // 4.1) Check that the destination directory is actually a directory
    // 5) Overwrite destination direntry, restart from beginning if CAS failure
    // 6) Delete source direntry, restart from beginning if CAS failure
    // 7) If the moved direntry was a regular file:
    // 7.1) Read inode
    // 7.2) Delete inode if its link count <= 1
    // 7.3) Delete inode data if its link count <= 1 and it's a regular non-shared file
    // 7.4) Reduce link count by 1 if it's > 1
    // 8) If the moved direntry is a directory:
    // 8.1) Change parent_ino reference in its inode
    if (state == 0) {}
    else if (state == 1) goto resume_1;
    else if (state == 2) goto resume_2;
    else if (state == 3) goto resume_3;
    else if (state == 4) goto resume_4;
    else if (state == 5) goto resume_5;
    else if (state == 6) goto resume_6;
    else if (state == 7) goto resume_7;
    else if (state == 8) goto resume_8;
    else if (state == 9) goto resume_9;
    else if (state == 10) goto resume_10;
    else if (state == 11) goto resume_11;
    else if (state == 12) goto resume_12;
    else
    {
        fprintf(stderr, "BUG: invalid state in nfs_kv_continue_rename()");
        abort();
    }
resume_0:
    // Read the old direntry
    st->self->parent->db->get(kv_direntry_key(st->old_dir_ino, st->old_name), [=](int res, const std::string & value)
    {
        st->res = res;
        st->old_direntry_text = value;
        nfs_kv_continue_rename(st, 1);
    }, st->allow_cache);
    return;
resume_1:
    if (st->res < 0)
    {
        // Source direntry missing or read error - abort with that error
        auto cb = std::move(st->cb);
        cb(st->res);
        return;
    }
    {
        std::string err;
        st->old_direntry = json11::Json::parse(st->old_direntry_text, err);
        if (err != "")
        {
            // Corrupted metadata is reported as I/O error to the NFS client
            fprintf(stderr, "Invalid JSON in direntry %s = %s: %s\n",
                kv_direntry_key(st->old_dir_ino, st->old_name).c_str(),
                st->old_direntry_text.c_str(), err.c_str());
            auto cb = std::move(st->cb);
            cb(-EIO);
            return;
        }
    }
    // Read the new direntry
    st->self->parent->db->get(kv_direntry_key(st->new_dir_ino, st->new_name), [=](int res, const std::string & value)
    {
        st->res = res;
        st->new_direntry_text = value;
        nfs_kv_continue_rename(st, 2);
    }, st->allow_cache);
    return;
resume_2:
    if (st->res < 0 && st->res != -ENOENT)
    {
        // -ENOENT just means the destination doesn't exist, which is fine
        auto cb = std::move(st->cb);
        cb(st->res);
        return;
    }
    if (st->res == 0)
    {
        std::string err;
        st->new_direntry = json11::Json::parse(st->new_direntry_text, err);
        if (err != "")
        {
            fprintf(stderr, "Invalid JSON in direntry %s = %s: %s\n",
                kv_direntry_key(st->new_dir_ino, st->new_name).c_str(),
                st->new_direntry_text.c_str(), err.c_str());
            auto cb = std::move(st->cb);
            cb(-EIO);
            return;
        }
    }
    st->new_exists = st->res == 0;
    if (st->new_exists)
    {
        // Check file/folder compatibility (EISDIR/ENOTDIR)
        if ((st->old_direntry["type"] == "dir") != (st->new_direntry["type"] == "dir"))
        {
            // NOTE(review): per POSIX rename(2), replacing a non-directory with
            // a directory path should yield EISDIR (new is a dir) and the reverse
            // ENOTDIR (new is not a dir); this mapping looks inverted - confirm
            // against the intended NFS semantics before relying on it.
            auto cb = std::move(st->cb);
            cb((st->new_direntry["type"] == "dir") ? -ENOTDIR : -EISDIR);
            return;
        }
        if (st->new_direntry["type"] == "dir")
        {
            // Check that the destination directory is empty
            st->new_dir_prefix = kv_direntry_key(st->new_direntry["ino"].uint64_value(), "");
            st->list_handle = st->self->parent->db->list_start(st->new_dir_prefix);
            st->self->parent->db->list_next(st->list_handle, [st](int res, const std::string & key, const std::string & value)
            {
                st->res = res;
                nfs_kv_continue_rename(st, 3);
            });
            return;
resume_3:
            st->self->parent->db->list_close(st->list_handle);
            if (st->res != -ENOENT)
            {
                // Any listed entry (res != -ENOENT) means the directory is not empty
                auto cb = std::move(st->cb);
                cb(-ENOTEMPTY);
                return;
            }
        }
    }
    else
    {
        // Check that the new directory is actually a directory
        kv_read_inode(st->self->parent, st->new_dir_ino, [st](int res, const std::string & value, json11::Json attrs)
        {
            st->res = res == 0 ? (attrs["type"].string_value() == "dir" ? 0 : -ENOTDIR) : res;
            nfs_kv_continue_rename(st, 4);
        });
        return;
resume_4:
        if (st->res < 0)
        {
            auto cb = std::move(st->cb);
            cb(st->res);
            return;
        }
    }
    // Write the new direntry
    st->self->parent->db->set(kv_direntry_key(st->new_dir_ino, st->new_name), st->old_direntry_text, [st](int res)
    {
        st->res = res;
        nfs_kv_continue_rename(st, 5);
    }, [st](int res, const std::string & old_value)
    {
        // CAS: if the destination existed, it must still hold the value we read;
        // otherwise the key must still be absent
        return st->new_exists ? (old_value == st->new_direntry_text) : (res == -ENOENT);
    });
    return;
resume_5:
    if (st->res == -EAGAIN)
    {
        // CAS failure
        st->allow_cache = false;
        goto resume_0;
    }
    if (st->res < 0)
    {
        auto cb = std::move(st->cb);
        cb(st->res);
        return;
    }
    // Delete the old direntry
    st->self->parent->db->del(kv_direntry_key(st->old_dir_ino, st->old_name), [st](int res)
    {
        st->res = res;
        nfs_kv_continue_rename(st, 6);
    }, [=](int res, const std::string & old_value)
    {
        // CAS: only delete the source if it still holds the value we moved
        return res == 0 && old_value == st->old_direntry_text;
    });
    return;
resume_6:
    if (st->res == -EAGAIN)
    {
        // CAS failure
        st->allow_cache = false;
        goto resume_0;
    }
    if (st->res < 0)
    {
        auto cb = std::move(st->cb);
        cb(st->res);
        return;
    }
    st->allow_cache = true;
resume_7again:
    // Re-entry point for CAS retries of the destination-inode update below
    if (st->new_exists && st->new_direntry["type"].string_value() != "dir")
    {
        // (Maybe) delete old destination file data
        kv_read_inode(st->self->parent, st->new_direntry["ino"].uint64_value(), [st](int res, const std::string & value, json11::Json attrs)
        {
            st->res = res;
            st->new_ientry_text = value;
            st->new_ientry = attrs;
            nfs_kv_continue_rename(st, 7);
        }, st->allow_cache);
        return;
resume_7:
        if (st->res == 0)
        {
            // (5) Reduce inode refcount by 1 or delete inode
            if (st->new_ientry["nlink"].uint64_value() > 1)
            {
                // Still hard-linked elsewhere - just decrement nlink
                auto copy = st->new_ientry.object_items();
                copy["nlink"] = st->new_ientry["nlink"].uint64_value()-1;
                copy["ctime"] = nfstime_now_str();
                copy.erase("verf");
                st->self->parent->db->set(kv_inode_key(st->new_direntry["ino"].uint64_value()), json11::Json(copy).dump(), [st](int res)
                {
                    st->res = res;
                    nfs_kv_continue_rename(st, 8);
                }, [st](int res, const std::string & old_value)
                {
                    return old_value == st->new_ientry_text;
                });
            }
            else
            {
                // Last link - remember whether data must be removed, then drop the inode
                st->rm_dest_data = kv_map_type(st->new_ientry["type"].string_value()) == NF3REG
                    && !st->new_ientry["shared_ino"].uint64_value();
                st->self->parent->db->del(kv_inode_key(st->new_direntry["ino"].uint64_value()), [st](int res)
                {
                    st->res = res;
                    nfs_kv_continue_rename(st, 8);
                }, [st](int res, const std::string & old_value)
                {
                    return old_value == st->new_ientry_text;
                });
            }
            return;
resume_8:
            if (st->res == -EAGAIN)
            {
                // CAS failure - re-read inode
                st->allow_cache = false;
                goto resume_7again;
            }
            if (st->res < 0)
            {
                auto cb = std::move(st->cb);
                cb(st->res);
                return;
            }
            // Delete inode data if required
            if (st->rm_dest_data)
            {
                st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object {
                    { "inode", INODE_NO_POOL(st->new_direntry["ino"].uint64_value()) },
                    { "pool", (uint64_t)INODE_POOL(st->new_direntry["ino"].uint64_value()) },
                }), [st](const cli_result_t & r)
                {
                    if (r.err)
                    {
                        fprintf(stderr, "Failed to remove inode %jx data: %s (code %d)\n",
                            st->new_direntry["ino"].uint64_value(), r.text.c_str(), r.err);
                    }
                    st->res = r.err;
                    nfs_kv_continue_rename(st, 9);
                });
                return;
resume_9:
                if (st->res < 0)
                {
                    auto cb = std::move(st->cb);
                    cb(st->res);
                    return;
                }
            }
        }
    }
    if (st->old_direntry["type"].string_value() == "dir" && st->new_dir_ino != st->old_dir_ino)
    {
        // Change parent_ino in old ientry
        st->allow_cache = true;
resume_10:
        kv_read_inode(st->self->parent, st->old_direntry["ino"].uint64_value(), [st](int res, const std::string & value, json11::Json ientry)
        {
            st->res = res;
            st->old_ientry_text = value;
            st->old_ientry = ientry;
            nfs_kv_continue_rename(st, 11);
        }, st->allow_cache);
        return;
resume_11:
        if (st->res < 0)
        {
            auto cb = std::move(st->cb);
            cb(st->res);
            return;
        }
        {
            auto ientry_new = st->old_ientry.object_items();
            ientry_new["parent_ino"] = st->new_dir_ino;
            ientry_new["ctime"] = nfstime_now_str();
            // "verf" is a cached attribute verifier - drop it so it gets regenerated
            ientry_new.erase("verf");
            st->self->parent->db->set(kv_inode_key(st->old_direntry["ino"].uint64_value()), json11::Json(ientry_new).dump(), [st](int res)
            {
                st->res = res;
                nfs_kv_continue_rename(st, 12);
            }, [st](int res, const std::string & old_value)
            {
                return old_value == st->old_ientry_text;
            });
        }
        return;
resume_12:
        if (st->res == -EAGAIN)
        {
            // CAS failure - try again
            st->allow_cache = false;
            goto resume_10;
        }
        if (st->res < 0)
        {
            auto cb = std::move(st->cb);
            cb(st->res);
            return;
        }
    }
    if (!st->res)
    {
        // Success - schedule mtime/ctime updates for both parent directories
        st->self->parent->kvfs->touch_queue.insert(st->old_dir_ino);
        st->self->parent->kvfs->touch_queue.insert(st->new_dir_ino);
    }
    auto cb = std::move(st->cb);
    cb(st->res);
}
|
||||
|
||||
int kv_nfs3_rename_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
auto st = new nfs_kv_rename_state;
|
||||
st->self = (nfs_client_t*)opaque;
|
||||
st->rop = rop;
|
||||
RENAME3args *args = (RENAME3args*)rop->request;
|
||||
st->old_dir_ino = kv_fh_inode(args->from.dir);
|
||||
st->new_dir_ino = kv_fh_inode(args->to.dir);
|
||||
st->old_name = args->from.name;
|
||||
st->new_name = args->to.name;
|
||||
if (st->self->parent->trace)
|
||||
fprintf(stderr, "[%d] RENAME %ju/%s -> %ju/%s\n", st->self->nfs_fd, st->old_dir_ino, st->old_name.c_str(), st->new_dir_ino, st->new_name.c_str());
|
||||
if (!st->old_dir_ino || !st->new_dir_ino || st->old_name == "" || st->new_name == "")
|
||||
{
|
||||
RENAME3res *reply = (RENAME3res*)rop->reply;
|
||||
*reply = (RENAME3res){ .status = NFS3ERR_INVAL };
|
||||
rpc_queue_reply(rop);
|
||||
delete st;
|
||||
return 0;
|
||||
}
|
||||
if (st->old_dir_ino == st->new_dir_ino && st->old_name == st->new_name)
|
||||
{
|
||||
RENAME3res *reply = (RENAME3res*)rop->reply;
|
||||
*reply = (RENAME3res){ .status = NFS3_OK };
|
||||
rpc_queue_reply(st->rop);
|
||||
delete st;
|
||||
return 0;
|
||||
}
|
||||
st->cb = [st](int res)
|
||||
{
|
||||
RENAME3res *reply = (RENAME3res*)st->rop->reply;
|
||||
*reply = (RENAME3res){ .status = vitastor_nfs_map_err(res) };
|
||||
rpc_queue_reply(st->rop);
|
||||
delete st;
|
||||
};
|
||||
nfs_kv_continue_rename(st, 0);
|
||||
return 1;
|
||||
}
|
|
@ -0,0 +1,204 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// NFS proxy over VitastorKV database - SETATTR
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
#include "nfs_proxy.h"
|
||||
#include "nfs_kv.h"
|
||||
#include "cli.h"
|
||||
|
||||
// Per-request state for an asynchronous SETATTR operation.
// Allocated in kv_nfs3_setattr_proc and freed by the completion callback `cb`.
struct nfs_kv_setattr_state
{
    nfs_client_t *self = NULL;            // owning NFS client connection
    rpc_op_t *rop = NULL;                 // RPC operation being answered
    uint64_t ino = 0;                     // target inode number (from the file handle)
    uint64_t old_size = 0, new_size = 0;  // sizes before/after when "size" is being set
    std::string expected_ctime;           // non-empty when the sattrguard3 ctime check is requested
    json11::Json::object set_attrs;       // attributes requested by the client
    int res = 0, cas_res = 0;             // last operation result / last CAS-callback result
    std::string ientry_text;              // raw inode entry as read (used as CAS comparand)
    json11::Json ientry;                  // parsed inode entry
    json11::Json::object new_attrs;       // merged attributes written back
    std::function<void(int)> cb;          // completion callback; deletes this state
};
|
||||
|
||||
// State machine for SETATTR: read inode -> validate -> CAS-write merged attrs
// -> optionally trim file data when shrinking. Re-entered with the next state
// number by each async callback; retries from resume_0 on CAS failure (-EAGAIN).
static void nfs_kv_continue_setattr(nfs_kv_setattr_state *st, int state)
{
    // FIXME: NFS client does a lot of setattr calls, so maybe process them asynchronously
    if (state == 0) {}
    else if (state == 1) goto resume_1;
    else if (state == 2) goto resume_2;
    else if (state == 3) goto resume_3;
    else
    {
        fprintf(stderr, "BUG: invalid state in nfs_kv_continue_setattr()");
        abort();
    }
    // Cancel any pending deferred mtime update for this inode - we're writing it now
    st->self->parent->kvfs->touch_queue.erase(st->ino);
resume_0:
    kv_read_inode(st->self->parent, st->ino, [st](int res, const std::string & value, json11::Json attrs)
    {
        st->res = res;
        st->ientry_text = value;
        st->ientry = attrs;
        nfs_kv_continue_setattr(st, 1);
    });
    return;
resume_1:
    if (st->res < 0)
    {
        auto cb = std::move(st->cb);
        cb(st->res);
        return;
    }
    // "size" may only be set on regular files (type "file" or legacy entries with no type)
    if (st->ientry["type"].string_value() != "file" &&
        st->ientry["type"].string_value() != "" &&
        !st->set_attrs["size"].is_null())
    {
        auto cb = std::move(st->cb);
        cb(-EINVAL);
        return;
    }
    if (st->expected_ctime != "")
    {
        // sattrguard3: only apply if the object's ctime matches what the client saw
        auto actual_ctime = (st->ientry["ctime"].is_null() ? st->ientry["mtime"] : st->ientry["ctime"]);
        if (actual_ctime != st->expected_ctime)
        {
            // NOTE(review): NFS3ERR_NOT_SYNC is a positive NFS status passed into a
            // callback that elsewhere receives negative errno values and maps them
            // via vitastor_nfs_map_err() - verify it isn't collapsed to NFS3ERR_IO.
            auto cb = std::move(st->cb);
            cb(NFS3ERR_NOT_SYNC);
            return;
        }
    }
    // Now we can update it
    st->new_attrs = st->ientry.object_items();
    st->old_size = st->ientry["size"].uint64_value();
    for (auto & kv: st->set_attrs)
    {
        if (kv.first == "size")
        {
            st->new_size = kv.second.uint64_value();
        }
        st->new_attrs[kv.first] = kv.second;
    }
    // Drop the cached attribute verifier and stamp the change time
    st->new_attrs.erase("verf");
    st->new_attrs["ctime"] = nfstime_now_str();
    st->self->parent->db->set(kv_inode_key(st->ino), json11::Json(st->new_attrs).dump(), [st](int res)
    {
        st->res = res;
        nfs_kv_continue_setattr(st, 2);
    }, [st](int res, const std::string & cas_value)
    {
        st->cas_res = res;
        // CAS: inode must be unchanged; the root inode is allowed to not exist yet
        return (res == 0 || res == -ENOENT && st->ino == KV_ROOT_INODE) && cas_value == st->ientry_text;
    });
    return;
resume_2:
    if (st->cas_res == -ENOENT)
    {
        // Inode vanished between read and write
        st->res = -ENOENT;
    }
    if (st->res == -EAGAIN)
    {
        // Retry
        goto resume_0;
    }
    if (st->res < 0)
    {
        fprintf(stderr, "Failed to update inode %ju: %s (code %d)\n", st->ino, strerror(-st->res), st->res);
        auto cb = std::move(st->cb);
        cb(st->res);
        return;
    }
    if (!st->set_attrs["size"].is_null() &&
        st->ientry["size"].uint64_value() > st->set_attrs["size"].uint64_value() &&
        !st->ientry["shared_ino"].uint64_value())
    {
        // Delete extra data when downsizing
        st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object {
            { "inode", INODE_NO_POOL(st->ino) },
            { "pool", (uint64_t)INODE_POOL(st->ino) },
            { "min_offset", st->set_attrs["size"].uint64_value() },
        }), [st](const cli_result_t & r)
        {
            if (r.err)
            {
                fprintf(stderr, "Failed to truncate inode %ju: %s (code %d)\n",
                    st->ino, r.text.c_str(), r.err);
            }
            st->res = r.err;
            nfs_kv_continue_setattr(st, 3);
        });
        return;
    }
resume_3:
    auto cb = std::move(st->cb);
    cb(st->res);
}
|
||||
|
||||
int kv_nfs3_setattr_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_kv_setattr_state *st = new nfs_kv_setattr_state;
|
||||
st->self = (nfs_client_t*)opaque;
|
||||
st->rop = rop;
|
||||
auto args = (SETATTR3args*)rop->request;
|
||||
auto reply = (SETATTR3res*)rop->reply;
|
||||
std::string fh = args->object;
|
||||
if (!kv_fh_valid(fh))
|
||||
{
|
||||
*reply = (SETATTR3res){ .status = NFS3ERR_INVAL };
|
||||
rpc_queue_reply(rop);
|
||||
delete st;
|
||||
return 0;
|
||||
}
|
||||
st->ino = kv_fh_inode(fh);
|
||||
if (args->guard.check)
|
||||
st->expected_ctime = nfstime_to_str(args->guard.obj_ctime);
|
||||
if (args->new_attributes.size.set_it)
|
||||
st->set_attrs["size"] = args->new_attributes.size.size;
|
||||
if (args->new_attributes.mode.set_it)
|
||||
st->set_attrs["mode"] = (uint64_t)args->new_attributes.mode.mode;
|
||||
if (args->new_attributes.uid.set_it)
|
||||
st->set_attrs["uid"] = (uint64_t)args->new_attributes.uid.uid;
|
||||
if (args->new_attributes.gid.set_it)
|
||||
st->set_attrs["gid"] = (uint64_t)args->new_attributes.gid.gid;
|
||||
if (args->new_attributes.atime.set_it == SET_TO_SERVER_TIME)
|
||||
st->set_attrs["atime"] = nfstime_now_str();
|
||||
else if (args->new_attributes.atime.set_it == SET_TO_CLIENT_TIME)
|
||||
st->set_attrs["atime"] = nfstime_to_str(args->new_attributes.atime.atime);
|
||||
if (args->new_attributes.mtime.set_it == SET_TO_SERVER_TIME)
|
||||
st->set_attrs["mtime"] = nfstime_now_str();
|
||||
else if (args->new_attributes.mtime.set_it == SET_TO_CLIENT_TIME)
|
||||
st->set_attrs["mtime"] = nfstime_to_str(args->new_attributes.mtime.mtime);
|
||||
if (st->self->parent->trace)
|
||||
fprintf(stderr, "[%d] SETATTR %ju ATTRS %s\n", st->self->nfs_fd, st->ino, json11::Json(st->set_attrs).dump().c_str());
|
||||
st->cb = [st](int res)
|
||||
{
|
||||
auto reply = (SETATTR3res*)st->rop->reply;
|
||||
if (res < 0)
|
||||
{
|
||||
*reply = (SETATTR3res){
|
||||
.status = vitastor_nfs_map_err(res),
|
||||
};
|
||||
}
|
||||
else
|
||||
{
|
||||
*reply = (SETATTR3res){
|
||||
.status = NFS3_OK,
|
||||
.resok = (SETATTR3resok){
|
||||
.obj_wcc = (wcc_data){
|
||||
.after = (post_op_attr){
|
||||
.attributes_follow = 1,
|
||||
.attributes = get_kv_attributes(st->self, st->ino, st->new_attrs),
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
}
|
||||
rpc_queue_reply(st->rop);
|
||||
delete st;
|
||||
};
|
||||
nfs_kv_continue_setattr(st, 0);
|
||||
return 1;
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,126 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// NFS proxy - common NULL, ACCESS, COMMIT, DUMP, EXPORT, MNT, UMNT, UMNTALL
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
#include "nfs_proxy.h"
|
||||
#include "nfs/nfs.h"
|
||||
|
||||
// Translate a (possibly negative) errno value into the corresponding NFSv3
// status code. Unknown non-zero errors collapse to NFS3ERR_IO; 0 maps to OK.
nfsstat3 vitastor_nfs_map_err(int err)
{
    if (err < 0)
    {
        err = -err;
    }
    switch (err)
    {
        case 0:         return NFS3_OK;
        case EINVAL:    return NFS3ERR_INVAL;
        case ENOENT:    return NFS3ERR_NOENT;
        case ENOSPC:    return NFS3ERR_NOSPC;
        case EEXIST:    return NFS3ERR_EXIST;
        case EISDIR:    return NFS3ERR_ISDIR;
        case ENOTDIR:   return NFS3ERR_NOTDIR;
        case ENOTEMPTY: return NFS3ERR_NOTEMPTY;
        case EIO:       return NFS3ERR_IO;
        default:        return NFS3ERR_IO;
    }
}
|
||||
|
||||
// NFSPROC3_NULL: protocol ping - reply immediately with an empty result.
int nfs3_null_proc(void *opaque, rpc_op_t *rop)
{
    rpc_queue_reply(rop);
    return 0; // 0 = reply queued synchronously (matches the other handlers' convention)
}
|
||||
|
||||
int nfs3_access_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
//nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
ACCESS3args *args = (ACCESS3args*)rop->request;
|
||||
ACCESS3res *reply = (ACCESS3res*)rop->reply;
|
||||
*reply = (ACCESS3res){
|
||||
.status = NFS3_OK,
|
||||
.resok = (ACCESS3resok){
|
||||
.access = args->access,
|
||||
},
|
||||
};
|
||||
rpc_queue_reply(rop);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int nfs3_commit_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
//COMMIT3args *args = (COMMIT3args*)rop->request;
|
||||
cluster_op_t *op = new cluster_op_t;
|
||||
// fsync. we don't know how to fsync a single inode, so just fsync everything
|
||||
op->opcode = OSD_OP_SYNC;
|
||||
op->callback = [self, rop](cluster_op_t *op)
|
||||
{
|
||||
COMMIT3res *reply = (COMMIT3res*)rop->reply;
|
||||
*reply = (COMMIT3res){ .status = vitastor_nfs_map_err(op->retval) };
|
||||
*(uint64_t*)reply->resok.verf = self->parent->server_id;
|
||||
rpc_queue_reply(rop);
|
||||
};
|
||||
self->parent->cli->execute(op);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int mount3_mnt_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
//nfs_dirpath *args = (nfs_dirpath*)rop->request;
|
||||
if (self->parent->trace)
|
||||
fprintf(stderr, "[%d] MNT\n", self->nfs_fd);
|
||||
nfs_mountres3 *reply = (nfs_mountres3*)rop->reply;
|
||||
u_int flavor = RPC_AUTH_NONE;
|
||||
reply->fhs_status = MNT3_OK;
|
||||
reply->mountinfo.fhandle = xdr_copy_string(rop->xdrs, NFS_ROOT_HANDLE);
|
||||
reply->mountinfo.auth_flavors.auth_flavors_len = 1;
|
||||
reply->mountinfo.auth_flavors.auth_flavors_val = (u_int*)xdr_copy_string(rop->xdrs, (char*)&flavor, sizeof(u_int)).data;
|
||||
rpc_queue_reply(rop);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mount3_dump_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
if (self->parent->trace)
|
||||
fprintf(stderr, "[%d] DUMP\n", self->nfs_fd);
|
||||
nfs_mountlist *reply = (nfs_mountlist*)rop->reply;
|
||||
*reply = (struct nfs_mountbody*)malloc_or_die(sizeof(struct nfs_mountbody));
|
||||
xdr_add_malloc(rop->xdrs, *reply);
|
||||
(*reply)->ml_hostname = xdr_copy_string(rop->xdrs, "127.0.0.1");
|
||||
(*reply)->ml_directory = xdr_copy_string(rop->xdrs, self->parent->export_root);
|
||||
(*reply)->ml_next = NULL;
|
||||
rpc_queue_reply(rop);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// MOUNTPROC3_UMNT: the proxy keeps no per-client mount state, so unmount is a no-op.
int mount3_umnt_proc(void *opaque, rpc_op_t *rop)
{
    //nfs_client_t *self = (nfs_client_t*)opaque;
    //nfs_dirpath *arg = (nfs_dirpath*)rop->request;
    // do nothing
    rpc_queue_reply(rop);
    return 0;
}
|
||||
|
||||
// MOUNTPROC3_UMNTALL: no mount state to clear - acknowledge and return.
int mount3_umntall_proc(void *opaque, rpc_op_t *rop)
{
    // do nothing
    rpc_queue_reply(rop);
    return 0;
}
|
||||
|
||||
int mount3_export_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
nfs_exports *reply = (nfs_exports*)rop->reply;
|
||||
*reply = (struct nfs_exportnode*)calloc_or_die(1, sizeof(struct nfs_exportnode) + sizeof(struct nfs_groupnode));
|
||||
xdr_add_malloc(rop->xdrs, *reply);
|
||||
(*reply)->ex_dir = xdr_copy_string(rop->xdrs, self->parent->export_root);
|
||||
(*reply)->ex_groups = (struct nfs_groupnode*)(reply+1);
|
||||
(*reply)->ex_groups->gr_name = xdr_copy_string(rop->xdrs, "127.0.0.1");
|
||||
(*reply)->ex_groups->gr_next = NULL;
|
||||
(*reply)->ex_next = NULL;
|
||||
rpc_queue_reply(rop);
|
||||
return 0;
|
||||
}
|
|
@ -10,9 +10,10 @@
|
|||
|
||||
#include <netinet/tcp.h>
|
||||
#include <sys/epoll.h>
|
||||
#include <sys/wait.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
//#include <signal.h>
|
||||
#include <signal.h>
|
||||
|
||||
#include "nfs/nfs.h"
|
||||
#include "nfs/rpc.h"
|
||||
|
@ -21,6 +22,9 @@
|
|||
#include "addr_util.h"
|
||||
#include "str_util.h"
|
||||
#include "nfs_proxy.h"
|
||||
#include "nfs_kv.h"
|
||||
#include "nfs_block.h"
|
||||
#include "nfs_common.h"
|
||||
#include "http_client.h"
|
||||
#include "cli.h"
|
||||
|
||||
|
@ -31,6 +35,12 @@ const char *exe_name = NULL;
|
|||
|
||||
nfs_proxy_t::~nfs_proxy_t()
|
||||
{
|
||||
if (kvfs)
|
||||
delete kvfs;
|
||||
if (blockfs)
|
||||
delete blockfs;
|
||||
if (db)
|
||||
delete db;
|
||||
if (cmd)
|
||||
delete cmd;
|
||||
if (cli)
|
||||
|
@ -44,43 +54,90 @@ nfs_proxy_t::~nfs_proxy_t()
|
|||
delete ringloop;
|
||||
}
|
||||
|
||||
// Usage text printed for -h/--help, for a missing -o argument, and for an
// unrecognized command (see nfs_proxy_t::parse_args).
static const char* help_text =
    "Vitastor NFS 3.0 proxy " VERSION "\n"
    "(c) Vitaliy Filippov, 2021+ (VNPL-1.1)\n"
    "\n"
    "vitastor-nfs (--fs <NAME> | --block) [-o <OPT>] mount <MOUNTPOINT>\n"
    "  Start local filesystem server and mount file system to <MOUNTPOINT>.\n"
    "  Use regular `umount <MOUNTPOINT>` to unmount the FS.\n"
    "  The server will be automatically stopped when the FS is unmounted.\n"
    "  -o|--options <OPT>  Pass additional NFS mount options (ex.: -o async).\n"
    "\n"
    "vitastor-nfs (--fs <NAME> | --block) start\n"
    "  Start network NFS server. Options:\n"
    "  --bind <IP>       bind service to <IP> address (default 0.0.0.0)\n"
    "  --port <PORT>     use port <PORT> for NFS services (default is 2049)\n"
    "  --portmap 0       do not listen on port 111 (portmap/rpcbind, requires root)\n"
    "\n"
    "OPTIONS:\n"
    "  --fs <NAME>       use VitastorFS with metadata in image <NAME>\n"
    "  --block           use pseudo-FS presenting images as files\n"
    "  --pool <POOL>     use <POOL> as default pool for new files\n"
    "  --subdir <DIR>    export <DIR> instead of root directory\n"
    "  --nfspath <PATH>  set NFS export path to <PATH> (default is /)\n"
    "  --pidfile <FILE>  write process ID to the specified file\n"
    "  --logfile <FILE>  log to the specified file\n"
    "  --foreground 1    stay in foreground, do not daemonize\n"
    "\n"
    "NFS proxy is stateless if you use immediate_commit=all in your cluster and if\n"
    "you do not use client_enable_writeback=true, so you can freely use multiple\n"
    "NFS proxies with L3 load balancing in this case.\n"
    "\n"
    "Example start and mount commands for a custom NFS port:\n"
    "  vitastor-nfs start --block --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool\n"
    "  mount localhost:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp\n"
    "Or just:\n"
    "  vitastor-nfs mount --block --pool testpool /mnt/\n"
;
||||
|
||||
// Parse command-line arguments into a JSON configuration object.
// Recognizes -h/--help, -o/--options (comma-accumulated), generic --key value
// pairs, and the positional commands "mount <MOUNTPOINT>" and "start".
// Exits the process on --help, on missing required options, or on bad usage.
json11::Json::object nfs_proxy_t::parse_args(int narg, const char *args[])
{
    json11::Json::object cfg;
    std::vector<std::string> cmd;
    for (int i = 1; i < narg; i++)
    {
        if (!strcmp(args[i], "-h") || !strcmp(args[i], "--help"))
        {
            printf("%s", help_text);
            exit(0);
        }
        else if (!strcmp(args[i], "-o") || !strcmp(args[i], "--options"))
        {
            if (i >= narg-1)
            {
                // -o requires a value
                printf("%s", help_text);
                exit(0);
            }
            // Accumulate repeated -o values into one comma-separated string
            const std::string & old = cfg["options"].string_value();
            cfg["options"] = old != "" ? old+","+args[i+1] : args[i+1];
            // FIX: consume the option value so it isn't re-parsed as a positional
            // command on the next iteration
            i++;
        }
        else if (args[i][0] == '-' && args[i][1] == '-')
        {
            // --json and --block are boolean flags; all other --opts take a value
            const char *opt = args[i]+2;
            cfg[opt] = !strcmp(opt, "json") || !strcmp(opt, "block") || i == narg-1 ? "1" : args[++i];
        }
        else
        {
            cmd.push_back(args[i]);
        }
    }
    // Exactly one of --block / --fs must be chosen
    if (cfg.find("block") == cfg.end() && cfg.find("fs") == cfg.end())
    {
        fprintf(stderr, "Specify one of --block or --fs NAME. Use vitastor-nfs --help for details\n");
        exit(1);
    }
    if (cmd.size() >= 2 && cmd[0] == "mount")
    {
        cfg["mount"] = cmd[1];
    }
    else if (cmd.size() >= 1 && cmd[0] == "start")
    {
        // "start" needs no extra positional arguments
    }
    else
    {
        printf("%s", help_text);
        exit(1);
    }
    return cfg;
}
|
||||
|
@ -92,6 +149,10 @@ void nfs_proxy_t::run(json11::Json cfg)
|
|||
srand48(tv.tv_sec*1000000000 + tv.tv_nsec);
|
||||
server_id = (uint64_t)lrand48() | ((uint64_t)lrand48() << 31) | ((uint64_t)lrand48() << 62);
|
||||
// Parse options
|
||||
if (cfg["logfile"].string_value() != "")
|
||||
logfile = cfg["logfile"].string_value();
|
||||
pidfile = cfg["pidfile"].string_value();
|
||||
trace = cfg["log_level"].uint64_value() > 5 || cfg["trace"].uint64_value() > 0;
|
||||
bind_address = cfg["bind"].string_value();
|
||||
if (bind_address == "")
|
||||
bind_address = "0.0.0.0";
|
||||
|
@ -103,18 +164,6 @@ void nfs_proxy_t::run(json11::Json cfg)
|
|||
export_root = cfg["nfspath"].string_value();
|
||||
if (!export_root.size())
|
||||
export_root = "/";
|
||||
name_prefix = cfg["subdir"].string_value();
|
||||
{
|
||||
int e = name_prefix.size();
|
||||
while (e > 0 && name_prefix[e-1] == '/')
|
||||
e--;
|
||||
int s = 0;
|
||||
while (s < e && name_prefix[s] == '/')
|
||||
s++;
|
||||
name_prefix = name_prefix.substr(s, e-s);
|
||||
if (name_prefix.size())
|
||||
name_prefix += "/";
|
||||
}
|
||||
if (cfg["client_writeback_allowed"].is_null())
|
||||
{
|
||||
// NFS is always aware of fsync, so we allow write-back cache
|
||||
|
@ -123,6 +172,16 @@ void nfs_proxy_t::run(json11::Json cfg)
|
|||
obj["client_writeback_allowed"] = true;
|
||||
cfg = obj;
|
||||
}
|
||||
mountpoint = cfg["mount"].string_value();
|
||||
if (mountpoint != "")
|
||||
{
|
||||
bind_address = "127.0.0.1";
|
||||
nfs_port = 0;
|
||||
portmap_enabled = false;
|
||||
exit_on_umount = true;
|
||||
}
|
||||
mountopts = cfg["options"].string_value();
|
||||
fsname = cfg["fs"].string_value();
|
||||
// Create client
|
||||
ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
|
||||
epmgr = new epoll_manager_t(ringloop);
|
||||
|
@ -131,67 +190,7 @@ void nfs_proxy_t::run(json11::Json cfg)
|
|||
cmd->ringloop = ringloop;
|
||||
cmd->epmgr = epmgr;
|
||||
cmd->cli = cli;
|
||||
// We need inode name hashes for NFS handles to remain stateless and <= 64 bytes long
|
||||
dir_info[""] = (nfs_dir_t){
|
||||
.id = 1,
|
||||
.mod_rev = 0,
|
||||
};
|
||||
clock_gettime(CLOCK_REALTIME, &dir_info[""].mtime);
|
||||
watch_stats();
|
||||
assert(cli->st_cli.on_inode_change_hook == NULL);
|
||||
cli->st_cli.on_inode_change_hook = [this](inode_t changed_inode, bool removed)
|
||||
{
|
||||
auto inode_cfg_it = cli->st_cli.inode_config.find(changed_inode);
|
||||
if (inode_cfg_it == cli->st_cli.inode_config.end())
|
||||
{
|
||||
return;
|
||||
}
|
||||
auto & inode_cfg = inode_cfg_it->second;
|
||||
std::string full_name = inode_cfg.name;
|
||||
if (name_prefix != "" && full_name.substr(0, name_prefix.size()) != name_prefix)
|
||||
{
|
||||
return;
|
||||
}
|
||||
// Calculate directory modification time and revision (used as "cookie verifier")
|
||||
timespec now;
|
||||
clock_gettime(CLOCK_REALTIME, &now);
|
||||
dir_info[""].mod_rev = dir_info[""].mod_rev < inode_cfg.mod_revision ? inode_cfg.mod_revision : dir_info[""].mod_rev;
|
||||
dir_info[""].mtime = now;
|
||||
int pos = full_name.find('/', name_prefix.size());
|
||||
while (pos >= 0)
|
||||
{
|
||||
std::string dir = full_name.substr(0, pos);
|
||||
auto & dinf = dir_info[dir];
|
||||
if (!dinf.id)
|
||||
dinf.id = next_dir_id++;
|
||||
dinf.mod_rev = dinf.mod_rev < inode_cfg.mod_revision ? inode_cfg.mod_revision : dinf.mod_rev;
|
||||
dinf.mtime = now;
|
||||
dir_by_hash["S"+base64_encode(sha256(dir))] = dir;
|
||||
pos = full_name.find('/', pos+1);
|
||||
}
|
||||
// Alter inode_by_hash
|
||||
if (removed)
|
||||
{
|
||||
auto ino_it = hash_by_inode.find(changed_inode);
|
||||
if (ino_it != hash_by_inode.end())
|
||||
{
|
||||
inode_by_hash.erase(ino_it->second);
|
||||
hash_by_inode.erase(ino_it);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::string hash = "S"+base64_encode(sha256(full_name));
|
||||
auto hbi_it = hash_by_inode.find(changed_inode);
|
||||
if (hbi_it != hash_by_inode.end() && hbi_it->second != hash)
|
||||
{
|
||||
// inode had a different name, remove old hash=>inode pointer
|
||||
inode_by_hash.erase(hbi_it->second);
|
||||
}
|
||||
inode_by_hash[hash] = changed_inode;
|
||||
hash_by_inode[changed_inode] = hash;
|
||||
}
|
||||
};
|
||||
// Load image metadata
|
||||
while (!cli->is_ready())
|
||||
{
|
||||
|
@ -202,6 +201,17 @@ void nfs_proxy_t::run(json11::Json cfg)
|
|||
}
|
||||
// Check default pool
|
||||
check_default_pool();
|
||||
// Check if we're using VitastorFS
|
||||
if (fsname == "")
|
||||
{
|
||||
blockfs = new block_fs_state_t();
|
||||
blockfs->init(this, cfg);
|
||||
}
|
||||
else
|
||||
{
|
||||
kvfs = new kv_fs_state_t();
|
||||
kvfs->init(this, cfg);
|
||||
}
|
||||
// Self-register portmap and NFS
|
||||
pmap.reg_ports.insert((portmap_id_t){
|
||||
.prog = PMAP_PROGRAM,
|
||||
|
@ -232,7 +242,7 @@ void nfs_proxy_t::run(json11::Json cfg)
|
|||
.addr = "0.0.0.0.0."+std::to_string(nfs_port),
|
||||
});
|
||||
// Create NFS socket and add it to epoll
|
||||
int nfs_socket = create_and_bind_socket(bind_address, nfs_port, 128, NULL);
|
||||
int nfs_socket = create_and_bind_socket(bind_address, nfs_port, 128, &listening_port);
|
||||
fcntl(nfs_socket, F_SETFL, fcntl(nfs_socket, F_GETFL, 0) | O_NONBLOCK);
|
||||
epmgr->tfd->set_fd_handler(nfs_socket, false, [this](int nfs_socket, int epoll_events)
|
||||
{
|
||||
|
@ -264,17 +274,40 @@ void nfs_proxy_t::run(json11::Json cfg)
|
|||
}
|
||||
});
|
||||
}
|
||||
if (mountpoint != "")
|
||||
{
|
||||
mount_fs();
|
||||
}
|
||||
if (cfg["foreground"].is_null())
|
||||
{
|
||||
daemonize();
|
||||
}
|
||||
while (true)
|
||||
if (pidfile != "")
|
||||
{
|
||||
write_pid();
|
||||
}
|
||||
while (!finished)
|
||||
{
|
||||
ringloop->loop();
|
||||
ringloop->wait();
|
||||
}
|
||||
// Destroy the client
|
||||
cli->flush();
|
||||
if (kvfs)
|
||||
{
|
||||
delete kvfs;
|
||||
kvfs = NULL;
|
||||
}
|
||||
if (blockfs)
|
||||
{
|
||||
delete blockfs;
|
||||
blockfs = NULL;
|
||||
}
|
||||
if (db)
|
||||
{
|
||||
delete db;
|
||||
db = NULL;
|
||||
}
|
||||
delete cli;
|
||||
delete epmgr;
|
||||
delete ringloop;
|
||||
|
@ -351,7 +384,7 @@ void nfs_proxy_t::parse_stats(etcd_kv_t & kv)
|
|||
inode_t inode_num = 0;
|
||||
char null_byte = 0;
|
||||
int scanned = sscanf(key.c_str() + cli->st_cli.etcd_prefix.length()+13, "%u/%ju%c", &pool_id, &inode_num, &null_byte);
|
||||
if (scanned != 2 || !pool_id || pool_id >= POOL_ID_MAX || !inode_num)
|
||||
if (scanned != 2 || !pool_id || pool_id >= POOL_ID_MAX)
|
||||
{
|
||||
fprintf(stderr, "Bad etcd key %s, ignoring\n", key.c_str());
|
||||
}
|
||||
|
@ -382,8 +415,9 @@ void nfs_proxy_t::check_default_pool()
|
|||
{
|
||||
if (cli->st_cli.pool_config.size() == 1)
|
||||
{
|
||||
default_pool = cli->st_cli.pool_config.begin()->second.name;
|
||||
default_pool_id = cli->st_cli.pool_config.begin()->first;
|
||||
auto pool_it = cli->st_cli.pool_config.begin();
|
||||
default_pool_id = pool_it->first;
|
||||
default_pool = pool_it->second.name;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -416,11 +450,17 @@ void nfs_proxy_t::do_accept(int listen_fd)
|
|||
int nfs_fd = 0;
|
||||
while ((nfs_fd = accept(listen_fd, (struct sockaddr *)&addr, &addr_size)) >= 0)
|
||||
{
|
||||
fprintf(stderr, "New client %d: connection from %s\n", nfs_fd, addr_to_string(addr).c_str());
|
||||
if (trace)
|
||||
fprintf(stderr, "New client %d: connection from %s\n", nfs_fd, addr_to_string(addr).c_str());
|
||||
active_connections++;
|
||||
fcntl(nfs_fd, F_SETFL, fcntl(nfs_fd, F_GETFL, 0) | O_NONBLOCK);
|
||||
int one = 1;
|
||||
setsockopt(nfs_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
||||
auto cli = new nfs_client_t();
|
||||
if (kvfs)
|
||||
nfs_kv_procs(cli);
|
||||
else
|
||||
nfs_block_procs(cli);
|
||||
cli->parent = this;
|
||||
cli->nfs_fd = nfs_fd;
|
||||
for (auto & fn: pmap.proc_table)
|
||||
|
@ -432,8 +472,12 @@ void nfs_proxy_t::do_accept(int listen_fd)
|
|||
// Handle incoming event
|
||||
if (epoll_events & EPOLLRDHUP)
|
||||
{
|
||||
fprintf(stderr, "Client %d disconnected\n", nfs_fd);
|
||||
auto parent = cli->parent;
|
||||
if (parent->trace)
|
||||
fprintf(stderr, "Client %d disconnected\n", nfs_fd);
|
||||
cli->stop();
|
||||
parent->active_connections--;
|
||||
parent->check_exit();
|
||||
return;
|
||||
}
|
||||
cli->epoll_events |= epoll_events;
|
||||
|
@ -544,7 +588,7 @@ void nfs_client_t::handle_read(int result)
|
|||
read_msg.msg_iovlen = 0;
|
||||
if (deref())
|
||||
return;
|
||||
if (result <= 0 && result != -EAGAIN && result != -EINTR)
|
||||
if (result <= 0 && result != -EAGAIN && result != -EINTR && result != -ECANCELED)
|
||||
{
|
||||
printf("Failed read from client %d: %d (%s)\n", nfs_fd, result, strerror(-result));
|
||||
stop();
|
||||
|
@ -639,8 +683,8 @@ void nfs_client_t::handle_read(int result)
|
|||
return;
|
||||
}
|
||||
}
|
||||
submit_read(0);
|
||||
}
|
||||
submit_read(0);
|
||||
}
|
||||
|
||||
void nfs_client_t::submit_send()
|
||||
|
@ -968,8 +1012,164 @@ void nfs_proxy_t::daemonize()
|
|||
close(1);
|
||||
close(2);
|
||||
open("/dev/null", O_RDONLY);
|
||||
open("/dev/null", O_WRONLY);
|
||||
open("/dev/null", O_WRONLY);
|
||||
open(logfile.c_str(), O_WRONLY|O_APPEND|O_CREAT, 0666);
|
||||
open(logfile.c_str(), O_WRONLY|O_APPEND|O_CREAT, 0666);
|
||||
}
|
||||
|
||||
void nfs_proxy_t::write_pid()
|
||||
{
|
||||
int fd = open(pidfile.c_str(), O_WRONLY|O_CREAT|O_TRUNC, 0666);
|
||||
if (fd < 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to create pid file %s: %s (code %d)\n", pidfile.c_str(), strerror(errno), errno);
|
||||
return;
|
||||
}
|
||||
auto pid = std::to_string(getpid());
|
||||
if (write(fd, pid.c_str(), pid.size()) < 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to write pid to %s: %s (code %d)\n", pidfile.c_str(), strerror(errno), errno);
|
||||
}
|
||||
close(fd);
|
||||
}
|
||||
|
||||
static pid_t wanted_pid = 0;
|
||||
static bool child_finished = false;
|
||||
static int child_status = -1;
|
||||
|
||||
void single_child_handler(int signal)
|
||||
{
|
||||
child_finished = true;
|
||||
waitpid(wanted_pid, &child_status, WNOHANG);
|
||||
}
|
||||
|
||||
void nfs_proxy_t::mount_fs()
|
||||
{
|
||||
check_already_mounted();
|
||||
signal(SIGCHLD, single_child_handler);
|
||||
auto pid = fork();
|
||||
if (pid < 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to fork: %s (code %d)\n", strerror(errno), errno);
|
||||
exit(1);
|
||||
}
|
||||
if (pid > 0)
|
||||
{
|
||||
// Parent - loop and wait until child finishes
|
||||
wanted_pid = pid;
|
||||
exit_on_umount = false;
|
||||
while (!child_finished)
|
||||
{
|
||||
ringloop->loop();
|
||||
ringloop->wait();
|
||||
}
|
||||
if (!WIFEXITED(child_status) || WEXITSTATUS(child_status) != 0)
|
||||
{
|
||||
// Mounting failed
|
||||
exit(1);
|
||||
}
|
||||
if (fsname != "")
|
||||
fprintf(stderr, "Successfully mounted VitastorFS %s at %s\n", fsname.c_str(), mountpoint.c_str());
|
||||
else
|
||||
fprintf(stderr, "Successfully mounted Vitastor pseudo-FS at %s\n", mountpoint.c_str());
|
||||
finished = false;
|
||||
exit_on_umount = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Child
|
||||
std::string src = ("localhost:"+export_root);
|
||||
std::string opts = ("port="+std::to_string(listening_port)+",mountport="+std::to_string(listening_port)+",nfsvers=3,nolock,tcp");
|
||||
bool hard = false, async = false;
|
||||
for (auto & opt: explode(",", mountopts, true))
|
||||
{
|
||||
if (opt == "hard")
|
||||
hard = true;
|
||||
else if (opt == "async")
|
||||
async = true;
|
||||
else if (opt.substr(0, 4) != "port" && opt.substr(0, 9) != "mountport" &&
|
||||
opt.substr(0, 7) != "nfsvers" && opt.substr(0, 5) != "proto" &&
|
||||
opt != "udp" && opt != "tcp" && opt != "rdma")
|
||||
{
|
||||
opts += ","+opt;
|
||||
}
|
||||
}
|
||||
if (!hard)
|
||||
opts += ",soft";
|
||||
if (!async)
|
||||
opts += ",sync";
|
||||
const char *args[] = { "mount", src.c_str(), mountpoint.c_str(), "-o", opts.c_str(), NULL };
|
||||
execvp("mount", (char* const*)args);
|
||||
fprintf(stderr, "Failed to run mount %s %s -o %s: %s (code %d)\n",
|
||||
src.c_str(), mountpoint.c_str(), opts.c_str(), strerror(errno), errno);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
void nfs_proxy_t::check_already_mounted()
|
||||
{
|
||||
std::string realpoint = realpath_str(mountpoint, false);
|
||||
if (realpoint == "")
|
||||
{
|
||||
return;
|
||||
}
|
||||
std::string mountstr = read_file("/proc/mounts");
|
||||
if (mountstr == "")
|
||||
{
|
||||
return;
|
||||
}
|
||||
auto mounts = explode("\n", mountstr, true);
|
||||
for (auto & str: mounts)
|
||||
{
|
||||
auto mnt = explode(" ", str, true);
|
||||
if (mnt.size() >= 2 && mnt[1] == realpoint)
|
||||
{
|
||||
fprintf(stderr, "%s is already mounted\n", mountpoint.c_str());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void nfs_proxy_t::check_exit()
|
||||
{
|
||||
if (active_connections || !exit_on_umount)
|
||||
{
|
||||
return;
|
||||
}
|
||||
fprintf(stderr, "All active NFS connections are closed, checking /proc/mounts\n");
|
||||
std::string mountstr = read_file("/proc/mounts");
|
||||
if (mountstr == "")
|
||||
{
|
||||
return;
|
||||
}
|
||||
auto port_opt = "port="+std::to_string(listening_port);
|
||||
auto mountport_opt = "mountport="+std::to_string(listening_port);
|
||||
auto mounts = explode("\n", mountstr, true);
|
||||
for (auto & str: mounts)
|
||||
{
|
||||
auto opts = explode(" ", str, true);
|
||||
if (opts[2].size() >= 3 && opts[2].substr(0, 3) == "nfs" && opts.size() >= 4)
|
||||
{
|
||||
opts = explode(",", opts[3], true);
|
||||
bool port_found = false;
|
||||
bool addr_found = false;
|
||||
for (auto & opt: opts)
|
||||
{
|
||||
if (opt == port_opt || opt == mountport_opt)
|
||||
port_found = true;
|
||||
if (opt == "addr=127.0.0.1" || opt == "mountaddr=127.0.0.1")
|
||||
addr_found = true;
|
||||
}
|
||||
if (port_found && addr_found)
|
||||
{
|
||||
// OK, do not unmount
|
||||
fprintf(stderr, "NFS mount to 127.0.0.1:%d still active, leaving server active\n", listening_port);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "NFS mount to 127.0.0.1:%d not found, exiting\n", listening_port);
|
||||
// Not found, unmount
|
||||
finished = true;
|
||||
}
|
||||
|
||||
int main(int narg, const char *args[])
|
||||
|
|
|
@ -4,51 +4,54 @@
|
|||
#include "epoll_manager.h"
|
||||
#include "nfs_portmap.h"
|
||||
#include "nfs/xdr_impl.h"
|
||||
#include "kv_db.h"
|
||||
|
||||
#define NFS_ROOT_HANDLE "R"
|
||||
#define RPC_INIT_BUF_SIZE 32768
|
||||
#define MAX_REQUEST_SIZE 128*1024*1024
|
||||
#define TRUE 1
|
||||
#define FALSE 0
|
||||
|
||||
class cli_tool_t;
|
||||
|
||||
struct nfs_dir_t
|
||||
{
|
||||
uint64_t id;
|
||||
uint64_t mod_rev;
|
||||
timespec mtime;
|
||||
};
|
||||
struct kv_fs_state_t;
|
||||
struct block_fs_state_t;
|
||||
|
||||
class nfs_proxy_t
|
||||
{
|
||||
public:
|
||||
std::string bind_address;
|
||||
std::string name_prefix;
|
||||
uint64_t fsid = 1;
|
||||
uint64_t server_id = 0;
|
||||
// FIXME: Maybe allow to create files in different pools?
|
||||
std::string default_pool;
|
||||
std::string export_root;
|
||||
bool portmap_enabled;
|
||||
unsigned nfs_port;
|
||||
int trace = 0;
|
||||
std::string logfile = "/dev/null";
|
||||
std::string pidfile;
|
||||
bool exit_on_umount = false;
|
||||
std::string mountpoint;
|
||||
std::string mountopts;
|
||||
std::string fsname;
|
||||
|
||||
pool_id_t default_pool_id;
|
||||
int active_connections = 0;
|
||||
bool finished = false;
|
||||
int listening_port = 0;
|
||||
pool_id_t default_pool_id = 0;
|
||||
|
||||
portmap_service_t pmap;
|
||||
ring_loop_t *ringloop = NULL;
|
||||
epoll_manager_t *epmgr = NULL;
|
||||
cluster_client_t *cli = NULL;
|
||||
cli_tool_t *cmd = NULL;
|
||||
kv_dbw_t *db = NULL;
|
||||
kv_fs_state_t *kvfs = NULL;
|
||||
block_fs_state_t *blockfs = NULL;
|
||||
|
||||
std::vector<XDR*> xdr_pool;
|
||||
|
||||
// filehandle = "S"+base64(sha256(full name with prefix)) or "roothandle" for mount root)
|
||||
|
||||
uint64_t next_dir_id = 2;
|
||||
// filehandle => dir with name_prefix
|
||||
std::map<std::string, std::string> dir_by_hash;
|
||||
// dir with name_prefix => dir info
|
||||
std::map<std::string, nfs_dir_t> dir_info;
|
||||
// filehandle => inode ID
|
||||
std::map<std::string, inode_t> inode_by_hash;
|
||||
// inode ID => filehandle
|
||||
std::map<inode_t, std::string> hash_by_inode;
|
||||
// inode ID => statistics
|
||||
std::map<inode_t, json11::Json> inode_stats;
|
||||
// pool ID => statistics
|
||||
|
@ -63,6 +66,10 @@ public:
|
|||
void check_default_pool();
|
||||
void do_accept(int listen_fd);
|
||||
void daemonize();
|
||||
void write_pid();
|
||||
void mount_fs();
|
||||
void check_already_mounted();
|
||||
void check_exit();
|
||||
};
|
||||
|
||||
struct rpc_cur_buffer_t
|
||||
|
@ -86,28 +93,6 @@ struct rpc_free_buffer_t
|
|||
unsigned size;
|
||||
};
|
||||
|
||||
struct extend_size_t
|
||||
{
|
||||
inode_t inode;
|
||||
uint64_t new_size;
|
||||
};
|
||||
|
||||
inline bool operator < (const extend_size_t &a, const extend_size_t &b)
|
||||
{
|
||||
return a.inode < b.inode || a.inode == b.inode && a.new_size < b.new_size;
|
||||
}
|
||||
|
||||
struct extend_write_t
|
||||
{
|
||||
rpc_op_t *rop;
|
||||
int resize_res, write_res; // 1 = started, 0 = completed OK, -errno = completed with error
|
||||
};
|
||||
|
||||
struct extend_inode_t
|
||||
{
|
||||
uint64_t cur_extend = 0, next_extend = 0;
|
||||
};
|
||||
|
||||
class nfs_client_t
|
||||
{
|
||||
public:
|
||||
|
@ -122,8 +107,6 @@ public:
|
|||
rpc_cur_buffer_t cur_buffer = { 0 };
|
||||
std::map<uint8_t*, rpc_used_buffer_t> used_buffers;
|
||||
std::vector<rpc_free_buffer_t> free_buffers;
|
||||
std::map<inode_t, extend_inode_t> extends;
|
||||
std::multimap<extend_size_t, extend_write_t> extend_writes;
|
||||
|
||||
iovec read_iov;
|
||||
msghdr read_msg = { 0 };
|
||||
|
@ -133,9 +116,6 @@ public:
|
|||
std::vector<iovec> send_list, next_send_list;
|
||||
std::vector<rpc_op_t*> outbox, next_outbox;
|
||||
|
||||
nfs_client_t();
|
||||
~nfs_client_t();
|
||||
|
||||
void select_read_buffer(unsigned wanted_size);
|
||||
void submit_read(unsigned wanted_size);
|
||||
void handle_read(int result);
|
||||
|
|
|
@ -239,6 +239,7 @@ class osd_t
|
|||
void report_statistics();
|
||||
void report_pg_state(pg_t & pg);
|
||||
void report_pg_states();
|
||||
void apply_no_inode_stats();
|
||||
void apply_pg_count();
|
||||
void apply_pg_config();
|
||||
|
||||
|
|
|
@ -388,9 +388,18 @@ void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes
|
|||
etcd_global_config = changes[st_cli.etcd_prefix+"/config/global"].value.object_items();
|
||||
parse_config(false);
|
||||
}
|
||||
bool pools = changes.find(st_cli.etcd_prefix+"/config/pools") != changes.end();
|
||||
if (pools)
|
||||
{
|
||||
apply_no_inode_stats();
|
||||
}
|
||||
if (run_primary)
|
||||
{
|
||||
apply_pg_count();
|
||||
bool pgs = changes.find(st_cli.etcd_prefix+"/config/pgs") != changes.end();
|
||||
if (pools || pgs)
|
||||
{
|
||||
apply_pg_count();
|
||||
}
|
||||
apply_pg_config();
|
||||
}
|
||||
}
|
||||
|
@ -414,6 +423,8 @@ void osd_t::on_reload_config_hook(json11::Json::object & global_config)
|
|||
// Acquire lease
|
||||
void osd_t::acquire_lease()
|
||||
{
|
||||
// Apply no_inode_stats before the first statistics report
|
||||
apply_no_inode_stats();
|
||||
// Maximum lease TTL is (report interval) + retries * (timeout + repeat interval)
|
||||
st_cli.etcd_call("/lease/grant", json11::Json::object {
|
||||
{ "TTL", etcd_report_interval+(st_cli.max_etcd_attempts*(2*st_cli.etcd_quick_timeout)+999)/1000 }
|
||||
|
@ -602,11 +613,32 @@ void osd_t::on_load_pgs_hook(bool success)
|
|||
else
|
||||
{
|
||||
peering_state &= ~OSD_LOADING_PGS;
|
||||
apply_pg_count();
|
||||
apply_pg_config();
|
||||
apply_no_inode_stats();
|
||||
if (run_primary)
|
||||
{
|
||||
apply_pg_count();
|
||||
apply_pg_config();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void osd_t::apply_no_inode_stats()
|
||||
{
|
||||
if (!bs)
|
||||
{
|
||||
return;
|
||||
}
|
||||
std::vector<uint64_t> no_inode_stats;
|
||||
for (auto & pool_item: st_cli.pool_config)
|
||||
{
|
||||
if (!pool_item.second.used_for_fs.empty())
|
||||
{
|
||||
no_inode_stats.push_back(pool_item.first);
|
||||
}
|
||||
}
|
||||
bs->set_no_inode_stats(no_inode_stats);
|
||||
}
|
||||
|
||||
void osd_t::apply_pg_count()
|
||||
{
|
||||
for (auto & pool_item: st_cli.pool_config)
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
#define POOL_ID_MAX 0x10000
|
||||
#define POOL_ID_BITS 16
|
||||
#define INODE_POOL(inode) (pool_id_t)((inode) >> (64 - POOL_ID_BITS))
|
||||
#define INODE_NO_POOL(inode) (inode_t)(inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1))
|
||||
#define INODE_NO_POOL(inode) (inode_t)((inode) & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1))
|
||||
#define INODE_WITH_POOL(pool_id, inode) (((inode_t)(pool_id) << (64-POOL_ID_BITS)) | INODE_NO_POOL(inode))
|
||||
|
||||
// Pool ID is 16 bits long
|
||||
|
|
117
src/str_util.cpp
117
src/str_util.cpp
|
@ -1,9 +1,10 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include "str_util.h"
|
||||
|
||||
std::string base64_encode(const std::string &in)
|
||||
|
@ -304,6 +305,23 @@ std::string read_all_fd(int fd)
|
|||
return res;
|
||||
}
|
||||
|
||||
std::string read_file(std::string file, bool allow_enoent)
|
||||
{
|
||||
std::string res;
|
||||
int fd = open(file.c_str(), O_RDONLY);
|
||||
if (fd < 0 || (res = read_all_fd(fd)) == "")
|
||||
{
|
||||
int err = errno;
|
||||
if (fd >= 0)
|
||||
close(fd);
|
||||
if (!allow_enoent || err != ENOENT)
|
||||
fprintf(stderr, "Failed to read %s: %s (code %d)\n", file.c_str(), strerror(err), err);
|
||||
return "";
|
||||
}
|
||||
close(fd);
|
||||
return res;
|
||||
}
|
||||
|
||||
std::string str_repeat(const std::string & str, int times)
|
||||
{
|
||||
std::string r;
|
||||
|
@ -348,3 +366,100 @@ std::vector<std::string> explode(const std::string & sep, const std::string & va
|
|||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
std::string scan_escaped(const std::string & cmd, size_t & pos, bool allow_unquoted)
|
||||
{
|
||||
return scan_escaped(cmd.data(), cmd.size(), pos, allow_unquoted);
|
||||
}
|
||||
|
||||
// extract possibly single- or double-quoted part of string with escape characters
|
||||
std::string scan_escaped(const char *cmd, size_t size, size_t & pos, bool allow_unquoted)
|
||||
{
|
||||
auto orig = pos;
|
||||
while (pos < size && is_white(cmd[pos]))
|
||||
pos++;
|
||||
if (pos >= size)
|
||||
{
|
||||
pos = orig;
|
||||
return "";
|
||||
}
|
||||
if (cmd[pos] != '"' && cmd[pos] != '\'')
|
||||
{
|
||||
if (!allow_unquoted)
|
||||
{
|
||||
pos = orig;
|
||||
return "";
|
||||
}
|
||||
auto pos2 = pos;
|
||||
while (pos2 < size && !is_white(cmd[pos2]))
|
||||
pos2++;
|
||||
auto key = std::string(cmd+pos, pos2-pos);
|
||||
pos = pos2;
|
||||
return key;
|
||||
}
|
||||
char quot = cmd[pos];
|
||||
pos++;
|
||||
std::string key;
|
||||
while (true)
|
||||
{
|
||||
auto pos2 = pos;
|
||||
while (pos2 < size && cmd[pos2] != '\\' && cmd[pos2] != quot)
|
||||
pos2++;
|
||||
if (pos2 >= size || pos2 == size-1 && cmd[pos2] == '\\')
|
||||
{
|
||||
// Unfinished string literal
|
||||
pos = orig;
|
||||
return "";
|
||||
}
|
||||
if (pos2 > pos)
|
||||
key += std::string(cmd+pos, pos2-pos);
|
||||
pos = pos2;
|
||||
if (cmd[pos] == quot)
|
||||
{
|
||||
pos++;
|
||||
break;
|
||||
}
|
||||
else /* if (cmd[pos] == '\\') */
|
||||
{
|
||||
key += cmd[++pos];
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
return key;
|
||||
}
|
||||
|
||||
std::string auto_addslashes(const std::string & str, const char *toescape)
|
||||
{
|
||||
auto pos = str.find_first_of(toescape);
|
||||
if (pos == std::string::npos)
|
||||
return str;
|
||||
return addslashes(str, toescape);
|
||||
}
|
||||
|
||||
std::string addslashes(const std::string & str, const char *toescape)
|
||||
{
|
||||
std::string res = "\"";
|
||||
auto pos = 0;
|
||||
while (pos < str.size())
|
||||
{
|
||||
auto pos2 = str.find_first_of(toescape, pos);
|
||||
if (pos2 == std::string::npos)
|
||||
return res + str.substr(pos) + "\"";
|
||||
res += str.substr(pos, pos2-pos)+"\\"+str[pos2];
|
||||
pos = pos2+1;
|
||||
}
|
||||
return res+"\"";
|
||||
}
|
||||
|
||||
std::string realpath_str(std::string path, bool nofail)
|
||||
{
|
||||
char *p = realpath((char*)path.c_str(), NULL);
|
||||
if (!p)
|
||||
{
|
||||
fprintf(stderr, "Failed to resolve %s: %s\n", path.c_str(), strerror(errno));
|
||||
return nofail ? path : "";
|
||||
}
|
||||
std::string rp(p);
|
||||
free(p);
|
||||
return rp;
|
||||
}
|
||||
|
|
|
@ -1,11 +1,13 @@
|
|||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||
|
||||
#pragma once
|
||||
#include <stdint.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#define is_white(a) ((a) == ' ' || (a) == '\t' || (a) == '\r' || (a) == '\n')
|
||||
|
||||
std::string base64_encode(const std::string &in);
|
||||
std::string base64_decode(const std::string &in);
|
||||
uint64_t parse_size(std::string size_str, bool *ok = NULL);
|
||||
|
@ -18,7 +20,13 @@ std::string format_size(uint64_t size, bool nobytes = false);
|
|||
void print_help(const char *help_text, std::string exe_name, std::string cmd, bool all);
|
||||
uint64_t parse_time(std::string time_str, bool *ok = NULL);
|
||||
std::string read_all_fd(int fd);
|
||||
std::string read_file(std::string file, bool allow_enoent = false);
|
||||
std::string str_repeat(const std::string & str, int times);
|
||||
size_t utf8_length(const std::string & s);
|
||||
size_t utf8_length(const char *s);
|
||||
std::vector<std::string> explode(const std::string & sep, const std::string & value, bool trim);
|
||||
std::string scan_escaped(const char *cmd, size_t size, size_t & pos, bool allow_unquoted = true);
|
||||
std::string scan_escaped(const std::string & cmd, size_t & pos, bool allow_unquoted = true);
|
||||
std::string auto_addslashes(const std::string & str, const char *toescape = "\\\"");
|
||||
std::string addslashes(const std::string & str, const char *toescape = "\\\"");
|
||||
std::string realpath_str(std::string path, bool nofail = true);
|
||||
|
|
|
@ -68,3 +68,5 @@ SCHEME=xor ./test_scrub.sh
|
|||
PG_SIZE=3 ./test_scrub.sh
|
||||
PG_SIZE=6 PG_MINSIZE=4 OSD_COUNT=6 SCHEME=ec ./test_scrub.sh
|
||||
SCHEME=ec ./test_scrub.sh
|
||||
|
||||
./test_nfs.sh
|
||||
|
|
|
@ -0,0 +1,168 @@
|
|||
#!/bin/bash -ex
|
||||
|
||||
PG_COUNT=16
|
||||
. `dirname $0`/run_3osds.sh
|
||||
|
||||
build/src/vitastor-cli --etcd_address $ETCD_URL create -s 10G fsmeta
|
||||
build/src/vitastor-cli --etcd_address $ETCD_URL modify-pool --used-for-fs fsmeta testpool
|
||||
build/src/vitastor-nfs start --fs fsmeta --etcd_address $ETCD_URL --portmap 0 --port 2050 --foreground 1 --trace 1 >>./testdata/nfs.log 2>&1 &
|
||||
NFS_PID=$!
|
||||
|
||||
mkdir -p testdata/nfs
|
||||
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
MNT=$(pwd)/testdata/nfs
|
||||
trap "sudo umount -f $MNT"' || true; kill -9 $(jobs -p)' EXIT
|
||||
|
||||
# write small file
|
||||
ls -l ./testdata/nfs
|
||||
dd if=/dev/urandom of=./testdata/f1 bs=100k count=1
|
||||
cp testdata/f1 ./testdata/nfs/
|
||||
sudo umount ./testdata/nfs/
|
||||
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
ls -l ./testdata/nfs | grep f1
|
||||
diff ./testdata/f1 ./testdata/nfs/f1
|
||||
format_green "100K file ok"
|
||||
|
||||
# overwrite it inplace
|
||||
dd if=/dev/urandom of=./testdata/f1_90k bs=90k count=1
|
||||
cp testdata/f1_90k ./testdata/nfs/f1
|
||||
sudo umount ./testdata/nfs/
|
||||
format_green "inplace overwrite 90K ok"
|
||||
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
ls -l ./testdata/nfs | grep f1
|
||||
# create another copy
|
||||
dd if=./testdata/f1_90k of=./testdata/nfs/f1_nfs bs=1M
|
||||
diff ./testdata/f1_90k ./testdata/nfs/f1_nfs
|
||||
sudo umount ./testdata/nfs/
|
||||
format_green "another copy 90K ok"
|
||||
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
ls -l ./testdata/nfs | grep f1
|
||||
cp ./testdata/nfs/f1 ./testdata/f1_nfs
|
||||
diff ./testdata/f1_90k ./testdata/nfs/f1
|
||||
format_green "90K data ok"
|
||||
|
||||
# test partial shared overwrite
|
||||
dd if=/dev/urandom of=./testdata/f1_90k bs=9317 count=1 seek=5 conv=notrunc
|
||||
dd if=./testdata/f1_90k of=./testdata/nfs/f1 bs=9317 count=1 skip=5 seek=5 conv=notrunc
|
||||
sudo umount ./testdata/nfs/
|
||||
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
diff ./testdata/f1_90k ./testdata/nfs/f1
|
||||
format_green "partial inplace shared overwrite ok"
|
||||
|
||||
# move it to a larger shared space
|
||||
dd if=/dev/urandom of=./testdata/f1_110k bs=110k count=1
|
||||
cp testdata/f1_110k ./testdata/nfs/f1
|
||||
sudo umount ./testdata/nfs/
|
||||
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
ls -l ./testdata/nfs | grep f1
|
||||
diff ./testdata/f1_110k ./testdata/nfs/f1
|
||||
format_green "move shared 90K -> 110K ok"
|
||||
|
||||
# extend it to large file + rm
|
||||
dd if=/dev/urandom of=./testdata/f1_2M bs=2M count=1
|
||||
cp ./testdata/f1_2M ./testdata/nfs/f1
|
||||
sudo umount ./testdata/nfs/
|
||||
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
ls -l ./testdata/nfs | grep f1
|
||||
cp ./testdata/nfs/f1 ./testdata/f1_nfs
|
||||
diff ./testdata/f1_2M ./testdata/nfs/f1
|
||||
rm ./testdata/nfs/f1
|
||||
format_green "extend to 2M + rm ok"
|
||||
|
||||
# mkdir
|
||||
mkdir -p ./testdata/nfs/dir1/dir2
|
||||
echo abcdef > ./testdata/nfs/dir1/dir2/hnpfls
|
||||
# rename dir
|
||||
mv ./testdata/nfs/dir1 ./testdata/nfs/dir3
|
||||
sudo umount ./testdata/nfs/
|
||||
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
ls -l ./testdata/nfs | grep dir3
|
||||
ls -l ./testdata/nfs/dir3 | grep dir2
|
||||
ls -l ./testdata/nfs/dir3/dir2 | grep hnpfls
|
||||
echo abcdef > ./testdata/hnpfls
|
||||
diff ./testdata/hnpfls ./testdata/nfs/dir3/dir2/hnpfls
|
||||
format_green "rename dir with file ok"
|
||||
|
||||
# touch
|
||||
touch -t 202401011404 ./testdata/nfs/dir3/dir2/hnpfls
|
||||
sudo chown 65534:65534 ./testdata/nfs/dir3/dir2/hnpfls
|
||||
sudo chmod 755 ./testdata/nfs/dir3/dir2/hnpfls
|
||||
sudo umount ./testdata/nfs/
|
||||
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
T=`stat -c '%a %u %g %y' ./testdata/nfs/dir3/dir2/hnpfls | perl -pe 's/(:\d+)(.*)/$1/'`
|
||||
[[ "$T" = "755 65534 65534 2024-01-01 14:04" ]]
|
||||
format_green "set attrs ok"
|
||||
|
||||
# move dir
|
||||
mv ./testdata/nfs/dir3/dir2 ./testdata/nfs/
|
||||
sudo umount ./testdata/nfs/
|
||||
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
ls -l ./testdata/nfs | grep dir3
|
||||
ls -l ./testdata/nfs | grep dir2
|
||||
format_green "move dir ok"
|
||||
|
||||
# symlink, readlink
|
||||
ln -s dir2 ./testdata/nfs/sym2
|
||||
[[ "`stat -c '%A' ./testdata/nfs/sym2`" = "lrwxrwxrwx" ]]
|
||||
sudo umount ./testdata/nfs/
|
||||
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
[[ "`stat -c '%A' ./testdata/nfs/sym2`" = "lrwxrwxrwx" ]]
|
||||
[[ "`readlink ./testdata/nfs/sym2`" = "dir2" ]]
|
||||
format_green "symlink, readlink ok"
|
||||
|
||||
# mknod: chr, blk, sock, fifo + remove
|
||||
sudo mknod ./testdata/nfs/nod_chr c 1 5
|
||||
sudo mknod ./testdata/nfs/nod_blk b 2 6
|
||||
mkfifo ./testdata/nfs/nod_fifo
|
||||
perl -e 'use Socket; socket($sock, PF_UNIX, SOCK_STREAM, undef) || die $!; bind($sock, sockaddr_un("./testdata/nfs/nod_sock")) || die $!;'
|
||||
chmod 777 ./testdata/nfs/nod_*
|
||||
sudo umount ./testdata/nfs/
|
||||
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
[[ "`ls testdata|wc -l`" -ge 4 ]]
|
||||
[[ "`stat -c '%A' ./testdata/nfs/nod_blk`" = "brwxrwxrwx" ]]
|
||||
[[ "`stat -c '%A' ./testdata/nfs/nod_chr`" = "crwxrwxrwx" ]]
|
||||
[[ "`stat -c '%A' ./testdata/nfs/nod_fifo`" = "prwxrwxrwx" ]]
|
||||
[[ "`stat -c '%A' ./testdata/nfs/nod_sock`" = "srwxrwxrwx" ]]
|
||||
sudo rm ./testdata/nfs/nod_*
|
||||
format_green "mknod + rm ok"
|
||||
|
||||
# hardlink
|
||||
echo ABCDEF > ./testdata/nfs/linked1
|
||||
i=`stat -c '%i' ./testdata/nfs/linked1`
|
||||
ln ./testdata/nfs/linked1 ./testdata/nfs/linked2
|
||||
[[ "`stat -c '%i' ./testdata/nfs/linked2`" -eq $i ]]
|
||||
echo BABABA > ./testdata/nfs/linked2
|
||||
diff ./testdata/nfs/linked2 ./testdata/nfs/linked1
|
||||
sudo umount ./testdata/nfs/
|
||||
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
diff ./testdata/nfs/linked2 ./testdata/nfs/linked1
|
||||
[[ "`cat ./testdata/nfs/linked2`" = "BABABA" ]]
|
||||
rm ./testdata/nfs/linked2
|
||||
sudo umount ./testdata/nfs/
|
||||
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
[[ "`cat ./testdata/nfs/linked1`" = "BABABA" ]]
|
||||
format_green "hardlink ok"
|
||||
|
||||
# rm small
|
||||
ls -l ./testdata/nfs
|
||||
dd if=/dev/urandom of=./testdata/nfs/smallfile bs=100k count=1
|
||||
sudo umount ./testdata/nfs/
|
||||
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
rm ./testdata/nfs/smallfile
|
||||
if ls ./testdata/nfs | grep smallfile; then false; fi
|
||||
sudo umount ./testdata/nfs/
|
||||
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
if ls ./testdata/nfs | grep smallfile; then false; fi
|
||||
format_green "rm small ok"
|
||||
|
||||
# rename over existing
|
||||
echo ZXCVBN > ./testdata/nfs/over1
|
||||
mv ./testdata/nfs/over1 ./testdata/nfs/linked2
|
||||
sudo umount ./testdata/nfs/
|
||||
sudo mount localhost:/ ./testdata/nfs -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
|
||||
if ls ./testdata/nfs | grep over1; then false; fi
|
||||
[[ "`cat ./testdata/nfs/linked2`" = "ZXCVBN" ]]
|
||||
[[ "`cat ./testdata/nfs/linked1`" = "BABABA" ]]
|
||||
format_green "rename over existing file ok"
|
||||
|
||||
format_green OK
|
Loading…
Reference in New Issue