Compare commits


105 Commits

Author SHA1 Message Date
6261809e87 WIP Simplified NFS proxy 2022-04-17 20:19:45 +03:00
d7e64e6ea1 Prepare CLI functions for integration - return results and errors 2022-04-17 20:19:45 +03:00
98e3528a14 Add block PVC and pod examples 2022-04-17 15:43:37 +03:00
8e88f77101 Fix CSI driver volumeMode: Block support 2022-04-17 15:39:11 +03:00
caa2cc2e6c Fix 32bit build error 2022-04-16 01:48:24 +03:00
842ba8b831 Use (uint64_t)1 instead of 1l / 1ul 2022-04-16 01:48:14 +03:00
1493823f9e Note about starting monitors 2022-04-12 15:00:28 +03:00
c857272f44 Comment: epoch is uint64_t 2022-04-10 12:21:37 +03:00
340a4b4f27 Release 0.6.16
- Implement `vitastor-cli status` (print cluster status) command
- Add a new `make-osd-hybrid.js` script to quickly prepare a lot of hybrid (HDD+SSD) OSDs
- Implement snapshot deletion for Cinder driver (only works in a healthy cluster)
- Fix a huge :) bug causing reads to return all zeroes during rebalance. Add a test to prevent it in the future
- Disconnect NBD proxy correctly without leaving a zombie [vitastor-nbd] process in D state
- Fix a rare write hang appearing with small write throttling enabled
2022-04-09 01:16:52 +03:00
5118980315 Add a script to run all tests 2022-04-09 01:14:00 +03:00
d71cc174e3 Implement CLI status command 2022-04-09 00:25:51 +03:00
0eb929f1ba Fix change_pg_count test (statistic reporting may take some time) 2022-04-08 11:58:53 +03:00
83146fa3e2 Fix the same HUGE bug for regular reads during rebalance 2022-04-08 11:50:09 +03:00
15dcaf7903 Add the same "rebalance" test with regular reads 2022-04-08 11:48:31 +03:00
cd18ef7323 Disconnect NBD proxy correctly without leaving a zombie [vitastor-nbd] process in D state 2022-04-07 16:03:35 +03:00
39531ef1a6 Fix incorrect chained reads during rebalance (the bug detected by test_rebalance_verify.sh) 2022-04-07 15:56:58 +03:00
d334914948 Fix the test so it actually fails indicating a bug :-) 2022-04-07 15:56:26 +03:00
c373425562 Fix nbd log 2022-04-07 15:55:38 +03:00
3615e57879 Register standby monitors in etcd in /mon/member 2022-04-04 00:48:52 +03:00
0edc6fe5a6 Add notes about the new script 2022-04-03 13:04:34 +03:00
9c30df83e3 Fix a HUGE :) bug in NBD proxy
The bug could result in corrupted data on large writes
2022-04-03 10:42:06 +03:00
a420c77107 Add rebalance-verify test 2022-04-03 10:42:06 +03:00
4100d829c7 Allow to override log file for daemonized NBD proxy 2022-04-03 02:41:04 +03:00
79ebda933e Fix a write hang with throttling due to timer reenterability / triggerability 2022-03-28 01:42:06 +03:00
65d08e067e Add a script for preparing hybrid (HDD+SSD) OSDs 2022-03-28 01:11:26 +03:00
d289753df4 Implement snapshot deletion for Cinder driver 2022-03-08 11:41:29 +03:00
85298ddae2 Release 0.6.15
- Make peering much faster in medium to large clusters
- Fix a reenterability issue which could rarely lead to peering process hangs
2022-03-06 19:16:34 +03:00
e23296a327 Rename cli_rm -> cli_rm_data, cli_snap_rm -> cli_rm 2022-02-24 14:34:14 +03:00
839ec9e6e0 Shard clean_db by PGs to speedup listings 2022-02-20 00:21:24 +03:00
7cbfdff41a Replace some throws with force_stop 2022-02-20 00:21:19 +03:00
951272f27f Try to process PG one after another 2022-02-19 19:25:55 +03:00
a3fb1d4c98 Fix reenterability around set_timer 2022-02-19 18:28:12 +03:00
88402e6eb6 Move next_request to run_cb_and_clear 2022-02-19 16:59:03 +03:00
390239c51b Don't terminate HTTP requests with timeouts if response is already available in the socket 2022-02-19 13:37:12 +03:00
b7b2adfa32 Fix http client not continuing requests in case of failure to connect 2022-02-19 13:36:26 +03:00
36c276358b Attempt to fix "head-of-line blocking" by LIST operations 2022-02-18 01:31:45 +03:00
117d6f0612 Release 0.6.14
- Fix IPv6 address parsing
- Fix "cannot read bytes of undefined" in the monitor on a fresh DB
- Fix possible hangs of read requests on OSD restarts without immediate_commit=all mode
- Fix OSDs skipping misplaced recovery in some cases
- Fix OSDs possibly dying with "map::at" errors when other OSDs are stopped
- Fix division by zero in ls if all pool OSDs are down
2022-02-17 14:43:44 +03:00
7d79c58095 Use the larger sockaddr_storage structure 2022-02-12 11:22:56 +03:00
46d2bc100f Add some tolerance to stat calculation so it does not fail on a fresh DB 2022-02-11 16:37:16 +03:00
732e2804e9 Fix operation dependency counter underflow for reads without immediate_commit=all mode 2022-02-11 10:54:11 +03:00
abaec2008c Fix OSDs missing misplaced recovery 2022-02-11 01:00:24 +03:00
8129d238a4 Different fio versions have different types for xfer_buflen, but Vitastor anyway does not support 128-bit offsets 2022-02-10 01:21:04 +03:00
61ebed144a Fix OSDs possibly dying with "map::at" errors when other OSDs are stopped 2022-02-09 10:35:29 +03:00
9d3ba113aa Extract bind socket code into a utility function 2022-02-06 00:39:52 +03:00
9788045dc9 Fix division by zero in ls if all pool OSDs are down 2022-02-05 17:03:37 +03:00
d6b0d29af6 4k MEM_ALIGNMENT 2022-02-05 17:03:37 +03:00
36f352f06f Release 0.6.13
- Fix client hangs possible on OSD restarts (bug affected versions from 0.5.11)
- Fix "Assertion `sqe != NULL' failed" io_uring-related crashes possible
  on some kernels (0.6.11 increased probability of this bug)
- Fix timeout=0 in NBD proxy
- Fix build under centos 7
2022-02-03 01:50:30 +03:00
318cc463c2 Fix warnings 2022-02-03 01:50:30 +03:00
145e5cfb86 MCL_ONFAULT is not available under centos 7 2022-02-03 01:42:19 +03:00
73ae578981 Add osd_memlock option 2022-02-02 01:40:22 +03:00
20ee4ed758 Update some parameter docs 2022-02-01 22:46:13 +03:00
63de79d1b2 Change > to | to preserve newlines 2022-02-01 22:45:12 +03:00
f712967079 And one more sqe starvation fix 2022-02-01 02:50:16 +03:00
df0cd85352 Fix another part of the "async sqe clear" bug (followup to d9857a5340) 2022-02-01 01:14:56 +03:00
ebaf4d7a72 Fix compatibility with fio 3.28+ 2022-01-31 23:39:14 +03:00
d4bc10542c Fix compatibility with liburing >= 2.1 where it only has __pad2[2] 2022-01-31 22:49:40 +03:00
140309620a Free recv_buf in nbd_proxy 2022-01-31 20:37:58 +03:00
0a610ee943 Destroy the client after completing CLI command 2022-01-31 18:27:04 +03:00
f3ce166064 Do not print nan% in df when a pool has no available OSDs 2022-01-31 18:23:57 +03:00
717d303370 Handle get_sqe failures, don't die with "will fall out of sync" in epoll_manager
Problem is that in recent kernels io_uring may return completions BEFORE
clearing the submission queue. I.e. for example its capacity is 512, there
were 512 requests, one of them completed, so when the request completion is
processed the queue "should have" 1 free slot. But sometimes it doesn't because
io_uring doesn't always clear the submission queue before sending CQE :-/
2022-01-31 02:52:20 +03:00
d9857a5340 Check for SQEs, not for completions
Should finally fix Assertion `sqe != NULL' failed introduced after journaling
refactor in 0.6.11...
2022-01-31 02:19:10 +03:00
eb5d9153e8 Fix build under centos 7 2022-01-30 20:29:44 +03:00
ae6d1ed1d5 Remove completed items 2022-01-30 20:20:06 +03:00
d123e58ea3 Fix yaml syntax - remove ` in default 2022-01-29 02:08:48 +03:00
d9869d8116 Add parameter documentation 2022-01-28 02:45:54 +03:00
4047ca606f Add missing cancel_op(currently being read op) when stopping a client
Fixes client hangs possible after stopping & restarting an osd.
Hangs happened when a connection was closed in the middle of reading a READ
operation reply from the network. In this case the operation being read was
in read_op and the client didn't free it when closing the connection.

Test case for msgr_read.cpp:
- Partially read reply for a READ operation
- stop_client()
- Check that the READ operation returns EPIPE

The bug was actually introduced in 0.5.11.
2022-01-28 01:53:52 +03:00
218e294e9c > 0, of course 2022-01-24 13:36:09 +03:00
c1929cabe0 Release 0.6.12
etcd connection stability, clang & elbrus support

- Fix build under CLang and Elbrus LCC compilers, making Vitastor compatible
  with Elbrus CPUs :)
- Completely fix the bug where OSDs didn't connect to peers and incorrectly marked
  PGs as incomplete
- Limit I/O depth for deletes the same way as for small writes. Makes OSD crashes
  with "Assertion failed: sqe != NULL" during image deletion go away
- Fix a very old, but rare, journaling bug (credits to https://github.com/mirrorll)
- Fix flushing of unclean journaled objects leading to OSDs sometimes hanging
  after failover in EC setups (bug was introduced in 0.6.7)
- Fix several problems that could prevent smooth operation of a Vitastor cluster
  under the condition of partial etcd failure:
  - OSDs could randomly fail due to too strict error handling
  - New clients and OSDs could be unable to start because of the lack of retries
  - CLI could fail some commands because of the lack of retries
  - Monitor could stop receiving state updates because of the lack of websocket pings
- Fix monitor being unable to rebalance PGs after a downscale of pool pg_size (3->2)
- Exit with failure when trying to nbd map or benchmark a non-existing image
- Use HTTP keep-alive for etcd connections
- Allow to configure etcd request timeouts and retries
- Allow to configure NBD timeout, max devices and partitions, and set default to
  up to 64 devices with up to 3 partitions each
2022-01-24 01:15:25 +03:00
cc6b24e03a Allow to configure NBD timeout, max devices and partitions
Also set default NBD devices/partitions to 64/3, Linux default is 16/16 which is way too low
2022-01-24 01:15:19 +03:00
0757ba630a Do not happily NBD "map" non-existing images, do not try to benchmark them too 2022-01-23 23:03:42 +03:00
2a0b881685 Respect max_write_iodepth for deletes 2022-01-23 22:05:23 +03:00
9a15b843ff Do not set pg_real_size to 0 2022-01-23 20:15:04 +03:00
8dc1ffb13b Try to connect with PG peers before deciding it's incomplete :)
I already attempted to fix it in 0.6.11, but it turned out that the fix was
only partial :)
2022-01-23 19:19:26 +03:00
ba63af49b4 Add etcd retries everywhere (they were missing in some places) 2022-01-23 17:21:48 +03:00
31b9c683ee Fix flushing of unclean objects
This was preventing OSD failover when there were some unclean objects.
Bug was introduced in aa436027c8
2022-01-23 00:45:11 +03:00
3abcac058f Check for double response_callback call more 2022-01-23 00:26:20 +03:00
e01c4db702 Add paranoic if()s to prevent accidental double free of etcd_watch_ws 2022-01-23 00:16:09 +03:00
a5cf06acd0 Remove etcd timeout and keepalive interval hardcode 2022-01-23 00:00:00 +03:00
9c3653b1e1 Handle EINTR 2022-01-22 23:59:37 +03:00
23e578b6a2 Fix common.sh 2022-01-21 01:51:25 +03:00
7920414bee Fix build under older gcc (debian buster) 2022-01-20 10:34:52 +03:00
098e369a3b Fix rand initialization, add etcd connection/disconnection logging 2022-01-20 00:45:49 +03:00
a43ef525a2 Remove two last end()s from http_client (should have been removed in the keepalive patch) 2022-01-20 00:44:18 +03:00
8a6b07d8f7 Add a 2/5 etcd failure test 2022-01-20 00:43:22 +03:00
Vitaliy Filippov
2c930d55fb Merge pull request #41 from promobit-bitblaze/1-small-fix
#1 fix deps
2022-01-18 11:19:08 +03:00
d798e0821e #1 fix deps 2022-01-18 13:30:53 +06:00
e591a3e9f7 Include sys/stat.h in messenger.cpp
No idea why, but it builds without it on x86 and does not build on e2k
2022-01-17 13:43:29 +03:00
77cc18420a Fix leaks detected by clang scan-build (only 1 of 4 may be important though) 2022-01-16 00:11:59 +03:00
7bdd92ca4f Fix build under clang and some warnings
Build problems fixed:
- void* pointer arithmetic which is a GNU extension (works as byte*)
- "variable size object may not be initialized" which is OK under GCC
- nullptr_t related error in json11 (it lacks 'operator <' in clang)

Warnings fixed:
- empty nested struct initializer { 0 } replaced by {}
- removed several unused lambda captures
2022-01-16 00:02:54 +03:00
8f64fc61e7 Ignore empty events in mon 2022-01-08 11:41:00 +03:00
4a9f001d9e Make mon also ping etcd websockets regularly 2022-01-05 17:28:51 +03:00
8c908316d9 Add a test with an OSD being added 2022-01-05 17:06:24 +03:00
515a2e6e33 Only die when detecting a real race condition, not just a CAS failure 2022-01-05 17:05:25 +03:00
68b6763ebe Add asserts for lp-optimizer tests, pass ordered from the monitor 2022-01-03 20:37:07 +03:00
9c6168bf17 Remove fill_parsed_response 2022-01-03 20:08:26 +03:00
08e467270a Fix pg_size changing from 3 to 2 2022-01-03 17:56:54 +03:00
5473d5b4a2 Rework HTTP client to use keepalive, move getifaddr_list to addr_util 2022-01-03 14:52:01 +03:00
Vitaliy Filippov
c3304bce27 Merge pull request #38 from mirrorll/master
journal check_available error
2021-12-31 12:45:16 +03:00
ec2852c598 Add minsize_1 test 2021-12-28 10:54:36 +03:00
b9f5c2a823 Support zero-copy send in fio_sec_osd to allow testing it
Preliminary results:
- CPU usage drops significantly. For example, in T1Q8 128K write test against
  stub_uring_osd with 10G network and Athlon X4 860k CPU it drops from 100% to 30%
- Latency becomes slightly worse. In T1Q1 4K write test in the same environment
  latency increases from 56 to 63 us.
- Small write throughput also becomes slightly worse. In T1Q128 4K write test
  against stub, iops decreases from 138k to ~110k (unstable, fluctuates 100k..120k).
  Note that this is without io_uring, of course.
2021-12-27 02:12:44 +03:00
e9d2f79aa7 Support reading bitmaps in fio_sec_osd 2021-12-27 02:12:44 +03:00
0785bdf8b3 Release 0.6.11
- Slightly reduce journaling write amplification (requires no_same_sector_overwrites=false)
- Fix listen_backlog (it was 0) because it could more than halve OSD socket send speed
- Support IPv6 OSD addresses
- Do not try to initialize client in simple-offsets
- Fix OSDs sometimes marking PGs incomplete instead of trying to connect with peers
- Allow to configure OSD placement in node_placement
- Allow to run with 4k sector size block devices. Natural, but it was forbidden
2021-12-26 21:11:24 +03:00
b57e44748b Send 4 byte bitmap in stub_uring_osd 2021-12-25 11:38:13 +03:00
1bbe62f29c Fix uninitialized listen_backlog which was leading to REALLY SLOW send speeds!!! 2021-12-25 11:38:13 +03:00
lihai
3061c30132 journal check_available error 2021-12-21 09:39:58 +08:00
136 changed files with 6923 additions and 2271 deletions

.gitmodules vendored

@@ -4,3 +4,6 @@
 [submodule "json11"]
 	path = json11
 	url = ../json11.git
+[submodule "libnfs"]
+	path = libnfs
+	url = ../libnfs.git


@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8)
 project(vitastor)
-set(VERSION "0.6.10")
+set(VERSION "0.6.16")
 add_subdirectory(src)


@@ -55,11 +55,11 @@ Vitastor на данный момент находится в статусе п
## Планы развития
- Поддержка удаления снапшотов (слияния слоёв)
- Более корректные скрипты разметки дисков и автоматического запуска OSD
- Другие инструменты администрирования
- Плагины для OpenNebula и других облачных систем
- iSCSI-прокси
- Упрощённый NFS прокси
- Более быстрое переключение при отказах
- Фоновая проверка целостности без контрольных сумм (сверка реплик)
- Контрольные суммы
@@ -407,6 +407,7 @@ Vitastor с однопоточной NBD прокси на том же стен
 - На хостах мониторов:
   - Пропишите нужные вам значения в файле `/usr/lib/vitastor/mon/make-units.sh`
   - Создайте юниты systemd для etcd и мониторов: `/usr/lib/vitastor/mon/make-units.sh`
+  - Запустите etcd и мониторы: `systemctl start etcd vitastor-mon`
 - Пропишите etcd_address и osd_network в `/etc/vitastor/vitastor.conf`. Например:
 ```
 {
@@ -414,7 +415,14 @@ Vitastor с однопоточной NBD прокси на том же стен
   "osd_network": "10.200.1.0/24"
 }
 ```
-- Создайте юниты systemd для OSD: `/usr/lib/vitastor/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
+- Инициализуйте OSD:
+  - SSD: `/usr/lib/vitastor/mon/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
+  - Гибридные, HDD+SSD: `/usr/lib/vitastor/mon/make-osd-hybrid.js /dev/sda /dev/sdb ...` - передайте
+    все ваши SSD и HDD скрипту в командной строке подряд, скрипт автоматически выделит разделы под
+    журналы на SSD и данные на HDD. Скрипт пропускает HDD, на которых уже есть разделы
+    или вообще какие-то данные, поэтому если диски непустые, сначала очистите их с помощью
+    `wipefs -a`. SSD с таблицей разделов не пропускаются, но так как скрипт создаёт новые разделы
+    для журналов, на SSD должно быть доступно свободное нераспределённое место.
 - Вы можете менять параметры OSD в юнитах systemd или в `vitastor.conf`. Смысл некоторых параметров:
   - `disable_data_fsync 1` - отключает fsync, используется с SSD с конденсаторами.
   - `immediate_commit all` - используется с SSD с конденсаторами.
@@ -430,7 +438,6 @@ Vitastor с однопоточной NBD прокси на том же стен
   диски, используемые на одном из тестовых стендов - Intel D3-S4510 - очень сильно не любят такую
   перезапись, и для них была добавлена эта опция. Когда данный режим включён, также нужно поднимать
   значение `journal_sector_buffer_count`, так как иначе Vitastor не хватит буферов для записи в журнал.
-- Запустите все etcd: `systemctl start etcd`
 - Создайте глобальную конфигурацию в etcd: `etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`
   (если все ваши диски - серверные с конденсаторами).
 - Создайте пулы: `etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'`.


@@ -49,11 +49,11 @@ breaking changes in the future. However, the following is implemented:
## Roadmap
- Snapshot deletion (layer merge) support
- Better OSD creation and auto-start tools
- Other administrative tools
- Plugins for OpenNebula, Proxmox and other cloud systems
- Plugins for OpenNebula and other cloud systems
- iSCSI proxy
- Simplified NFS proxy
- Faster failover
- Scrubbing without checksums (verification of replicas)
- Checksums
@@ -360,6 +360,7 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
 - On the monitor hosts:
   - Edit variables at the top of `/usr/lib/vitastor/mon/make-units.sh` to desired values.
   - Create systemd units for the monitor and etcd: `/usr/lib/vitastor/mon/make-units.sh`
+  - Start etcd and monitors: `systemctl start etcd vitastor-mon`
 - Put etcd_address and osd_network into `/etc/vitastor/vitastor.conf`. Example:
 ```
 {
@@ -367,7 +368,13 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
   "osd_network": "10.200.1.0/24"
 }
 ```
-- Create systemd units for your OSDs: `/usr/lib/vitastor/mon/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
+- Initialize OSDs:
+  - Simplest, SSD-only: `/usr/lib/vitastor/mon/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
+  - Hybrid, HDD+SSD: `/usr/lib/vitastor/mon/make-osd-hybrid.js /dev/sda /dev/sdb ...` - pass all your
+    devices (HDD and SSD) to this script - it will partition disks and initialize journals on its own.
+    This script skips HDDs which are already partitioned so if you want to use non-empty disks for
+    Vitastor you should first wipe them with `wipefs -a`. SSDs with GPT partition table are not skipped,
+    but some free unpartitioned space must be available because the script creates new partitions for journals.
 - You can change OSD configuration in units or in `vitastor.conf`. Notable configuration variables:
   - `disable_data_fsync 1` - only safe with server-grade drives with capacitors.
   - `immediate_commit all` - use this if all your drives are server-grade.
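Put together, the bring-up described above amounts to roughly the following shell session (a sketch using only the commands named in this README; device paths and addresses are placeholders):

```
# on each monitor host, after editing its variables
/usr/lib/vitastor/mon/make-units.sh
systemctl start etcd vitastor-mon

# on every host: describe the cluster
cat > /etc/vitastor/vitastor.conf <<'EOF'
{
  "etcd_address": "10.200.1.10:2379/v3",
  "osd_network": "10.200.1.0/24"
}
EOF

# SSD-only OSDs, one partition per OSD
/usr/lib/vitastor/mon/make-osd.sh /dev/disk/by-partuuid/XXX

# or hybrid HDD+SSD OSDs: pass all drives at once
/usr/lib/vitastor/mon/make-osd-hybrid.js /dev/sda /dev/sdb /dev/sdc
```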


@@ -1,4 +1,4 @@
-VERSION ?= v0.6.10
+VERSION ?= v0.6.16
 all: build push


@@ -49,7 +49,7 @@ spec:
 capabilities:
 add: ["SYS_ADMIN"]
 allowPrivilegeEscalation: true
-image: vitalif/vitastor-csi:v0.6.10
+image: vitalif/vitastor-csi:v0.6.16
 args:
 - "--node=$(NODE_ID)"
 - "--endpoint=$(CSI_ENDPOINT)"


@@ -116,7 +116,7 @@ spec:
 privileged: true
 capabilities:
 add: ["SYS_ADMIN"]
-image: vitalif/vitastor-csi:v0.6.10
+image: vitalif/vitastor-csi:v0.6.16
 args:
 - "--node=$(NODE_ID)"
 - "--endpoint=$(CSI_ENDPOINT)"


@@ -0,0 +1,13 @@
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: test-vitastor-pvc-block
spec:
storageClassName: vitastor
volumeMode: Block
accessModes:
- ReadWriteMany
resources:
requests:
storage: 10Gi


@@ -0,0 +1,17 @@
apiVersion: v1
kind: Pod
metadata:
name: vitastor-test-block-pvc
namespace: default
spec:
containers:
- name: vitastor-test-block-pvc
image: nginx
volumeDevices:
- name: data
devicePath: /dev/xvda
volumes:
- name: data
persistentVolumeClaim:
claimName: test-vitastor-pvc-block
readOnly: false
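To exercise the two block-mode manifests above, something like the following should work (a sketch; the manifest file names are hypothetical):

```
kubectl apply -f test-vitastor-pvc-block.yaml   # the PVC above
kubectl apply -f test-vitastor-pod-block.yaml   # the pod above
# the volume should show up as a raw block device, not a mounted filesystem
kubectl exec vitastor-test-block-pvc -- ls -l /dev/xvda
```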


@@ -0,0 +1,17 @@
apiVersion: v1
kind: Pod
metadata:
name: vitastor-test-nginx
namespace: default
spec:
containers:
- name: vitastor-test-nginx
image: nginx
volumeMounts:
- mountPath: /usr/share/nginx/html/s3
name: data
volumes:
- name: data
persistentVolumeClaim:
claimName: test-vitastor-pvc
readOnly: false


@@ -5,7 +5,7 @@ package vitastor
 const (
 	vitastorCSIDriverName    = "csi.vitastor.io"
-	vitastorCSIDriverVersion = "0.6.10"
+	vitastorCSIDriverVersion = "0.6.16"
 )
 // Config struct fills the parameters of request or user input
// Config struct fills the parameters of request or user input


@@ -67,29 +67,44 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
 	klog.Infof("received node publish volume request %+v", protosanitizer.StripSecrets(req))
 	targetPath := req.GetTargetPath()
+	isBlock := req.GetVolumeCapability().GetBlock() != nil
 	// Check that it's not already mounted
-	free, error := mount.IsNotMountPoint(ns.mounter, targetPath)
+	_, error := mount.IsNotMountPoint(ns.mounter, targetPath)
 	if (error != nil)
 	{
 		if (os.IsNotExist(error))
 		{
-			error := os.MkdirAll(targetPath, 0777)
-			if (error != nil)
+			if (isBlock)
 			{
-				return nil, status.Error(codes.Internal, error.Error())
+				pathFile, err := os.OpenFile(targetPath, os.O_CREATE|os.O_RDWR, 0o600)
+				if (err != nil)
+				{
+					klog.Errorf("failed to create block device mount target %s with error: %v", targetPath, err)
+					return nil, status.Error(codes.Internal, err.Error())
+				}
+				err = pathFile.Close()
+				if (err != nil)
+				{
+					klog.Errorf("failed to close %s with error: %v", targetPath, err)
+					return nil, status.Error(codes.Internal, err.Error())
+				}
 			}
-			free = true
+			else
+			{
+				err := os.MkdirAll(targetPath, 0777)
+				if (err != nil)
+				{
+					klog.Errorf("failed to create fs mount target %s with error: %v", targetPath, err)
+					return nil, status.Error(codes.Internal, err.Error())
+				}
+			}
 		}
 		else
 		{
 			return nil, status.Error(codes.Internal, error.Error())
 		}
 	}
-	if (!free)
-	{
-		return &csi.NodePublishVolumeResponse{}, nil
-	}
 	ctxVars := make(map[string]string)
 	err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
@@ -149,7 +164,6 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
 	// Format the device (ext4 or xfs)
 	fsType := req.GetVolumeCapability().GetMount().GetFsType()
-	isBlock := req.GetVolumeCapability().GetBlock() != nil
 	opt := req.GetVolumeCapability().GetMount().GetMountFlags()
 	opt = append(opt, "_netdev")
 	if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
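The reason for the `os.OpenFile` branch in the hunk above: with `volumeMode: Block`, CSI publishes the volume by bind-mounting the device node onto a plain file, and a bind-mount target must already exist with the matching type. Roughly the equivalent at the shell level (a sketch with placeholder paths):

```
# filesystem volume: the target is a directory
mkdir -p /target/fs-volume
mount /dev/nbd0 /target/fs-volume

# block volume: the target must be a regular file, created beforehand
touch /target/block-volume
mount --bind /dev/nbd0 /target/block-volume
```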

debian/changelog vendored

@@ -1,4 +1,4 @@
-vitastor (0.6.10-1) unstable; urgency=medium
+vitastor (0.6.16-1) unstable; urgency=medium
   * RDMA support
   * Bugfixes


@@ -33,8 +33,8 @@ RUN set -e -x; \
 mkdir -p /root/packages/vitastor-$REL; \
 rm -rf /root/packages/vitastor-$REL/*; \
 cd /root/packages/vitastor-$REL; \
-cp -r /root/vitastor vitastor-0.6.10; \
-cd vitastor-0.6.10; \
+cp -r /root/vitastor vitastor-0.6.16; \
+cd vitastor-0.6.16; \
 ln -s /root/fio-build/fio-*/ ./fio; \
 FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
 ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -47,8 +47,8 @@ RUN set -e -x; \
 rm -rf a b; \
 echo "dep:fio=$FIO" > debian/fio_version; \
 cd /root/packages/vitastor-$REL; \
-tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.10.orig.tar.xz vitastor-0.6.10; \
-cd vitastor-0.6.10; \
+tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.16.orig.tar.xz vitastor-0.6.16; \
+cd vitastor-0.6.16; \
 V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
 DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
 DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

docs/params/common.yml (new file)

@@ -0,0 +1,35 @@
- name: config_path
type: string
default: "/etc/vitastor/vitastor.conf"
info: |
Path to the JSON configuration file. Configuration file is optional,
a non-existing configuration file does not prevent Vitastor from
running if required parameters are specified.
info_ru: |
Путь к файлу конфигурации в формате JSON. Файл конфигурации необязателен,
без него Vitastor тоже будет работать, если переданы необходимые параметры.
- name: etcd_address
type: string or array of strings
type_ru: строка или массив строк
info: |
etcd connection endpoint(s). Multiple endpoints may be delimited by "," or
specified in a JSON array `["10.0.115.10:2379/v3","10.0.115.11:2379/v3"]`.
Note that https is not supported for etcd connections yet.
info_ru: |
Адрес(а) подключения к etcd. Несколько адресов могут разделяться запятой
или указываться в виде JSON-массива `["10.0.115.10:2379/v3","10.0.115.11:2379/v3"]`.
- name: etcd_prefix
type: string
default: "/vitastor"
info: |
Prefix for all keys in etcd used by Vitastor. You can change prefix and, for
example, use a single etcd cluster for multiple Vitastor clusters.
info_ru: |
Префикс для ключей etcd, которые использует Vitastor. Вы можете задать другой
префикс, например, чтобы запустить несколько кластеров Vitastor с одним
кластером etcd.
- name: log_level
type: int
default: 0
info: Log level. Raise if you want more verbose output.
info_ru: Уровень логгирования. Повысьте, если хотите более подробный вывод.
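For example, a minimal `/etc/vitastor/vitastor.conf` combining the parameters above could look like this (a sketch; the endpoints are placeholders):

```
cat > /etc/vitastor/vitastor.conf <<'EOF'
{
  "etcd_address": ["10.0.115.10:2379/v3", "10.0.115.11:2379/v3"],
  "etcd_prefix": "/vitastor",
  "log_level": 1
}
EOF
```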


@@ -0,0 +1,200 @@
- name: block_size
type: int
default: 131072
info: |
Size of objects (data blocks) into which all physical and virtual drives are
subdivided in Vitastor. One of current main settings in Vitastor, affects
memory usage, write amplification and I/O load distribution effectiveness.
Recommended default block size is 128 KB for SSD and 4 MB for HDD. In fact,
it's possible to use 4 MB for SSD too - it will lower memory usage, but
may increase average WA and reduce linear performance.
OSDs with different block sizes (for example, SSD and SSD+HDD OSDs) can
currently coexist in one etcd instance only within separate Vitastor
clusters with different etcd_prefix'es.
Also block size can't be changed after OSD initialization without losing
data.
You must always specify block_size in etcd in /vitastor/config/global if
you change it so all clients can know about it.
OSD memory usage is roughly (SIZE / BLOCK * 68 bytes) which is roughly
544 MB per 1 TB of used disk space with the default 128 KB block size.
info_ru: |
Размер объектов (блоков данных), на которые делятся физические и виртуальные
диски в Vitastor. Одна из ключевых на данный момент настроек, влияет на
потребление памяти, объём избыточной записи (write amplification) и
эффективность распределения нагрузки по OSD.
Рекомендуемые по умолчанию размеры блока - 128 килобайт для SSD и 4
мегабайта для HDD. В принципе, для SSD можно тоже использовать 4 мегабайта,
это понизит использование памяти, но ухудшит распределение нагрузки и в
среднем увеличит WA.
OSD с разными размерами блока (например, SSD и SSD+HDD OSD) на данный
момент могут сосуществовать в рамках одного etcd только в виде двух независимых
кластеров Vitastor с разными etcd_prefix.
Также размер блока нельзя менять после инициализации OSD без потери данных.
Если вы меняете размер блока, обязательно прописывайте его в etcd в
/vitastor/config/global, дабы все клиенты его знали.
Потребление памяти OSD составляет примерно (РАЗМЕР / БЛОК * 68 байт),
т.е. примерно 544 МБ памяти на 1 ТБ занятого места на диске при
стандартном 128 КБ блоке.
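The memory estimate above, spelled out for the defaults:

```
# objects per 1 TB of used space at the default 128 KB block size
echo $(( 1024**4 / (128*1024) ))                 # 8388608 objects
# at ~68 bytes of OSD metadata per object, in MB:
echo $(( 1024**4 / (128*1024) * 68 / 1024**2 ))  # ~544 MB per 1 TB
```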
- name: bitmap_granularity
type: int
default: 4096
info: |
Required virtual disk write alignment ("sector size"). Must be a multiple
of disk_alignment. It's called bitmap granularity because Vitastor tracks
an allocation bitmap for each object containing 2 bits per each
(bitmap_granularity) bytes.
This parameter can't be changed after OSD initialization without losing
data. Also it's fixed for the whole Vitastor cluster i.e. two different
values can't be used in a single Vitastor cluster.
Clients MUST be aware of this parameter value, so put it into etcd key
/vitastor/config/global if you change it for any reason.
info_ru: |
Требуемое выравнивание записи на виртуальные диски (размер их "сектора").
Должен быть кратен disk_alignment. Называется гранулярностью битовой карты
потому, что Vitastor хранит битовую карту для каждого объекта, содержащую
по 2 бита на каждые (bitmap_granularity) байт.
Данный параметр нельзя менять после инициализации OSD без потери данных.
Также он фиксирован для всего кластера Vitastor, т.е. разные значения
не могут сосуществовать в одном кластере.
Клиенты ДОЛЖНЫ знать правильное значение этого параметра, так что если вы
его меняете, обязательно прописывайте изменённое значение в etcd в ключ
/vitastor/config/global.
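For scale: with the default 128 KB block_size and 4096-byte bitmap_granularity, the allocation bitmap costs only a few bytes per object:

```
# 2 bits per 4096-byte unit, 32 units per 128 KB object
echo $(( 128*1024 / 4096 * 2 / 8 ))   # 8 bytes of bitmap per object
```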
- name: immediate_commit
type: string
default: false
info: |
Another parameter which is really important for performance.
Desktop SSDs are very fast (100000+ iops) for simple random writes
without cache flush. However, they are really slow (only around 1000 iops)
if you try to fsync() each write, that is, when you want to guarantee that
each change gets immediately persisted to the physical media.
Server-grade SSDs with "Advanced/Enhanced Power Loss Protection" or with
"Supercapacitor-based Power Loss Protection", on the other hand, are equally
fast with and without fsync because their cache is protected from sudden
power loss by a built-in supercapacitor-based "UPS".
Some software-defined storage systems always fsync each write and thus are
really slow when used with desktop SSDs. Vitastor, however, can also
efficiently utilize desktop SSDs by postponing fsync until the client calls
it explicitly.
This is what this parameter regulates. When it's set to "all" the whole
Vitastor cluster commits each change to disks immediately and clients just
ignore fsyncs because they know for sure that they're unneeded. This reduces
the amount of network roundtrips performed by clients and improves
performance. So it's always better to use server grade SSDs with
supercapacitors even with Vitastor, especially given that they cost only
a bit more than desktop models.
There is also a common SATA SSD (and HDD too!) firmware bug (or feature)
that makes server SSDs which have supercapacitors slow with fsync. To check
if your SSDs are affected, compare benchmark results from `fio -name=test
-ioengine=libaio -direct=1 -bs=4k -rw=randwrite -iodepth=1` with and without
`-fsync=1`. Results should be the same. If fsync=1 result is worse you can
try to work around this bug by "disabling" drive write-back cache by running
`hdparm -W 0 /dev/sdXX` or `echo write through > /sys/block/sdXX/device/scsi_disk/*/cache_type`
(IMPORTANT: don't mistake it with `/sys/block/sdXX/queue/write_cache` - it's
unsafe to change by hand). The same may apply to newer HDDs with internal
SSD cache or "media-cache" - for example, a lot of Seagate EXOS drives have
it (they have internal SSD cache even though it's not stated in datasheets).
This parameter must be set both in etcd in /vitastor/config/global and in
OSD command line or configuration. Setting it to "all" or "small" requires
enabling disable_journal_fsync and disable_meta_fsync, setting it to "all"
also requires enabling disable_data_fsync.
TLDR: For optimal performance, set immediate_commit to "all" if you only use
SSDs with supercapacitor-based power loss protection (nonvolatile
write-through cache) for both data and journals in the whole Vitastor
cluster. Set it to "small" if you only use such SSDs for journals. Leave
empty if your drives have write-back cache.
info_ru: |
Ещё один важный для производительности параметр.
Модели SSD для настольных компьютеров очень быстрые (100000+ операций в
секунду) при простой случайной записи без сбросов кэша. Однако они очень
медленные (всего порядка 1000 iops), если вы пытаетесь сбрасывать кэш после
каждой записи, то есть, если вы пытаетесь гарантировать, что каждое
изменение физически записывается в энергонезависимую память.
С другой стороны, серверные SSD с конденсаторами - функцией, называемой
"Advanced/Enhanced Power Loss Protection" или просто "Supercapacitor-based
Power Loss Protection" - одинаково быстрые и со сбросом кэша, и без
него, потому что их кэш защищён от потери питания встроенным "источником
бесперебойного питания" на основе суперконденсаторов и на самом деле они
его никогда не сбрасывают.
Некоторые программные СХД всегда сбрасывают кэши дисков при каждой записи
и поэтому работают очень медленно с настольными SSD. Vitastor, однако, может
откладывать fsync до явного его вызова со стороны клиента и таким образом
эффективно утилизировать настольные SSD.
Данный параметр влияет как раз на это. Когда он установлен в значение "all",
весь кластер Vitastor мгновенно фиксирует каждое изменение на физические
носители и клиенты могут просто игнорировать запросы fsync, т.к. они точно
знают, что fsync-и не нужны. Это уменьшает число необходимых обращений к OSD
по сети и улучшает производительность. Поэтому даже с Vitastor лучше всегда
использовать только серверные модели SSD с суперконденсаторами, особенно
учитывая то, что стоят они ненамного дороже настольных.
Также в прошивках SATA SSD (и даже HDD!) очень часто встречается либо баг,
либо просто особенность логики, из-за которой серверные SSD, имеющие
конденсаторы и защиту от потери питания, всё равно медленно работают с
fsync. Чтобы понять, подвержены ли этой проблеме ваши SSD, сравните
результаты тестов `fio -name=test -ioengine=libaio -direct=1 -bs=4k
-rw=randwrite -iodepth=1` без и с опцией `-fsync=1`. Результаты должны
быть одинаковые. Если результат с `fsync=1` хуже, вы можете попробовать
обойти проблему, "отключив" кэш записи диска командой `hdparm -W 0 /dev/sdXX`
либо `echo write through > /sys/block/sdXX/device/scsi_disk/*/cache_type`
(ВАЖНО: не перепутайте с `/sys/block/sdXX/queue/write_cache` - этот параметр
менять руками небезопасно). Такая же проблема может встречаться и в новых
HDD-дисках с внутренним SSD или "медиа" кэшем - например, она встречается во
многих дисках Seagate EXOS (у них есть внутренний SSD-кэш, хотя это и не
указано в спецификациях).
Данный параметр нужно указывать и в etcd в /vitastor/config/global, и в
командной строке или конфигурации OSD. Значения "all" и "small" требуют
включения disable_journal_fsync и disable_meta_fsync, значение "all" также
требует включения disable_data_fsync.
Итого, вкратце: для оптимальной производительности установите
immediate_commit в значение "all", если вы используете в кластере только SSD
с суперконденсаторами и для данных, и для журналов. Если вы используете
такие SSD для всех журналов, но не для данных - можете установить параметр
в "small". Если и какие-то из дисков журналов имеют волатильный кэш записи -
оставьте параметр пустым.
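The benchmark comparison described above, written out as two commands (a sketch: `-runtime` and `-filename` are added here for completeness, and the test is destructive, so point it only at an empty test drive; /dev/sdX is a placeholder):

```
fio -name=test -ioengine=libaio -direct=1 -bs=4k -rw=randwrite -iodepth=1 -runtime=10 -filename=/dev/sdX
fio -name=test -ioengine=libaio -direct=1 -bs=4k -rw=randwrite -iodepth=1 -runtime=10 -fsync=1 -filename=/dev/sdX
# if the second run is much slower, try "disabling" the volatile cache:
hdparm -W 0 /dev/sdX
```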
- name: client_dirty_limit
type: int
default: 33554432
info: |
Without immediate_commit=all this parameter sets the limit of "dirty"
(not committed by fsync) data allowed by the client before forcing an
additional fsync and committing the data. Also note that the client always
holds a copy of uncommitted data in memory so this setting also affects
RAM usage of clients.
This parameter doesn't affect OSDs themselves.
info_ru: |
При работе без immediate_commit=all - это лимит объёма "грязных" (не
зафиксированных fsync-ом) данных, при достижении которого клиент будет
принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
что в этом случае до момента fsync клиент хранит копию незафиксированных
данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
Параметр не влияет на сами OSD.
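The cluster-wide parameters from this file that all clients must agree on can be published together, following the etcdctl command format shown in the README diff above (a sketch; the endpoint is a placeholder):

```
etcdctl --endpoints=http://10.0.115.10:2379 put /vitastor/config/global \
  '{"block_size":131072,"bitmap_granularity":4096,"immediate_commit":"all"}'
```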

docs/params/layout-osd.yml (new file)

@@ -0,0 +1,205 @@
- name: data_device
type: string
info: |
Path to the block device to use for data. It's highly recommended to use
stable paths for all device names: `/dev/disk/by-partuuid/xxx...` instead
of just `/dev/sda` or `/dev/nvme0n1` to not mess up after server restart.
Files can also be used instead of block devices, but this is implemented
only for testing purposes and not for production.
info_ru: |
Путь к диску (блочному устройству) для хранения данных. Крайне рекомендуется
использовать стабильные пути: `/dev/disk/by-partuuid/xxx...` вместо простых
`/dev/sda` или `/dev/nvme0n1`, чтобы пути не могли спутаться после
перезагрузки сервера. Также вместо блочных устройств можно указывать файлы,
но это реализовано только для тестирования, а не для боевой среды.
- name: meta_device
type: string
info: |
Path to the block device to use for the metadata. Metadata must be on a fast
SSD or performance will suffer. If this option is skipped, `data_device` is
used for the metadata.
info_ru: |
Путь к диску метаданных. Метаданные должны располагаться на быстром
SSD-диске, иначе производительность пострадает. Если эта опция не указана,
для метаданных используется `data_device`.
- name: journal_device
type: string
info: |
Path to the block device to use for the journal. Journal must be on a fast
SSD or performance will suffer. If this option is skipped, `meta_device` is
used for the journal, and if it's also empty, journal is put on
`data_device`. It's almost always fine to put metadata and journal on the
same device, in this case you only need to set `meta_device`.
info_ru: |
Путь к диску журнала. Журнал должен располагаться на быстром SSD-диске,
иначе производительность пострадает. Если эта опция не указана,
для журнала используется `meta_device`, если же пуста и она, журнал
располагается на `data_device`. Нормально располагать журнал и метаданные
на одном устройстве, в этом случае достаточно указать только `meta_device`.
- name: journal_offset
type: int
default: 0
info: Offset on the device in bytes where the journal is stored.
info_ru: Смещение на устройстве в байтах, по которому располагается журнал.
- name: journal_size
type: int
info: |
Journal size in bytes. Doesn't have to be large, 16-32 MB is usually fine.
By default, the whole journal device will be used for the journal. You must
set it to some value manually (or use make-osd.sh) if you colocate the
journal with data or metadata.
info_ru: |
Размер журнала в байтах. Большим быть не обязан, 16-32 МБ обычно достаточно.
По умолчанию для журнала используется всё устройство журнала. Если же вы
размещаете журнал на устройстве данных или метаданных, то вы должны
установить эту опцию в какое-то значение сами (или использовать скрипт
make-osd.sh).
- name: meta_offset
type: int
default: 0
info: |
Offset on the device in bytes where the metadata area is stored.
Again, set it to something if you colocate metadata with journal or data.
info_ru: |
Смещение на устройстве в байтах, по которому располагаются метаданные.
Эту опцию нужно задать, если метаданные у вас хранятся на том же
устройстве, что данные или журнал.
- name: data_offset
type: int
default: 0
info: |
Offset on the device in bytes where the data area is stored.
Again, set it to something if you colocate data with journal or metadata.
info_ru: |
Смещение на устройстве в байтах, по которому располагаются данные.
Эту опцию нужно задать, если данные у вас хранятся на том же
устройстве, что метаданные или журнал.
- name: data_size
type: int
info: |
Data area size in bytes. By default, the whole data device up to the end
will be used for the data area, but you can restrict it if you want to use
a smaller part. Note that there is no option to set metadata area size -
it's derived from the data area size.
info_ru: |
Размер области данных в байтах. По умолчанию под данные будет использована
вся доступная область устройства данных до конца устройства, но вы можете
использовать эту опцию, чтобы ограничить её меньшим размером. Заметьте, что
опции размера области метаданных нет - она вычисляется из размера области
данных автоматически.
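A sketch of how these offsets relate when journal, metadata and data are colocated on one device (the sizes here are illustrative assumptions; normally make-osd.sh calculates them for you):

```
journal_offset=0
journal_size=$(( 32 * 1024 * 1024 ))     # 32 MB journal at the start of the device
meta_offset=$(( journal_offset + journal_size ))
meta_reserved=$(( 256 * 1024 * 1024 ))   # room left for metadata (its size is derived from the data area)
data_offset=$(( meta_offset + meta_reserved ))
echo "journal_offset=$journal_offset meta_offset=$meta_offset data_offset=$data_offset"
```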
- name: meta_block_size
type: int
default: 4096
info: |
Physical block size of the metadata device. 4096 for most current
HDDs and SSDs.
info_ru: |
Размер физического блока устройства метаданных. 4096 для большинства
современных SSD и HDD.
- name: journal_block_size
type: int
default: 4096
info: |
Physical block size of the journal device. Must be a multiple of
`disk_alignment`. 4096 for most current HDDs and SSDs.
info_ru: |
Размер физического блока устройства журнала. Должен быть кратен
`disk_alignment`. 4096 для большинства современных SSD и HDD.
- name: disable_data_fsync
type: bool
default: false
info: |
Do not issue fsyncs to the data device, i.e. do not flush its cache.
Safe ONLY if your data device has write-through cache. If you disable
the cache yourself using `hdparm` or `scsi_disk/cache_type` then make sure
that the cache disable command is run every time before starting Vitastor
OSD, for example, in the systemd unit. See also `immediate_commit` option
for the instructions to disable cache and how to benefit from it.
info_ru: |
Не отправлять fsync-и устройству данных, т.е. не сбрасывать его кэш.
Безопасно, ТОЛЬКО если ваше устройство данных имеет кэш со сквозной
записью (write-through). Если вы отключаете кэш через `hdparm` или
`scsi_disk/cache_type`, то удостоверьтесь, что команда отключения кэша
выполняется перед каждым запуском Vitastor OSD, например, в systemd unit-е.
Смотрите также опцию `immediate_commit` для инструкций по отключению кэша
и о том, как из этого извлечь выгоду.
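One way to satisfy "the cache disable command is run every time before starting Vitastor OSD" is a systemd drop-in (a sketch; the `vitastor-osd@.service` unit name and device path are assumptions):

```
# /etc/systemd/system/vitastor-osd@.service.d/disable-cache.conf
[Service]
ExecStartPre=/sbin/hdparm -W 0 /dev/disk/by-partuuid/XXX
```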
- name: disable_meta_fsync
type: bool
default: false
info: |
Same as disable_data_fsync, but for the metadata device. If the metadata
device is not set or if the data device is used for the metadata the option
is ignored and disable_data_fsync value is used instead of it.
info_ru: |
То же, что disable_data_fsync, но для устройства метаданных. Если устройство
метаданных не задано или если оно равно устройству данных, значение опции
игнорируется и вместо него используется значение опции disable_data_fsync.
- name: disable_journal_fsync
type: bool
default: false
info: |
Same as disable_data_fsync, but for the journal device. If the journal
device is not set or if the metadata device is used for the journal the
option is ignored and disable_meta_fsync value is used instead of it. If
the same device is used for data, metadata and journal the option is also
ignored and disable_data_fsync value is used instead of it.
info_ru: |
То же, что disable_data_fsync, но для устройства журнала. Если устройство
журнала не задано или если оно равно устройству метаданных, значение опции
игнорируется и вместо него используется значение опции disable_meta_fsync.
Если одно и то же устройство используется и под данные, и под журнал, и под
метаданные - значение опции также игнорируется и вместо него используется
значение опции disable_data_fsync.
- name: disable_device_lock
type: bool
default: false
info: |
Do not lock data, metadata and journal block devices exclusively with
flock(). This is not recommended, but you can use it if you want to run
multiple OSDs on a single device at different offsets, without using
partitions.
info_ru: |
Не блокировать устройства данных, метаданных и журнала от открытия их
другими OSD с помощью flock(). Так делать не рекомендуется, но теоретически
вы можете это использовать, чтобы запускать несколько OSD на одном
устройстве с разными смещениями и без использования разделов.
- name: disk_alignment
type: int
default: 4096
info: |
Required physical disk write alignment. Most current SSD and HDD drives
use 4 KB physical sectors even if they report 512 byte logical sector
size, so 4 KB is a good default setting.
Note, however, that physical sector size also affects WA, because with block
devices it's impossible to write anything smaller than a block. So, when
Vitastor has to write a single metadata entry that's only about 32 bytes in
size, it actually has to write the whole 4 KB sector.
Because of this it can actually be beneficial to use SSDs which work well
with 512 byte sectors and use 512 byte disk_alignment, journal_block_size
and meta_block_size. But the only SSD that may fit into this category is
Intel Optane (probably, not tested yet).
Clients don't need to be aware of disk_alignment, so it's not required to
put a modified value into etcd key /vitastor/config/global.
info_ru: |
Требуемое выравнивание записи на физические диски. Почти все современные
SSD и HDD диски используют 4 КБ физические секторы, даже если показывают
логический размер сектора 512 байт, поэтому 4 КБ - хорошее значение по
умолчанию.
Однако стоит понимать, что физический размер сектора тоже влияет на
избыточную запись (WA), потому что ничего меньше блока (сектора) на блочное
устройство записать невозможно. Таким образом, когда Vitastor-у нужно
записать на диск всего лишь одну 32-байтную запись метаданных, фактически
приходится перезаписывать 4 КБ сектор целиком.
Поэтому, на самом деле, может быть выгодно найти SSD, хорошо работающие с
меньшими, 512-байтными, блоками и использовать 512-байтные disk_alignment,
journal_block_size и meta_block_size. Однако единственные SSD, которые
теоретически могут попасть в эту категорию - это Intel Optane (но и это
пока не проверялось автором).
Клиентам не обязательно знать про disk_alignment, так что помещать значение
этого параметра в etcd в /vitastor/config/global не нужно.

docs/params/monitor.yml (new file)

@@ -0,0 +1,65 @@
- name: etcd_mon_ttl
type: sec
min: 10
default: 30
info: Monitor etcd lease refresh interval in seconds
info_ru: Интервал обновления etcd резервации (lease) монитором
- name: etcd_mon_timeout
type: ms
default: 1000
info: etcd request timeout used by monitor
info_ru: Таймаут выполнения запросов к etcd от монитора
- name: etcd_mon_retries
type: int
default: 5
info: Maximum number of attempts for one monitor etcd request
info_ru: Максимальное число попыток выполнения запросов к etcd монитором
- name: mon_change_timeout
type: ms
min: 100
default: 1000
info: Optimistic retry interval for monitor etcd modification requests
info_ru: Время повтора при коллизиях при запросах модификации в etcd, производимых монитором
- name: mon_stats_timeout
type: ms
min: 100
default: 1000
info: |
Interval for monitor to wait before updating aggregated statistics in
etcd after receiving OSD statistics updates
info_ru: |
Интервал, который монитор ожидает при изменении статистики по отдельным
OSD перед обновлением агрегированной статистики в etcd
- name: osd_out_time
type: sec
default: 600
info: |
Time after which a failed OSD is removed from the data distribution.
I.e. time which the monitor waits before attempting to restore data
redundancy using other OSDs.
info_ru: |
Время, через которое отключенный OSD исключается из распределения данных.
То есть, время, которое монитор ожидает перед попыткой переместить данные
на другие OSD и таким образом восстановить избыточность хранения.
- name: placement_levels
type: json
default: '`{"host":100,"osd":101}`'
info: |
Levels for the placement tree. You can define arbitrary tree levels by
defining them in this parameter. The configuration parameter value should
contain a JSON object with level names as keys and integer priorities as
values. Smaller priority means higher level in tree. For example,
"datacenter" should have smaller priority than "osd". "host" and "osd"
levels are always predefined and can't be removed. If one of them is not
present in the configuration, then it is defined with the default priority
(100 for "host", 101 for "osd").
info_ru: |
Определения уровней для дерева размещения OSD. Вы можете определять
произвольные уровни, помещая их в данный параметр конфигурации. Значение
параметра должно содержать JSON-объект, ключи которого будут являться
названиями уровней, а значения - целочисленными приоритетами. Меньшие
приоритеты соответствуют верхним уровням дерева. Например, уровень
"датацентр" должен иметь меньший приоритет, чем "OSD". Уровни с названиями
"host" и "osd" являются предопределёнными и не могут быть удалены. Если
один из них отсутствует в конфигурации, он доопределяется с приоритетом по
умолчанию (100 для уровня "host", 101 для "osd").
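For example, a "datacenter" level above hosts could be added like this (a sketch: it assumes, like the other cluster-wide options above, that the monitor reads this from /vitastor/config/global, and the endpoint is a placeholder):

```
etcdctl --endpoints=http://10.0.115.10:2379 put /vitastor/config/global \
  '{"placement_levels":{"datacenter":90,"host":100,"osd":101}}'
```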

docs/params/network.yml (new file)

@@ -0,0 +1,225 @@
- name: tcp_header_buffer_size
type: int
default: 65536
info: |
Size of the buffer used to read data using an additional copy. Vitastor
packet headers are 128 bytes, payload is always at least 4 KB, so it is
usually beneficial to try to read multiple packets at once even though
it requires copying the data an additional time. The rest of each packet
is received without an additional copy. You can try to play with this
parameter and see how it affects random iops and linear bandwidth if you
want.
info_ru: |
Размер буфера для чтения данных с дополнительным копированием. Пакеты
Vitastor содержат 128-байтные заголовки, за которыми следуют данные размером
от 4 КБ и для мелких операций ввода-вывода обычно выгодно за 1 вызов читать
сразу несколько пакетов, даже не смотря на то, что это требует лишний раз
скопировать данные. Часть каждого пакета за пределами значения данного
параметра читается без дополнительного копирования. Вы можете попробовать
поменять этот параметр и посмотреть, как он влияет на производительность
случайного и линейного доступа.
- name: use_sync_send_recv
type: bool
default: false
info: |
If true, synchronous send/recv syscalls are used instead of io_uring for
socket communication. Useless for OSDs because they require io_uring anyway,
but may be required for clients with old kernel versions.
info_ru: |
Если установлено в истину, то вместо io_uring для передачи данных по сети
будут использоваться обычные синхронные системные вызовы send/recv. Для OSD
это бессмысленно, так как OSD в любом случае нуждается в io_uring, но, в
принципе, это может применяться для клиентов со старыми версиями ядра.
- name: use_rdma
type: bool
default: true
info: |
Try to use RDMA for communication if it's available. Disable if you don't
want Vitastor to use RDMA. RDMA increases the performance, but TCP-only
clients can still talk to an RDMA-enabled cluster, so you don't need to
make sure that all clients support RDMA when enabling it.
info_ru: |
Пытаться использовать RDMA для связи при наличии доступных устройств.
Отключите, если вы не хотите, чтобы Vitastor использовал RDMA.
RDMA улучшает производительность, но кластер с включённым RDMA остаётся
доступным и для клиентов, работающих только по TCP, так что включение RDMA
не требует поддержки RDMA всеми клиентами.
- name: rdma_device
type: string
info: |
RDMA device name to use for Vitastor OSD communications (for example,
"rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
to work. For example, Mellanox ConnectX-3 and older adapters don't have
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
root to list available RDMA devices and their features.
info_ru: |
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
параметры и возможности.
- name: rdma_port_num
type: int
default: 1
info: |
RDMA device port number to use. Only for devices that have more than 1 port.
See `phys_port_cnt` in `ibv_devinfo -v` output to determine how many ports
your device has.
info_ru: |
Номер порта RDMA-устройства, который следует использовать. Имеет смысл
только для устройств, у которых более 1 порта. Чтобы узнать, сколько портов
у вашего адаптера, посмотрите `phys_port_cnt` в выводе команды
`ibv_devinfo -v`.
- name: rdma_gid_index
type: int
default: 0
info: |
Global address identifier index of the RDMA device to use. Different GID
indexes may correspond to different protocols like RoCEv1, RoCEv2 and iWARP.
Search for "GID" in `ibv_devinfo -v` output to determine which GID index
you need.
**IMPORTANT:** If you want to use RoCEv2 (as recommended) then the correct
rdma_gid_index is usually 1 (IPv6) or 3 (IPv4).
info_ru: |
Номер глобального идентификатора адреса RDMA-устройства, который следует
использовать. Разным gid_index могут соответствовать разные протоколы связи:
RoCEv1, RoCEv2, iWARP. Чтобы понять, какой нужен вам - смотрите строчки со
словом "GID" в выводе команды `ibv_devinfo -v`.
**ВАЖНО:** Если вы хотите использовать RoCEv2 (как мы и рекомендуем), то
правильный rdma_gid_index, как правило, 1 (IPv6) или 3 (IPv4).
- name: rdma_mtu
type: int
default: 4096
info: |
RDMA Path MTU to use. Must be 1024, 2048 or 4096. There is usually no
sense to change it from the default 4096.
info_ru: |
Максимальная единица передачи (Path MTU) для RDMA. Должно быть равно 1024,
2048 или 4096. Обычно нет смысла менять значение по умолчанию, равное 4096.
- name: rdma_max_sge
type: int
default: 128
info: |
Maximum number of scatter/gather entries to use for RDMA. OSDs negotiate
the actual value when establishing connection anyway, so it's usually not
required to change this parameter.
info_ru: |
Максимальное число записей разделения/сборки (scatter/gather) для RDMA.
OSD в любом случае согласовывают реальное значение при установке соединения,
так что менять этот параметр обычно не нужно.
- name: rdma_max_msg
type: int
default: 1048576
info: Maximum size of a single RDMA send or receive operation in bytes.
info_ru: Максимальный размер одной RDMA-операции отправки или приёма.
- name: rdma_max_recv
type: int
default: 8
info: |
Maximum number of parallel RDMA receive operations. Note that this many
receive buffers, each `rdma_max_msg` bytes in size, are allocated for each client,
so this setting actually affects memory usage. This is because RDMA receive
operations are (sadly) still not zero-copy in Vitastor. It may be fixed in
later versions.
info_ru: |
Максимальное число параллельных RDMA-операций получения данных. Следует
иметь в виду, что данное число буферов размером `rdma_max_msg` выделяется
для каждого подключённого клиентского соединения, так что данная настройка
влияет на потребление памяти. Это так потому, что RDMA-приём данных в
Vitastor, увы, всё равно не является zero-copy, т.е. всё равно 1 раз
копирует данные в памяти. Данная особенность, возможно, будет исправлена в
более новых версиях Vitastor.
- name: peer_connect_interval
type: sec
min: 1
default: 5
info: Interval before attempting to reconnect to an unavailable OSD.
info_ru: Время ожидания перед повторной попыткой соединиться с недоступным OSD.
- name: peer_connect_timeout
type: sec
min: 1
default: 5
info: Timeout for OSD connection attempts.
info_ru: Максимальное время ожидания попытки соединения с OSD.
- name: osd_idle_timeout
type: sec
min: 1
default: 5
info: |
OSD connection inactivity time after which clients and other OSDs send
keepalive requests to check state of the connection.
info_ru: |
Время неактивности соединения с OSD, после которого клиенты или другие OSD
посылают запрос проверки состояния соединения.
- name: osd_ping_timeout
type: sec
min: 1
default: 5
info: |
Maximum time to wait for OSD keepalive responses. If an OSD doesn't respond
within this time, the connection to it is dropped and a reconnection attempt
is scheduled.
info_ru: |
Максимальное время ожидания ответа на запрос проверки состояния соединения.
Если OSD не отвечает за это время, соединение отключается и производится
повторная попытка соединения.
- name: up_wait_retry_interval
type: ms
min: 50
default: 500
info: |
OSDs respond to clients with a special error code when they receive I/O
requests for a PG that's not synchronized and started. This parameter sets
the time for the clients to wait before re-attempting such I/O requests.
info_ru: |
Когда OSD получают от клиентов запросы ввода-вывода, относящиеся к не
поднятым на данный момент на них PG, либо к PG в процессе синхронизации,
они отвечают клиентам специальным кодом ошибки, означающим, что клиент
должен некоторое время подождать перед повторением запроса. Именно это время
ожидания задаёт данный параметр.
- name: max_etcd_attempts
type: int
default: 5
info: |
Maximum number of attempts for etcd requests which can't be retried
indefinitely.
info_ru: |
Максимальное число попыток выполнения запросов к etcd для тех запросов,
которые нельзя повторять бесконечно.
- name: etcd_quick_timeout
type: ms
default: 1000
info: |
Timeout for etcd requests which should complete quickly, like lease refresh.
info_ru: |
Максимальное время выполнения запросов к etcd, которые должны завершаться
быстро, таких, как обновление резервации (lease).
- name: etcd_slow_timeout
type: ms
default: 5000
info: Timeout for etcd requests which are allowed to wait for some time.
info_ru: |
Максимальное время выполнения запросов к etcd, для которых не обязательно
гарантировать быстрое выполнение.
- name: etcd_keepalive_timeout
type: sec
default: max(30, etcd_report_interval*2)
info: |
Timeout for etcd connection HTTP Keep-Alive. Should be higher than
etcd_report_interval to guarantee that keepalive actually works.
info_ru: |
Таймаут для HTTP Keep-Alive в соединениях к etcd. Должен быть больше, чем
etcd_report_interval, чтобы keepalive гарантированно работал.
- name: etcd_ws_keepalive_timeout
type: sec
default: 30
info: |
Interval at which the etcd websocket connection is pinged, required to keep
the connection alive and detect disconnections quickly.
info_ru: |
Интервал проверки живости вебсокет-подключений к etcd.
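
For a rough sense of what the two RDMA buffer parameters above cost in
memory, here is a minimal sketch (plain Node.js arithmetic; the values are
the documented defaults of rdma_max_msg and rdma_max_recv):

// Sketch: per-client memory for RDMA receive buffers, using the
// rdma_max_msg and rdma_max_recv defaults documented above.
const rdma_max_msg = 1048576; // 1 MiB per receive buffer
const rdma_max_recv = 8;      // parallel receive operations
const per_client = rdma_max_recv * rdma_max_msg;
console.log((per_client/1048576)+' MiB per connected client'); // 8 MiB

So with the defaults, 100 connected clients would pin roughly 800 MiB of
receive buffers on the OSD side.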

docs/params/osd.yml Normal file

@@ -0,0 +1,341 @@
- name: etcd_report_interval
type: sec
default: 5
info: |
Interval at which OSDs report their state to etcd. Affects OSD lease time
and thus the failover speed. Lease time is equal to this parameter value
plus max_etcd_attempts * etcd_quick_timeout because it should be guaranteed
that every OSD always refreshes its lease in time.
info_ru: |
Интервал, с которым OSD обновляет своё состояние в etcd. Значение параметра
влияет на время резервации (lease) OSD и поэтому на скорость переключения
при падении OSD. Время lease равняется значению этого параметра плюс
max_etcd_attempts * etcd_quick_timeout.
- name: run_primary
type: bool
default: true
info: |
Start primary OSD logic on this OSD. As of now, it can be turned off only
for debugging purposes. It would be possible to implement an additional mode
for the monitor to separate primary and secondary OSDs, but it's unclear why
anyone would need it, so it's not implemented.
info_ru: |
Запускать логику первичного OSD на данном OSD. На данный момент отключать
эту опцию может иметь смысл только в целях отладки. В теории, можно
реализовать дополнительный режим для монитора, который позволит отделять
первичные OSD от вторичных, но пока не понятно, зачем это может кому-то
понадобиться, поэтому это не реализовано.
- name: osd_network
type: string or array of strings
type_ru: строка или массив строк
info: |
Network mask of the network (IPv4 or IPv6) to use for OSDs. Note that
although it's possible to specify multiple networks here, this does not
mean that OSDs will create multiple listening sockets - they'll only
pick the first matching address of an UP + RUNNING interface. Separate
networks for cluster and client connections are also not implemented, but
they are mostly useless anyway, so it's not a big deal.
info_ru: |
Маска подсети (IPv4 или IPv6) для использования для соединений с OSD.
Имейте в виду, что хотя сейчас и можно передать в этот параметр несколько
подсетей, это не означает, что OSD будут создавать несколько слушающих
сокетов - они лишь будут выбирать адрес первого поднятого (состояние UP +
RUNNING), подходящий под заданную маску. Также не реализовано разделение
кластерной и публичной сетей OSD. Правда, от него обычно всё равно довольно
мало толку, так что особенной проблемы в этом нет.
- name: bind_address
type: string
default: "0.0.0.0"
info: |
Instead of the network mask, you can also set the OSD listen address
explicitly using this parameter. May be useful if you want to start OSDs
on interfaces that are not UP + RUNNING.
info_ru: |
Этим параметром можно явным образом задать адрес, на котором будет ожидать
соединений OSD (вместо использования маски подсети). Может быть полезно,
например, чтобы запускать OSD на неподнятых интерфейсах (не UP + RUNNING).
- name: bind_port
type: int
info: |
By default, OSDs pick random ports to use for incoming connections
automatically. With this option you can set a specific port for a specific
OSD by hand.
info_ru: |
По умолчанию OSD сами выбирают случайные порты для входящих подключений.
С помощью данной опции вы можете задать порт для отдельного OSD вручную.
- name: autosync_interval
type: sec
default: 5
info: |
Time interval at which automatic fsyncs/flushes are issued by each OSD when
the immediate_commit mode is disabled. fsyncs are required because without
them OSDs quickly fill their journals, become unable to clear them and
stall. This option also limits the amount of recent uncommitted changes
which OSDs may lose in case of a power outage when clients don't issue
fsyncs at all.
info_ru: |
Временной интервал отправки автоматических fsync-ов (операций очистки кэша)
каждым OSD для случая, когда режим immediate_commit отключён. fsync-и нужны
OSD, чтобы успевать очищать журнал - без них OSD быстро заполняют журналы и
перестают обрабатывать операции записи. Также эта опция ограничивает объём
недавних незафиксированных изменений, которые OSD могут терять при
отключении питания, если клиенты вообще не отправляют fsync.
- name: autosync_writes
type: int
default: 128
info: |
Same as autosync_interval, but sets the maximum number of uncommitted write
operations before issuing an fsync operation internally.
info_ru: |
Аналогично autosync_interval, но задаёт не временной интервал, а
максимальное количество незафиксированных операций записи перед
принудительной отправкой fsync-а.
- name: recovery_queue_depth
type: int
default: 4
info: |
Maximum recovery operations per one primary OSD at any given moment of time.
Currently it's the only parameter available to tune the speed of recovery
and rebalancing, but more tuning parameters are planned.
info_ru: |
Максимальное число операций восстановления на одном первичном OSD в любой
момент времени. На данный момент единственный параметр, который можно менять
для ускорения или замедления восстановления и перебалансировки данных, но
в планах реализация других параметров.
- name: recovery_sync_batch
type: int
default: 16
info: Maximum number of recovery operations before issuing an additional fsync.
info_ru: Максимальное число операций восстановления перед дополнительным fsync.
- name: readonly
type: bool
default: false
info: |
Read-only mode. If this is enabled, an OSD will never issue any writes to
the underlying device. This may be useful for recovery purposes.
info_ru: |
Режим "только чтение". Если включить этот режим, OSD не будет писать ничего
на диск. Может быть полезно в целях восстановления.
- name: no_recovery
type: bool
default: false
info: |
Disable automatic background recovery of objects. Note that it doesn't
affect implicit recovery of objects happening during writes - a write is
always made to a full set of at least pg_minsize OSDs.
info_ru: |
Отключить автоматическое фоновое восстановление объектов. Обратите внимание,
что эта опция не отключает восстановление объектов, происходящее при
записи - запись всегда производится в полный набор из как минимум pg_minsize
OSD.
- name: no_rebalance
type: bool
default: false
info: |
Disable background movement of data between different OSDs. Disabling it
means that PGs in the `has_misplaced` state will be left in it indefinitely.
info_ru: |
Отключить фоновое перемещение объектов между разными OSD. Отключение
означает, что PG, находящиеся в состоянии `has_misplaced`, будут оставлены
в нём на неопределённый срок.
- name: print_stats_interval
type: sec
default: 3
info: |
Time interval at which OSDs print simple human-readable operation
statistics on stdout.
info_ru: |
Временной интервал, с которым OSD печатают простую человекочитаемую
статистику выполнения операций в стандартный вывод.
- name: slow_log_interval
type: sec
default: 10
info: |
Time interval at which OSDs dump slow or stuck operations on stdout, if
there are any. It's also the time after which an operation is considered
"slow".
info_ru: |
Временной интервал, с которым OSD выводят в стандартный вывод список
медленных или зависших операций, если таковые имеются. Также время, при
превышении которого операция считается "медленной".
- name: max_write_iodepth
type: int
default: 128
info: |
Parallel client write operation limit per one OSD. Operations that exceed
this limit are pushed to a temporary queue instead of being executed
immediately.
info_ru: |
Максимальное число одновременных клиентских операций записи на один OSD.
Операции, превышающие этот лимит, не исполняются сразу, а сохраняются во
временной очереди.
- name: min_flusher_count
type: int
default: 1
info: |
A flusher is a micro-thread (coroutine) that moves data from the journal
to the data area of the device. The number of flushers is auto-tuned between
the minimum and maximum; this parameter sets the minimum.
info_ru: |
Flusher - это микро-поток (корутина), которая копирует данные из журнала в
основную область устройства данных. Их число настраивается динамически между
минимальным и максимальным значением. Этот параметр задаёт минимальное число.
- name: max_flusher_count
type: int
default: 256
info: |
Maximum number of journal flushers (see above min_flusher_count).
info_ru: |
Максимальное число микро-потоков очистки журнала (см. выше min_flusher_count).
- name: inmemory_metadata
type: bool
default: true
info: |
This parameter makes Vitastor always keep the metadata area of the block
device in memory. It's required for good performance because it allows
Vitastor to avoid additional read-modify-write cycles during metadata
modifications. Metadata area size is currently roughly 224 MB per 1 TB of
data. You can turn this parameter off to reduce memory usage by that value,
but it will hurt performance. This restriction is likely to be removed in
the future along with the upgrade of the metadata storage scheme.
info_ru: |
Данный параметр заставляет Vitastor всегда держать область метаданных диска
в памяти. Это нужно, чтобы избегать дополнительных операций чтения с диска
при записи. Размер области метаданных на данный момент составляет примерно
224 МБ на 1 ТБ данных. При отключении потребление памяти снизится примерно
на эту величину, но при этом также снизится и производительность. В будущем,
после обновления схемы хранения метаданных, это ограничение, скорее всего,
будет ликвидировано.
- name: inmemory_journal
type: bool
default: true
info: |
This parameter makes Vitastor always keep the journal area of the block
device in memory. Turning it off will, again, reduce memory usage, but
hurt performance, because flusher coroutines will have to read data back
from the disk before copying it into the main area. The memory usage benefit
is typically very small because a 16-32 MB journal is sufficient for SSD
OSDs. However, in theory you may want to turn it off for hybrid (HDD+SSD)
OSDs with large journals on fast devices.
info_ru: |
Данный параметр заставляет Vitastor всегда держать в памяти журналы OSD.
Отключение параметра, опять же, снижает потребление памяти, но ухудшает
производительность, так как для копирования данных из журнала в основную
область устройства OSD будут вынуждены читать их обратно с диска. Выигрыш
по памяти при этом обычно крайне низкий, так как для SSD OSD обычно
достаточно 16- или 32-мегабайтного журнала. Однако в теории отключение
параметра может оказаться полезным для гибридных OSD (HDD+SSD) с большими
журналами, расположенными на быстром по сравнению с HDD устройстве.
- name: journal_sector_buffer_count
type: int
default: 32
info: |
Maximum number of buffers that can be used for writing journal metadata
blocks. The only situation when you should increase it to a larger value
is when you enable journal_no_same_sector_overwrites. In this case set
it to, for example, 1024.
info_ru: |
Максимальное число буферов, разрешённых для использования под записываемые
в журнал блоки метаданных. Единственная ситуация, в которой этот параметр
нужно менять - это если вы включаете journal_no_same_sector_overwrites. В
этом случае установите данный параметр, например, в 1024.
- name: journal_no_same_sector_overwrites
type: bool
default: false
info: |
Enable this option for SSDs like Intel D3-S4510 and D3-S4610 which REALLY
don't like when a program overwrites the same sector multiple times in a
row and slow down significantly (from 25000+ iops to ~3000 iops). When
this option is set, Vitastor always moves to the next sector of the
journal after writing it instead of possibly overwriting it a second time.
info_ru: |
Включайте данную опцию для SSD вроде Intel D3-S4510 и D3-S4610, которые
ОЧЕНЬ не любят, когда ПО перезаписывает один и тот же сектор несколько раз
подряд. Такие SSD при многократной перезаписи одного и того же сектора
сильно замедляются - условно, с 25000 и более iops до 3000 iops. Когда
данная опция установлена, Vitastor всегда переходит к следующему сектору
журнала после записи вместо потенциально повторной перезаписи того же
самого сектора.
- name: throttle_small_writes
type: bool
default: false
info: |
Enable soft throttling of small journaled writes. Useful for hybrid OSDs
with fast journal/metadata devices and slow data devices. The idea is that
small writes complete very quickly because they're first written to the
journal device, but moving them to the main device is slow. So if an OSD
allows clients to issue a lot of small writes, it performs very well
for several seconds, then the journal fills up and performance drops
to almost zero. Throttling is meant to prevent this problem by
artificially slowing quick writes down based on the amount of free space
in the journal. With throttling, the performance of small writes decreases
smoothly instead of dropping abruptly at the moment the journal fills up.
info_ru: |
Разрешить мягкое ограничение скорости журналируемой записи. Полезно для
гибридных OSD с быстрыми устройствами метаданных и медленными устройствами
данных. Идея заключается в том, что мелкие записи в этой ситуации могут
завершаться очень быстро, так как они изначально записываются на быстрое
журнальное устройство (SSD). Но перемещать их потом на основное медленное
устройство долго. Поэтому если OSD быстро примет от клиентов очень много
мелких операций записи, он быстро заполнит свой журнал, после чего
производительность записи резко упадёт практически до нуля. Ограничение
скорости записи призвано решить эту проблему с помощью искусственного
замедления операций записи на основании объёма свободного места в журнале.
Когда эта опция включена, производительность мелких операций записи будет
снижаться плавно, а не резко в момент окончательного заполнения журнала.
- name: throttle_target_iops
type: int
default: 100
info: |
Target maximum number of throttled operations per second under the condition
of a full journal. Set it to the approximate random write iops of your data
devices (HDDs).
info_ru: |
Расчётное максимальное число ограничиваемых операций в секунду при условии
отсутствия свободного места в журнале. Устанавливайте приблизительно равным
максимальной производительности случайной записи ваших устройств данных
(HDD) в операциях в секунду.
- name: throttle_target_mbs
type: int
default: 100
info: |
Target maximum bandwidth of throttled operations, in MB/s, under the
condition of a full journal. Set it to the approximate linear write
performance of your data devices (HDDs).
info_ru: |
Расчётный максимальный размер в МБ/с ограничиваемых операций в секунду при
условии отсутствия свободного места в журнале. Устанавливайте приблизительно
равным максимальной производительности линейной записи ваших устройств
данных (HDD).
- name: throttle_target_parallelism
type: int
default: 1
info: |
Target maximum parallelism of throttled operations under the condition of
a full journal. Set it to the approximate internal parallelism of your data
devices (1 for HDDs, 4-8 for SSDs).
info_ru: |
Расчётный максимальный параллелизм ограничиваемых операций в секунду при
условии отсутствия свободного места в журнале. Устанавливайте приблизительно
равным внутреннему параллелизму ваших устройств данных (1 для HDD, 4-8
для SSD).
- name: throttle_threshold_us
type: us
default: 50
info: |
Minimal computed delay to be applied to throttled operations. Usually
doesn't need to be changed.
info_ru: |
Минимальная применимая к ограничиваемым операциям задержка. Обычно не
требует изменений.
- name: osd_memlock
type: bool
default: false
info: >
Lock all OSD memory with mlockall() to prevent it from being paged out to
swap. Requires a sufficient ulimit -l (max locked memory).
info_ru: >
Блокировать всю память OSD с помощью mlockall, чтобы запретить её выгрузку
в пространство подкачки. Требует достаточного значения ulimit -l (лимита
заблокированной памяти).
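
As a worked example of the failover math described under etcd_report_interval
above, here is a minimal sketch using the documented defaults (plain Node.js
arithmetic):

// Sketch: OSD etcd lease time with the defaults documented above.
const etcd_report_interval = 5;  // seconds
const max_etcd_attempts = 5;
const etcd_quick_timeout = 1000; // milliseconds
const lease = etcd_report_interval + max_etcd_attempts*etcd_quick_timeout/1000;
console.log('lease time: '+lease+' seconds');
// Prints 10: an unresponsive OSD's lease expires (and failover can begin)
// roughly 10 seconds after its last successful report.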


Submodule json11 updated: 55363fc265...52a3af664f


Submodule libnfs added at 5a991e1fcb

View File

@@ -50,7 +50,7 @@ async function lp_solve(text)
return { score, vars };
}
async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1, round_robin = false })
async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1, ordered = false })
{
if (!pg_count || !osd_tree)
{
@@ -92,7 +92,7 @@ async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize =
console.log(lp);
throw new Error('Problem is infeasible or unbounded - is it a bug?');
}
const int_pgs = make_int_pgs(lp_result.vars, pg_count, round_robin);
const int_pgs = make_int_pgs(lp_result.vars, pg_count, ordered);
const eff = pg_list_space_efficiency(int_pgs, all_weights, pg_minsize, parity_space);
const res = {
score: lp_result.score,
@@ -140,20 +140,20 @@ function make_int_pgs(weights, pg_count, round_robin)
return int_pgs;
}
function calc_intersect_weights(pg_size, pg_count, prev_weights, all_pgs)
function calc_intersect_weights(old_pg_size, pg_size, pg_count, prev_weights, all_pgs, ordered)
{
const move_weights = {};
if ((1 << pg_size) < pg_count)
if ((1 << old_pg_size) < pg_count)
{
const intersect = {};
for (const pg_name in prev_weights)
{
const pg = pg_name.substr(3).split(/_/);
for (let omit = 1; omit < (1 << pg_size); omit++)
for (let omit = 1; omit < (1 << old_pg_size); omit++)
{
let pg_omit = [ ...pg ];
let intersect_count = pg_size;
for (let i = 0; i < pg_size; i++)
let intersect_count = old_pg_size;
for (let i = 0; i < old_pg_size; i++)
{
if (omit & (1 << i))
{
@@ -161,6 +161,8 @@ function calc_intersect_weights(pg_size, pg_count, prev_weights, all_pgs)
intersect_count--;
}
}
if (!ordered)
pg_omit = pg_omit.filter(n => n).sort();
pg_omit = pg_omit.join(':');
intersect[pg_omit] = Math.max(intersect[pg_omit] || 0, intersect_count);
}
@@ -174,10 +176,10 @@ function calc_intersect_weights(pg_size, pg_count, prev_weights, all_pgs)
for (let i = 0; i < pg_size; i++)
{
if (omit & (1 << i))
{
pg_omit[i] = '';
}
}
if (!ordered)
pg_omit = pg_omit.filter(n => n).sort();
pg_omit = pg_omit.join(':');
max_int = Math.max(max_int, intersect[pg_omit] || 0);
}
@@ -186,15 +188,18 @@ function calc_intersect_weights(pg_size, pg_count, prev_weights, all_pgs)
}
else
{
const prev_pg_hashed = Object.keys(prev_weights).map(pg_name => pg_name.substr(3).split(/_/).reduce((a, c) => { a[c] = 1; return a; }, {}));
const prev_pg_hashed = Object.keys(prev_weights).map(pg_name => pg_name
.substr(3).split(/_/).reduce((a, c, i) => { a[c] = i+1; return a; }, {}));
for (const pg of all_pgs)
{
if (!prev_weights['pg_'+pg.join('_')])
{
let max_int = 0;
for (const prev_hash in prev_pg_hashed)
for (const prev_hash of prev_pg_hashed)
{
const intersect_count = pg.reduce((a, osd) => a + (prev_hash[osd] ? 1 : 0), 0);
const intersect_count = ordered
? pg.reduce((a, osd, i) => a + (prev_hash[osd] == 1+i ? 1 : 0), 0)
: pg.reduce((a, osd, i) => a + (prev_hash[osd] ? 1 : 0), 0);
if (max_int < intersect_count)
{
max_int = intersect_count;
@@ -243,7 +248,7 @@ function add_valid_previous(osd_tree, prev_weights, all_pgs)
}
// Try to minimize data movement
async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1 })
async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1, ordered = false })
{
if (!osd_tree)
{
@@ -266,9 +271,13 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
prev_pg_per_osd[osd].push([ pg_name, (i >= pg_minsize ? parity_space : 1) ]);
}
}
const old_pg_size = prev_int_pgs[0].length;
// Get all combinations
let all_pgs = random_combinations(osd_tree, pg_size, max_combinations, parity_space > 1);
add_valid_previous(osd_tree, prev_weights, all_pgs);
if (old_pg_size == pg_size)
{
add_valid_previous(osd_tree, prev_weights, all_pgs);
}
all_pgs = Object.values(all_pgs);
const pg_per_osd = {};
for (const pg of all_pgs)
@@ -282,7 +291,7 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
}
}
// Penalize PGs based on their similarity to old PGs
const move_weights = calc_intersect_weights(pg_size, pg_count, prev_weights, all_pgs);
const move_weights = calc_intersect_weights(old_pg_size, pg_size, pg_count, prev_weights, all_pgs, ordered);
// Calculate total weight - old PG weights
const all_pg_names = all_pgs.map(pg => 'pg_'+pg.join('_'));
const all_pgs_hash = all_pg_names.reduce((a, c) => { a[c] = true; return a; }, {});
@@ -373,11 +382,35 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
{
differs++;
}
for (let j = 0; j < pg_size; j++)
}
if (ordered)
{
for (let i = 0; i < pg_count; i++)
{
if (new_pgs[i][j] != prev_int_pgs[i][j])
for (let j = 0; j < pg_size; j++)
{
osd_differs++;
if (new_pgs[i][j] != prev_int_pgs[i][j])
{
osd_differs++;
}
}
}
}
else
{
for (let i = 0; i < pg_count; i++)
{
const old_map = prev_int_pgs[i].reduce((a, c) => { a[c] = (a[c]|0) + 1; return a; }, {});
for (let j = 0; j < pg_size; j++)
{
if ((0|old_map[new_pgs[i][j]]) > 0)
{
old_map[new_pgs[i][j]]--;
}
else
{
osd_differs++;
}
}
}
}
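
The unordered branch above compares PGs as multisets of OSDs rather than
position-by-position. A standalone sketch of the same counting idea (a
hypothetical helper, not part of lp-optimizer.js):

// Sketch: count how many OSDs of new_pg are not present in old_pg,
// treating PGs as multisets (the unordered case above).
function count_osd_differs(old_pg, new_pg)
{
    const old_map = old_pg.reduce((a, c) => { a[c] = (a[c]|0) + 1; return a; }, {});
    let osd_differs = 0;
    for (const osd of new_pg)
    {
        if ((0|old_map[osd]) > 0)
            old_map[osd]--;
        else
            osd_differs++;
    }
    return osd_differs;
}
console.log(count_osd_differs([ 1, 2, 3 ], [ 3, 2, 1 ])); // 0 - same OSDs, reordered
console.log(count_osd_differs([ 1, 2, 3 ], [ 1, 2, 4 ])); // 1 - one OSD replaced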

mon/make-osd-hybrid.js Executable file

@@ -0,0 +1,414 @@
#!/usr/bin/nodejs
// systemd unit generator for hybrid (HDD+SSD) vitastor OSDs
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1
// USAGE: nodejs make-osd-hybrid.js [--disable_ssd_cache 0] [--disable_hdd_cache 0] /dev/sda /dev/sdb /dev/sdc /dev/sdd ...
// I.e. - just pass all HDDs and SSDs mixed, the script will decide where
// to put journals on its own
const fs = require('fs');
const fsp = fs.promises;
const child_process = require('child_process');
const options = {
debug: 1,
journal_size: 1024*1024*1024,
min_meta_size: 1024*1024*1024,
object_size: 1024*1024,
bitmap_granularity: 4096,
device_block_size: 4096,
disable_ssd_cache: 1,
disable_hdd_cache: 1,
};
run().catch(console.error);
async function run()
{
const device_list = parse_options();
await system_or_die("mkdir -p /var/log/vitastor; chown vitastor /var/log/vitastor");
// Collect devices
const all_devices = await collect_devices(device_list);
const ssds = all_devices.filter(d => d.ssd);
const hdds = all_devices.filter(d => !d.ssd);
// Collect existing OSD units
const osd_units = await collect_osd_units();
// Count assigned HDD journals and unallocated space for each SSD
await check_journal_count(ssds, osd_units);
// Create new OSDs
await create_new_hybrid_osds(hdds, ssds, osd_units);
process.exit(0);
}
function parse_options()
{
const devices = [];
const opt = {};
for (let i = 2; i < process.argv.length; i++)
{
const arg = process.argv[i];
if (arg == '--help' || arg == '-h')
{
opt.help = true;
break;
}
else if (arg.substr(0, 2) == '--')
opt[arg.substr(2)] = process.argv[++i];
else
devices.push(arg);
}
if (opt.help || !devices.length)
{
console.log(
'Prepare hybrid (HDD+SSD) Vitastor OSDs\n'+
'(c) Vitaliy Filippov, 2019+, license: VNPL-1.1\n\n'+
'USAGE: nodejs make-osd-hybrid.js [OPTIONS] /dev/sda /dev/sdb /dev/sdc ...\n'+
'Just pass all your SSDs and HDDs in any order, the script will distribute OSDs for you.\n\n'+
'OPTIONS (with defaults):\n'+
Object.keys(options).map(k => ` --${k} ${options[k]}`).join('\n')
);
process.exit(0);
}
for (const k in opt)
options[k] = opt[k];
return devices;
}
// Collect devices
async function collect_devices(devices_to_check)
{
const devices = [];
for (const dev of devices_to_check)
{
if (dev.substr(0, 5) != '/dev/')
{
console.log(`${dev} does not start with /dev/, skipping`);
continue;
}
if (!await file_exists('/sys/block/'+dev.substr(5)))
{
console.log(`${dev} is a partition, skipping`);
continue;
}
// Check if the device is an SSD
const rot = '/sys/block/'+dev.substr(5)+'/queue/rotational';
if (!await file_exists(rot))
{
console.log(`${dev} does not have ${rot} to check whether it's an SSD, skipping`);
continue;
}
const ssd = !parseInt(await fsp.readFile(rot, { encoding: 'utf-8' }));
// Check if the device has partition table
let [ has_partition_table, parts ] = await system(`sfdisk --dump ${dev} --json`);
if (has_partition_table != 0)
{
// Check if the device has any data
const [ has_data, out ] = await system(`blkid ${dev}`);
if (has_data == 0)
{
console.log(`${dev} contains data, skipping:\n ${out.trim().replace(/\n/g, '\n ')}`);
continue;
}
}
parts = parts ? JSON.parse(parts).partitiontable : null;
if (parts && parts.label != 'gpt')
{
console.log(`${dev} contains "${parts.label}" partition table, only GPT is supported, skipping`);
continue;
}
devices.push({
path: dev,
ssd,
parts,
});
}
return devices;
}
// Collect existing OSD units
async function collect_osd_units()
{
const units = [];
for (const unit of (await system("ls /etc/systemd/system/vitastor-osd*.service"))[1].trim().split('\n'))
{
if (!unit)
{
continue;
}
let cmd = /^ExecStart\s*=\s*(([^\n]*\\\n)*[^\n]*)/.exec(await fsp.readFile(unit, { encoding: 'utf-8' }));
if (!cmd)
{
console.log('ExecStart= not found in '+unit+', skipping');
continue;
}
let kv = {}, key;
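// Tokenize the ExecStart command line: strip the bash -c wrapper and
// the log redirection suffix, unfold line continuations, then walk the
// remaining (possibly quoted) tokens, pairing each --option with the
// value that follows it.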
cmd = cmd[1].replace(/^bash\s+-c\s+'/, '')
.replace(/>>\s*\S+\s*2>\s*&1\s*'$/, '')
.replace(/\s*\\\n\s*/g, ' ')
.replace(/([^\s']+)|'([^']+)'/g, (m, m1, m2) =>
{
m1 = m1||m2;
if (key == null)
{
if (m1.substr(0, 2) != '--')
{
console.log('Strange command line in '+unit+', stopping');
process.exit(1);
}
key = m1.substr(2);
}
else
{
kv[key] = m1;
key = null;
}
});
units.push(kv);
}
return units;
}
// Count assigned HDD journals and unallocated space for each SSD
async function check_journal_count(ssds, osd_units)
{
const units_by_journal = osd_units.reduce((a, c) =>
{
if (c.journal_device)
a[c.journal_device] = c;
return a;
}, {});
for (const dev of ssds)
{
dev.journals = 0;
if (dev.parts)
{
for (const part of dev.parts.partitions)
{
if (part.uuid && units_by_journal['/dev/disk/by-partuuid/'+part.uuid.toLowerCase()])
{
dev.journals++;
}
}
dev.free = free_from_parttable(dev.parts);
}
else
{
dev.free = parseInt(await system_or_die("blockdev --getsize64 "+dev.path));
}
}
}
async function create_new_hybrid_osds(hdds, ssds, osd_units)
{
const units_by_disk = osd_units.reduce((a, c) => { a[c.data_device] = c; return a; }, {});
for (const dev of hdds)
{
if (!dev.parts)
{
// HDD is not partitioned yet, create a single partition
// + is the "default value" for sfdisk
await system_or_die('sfdisk '+dev.path, 'label: gpt\n\n+ +\n');
dev.parts = JSON.parse(await system_or_die('sfdisk --dump '+dev.path+' --json')).partitiontable;
}
if (dev.parts.partitions.length != 1)
{
console.log(dev.path+' has more than 1 partition, skipping');
}
else if ((dev.parts.partitions[0].start + dev.parts.partitions[0].size) != (1 + dev.parts.lastlba))
{
console.log(dev.path+'1 is not a whole-disk partition, skipping');
}
else if (!dev.parts.partitions[0].uuid)
{
console.log(dev.parts.partitions[0].node+' does not have UUID. Please repartition '+dev.path+' with GPT');
}
else if (!units_by_disk['/dev/disk/by-partuuid/'+dev.parts.partitions[0].uuid.toLowerCase()])
{
await create_hybrid_osd(dev, ssds);
}
}
}
async function create_hybrid_osd(dev, ssds)
{
// Create a new OSD
// Calculate metadata size
const data_device = '/dev/disk/by-partuuid/'+dev.parts.partitions[0].uuid.toLowerCase();
const data_size = dev.parts.partitions[0].size * dev.parts.sectorsize;
const meta_entry_size = 24 + 2*options.object_size/options.bitmap_granularity/8;
const entries_per_block = Math.floor(options.device_block_size / meta_entry_size);
const object_count = Math.floor(data_size / options.object_size);
let meta_size = Math.ceil(1 + object_count / entries_per_block) * options.device_block_size;
// Leave some extra space for future metadata formats and round metadata area size to multiples of 1 MB
meta_size = 2*meta_size;
meta_size = Math.ceil(meta_size/1024/1024) * 1024*1024;
if (meta_size < options.min_meta_size)
meta_size = options.min_meta_size;
let journal_size = Math.ceil(options.journal_size/1024/1024) * 1024*1024;
// Pick an SSD for journal, balancing the number of journals across SSDs
let selected_ssd;
for (const ssd of ssds)
if (ssd.free >= (meta_size+journal_size) && (!selected_ssd || selected_ssd.journals > ssd.journals))
selected_ssd = ssd;
if (!selected_ssd)
{
console.error('Could not find free space for SSD journal and metadata for '+dev.path);
process.exit(1);
}
// Allocate an OSD number
const osd_num = (await system_or_die("vitastor-cli alloc-osd")).trim();
if (!osd_num)
{
console.error('Failed to run vitastor-cli alloc-osd');
process.exit(1);
}
console.log('Creating OSD '+osd_num+' on '+dev.path+' (HDD) with journal and metadata on '+selected_ssd.path+' (SSD)');
// Add two partitions: journal and metadata
const new_parts = await add_partitions(selected_ssd, [ journal_size, meta_size ]);
selected_ssd.journals++;
const journal_device = '/dev/disk/by-partuuid/'+new_parts[0].uuid.toLowerCase();
const meta_device = '/dev/disk/by-partuuid/'+new_parts[1].uuid.toLowerCase();
// Wait until the device symlinks appear
while (!await file_exists(journal_device))
{
await new Promise(ok => setTimeout(ok, 100));
}
while (!await file_exists(meta_device))
{
await new Promise(ok => setTimeout(ok, 100));
}
// Zero out metadata and journal
await system_or_die("dd if=/dev/zero of="+journal_device+" bs=1M count="+(journal_size/1024/1024)+" oflag=direct");
await system_or_die("dd if=/dev/zero of="+meta_device+" bs=1M count="+(meta_size/1024/1024)+" oflag=direct");
// Create unit file for the OSD
const has_scsi_cache_type = options.disable_ssd_cache &&
(await system("ls /sys/block/"+selected_ssd.path.substr(5)+"/device/scsi_disk/*/cache_type"))[0] == 0;
const write_through = options.disable_ssd_cache && (
has_scsi_cache_type || selected_ssd.path.substr(5, 4) == 'nvme'
&& (await system_or_die("cat /sys/block/"+selected_ssd.path.substr(5)+"/queue/write_cache")).trim() == "write through");
await fsp.writeFile('/etc/systemd/system/vitastor-osd'+osd_num+'.service',
`[Unit]
Description=Vitastor object storage daemon osd.${osd_num}
After=network-online.target local-fs.target time-sync.target
Wants=network-online.target local-fs.target time-sync.target
PartOf=vitastor.target
[Service]
LimitNOFILE=1048576
LimitNPROC=1048576
LimitMEMLOCK=infinity
ExecStart=bash -c '/usr/bin/vitastor-osd \\
--osd_num ${osd_num} ${write_through
? "--disable_meta_fsync 1 --disable_journal_fsync 1 --immediate_commit "+(options.disable_hdd_cache ? "all" : "small")
: ""} \\
--throttle_small_writes 1 \\
--disk_alignment ${options.device_block_size} \\
--journal_block_size ${options.device_block_size} \\
--meta_block_size ${options.device_block_size} \\
--journal_no_same_sector_overwrites true \\
--journal_sector_buffer_count 1024 \\
--block_size ${options.object_size} \\
--data_device ${data_device} \\
--journal_device ${journal_device} \\
--meta_device ${meta_device} >>/var/log/vitastor/osd${osd_num}.log 2>&1'
WorkingDirectory=/
ExecStartPre=+chown vitastor:vitastor ${data_device}
ExecStartPre=+chown vitastor:vitastor ${journal_device}
ExecStartPre=+chown vitastor:vitastor ${meta_device}${
has_scsi_cache_type
? "\nExecStartPre=+bash -c 'D=$$$(readlink "+journal_device+"); echo write through > $$$(dirname /sys/block/*/$$\${D##*/})/device/scsi_disk/*/cache_type'"
: ""}${
options.disable_hdd_cache
? "\nExecStartPre=+bash -c 'D=$$$(readlink "+data_device+"); echo write through > $$$(dirname /sys/block/*/$$\${D##*/})/device/scsi_disk/*/cache_type'"
: ""}
User=vitastor
PrivateTmp=false
TasksMax=infinity
Restart=always
StartLimitInterval=0
RestartSec=10
[Install]
WantedBy=vitastor.target
`);
await system_or_die("systemctl enable vitastor-osd"+osd_num);
}
async function add_partitions(dev, sizes)
{
let script = 'label: gpt\n\n';
if (dev.parts)
{
// Old partitions
for (const part of dev.parts.partitions)
{
script += part.node+': '+Object.keys(part).map(k => k == 'node' ? '' : k+'='+part[k]).filter(k => k).join(', ')+'\n';
}
}
// New partitions
for (const size of sizes)
{
script += '+ '+Math.ceil(size/1024)+'KiB\n';
}
await system_or_die('sfdisk '+dev.path, script);
// Get new partition table and find the new partition
const newpt = JSON.parse(await system_or_die('sfdisk --dump '+dev.path+' --json')).partitiontable;
const old_nodes = dev.parts ? dev.parts.partitions.reduce((a, c) => { a[c.uuid] = true; return a; }, {}) : {};
const new_nodes = newpt.partitions.filter(part => !old_nodes[part.uuid]);
if (new_nodes.length != sizes.length)
{
console.error('Failed to partition '+dev.path+': new partitions not found in table');
process.exit(1);
}
dev.parts = newpt;
dev.free = free_from_parttable(newpt);
return new_nodes;
}
function free_from_parttable(pt)
{
let free = pt.lastlba + 1 - pt.firstlba;
for (const part of pt.partitions)
{
free -= part.size;
}
free *= pt.sectorsize;
return free;
}
async function system_or_die(cmd, input = '')
{
let [ exitcode, stdout, stderr ] = await system(cmd, input);
if (exitcode != 0)
{
console.error(cmd+' failed: '+stderr);
process.exit(1);
}
return stdout;
}
async function system(cmd, input = '')
{
if (options.debug)
{
process.stderr.write('+ '+cmd+(input ? " <<EOF\n"+input.replace(/\s*$/, '\n')+"EOF" : '')+'\n');
}
const cp = child_process.spawn(cmd, { shell: true });
let stdout = '', stderr = '', finish_cb;
cp.stdout.on('data', buf => stdout += buf.toString());
cp.stderr.on('data', buf => stderr += buf.toString());
cp.on('exit', () => finish_cb && finish_cb());
cp.stdin.write(input);
cp.stdin.end();
if (cp.exitCode == null)
{
await new Promise(ok => finish_cb = ok);
}
return [ cp.exitCode, stdout, stderr ];
}
async function file_exists(filename)
{
return new Promise((ok, no) => fs.access(filename, fs.constants.R_OK, err => ok(!err)));
}
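
The metadata sizing in create_hybrid_osd() above can be checked by hand. A
sketch repeating the same formulas with the script's default options and a
hypothetical 4 TB data partition:

// Sketch: metadata area size for a 4 TB data device, using the same
// formulas as create_hybrid_osd() with the default options above.
const object_size = 1024*1024, bitmap_granularity = 4096, device_block_size = 4096;
const min_meta_size = 1024*1024*1024;
const data_size = 4 * 1000*1000*1000*1000; // hypothetical 4 TB HDD
const meta_entry_size = 24 + 2*object_size/bitmap_granularity/8;            // 88 bytes
const entries_per_block = Math.floor(device_block_size / meta_entry_size);  // 46
const object_count = Math.floor(data_size / object_size);
let meta_size = Math.ceil(1 + object_count / entries_per_block) * device_block_size;
meta_size = 2*meta_size; // extra space for future metadata formats
meta_size = Math.ceil(meta_size/1024/1024) * 1024*1024;
if (meta_size < min_meta_size)
    meta_size = min_meta_size; // the 1 GiB floor wins for this disk
console.log(meta_size/1024/1024, 'MiB'); // 1024 MiB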

View File

@@ -25,6 +25,10 @@ OPT=$(vitastor-cli simple-offsets --format options $DEV | tr '\n' ' ')
META=$(vitastor-cli simple-offsets --format json $DEV | jq .data_offset)
dd if=/dev/zero of=$DEV bs=1048576 count=$(((META+1048575)/1048576)) oflag=direct
mkdir -p /var/log/vitastor
id vitastor &>/dev/null || useradd vitastor
chown vitastor /var/log/vitastor
cat >/etc/systemd/system/vitastor-osd$OSD_NUM.service <<EOF
[Unit]
Description=Vitastor object storage daemon osd.$OSD_NUM
@@ -36,14 +40,14 @@ PartOf=vitastor.target
LimitNOFILE=1048576
LimitNPROC=1048576
LimitMEMLOCK=infinity
ExecStart=/usr/bin/vitastor-osd \\
ExecStart=bash -c '/usr/bin/vitastor-osd \\
--osd_num $OSD_NUM \\
--disable_data_fsync 1 \\
--immediate_commit all \\
--disk_alignment 4096 --journal_block_size 4096 --meta_block_size 4096 \\
--journal_no_same_sector_overwrites true \\
--journal_sector_buffer_count 1024 \\
$OPT
$OPT >>/var/log/vitastor/osd$OSD_NUM.log 2>&1'
WorkingDirectory=/
ExecStartPre=+chown vitastor:vitastor $DEV
User=vitastor
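
The count=$(((META+1048575)/1048576)) expression above is plain integer
ceil-division: it zeroes just enough whole 1 MiB blocks to cover the area
up to the data offset. The same in Node.js, with a hypothetical offset:

// Sketch: integer ceil-division, as in the shell arithmetic above.
const META = 3000000; // hypothetical data offset in bytes
console.log(Math.floor((META + 1048575) / 1048576)); // 3 blocks of 1 MiB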

View File

@@ -31,6 +31,7 @@ const etcd_allow = new RegExp('^'+[
'osd/inodestats/[1-9]\\d*',
'osd/space/[1-9]\\d*',
'mon/master',
'mon/member/[a-f0-9]+',
'pg/state/[1-9]\\d*/[1-9]\\d*',
'pg/stats/[1-9]\\d*/[1-9]\\d*',
'pg/history/[1-9]\\d*/[1-9]\\d*',
@@ -83,8 +84,13 @@ const etcd_tree = {
osd_idle_timeout: 5, // seconds. min: 1
osd_ping_timeout: 5, // seconds. min: 1
up_wait_retry_interval: 500, // ms. min: 50
max_etcd_attempts: 5,
etcd_quick_timeout: 1000, // ms
etcd_slow_timeout: 5000, // ms
etcd_keepalive_timeout: 30, // seconds, default is max(30, etcd_report_interval*2)
etcd_ws_keepalive_interval: 30, // seconds
// osd
etcd_report_interval: 5,
etcd_report_interval: 5, // seconds
run_primary: true,
osd_network: null, // "192.168.7.0/24" or an array of masks
bind_address: "0.0.0.0",
@@ -99,6 +105,7 @@ const etcd_tree = {
no_rebalance: false,
print_stats_interval: 3,
slow_log_interval: 10,
osd_memlock: false,
// blockstore - fixed in superblock
block_size,
disk_alignment,
@@ -125,6 +132,11 @@ const etcd_tree = {
inmemory_journal,
journal_sector_buffer_count,
journal_no_same_sector_overwrites,
throttle_small_writes: false,
throttle_target_iops: 100,
throttle_target_mbs: 100,
throttle_target_parallelism: 1,
throttle_threshold_us: 50,
}, */
global: {},
/* node_placement: {
@@ -226,7 +238,10 @@ const etcd_tree = {
},
mon: {
master: {
/* ip: [ string ], */
/* ip: [ string ], id: uint64_t */
},
standby: {
/* <uint64_t>: { ip: [ string ] }, */
},
},
pg: {
@@ -257,7 +272,7 @@ const etcd_tree = {
<pg_id>: {
osd_sets: osd_num_t[][],
all_peers: osd_num_t[],
epoch: uint32_t,
epoch: uint64_t,
},
}, */
},
@@ -341,6 +356,9 @@ class Mon
this.etcd_start_timeout = (config.etcd_start_timeout || 5) * 1000;
this.state = JSON.parse(JSON.stringify(this.constructor.etcd_tree));
this.signals_set = false;
this.ws = null;
this.ws_alive = false;
this.ws_keepalive_timer = null;
this.on_stop_cb = () => this.on_stop(0).catch(console.error);
}
@@ -383,7 +401,7 @@ class Mon
for (const pool_id in this.state.config.pools)
{
if (!this.state.pool.stats[pool_id] ||
!this.state.pool.stats[pool_id].pg_real_size)
!Number(this.state.pool.stats[pool_id].pg_real_size))
{
// Generate missing data in etcd
this.state.config.pgs.hash = null;
@@ -461,8 +479,20 @@ class Mon
restart_watcher(cur_addr)
{
if (this.ws)
{
this.ws.close();
this.ws = null;
}
if (this.ws_keepalive_timer)
{
clearInterval(this.ws_keepalive_timer);
this.ws_keepalive_timer = null;
}
if (this.selected_etcd_url == cur_addr)
{
this.selected_etcd_url = null;
}
this.start_watcher(this.config.etcd_mon_retries).catch(this.die);
}
@@ -482,6 +512,7 @@ class Mon
const timer_id = setTimeout(() =>
{
this.ws.close();
this.ws = null;
ok(false);
}, this.config.etcd_mon_timeout);
this.ws = new WebSocket(base+'/watch');
@@ -510,6 +541,20 @@ class Mon
this.die('Failed to open etcd watch websocket');
}
const cur_addr = this.selected_etcd_url;
this.ws_alive = true;
this.ws_keepalive_timer = setInterval(() =>
{
if (this.ws_alive)
{
this.ws_alive = false;
this.ws.send(JSON.stringify({ progress_request: {} }));
}
else
{
console.log('etcd websocket timed out, restarting it');
this.restart_watcher(cur_addr);
}
}, (Number(this.config.etcd_keepalive_interval) || 30)*1000);
this.ws.on('error', () => this.restart_watcher(cur_addr));
this.ws.send(JSON.stringify({
create_request: {
@@ -522,6 +567,7 @@ class Mon
}));
this.ws.on('message', (msg) =>
{
this.ws_alive = true;
let data;
try
{
@@ -558,7 +604,7 @@ class Mon
console.log('Revision '+data.result.header.revision+' events: ');
}
this.etcd_watch_revision = BigInt(data.result.header.revision)+BigInt(1);
for (const e of data.result.events)
for (const e of data.result.events||[])
{
this.parse_kv(e.kv);
const key = e.kv.key.substr(this.etcd_prefix.length);
@@ -631,11 +677,25 @@ class Mon
}, this.etcd_start_timeout, 0);
}
get_mon_state()
{
return { ip: this.local_ips(), hostname: os.hostname() };
}
async get_lease()
{
const max_ttl = this.config.etcd_mon_ttl + this.config.etcd_mon_timeout/1000*this.config.etcd_mon_retries;
const res = await this.etcd_call('/lease/grant', { TTL: max_ttl }, this.config.etcd_mon_timeout, -1);
// Get lease
let res = await this.etcd_call('/lease/grant', { TTL: max_ttl }, this.config.etcd_mon_timeout, -1);
this.etcd_lease_id = res.ID;
// Register in /mon/member, just for the information
const state = this.get_mon_state();
res = await this.etcd_call('/kv/put', {
key: b64(this.etcd_prefix+'/mon/member/'+this.etcd_lease_id),
value: b64(JSON.stringify(state)),
lease: ''+this.etcd_lease_id
}, this.etcd_start_timeout, 0);
// Set refresh timer
this.lease_timer = setInterval(async () =>
{
const res = await this.etcd_call('/lease/keepalive', { ID: this.etcd_lease_id }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
@@ -661,7 +721,7 @@ class Mon
async become_master()
{
const state = { ip: this.local_ips() };
const state = { ...this.get_mon_state(), id: ''+this.etcd_lease_id };
while (1)
{
const res = await this.etcd_call('/kv/txn', {
@@ -1097,7 +1157,7 @@ class Mon
pg_size: pool_cfg.pg_size,
pg_minsize: pool_cfg.pg_minsize,
max_combinations: pool_cfg.max_osd_combinations,
round_robin: pool_cfg.scheme != 'replicated',
ordered: pool_cfg.scheme != 'replicated',
};
let optimize_result;
if (old_pg_count > 0)
@@ -1120,10 +1180,6 @@ class Mon
{
pg.push(0);
}
while (pg.length > pool_cfg.pg_size)
{
pg.pop();
}
}
if (!this.state.config.pgs.hash)
{
@@ -1159,8 +1215,8 @@ class Mon
this.state.pool.stats[pool_id] = {
used_raw_tb: (this.state.pool.stats[pool_id]||{}).used_raw_tb || 0,
total_raw_tb: optimize_result.space,
pg_real_size: pg_effsize,
raw_to_usable: pg_effsize / (pool_cfg.scheme === 'replicated'
pg_real_size: pg_effsize || pool_cfg.pg_size,
raw_to_usable: (pg_effsize || pool_cfg.pg_size) / (pool_cfg.scheme === 'replicated'
? 1 : (pool_cfg.pg_size - (pool_cfg.parity_chunks||0))),
space_efficiency: optimize_result.space/(optimize_result.total_space||1),
};
@@ -1307,21 +1363,30 @@ class Mon
const tm = prev_stats ? BigInt(timestamp - prev_stats.timestamp) : 0;
for (const op in op_stats)
{
op_stats[op].bps = prev_stats ? (op_stats[op].bytes - prev_stats.op_stats[op].bytes) * 1000n / tm : 0;
op_stats[op].iops = prev_stats ? (op_stats[op].count - prev_stats.op_stats[op].count) * 1000n / tm : 0;
op_stats[op].lat = prev_stats ? (op_stats[op].usec - prev_stats.op_stats[op].usec)
/ ((op_stats[op].count - prev_stats.op_stats[op].count) || 1n) : 0;
if (prev_stats && prev_stats.op_stats && prev_stats.op_stats[op])
{
op_stats[op].bps = (op_stats[op].bytes - prev_stats.op_stats[op].bytes) * 1000n / tm;
op_stats[op].iops = (op_stats[op].count - prev_stats.op_stats[op].count) * 1000n / tm;
op_stats[op].lat = (op_stats[op].usec - prev_stats.op_stats[op].usec)
/ ((op_stats[op].count - prev_stats.op_stats[op].count) || 1n);
}
}
for (const op in subop_stats)
{
subop_stats[op].iops = prev_stats ? (subop_stats[op].count - prev_stats.subop_stats[op].count) * 1000n / tm : 0;
subop_stats[op].lat = prev_stats ? (subop_stats[op].usec - prev_stats.subop_stats[op].usec)
/ ((subop_stats[op].count - prev_stats.subop_stats[op].count) || 1n) : 0;
if (prev_stats && prev_stats.subop_stats && prev_stats.subop_stats[op])
{
subop_stats[op].iops = (subop_stats[op].count - prev_stats.subop_stats[op].count) * 1000n / tm;
subop_stats[op].lat = (subop_stats[op].usec - prev_stats.subop_stats[op].usec)
/ ((subop_stats[op].count - prev_stats.subop_stats[op].count) || 1n);
}
}
for (const op in recovery_stats)
{
recovery_stats[op].bps = prev_stats ? (recovery_stats[op].bytes - prev_stats.recovery_stats[op].bytes) * 1000n / tm : 0;
recovery_stats[op].iops = prev_stats ? (recovery_stats[op].count - prev_stats.recovery_stats[op].count) * 1000n / tm : 0;
if (prev_stats && prev_stats.recovery_stats && prev_stats.recovery_stats[op])
{
recovery_stats[op].bps = (recovery_stats[op].bytes - prev_stats.recovery_stats[op].bytes) * 1000n / tm;
recovery_stats[op].iops = (recovery_stats[op].count - prev_stats.recovery_stats[op].count) * 1000n / tm;
}
}
return { op_stats, subop_stats, recovery_stats };
}
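
The guarded delta computations above only produce rates when a previous
sample for the same counter exists. A condensed sketch of the pattern
(hypothetical sample values; BigInt division as in mon.js):

// Sketch: derive bps/iops from two BigInt counter snapshots,
// skipping the calculation when no previous sample exists.
function rate(cur, prev, tm_ms)
{
    if (!prev)
        return { bps: 0, iops: 0 };
    return {
        bps: (cur.bytes - prev.bytes) * 1000n / tm_ms,
        iops: (cur.count - prev.count) * 1000n / tm_ms,
    };
}
const prev = { bytes: 1000000n, count: 250n };
const cur = { bytes: 3000000n, count: 750n };
console.log(rate(cur, prev, 2000n)); // { bps: 1000000n, iops: 250n }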

View File

@@ -49,7 +49,8 @@ async function run()
}
options.journal_offset = Math.ceil(options.journal_offset/options.device_block_size)*options.device_block_size;
const meta_offset = options.journal_offset + Math.ceil(options.journal_size/options.device_block_size)*options.device_block_size;
const entries_per_block = Math.floor(options.device_block_size / (24 + 2*options.object_size/options.bitmap_granularity/8));
const meta_entry_size = 24 + 2*options.object_size/options.bitmap_granularity/8;
const entries_per_block = Math.floor(options.device_block_size / meta_entry_size);
const object_count = Math.floor((device_size-meta_offset)/options.object_size);
const meta_size = Math.ceil(1 + object_count / entries_per_block) * options.device_block_size;
const data_offset = meta_offset + meta_size;

View File

@@ -5,21 +5,45 @@ const LPOptimizer = require('./lp-optimizer.js');
async function run()
{
const osd_tree = { a: { 1: 1 }, b: { 2: 1 }, c: { 3: 1 } };
const osd_tree = {
100: { 1: 1 },
200: { 2: 1 },
300: { 3: 1 },
};
let res;
console.log('16 PGs, size=3');
res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 3, pg_count: 16 });
res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 3, pg_count: 16, ordered: false });
LPOptimizer.print_change_stats(res, false);
console.log('\nReduce PG size to 2');
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs.map(pg => pg.slice(0, 2)), osd_tree, pg_size: 2 });
assert(res.space == 3, 'Initial distribution');
console.log('\nChange size to 2');
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2, ordered: false });
LPOptimizer.print_change_stats(res, false);
assert(res.space >= 3*14/16 && res.osd_differs == 0, 'Redistribution');
console.log('\nRemove OSD 3');
delete osd_tree['c'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2 });
const no3_tree = { ...osd_tree };
delete no3_tree['300'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: no3_tree, pg_size: 2, ordered: false });
LPOptimizer.print_change_stats(res, false);
assert(res.space == 2, 'Redistribution after OSD removal');
console.log('\n16 PGs, size=3, ordered');
res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 3, pg_count: 16, ordered: true });
LPOptimizer.print_change_stats(res, false);
assert(res.space == 3, 'Initial distribution');
console.log('\nChange size to 2, ordered');
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2, ordered: true });
LPOptimizer.print_change_stats(res, false);
assert(res.space >= 3*14/16 && res.osd_differs < 8, 'Redistribution');
}
function assert(cond, txt)
{
if (!cond)
{
throw new Error((txt||'test')+' failed');
}
}
run().catch(console.error);

View File

@@ -45,30 +45,45 @@ async function run()
console.log('Empty tree:');
let res = await LPOptimizer.optimize_initial({ osd_tree: cur_tree, pg_size: 3, pg_count: 256 });
LPOptimizer.print_change_stats(res, false);
assert(res.space == 0);
console.log('\nAdding 1st failure domain:');
cur_tree['dom1'] = osd_tree['dom1'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
LPOptimizer.print_change_stats(res, false);
assert(res.space == 12 && res.total_space == 12);
console.log('\nAdding 2nd failure domain:');
cur_tree['dom2'] = osd_tree['dom2'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
LPOptimizer.print_change_stats(res, false);
assert(res.space == 24 && res.total_space == 24);
console.log('\nAdding 3rd failure domain:');
cur_tree['dom3'] = osd_tree['dom3'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
LPOptimizer.print_change_stats(res, false);
assert(res.space == 36 && res.total_space == 36);
console.log('\nRemoving 3rd failure domain:');
delete cur_tree['dom3'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
LPOptimizer.print_change_stats(res, false);
assert(res.space == 24 && res.total_space == 24);
console.log('\nRemoving 2nd failure domain:');
delete cur_tree['dom2'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
LPOptimizer.print_change_stats(res, false);
assert(res.space == 12 && res.total_space == 12);
console.log('\nRemoving 1st failure domain:');
delete cur_tree['dom1'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
LPOptimizer.print_change_stats(res, false);
assert(res.space == 0);
}
function assert(cond, txt)
{
if (!cond)
{
throw new Error((txt||'test')+' failed');
}
}
run().catch(console.error);

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VERSION = '0.6.10'
VERSION = '0.6.16'
LOG = logging.getLogger(__name__)
@@ -355,7 +355,25 @@ class VitastorDriver(driver.CloneableImageVD,
def revert_to_snapshot(self, context, volume, snapshot):
"""Revert a volume to a given snapshot."""
# FIXME Delete the image, then recreate it from the snapshot
vol_name = utils.convert_str(snapshot.volume_name)
snap_name = utils.convert_str(snapshot.name)
# Delete the image and recreate it from the snapshot
args = [ 'vitastor-cli', 'rm', vol_name, *(self._vitastor_args()) ]
try:
self._execute(*args)
except processutils.ProcessExecutionError as exc:
LOG.error("Failed to delete image "+vol_name+": "+exc)
raise exception.VolumeBackendAPIException(data = exc.stderr)
args = [
'vitastor-cli', 'create', '--parent', vol_name+'@'+snap_name,
vol_name, *(self._vitastor_args())
]
try:
self._execute(*args)
except processutils.ProcessExecutionError as exc:
LOG.error("Failed to recreate image "+vol_name+" from "+vol_name+"@"+snap_name+": "+exc)
raise exception.VolumeBackendAPIException(data = exc.stderr)
def delete_snapshot(self, snapshot):
"""Deletes a snapshot."""
@@ -363,24 +381,15 @@ class VitastorDriver(driver.CloneableImageVD,
vol_name = utils.convert_str(snapshot.volume_name)
snap_name = utils.convert_str(snapshot.name)
# Find the snapshot
resp = self._etcd_txn({ 'success': [
{ 'request_range': { 'key': 'index/image/'+vol_name+'@'+snap_name } },
] })
if len(resp['responses'][0]['kvs']) == 0:
raise exception.SnapshotNotFound(snapshot_id = snap_name)
inode_id = int(resp['responses'][0]['kvs'][0]['value']['id'])
pool_id = int(resp['responses'][0]['kvs'][0]['value']['pool_id'])
parents = {}
parents[(pool_id << 48) | (inode_id & 0xffffffffffff)] = True
# Check if there are child volumes
children = self._child_count(parents)
if children > 0:
raise exception.SnapshotIsBusy(snapshot_name = snap_name)
# FIXME: We can't delete snapshots because we can't merge layers yet
raise exception.VolumeBackendAPIException(data = 'Snapshot delete (layer merge) is not implemented yet')
args = [
'vitastor-cli', 'rm', vol_name+'@'+snap_name,
*(self._vitastor_args())
]
try:
self._execute(*args)
except processutils.ProcessExecutionError as exc:
LOG.error("Failed to remove snapshot "+vol_name+'@'+snap_name+": "+exc)
raise exception.VolumeBackendAPIException(data = exc.stderr)
def _child_count(self, parents):
children = 0
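
With this change, revert_to_snapshot above reduces to the same two commands
an operator could run by hand (image and snapshot names here are
hypothetical):

vitastor-cli rm testimg
vitastor-cli create --parent testimg@snap1 testimg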

View File

@@ -25,4 +25,4 @@ rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-0.6.10/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.10$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-0.6.16/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.16$(rpm --eval '%dist').tar.gz *

View File

@@ -34,7 +34,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.6.10.el7.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.6.16.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.6.10
Version: 0.6.16
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.6.10.el7.tar.gz
Source0: vitastor-0.6.16.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -33,7 +33,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.6.10.el8.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.6.16.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.6.10
Version: 0.6.16
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.6.10.el8.tar.gz
Source0: vitastor-0.6.16.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -15,7 +15,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVERSION="0.6.10")
add_definitions(-DVERSION="0.6.16")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
if (${WITH_ASAN})
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
@@ -111,12 +111,11 @@ if (${WITH_FIO})
# libfio_vitastor_sec.so
add_library(fio_vitastor_sec SHARED
fio_sec_osd.cpp
rw_blocking.cpp
addr_util.cpp
)
target_link_libraries(fio_vitastor_sec
vitastor_common
tcmalloc_minimal
${LIBURING_LIBRARIES}
${IBVERBS_LIBRARIES}
)
endif (${WITH_FIO})
@@ -153,10 +152,71 @@ target_link_libraries(vitastor-nbd
vitastor_client
)
# vitastor-nfs
add_executable(vitastor-nfs
nfs_proxy.cpp
nfs_conn.cpp
nfs_portmap.cpp
sha256.c
../libnfs/lib/init.c
../libnfs/lib/pdu.c
../libnfs/lib/libnfs-zdr.c
../libnfs/lib/socket.c
../libnfs/portmap/libnfs-raw-portmap.c
../libnfs/nfs/libnfs-raw-nfs.c
../libnfs/mount/libnfs-raw-mount.c
)
set_source_files_properties(
../libnfs/nfs/libnfs-raw-nfs.c
PROPERTIES
COMPILE_FLAGS "-Wno-unused-but-set-variable"
)
# Simplified static configuration
# The other option is to build patched libnfs packages until all distros get my fixes
target_compile_options(vitastor-nfs
PRIVATE
-DHAVE_ARPA_INET_H
-DHAVE_INTTYPES_H
-DHAVE_MEMORY_H
-DHAVE_NETDB_H
-DHAVE_NETINET_IN_H
-DHAVE_NETINET_TCP_H
-DHAVE_NET_IF_H
-DHAVE_POLL_H
-DHAVE_STDINT_H
-DHAVE_STDLIB_H
-DHAVE_STRINGS_H
-DHAVE_STRING_H
-DHAVE_SYS_IOCTL_H
-DHAVE_SYS_SOCKET_H
-DHAVE_SYS_STATVFS_H
-DHAVE_SYS_STAT_H
-DHAVE_SYS_SYSMACROS_H
-DHAVE_SYS_TIME_H
-DHAVE_SYS_TYPES_H
-DHAVE_SYS_VFS_H
-DHAVE_UNISTD_H
-DHAVE_UTIME_H
-DHAVE_SOCKADDR_STORAGE
-DHAVE_STRUCT_STAT_ST_MTIM_TV_NSEC
-D_U_=
)
target_include_directories(vitastor-nfs
PRIVATE
../libnfs/include
../libnfs/include/nfsc
../libnfs/portmap
../libnfs/nfs
../libnfs/mount
)
target_link_libraries(vitastor-nfs
vitastor_client
)
# vitastor-cli
add_executable(vitastor-cli
cli.cpp cli_alloc_osd.cpp cli_simple_offsets.cpp cli_df.cpp
cli_ls.cpp cli_create.cpp cli_modify.cpp cli_flatten.cpp cli_merge.cpp cli_rm.cpp cli_snap_rm.cpp
cli.cpp cli_common.cpp cli_alloc_osd.cpp cli_simple_offsets.cpp cli_status.cpp cli_df.cpp
cli_ls.cpp cli_create.cpp cli_modify.cpp cli_flatten.cpp cli_merge.cpp cli_rm_data.cpp cli_rm.cpp
)
target_link_libraries(vitastor-cli
vitastor_client

View File

@@ -1,4 +1,9 @@
#include <sys/socket.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <sys/types.h>
#include <ifaddrs.h>
#include <string.h>
#include <stdio.h>
@@ -6,7 +11,7 @@
#include "addr_util.h"
bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr *addr)
bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr_storage *addr)
{
if (parse_port)
{
@@ -22,7 +27,7 @@ bool string_to_addr(std::string str, bool parse_port, int default_port, struct s
}
if (inet_pton(AF_INET, str.c_str(), &((struct sockaddr_in*)addr)->sin_addr) == 1)
{
addr->sa_family = AF_INET;
addr->ss_family = AF_INET;
((struct sockaddr_in*)addr)->sin_port = htons(default_port);
return true;
}
@@ -30,31 +35,204 @@ bool string_to_addr(std::string str, bool parse_port, int default_port, struct s
str = str.substr(1, str.length()-2);
if (inet_pton(AF_INET6, str.c_str(), &((struct sockaddr_in6*)addr)->sin6_addr) == 1)
{
addr->sa_family = AF_INET6;
addr->ss_family = AF_INET6;
((struct sockaddr_in6*)addr)->sin6_port = htons(default_port);
return true;
}
return false;
}
std::string addr_to_string(const sockaddr &addr)
std::string addr_to_string(const sockaddr_storage &addr)
{
char peer_str[256];
bool ok = false;
int port;
if (addr.sa_family == AF_INET)
if (addr.ss_family == AF_INET)
{
ok = !!inet_ntop(AF_INET, &((sockaddr_in*)&addr)->sin_addr, peer_str, 256);
port = ntohs(((sockaddr_in*)&addr)->sin_port);
}
else if (addr.sa_family == AF_INET6)
else if (addr.ss_family == AF_INET6)
{
ok = !!inet_ntop(AF_INET6, &((sockaddr_in6*)&addr)->sin6_addr, peer_str, 256);
port = ntohs(((sockaddr_in6*)&addr)->sin6_port);
}
else
throw std::runtime_error("Unknown address family "+std::to_string(addr.sa_family));
throw std::runtime_error("Unknown address family "+std::to_string(addr.ss_family));
if (!ok)
throw std::runtime_error(std::string("inet_ntop: ") + strerror(errno));
return std::string(peer_str)+":"+std::to_string(port);
}
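A minimal usage sketch of the new sockaddr_storage-based pair (hypothetical caller; assumes addr_util.h and <stdio.h>):

sockaddr_storage ss;
// sockaddr_storage is large enough for both AF_INET and AF_INET6 addresses
if (string_to_addr("127.0.0.1:8080", true, 0, &ss))
    printf("%s\n", addr_to_string(ss).c_str()); // prints 127.0.0.1:8080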
static bool cidr_match(const in_addr &addr, const in_addr &net, uint8_t bits)
{
if (bits == 0)
{
// C99 6.5.7 (3): u32 << 32 is undefined behaviour
return true;
}
return !((addr.s_addr ^ net.s_addr) & htonl(0xFFFFFFFFu << (32 - bits)));
}
static bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_t bits)
{
const uint32_t *a = address.s6_addr32;
const uint32_t *n = network.s6_addr32;
int bits_whole, bits_incomplete;
bits_whole = bits >> 5; // number of whole u32
bits_incomplete = bits & 0x1F; // number of bits in incomplete u32
if (bits_whole && memcmp(a, n, bits_whole << 2))
return false;
if (bits_incomplete)
{
uint32_t mask = htonl((0xFFFFFFFFu) << (32 - bits_incomplete));
if ((a[bits_whole] ^ n[bits_whole]) & mask)
return false;
}
return true;
}
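A worked example of the mask math (sketch; these helpers are file-local, so such a call can only appear inside addr_util.cpp):

in_addr a, n;
inet_pton(AF_INET, "10.1.2.3", &a);
inet_pton(AF_INET, "10.0.0.0", &n);
// /8 compares only the first octet, so 10.1.2.3 is inside 10.0.0.0/8
bool in_net = cidr_match(a, n, 8); // true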
struct addr_mask_t
{
sa_family_t family;
in_addr ipv4;
in6_addr ipv6;
uint8_t bits;
};
std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg, bool include_v6)
{
std::vector<addr_mask_t> masks;
for (auto mask: mask_cfg)
{
unsigned bits = 0;
int p = mask.find('/');
if (p != std::string::npos)
{
char null_byte = 0;
if (sscanf(mask.c_str()+p+1, "%u%c", &bits, &null_byte) != 1 || bits > 128)
{
throw std::runtime_error((include_v6 ? "Invalid IP address mask: " : "Invalid IPv4 address mask: ") + mask);
}
mask = mask.substr(0, p);
}
in_addr ipv4;
in6_addr ipv6;
if (inet_pton(AF_INET, mask.c_str(), &ipv4) == 1)
{
if (bits > 32)
{
throw std::runtime_error((include_v6 ? "Invalid IP address mask: " : "Invalid IPv4 address mask: ") + mask);
}
masks.push_back((addr_mask_t){ .family = AF_INET, .ipv4 = ipv4, .bits = (uint8_t)bits });
}
else if (include_v6 && inet_pton(AF_INET6, mask.c_str(), &ipv6) == 1)
{
masks.push_back((addr_mask_t){ .family = AF_INET6, .ipv6 = ipv6, .bits = (uint8_t)bits });
}
else
{
throw std::runtime_error((include_v6 ? "Invalid IP address mask: " : "Invalid IPv4 address mask: ") + mask);
}
}
std::vector<std::string> addresses;
ifaddrs *list, *ifa;
if (getifaddrs(&list) == -1)
{
throw std::runtime_error(std::string("getifaddrs: ") + strerror(errno));
}
for (ifa = list; ifa != NULL; ifa = ifa->ifa_next)
{
if (!ifa->ifa_addr)
{
continue;
}
int family = ifa->ifa_addr->sa_family;
if ((family == AF_INET || (family == AF_INET6 && include_v6)) &&
(ifa->ifa_flags & (IFF_UP | IFF_RUNNING | IFF_LOOPBACK)) == (IFF_UP | IFF_RUNNING))
{
void *addr_ptr;
if (family == AF_INET)
{
addr_ptr = &((sockaddr_in *)ifa->ifa_addr)->sin_addr;
}
else
{
addr_ptr = &((sockaddr_in6 *)ifa->ifa_addr)->sin6_addr;
}
if (masks.size() > 0)
{
int i;
for (i = 0; i < masks.size(); i++)
{
if (masks[i].family == family && (family == AF_INET
? cidr_match(*(in_addr*)addr_ptr, masks[i].ipv4, masks[i].bits)
: cidr6_match(*(in6_addr*)addr_ptr, masks[i].ipv6, masks[i].bits)))
{
break;
}
}
if (i >= masks.size())
{
continue;
}
}
char addr[INET6_ADDRSTRLEN];
if (!inet_ntop(family, addr_ptr, addr, INET6_ADDRSTRLEN))
{
throw std::runtime_error(std::string("inet_ntop: ") + strerror(errno));
}
addresses.push_back(std::string(addr));
}
}
freeifaddrs(list);
return addresses;
}
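A hypothetical usage sketch: list the addresses of all up, non-loopback interfaces matching a mask, with IPv6 included:

for (auto & a: getifaddr_list({ "192.168.0.0/16" }, true))
    printf("local address: %s\n", a.c_str());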
int create_and_bind_socket(std::string bind_address, int bind_port, int listen_backlog, int *listening_port)
{
sockaddr_storage addr;
if (!string_to_addr(bind_address, 0, bind_port, &addr))
{
throw std::runtime_error("bind address "+bind_address+" is not valid");
}
int listen_fd = socket(addr.ss_family, SOCK_STREAM, 0);
if (listen_fd < 0)
{
throw std::runtime_error(std::string("socket: ") + strerror(errno));
}
int enable = 1;
setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
if (bind(listen_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
{
close(listen_fd);
throw std::runtime_error(std::string("bind: ") + strerror(errno));
}
if (listening_port)
{
if (bind_port == 0)
{
socklen_t len = sizeof(addr);
if (getsockname(listen_fd, (sockaddr *)&addr, &len) == -1)
{
close(listen_fd);
throw std::runtime_error(std::string("getsockname: ") + strerror(errno));
}
*listening_port = ntohs(((sockaddr_in*)&addr)->sin_port);
}
else
{
*listening_port = bind_port;
}
}
if (listen(listen_fd, listen_backlog ? listen_backlog : 128) < 0)
{
close(listen_fd);
throw std::runtime_error(std::string("listen: ") + strerror(errno));
}
return listen_fd;
}
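Usage sketch (hypothetical caller): bind to an ephemeral port and read back the number the kernel assigned:

int port = 0;
// bind_port == 0 lets the kernel pick a free port, reported via the out-parameter
int fd = create_and_bind_socket("0.0.0.0", 0, 128, &port);
printf("listening on port %d\n", port);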

@@ -2,6 +2,9 @@
#include <sys/socket.h>
#include <string>
#include <vector>
bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr *addr);
std::string addr_to_string(const sockaddr &addr);
bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr_storage *addr);
std::string addr_to_string(const sockaddr_storage &addr);
std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg = std::vector<std::string>(), bool include_v6 = false);
int create_and_bind_socket(std::string bind_address, int bind_port, int listen_backlog, int *listening_port);

@@ -25,7 +25,7 @@ allocator::allocator(uint64_t blocks)
size = free = blocks;
last_one_mask = (blocks % 64) == 0
? UINT64_MAX
: ((1l << (blocks % 64)) - 1);
: (((uint64_t)1 << (blocks % 64)) - 1);
for (uint64_t i = 0; i < total; i++)
{
mask[i] = 0;
@@ -79,7 +79,7 @@ void allocator::set(uint64_t addr, bool value)
}
if (value)
{
mask[last] = mask[last] | (1l << bit);
mask[last] = mask[last] | ((uint64_t)1 << bit);
if (mask[last] != (!is_last || cur_addr/64 < size/64
? UINT64_MAX : last_one_mask))
{
@@ -88,7 +88,7 @@ void allocator::set(uint64_t addr, bool value)
}
else
{
mask[last] = mask[last] & ~(1l << bit);
mask[last] = mask[last] & ~((uint64_t)1 << bit);
}
is_last = false;
if (p2 > 1)
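The point of the (uint64_t)1 change: the literal 1l is only 32 bits wide on ILP32 targets, so shifting it by 32 or more positions is undefined behaviour. A minimal illustration:

// On a 32-bit build, long is 32 bits, so 1l << 40 is UB;
// casting first makes the shift operate on a 64-bit value:
uint64_t ok = (uint64_t)1 << 40; // well-defined, equals 0x10000000000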

@@ -21,7 +21,7 @@
// Memory alignment for direct I/O (usually 512 bytes)
// All other alignments must be a multiple of this one
#ifndef MEM_ALIGNMENT
#define MEM_ALIGNMENT 512
#define MEM_ALIGNMENT 4096
#endif
// Default block size is 128 KB, current allowed range is 4K - 128M
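Raising MEM_ALIGNMENT to 4096 matches 4K-sector drives; every buffer used for direct I/O must respect it. A sketch of a compliant allocation (assumes <stdlib.h> and <new>):

void *buf = NULL;
// O_DIRECT buffers must be aligned to the device sector size; 4096 also covers 4K-native drives
if (posix_memalign(&buf, MEM_ALIGNMENT, 128*1024) != 0)
    throw std::bad_alloc();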

@@ -185,7 +185,7 @@ void journal_flusher_t::release_trim()
void journal_flusher_t::dump_diagnostics()
{
const char *unflushable_type = "";
obj_ver_id unflushable = { 0 };
obj_ver_id unflushable = {};
// Try to find out if there is a flushable object for information
for (object_id cur_oid: flush_queue)
{
@@ -415,8 +415,11 @@ stop_flusher:
flusher->active_flushers++;
resume_1:
// Find it in clean_db
clean_it = bs->clean_db.find(cur.oid);
old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
{
auto & clean_db = bs->clean_db_shard(cur.oid);
auto clean_it = clean_db.find(cur.oid);
old_clean_loc = (clean_it != clean_db.end() ? clean_it->second.location : UINT64_MAX);
}
// Scan dirty versions of the object
if (!scan_dirty(1))
{
@@ -486,8 +489,8 @@ resume_1:
if (bs->clean_entry_bitmap_size)
{
new_clean_bitmap = (bs->inmemory_meta
? meta_new.buf + meta_new.pos*bs->clean_entry_size + sizeof(clean_disk_entry)
: bs->clean_bitmap + (clean_loc >> bs->block_order)*(2*bs->clean_entry_bitmap_size));
? (uint8_t*)meta_new.buf + meta_new.pos*bs->clean_entry_size + sizeof(clean_disk_entry)
: (uint8_t*)bs->clean_bitmap + (clean_loc >> bs->block_order)*(2*bs->clean_entry_bitmap_size));
if (clean_init_bitmap)
{
memset(new_clean_bitmap, 0, bs->clean_entry_bitmap_size);
@@ -533,7 +536,7 @@ resume_1:
return false;
}
// zero out old metadata entry
memset(meta_old.buf + meta_old.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
memset((uint8_t*)meta_old.buf + meta_old.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
await_sqe(15);
data->iov = (struct iovec){ meta_old.buf, bs->meta_block_size };
data->callback = simple_callback_w;
@@ -544,7 +547,7 @@ resume_1:
}
if (has_delete)
{
clean_disk_entry *new_entry = (clean_disk_entry*)(meta_new.buf + meta_new.pos*bs->clean_entry_size);
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->clean_entry_size);
if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid)
{
printf("Fatal error (metadata corruption or bug): tried to delete metadata entry %lu (%lx:%lx v%lu) while deleting %lx:%lx\n",
@@ -553,11 +556,11 @@ resume_1:
exit(1);
}
// zero out new metadata entry
memset(meta_new.buf + meta_new.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
memset((uint8_t*)meta_new.buf + meta_new.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
}
else
{
clean_disk_entry *new_entry = (clean_disk_entry*)(meta_new.buf + meta_new.pos*bs->clean_entry_size);
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->clean_entry_size);
if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid)
{
printf("Fatal error (metadata corruption or bug): tried to overwrite non-zero metadata entry %lu (%lx:%lx v%lu) with %lx:%lx v%lu\n",
@@ -575,7 +578,7 @@ resume_1:
if (bs->clean_entry_bitmap_size)
{
void *bmp_ptr = bs->clean_entry_bitmap_size > sizeof(void*) ? dirty_end->second.bitmap : &dirty_end->second.bitmap;
memcpy((void*)(new_entry+1) + bs->clean_entry_bitmap_size, bmp_ptr, bs->clean_entry_bitmap_size);
memcpy((uint8_t*)(new_entry+1) + bs->clean_entry_bitmap_size, bmp_ptr, bs->clean_entry_bitmap_size);
}
}
await_sqe(6);
@@ -762,7 +765,7 @@ bool journal_flusher_co::scan_dirty(int wait_base)
if (bs->journal.inmemory)
{
// Take it from memory
memcpy(it->buf, bs->journal.buffer + submit_offset, submit_len);
memcpy(it->buf, (uint8_t*)bs->journal.buffer + submit_offset, submit_len);
}
else
{
@@ -826,7 +829,7 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_
wr.pos = ((meta_loc >> bs->block_order) % (bs->meta_block_size / bs->clean_entry_size));
if (bs->inmemory_meta)
{
wr.buf = bs->metadata_buffer + wr.sector;
wr.buf = (uint8_t*)bs->metadata_buffer + wr.sector;
return true;
}
wr.it = flusher->meta_sectors.find(wr.sector);
@@ -870,10 +873,11 @@ void journal_flusher_co::update_clean_db()
#endif
bs->data_alloc->set(old_clean_loc >> bs->block_order, false);
}
auto & clean_db = bs->clean_db_shard(cur.oid);
if (has_delete)
{
auto clean_it = bs->clean_db.find(cur.oid);
bs->clean_db.erase(clean_it);
auto clean_it = clean_db.find(cur.oid);
clean_db.erase(clean_it);
#ifdef BLOCKSTORE_DEBUG
printf("Free block %lu from %lx:%lx v%lu (delete)\n",
clean_loc >> bs->block_order,
@@ -884,7 +888,7 @@ void journal_flusher_co::update_clean_db()
}
else
{
bs->clean_db[cur.oid] = {
clean_db[cur.oid] = {
.version = cur.version,
.location = clean_loc,
};

@@ -49,7 +49,6 @@ class journal_flusher_co
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
bool skip_copy, has_delete, has_writes;
blockstore_clean_db_t::iterator clean_it;
std::vector<copy_buffer_t> v;
std::vector<copy_buffer_t>::iterator it;
int copy_count;

@@ -118,7 +118,7 @@ void blockstore_impl_t::loop()
// has_writes == 0 - no writes before the current queue item
// has_writes == 1 - some writes in progress
// has_writes == 2 - tried to submit some writes, but failed
int has_writes = 0, op_idx = 0, new_idx = 0;
int has_writes = 0, op_idx = 0, new_idx = 0, done_lists = 0;
for (; op_idx < submit_queue.size(); op_idx++, new_idx++)
{
auto op = submit_queue[op_idx];
@@ -142,7 +142,6 @@ void blockstore_impl_t::loop()
continue;
}
}
unsigned ring_space = ringloop->space_left();
unsigned prev_sqe_pos = ringloop->save();
// 0 = can't submit
// 1 = in progress
@@ -199,9 +198,14 @@ void blockstore_impl_t::loop()
}
else if (op->opcode == BS_OP_LIST)
{
// LIST doesn't need to be blocked by previous modifications
process_list(op);
wr_st = 2;
// LIST doesn't have to be blocked by previous modifications
// But don't do a lot of LISTs at once, because they're blocking and potentially slow
if (single_tick_list_limit <= 0 || done_lists < single_tick_list_limit)
{
process_list(op);
done_lists++;
wr_st = 2;
}
}
if (wr_st == 2)
{
@@ -212,7 +216,6 @@ void blockstore_impl_t::loop()
ringloop->restore(prev_sqe_pos);
if (PRIV(op)->wait_for == WAIT_SQE)
{
PRIV(op)->wait_detail = 1 + ring_space;
// ring is full, stop submission
break;
}
@@ -282,7 +285,7 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
{
if (PRIV(op)->wait_for == WAIT_SQE)
{
if (ringloop->space_left() < PRIV(op)->wait_detail)
if (ringloop->sqes_left() < PRIV(op)->wait_detail)
{
// stop submission if there's still no free space
#ifdef BLOCKSTORE_DEBUG
@@ -372,7 +375,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
};
}
unstable_writes.clear();
op->callback = [this, old_callback](blockstore_op_t *op)
op->callback = [old_callback](blockstore_op_t *op)
{
obj_ver_id *vers = (obj_ver_id*)op->buf;
delete[] vers;
@@ -425,22 +428,104 @@ static bool replace_stable(object_id oid, uint64_t version, int search_start, in
return false;
}
blockstore_clean_db_t& blockstore_impl_t::clean_db_shard(object_id oid)
{
uint64_t pg_num = 0;
uint64_t pool_id = (oid.inode >> (64-POOL_ID_BITS));
auto sh_it = clean_db_settings.find(pool_id);
if (sh_it != clean_db_settings.end())
{
// like map_to_pg()
pg_num = (oid.stripe / sh_it->second.pg_stripe_size) % sh_it->second.pg_count + 1;
}
return clean_db_shards[(pool_id << (64-POOL_ID_BITS)) | pg_num];
}
void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint32_t pg_stripe_size)
{
uint64_t pool_id = (uint64_t)pool;
std::map<pool_pg_id_t, blockstore_clean_db_t> new_shards;
auto sh_it = clean_db_shards.lower_bound((pool_id << (64-POOL_ID_BITS)));
while (sh_it != clean_db_shards.end() &&
(sh_it->first >> (64-POOL_ID_BITS)) == pool_id)
{
for (auto & pair: sh_it->second)
{
// like map_to_pg()
uint64_t pg_num = (pair.first.stripe / pg_stripe_size) % pg_count + 1;
uint64_t shard_id = (pool_id << (64-POOL_ID_BITS)) | pg_num;
new_shards[shard_id][pair.first] = pair.second;
}
clean_db_shards.erase(sh_it++);
}
for (sh_it = new_shards.begin(); sh_it != new_shards.end(); sh_it++)
{
auto & to = clean_db_shards[sh_it->first];
to.swap(sh_it->second);
}
clean_db_settings[pool_id] = (pool_shard_settings_t){
.pg_count = pg_count,
.pg_stripe_size = pg_stripe_size,
};
}
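To make the shard key concrete, a worked example with assumed numbers (POOL_ID_BITS = 16 as defined below):

// Object with oid.stripe = 0x500000 in pool 2, pg_stripe_size = 0x100000, pg_count = 4:
//   pg_num   = (0x500000 / 0x100000) % 4 + 1 = 5 % 4 + 1 = 2
//   shard id = ((uint64_t)2 << 48) | 2 (pool id in the top 16 bits, 1-based PG number below)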
void blockstore_impl_t::process_list(blockstore_op_t *op)
{
uint32_t list_pg = op->offset;
uint32_t list_pg = op->offset+1;
uint32_t pg_count = op->len;
uint64_t pg_stripe_size = op->oid.stripe;
uint64_t min_inode = op->oid.inode;
uint64_t max_inode = op->version;
// Check PG
if (pg_count != 0 && (pg_stripe_size < MIN_BLOCK_SIZE || list_pg >= pg_count))
if (pg_count != 0 && (pg_stripe_size < MIN_BLOCK_SIZE || list_pg > pg_count))
{
op->retval = -EINVAL;
FINISH_OP(op);
return;
}
// Copy clean_db entries (sorted)
int stable_count = 0, stable_alloc = clean_db.size() / (pg_count ? pg_count : 1);
// Check if the DB needs resharding
// (we don't know about PGs from the beginning, we only create "shards" here)
uint64_t first_shard = 0, last_shard = UINT64_MAX;
if (min_inode != 0 &&
// Check if min_inode == max_inode == pool_id<<N, i.e. this is a pool listing
(min_inode >> (64-POOL_ID_BITS)) == (max_inode >> (64-POOL_ID_BITS)))
{
pool_id_t pool_id = (min_inode >> (64-POOL_ID_BITS));
if (pg_count > 1)
{
// Per-pg listing
auto sh_it = clean_db_settings.find(pool_id);
if (sh_it == clean_db_settings.end() ||
sh_it->second.pg_count != pg_count ||
sh_it->second.pg_stripe_size != pg_stripe_size)
{
reshard_clean_db(pool_id, pg_count, pg_stripe_size);
}
first_shard = last_shard = ((uint64_t)pool_id << (64-POOL_ID_BITS)) | list_pg;
}
else
{
// Per-pool listing
first_shard = ((uint64_t)pool_id << (64-POOL_ID_BITS));
last_shard = ((uint64_t)(pool_id+1) << (64-POOL_ID_BITS)) - 1;
}
}
// Copy clean_db entries
int stable_count = 0, stable_alloc = 0;
if (min_inode != max_inode)
{
for (auto shard_it = clean_db_shards.lower_bound(first_shard);
shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
shard_it++)
{
auto & clean_db = shard_it->second;
stable_alloc += clean_db.size();
}
}
else
{
stable_alloc = 32768;
}
obj_ver_id *stable = (obj_ver_id*)malloc(sizeof(obj_ver_id) * stable_alloc);
if (!stable)
{
@@ -448,7 +533,11 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
FINISH_OP(op);
return;
}
for (auto shard_it = clean_db_shards.lower_bound(first_shard);
shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
shard_it++)
{
auto & clean_db = shard_it->second;
auto clean_it = clean_db.begin(), clean_end = clean_db.end();
if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
{
@@ -463,26 +552,28 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
}
for (; clean_it != clean_end; clean_it++)
{
if (!pg_count || ((clean_it->first.stripe / pg_stripe_size) % pg_count) == list_pg) // like map_to_pg()
if (stable_count >= stable_alloc)
{
if (stable_count >= stable_alloc)
stable_alloc *= 2;
stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
if (!stable)
{
stable_alloc += 32768;
stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
if (!stable)
{
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
stable[stable_count++] = {
.oid = clean_it->first,
.version = clean_it->second.version,
};
}
stable[stable_count++] = {
.oid = clean_it->first,
.version = clean_it->second.version,
};
}
}
if (first_shard != last_shard)
{
// If that's not a per-PG listing, sort clean entries
std::sort(stable, stable+stable_count);
}
int clean_stable_count = stable_count;
// Copy dirty_db entries (sorted, too)
int unstable_count = 0, unstable_alloc = 0;
@@ -508,7 +599,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
}
for (; dirty_it != dirty_end; dirty_it++)
{
if (!pg_count || ((dirty_it->first.oid.stripe / pg_stripe_size) % pg_count) == list_pg) // like map_to_pg()
if (!pg_count || ((dirty_it->first.oid.stripe / pg_stripe_size) % pg_count + 1) == list_pg) // like map_to_pg()
{
if (IS_DELETE(dirty_it->second.state))
{
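For context, a hypothetical caller sketch filling a BS_OP_LIST operation the way process_list() decodes it above (all numbers made up):

blockstore_op_t *op = new blockstore_op_t();
op->opcode = BS_OP_LIST;
op->offset = 2;                                       // 0-based PG index, list_pg = 3 inside
op->len = 256;                                        // pg_count of the pool
op->oid.stripe = 1 << 20;                             // pg_stripe_size (>= MIN_BLOCK_SIZE)
op->oid.inode = (uint64_t)2 << (64-POOL_ID_BITS);     // min_inode: start of pool 2
op->version = ((uint64_t)3 << (64-POOL_ID_BITS)) - 1; // max_inode: end of pool 2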

@@ -55,9 +55,10 @@
#define IS_DELETE(st) (((st) & 0x0F) == BS_ST_DELETE)
#define BS_SUBMIT_CHECK_SQES(n) \
if (ringloop->space_left() < (n))\
if (ringloop->sqes_left() < (n))\
{\
/* Pause until there are more requests available */\
PRIV(op)->wait_detail = (n);\
PRIV(op)->wait_for = WAIT_SQE;\
return 0;\
}
@@ -71,6 +72,7 @@
if (!sqe)\
{\
/* Pause until there are more requests available */\
PRIV(op)->wait_detail = 1;\
PRIV(op)->wait_for = WAIT_SQE;\
return 0;\
}
@@ -80,6 +82,7 @@
if (!sqe)\
{\
/* Pause until there are more requests available */\
PRIV(op)->wait_detail = 1;\
PRIV(op)->wait_for = WAIT_SQE;\
return 0;\
}
@@ -201,6 +204,17 @@ typedef std::map<obj_ver_id, dirty_entry> blockstore_dirty_db_t;
#include "blockstore_flush.h"
typedef uint32_t pool_id_t;
typedef uint64_t pool_pg_id_t;
#define POOL_ID_BITS 16
struct pool_shard_settings_t
{
uint32_t pg_count;
uint32_t pg_stripe_size;
};
class blockstore_impl_t
{
/******* OPTIONS *******/
@@ -238,11 +252,14 @@ class blockstore_impl_t
int throttle_target_parallelism = 1;
// Minimum difference in microseconds between target and real execution times to throttle the response
int throttle_threshold_us = 50;
// Maximum number of LIST operations to be processed in one ring loop tick
int single_tick_list_limit = 1;
/******* END OF OPTIONS *******/
struct ring_consumer_t ring_consumer;
blockstore_clean_db_t clean_db;
std::map<pool_id_t, pool_shard_settings_t> clean_db_settings;
std::map<pool_pg_id_t, blockstore_clean_db_t> clean_db_shards;
uint8_t *clean_bitmap = NULL;
blockstore_dirty_db_t dirty_db;
std::vector<blockstore_op_t*> submit_queue;
@@ -280,7 +297,7 @@ class blockstore_impl_t
friend class blockstore_init_meta;
friend class blockstore_init_journal;
friend class blockstore_journal_check_t;
friend struct blockstore_journal_check_t;
friend class journal_flusher_t;
friend class journal_flusher_co;
@@ -291,6 +308,9 @@ class blockstore_impl_t
void open_journal();
uint8_t* get_clean_entry_bitmap(uint64_t block_loc, int offset);
blockstore_clean_db_t& clean_db_shard(object_id oid);
void reshard_clean_db(pool_id_t pool_id, uint32_t pg_count, uint32_t pg_stripe_size);
// Journaling
void prepare_journal_sector_write(int sector, blockstore_op_t *op);
void handle_journal_write(ring_data_t *data, uint64_t flush_id);

@@ -148,7 +148,7 @@ resume_1:
{
GET_SQE();
data->iov = {
metadata_buffer + (bs->inmemory_meta
(uint8_t*)metadata_buffer + (bs->inmemory_meta
? metadata_read
: (prev == 1 ? bs->metadata_buf_size : 0)),
bs->meta_len - metadata_read > bs->metadata_buf_size ? bs->metadata_buf_size : bs->meta_len - metadata_read,
@@ -169,13 +169,13 @@ resume_1:
if (prev_done)
{
void *done_buf = bs->inmemory_meta
? (metadata_buffer + done_pos)
: (metadata_buffer + (prev_done == 2 ? bs->metadata_buf_size : 0));
? ((uint8_t*)metadata_buffer + done_pos)
: ((uint8_t*)metadata_buffer + (prev_done == 2 ? bs->metadata_buf_size : 0));
unsigned count = bs->meta_block_size / bs->clean_entry_size;
for (int sector = 0; sector < done_len; sector += bs->meta_block_size)
{
// handle <count> entries
handle_entries(done_buf + sector, count, bs->block_order);
handle_entries((uint8_t*)done_buf + sector, count, bs->block_order);
done_cnt += count;
}
prev_done = 0;
@@ -215,17 +215,18 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
{
for (unsigned i = 0; i < count; i++)
{
clean_disk_entry *entry = (clean_disk_entry*)(entries + i*bs->clean_entry_size);
clean_disk_entry *entry = (clean_disk_entry*)((uint8_t*)entries + i*bs->clean_entry_size);
if (!bs->inmemory_meta && bs->clean_entry_bitmap_size)
{
memcpy(bs->clean_bitmap + (done_cnt+i)*2*bs->clean_entry_bitmap_size, &entry->bitmap, 2*bs->clean_entry_bitmap_size);
}
if (entry->oid.inode > 0)
{
auto clean_it = bs->clean_db.find(entry->oid);
if (clean_it == bs->clean_db.end() || clean_it->second.version < entry->version)
auto & clean_db = bs->clean_db_shard(entry->oid);
auto clean_it = clean_db.find(entry->oid);
if (clean_it == clean_db.end() || clean_it->second.version < entry->version)
{
if (clean_it != bs->clean_db.end())
if (clean_it != clean_db.end())
{
// free the previous block
#ifdef BLOCKSTORE_DEBUG
@@ -245,7 +246,7 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
printf("Allocate block (clean entry) %lu: %lx:%lx v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
#endif
bs->data_alloc->set(done_cnt+i, true);
bs->clean_db[entry->oid] = (struct clean_entry){
clean_db[entry->oid] = (struct clean_entry){
.version = entry->version,
.location = (done_cnt+i) << block_order,
};
@@ -440,7 +441,7 @@ resume_1:
if (!bs->journal.inmemory)
submitted_buf = memalign_or_die(MEM_ALIGNMENT, JOURNAL_BUFFER_SIZE);
else
submitted_buf = bs->journal.buffer + journal_pos;
submitted_buf = (uint8_t*)bs->journal.buffer + journal_pos;
data->iov = {
submitted_buf,
end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE,
@@ -570,7 +571,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
resume:
while (pos < bs->journal.block_size)
{
journal_entry *je = (journal_entry*)(buf + proc_pos - done_pos + pos);
journal_entry *je = (journal_entry*)((uint8_t*)buf + proc_pos - done_pos + pos);
if (je->magic != JOURNAL_MAGIC || je_crc32(je) != je->crc32 ||
je->type < JE_MIN || je->type > JE_MAX || started && je->crc32_prev != crc32_last)
{
@@ -619,7 +620,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
if (location >= done_pos && location+je->small_write.len <= done_pos+len)
{
// data is within this buffer
data_crc32 = crc32c(0, buf + location - done_pos, je->small_write.len);
data_crc32 = crc32c(0, (uint8_t*)buf + location - done_pos, je->small_write.len);
}
else
{
@@ -634,7 +635,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
? location+je->small_write.len : done[i].pos+done[i].len);
uint64_t part_begin = (location < done[i].pos ? done[i].pos : location);
covered += part_end - part_begin;
data_crc32 = crc32c(data_crc32, done[i].buf + part_begin - done[i].pos, part_end - part_begin);
data_crc32 = crc32c(data_crc32, (uint8_t*)done[i].buf + part_begin - done[i].pos, part_end - part_begin);
}
}
if (covered < je->small_write.len)
@@ -650,14 +651,15 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
// interesting thing is that we must clear the corrupt entry if we're not readonly,
// because we don't write next entries in the same journal block
printf("Journal entry data is corrupt (data crc32 %x != %x)\n", data_crc32, je->small_write.crc32_data);
memset(buf + proc_pos - done_pos + pos, 0, bs->journal.block_size - pos);
memset((uint8_t*)buf + proc_pos - done_pos + pos, 0, bs->journal.block_size - pos);
bs->journal.next_free = prev_free;
init_write_buf = buf + proc_pos - done_pos;
init_write_buf = (uint8_t*)buf + proc_pos - done_pos;
init_write_sector = proc_pos;
return 0;
}
auto clean_it = bs->clean_db.find(je->small_write.oid);
if (clean_it == bs->clean_db.end() ||
auto & clean_db = bs->clean_db_shard(je->small_write.oid);
auto clean_it = clean_db.find(je->small_write.oid);
if (clean_it == clean_db.end() ||
clean_it->second.version < je->small_write.version)
{
obj_ver_id ov = {
@@ -665,7 +667,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.version = je->small_write.version,
};
void *bmp = NULL;
void *bmp_from = (void*)je + sizeof(journal_entry_small_write);
void *bmp_from = (uint8_t*)je + sizeof(journal_entry_small_write);
if (bs->clean_entry_bitmap_size <= sizeof(void*))
{
memcpy(&bmp, bmp_from, bs->clean_entry_bitmap_size);
@@ -735,8 +737,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
erase_dirty_object(dirty_it);
}
}
auto clean_it = bs->clean_db.find(je->big_write.oid);
if (clean_it == bs->clean_db.end() ||
auto & clean_db = bs->clean_db_shard(je->big_write.oid);
auto clean_it = clean_db.find(je->big_write.oid);
if (clean_it == clean_db.end() ||
clean_it->second.version < je->big_write.version)
{
// oid, version, block
@@ -745,7 +748,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.version = je->big_write.version,
};
void *bmp = NULL;
void *bmp_from = (void*)je + sizeof(journal_entry_big_write);
void *bmp_from = (uint8_t*)je + sizeof(journal_entry_big_write);
if (bs->clean_entry_bitmap_size <= sizeof(void*))
{
memcpy(&bmp, bmp_from, bs->clean_entry_bitmap_size);
@@ -841,8 +844,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
dirty_it--;
dirty_exists = dirty_it->first.oid == je->del.oid;
}
auto clean_it = bs->clean_db.find(je->del.oid);
bool clean_exists = (clean_it != bs->clean_db.end() &&
auto & clean_db = bs->clean_db_shard(je->del.oid);
auto clean_it = clean_db.find(je->del.oid);
bool clean_exists = (clean_it != clean_db.end() &&
clean_it->second.version < je->del.version);
if (!clean_exists && dirty_exists)
{
@@ -901,8 +905,9 @@ void blockstore_init_journal::erase_dirty_object(blockstore_dirty_db_t::iterator
break;
}
}
auto clean_it = bs->clean_db.find(oid);
uint64_t clean_loc = clean_it != bs->clean_db.end()
auto & clean_db = bs->clean_db_shard(oid);
auto clean_it = clean_db.find(oid);
uint64_t clean_loc = clean_it != clean_db.end()
? clean_it->second.location : UINT64_MAX;
if (exists && clean_loc == UINT64_MAX)
{

@@ -6,7 +6,7 @@
class blockstore_init_meta
{
blockstore_impl_t *bs;
int wait_state = 0, wait_count = 0;
int wait_state = 0;
bool zero_on_init = false;
void *metadata_buffer = NULL;
uint64_t metadata_read = 0;

@@ -96,7 +96,8 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
next_pos = next_pos + data_after;
if (next_pos > bs->journal.len)
{
next_pos = bs->journal.block_size + data_after;
if (right_dir)
next_pos = bs->journal.block_size + data_after;
right_dir = false;
}
}
@@ -136,13 +137,13 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
journal.in_sector_pos = 0;
journal.next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
memset(journal.inmemory
? journal.buffer + journal.sector_info[journal.cur_sector].offset
: journal.sector_buf + journal.block_size*journal.cur_sector, 0, journal.block_size);
? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
: (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector, 0, journal.block_size);
}
journal_entry *je = (struct journal_entry*)(
(journal.inmemory
? journal.buffer + journal.sector_info[journal.cur_sector].offset
: journal.sector_buf + journal.block_size*journal.cur_sector) + journal.in_sector_pos
? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
: (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector) + journal.in_sector_pos
);
journal.in_sector_pos += size;
je->magic = JOURNAL_MAGIC;
@@ -168,8 +169,8 @@ void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_
journal.sector_info[cur_sector].flush_count++;
data->iov = (struct iovec){
(journal.inmemory
? journal.buffer + journal.sector_info[cur_sector].offset
: journal.sector_buf + journal.block_size*cur_sector),
? (uint8_t*)journal.buffer + journal.sector_info[cur_sector].offset
: (uint8_t*)journal.sector_buf + journal.block_size*cur_sector),
journal.block_size
};
data->callback = [this, flush_id = journal.submit_id](ring_data_t *data) { handle_journal_write(data, flush_id); };

@@ -24,7 +24,7 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
}
if (journal.inmemory && IS_JOURNAL(item_state))
{
memcpy(buf, journal.buffer + offset, len);
memcpy(buf, (uint8_t*)journal.buffer + offset, len);
return 1;
}
BS_SUBMIT_GET_SQE(sqe, data);
@@ -75,7 +75,7 @@ int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op, uint64_t &fulfille
};
it = PRIV(read_op)->read_vec.insert(it, el);
if (!fulfill_read_push(read_op,
read_op->buf + el.offset - read_op->offset,
(uint8_t*)read_op->buf + el.offset - read_op->offset,
item_location + el.offset - item_start,
el.len, item_state, item_version))
{
@@ -102,7 +102,7 @@ uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offse
{
uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size;
uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size));
clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry) + offset);
clean_entry_bitmap = ((uint8_t*)metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry) + offset);
}
else
clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*2*clean_entry_bitmap_size + offset);
@@ -111,6 +111,7 @@ uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offse
int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
{
auto & clean_db = clean_db_shard(read_op->oid);
auto clean_it = clean_db.find(read_op->oid);
auto dirty_it = dirty_db.upper_bound((obj_ver_id){
.oid = read_op->oid,
@@ -297,6 +298,7 @@ int blockstore_impl_t::read_bitmap(object_id oid, uint64_t target_version, void
dirty_it--;
}
}
auto & clean_db = clean_db_shard(oid);
auto clean_it = clean_db.find(oid);
if (clean_it != clean_db.end())
{

@@ -54,6 +54,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
auto dirty_it = dirty_db.find(*v);
if (dirty_it == dirty_db.end())
{
auto & clean_db = clean_db_shard(v->oid);
auto clean_it = clean_db.find(v->oid);
if (clean_it == clean_db.end() || clean_it->second.version < v->version)
{
@@ -188,6 +189,7 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
}
if (exists == -1)
{
auto & clean_db = clean_db_shard(v.oid);
auto clean_it = clean_db.find(v.oid);
exists = clean_it != clean_db.end() ? 1 : 0;
}
@@ -215,6 +217,7 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
break;
}
}
auto & clean_db = clean_db_shard(v.oid);
auto clean_it = clean_db.find(v.oid);
uint64_t clean_loc = clean_it != clean_db.end()
? clean_it->second.location : UINT64_MAX;

@@ -41,6 +41,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
}
if (!found)
{
auto & clean_db = clean_db_shard(op->oid);
auto clean_it = clean_db.find(op->oid);
if (clean_it != clean_db.end())
{
@@ -102,7 +103,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
// Issue an additional sync so that the previous big write can reach the journal
blockstore_op_t *sync_op = new blockstore_op_t;
sync_op->opcode = BS_OP_SYNC;
sync_op->callback = [this, op](blockstore_op_t *sync_op)
sync_op->callback = [](blockstore_op_t *sync_op)
{
delete sync_op;
};
@@ -380,7 +381,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
if (journal.inmemory)
{
// Copy data
memcpy(journal.buffer + journal.next_free, op->buf, op->len);
memcpy((uint8_t*)journal.buffer + journal.next_free, op->buf, op->len);
}
BS_SUBMIT_GET_SQE(sqe2, data2);
data2->iov = (struct iovec){ op->buf, op->len };
@@ -543,12 +544,13 @@ resume_4:
if (ref_us > exec_us + throttle_threshold_us)
{
// Pause reply
PRIV(op)->op_state = 5;
// Remember that the timer can in theory be called right here
tfd->set_timer_us(ref_us-exec_us, false, [this, op](int timer_id)
{
PRIV(op)->op_state++;
ringloop->wakeup();
});
PRIV(op)->op_state = 5;
return 1;
}
}
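Why op_state is now set before arming the timer (a sketch of the reentrancy hazard the new comment warns about):

// set_timer_us() may in theory invoke the callback immediately; with the old order
// the callback's PRIV(op)->op_state++ would run first and then be overwritten by
// op_state = 5, leaving the operation stuck. Assigning op_state = 5 beforehand
// makes the increment correct no matter when the callback fires.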
@@ -629,13 +631,17 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
{
return 0;
}
write_iodepth++;
// Write current journal sector only if it's dirty and full, or in the immediate_commit mode
BS_SUBMIT_CHECK_SQES(
(immediate_commit != IMMEDIATE_NONE ||
(journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
journal.sector_info[journal.cur_sector].dirty) ? 1 : 0
);
if (write_iodepth >= max_write_iodepth)
{
return 0;
}
write_iodepth++;
// Prepare journal sector write
if (immediate_commit == IMMEDIATE_NONE)
{

@@ -2,8 +2,7 @@
// License: VNPL-1.1 (see README.md for details)
/**
* CLI tool
* Currently can (a) remove inodes and (b) merge snapshot/clone layers
* CLI tool and also a library for administrative tasks
*/
#include <vector>
@@ -17,7 +16,9 @@
static const char *exe_name = NULL;
json11::Json::object cli_tool_t::parse_args(int narg, const char *args[])
static void help();
static json11::Json::object parse_args(int narg, const char *args[])
{
json11::Json::object cfg;
json11::Json::array cmd;
@@ -79,13 +80,16 @@ json11::Json::object cli_tool_t::parse_args(int narg, const char *args[])
return cfg;
}
void cli_tool_t::help()
static void help()
{
printf(
"Vitastor command-line tool\n"
"(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
"\n"
"USAGE:\n"
"%s status\n"
" Show cluster status\n"
"\n"
"%s df\n"
" Show pool space statistics\n"
"\n"
@@ -155,200 +159,132 @@ void cli_tool_t::help()
" --no-color Disable colored output\n"
" --json JSON output\n"
,
exe_name, exe_name, exe_name, exe_name, exe_name, exe_name,
exe_name, exe_name, exe_name, exe_name, exe_name, exe_name, exe_name,
exe_name, exe_name, exe_name, exe_name, exe_name, exe_name
);
exit(0);
}
void cli_tool_t::change_parent(inode_t cur, inode_t new_parent)
{
auto cur_cfg_it = cli->st_cli.inode_config.find(cur);
if (cur_cfg_it == cli->st_cli.inode_config.end())
{
fprintf(stderr, "Inode 0x%lx disappeared\n", cur);
exit(1);
}
inode_config_t new_cfg = cur_cfg_it->second;
std::string cur_name = new_cfg.name;
std::string cur_cfg_key = base64_encode(cli->st_cli.etcd_prefix+
"/config/inode/"+std::to_string(INODE_POOL(cur))+
"/"+std::to_string(INODE_NO_POOL(cur)));
new_cfg.parent_id = new_parent;
json11::Json::object cur_cfg_json = cli->st_cli.serialize_inode_cfg(&new_cfg);
waiting++;
cli->st_cli.etcd_txn(json11::Json::object {
{ "compare", json11::Json::array {
json11::Json::object {
{ "target", "MOD" },
{ "key", cur_cfg_key },
{ "result", "LESS" },
{ "mod_revision", new_cfg.mod_revision+1 },
},
} },
{ "success", json11::Json::array {
json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", cur_cfg_key },
{ "value", base64_encode(json11::Json(cur_cfg_json).dump()) },
} }
},
} },
}, ETCD_SLOW_TIMEOUT, [this, new_parent, cur, cur_name](std::string err, json11::Json res)
{
if (err != "")
{
fprintf(stderr, "Error changing parent of %s: %s\n", cur_name.c_str(), err.c_str());
exit(1);
}
if (!res["succeeded"].bool_value())
{
fprintf(stderr, "Inode %s was modified during snapshot deletion\n", cur_name.c_str());
exit(1);
}
if (new_parent)
{
auto new_parent_it = cli->st_cli.inode_config.find(new_parent);
std::string new_parent_name = new_parent_it != cli->st_cli.inode_config.end()
? new_parent_it->second.name : "<unknown>";
printf(
"Parent of layer %s (inode %lu in pool %u) changed to %s (inode %lu in pool %u)\n",
cur_name.c_str(), INODE_NO_POOL(cur), INODE_POOL(cur),
new_parent_name.c_str(), INODE_NO_POOL(new_parent), INODE_POOL(new_parent)
);
}
else
{
printf(
"Parent of layer %s (inode %lu in pool %u) detached\n",
cur_name.c_str(), INODE_NO_POOL(cur), INODE_POOL(cur)
);
}
waiting--;
ringloop->wakeup();
});
}
inode_config_t* cli_tool_t::get_inode_cfg(const std::string & name)
{
for (auto & ic: cli->st_cli.inode_config)
{
if (ic.second.name == name)
{
return &ic.second;
}
}
fprintf(stderr, "Layer %s not found\n", name.c_str());
exit(1);
}
void cli_tool_t::run(json11::Json cfg)
static int run(cli_tool_t *p, json11::Json cfg)
{
p->parse_config(cfg);
json11::Json::array cmd = cfg["command"].array_items();
std::function<bool(cli_result_t &)> action_cb;
if (!cmd.size())
{
fprintf(stderr, "command is missing\n");
exit(1);
return EINVAL;
}
else if (cmd[0] == "status")
{
// Show cluster status
action_cb = p->start_status(cfg);
}
else if (cmd[0] == "df")
{
// Show pool space stats
action_cb = start_df(cfg);
action_cb = p->start_df(cfg);
}
else if (cmd[0] == "ls")
{
// List images
action_cb = start_ls(cfg);
action_cb = p->start_ls(cfg);
}
else if (cmd[0] == "create" || cmd[0] == "snap-create")
{
// Create image/snapshot
action_cb = start_create(cfg);
action_cb = p->start_create(cfg);
}
else if (cmd[0] == "modify")
{
// Modify image
action_cb = start_modify(cfg);
action_cb = p->start_modify(cfg);
}
else if (cmd[0] == "rm-data")
{
// Delete inode data
action_cb = start_rm(cfg);
action_cb = p->start_rm(cfg);
}
else if (cmd[0] == "merge-data")
{
// Merge layer data without affecting metadata
action_cb = start_merge(cfg);
action_cb = p->start_merge(cfg);
}
else if (cmd[0] == "flatten")
{
// Merge layer data without affecting metadata
action_cb = start_flatten(cfg);
action_cb = p->start_flatten(cfg);
}
else if (cmd[0] == "rm")
{
// Remove multiple snapshots and rebase their children
action_cb = start_snap_rm(cfg);
action_cb = p->start_snap_rm(cfg);
}
else if (cmd[0] == "alloc-osd")
{
// Allocate a new OSD number
action_cb = start_alloc_osd(cfg);
action_cb = p->start_alloc_osd(cfg);
}
else if (cmd[0] == "simple-offsets")
{
// Calculate offsets for simple & stupid OSD deployment without superblock
action_cb = simple_offsets(cfg);
action_cb = p->simple_offsets(cfg);
}
else
{
fprintf(stderr, "unknown command: %s\n", cmd[0].string_value().c_str());
exit(1);
return EINVAL;
}
if (action_cb == NULL)
{
return;
return 0;
}
color = !cfg["no-color"].bool_value();
json_output = cfg["json"].bool_value();
iodepth = cfg["iodepth"].uint64_value();
if (!iodepth)
iodepth = 32;
parallel_osds = cfg["parallel_osds"].uint64_value();
if (!parallel_osds)
parallel_osds = 4;
log_level = cfg["log_level"].int64_value();
progress = cfg["progress"].uint64_value() ? true : false;
list_first = cfg["wait-list"].uint64_value() ? true : false;
// Create client
ringloop = new ring_loop_t(512);
epmgr = new epoll_manager_t(ringloop);
cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
cli->on_ready([this]()
p->ringloop = new ring_loop_t(512);
p->epmgr = new epoll_manager_t(p->ringloop);
p->cli = new cluster_client_t(p->ringloop, p->epmgr->tfd, cfg);
// Use a smaller timeout by default for better interactivity
p->cli->st_cli.etcd_slow_timeout = p->cli->st_cli.etcd_quick_timeout;
ring_consumer_t consumer;
cli_result_t result;
p->cli->on_ready([&]()
{
// Initialize job
consumer.loop = [this]()
consumer.loop = [&]()
{
if (action_cb != NULL)
{
bool done = action_cb();
bool done = action_cb(result);
if (done)
{
action_cb = NULL;
}
}
ringloop->submit();
p->ringloop->submit();
};
ringloop->register_consumer(&consumer);
p->ringloop->register_consumer(&consumer);
consumer.loop();
});
// Loop until it completes
while (action_cb != NULL)
{
ringloop->loop();
p->ringloop->loop();
if (action_cb != NULL)
ringloop->wait();
p->ringloop->wait();
}
// Print result
if (result.text != "")
{
fprintf(stderr, "%s\n", result.text.c_str());
}
// Destroy the client
delete p->cli;
delete p->epmgr;
delete p->ringloop;
p->cli = NULL;
p->epmgr = NULL;
p->ringloop = NULL;
return result.err;
}
int main(int narg, const char *args[])
@@ -357,6 +293,7 @@ int main(int narg, const char *args[])
setvbuf(stderr, NULL, _IONBF, 0);
exe_name = args[0];
cli_tool_t *p = new cli_tool_t();
p->run(cli_tool_t::parse_args(narg, args));
return 0;
int r = run(p, parse_args(narg, args));
delete p;
return r;
}

@@ -19,6 +19,13 @@ class epoll_manager_t;
class cluster_client_t;
struct inode_config_t;
struct cli_result_t
{
int err;
std::string text;
json11::Json data;
};
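A sketch of how a library consumer might drive one action callback and read the cli_result_t (hypothetical caller; the real loop also pumps the io_uring ring, as in run() above):

cli_result_t res;
while (!action_cb(res))
    ; // in real code: ringloop->loop() / ringloop->wait() between attempts
if (res.err)
    fprintf(stderr, "error %d: %s\n", res.err, res.text.c_str());
else
    printf("%s\n", res.data.dump().c_str());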
class cli_tool_t
{
public:
@@ -34,39 +41,38 @@ public:
cluster_client_t *cli = NULL;
int waiting = 0;
ring_consumer_t consumer;
std::function<bool(void)> action_cb;
json11::Json etcd_result;
void run(json11::Json cfg);
void parse_config(json11::Json cfg);
void change_parent(inode_t cur, inode_t new_parent);
inode_config_t* get_inode_cfg(const std::string & name);
static json11::Json::object parse_args(int narg, const char *args[]);
static void help();
friend struct rm_inode_t;
friend struct snap_merger_t;
friend struct snap_flattener_t;
friend struct snap_remover_t;
std::function<bool(void)> start_df(json11::Json);
std::function<bool(void)> start_ls(json11::Json);
std::function<bool(void)> start_create(json11::Json);
std::function<bool(void)> start_modify(json11::Json);
std::function<bool(void)> start_rm(json11::Json);
std::function<bool(void)> start_merge(json11::Json);
std::function<bool(void)> start_flatten(json11::Json);
std::function<bool(void)> start_snap_rm(json11::Json);
std::function<bool(void)> start_alloc_osd(json11::Json cfg, uint64_t *out = NULL);
std::function<bool(void)> simple_offsets(json11::Json cfg);
std::function<bool(cli_result_t &)> start_status(json11::Json);
std::function<bool(cli_result_t &)> start_df(json11::Json);
std::function<bool(cli_result_t &)> start_ls(json11::Json);
std::function<bool(cli_result_t &)> start_create(json11::Json);
std::function<bool(cli_result_t &)> start_modify(json11::Json);
std::function<bool(cli_result_t &)> start_rm(json11::Json);
std::function<bool(cli_result_t &)> start_merge(json11::Json);
std::function<bool(cli_result_t &)> start_flatten(json11::Json);
std::function<bool(cli_result_t &)> start_snap_rm(json11::Json);
std::function<bool(cli_result_t &)> start_alloc_osd(json11::Json cfg);
std::function<bool(cli_result_t &)> simple_offsets(json11::Json cfg);
void etcd_txn(json11::Json txn);
};
uint64_t parse_size(std::string size_str);
std::string print_table(json11::Json items, json11::Json header, bool use_esc);
std::string format_size(uint64_t size);
std::string format_size(uint64_t size, bool nobytes = false);
std::string format_lat(uint64_t lat);

@@ -13,7 +13,6 @@ struct alloc_osd_t
{
cli_tool_t *parent;
json11::Json result;
uint64_t new_id = 1;
int state = 0;
@@ -29,7 +28,7 @@ struct alloc_osd_t
goto resume_1;
do
{
etcd_txn(json11::Json::object {
parent->etcd_txn(json11::Json::object {
{ "compare", json11::Json::array {
json11::Json::object {
{ "target", "VERSION" },
@@ -63,10 +62,10 @@ struct alloc_osd_t
state = 1;
if (parent->waiting > 0)
return;
if (!result["succeeded"].bool_value())
if (!parent->etcd_result["succeeded"].bool_value())
{
std::vector<osd_num_t> used;
for (auto kv: result["responses"][0]["response_range"]["kvs"].array_items())
for (auto kv: parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items())
{
std::string key = base64_decode(kv["key"].string_value());
osd_num_t cur_osd;
@@ -98,41 +97,25 @@ struct alloc_osd_t
new_id = used[e-1]+1;
}
}
} while (!result["succeeded"].bool_value());
} while (!parent->etcd_result["succeeded"].bool_value());
state = 100;
}
void etcd_txn(json11::Json txn)
{
parent->waiting++;
parent->cli->st_cli.etcd_txn(txn, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json res)
{
parent->waiting--;
if (err != "")
{
fprintf(stderr, "Error reading from etcd: %s\n", err.c_str());
exit(1);
}
this->result = res;
parent->ringloop->wakeup();
});
}
};
std::function<bool(void)> cli_tool_t::start_alloc_osd(json11::Json cfg, uint64_t *out)
std::function<bool(cli_result_t &)> cli_tool_t::start_alloc_osd(json11::Json cfg)
{
json11::Json::array cmd = cfg["command"].array_items();
auto alloc_osd = new alloc_osd_t();
alloc_osd->parent = this;
return [alloc_osd, out]()
return [alloc_osd](cli_result_t & result)
{
alloc_osd->loop();
if (alloc_osd->is_done())
{
if (out)
*out = alloc_osd->new_id;
else if (alloc_osd->new_id)
printf("%lu\n", alloc_osd->new_id);
result = (cli_result_t){
.text = std::to_string(alloc_osd->new_id),
.data = json11::Json(alloc_osd->new_id),
};
delete alloc_osd;
return true;
}

src/cli_common.cpp (new file, 118 lines)

@@ -0,0 +1,118 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "base64.h"
#include "cluster_client.h"
#include "cli.h"
void cli_tool_t::change_parent(inode_t cur, inode_t new_parent)
{
auto cur_cfg_it = cli->st_cli.inode_config.find(cur);
if (cur_cfg_it == cli->st_cli.inode_config.end())
{
fprintf(stderr, "Inode 0x%lx disappeared\n", cur);
exit(1);
}
inode_config_t new_cfg = cur_cfg_it->second;
std::string cur_name = new_cfg.name;
std::string cur_cfg_key = base64_encode(cli->st_cli.etcd_prefix+
"/config/inode/"+std::to_string(INODE_POOL(cur))+
"/"+std::to_string(INODE_NO_POOL(cur)));
new_cfg.parent_id = new_parent;
json11::Json::object cur_cfg_json = cli->st_cli.serialize_inode_cfg(&new_cfg);
waiting++;
cli->st_cli.etcd_txn_slow(json11::Json::object {
{ "compare", json11::Json::array {
json11::Json::object {
{ "target", "MOD" },
{ "key", cur_cfg_key },
{ "result", "LESS" },
{ "mod_revision", new_cfg.mod_revision+1 },
},
} },
{ "success", json11::Json::array {
json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", cur_cfg_key },
{ "value", base64_encode(json11::Json(cur_cfg_json).dump()) },
} }
},
} },
}, [this, new_parent, cur, cur_name](std::string err, json11::Json res)
{
if (err != "")
{
fprintf(stderr, "Error changing parent of %s: %s\n", cur_name.c_str(), err.c_str());
exit(1);
}
if (!res["succeeded"].bool_value())
{
fprintf(stderr, "Inode %s was modified during snapshot deletion\n", cur_name.c_str());
exit(1);
}
if (new_parent)
{
auto new_parent_it = cli->st_cli.inode_config.find(new_parent);
std::string new_parent_name = new_parent_it != cli->st_cli.inode_config.end()
? new_parent_it->second.name : "<unknown>";
printf(
"Parent of layer %s (inode %lu in pool %u) changed to %s (inode %lu in pool %u)\n",
cur_name.c_str(), INODE_NO_POOL(cur), INODE_POOL(cur),
new_parent_name.c_str(), INODE_NO_POOL(new_parent), INODE_POOL(new_parent)
);
}
else
{
printf(
"Parent of layer %s (inode %lu in pool %u) detached\n",
cur_name.c_str(), INODE_NO_POOL(cur), INODE_POOL(cur)
);
}
waiting--;
ringloop->wakeup();
});
}
void cli_tool_t::etcd_txn(json11::Json txn)
{
waiting++;
cli->st_cli.etcd_txn_slow(txn, [this](std::string err, json11::Json res)
{
waiting--;
if (err != "")
{
fprintf(stderr, "Error reading from etcd: %s\n", err.c_str());
exit(1);
}
etcd_result = res;
ringloop->wakeup();
});
}
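The shared helper is consumed from the state-machine coroutines like this (sketch mirroring the pattern used in cli_create.cpp and cli_alloc_osd.cpp):

parent->etcd_txn(json11::Json::object {
    { "success", json11::Json::array { /* etcd requests */ } }
});
state = 1;
return;          // control goes back to the ring loop
resume_1:
if (parent->waiting > 0)
    return;      // transaction still in flight
// done: the response has been published in parent->etcd_result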
inode_config_t* cli_tool_t::get_inode_cfg(const std::string & name)
{
for (auto & ic: cli->st_cli.inode_config)
{
if (ic.second.name == name)
{
return &ic.second;
}
}
fprintf(stderr, "Layer %s not found\n", name.c_str());
exit(1);
}
void cli_tool_t::parse_config(json11::Json cfg)
{
color = !cfg["no-color"].bool_value();
json_output = cfg["json"].bool_value();
iodepth = cfg["iodepth"].uint64_value();
if (!iodepth)
iodepth = 32;
parallel_osds = cfg["parallel_osds"].uint64_value();
if (!parallel_osds)
parallel_osds = 4;
log_level = cfg["log_level"].int64_value();
progress = cfg["progress"].uint64_value() ? true : false;
list_first = cfg["wait-list"].uint64_value() ? true : false;
}

@@ -31,9 +31,9 @@ struct image_creator_t
inode_t new_parent_id = 0;
inode_t new_id = 0, old_id = 0;
uint64_t max_id_mod_rev = 0, cfg_mod_rev = 0, idx_mod_rev = 0;
json11::Json result;
int state = 0;
cli_result_t result;
bool is_done()
{
@@ -44,13 +44,27 @@ struct image_creator_t
{
if (state >= 1)
goto resume_1;
if (image_name == "")
{
// FIXME: EINVAL -> specific codes for every error
result = (cli_result_t){ .err = EINVAL, .text = "Image name is missing" };
state = 100;
return;
}
if (image_name.find('@') != std::string::npos)
{
result = (cli_result_t){ .err = EINVAL, .text = "Image name can't contain @ character" };
state = 100;
return;
}
if (new_pool_id)
{
auto & pools = parent->cli->st_cli.pool_config;
if (pools.find(new_pool_id) == pools.end())
{
fprintf(stderr, "Pool %u does not exist\n", new_pool_id);
exit(1);
result = (cli_result_t){ .err = ENOENT, .text = "Pool "+std::to_string(new_pool_id)+" does not exist" };
state = 100;
return;
}
}
else if (new_pool_name != "")
@@ -65,8 +79,9 @@ struct image_creator_t
}
if (!new_pool_id)
{
fprintf(stderr, "Pool %s does not exist\n", new_pool_name.c_str());
exit(1);
result = (cli_result_t){ .err = ENOENT, .text = "Pool "+new_pool_name+" does not exist" };
state = 100;
return;
}
}
else if (parent->cli->st_cli.pool_config.size() == 1)
@@ -92,8 +107,9 @@ struct image_creator_t
{
if (ic.second.name == image_name)
{
fprintf(stderr, "Image %s already exists\n", image_name.c_str());
exit(1);
result = (cli_result_t){ .err = EEXIST, .text = "Image "+image_name+" already exists" };
state = 100;
return;
}
if (ic.second.name == new_parent)
{
@@ -110,45 +126,46 @@ struct image_creator_t
}
if (new_parent != "" && !new_parent_id)
{
fprintf(stderr, "Parent image not found\n");
exit(1);
result = (cli_result_t){ .err = ENOENT, .text = "Parent image "+new_parent+" not found" };
state = 100;
return;
}
if (!new_pool_id)
{
fprintf(stderr, "Pool name or ID is missing\n");
exit(1);
result = (cli_result_t){ .err = EINVAL, .text = "Pool name or ID is missing" };
state = 100;
return;
}
if (!size)
{
fprintf(stderr, "Image size is missing\n");
exit(1);
result = (cli_result_t){ .err = EINVAL, .text = "Image size is missing" };
state = 100;
return;
}
do
{
etcd_txn(json11::Json::object {
parent->etcd_txn(json11::Json::object {
{ "success", json11::Json::array { get_next_id() } }
});
state = 2;
resume_2:
if (parent->waiting > 0)
return;
extract_next_id(result["responses"][0]);
extract_next_id(parent->etcd_result["responses"][0]);
attempt_create();
state = 3;
resume_3:
if (parent->waiting > 0)
return;
if (!result["succeeded"].bool_value() &&
result["responses"][0]["response_range"]["kvs"].array_items().size() > 0)
if (!parent->etcd_result["succeeded"].bool_value() &&
parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items().size() > 0)
{
fprintf(stderr, "Image %s already exists\n", image_name.c_str());
exit(1);
result = (cli_result_t){ .err = EEXIST, .text = "Image "+image_name+" already exists" };
state = 100;
return;
}
} while (!result["succeeded"].bool_value());
if (parent->progress)
{
printf("Image %s created\n", image_name.c_str());
}
} while (!parent->etcd_result["succeeded"].bool_value());
result = (cli_result_t){ .err = 0, .text = "Image "+image_name+" created" };
state = 100;
}
@@ -164,14 +181,16 @@ resume_3:
{
if (ic.second.name == image_name+"@"+new_snap)
{
fprintf(stderr, "Snapshot %s@%s already exists\n", image_name.c_str(), new_snap.c_str());
exit(1);
result = (cli_result_t){ .err = EEXIST, .text = "Snapshot "+image_name+"@"+new_snap+" already exists" };
state = 100;
return;
}
}
if (new_parent != "")
{
fprintf(stderr, "--parent can't be used with snapshots\n");
exit(1);
result = (cli_result_t){ .err = EINVAL, .text = "Parent can't be specified for snapshots" };
state = 100;
return;
}
do
{
@@ -183,8 +202,9 @@ resume_3:
return;
if (!old_id)
{
fprintf(stderr, "Image %s does not exist\n", image_name.c_str());
exit(1);
result = (cli_result_t){ .err = ENOENT, .text = "Image "+image_name+" does not exist" };
state = 100;
return;
}
if (!new_pool_id)
{
@@ -196,17 +216,15 @@ resume_3:
resume_4:
if (parent->waiting > 0)
return;
if (!result["succeeded"].bool_value() &&
result["responses"][0]["response_range"]["kvs"].array_items().size() > 0)
if (!parent->etcd_result["succeeded"].bool_value() &&
parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items().size() > 0)
{
fprintf(stderr, "Snapshot %s@%s already exists\n", image_name.c_str(), new_snap.c_str());
exit(1);
result = (cli_result_t){ .err = EEXIST, .text = "Snapshot "+image_name+"@"+new_snap+" already exists" };
state = 100;
return;
}
} while (!result["succeeded"].bool_value());
if (parent->progress)
{
printf("Snapshot %s@%s created\n", image_name.c_str(), new_snap.c_str());
}
} while (!parent->etcd_result["succeeded"].bool_value());
result = (cli_result_t){ .err = 0, .text = "Snapshot "+image_name+"@"+new_snap+" created" };
state = 100;
}
@@ -246,7 +264,7 @@ resume_4:
goto resume_2;
else if (state == 3)
goto resume_3;
etcd_txn(json11::Json::object { { "success", json11::Json::array {
parent->etcd_txn(json11::Json::object { { "success", json11::Json::array {
get_next_id(),
json11::Json::object {
{ "request_range", json11::Json::object {
@@ -260,11 +278,11 @@ resume_4:
resume_2:
if (parent->waiting > 0)
return;
extract_next_id(result["responses"][0]);
extract_next_id(parent->etcd_result["responses"][0]);
old_id = 0;
old_pool_id = 0;
cfg_mod_rev = idx_mod_rev = 0;
if (result["responses"][1]["response_range"]["kvs"].array_items().size() == 0)
if (parent->etcd_result["responses"][1]["response_range"]["kvs"].array_items().size() == 0)
{
for (auto & ic: parent->cli->st_cli.inode_config)
{
@@ -283,17 +301,18 @@ resume_2:
{
// FIXME: Parse kvs in etcd_state_client automatically
{
auto kv = parent->cli->st_cli.parse_etcd_kv(result["responses"][1]["response_range"]["kvs"][0]);
auto kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][1]["response_range"]["kvs"][0]);
old_id = INODE_NO_POOL(kv.value["id"].uint64_value());
old_pool_id = (pool_id_t)kv.value["pool_id"].uint64_value();
idx_mod_rev = kv.mod_revision;
if (!old_id || !old_pool_id || old_pool_id >= POOL_ID_MAX)
{
fprintf(stderr, "Invalid pool or inode ID in etcd key %s\n", kv.key.c_str());
exit(1);
result = (cli_result_t){ .err = ENOENT, .text = "Invalid pool or inode ID in etcd key "+kv.key };
state = 100;
return;
}
}
etcd_txn(json11::Json::object {
parent->etcd_txn(json11::Json::object {
{ "success", json11::Json::array {
json11::Json::object {
{ "request_range", json11::Json::object {
@@ -310,7 +329,7 @@ resume_3:
if (parent->waiting > 0)
return;
{
auto kv = parent->cli->st_cli.parse_etcd_kv(result["responses"][0]["response_range"]["kvs"][0]);
auto kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][0]["response_range"]["kvs"][0]);
size = kv.value["size"].uint64_value();
new_parent_id = kv.value["parent_id"].uint64_value();
uint64_t parent_pool_id = kv.value["parent_pool_id"].uint64_value();
@@ -439,28 +458,12 @@ resume_3:
} },
});
};
etcd_txn(json11::Json::object {
parent->etcd_txn(json11::Json::object {
{ "compare", checks },
{ "success", success },
{ "failure", failure },
});
}
void etcd_txn(json11::Json txn)
{
parent->waiting++;
parent->cli->st_cli.etcd_txn(txn, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json res)
{
parent->waiting--;
if (err != "")
{
fprintf(stderr, "Error reading from etcd: %s\n", err.c_str());
exit(1);
}
this->result = res;
parent->ringloop->wakeup();
});
}
};
uint64_t parse_size(std::string size_str)
@@ -474,25 +477,24 @@ uint64_t parse_size(std::string size_str)
if (type_char == 'k' || type_char == 'm' || type_char == 'g' || type_char == 't')
{
if (type_char == 'k')
mul = 1l<<10;
mul = (uint64_t)1<<10;
else if (type_char == 'm')
mul = 1l<<20;
mul = (uint64_t)1<<20;
else if (type_char == 'g')
mul = 1l<<30;
mul = (uint64_t)1<<30;
else /*if (type_char == 't')*/
mul = 1l<<40;
mul = (uint64_t)1<<40;
size_str = size_str.substr(0, size_str.length()-1);
}
uint64_t size = json11::Json(size_str).uint64_value() * mul;
if (size == 0 && size_str != "0" && (size_str != "" || mul != 1))
{
fprintf(stderr, "Invalid syntax for size: %s\n", size_str.c_str());
exit(1);
return UINT64_MAX;
}
return size;
}
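parse_size() now signals bad input with UINT64_MAX instead of exiting, so callers must check the return value (usage sketch):

uint64_t sz = parse_size("10G"); // 10 * 2^30 = 10737418240
if (sz == UINT64_MAX)
    fprintf(stderr, "Invalid size\n");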
std::function<bool(void)> cli_tool_t::start_create(json11::Json cfg)
std::function<bool(cli_result_t &)> cli_tool_t::start_create(json11::Json cfg)
{
json11::Json::array cmd = cfg["command"].array_items();
auto image_creator = new image_creator_t();
@@ -509,8 +511,12 @@ std::function<bool(void)> cli_tool_t::start_create(json11::Json cfg)
int p = image_creator->image_name.find('@');
if (p == std::string::npos || p == image_creator->image_name.length()-1)
{
fprintf(stderr, "Please specify new snapshot name after @\n");
exit(1);
delete image_creator;
return [](cli_result_t & result)
{
result = (cli_result_t){ .err = EINVAL, .text = "Please specify new snapshot name after @" };
return true;
};
}
image_creator->new_snap = image_creator->image_name.substr(p + 1);
image_creator->image_name = image_creator->image_name.substr(0, p);
@@ -519,32 +525,39 @@ std::function<bool(void)> cli_tool_t::start_create(json11::Json cfg)
if (cfg["size"].string_value() != "")
{
image_creator->size = parse_size(cfg["size"].string_value());
if (image_creator->size == UINT64_MAX)
{
delete image_creator;
return [size = cfg["size"].string_value()](cli_result_t & result)
{
result = (cli_result_t){ .err = EINVAL, .text = "Invalid syntax for size: "+size };
return true;
};
}
if (image_creator->size % 4096)
{
fprintf(stderr, "Size should be a multiple of 4096\n");
exit(1);
delete image_creator;
return [](cli_result_t & result)
{
result = (cli_result_t){ .err = EINVAL, .text = "Size should be a multiple of 4096" };
return true;
};
}
if (image_creator->new_snap != "")
{
fprintf(stderr, "--size can't be specified for snapshots\n");
exit(1);
delete image_creator;
return [](cli_result_t & result)
{
result = (cli_result_t){ .err = EINVAL, .text = "Size can't be specified for snapshots" };
return true;
};
}
}
if (image_creator->image_name == "")
{
fprintf(stderr, "Image name is missing\n");
exit(1);
}
if (image_creator->image_name.find('@') != std::string::npos)
{
fprintf(stderr, "Image name can't contain @ character\n");
exit(1);
}
return [image_creator]()
return [image_creator](cli_result_t & result)
{
image_creator->loop();
if (image_creator->is_done())
{
result = image_creator->result;
delete image_creator;
return true;
}

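For context, a minimal sketch of how the new std::function<bool(cli_result_t &)> actions are meant to be driven — cli_result_t and start_create() come from this changeset, while the polling loop below is only illustrative (the real CLI re-polls from its ring loop instead of spinning):

cli_result_t result;
auto action = start_create(cfg);
while (!action(result))
    ringloop->loop(); // let io_uring / etcd callbacks run, then poll again
if (result.err)
    fprintf(stderr, "%s\n", result.text.c_str());
else if (result.text != "")
    printf("%s\n", result.text.c_str());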
View File

@@ -12,6 +12,7 @@ struct pool_lister_t
int state = 0;
json11::Json space_info;
cli_result_t result;
std::map<pool_id_t, json11::Json::object> pool_stats;
bool is_done()
@@ -24,8 +25,7 @@ struct pool_lister_t
if (state == 1)
goto resume_1;
// Space statistics - pool/stats/<pool>
parent->waiting++;
parent->cli->st_cli.etcd_txn(json11::Json::object {
parent->etcd_txn(json11::Json::object {
{ "success", json11::Json::array {
json11::Json::object {
{ "request_range", json11::Json::object {
@@ -48,21 +48,12 @@ struct pool_lister_t
} },
},
} },
}, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json res)
{
parent->waiting--;
if (err != "")
{
fprintf(stderr, "Error reading from etcd: %s\n", err.c_str());
exit(1);
}
space_info = res;
parent->ringloop->wakeup();
});
state = 1;
resume_1:
if (parent->waiting > 0)
return;
space_info = parent->etcd_result;
std::map<pool_id_t, uint64_t> osd_free;
for (auto & kv_item: space_info["responses"][0]["response_range"]["kvs"].array_items())
{
@@ -118,9 +109,14 @@ resume_1:
pool_avail = pg_free;
}
}
if (pool_avail == UINT64_MAX)
{
pool_avail = 0;
}
if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
{
pool_avail = pool_avail * (pool_cfg.pg_size - pool_cfg.parity_chunks) / pool_stats[pool_cfg.id]["pg_real_size"].uint64_value();
uint64_t pg_real_size = pool_stats[pool_cfg.id]["pg_real_size"].uint64_value();
pool_avail = pg_real_size > 0 ? pool_avail * (pool_cfg.pg_size - pool_cfg.parity_chunks) / pg_real_size : 0;
}
pool_stats[pool_cfg.id] = json11::Json::object {
{ "name", pool_cfg.name },
@@ -129,8 +125,8 @@ resume_1:
{ "scheme_name", pool_cfg.scheme == POOL_SCHEME_REPLICATED
? std::to_string(pool_cfg.pg_size)+"/"+std::to_string(pool_cfg.pg_minsize)
: "EC "+std::to_string(pool_cfg.pg_size-pool_cfg.parity_chunks)+"+"+std::to_string(pool_cfg.parity_chunks) },
{ "used_raw", (uint64_t)(pool_stats[pool_cfg.id]["used_raw_tb"].number_value() * (1l<<40)) },
{ "total_raw", (uint64_t)(pool_stats[pool_cfg.id]["total_raw_tb"].number_value() * (1l<<40)) },
{ "used_raw", (uint64_t)(pool_stats[pool_cfg.id]["used_raw_tb"].number_value() * ((uint64_t)1<<40)) },
{ "total_raw", (uint64_t)(pool_stats[pool_cfg.id]["total_raw_tb"].number_value() * ((uint64_t)1<<40)) },
{ "max_available", pool_avail },
{ "raw_to_usable", pool_stats[pool_cfg.id]["raw_to_usable"].number_value() },
{ "space_efficiency", pool_stats[pool_cfg.id]["space_efficiency"].number_value() },
@@ -155,10 +151,10 @@ resume_1:
get_stats();
if (parent->waiting > 0)
return;
result.data = to_list();
if (parent->json_output)
{
// JSON output
printf("%s\n", json11::Json(to_list()).dump().c_str());
state = 100;
return;
}
@@ -199,28 +195,34 @@ resume_1:
json11::Json::array list;
for (auto & kv: pool_stats)
{
kv.second["total_fmt"] = format_size(kv.second["total_raw"].uint64_value() / kv.second["raw_to_usable"].number_value());
kv.second["used_fmt"] = format_size(kv.second["used_raw"].uint64_value() / kv.second["raw_to_usable"].number_value());
double raw_to = kv.second["raw_to_usable"].number_value();
if (raw_to < 0.000001 && raw_to > -0.000001)
raw_to = 1;
kv.second["total_fmt"] = format_size(kv.second["total_raw"].uint64_value() / raw_to);
kv.second["used_fmt"] = format_size(kv.second["used_raw"].uint64_value() / raw_to);
kv.second["max_avail_fmt"] = format_size(kv.second["max_available"].uint64_value());
kv.second["used_pct"] = format_q(100 - 100*kv.second["max_available"].uint64_value() *
kv.second["raw_to_usable"].number_value() / kv.second["total_raw"].uint64_value())+"%";
kv.second["used_pct"] = format_q(kv.second["total_raw"].uint64_value()
? (100 - 100*kv.second["max_available"].uint64_value() *
kv.second["raw_to_usable"].number_value() / kv.second["total_raw"].uint64_value())
: 100)+"%";
kv.second["eff_fmt"] = format_q(kv.second["space_efficiency"].number_value()*100)+"%";
}
printf("%s", print_table(to_list(), cols, parent->color).c_str());
result.text = print_table(result.data, cols, parent->color);
state = 100;
}
};
std::function<bool(void)> cli_tool_t::start_df(json11::Json cfg)
std::function<bool(cli_result_t &)> cli_tool_t::start_df(json11::Json cfg)
{
json11::Json::array cmd = cfg["command"].array_items();
auto lister = new pool_lister_t();
lister->parent = this;
return [lister]()
return [lister](cli_result_t & result)
{
lister->loop();
if (lister->is_done())
{
result = lister->result;
delete lister;
return true;
}

View File

@@ -3,6 +3,7 @@
#include "cli.h"
#include "cluster_client.h"
#include <sys/stat.h>
// Flatten a layer: merge all parents into a layer and break the connection completely
struct snap_flattener_t
@@ -21,7 +22,8 @@ struct snap_flattener_t
std::string top_parent_name;
inode_t target_id = 0;
int state = 0;
std::function<bool(void)> merger_cb;
std::function<bool(cli_result_t &)> merger_cb;
cli_result_t result;
void get_merge_parents()
{
@@ -36,23 +38,34 @@ struct snap_flattener_t
auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
if (it == parent->cli->st_cli.inode_config.end())
{
fprintf(stderr, "Parent inode of layer %s (id %ld) not found\n", cur->name.c_str(), cur->parent_id);
exit(1);
result = (cli_result_t){
.err = ENOENT,
.text = "Parent inode of layer "+cur->name+" (id "+std::to_string(cur->parent_id)+") does not exist",
.data = json11::Json::object {
{ "error", "parent-not-found" },
{ "inode_id", cur->num },
{ "inode_name", cur->name },
{ "parent_id", cur->parent_id },
},
};
state = 100;
return;
}
cur = &it->second;
chain_list.push_back(cur->num);
}
if (cur->parent_id != 0)
{
fprintf(stderr, "Layer %s has a loop in parents\n", target_name.c_str());
exit(1);
result = (cli_result_t){ .err = EBADF, .text = "Layer "+target_name+" has a loop in parents" };
state = 100;
return;
}
top_parent_name = cur->name;
}
bool is_done()
{
return state == 5;
return state == 100;
}
void loop()
@@ -63,8 +76,16 @@ struct snap_flattener_t
goto resume_2;
else if (state == 3)
goto resume_3;
if (target_name == "")
{
result = (cli_result_t){ .err = EINVAL, .text = "Layer to flatten not specified" };
state = 100;
return;
}
// Get parent layers
get_merge_parents();
if (state == 100)
return;
// Start merger
merger_cb = parent->start_merge(json11::Json::object {
{ "command", json11::Json::array{ "merge-data", top_parent_name, target_name } },
@@ -75,12 +96,17 @@ struct snap_flattener_t
});
// Wait for it
resume_1:
while (!merger_cb())
while (!merger_cb(result))
{
state = 1;
return;
}
merger_cb = NULL;
if (result.err)
{
state = 100;
return;
}
// Change parent
parent->change_parent(target_id, 0);
// Wait for it to complete
@@ -91,31 +117,27 @@ resume_2:
state = 3;
resume_3:
// Done
return;
state = 100;
}
};
std::function<bool(void)> cli_tool_t::start_flatten(json11::Json cfg)
std::function<bool(cli_result_t &)> cli_tool_t::start_flatten(json11::Json cfg)
{
json11::Json::array cmd = cfg["command"].array_items();
auto flattener = new snap_flattener_t();
flattener->parent = this;
flattener->target_name = cmd.size() > 1 ? cmd[1].string_value() : "";
if (flattener->target_name == "")
{
fprintf(stderr, "Layer to flatten argument is missing\n");
exit(1);
}
flattener->fsync_interval = cfg["fsync-interval"].uint64_value();
if (!flattener->fsync_interval)
flattener->fsync_interval = 128;
if (!cfg["cas"].is_null())
flattener->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
return [flattener]()
return [flattener](cli_result_t & result)
{
flattener->loop();
if (flattener->is_done())
{
result = flattener->result;
delete flattener;
return true;
}

View File

@@ -24,6 +24,7 @@ struct image_lister_t
int state = 0;
std::map<inode_t, json11::Json::object> stats;
json11::Json space_info;
cli_result_t result;
bool is_done()
{
@@ -44,8 +45,9 @@ struct image_lister_t
}
if (!list_pool_id)
{
fprintf(stderr, "Pool %s does not exist\n", list_pool_name.c_str());
exit(1);
result = (cli_result_t){ .err = ENOENT, .text = "Pool "+list_pool_name+" does not exist" };
state = 100;
return;
}
}
for (auto & ic: parent->cli->st_cli.inode_config)
@@ -84,8 +86,7 @@ struct image_lister_t
// Space statistics
// inode/stats/<pool>/<inode>::raw_used divided by pool/stats/<pool>::pg_real_size
// multiplied by 1 or number of data drives
parent->waiting++;
parent->cli->st_cli.etcd_txn(json11::Json::object {
parent->etcd_txn(json11::Json::object {
{ "success", json11::Json::array {
json11::Json::object {
{ "request_range", json11::Json::object {
@@ -112,21 +113,12 @@ struct image_lister_t
} },
},
} },
}, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json res)
{
parent->waiting--;
if (err != "")
{
fprintf(stderr, "Error reading from etcd: %s\n", err.c_str());
exit(1);
}
space_info = res;
parent->ringloop->wakeup();
});
state = 1;
resume_1:
if (parent->waiting > 0)
return;
space_info = parent->etcd_result;
std::map<pool_id_t, uint64_t> pool_pg_real_size;
for (auto & kv_item: space_info["responses"][0]["response_range"]["kvs"].array_items())
{
@@ -164,7 +156,7 @@ resume_1:
if (pool_it != parent->cli->st_cli.pool_config.end())
{
auto & pool_cfg = pool_it->second;
used_size = used_size / pool_pg_real_size[pool_id]
used_size = used_size / (pool_pg_real_size[pool_id] ? pool_pg_real_size[pool_id] : 1)
* (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
}
auto stat_it = stats.find(inode_num);
@@ -256,10 +248,10 @@ resume_1:
if (parent->waiting > 0)
return;
}
result.data = to_list();
if (parent->json_output)
{
// JSON output
printf("%s\n", json11::Json(to_list()).dump().c_str());
state = 100;
return;
}
@@ -369,7 +361,7 @@ resume_1:
kv.second["size_fmt"] = format_size(kv.second["size"].uint64_value());
kv.second["ro"] = kv.second["readonly"].bool_value() ? "RO" : "-";
}
printf("%s", print_table(to_list(), cols, parent->color).c_str());
result.text = print_table(to_list(), cols, parent->color);
state = 100;
}
};
@@ -446,23 +438,26 @@ std::string print_table(json11::Json items, json11::Json header, bool use_esc)
return str;
}
static uint64_t size_thresh[] = { 1024l*1024*1024*1024, 1024l*1024*1024, 1024l*1024, 1024, 0 };
static uint64_t size_thresh[] = { (uint64_t)1024*1024*1024*1024, (uint64_t)1024*1024*1024, (uint64_t)1024*1024, 1024, 0 };
static uint64_t size_thresh_d[] = { (uint64_t)1000000000000, (uint64_t)1000000000, (uint64_t)1000000, (uint64_t)1000, 0 };
static const int size_thresh_n = sizeof(size_thresh)/sizeof(size_thresh[0]);
static const char *size_unit = "TGMKB";
std::string format_size(uint64_t size)
std::string format_size(uint64_t size, bool nobytes)
{
uint64_t *thr = nobytes ? size_thresh_d : size_thresh;
char buf[256];
for (int i = 0; i < sizeof(size_thresh)/sizeof(size_thresh[0]); i++)
for (int i = 0; i < size_thresh_n; i++)
{
if (size >= size_thresh[i] || i >= sizeof(size_thresh)/sizeof(size_thresh[0])-1)
if (size >= thr[i] || i >= size_thresh_n-1)
{
double value = size_thresh[i] ? (double)size/size_thresh[i] : size;
double value = thr[i] ? (double)size/thr[i] : size;
int l = snprintf(buf, sizeof(buf), "%.1f", value);
assert(l < sizeof(buf)-2);
if (buf[l-1] == '0')
l -= 2;
buf[l] = ' ';
buf[l+1] = size_unit[i];
buf[l] = i == size_thresh_n-1 && nobytes ? 0 : ' ';
buf[l+1] = i == size_thresh_n-1 && nobytes ? 0 : size_unit[i];
buf[l+2] = 0;
break;
}
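A hedged illustration of the two format_size() modes after this change (assuming the unchanged tail of the function returns buf as a std::string):
// format_size(1536)            -> "1.5 K"  (binary thresholds, TGMKB units)
// format_size((uint64_t)1<<30) -> "1 G"    (trailing ".0" is trimmed)
// format_size(2000000, true)   -> "2 M"    (nobytes: decimal thresholds)
// format_size(999, true)       -> "999"    (nobytes: bare number, no unit letter)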
@@ -553,7 +548,7 @@ back:
return true;
}
std::function<bool(void)> cli_tool_t::start_ls(json11::Json cfg)
std::function<bool(cli_result_t &)> cli_tool_t::start_ls(json11::Json cfg)
{
json11::Json::array cmd = cfg["command"].array_items();
auto lister = new image_lister_t();
@@ -569,11 +564,12 @@ std::function<bool(void)> cli_tool_t::start_ls(json11::Json cfg)
{
lister->only_names.insert(cmd[i].string_value());
}
return [lister]()
return [lister](cli_result_t & result)
{
lister->loop();
if (lister->is_done())
{
result = lister->result;
delete lister;
return true;
}

View File

@@ -12,6 +12,9 @@ struct snap_rw_op_t
cluster_op_t op;
int todo = 0;
uint32_t start = 0, end = 0;
int error_code = 0;
uint64_t error_offset = 0;
bool error_read = false;
};
// Layer merge is the base for multiple operations:
@@ -54,17 +57,27 @@ struct snap_merger_t
uint64_t last_written_offset = 0;
int deleted_unsynced = 0;
uint64_t processed = 0, to_process = 0;
std::string rwo_error;
cli_result_t result;
void start_merge()
{
if (from_name == "" || to_name == "")
{
result = (cli_result_t){ .err = EINVAL, .text = "Beginning or end of the merge sequence is missing" };
state = 100;
return;
}
check_delete_source = delete_source || check_delete_source;
inode_config_t *from_cfg = parent->get_inode_cfg(from_name);
inode_config_t *to_cfg = parent->get_inode_cfg(to_name);
inode_config_t *target_cfg = target_name == "" ? from_cfg : parent->get_inode_cfg(target_name);
if (to_cfg->num == from_cfg->num)
{
fprintf(stderr, "Only one layer specified, nothing to merge\n");
exit(1);
result = (cli_result_t){ .err = EINVAL, .text = "Only one layer specified, nothing to merge" };
state = 100;
return;
}
// Check that to_cfg is actually a child of from_cfg and target_cfg is somewhere between them
std::vector<inode_t> chain_list;
@@ -78,8 +91,18 @@ struct snap_merger_t
auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
if (it == parent->cli->st_cli.inode_config.end())
{
fprintf(stderr, "Parent inode of layer %s (id %ld) not found\n", cur->name.c_str(), cur->parent_id);
exit(1);
result = (cli_result_t){
.err = ENOENT,
.text = "Parent inode of layer "+cur->name+" (id "+std::to_string(cur->parent_id)+") does not exist",
.data = json11::Json::object {
{ "error", "parent-not-found" },
{ "inode_id", cur->num },
{ "inode_name", cur->name },
{ "parent_id", cur->parent_id },
},
};
state = 100;
return;
}
cur = &it->second;
chain_list.push_back(cur->num);
@@ -87,8 +110,9 @@ struct snap_merger_t
}
if (cur->parent_id != from_cfg->num)
{
fprintf(stderr, "Layer %s is not a child of %s\n", to_name.c_str(), from_name.c_str());
exit(1);
result = (cli_result_t){ .err = EINVAL, .text = "Layer "+to_name+" is not a child of "+from_name };
state = 100;
return;
}
chain_list.push_back(from_cfg->num);
layer_block_size[from_cfg->num] = get_block_size(from_cfg->num);
@@ -99,8 +123,9 @@ struct snap_merger_t
}
if (sources.find(target_cfg->num) == sources.end())
{
fprintf(stderr, "Layer %s is not between %s and %s\n", target_name.c_str(), to_name.c_str(), from_name.c_str());
exit(1);
result = (cli_result_t){ .err = EINVAL, .text = "Layer "+target_name+" is not between "+to_name+" and "+from_name };
state = 100;
return;
}
target = target_cfg->num;
target_rank = sources.at(target);
@@ -130,14 +155,15 @@ struct snap_merger_t
int parent_rank = it->second;
if (parent_rank < to_rank && (parent_rank >= target_rank || check_delete_source))
{
fprintf(
stderr, "Layers at or above %s, but below %s are not allowed"
" to have other children, but %s is a child of %s\n",
(check_delete_source ? from_name.c_str() : target_name.c_str()),
to_name.c_str(), ic.second.name.c_str(),
parent->cli->st_cli.inode_config.at(ic.second.parent_id).name.c_str()
);
exit(1);
result = (cli_result_t){
.err = EINVAL,
.text = "Layers at or above "+(check_delete_source ? from_name : target_name)+
", but below "+to_name+" are not allowed to have other children, but "+
ic.second.name+" is a child of "+
parent->cli->st_cli.inode_config.at(ic.second.parent_id).name,
};
state = 100;
return;
}
if (parent_rank >= to_rank)
{
@@ -152,11 +178,14 @@ struct snap_merger_t
use_cas = 0;
}
sources.erase(target);
printf(
"Merging %ld layer(s) into target %s%s (inode %lu in pool %u)\n",
sources.size(), target_cfg->name.c_str(),
use_cas ? " online (with CAS)" : "", INODE_NO_POOL(target), INODE_POOL(target)
);
if (parent->progress)
{
printf(
"Merging %ld layer(s) into target %s%s (inode %lu in pool %u)\n",
sources.size(), target_cfg->name.c_str(),
use_cas ? " online (with CAS)" : "", INODE_NO_POOL(target), INODE_POOL(target)
);
}
target_block_size = get_block_size(target);
}
@@ -253,7 +282,8 @@ struct snap_merger_t
oit = merge_offsets.begin();
resume_5:
// Now read, overwrite and optionally delete offsets one by one
while (in_flight < parent->iodepth*parent->parallel_osds && oit != merge_offsets.end())
while (in_flight < parent->iodepth*parent->parallel_osds &&
oit != merge_offsets.end() && !rwo_error.size())
{
in_flight++;
read_and_write(*oit);
@@ -264,6 +294,15 @@ struct snap_merger_t
printf("\rOverwriting blocks: %lu/%lu", processed, to_process);
}
}
if (in_flight == 0 && rwo_error.size())
{
result = (cli_result_t){
.err = EIO,
.text = rwo_error,
};
state = 100;
return;
}
if (in_flight > 0 || oit != merge_offsets.end())
{
// Wait until overwrites finish
@@ -396,8 +435,9 @@ struct snap_merger_t
{
if (op->retval != op->len)
{
fprintf(stderr, "error reading target at offset %lx: %s\n", op->offset, strerror(-op->retval));
exit(1);
rwo->error_code = -op->retval;
rwo->error_offset = op->offset;
rwo->error_read = true;
}
next_write(rwo);
};
@@ -410,9 +450,9 @@ struct snap_merger_t
// FIXME: Allow to use single write with "holes" (OSDs don't allow it yet)
uint32_t gran = parent->cli->get_bs_bitmap_granularity();
uint64_t bitmap_size = target_block_size / gran;
while (rwo->end < bitmap_size)
while (rwo->end < bitmap_size && !rwo->error_code)
{
auto bit = ((*(uint8_t*)(rwo->op.bitmap_buf + (rwo->end >> 3))) & (1 << (rwo->end & 0x7)));
auto bit = ((*((uint8_t*)rwo->op.bitmap_buf + (rwo->end >> 3))) & (1 << (rwo->end & 0x7)));
if (!bit)
{
if (rwo->end > rwo->start)
@@ -434,7 +474,7 @@ struct snap_merger_t
rwo->end++;
}
}
if (rwo->end > rwo->start)
if (rwo->end > rwo->start && !rwo->error_code)
{
// write start->end
rwo->todo++;
@@ -459,7 +499,7 @@ struct snap_merger_t
subop->len = end-start;
subop->version = version;
subop->flags = OSD_OP_IGNORE_READONLY;
subop->iov.push_back(rwo->buf+start, end-start);
subop->iov.push_back((uint8_t*)rwo->buf+start, end-start);
subop->callback = [this, rwo](cluster_op_t *subop)
{
rwo->todo--;
@@ -473,8 +513,9 @@ struct snap_merger_t
delete subop;
return;
}
fprintf(stderr, "error writing target at offset %lx: %s\n", subop->offset, strerror(-subop->retval));
exit(1);
rwo->error_code = -subop->retval;
rwo->error_offset = subop->offset;
rwo->error_read = false;
}
// Increment CAS version
rwo->op.version++;
@@ -495,7 +536,7 @@ struct snap_merger_t
subop->offset = offset;
subop->len = 0;
subop->flags = OSD_OP_IGNORE_READONLY;
subop->callback = [this](cluster_op_t *subop)
subop->callback = [](cluster_op_t *subop)
{
if (subop->retval != 0)
{
@@ -510,19 +551,20 @@ struct snap_merger_t
{
if (!rwo->todo)
{
if (last_written_offset < rwo->op.offset+target_block_size)
if (!rwo->error_code &&
last_written_offset < rwo->op.offset+target_block_size)
{
last_written_offset = rwo->op.offset+target_block_size;
}
if (delete_source)
if (!rwo->error_code && delete_source)
{
deleted_unsynced++;
if (deleted_unsynced >= fsync_interval)
{
uint64_t from = last_fsync_offset, to = last_written_offset;
uint64_t to = last_written_offset;
cluster_op_t *subop = new cluster_op_t;
subop->opcode = OSD_OP_SYNC;
subop->callback = [this, from, to](cluster_op_t *subop)
subop->callback = [this, to](cluster_op_t *subop)
{
delete subop;
// We can now delete source data between <from> and <to>
@@ -545,13 +587,20 @@ struct snap_merger_t
}
free(rwo->buf);
// Capture the error (if any) before freeing rwo - reading rwo->error_code
// after `delete rwo` would be a use-after-free
if (rwo->error_code)
{
char buf[1024];
snprintf(buf, 1024, "Error %s target at offset %lx: %s",
rwo->error_read ? "reading" : "writing", rwo->error_offset, strerror(rwo->error_code));
rwo_error = std::string(buf);
}
delete rwo;
in_flight--;
continue_merge_reent();
}
}
};
std::function<bool(void)> cli_tool_t::start_merge(json11::Json cfg)
std::function<bool(cli_result_t &)> cli_tool_t::start_merge(json11::Json cfg)
{
json11::Json::array cmd = cfg["command"].array_items();
auto merger = new snap_merger_t();
@@ -559,22 +608,18 @@ std::function<bool(void)> cli_tool_t::start_merge(json11::Json cfg)
merger->from_name = cmd.size() > 1 ? cmd[1].string_value() : "";
merger->to_name = cmd.size() > 2 ? cmd[2].string_value() : "";
merger->target_name = cfg["target"].string_value();
if (merger->from_name == "" || merger->to_name == "")
{
fprintf(stderr, "Beginning or end of the merge sequence is missing\n");
exit(1);
}
merger->delete_source = cfg["delete-source"].string_value() != "";
merger->fsync_interval = cfg["fsync-interval"].uint64_value();
if (!merger->fsync_interval)
merger->fsync_interval = 128;
if (!cfg["cas"].is_null())
merger->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
return [merger]()
return [merger](cli_result_t & result)
{
merger->continue_merge_reent();
if (merger->is_done())
{
result = merger->result;
delete merger;
return true;
}

View File

@@ -23,7 +23,8 @@ struct image_changer_t
bool has_children = false;
int state = 0;
std::function<bool(void)> cb;
std::function<bool(cli_result_t &)> cb;
cli_result_t result;
bool is_done()
{
@@ -36,6 +37,18 @@ struct image_changer_t
goto resume_1;
else if (state == 2)
goto resume_2;
if (image_name == "")
{
result = (cli_result_t){ .err = EINVAL, .text = "Image name is missing" };
state = 100;
return;
}
if (new_size != 0 && (new_size % 4096))
{
result = (cli_result_t){ .err = EINVAL, .text = "Image size should be a multiple of 4096" };
state = 100;
return;
}
for (auto & ic: parent->cli->st_cli.inode_config)
{
if (ic.second.name == image_name)
@@ -46,14 +59,16 @@ struct image_changer_t
}
if (new_name != "" && ic.second.name == new_name)
{
fprintf(stderr, "Image %s already exists\n", new_name.c_str());
exit(1);
result = (cli_result_t){ .err = EEXIST, .text = "Image "+new_name+" already exists" };
state = 100;
return;
}
}
if (!inode_num)
{
fprintf(stderr, "Image %s does not exist\n", image_name.c_str());
exit(1);
result = (cli_result_t){ .err = ENOENT, .text = "Image "+image_name+" does not exist" };
state = 100;
return;
}
for (auto & ic: parent->cli->st_cli.inode_config)
{
@@ -68,7 +83,7 @@ struct image_changer_t
(!new_size || cfg.size == new_size) &&
(new_name == "" || new_name == image_name))
{
printf("No change\n");
result = (cli_result_t){ .text = "No change" };
state = 100;
return;
}
@@ -79,8 +94,9 @@ struct image_changer_t
// Check confirmation when trimming an image with children
if (has_children && !force)
{
fprintf(stderr, "Image %s has children. Refusing to shrink it without --force\n", image_name.c_str());
exit(1);
result = (cli_result_t){ .err = EINVAL, .text = "Image "+image_name+" has children. Refusing to shrink it without --force" };
state = 100;
return;
}
// Shrink the image first
cb = parent->start_rm(json11::Json::object {
@@ -90,12 +106,17 @@ struct image_changer_t
{ "min-offset", new_size },
});
resume_1:
while (!cb())
while (!cb(result))
{
state = 1;
return;
}
cb = NULL;
if (result.err)
{
state = 100;
return;
}
}
cfg.size = new_size;
}
@@ -109,8 +130,9 @@ resume_1:
// Check confirmation when making an image with children read-write
if (has_children && !force)
{
fprintf(stderr, "Image %s has children. Refusing to make it read-write without --force\n", image_name.c_str());
exit(1);
result = (cli_result_t){ .err = EINVAL, .text = "Image "+image_name+" has children. Refusing to make it read-write without --force" };
state = 100;
return;
}
}
if (new_name != "")
@@ -170,52 +192,33 @@ resume_1:
} }
});
}
parent->waiting++;
parent->cli->st_cli.etcd_txn(json11::Json::object {
parent->etcd_txn(json11::Json::object {
{ "compare", checks },
{ "success", success },
}, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json res)
{
if (err != "")
{
fprintf(stderr, "Error changing %s: %s\n", image_name.c_str(), err.c_str());
exit(1);
}
if (!res["succeeded"].bool_value())
{
fprintf(stderr, "Image %s was modified by someone else, please repeat your request\n", image_name.c_str());
exit(1);
}
parent->waiting--;
parent->ringloop->wakeup();
});
state = 2;
resume_2:
if (parent->waiting > 0)
return;
printf("Image %s modified\n", image_name.c_str());
if (!parent->etcd_result["succeeded"].bool_value())
{
result = (cli_result_t){ .err = EAGAIN, .text = "Image "+image_name+" was modified by someone else, please repeat your request" };
state = 100;
return;
}
result = (cli_result_t){ .err = 0, .text = "Image "+image_name+" modified" };
state = 100;
}
};
std::function<bool(void)> cli_tool_t::start_modify(json11::Json cfg)
std::function<bool(cli_result_t &)> cli_tool_t::start_modify(json11::Json cfg)
{
json11::Json::array cmd = cfg["command"].array_items();
auto changer = new image_changer_t();
changer->parent = this;
changer->image_name = cmd.size() > 1 ? cmd[1].string_value() : "";
if (changer->image_name == "")
{
fprintf(stderr, "Image name is missing\n");
exit(1);
}
changer->new_name = cfg["rename"].string_value();
changer->new_size = parse_size(cfg["resize"].string_value());
if (changer->new_size != 0 && (changer->new_size % 4096))
{
fprintf(stderr, "Image size should be a multiple of 4096\n");
exit(1);
}
changer->force = cfg["force"].bool_value();
changer->set_readonly = cfg["readonly"].bool_value();
changer->set_readwrite = cfg["readwrite"].bool_value();
@@ -223,11 +226,12 @@ std::function<bool(void)> cli_tool_t::start_modify(json11::Json cfg)
if (!changer->fsync_interval)
changer->fsync_interval = 128;
// FIXME Check that the image doesn't have children when shrinking
return [changer]()
return [changer](cli_result_t & result)
{
changer->loop();
if (changer->is_done())
{
result = changer->result;
delete changer;
return true;
}

View File

@@ -1,211 +1,633 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include <fcntl.h>
#include "cli.h"
#include "cluster_client.h"
#include "base64.h"
#define RM_LISTING 1
#define RM_REMOVING 2
#define RM_END 3
struct rm_pg_t
// Remove layer(s): similar to merge, but alters metadata and processes multiple merge targets
//
// Exactly one child of the requested layers may be merged using the "inverted" workflow,
// where we merge it "down" into one of the "to-be-removed" layers and then rename the
// "to-be-removed" layer to the child. It may be done either if all writers are stopped
// before trying to delete layers (which is signaled by --writers-stopped) or if that child
// is a read-only layer (snapshot) itself.
//
// This "inverted" workflow trades copying data of one of the deleted layers for copying
// data of one child of the chain which is also a child of the "traded" layer. So we
// choose the (parent,child) pair which has the largest difference between "parent" and
// "child" inode sizes.
//
// All other children of the chain are processed by iterating through them, merging removed
// parents into them and rebasing them to the last layer which isn't a member of the removed
// chain.
//
// Example:
//
// <parent> - <from> - <layer 2> - <to> - <child 1>
// \ \ \- <child 2>
// \ \- <child 3>
// \-<child 4>
//
// 1) Find optimal pair for the "reverse" scenario
// Imagine that it's (<layer 2>, <child 1>) in this example
// 2) Process all children except <child 1>:
// - Merge <from>..<to> to <child 2>
// - Set <child 2> parent to <parent>
// - Repeat for others
// 3) Process <child 1>:
// - Merge <from>..<child 1> to <layer 2>
// - Set <layer 2> parent to <parent>
// - Rename <layer 2> to <child 1>
// 4) Delete other layers of the chain (<from>, <to>)
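// (Illustrative numbers, not from this diff: if <layer 2> holds 40 GB of data
// and <child 1> holds 10 GB, the inverted path copies ~10 GB instead of ~40 GB,
// so (<layer 2>, <child 1>) wins the max(parent_used - child_used) comparison
// performed below in choose_inverse_candidate().)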
struct snap_remover_t
{
pg_num_t pg_num;
osd_num_t rm_osd_num;
std::set<object_id> objects;
std::set<object_id>::iterator obj_pos;
uint64_t obj_count = 0, obj_done = 0;
cli_tool_t *parent;
// remove from..to
std::string from_name, to_name;
// writers are stopped, we can safely change writable layers
bool writers_stopped = false;
// use CAS writes (0 = never, 1 = auto, 2 = always)
int use_cas = 1;
// interval between fsyncs
int fsync_interval = 128;
std::map<inode_t,int> sources;
std::map<inode_t,uint64_t> inode_used;
std::vector<inode_t> merge_children;
std::vector<inode_t> chain_list;
std::map<inode_t,int> inverse_candidates;
inode_t inverse_parent = 0, inverse_child = 0;
inode_t new_parent = 0;
int state = 0;
int in_flight = 0;
};
int current_child = 0;
std::function<bool(cli_result_t &)> cb;
struct rm_inode_t
{
uint64_t inode = 0;
pool_id_t pool_id = 0;
uint64_t min_offset = 0;
cli_result_t result;
cli_tool_t *parent = NULL;
inode_list_t *lister = NULL;
std::vector<rm_pg_t*> lists;
uint64_t total_count = 0, total_done = 0, total_prev_pct = 0;
uint64_t pgs_to_list = 0;
bool lists_done = false;
int state = 0;
void start_delete()
bool is_done()
{
lister = parent->cli->list_inode_start(inode, [this](inode_list_t *lst,
std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)
{
rm_pg_t *rm = new rm_pg_t((rm_pg_t){
.pg_num = pg_num,
.rm_osd_num = primary_osd,
.objects = objects,
.obj_count = objects.size(),
.obj_done = 0,
});
if (min_offset == 0)
{
total_count += objects.size();
}
else
{
for (object_id oid: objects)
{
if (oid.stripe >= min_offset)
{
total_count++;
}
}
}
rm->obj_pos = rm->objects.begin();
lists.push_back(rm);
if (parent->list_first)
{
parent->cli->list_inode_next(lister, 1);
}
if (status & INODE_LIST_DONE)
{
lists_done = true;
}
pgs_to_list--;
continue_delete();
});
if (!lister)
{
fprintf(stderr, "Failed to list inode %lu from pool %u objects\n", INODE_NO_POOL(inode), INODE_POOL(inode));
exit(1);
}
pgs_to_list = parent->cli->list_pg_count(lister);
parent->cli->list_inode_next(lister, parent->parallel_osds);
return state == 9;
}
void send_ops(rm_pg_t *cur_list)
void loop()
{
if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
parent->cli->msgr.osd_peer_fds.end())
{
// Initiate connection
parent->cli->msgr.connect_peer(cur_list->rm_osd_num, parent->cli->st_cli.peer_states[cur_list->rm_osd_num]);
return;
}
while (cur_list->in_flight < parent->iodepth && cur_list->obj_pos != cur_list->objects.end())
{
if (cur_list->obj_pos->stripe >= min_offset)
{
osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT;
op->peer_fd = parent->cli->msgr.osd_peer_fds[cur_list->rm_osd_num];
op->req = (osd_any_op_t){
.rw = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = parent->cli->next_op_id(),
.opcode = OSD_OP_DELETE,
},
.inode = cur_list->obj_pos->inode,
.offset = cur_list->obj_pos->stripe,
.len = 0,
},
};
op->callback = [this, cur_list](osd_op_t *op)
{
cur_list->in_flight--;
if (op->reply.hdr.retval < 0)
{
fprintf(stderr, "Failed to remove object %lx:%lx from PG %u (OSD %lu) (retval=%ld)\n",
op->req.rw.inode, op->req.rw.offset,
cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
}
delete op;
cur_list->obj_done++;
total_done++;
continue_delete();
};
cur_list->in_flight++;
parent->cli->msgr.outbox_push(op);
}
cur_list->obj_pos++;
}
}
void continue_delete()
{
if (parent->list_first && !lists_done)
{
return;
}
for (int i = 0; i < lists.size(); i++)
{
if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end())
{
delete lists[i];
lists.erase(lists.begin()+i, lists.begin()+i+1);
i--;
if (!lists_done)
{
parent->cli->list_inode_next(lister, 1);
}
}
else
{
send_ops(lists[i]);
}
}
if (parent->progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
{
printf("\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
total_prev_pct = total_done*1000/total_count;
}
if (lists_done && !lists.size())
{
printf("Done, inode %lu in pool %u data removed\n", INODE_NO_POOL(inode), pool_id);
state = 2;
}
}
bool loop()
{
if (state == 0)
{
start_delete();
state = 1;
}
else if (state == 1)
{
continue_delete();
}
if (state == 1)
goto resume_1;
else if (state == 2)
goto resume_2;
else if (state == 3)
goto resume_3;
else if (state == 4)
goto resume_4;
else if (state == 5)
goto resume_5;
else if (state == 6)
goto resume_6;
else if (state == 7)
goto resume_7;
else if (state == 8)
goto resume_8;
else if (state == 9)
goto resume_9;
if (from_name == "")
{
return true;
result = (cli_result_t){ .err = EINVAL, .text = "Layer to remove is not specified" };
state = 100;
return;
}
return false;
if (to_name == "")
{
to_name = from_name;
}
// Get children to merge
get_merge_children();
if (state == 100)
return;
// Try to select an inode for the "inverse" optimized scenario
// Read statistics from etcd to do it
read_stats();
if (state == 100)
return;
state = 1;
resume_1:
if (parent->waiting > 0)
return;
choose_inverse_candidate();
// Merge children one by one, except our "inverse" child
for (current_child = 0; current_child < merge_children.size(); current_child++)
{
if (merge_children[current_child] == inverse_child)
continue;
start_merge_child(merge_children[current_child], merge_children[current_child]);
if (state == 100)
return;
resume_2:
while (!cb(result))
{
state = 2;
return;
}
cb = NULL;
if (result.err)
{
state = 100;
return;
}
parent->change_parent(merge_children[current_child], new_parent);
state = 3;
resume_3:
if (parent->waiting > 0)
return;
}
// Merge our "inverse" child into our "inverse" parent
if (inverse_child != 0)
{
start_merge_child(inverse_child, inverse_parent);
if (state == 100)
return;
resume_4:
while (!cb(result))
{
state = 4;
return;
}
cb = NULL;
if (result.err)
{
state = 100;
return;
}
// Delete "inverse" child data
start_delete_source(inverse_child);
if (state == 100)
return;
resume_5:
while (!cb(result))
{
state = 5;
return;
}
cb = NULL;
if (result.err)
{
state = 100;
return;
}
// Delete "inverse" child metadata, rename parent over it,
// and also change parent links of the previous "inverse" child
rename_inverse_parent();
if (state == 100)
return;
state = 6;
resume_6:
if (parent->waiting > 0)
return;
}
// Delete parents, except the "inverse" one
for (current_child = 0; current_child < chain_list.size(); current_child++)
{
if (chain_list[current_child] == inverse_parent)
continue;
start_delete_source(chain_list[current_child]);
resume_7:
while (!cb(result))
{
state = 7;
return;
}
cb = NULL;
if (result.err)
{
state = 100;
return;
}
delete_inode_config(chain_list[current_child]);
if (state == 100)
return;
state = 8;
resume_8:
if (parent->waiting > 0)
return;
}
state = 9;
resume_9:
// Done
return;
}
void get_merge_children()
{
// Get all children of from..to
inode_config_t *from_cfg = parent->get_inode_cfg(from_name);
inode_config_t *to_cfg = parent->get_inode_cfg(to_name);
// Check that to_cfg is actually a child of from_cfg
// FIXME de-copypaste the following piece of code with snap_merger_t
inode_config_t *cur = to_cfg;
chain_list.push_back(cur->num);
while (cur->num != from_cfg->num && cur->parent_id != 0)
{
auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
if (it == parent->cli->st_cli.inode_config.end())
{
char buf[1024];
snprintf(buf, 1024, "Parent inode of layer %s (id 0x%lx) not found", cur->name.c_str(), cur->parent_id);
result = (cli_result_t){ .err = ENOENT, .text = std::string(buf) };
state = 100;
return;
}
cur = &it->second;
chain_list.push_back(cur->num);
}
if (cur->num != from_cfg->num)
{
result = (cli_result_t){ .err = EINVAL, .text = "Layer "+to_name+" is not a child of "+from_name };
state = 100;
return;
}
new_parent = from_cfg->parent_id;
// Calculate ranks
int i = chain_list.size()-1;
for (inode_t item: chain_list)
{
sources[item] = i--;
}
for (auto & ic: parent->cli->st_cli.inode_config)
{
if (!ic.second.parent_id)
{
continue;
}
auto it = sources.find(ic.second.parent_id);
if (it != sources.end() && sources.find(ic.second.num) == sources.end())
{
merge_children.push_back(ic.second.num);
if (ic.second.readonly || writers_stopped)
{
inverse_candidates[ic.second.num] = it->second;
}
}
}
}
void read_stats()
{
if (inverse_candidates.size() == 0)
{
return;
}
json11::Json::array reads;
for (auto cp: inverse_candidates)
{
inode_t inode = cp.first;
reads.push_back(json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(
parent->cli->st_cli.etcd_prefix+
"/inode/stats/"+std::to_string(INODE_POOL(inode))+
"/"+std::to_string(INODE_NO_POOL(inode))
) },
} }
});
}
for (auto cp: sources)
{
inode_t inode = cp.first;
reads.push_back(json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(
parent->cli->st_cli.etcd_prefix+
"/inode/stats/"+std::to_string(INODE_POOL(inode))+
"/"+std::to_string(INODE_NO_POOL(inode))
) },
} }
});
}
parent->waiting++;
parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
{ "success", reads },
}, [this](std::string err, json11::Json data)
{
parent->waiting--;
if (err != "")
{
result = (cli_result_t){ .err = EIO, .text = "Error reading layer statistics from etcd: "+err };
state = 100;
return;
}
for (auto inode_result: data["responses"].array_items())
{
auto kv = parent->cli->st_cli.parse_etcd_kv(inode_result["kvs"][0]);
pool_id_t pool_id = 0;
inode_t inode = 0;
char null_byte = 0;
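// The +13 below skips the "/inode/stats/" infix (13 bytes) that follows
// etcd_prefix in these keys, leaving "<pool>/<inode>" for sscanf to parse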
sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode, &null_byte);
if (!inode || null_byte != 0)
{
result = (cli_result_t){ .err = EIO, .text = "Bad key returned from etcd: "+kv.key };
state = 100;
return;
}
auto pool_cfg_it = parent->cli->st_cli.pool_config.find(pool_id);
if (pool_cfg_it == parent->cli->st_cli.pool_config.end())
{
result = (cli_result_t){ .err = ENOENT, .text = "Pool "+std::to_string(pool_id)+" does not exist" };
state = 100;
return;
}
inode = INODE_WITH_POOL(pool_id, inode);
auto & pool_cfg = pool_cfg_it->second;
uint64_t used_bytes = kv.value["raw_used"].uint64_value() / pool_cfg.pg_size;
if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
{
used_bytes *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
}
inode_used[inode] = used_bytes;
}
parent->ringloop->wakeup();
});
}
void choose_inverse_candidate()
{
uint64_t max_diff = 0;
for (auto cp: inverse_candidates)
{
inode_t child = cp.first;
uint64_t child_used = inode_used[child];
int rank = cp.second;
for (int i = chain_list.size()-rank; i < chain_list.size(); i++)
{
inode_t parent = chain_list[i];
uint64_t parent_used = inode_used[parent];
if (parent_used > child_used && (!max_diff || max_diff < (parent_used-child_used)))
{
max_diff = (parent_used-child_used);
inverse_parent = parent;
inverse_child = child;
}
}
}
}
void rename_inverse_parent()
{
auto child_it = parent->cli->st_cli.inode_config.find(inverse_child);
if (child_it == parent->cli->st_cli.inode_config.end())
{
char buf[1024];
snprintf(buf, 1024, "Inode 0x%lx disappeared", inverse_child);
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
state = 100;
return;
}
auto target_it = parent->cli->st_cli.inode_config.find(inverse_parent);
if (target_it == parent->cli->st_cli.inode_config.end())
{
char buf[1024];
snprintf(buf, 1024, "Inode 0x%lx disappeared", inverse_parent);
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
state = 100;
return;
}
inode_config_t *child_cfg = &child_it->second;
inode_config_t *target_cfg = &target_it->second;
std::string child_name = child_cfg->name;
std::string target_name = target_cfg->name;
std::string child_cfg_key = base64_encode(
parent->cli->st_cli.etcd_prefix+
"/config/inode/"+std::to_string(INODE_POOL(inverse_child))+
"/"+std::to_string(INODE_NO_POOL(inverse_child))
);
std::string target_cfg_key = base64_encode(
parent->cli->st_cli.etcd_prefix+
"/config/inode/"+std::to_string(INODE_POOL(inverse_parent))+
"/"+std::to_string(INODE_NO_POOL(inverse_parent))
);
// Fill new configuration
inode_config_t new_cfg = *child_cfg;
new_cfg.num = target_cfg->num;
new_cfg.parent_id = new_parent;
json11::Json::array cmp = json11::Json::array {
json11::Json::object {
{ "target", "MOD" },
{ "key", child_cfg_key },
{ "result", "LESS" },
{ "mod_revision", child_cfg->mod_revision+1 },
},
json11::Json::object {
{ "target", "MOD" },
{ "key", target_cfg_key },
{ "result", "LESS" },
{ "mod_revision", target_cfg->mod_revision+1 },
},
};
json11::Json::array txn = json11::Json::array {
json11::Json::object {
{ "request_delete_range", json11::Json::object {
{ "key", child_cfg_key },
} },
},
json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", target_cfg_key },
{ "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&new_cfg)).dump()) },
} },
},
json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+child_cfg->name) },
{ "value", base64_encode(json11::Json({
{ "id", INODE_NO_POOL(inverse_parent) },
{ "pool_id", (uint64_t)INODE_POOL(inverse_parent) },
}).dump()) },
} },
},
};
// Reparent children of inverse_child
for (auto & cp: parent->cli->st_cli.inode_config)
{
if (cp.second.parent_id == child_cfg->num)
{
auto cp_cfg = cp.second;
cp_cfg.parent_id = inverse_parent;
auto cp_key = base64_encode(
parent->cli->st_cli.etcd_prefix+
"/config/inode/"+std::to_string(INODE_POOL(cp.second.num))+
"/"+std::to_string(INODE_NO_POOL(cp.second.num))
);
cmp.push_back(json11::Json::object {
{ "target", "MOD" },
{ "key", cp_key },
{ "result", "LESS" },
{ "mod_revision", cp.second.mod_revision+1 },
});
txn.push_back(json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", cp_key },
{ "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&cp_cfg)).dump()) },
} },
});
}
}
parent->waiting++;
parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
{ "compare", cmp },
{ "success", txn },
}, [this, target_name, child_name](std::string err, json11::Json res)
{
parent->waiting--;
if (err != "")
{
result = (cli_result_t){ .err = EIO, .text = "Error renaming "+target_name+" to "+child_name+": "+err };
state = 100;
return;
}
if (!res["succeeded"].bool_value())
{
result = (cli_result_t){
.err = EIO,
.text = "Parent ("+target_name+"), child ("+child_name+"), or one of its children"
" configuration was modified during rename",
};
state = 100;
return;
}
printf("Layer %s renamed to %s\n", target_name.c_str(), child_name.c_str());
parent->ringloop->wakeup();
});
}
void delete_inode_config(inode_t cur)
{
auto cur_cfg_it = parent->cli->st_cli.inode_config.find(cur);
if (cur_cfg_it == parent->cli->st_cli.inode_config.end())
{
char buf[1024];
snprintf(buf, 1024, "Inode 0x%lx disappeared", cur);
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
state = 100;
return;
}
inode_config_t *cur_cfg = &cur_cfg_it->second;
std::string cur_name = cur_cfg->name;
std::string cur_cfg_key = base64_encode(
parent->cli->st_cli.etcd_prefix+
"/config/inode/"+std::to_string(INODE_POOL(cur))+
"/"+std::to_string(INODE_NO_POOL(cur))
);
parent->waiting++;
parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
{ "compare", json11::Json::array {
json11::Json::object {
{ "target", "MOD" },
{ "key", cur_cfg_key },
{ "result", "LESS" },
{ "mod_revision", cur_cfg->mod_revision+1 },
},
} },
{ "success", json11::Json::array {
json11::Json::object {
{ "request_delete_range", json11::Json::object {
{ "key", cur_cfg_key },
} },
},
json11::Json::object {
{ "request_delete_range", json11::Json::object {
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+cur_name) },
} },
},
} },
}, [this, cur_name](std::string err, json11::Json res)
{
parent->waiting--;
if (err != "")
{
result = (cli_result_t){ .err = EIO, .text = "Error deleting "+cur_name+": "+err };
state = 100;
return;
}
if (!res["succeeded"].bool_value())
{
result = (cli_result_t){ .err = EIO, .text = "Layer "+cur_name+" was modified during deletion" };
state = 100;
return;
}
printf("Layer %s deleted\n", cur_name.c_str());
parent->ringloop->wakeup();
});
}
void start_merge_child(inode_t child_inode, inode_t target_inode)
{
auto child_it = parent->cli->st_cli.inode_config.find(child_inode);
if (child_it == parent->cli->st_cli.inode_config.end())
{
char buf[1024];
snprintf(buf, 1024, "Inode 0x%lx disappeared", child_inode);
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
state = 100;
return;
}
auto target_it = parent->cli->st_cli.inode_config.find(target_inode);
if (target_it == parent->cli->st_cli.inode_config.end())
{
char buf[1024];
snprintf(buf, 1024, "Inode 0x%lx disappeared", target_inode);
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
state = 100;
return;
}
cb = parent->start_merge(json11::Json::object {
{ "command", json11::Json::array{ "merge-data", from_name, child_it->second.name } },
{ "target", target_it->second.name },
{ "delete-source", false },
{ "cas", use_cas },
{ "fsync-interval", fsync_interval },
});
}
void start_delete_source(inode_t inode)
{
auto source = parent->cli->st_cli.inode_config.find(inode);
if (source == parent->cli->st_cli.inode_config.end())
{
char buf[1024];
snprintf(buf, 1024, "Inode 0x%lx disappeared", inode);
result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
state = 100;
return;
}
cb = parent->start_rm(json11::Json::object {
{ "inode", inode },
{ "pool", (uint64_t)INODE_POOL(inode) },
{ "fsync-interval", fsync_interval },
});
}
};
std::function<bool(void)> cli_tool_t::start_rm(json11::Json cfg)
std::function<bool(cli_result_t &)> cli_tool_t::start_snap_rm(json11::Json cfg)
{
auto remover = new rm_inode_t();
remover->parent = this;
remover->inode = cfg["inode"].uint64_value();
remover->pool_id = cfg["pool"].uint64_value();
if (remover->pool_id)
json11::Json::array cmd = cfg["command"].array_items();
auto snap_remover = new snap_remover_t();
snap_remover->parent = this;
snap_remover->from_name = cmd.size() > 1 ? cmd[1].string_value() : "";
snap_remover->to_name = cmd.size() > 2 ? cmd[2].string_value() : "";
snap_remover->fsync_interval = cfg["fsync-interval"].uint64_value();
if (!snap_remover->fsync_interval)
snap_remover->fsync_interval = 128;
if (!cfg["cas"].is_null())
snap_remover->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
if (!cfg["writers_stopped"].is_null())
snap_remover->writers_stopped = true;
return [snap_remover](cli_result_t & result)
{
remover->inode = (remover->inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (((uint64_t)remover->pool_id) << (64-POOL_ID_BITS));
}
remover->pool_id = INODE_POOL(remover->inode);
if (!remover->pool_id)
{
fprintf(stderr, "pool is missing\n");
exit(1);
}
remover->min_offset = cfg["min-offset"].uint64_value();
return [remover]()
{
if (remover->loop())
snap_remover->loop();
if (snap_remover->is_done())
{
delete remover;
result = snap_remover->result;
delete snap_remover;
return true;
}
return false;

src/cli_rm_data.cpp Normal file
View File

@@ -0,0 +1,230 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "cli.h"
#include "cluster_client.h"
#define RM_LISTING 1
#define RM_REMOVING 2
#define RM_END 3
struct rm_pg_t
{
pg_num_t pg_num;
osd_num_t rm_osd_num;
std::set<object_id> objects;
std::set<object_id>::iterator obj_pos;
uint64_t obj_count = 0, obj_done = 0;
int state = 0;
int in_flight = 0;
};
struct rm_inode_t
{
uint64_t inode = 0;
pool_id_t pool_id = 0;
uint64_t min_offset = 0;
cli_tool_t *parent = NULL;
inode_list_t *lister = NULL;
std::vector<rm_pg_t*> lists;
uint64_t total_count = 0, total_done = 0, total_prev_pct = 0;
uint64_t pgs_to_list = 0;
bool lists_done = false;
int state = 0;
int error_count = 0;
cli_result_t result;
void start_delete()
{
lister = parent->cli->list_inode_start(inode, [this](inode_list_t *lst,
std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)
{
rm_pg_t *rm = new rm_pg_t((rm_pg_t){
.pg_num = pg_num,
.rm_osd_num = primary_osd,
.objects = objects,
.obj_count = objects.size(),
.obj_done = 0,
});
if (min_offset == 0)
{
total_count += objects.size();
}
else
{
for (object_id oid: objects)
{
if (oid.stripe >= min_offset)
{
total_count++;
}
}
}
rm->obj_pos = rm->objects.begin();
lists.push_back(rm);
if (parent->list_first)
{
parent->cli->list_inode_next(lister, 1);
}
if (status & INODE_LIST_DONE)
{
lists_done = true;
}
pgs_to_list--;
continue_delete();
});
if (!lister)
{
result = (cli_result_t){
.err = EIO,
.text = "Failed to list objects of inode "+std::to_string(INODE_NO_POOL(inode))+
" from pool "+std::to_string(INODE_POOL(inode)),
};
state = 100;
return;
}
pgs_to_list = parent->cli->list_pg_count(lister);
parent->cli->list_inode_next(lister, parent->parallel_osds);
}
void send_ops(rm_pg_t *cur_list)
{
if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
parent->cli->msgr.osd_peer_fds.end())
{
// Initiate connection
parent->cli->msgr.connect_peer(cur_list->rm_osd_num, parent->cli->st_cli.peer_states[cur_list->rm_osd_num]);
return;
}
while (cur_list->in_flight < parent->iodepth && cur_list->obj_pos != cur_list->objects.end())
{
if (cur_list->obj_pos->stripe >= min_offset)
{
osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT;
// Already checked that it exists above, but anyway
op->peer_fd = parent->cli->msgr.osd_peer_fds.at(cur_list->rm_osd_num);
op->req = (osd_any_op_t){
.rw = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = parent->cli->next_op_id(),
.opcode = OSD_OP_DELETE,
},
.inode = cur_list->obj_pos->inode,
.offset = cur_list->obj_pos->stripe,
.len = 0,
},
};
op->callback = [this, cur_list](osd_op_t *op)
{
cur_list->in_flight--;
if (op->reply.hdr.retval < 0)
{
fprintf(stderr, "Failed to remove object %lx:%lx from PG %u (OSD %lu) (retval=%ld)\n",
op->req.rw.inode, op->req.rw.offset,
cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
error_count++;
}
delete op;
cur_list->obj_done++;
total_done++;
continue_delete();
};
cur_list->in_flight++;
parent->cli->msgr.outbox_push(op);
}
cur_list->obj_pos++;
}
}
void continue_delete()
{
if (parent->list_first && !lists_done)
{
return;
}
for (int i = 0; i < lists.size(); i++)
{
if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end())
{
delete lists[i];
lists.erase(lists.begin()+i, lists.begin()+i+1);
i--;
if (!lists_done)
{
parent->cli->list_inode_next(lister, 1);
}
}
else
{
send_ops(lists[i]);
}
}
if (parent->progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
{
printf("\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
total_prev_pct = total_done*1000/total_count;
}
if (lists_done && !lists.size())
{
result = (cli_result_t){
.err = error_count > 0 ? EIO : 0,
.text = error_count > 0 ? "Some blocks were not removed" : (
"Done, inode "+std::to_string(INODE_NO_POOL(inode))+" from pool "+
std::to_string(pool_id)+" removed"),
};
state = 100;
}
}
bool is_done()
{
return state == 100;
}
void loop()
{
if (state == 1)
goto resume_1;
if (!pool_id)
{
result = (cli_result_t){ .err = EINVAL, .text = "Pool is not specified" };
state = 100;
return;
}
start_delete();
if (state == 100)
return;
state = 1;
resume_1:
continue_delete();
}
};
std::function<bool(cli_result_t &)> cli_tool_t::start_rm(json11::Json cfg)
{
auto remover = new rm_inode_t();
remover->parent = this;
remover->inode = cfg["inode"].uint64_value();
remover->pool_id = cfg["pool"].uint64_value();
if (remover->pool_id)
{
remover->inode = (remover->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (((uint64_t)remover->pool_id) << (64-POOL_ID_BITS));
}
remover->pool_id = INODE_POOL(remover->inode);
remover->min_offset = cfg["min-offset"].uint64_value();
return [remover](cli_result_t & result)
{
remover->loop();
if (remover->is_done())
{
result = remover->result;
delete remover;
return true;
}
return false;
};
}
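A hedged usage sketch of this entry point — the "inode", "pool" and "min-offset" keys are exactly the cfg fields read above; the values and the surrounding driver are made up for illustration:

cli_result_t res;
auto rm = start_rm(json11::Json::object {
    { "inode", 5 },      // inode number without the pool bits
    { "pool", 1 },       // pool ID, merged into the inode above
    { "min-offset", 0 }, // only delete objects at or above this offset
});
while (!rm(res))
    ; // run the event loop between polls
printf("%s\n", res.text.c_str());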

View File

@@ -8,9 +8,10 @@
#include "cli.h"
#include "cluster_client.h"
#include "base64.h"
#include <sys/stat.h>
// Calculate offsets for a block device and print OSD command line parameters
std::function<bool(void)> cli_tool_t::simple_offsets(json11::Json cfg)
std::function<bool(cli_result_t &)> cli_tool_t::simple_offsets(json11::Json cfg)
{
std::string device = cfg["command"][1].string_value();
uint64_t object_size = parse_size(cfg["object_size"].string_value());

View File

@@ -1,568 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include <fcntl.h>
#include "cli.h"
#include "cluster_client.h"
#include "base64.h"
// Remove layer(s): similar to merge, but alters metadata and processes multiple merge targets
//
// Exactly one child of the requested layers may be merged using the "inverted" workflow,
// where we merge it "down" into one of the "to-be-removed" layers and then rename the
// "to-be-removed" layer to the child. It may be done either if all writers are stopped
// before trying to delete layers (which is signaled by --writers-stopped) or if that child
// is a read-only layer (snapshot) itself.
//
// This "inverted" workflow trades copying data of one of the deleted layers for copying
// data of one child of the chain which is also a child of the "traded" layer. So we
// choose the (parent,child) pair which has the largest difference between "parent" and
// "child" inode sizes.
//
// All other children of the chain are processed by iterating through them, merging removed
// parents into them and rebasing them to the last layer which isn't a member of the removed
// chain.
//
// Example:
//
// <parent> - <from> - <layer 2> - <to> - <child 1>
// \ \ \- <child 2>
// \ \- <child 3>
// \-<child 4>
//
// 1) Find optimal pair for the "reverse" scenario
// Imagine that it's (<layer 2>, <child 1>) in this example
// 2) Process all children except <child 1>:
// - Merge <from>..<to> to <child 2>
// - Set <child 2> parent to <parent>
// - Repeat for others
// 3) Process <child 1>:
// - Merge <from>..<child 1> to <layer 2>
// - Set <layer 2> parent to <parent>
// - Rename <layer 2> to <child 1>
// 4) Delete other layers of the chain (<from>, <to>)
struct snap_remover_t
{
cli_tool_t *parent;
// remove from..to
std::string from_name, to_name;
// writers are stopped, we can safely change writable layers
bool writers_stopped = false;
// use CAS writes (0 = never, 1 = auto, 2 = always)
int use_cas = 1;
// interval between fsyncs
int fsync_interval = 128;
std::map<inode_t,int> sources;
std::map<inode_t,uint64_t> inode_used;
std::vector<inode_t> merge_children;
std::vector<inode_t> chain_list;
std::map<inode_t,int> inverse_candidates;
inode_t inverse_parent = 0, inverse_child = 0;
inode_t new_parent = 0;
int state = 0;
int current_child = 0;
std::function<bool(void)> cb;
bool is_done()
{
return state == 9;
}
void loop()
{
if (state == 1)
goto resume_1;
else if (state == 2)
goto resume_2;
else if (state == 3)
goto resume_3;
else if (state == 4)
goto resume_4;
else if (state == 5)
goto resume_5;
else if (state == 6)
goto resume_6;
else if (state == 7)
goto resume_7;
else if (state == 8)
goto resume_8;
else if (state == 9)
goto resume_9;
// Get children to merge
get_merge_children();
// Try to select an inode for the "inverse" optimized scenario
// Read statistics from etcd to do it
read_stats();
state = 1;
resume_1:
if (parent->waiting > 0)
return;
choose_inverse_candidate();
// Merge children one by one, except our "inverse" child
for (current_child = 0; current_child < merge_children.size(); current_child++)
{
if (merge_children[current_child] == inverse_child)
continue;
start_merge_child(merge_children[current_child], merge_children[current_child]);
resume_2:
while (!cb())
{
state = 2;
return;
}
cb = NULL;
parent->change_parent(merge_children[current_child], new_parent);
state = 3;
resume_3:
if (parent->waiting > 0)
return;
}
// Merge our "inverse" child into our "inverse" parent
if (inverse_child != 0)
{
start_merge_child(inverse_child, inverse_parent);
resume_4:
while (!cb())
{
state = 4;
return;
}
cb = NULL;
// Delete "inverse" child data
start_delete_source(inverse_child);
resume_5:
while (!cb())
{
state = 5;
return;
}
cb = NULL;
// Delete "inverse" child metadata, rename parent over it,
// and also change parent links of the previous "inverse" child
rename_inverse_parent();
state = 6;
resume_6:
if (parent->waiting > 0)
return;
}
// Delete parents, except the "inverse" one
for (current_child = 0; current_child < chain_list.size(); current_child++)
{
if (chain_list[current_child] == inverse_parent)
continue;
start_delete_source(chain_list[current_child]);
resume_7:
while (!cb())
{
state = 7;
return;
}
cb = NULL;
delete_inode_config(chain_list[current_child]);
state = 8;
resume_8:
if (parent->waiting > 0)
return;
}
state = 9;
resume_9:
// Done
return;
}
void get_merge_children()
{
// Get all children of from..to
inode_config_t *from_cfg = parent->get_inode_cfg(from_name);
inode_config_t *to_cfg = parent->get_inode_cfg(to_name);
// Check that to_cfg is actually a child of from_cfg
// FIXME de-copypaste the following piece of code with snap_merger_t
inode_config_t *cur = to_cfg;
chain_list.push_back(cur->num);
while (cur->num != from_cfg->num && cur->parent_id != 0)
{
auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
if (it == parent->cli->st_cli.inode_config.end())
{
fprintf(stderr, "Parent inode of layer %s (id %ld) not found\n", cur->name.c_str(), cur->parent_id);
exit(1);
}
cur = &it->second;
chain_list.push_back(cur->num);
}
if (cur->num != from_cfg->num)
{
fprintf(stderr, "Layer %s is not a child of %s\n", to_name.c_str(), from_name.c_str());
exit(1);
}
new_parent = from_cfg->parent_id;
// Calculate ranks
int i = chain_list.size()-1;
for (inode_t item: chain_list)
{
sources[item] = i--;
}
for (auto & ic: parent->cli->st_cli.inode_config)
{
if (!ic.second.parent_id)
{
continue;
}
auto it = sources.find(ic.second.parent_id);
if (it != sources.end() && sources.find(ic.second.num) == sources.end())
{
merge_children.push_back(ic.second.num);
if (ic.second.readonly || writers_stopped)
{
inverse_candidates[ic.second.num] = it->second;
}
}
}
}
void read_stats()
{
if (inverse_candidates.size() == 0)
{
return;
}
json11::Json::array reads;
for (auto cp: inverse_candidates)
{
inode_t inode = cp.first;
reads.push_back(json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(
parent->cli->st_cli.etcd_prefix+
"/inode/stats/"+std::to_string(INODE_POOL(inode))+
"/"+std::to_string(INODE_NO_POOL(inode))
) },
} }
});
}
for (auto cp: sources)
{
inode_t inode = cp.first;
reads.push_back(json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(
parent->cli->st_cli.etcd_prefix+
"/inode/stats/"+std::to_string(INODE_POOL(inode))+
"/"+std::to_string(INODE_NO_POOL(inode))
) },
} }
});
}
parent->waiting++;
parent->cli->st_cli.etcd_txn(json11::Json::object {
{ "success", reads },
}, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json data)
{
parent->waiting--;
if (err != "")
{
fprintf(stderr, "Error reading layer statistics from etcd: %s\n", err.c_str());
exit(1);
}
for (auto inode_result: data["responses"].array_items())
{
auto kv = parent->cli->st_cli.parse_etcd_kv(inode_result["kvs"][0]);
pool_id_t pool_id = 0;
inode_t inode = 0;
char null_byte = 0;
sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode, &null_byte);
if (!inode || null_byte != 0)
{
fprintf(stderr, "Bad key returned from etcd: %s\n", kv.key.c_str());
exit(1);
}
auto pool_cfg_it = parent->cli->st_cli.pool_config.find(pool_id);
if (pool_cfg_it == parent->cli->st_cli.pool_config.end())
{
fprintf(stderr, "Pool %u does not exist\n", pool_id);
exit(1);
}
inode = INODE_WITH_POOL(pool_id, inode);
auto & pool_cfg = pool_cfg_it->second;
uint64_t used_bytes = kv.value["raw_used"].uint64_value() / pool_cfg.pg_size;
if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
{
used_bytes *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
}
inode_used[inode] = used_bytes;
}
parent->ringloop->wakeup();
});
}
void choose_inverse_candidate()
{
uint64_t max_diff = 0;
for (auto cp: inverse_candidates)
{
inode_t child = cp.first;
uint64_t child_used = inode_used[child];
int rank = cp.second;
for (int i = chain_list.size()-rank; i < chain_list.size(); i++)
{
inode_t parent = chain_list[i];
uint64_t parent_used = inode_used[parent];
if (parent_used > child_used && (!max_diff || max_diff < (parent_used-child_used)))
{
max_diff = (parent_used-child_used);
inverse_parent = parent;
inverse_child = child;
}
}
}
}
void rename_inverse_parent()
{
auto child_it = parent->cli->st_cli.inode_config.find(inverse_child);
if (child_it == parent->cli->st_cli.inode_config.end())
{
fprintf(stderr, "Inode %ld disappeared\n", inverse_child);
exit(1);
}
auto target_it = parent->cli->st_cli.inode_config.find(inverse_parent);
if (target_it == parent->cli->st_cli.inode_config.end())
{
fprintf(stderr, "Inode %ld disappeared\n", inverse_parent);
exit(1);
}
inode_config_t *child_cfg = &child_it->second;
inode_config_t *target_cfg = &target_it->second;
std::string child_name = child_cfg->name;
std::string target_name = target_cfg->name;
std::string child_cfg_key = base64_encode(
parent->cli->st_cli.etcd_prefix+
"/config/inode/"+std::to_string(INODE_POOL(inverse_child))+
"/"+std::to_string(INODE_NO_POOL(inverse_child))
);
std::string target_cfg_key = base64_encode(
parent->cli->st_cli.etcd_prefix+
"/config/inode/"+std::to_string(INODE_POOL(inverse_parent))+
"/"+std::to_string(INODE_NO_POOL(inverse_parent))
);
// Fill new configuration
inode_config_t new_cfg = *child_cfg;
new_cfg.num = target_cfg->num;
new_cfg.parent_id = new_parent;
json11::Json::array cmp = json11::Json::array {
json11::Json::object {
{ "target", "MOD" },
{ "key", child_cfg_key },
{ "result", "LESS" },
{ "mod_revision", child_cfg->mod_revision+1 },
},
json11::Json::object {
{ "target", "MOD" },
{ "key", target_cfg_key },
{ "result", "LESS" },
{ "mod_revision", target_cfg->mod_revision+1 },
},
};
json11::Json::array txn = json11::Json::array {
json11::Json::object {
{ "request_delete_range", json11::Json::object {
{ "key", child_cfg_key },
} },
},
json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", target_cfg_key },
{ "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&new_cfg)).dump()) },
} },
},
json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+child_cfg->name) },
{ "value", base64_encode(json11::Json({
{ "id", INODE_NO_POOL(inverse_parent) },
{ "pool_id", (uint64_t)INODE_POOL(inverse_parent) },
}).dump()) },
} },
},
};
// Reparent children of inverse_child
for (auto & cp: parent->cli->st_cli.inode_config)
{
if (cp.second.parent_id == child_cfg->num)
{
auto cp_cfg = cp.second;
cp_cfg.parent_id = inverse_parent;
auto cp_key = base64_encode(
parent->cli->st_cli.etcd_prefix+
"/config/inode/"+std::to_string(INODE_POOL(cp.second.num))+
"/"+std::to_string(INODE_NO_POOL(cp.second.num))
);
cmp.push_back(json11::Json::object {
{ "target", "MOD" },
{ "key", cp_key },
{ "result", "LESS" },
{ "mod_revision", cp.second.mod_revision+1 },
});
txn.push_back(json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", cp_key },
{ "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&cp_cfg)).dump()) },
} },
});
}
}
parent->waiting++;
parent->cli->st_cli.etcd_txn(json11::Json::object {
{ "compare", cmp },
{ "success", txn },
}, ETCD_SLOW_TIMEOUT, [this, target_name, child_name](std::string err, json11::Json res)
{
parent->waiting--;
if (err != "")
{
fprintf(stderr, "Error renaming %s to %s: %s\n", target_name.c_str(), child_name.c_str(), err.c_str());
exit(1);
}
if (!res["succeeded"].bool_value())
{
fprintf(
stderr, "Parent (%s), child (%s), or one of its children"
" configuration was modified during rename\n", target_name.c_str(), child_name.c_str()
);
exit(1);
}
printf("Layer %s renamed to %s\n", target_name.c_str(), child_name.c_str());
parent->ringloop->wakeup();
});
}
void delete_inode_config(inode_t cur)
{
auto cur_cfg_it = parent->cli->st_cli.inode_config.find(cur);
if (cur_cfg_it == parent->cli->st_cli.inode_config.end())
{
fprintf(stderr, "Inode 0x%lx disappeared\n", cur);
exit(1);
}
inode_config_t *cur_cfg = &cur_cfg_it->second;
std::string cur_name = cur_cfg->name;
std::string cur_cfg_key = base64_encode(
parent->cli->st_cli.etcd_prefix+
"/config/inode/"+std::to_string(INODE_POOL(cur))+
"/"+std::to_string(INODE_NO_POOL(cur))
);
parent->waiting++;
parent->cli->st_cli.etcd_txn(json11::Json::object {
{ "compare", json11::Json::array {
json11::Json::object {
{ "target", "MOD" },
{ "key", cur_cfg_key },
{ "result", "LESS" },
{ "mod_revision", cur_cfg->mod_revision+1 },
},
} },
{ "success", json11::Json::array {
json11::Json::object {
{ "request_delete_range", json11::Json::object {
{ "key", cur_cfg_key },
} },
},
json11::Json::object {
{ "request_delete_range", json11::Json::object {
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+cur_name) },
} },
},
} },
}, ETCD_SLOW_TIMEOUT, [this, cur_name](std::string err, json11::Json res)
{
parent->waiting--;
if (err != "")
{
fprintf(stderr, "Error deleting %s: %s\n", cur_name.c_str(), err.c_str());
exit(1);
}
if (!res["succeeded"].bool_value())
{
fprintf(stderr, "Layer %s configuration was modified during deletion\n", cur_name.c_str());
exit(1);
}
printf("Layer %s deleted\n", cur_name.c_str());
parent->ringloop->wakeup();
});
}
void start_merge_child(inode_t child_inode, inode_t target_inode)
{
auto child_it = parent->cli->st_cli.inode_config.find(child_inode);
if (child_it == parent->cli->st_cli.inode_config.end())
{
fprintf(stderr, "Inode %ld disappeared\n", child_inode);
exit(1);
}
auto target_it = parent->cli->st_cli.inode_config.find(target_inode);
if (target_it == parent->cli->st_cli.inode_config.end())
{
fprintf(stderr, "Inode %ld disappeared\n", target_inode);
exit(1);
}
cb = parent->start_merge(json11::Json::object {
{ "command", json11::Json::array{ "merge-data", from_name, child_it->second.name } },
{ "target", target_it->second.name },
{ "delete-source", false },
{ "cas", use_cas },
{ "fsync-interval", fsync_interval },
});
}
void start_delete_source(inode_t inode)
{
auto source = parent->cli->st_cli.inode_config.find(inode);
if (source == parent->cli->st_cli.inode_config.end())
{
fprintf(stderr, "Inode %ld disappeared\n", inode);
exit(1);
}
cb = parent->start_rm(json11::Json::object {
{ "inode", inode },
{ "pool", (uint64_t)INODE_POOL(inode) },
{ "fsync-interval", fsync_interval },
});
}
};
std::function<bool(void)> cli_tool_t::start_snap_rm(json11::Json cfg)
{
json11::Json::array cmd = cfg["command"].array_items();
auto snap_remover = new snap_remover_t();
snap_remover->parent = this;
snap_remover->from_name = cmd.size() > 1 ? cmd[1].string_value() : "";
snap_remover->to_name = cmd.size() > 2 ? cmd[2].string_value() : "";
if (snap_remover->from_name == "")
{
fprintf(stderr, "Layer to remove argument is missing\n");
exit(1);
}
if (snap_remover->to_name == "")
{
snap_remover->to_name = snap_remover->from_name;
}
snap_remover->fsync_interval = cfg["fsync-interval"].uint64_value();
if (!snap_remover->fsync_interval)
snap_remover->fsync_interval = 128;
if (!cfg["cas"].is_null())
snap_remover->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
if (!cfg["writers_stopped"].is_null())
snap_remover->writers_stopped = true;
return [snap_remover]()
{
snap_remover->loop();
if (snap_remover->is_done())
{
delete snap_remover;
return true;
}
return false;
};
}
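The loop()/resume_N structure in snap_remover_t above is a hand-rolled resumable state machine: each asynchronous step stores a state number and returns, and the next invocation (driven by the event loop) jumps straight back to the matching resume label. A minimal self-contained sketch of the same technique (all names here are illustrative, not Vitastor API):

#include <cstdio>
#include <functional>

struct two_step_task_t
{
    int state = 0;
    std::function<bool(void)> cb; // stands in for an async sub-operation
    bool is_done() { return state == 2; }
    void loop()
    {
        if (state == 1)
            goto resume_1;
        printf("step 1: start async sub-operation\n");
        cb = [n = 0]() mutable { return ++n >= 3; }; // "completes" on the 3rd poll
    resume_1:
        while (!cb())
        {
            state = 1;
            return; // yield; the caller re-enters loop() later
        }
        cb = NULL;
        printf("step 2: finished\n");
        state = 2;
    }
};

int main()
{
    two_step_task_t t;
    while (!t.is_done())
        t.loop(); // in snap_remover_t this re-entry is driven by the ring loop
}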

src/cli_status.cpp Normal file

@@ -0,0 +1,296 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "cli.h"
#include "cluster_client.h"
#include "base64.h"
#include "pg_states.h"
// Print cluster status:
// etcd, mon, osd states
// raw/used space, object states, pool states, pg states
// client io, recovery io, rebalance io
struct status_printer_t
{
cli_tool_t *parent;
int state = 0;
json11::Json::array mon_members, osd_stats;
json11::Json agg_stats;
std::map<pool_id_t, json11::Json::object> pool_stats;
json11::Json::array etcd_states;
bool is_done()
{
return state == 100;
}
void loop()
{
if (state == 1)
goto resume_1;
else if (state == 2)
goto resume_2;
// etcd states
{
auto addrs = parent->cli->st_cli.get_addresses();
etcd_states.resize(addrs.size());
for (int i = 0; i < etcd_states.size(); i++)
{
parent->waiting++;
parent->cli->st_cli.etcd_call_oneshot(
addrs[i], "/maintenance/status", json11::Json::object(),
parent->cli->st_cli.etcd_quick_timeout, [this, i](std::string err, json11::Json res)
{
parent->waiting--;
etcd_states[i] = err != "" ? json11::Json::object{ { "error", err } } : res;
parent->ringloop->wakeup();
}
);
}
}
state = 1;
resume_1:
if (parent->waiting > 0)
return;
// Monitors, OSD states
parent->etcd_txn(json11::Json::object {
{ "success", json11::Json::array {
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/mon/") },
{ "range_end", base64_encode(parent->cli->st_cli.etcd_prefix+"/mon0") },
} },
},
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(
parent->cli->st_cli.etcd_prefix+"/osd/stats/"
) },
{ "range_end", base64_encode(
parent->cli->st_cli.etcd_prefix+"/osd/stats0"
) },
} },
},
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/stats") },
} },
},
} },
});
state = 2;
resume_2:
if (parent->waiting > 0)
return;
mon_members = parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items();
osd_stats = parent->etcd_result["responses"][1]["response_range"]["kvs"].array_items();
if (parent->etcd_result["responses"][2]["response_range"]["kvs"].array_items().size() > 0)
{
agg_stats = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][2]["response_range"]["kvs"][0]).value;
}
int etcd_alive = 0;
uint64_t etcd_db_size = 0;
std::string etcd_detail;
for (int i = 0; i < etcd_states.size(); i++)
{
if (etcd_states[i]["error"].is_null())
{
etcd_alive++;
etcd_db_size = etcd_states[i]["dbSizeInUse"].uint64_value();
}
}
int mon_count = 0;
std::string mon_master;
for (int i = 0; i < mon_members.size(); i++)
{
auto kv = parent->cli->st_cli.parse_etcd_kv(mon_members[i]);
kv.key = kv.key.substr(parent->cli->st_cli.etcd_prefix.size());
if (kv.key.substr(0, 12) == "/mon/member/")
mon_count++;
else if (kv.key == "/mon/master")
{
if (kv.value["hostname"].is_string())
mon_master = kv.value["hostname"].string_value();
else
mon_master = kv.value["ip"][0].string_value();
}
}
int osd_count = 0, osd_up = 0;
uint64_t total_raw = 0, free_raw = 0, free_down_raw = 0, down_raw = 0;
for (int i = 0; i < osd_stats.size(); i++)
{
auto kv = parent->cli->st_cli.parse_etcd_kv(osd_stats[i]);
osd_num_t stat_osd_num = 0;
char null_byte = 0;
sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.size(), "/osd/stats/%lu%c", &stat_osd_num, &null_byte);
if (!stat_osd_num || null_byte != 0)
{
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
continue;
}
osd_count++;
total_raw += kv.value["size"].uint64_value();
free_raw += kv.value["free"].uint64_value();
auto peer_it = parent->cli->st_cli.peer_states.find(stat_osd_num);
if (peer_it != parent->cli->st_cli.peer_states.end())
{
osd_up++;
}
else
{
down_raw += kv.value["size"].uint64_value();
free_down_raw += kv.value["size"].uint64_value();
}
}
int pool_count = 0, pools_active = 0;
std::map<std::string, int> pgs_by_state;
std::string pgs_by_state_str;
for (auto & pool_pair: parent->cli->st_cli.pool_config)
{
auto & pool_cfg = pool_pair.second;
bool active = true;
if (pool_cfg.pg_config.size() != pool_cfg.pg_count)
{
active = false;
pgs_by_state["offline"] += pool_cfg.pg_count-pool_cfg.pg_config.size();
}
pool_count++;
for (auto pg_it = pool_cfg.pg_config.begin(); pg_it != pool_cfg.pg_config.end(); pg_it++)
{
if (!(pg_it->second.cur_state & PG_ACTIVE))
{
active = false;
}
std::string pg_state_str;
for (int i = 0; i < pg_state_bit_count; i++)
{
if (pg_it->second.cur_state & pg_state_bits[i])
{
pg_state_str += "+";
pg_state_str += pg_state_names[i];
}
}
if (pg_state_str.size())
pgs_by_state[pg_state_str.substr(1)]++;
else
pgs_by_state["offline"]++;
}
if (active)
{
pools_active++;
}
}
for (auto & kv: pgs_by_state)
{
if (pgs_by_state_str.size())
{
pgs_by_state_str += "\n ";
}
pgs_by_state_str += std::to_string(kv.second)+" "+kv.first;
}
uint64_t object_size = parent->cli->get_bs_block_size();
std::string more_states;
uint64_t obj_n;
obj_n = agg_stats["object_counts"]["misplaced"].uint64_value();
if (obj_n > 0)
more_states += ", "+format_size(obj_n*object_size)+" misplaced";
obj_n = agg_stats["object_counts"]["degraded"].uint64_value();
if (obj_n > 0)
more_states += ", "+format_size(obj_n*object_size)+" degraded";
obj_n = agg_stats["object_counts"]["incomplete"].uint64_value();
if (obj_n > 0)
more_states += ", "+format_size(obj_n*object_size)+" incomplete";
std::string recovery_io;
{
uint64_t deg_bps = agg_stats["recovery_stats"]["degraded"]["bps"].uint64_value();
uint64_t deg_iops = agg_stats["recovery_stats"]["degraded"]["iops"].uint64_value();
uint64_t misp_bps = agg_stats["recovery_stats"]["misplaced"]["bps"].uint64_value();
uint64_t misp_iops = agg_stats["recovery_stats"]["misplaced"]["iops"].uint64_value();
if (deg_iops > 0 || deg_bps > 0)
recovery_io += " recovery: "+format_size(deg_bps)+"/s, "+format_size(deg_iops, true)+" op/s\n";
if (misp_iops > 0 || misp_bps > 0)
recovery_io += " rebalance: "+format_size(misp_bps)+"/s, "+format_size(misp_iops, true)+" op/s\n";
}
if (parent->json_output)
{
// JSON output
printf("%s\n", json11::Json(json11::Json::object {
{ "etcd_alive", etcd_alive },
{ "etcd_count", (uint64_t)etcd_states.size() },
{ "etcd_db_size", etcd_db_size },
{ "mon_count", mon_count },
{ "mon_master", mon_master },
{ "osd_up", osd_up },
{ "osd_count", osd_count },
{ "total_raw", total_raw },
{ "free_raw", free_raw },
{ "down_raw", down_raw },
{ "free_down_raw", free_down_raw },
{ "clean_data", agg_stats["object_counts"]["clean"].uint64_value() * object_size },
{ "misplaced_data", agg_stats["object_counts"]["misplaced"].uint64_value() * object_size },
{ "degraded_data", agg_stats["object_counts"]["degraded"].uint64_value() * object_size },
{ "incomplete_data", agg_stats["object_counts"]["incomplete"].uint64_value() * object_size },
{ "pool_count", pool_count },
{ "active_pool_count", pools_active },
{ "pg_states", pgs_by_state },
{ "op_stats", agg_stats["op_stats"] },
{ "recovery_stats", agg_stats["recovery_stats"] },
{ "object_counts", agg_stats["object_counts"] },
}).dump().c_str());
state = 100;
return;
}
printf(
" cluster:\n"
" etcd: %d / %ld up, %s database size\n"
" mon: %d up%s\n"
" osd: %d / %d up\n"
" \n"
" data:\n"
" raw: %s used, %s / %s available%s\n"
" state: %s clean%s\n"
" pools: %d / %d active\n"
" pgs: %s\n"
" \n"
" io:\n"
" client:%s %s/s rd, %s op/s rd, %s/s wr, %s op/s wr\n"
"%s",
etcd_alive, etcd_states.size(), format_size(etcd_db_size).c_str(),
mon_count, mon_master == "" ? "" : (", master "+mon_master).c_str(),
osd_up, osd_count,
format_size(total_raw-free_raw).c_str(),
format_size(free_raw-free_down_raw).c_str(),
format_size(total_raw-down_raw).c_str(),
(down_raw > 0 ? (", "+format_size(down_raw)+" down").c_str() : ""),
format_size(agg_stats["object_counts"]["clean"].uint64_value() * object_size).c_str(), more_states.c_str(),
pools_active, pool_count,
pgs_by_state_str.c_str(),
recovery_io.size() > 0 ? " " : "",
format_size(agg_stats["op_stats"]["primary_read"]["bps"].uint64_value()).c_str(),
format_size(agg_stats["op_stats"]["primary_read"]["iops"].uint64_value(), true).c_str(),
format_size(agg_stats["op_stats"]["primary_write"]["bps"].uint64_value()).c_str(),
format_size(agg_stats["op_stats"]["primary_write"]["iops"].uint64_value(), true).c_str(),
recovery_io.c_str()
);
state = 100;
}
};
std::function<bool(cli_result_t &)> cli_tool_t::start_status(json11::Json cfg)
{
json11::Json::array cmd = cfg["command"].array_items();
auto printer = new status_printer_t();
printer->parent = this;
return [printer](cli_result_t & result)
{
printer->loop();
if (printer->is_done())
{
result = { .err = 0 };
delete printer;
return true;
}
return false;
};
}
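Both cli_rm.cpp and cli_status.cpp validate etcd keys with the same sscanf trick: a trailing %c conversion only succeeds if characters remain after the expected number, so a non-zero null_byte flags trailing garbage. The trick in isolation (a standalone sketch; the key layout is taken from the code above):

#include <cstdio>
#include <cstdint>

// Returns the OSD number from a key like "/osd/stats/123",
// or 0 for a bad prefix, a missing number, or trailing garbage.
static uint64_t parse_osd_stats_key(const char *key)
{
    uint64_t osd_num = 0;
    char null_byte = 0;
    // %c matches one extra character only if something follows the number,
    // so null_byte stays 0 exactly when the key ends right after it
    int r = sscanf(key, "/osd/stats/%lu%c", &osd_num, &null_byte);
    return (r < 1 || null_byte != 0) ? 0 : osd_num;
}

int main()
{
    printf("%lu\n", parse_osd_stats_key("/osd/stats/42"));  // 42
    printf("%lu\n", parse_osd_stats_key("/osd/stats/42x")); // 0: trailing junk
    printf("%lu\n", parse_osd_stats_key("/osd/stats/"));    // 0: no number
}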


@@ -143,7 +143,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
}
else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) */
{
for (auto prev = op->prev; prev; prev = prev->prev)
for (auto prev = op_queue_head; prev && prev != op; prev = prev->next)
{
if (prev->opcode == OSD_OP_WRITE && prev->flags & OP_FLUSH_BUFFER)
{
@@ -151,7 +151,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
}
else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ || prev->opcode == OSD_OP_READ_BITMAP)
{
// Flushes are always in the beginning
// Flushes are always in the beginning (we're scanning from the beginning of the queue)
break;
}
}
@@ -172,6 +172,7 @@ void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *n
(next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP) && (flags & OP_FLUSH_BUFFER))
{
next->prev_wait += inc;
assert(next->prev_wait >= 0);
if (!next->prev_wait)
{
if (next->opcode == OSD_OP_SYNC)
@@ -191,6 +192,7 @@ void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *n
if (next->opcode == OSD_OP_SYNC || next->opcode == OSD_OP_WRITE)
{
next->prev_wait += inc;
assert(next->prev_wait >= 0);
if (!next->prev_wait)
{
if (next->opcode == OSD_OP_SYNC)
@@ -534,8 +536,8 @@ void cluster_client_t::copy_write(cluster_op_t *op, std::map<object_id, cluster_
unsigned iov_len = (op->iov.buf[iov_idx].iov_len - iov_pos);
if (iov_len <= cur_len)
{
memcpy(dirty_it->second.buf + pos - dirty_it->first.stripe,
op->iov.buf[iov_idx].iov_base + iov_pos, iov_len);
memcpy((uint8_t*)dirty_it->second.buf + pos - dirty_it->first.stripe,
(uint8_t*)op->iov.buf[iov_idx].iov_base + iov_pos, iov_len);
pos += iov_len;
len -= iov_len;
cur_len -= iov_len;
@@ -544,8 +546,8 @@ void cluster_client_t::copy_write(cluster_op_t *op, std::map<object_id, cluster_
}
else
{
memcpy(dirty_it->second.buf + pos - dirty_it->first.stripe,
op->iov.buf[iov_idx].iov_base + iov_pos, cur_len);
memcpy((uint8_t*)dirty_it->second.buf + pos - dirty_it->first.stripe,
(uint8_t*)op->iov.buf[iov_idx].iov_base + iov_pos, cur_len);
pos += cur_len;
len -= cur_len;
iov_pos += cur_len;
@@ -762,7 +764,7 @@ static void add_iov(int size, bool skip, cluster_op_t *op, int &iov_idx, size_t
{
if (!skip)
{
iov.push_back(op->iov.buf[iov_idx].iov_base + iov_pos, cur_left);
iov.push_back((uint8_t*)op->iov.buf[iov_idx].iov_base + iov_pos, cur_left);
}
left -= cur_left;
iov_pos = 0;
@@ -772,7 +774,7 @@ static void add_iov(int size, bool skip, cluster_op_t *op, int &iov_idx, size_t
{
if (!skip)
{
iov.push_back(op->iov.buf[iov_idx].iov_base + iov_pos, left);
iov.push_back((uint8_t*)op->iov.buf[iov_idx].iov_base + iov_pos, left);
}
iov_pos += left;
left = 0;
@@ -817,7 +819,7 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
// First allocation
memset(op->bitmap_buf, 0, object_bitmap_size);
}
op->part_bitmaps = op->bitmap_buf + object_bitmap_size;
op->part_bitmaps = (uint8_t*)op->bitmap_buf + object_bitmap_size;
op->bitmap_buf_size = bitmap_mem;
}
}
@@ -839,7 +841,7 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
while (cur < end)
{
unsigned bmp_loc = (cur - op->offset)/bs_bitmap_granularity;
bool skip = (((*(uint8_t*)(op->bitmap_buf + bmp_loc/8)) >> (bmp_loc%8)) & 0x1);
bool skip = (((*((uint8_t*)op->bitmap_buf + bmp_loc/8)) >> (bmp_loc%8)) & 0x1);
if (skip_prev != skip)
{
if (cur > prev)
@@ -944,7 +946,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
.meta_revision = meta_rev,
.version = op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE ? op->version : 0,
} },
.bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? op->part_bitmaps + pg_bitmap_size*i : NULL),
.bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? (uint8_t*)op->part_bitmaps + pg_bitmap_size*i : NULL),
.bitmap_len = (unsigned)(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? pg_bitmap_size : 0),
.callback = [this, part](osd_op_t *op_part)
{
@@ -1155,7 +1157,7 @@ void cluster_client_t::copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *par
if (!(object_offset & 0x7) && !(part_offset & 0x7) && (part_len >= 8))
{
// Copy bytes
mem_or(op->bitmap_buf + object_offset/8, part->op.bitmap + part_offset/8, part_len/8);
mem_or((uint8_t*)op->bitmap_buf + object_offset/8, (uint8_t*)part->op.bitmap + part_offset/8, part_len/8);
object_offset += (part_len & ~0x7);
part_offset += (part_len & ~0x7);
part_len = (part_len & 0x7);
@@ -1163,8 +1165,8 @@ void cluster_client_t::copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *par
while (part_len > 0)
{
// Copy bits
(*(uint8_t*)(op->bitmap_buf + (object_offset >> 3))) |= (
(((*(uint8_t*)(part->op.bitmap + (part_offset >> 3))) >> (part_offset & 0x7)) & 0x1) << (object_offset & 0x7)
(*((uint8_t*)op->bitmap_buf + (object_offset >> 3))) |= (
(((*((uint8_t*)part->op.bitmap + (part_offset >> 3))) >> (part_offset & 0x7)) & 0x1) << (object_offset & 0x7)
);
part_offset++;
object_offset++;
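Most of the memcpy/iovec hunks above make the same mechanical change: pointer arithmetic on void* is a GCC extension (treated as if sizeof(void) == 1) and is rejected by stricter compilers, so the offsets are now applied to an explicit uint8_t*. The difference in standalone form:

#include <cstdint>
#include <cstdlib>
#include <cstring>

int main()
{
    void *buf = malloc(16);
    size_t off = 8;
    // memcpy(buf + off, "x", 1);        // GNU extension only: void* arithmetic
    memcpy((uint8_t*)buf + off, "x", 1); // portable: byte-sized element type
    free(buf);
}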


@@ -200,7 +200,8 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
auto & pool_cfg = st_cli.pool_config[cur_list->pg->lst->pool_id];
osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT;
op->peer_fd = msgr.osd_peer_fds[cur_list->osd_num];
// Already checked that it exists above, but anyway
op->peer_fd = msgr.osd_peer_fds.at(cur_list->osd_num);
op->req = (osd_any_op_t){
.sec_list = {
.header = {
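The switch from operator[] to .at() above is more than style for std::map: operator[] silently default-constructs and inserts a missing key (here that would be a zero peer_fd), while .at() throws and makes the broken invariant visible. Standalone illustration:

#include <cstdio>
#include <map>
#include <stdexcept>

int main()
{
    std::map<int, int> fds;
    int fd = fds[5]; // silently inserts {5, 0}; fd == 0
    printf("size after operator[]: %zu, fd=%d\n", fds.size(), fd);
    try {
        fds.at(7); // throws std::out_of_range, map is unchanged
    } catch (const std::out_of_range &) {
        printf("at() threw for the missing key\n");
    }
}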


@@ -75,7 +75,7 @@ int main(int argc, char *argv[])
uint64_t s;
for (s = 0; s < self.journal_block; s += 8)
{
if (*((uint64_t*)(data+s)) != 0)
if (*((uint64_t*)((uint8_t*)data+s)) != 0)
break;
}
if (s == self.journal_block)
@@ -139,7 +139,7 @@ int journal_dump_t::dump_block(void *buf)
bool wrapped = false;
while (pos < journal_block)
{
journal_entry *je = (journal_entry*)(buf + pos);
journal_entry *je = (journal_entry*)((uint8_t*)buf + pos);
if (je->magic != JOURNAL_MAGIC || je->type < JE_MIN || je->type > JE_MAX ||
!all && started && je->crc32_prev != crc32_last)
{


@@ -13,6 +13,7 @@
epoll_manager_t::epoll_manager_t(ring_loop_t *ringloop)
{
this->ringloop = ringloop;
this->pending = false;
epoll_fd = epoll_create(1);
if (epoll_fd < 0)
@@ -22,11 +23,19 @@ epoll_manager_t::epoll_manager_t(ring_loop_t *ringloop)
tfd = new timerfd_manager_t([this](int fd, bool wr, std::function<void(int, int)> handler) { set_fd_handler(fd, wr, handler); });
consumer.loop = [this]()
{
if (pending)
handle_epoll_events();
};
ringloop->register_consumer(&consumer);
handle_epoll_events();
}
epoll_manager_t::~epoll_manager_t()
{
ringloop->unregister_consumer(&consumer);
if (tfd)
{
delete tfd;
@@ -64,8 +73,13 @@ void epoll_manager_t::handle_epoll_events()
io_uring_sqe *sqe = ringloop->get_sqe();
if (!sqe)
{
throw std::runtime_error("can't get SQE, will fall out of sync with EPOLLET");
// Don't handle epoll events until we manage to post the next event handler,
// otherwise we'll fall out of sync with EPOLLET
pending = true;
ringloop->wakeup();
return;
}
pending = false;
ring_data_t *data = ((ring_data_t*)sqe->user_data);
my_uring_prep_poll_add(sqe, epoll_fd, POLLIN);
data->callback = [this](ring_data_t *data)
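In other words, when the io_uring submission queue is momentarily full, the manager no longer throws; it records that an EPOLLET re-arm is still owed and retries from its ring-loop consumer. The defer-and-retry shape, reduced to a standalone sketch (names illustrative, not the real epoll_manager API):

#include <cstdio>

struct deferring_poller_t
{
    bool pending = false;
    int free_slots = 0; // stands in for free io_uring SQEs

    void rearm()
    {
        if (free_slots <= 0)
        {
            // Queue full: with EPOLLET a dropped re-arm means lost events,
            // so remember the debt and retry on a later loop iteration.
            pending = true;
            return;
        }
        free_slots--;
        pending = false;
        printf("poll re-armed\n");
    }
    void on_loop() // ring-loop consumer, runs every iteration
    {
        if (pending)
            rearm();
    }
};

int main()
{
    deferring_poller_t p;
    p.rearm();        // no slots: deferred
    p.on_loop();      // still no slots: stays pending
    p.free_slots = 1; // a completion freed a slot
    p.on_loop();      // re-arm succeeds now
}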


@@ -11,6 +11,8 @@
class epoll_manager_t
{
int epoll_fd;
bool pending;
ring_consumer_t consumer;
ring_loop_t *ringloop;
std::map<int, std::function<void(int, int)>> epoll_handlers;
public:


@@ -5,6 +5,7 @@
#include "pg_states.h"
#include "etcd_state_client.h"
#ifndef __MOCK__
#include "addr_util.h"
#include "http_client.h"
#include "base64.h"
#endif
@@ -25,9 +26,14 @@ etcd_state_client_t::~etcd_state_client_t()
#ifndef __MOCK__
if (etcd_watch_ws)
{
etcd_watch_ws->close();
http_close(etcd_watch_ws);
etcd_watch_ws = NULL;
}
if (keepalive_client)
{
http_close(keepalive_client);
keepalive_client = NULL;
}
#endif
}
@@ -48,12 +54,54 @@ etcd_kv_t etcd_state_client_t::parse_etcd_kv(const json11::Json & kv_json)
return kv;
}
void etcd_state_client_t::etcd_txn(json11::Json txn, int timeout, std::function<void(std::string, json11::Json)> callback)
void etcd_state_client_t::etcd_txn(json11::Json txn, int timeout, int retries, int interval, std::function<void(std::string, json11::Json)> callback)
{
etcd_call("/kv/txn", txn, timeout, callback);
etcd_call("/kv/txn", txn, timeout, retries, interval, callback);
}
void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback)
void etcd_state_client_t::etcd_txn_slow(json11::Json txn, std::function<void(std::string, json11::Json)> callback)
{
etcd_call("/kv/txn", txn, etcd_slow_timeout, max_etcd_attempts, 0, callback);
}
std::vector<std::string> etcd_state_client_t::get_addresses()
{
auto addrs = etcd_local;
addrs.insert(addrs.end(), etcd_addresses.begin(), etcd_addresses.end());
return addrs;
}
void etcd_state_client_t::etcd_call_oneshot(std::string etcd_address, std::string api, json11::Json payload,
int timeout, std::function<void(std::string, json11::Json)> callback)
{
std::string etcd_api_path;
int pos = etcd_address.find('/');
if (pos >= 0)
{
etcd_api_path = etcd_address.substr(pos);
etcd_address = etcd_address.substr(0, pos);
}
std::string req = payload.dump();
req = "POST "+etcd_api_path+api+" HTTP/1.1\r\n"
"Host: "+etcd_address+"\r\n"
"Content-Type: application/json\r\n"
"Content-Length: "+std::to_string(req.size())+"\r\n"
"Connection: close\r\n"
"\r\n"+req;
auto http_cli = http_init(tfd);
auto cb = [this, http_cli, callback](const http_response_t *response)
{
std::string err;
json11::Json data;
response->parse_json_response(err, data);
callback(err, data);
http_close(http_cli);
};
http_request(http_cli, etcd_address, req, { .timeout = timeout }, cb);
}
void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int timeout,
int retries, int interval, std::function<void(std::string, json11::Json)> callback)
{
if (!etcd_addresses.size() && !etcd_local.size())
{
@@ -74,14 +122,49 @@ void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int t
"Host: "+etcd_address+"\r\n"
"Content-Type: application/json\r\n"
"Content-Length: "+std::to_string(req.size())+"\r\n"
"Connection: close\r\n"
"Connection: keep-alive\r\n"
"Keep-Alive: timeout="+std::to_string(etcd_keepalive_timeout)+"\r\n"
"\r\n"+req;
http_request_json(tfd, etcd_address, req, timeout, [this, cur_addr = selected_etcd_address, callback](std::string err, json11::Json data)
auto cb = [this, api, payload, timeout, retries, interval, callback,
cur_addr = selected_etcd_address](const http_response_t *response)
{
if (err != "" && cur_addr == selected_etcd_address)
selected_etcd_address = "";
callback(err, data);
});
std::string err;
json11::Json data;
response->parse_json_response(err, data);
if (err != "")
{
if (cur_addr == selected_etcd_address)
selected_etcd_address = "";
if (retries > 0)
{
if (this->log_level > 0)
{
printf(
"Warning: etcd request failed: %s, retrying %d more times\n",
err.c_str(), retries
);
}
if (interval > 0)
{
tfd->set_timer(interval, false, [this, api, payload, timeout, retries, interval, callback](int)
{
etcd_call(api, payload, timeout, retries-1, interval, callback);
});
}
else
etcd_call(api, payload, timeout, retries-1, interval, callback);
}
else
callback(err, data);
}
else
callback(err, data);
};
if (!keepalive_client)
{
keepalive_client = http_init(tfd);
}
http_request(keepalive_client, etcd_address, req, { .timeout = timeout, .keepalive = true }, cb);
}
void etcd_state_client_t::add_etcd_url(std::string addr)
@@ -155,6 +238,33 @@ void etcd_state_client_t::parse_config(const json11::Json & config)
this->etcd_prefix = "/"+this->etcd_prefix;
}
this->log_level = config["log_level"].int64_value();
this->etcd_keepalive_timeout = config["etcd_keepalive_timeout"].uint64_value();
if (this->etcd_keepalive_timeout <= 0)
{
this->etcd_keepalive_timeout = config["etcd_report_interval"].uint64_value() * 2;
if (this->etcd_keepalive_timeout < 30)
this->etcd_keepalive_timeout = 30;
}
this->etcd_ws_keepalive_interval = config["etcd_ws_keepalive_interval"].uint64_value();
if (this->etcd_ws_keepalive_interval <= 0)
{
this->etcd_ws_keepalive_interval = 30;
}
this->max_etcd_attempts = config["max_etcd_attempts"].uint64_value();
if (this->max_etcd_attempts <= 0)
{
this->max_etcd_attempts = 5;
}
this->etcd_slow_timeout = config["etcd_slow_timeout"].uint64_value();
if (this->etcd_slow_timeout <= 0)
{
this->etcd_slow_timeout = 5000;
}
this->etcd_quick_timeout = config["etcd_quick_timeout"].uint64_value();
if (this->etcd_quick_timeout <= 0)
{
this->etcd_quick_timeout = 1000;
}
}
void etcd_state_client_t::pick_next_etcd()
@@ -169,9 +279,16 @@ void etcd_state_client_t::pick_next_etcd()
std::vector<int> ns;
for (int i = 0; i < etcd_addresses.size(); i++)
ns.push_back(i);
if (!rand_initialized)
{
timespec tv;
clock_gettime(CLOCK_REALTIME, &tv);
srand48(tv.tv_sec*1000000000 + tv.tv_nsec);
rand_initialized = true;
}
while (ns.size())
{
int i = rand() % ns.size();
int i = lrand48() % ns.size();
addresses_to_try.push_back(etcd_addresses[ns[i]]);
ns.erase(ns.begin()+i, ns.begin()+i+1);
}
@@ -200,10 +317,12 @@ void etcd_state_client_t::start_etcd_watcher()
ws_alive = 1;
if (etcd_watch_ws)
{
etcd_watch_ws->close();
http_close(etcd_watch_ws);
etcd_watch_ws = NULL;
}
etcd_watch_ws = open_websocket(tfd, etcd_address, etcd_api_path+"/watch", ETCD_SLOW_TIMEOUT,
if (this->log_level > 1)
printf("Trying to connect to etcd websocket at %s\n", etcd_address.c_str());
etcd_watch_ws = open_websocket(tfd, etcd_address, etcd_api_path+"/watch", etcd_slow_timeout,
[this, cur_addr = selected_etcd_address](const http_response_t *msg)
{
if (msg->body.length())
@@ -219,6 +338,8 @@ void etcd_state_client_t::start_etcd_watcher()
{
if (data["result"]["created"].bool_value())
{
if (etcd_watches_initialised == 3 && this->log_level > 0)
fprintf(stderr, "Successfully subscribed to etcd at %s\n", selected_etcd_address.c_str());
etcd_watches_initialised++;
}
if (data["result"]["canceled"].bool_value())
@@ -232,8 +353,11 @@ void etcd_state_client_t::start_etcd_watcher()
{
fprintf(stderr, "Revisions before %lu were compacted by etcd, reloading state\n",
data["result"]["compact_revision"].uint64_value());
etcd_watch_ws->close();
etcd_watch_ws = NULL;
if (etcd_watch_ws)
{
http_close(etcd_watch_ws);
etcd_watch_ws = NULL;
}
etcd_watch_revision = 0;
on_reload_hook();
}
@@ -284,13 +408,20 @@ void etcd_state_client_t::start_etcd_watcher()
{
if (cur_addr == selected_etcd_address)
{
fprintf(stderr, "Disconnected from etcd %s\n", selected_etcd_address.c_str());
selected_etcd_address = "";
}
etcd_watch_ws = NULL;
else
fprintf(stderr, "Disconnected from etcd\n");
if (etcd_watch_ws)
{
http_close(etcd_watch_ws);
etcd_watch_ws = NULL;
}
if (etcd_watches_initialised == 0)
{
// Connection not established, retry in <ETCD_QUICK_TIMEOUT>
tfd->set_timer(ETCD_QUICK_TIMEOUT, false, [this](int)
// Connection not established, retry in <etcd_quick_timeout>
tfd->set_timer(etcd_quick_timeout, false, [this](int)
{
start_etcd_watcher();
});
@@ -302,7 +433,7 @@ void etcd_state_client_t::start_etcd_watcher()
}
}
});
etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
http_post_message(etcd_watch_ws, WS_TEXT, json11::Json(json11::Json::object {
{ "create_request", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/config/") },
{ "range_end", base64_encode(etcd_prefix+"/config0") },
@@ -311,7 +442,7 @@ void etcd_state_client_t::start_etcd_watcher()
{ "progress_notify", true },
} }
}).dump());
etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
http_post_message(etcd_watch_ws, WS_TEXT, json11::Json(json11::Json::object {
{ "create_request", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/osd/state/") },
{ "range_end", base64_encode(etcd_prefix+"/osd/state0") },
@@ -320,7 +451,7 @@ void etcd_state_client_t::start_etcd_watcher()
{ "progress_notify", true },
} }
}).dump());
etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
http_post_message(etcd_watch_ws, WS_TEXT, json11::Json(json11::Json::object {
{ "create_request", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/pg/state/") },
{ "range_end", base64_encode(etcd_prefix+"/pg/state0") },
@@ -329,7 +460,7 @@ void etcd_state_client_t::start_etcd_watcher()
{ "progress_notify", true },
} }
}).dump());
etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
http_post_message(etcd_watch_ws, WS_TEXT, json11::Json(json11::Json::object {
{ "create_request", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/pg/history/") },
{ "range_end", base64_encode(etcd_prefix+"/pg/history0") },
@@ -340,7 +471,7 @@ void etcd_state_client_t::start_etcd_watcher()
}).dump());
if (ws_keepalive_timer < 0)
{
ws_keepalive_timer = tfd->set_timer(ETCD_KEEPALIVE_TIMEOUT, true, [this](int)
ws_keepalive_timer = tfd->set_timer(etcd_ws_keepalive_interval*1000, true, [this](int)
{
if (!etcd_watch_ws)
{
@@ -348,14 +479,21 @@ void etcd_state_client_t::start_etcd_watcher()
}
else if (!ws_alive)
{
etcd_watch_ws->close();
etcd_watch_ws = NULL;
if (this->log_level > 0)
{
fprintf(stderr, "Websocket ping failed, disconnecting from etcd %s\n", selected_etcd_address.c_str());
}
if (etcd_watch_ws)
{
http_close(etcd_watch_ws);
etcd_watch_ws = NULL;
}
start_etcd_watcher();
}
else
{
ws_alive = 0;
etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
http_post_message(etcd_watch_ws, WS_TEXT, json11::Json(json11::Json::object {
{ "progress_request", json11::Json::object { } }
}).dump());
}
@@ -367,12 +505,12 @@ void etcd_state_client_t::load_global_config()
{
etcd_call("/kv/range", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/config/global") }
}, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json data)
}, etcd_slow_timeout, max_etcd_attempts, 0, [this](std::string err, json11::Json data)
{
if (err != "")
{
fprintf(stderr, "Error reading OSD configuration from etcd: %s\n", err.c_str());
tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
tfd->set_timer(etcd_slow_timeout, false, [this](int timer_id)
{
load_global_config();
});
@@ -440,12 +578,13 @@ void etcd_state_client_t::load_pgs()
{
req["compare"] = checks;
}
etcd_txn(req, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json data)
etcd_txn_slow(req, [this](std::string err, json11::Json data)
{
if (err != "")
{
// Retry indefinitely
fprintf(stderr, "Error loading PGs from etcd: %s\n", err.c_str());
tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
tfd->set_timer(etcd_slow_timeout, false, [this](int timer_id)
{
load_pgs();
});
@@ -815,6 +954,10 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
}
if (!value.is_object())
{
if (on_inode_change_hook != NULL)
{
on_inode_change_hook(inode_num, true);
}
this->inode_config.erase(inode_num);
}
else
@@ -829,7 +972,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
{
fprintf(
stderr, "Inode %lu/%lu parent_pool value is invalid, ignoring parent setting\n",
inode_num >> (64-POOL_ID_BITS), inode_num & ((1l << (64-POOL_ID_BITS)) - 1)
inode_num >> (64-POOL_ID_BITS), inode_num & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)
);
parent_inode_num = 0;
}
@@ -856,6 +999,10 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
}
}
}
if (on_inode_change_hook != NULL)
{
on_inode_change_hook(inode_num, false);
}
}
}
}
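The reworked etcd_call() also retries on its own, counting attempts down and optionally spacing them with a timer (tfd->set_timer in the real code). The recursion-with-decrementing-retries shape as a standalone sketch (the fake transport is an assumption for illustration):

#include <cstdio>
#include <functional>

// Simulated async HTTP call that fails the first two times.
static void fake_http_call(std::function<void(bool ok)> cb)
{
    static int calls = 0;
    cb(++calls >= 3);
}

static void call_with_retries(int retries, std::function<void(bool ok)> done)
{
    fake_http_call([retries, done](bool ok)
    {
        if (!ok && retries > 0)
        {
            printf("request failed, retrying %d more times\n", retries);
            // the real client may delay here via a timerfd before recursing
            call_with_retries(retries-1, done);
            return;
        }
        done(ok);
    });
}

int main()
{
    call_with_retries(5, [](bool ok)
    {
        printf("final result: %s\n", ok ? "ok" : "failed");
    });
}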


@@ -12,11 +12,6 @@
#define ETCD_PG_HISTORY_WATCH_ID 3
#define ETCD_OSD_STATE_WATCH_ID 4
#define MAX_ETCD_ATTEMPTS 5
#define ETCD_SLOW_TIMEOUT 5000
#define ETCD_QUICK_TIMEOUT 1000
#define ETCD_KEEPALIVE_TIMEOUT 30000
#define DEFAULT_BLOCK_SIZE 128*1024
struct etcd_kv_t
@@ -71,7 +66,7 @@ struct inode_watch_t
inode_config_t cfg;
};
struct websocket_t;
struct http_co_t;
struct etcd_state_client_t
{
@@ -82,13 +77,20 @@ protected:
std::string selected_etcd_address;
std::vector<std::string> addresses_to_try;
std::vector<inode_watch_t*> watches;
websocket_t *etcd_watch_ws = NULL;
http_co_t *etcd_watch_ws = NULL, *keepalive_client = NULL;
int ws_keepalive_timer = -1;
int ws_alive = 0;
bool rand_initialized = false;
uint64_t bs_block_size = DEFAULT_BLOCK_SIZE;
void add_etcd_url(std::string);
void pick_next_etcd();
public:
int etcd_keepalive_timeout = 30;
int etcd_ws_keepalive_interval = 30;
int max_etcd_attempts = 5;
int etcd_quick_timeout = 1000;
int etcd_slow_timeout = 5000;
std::string etcd_prefix;
int log_level = 0;
timerfd_manager_t *tfd = NULL;
@@ -107,11 +109,15 @@ public:
std::function<void(pool_id_t, pg_num_t)> on_change_pg_history_hook;
std::function<void(osd_num_t)> on_change_osd_state_hook;
std::function<void()> on_reload_hook;
std::function<void(inode_t, bool)> on_inode_change_hook;
json11::Json::object serialize_inode_cfg(inode_config_t *cfg);
etcd_kv_t parse_etcd_kv(const json11::Json & kv_json);
void etcd_call(std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback);
void etcd_txn(json11::Json txn, int timeout, std::function<void(std::string, json11::Json)> callback);
std::vector<std::string> get_addresses();
void etcd_call_oneshot(std::string etcd_address, std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback);
void etcd_call(std::string api, json11::Json payload, int timeout, int retries, int interval, std::function<void(std::string, json11::Json)> callback);
void etcd_txn(json11::Json txn, int timeout, int retries, int interval, std::function<void(std::string, json11::Json)> callback);
void etcd_txn_slow(json11::Json txn, std::function<void(std::string, json11::Json)> callback);
void start_etcd_watcher();
void load_global_config();
void load_pgs();


@@ -214,14 +214,14 @@ static int sec_setup(struct thread_data *td)
if (!o->image)
{
if (!(o->inode & ((1l << (64-POOL_ID_BITS)) - 1)))
if (!(o->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)))
{
td_verror(td, EINVAL, "inode number is missing");
return 1;
}
if (o->pool)
{
o->inode = (o->inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (o->pool << (64-POOL_ID_BITS));
o->inode = (o->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (o->pool << (64-POOL_ID_BITS));
}
if (!(o->inode >> (64-POOL_ID_BITS)))
{
@@ -247,6 +247,12 @@ static int sec_setup(struct thread_data *td)
vitastor_c_uring_wait_events(bsd->cli);
}
td->files[0]->real_file_size = vitastor_c_inode_get_size(bsd->watch);
if (!vitastor_c_inode_get_num(bsd->watch) ||
!td->files[0]->real_file_size)
{
td_verror(td, EINVAL, "image does not exist");
return 1;
}
}
bsd->trace = o->trace ? true : false;
@@ -345,9 +351,9 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
}
else
{
printf("+++ %s 0x%lx 0x%llx+%llx\n",
printf("+++ %s 0x%lx 0x%llx+%lx\n",
io->ddir == DDIR_READ ? "READ" : "WRITE",
(uint64_t)io, io->offset, io->xfer_buflen);
(uint64_t)io, io->offset, (uint64_t)io->xfer_buflen);
}
}
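The (uint64_t)1 hunks here and in etcd_state_client.cpp are the 32-bit build fix: on an ILP32 target `long` is 32 bits wide, so `1l << (64-POOL_ID_BITS)` shifts past the operand width (undefined behaviour, typically yielding a zero mask), whereas a uint64_t operand makes the whole expression 64-bit everywhere. A standalone check (assuming POOL_ID_BITS = 16 as in Vitastor):

#include <cinttypes>
#include <cstdint>
#include <cstdio>

#define POOL_ID_BITS 16

int main()
{
    // 1l << 48 is undefined where long is 32-bit; (uint64_t)1 never is.
    uint64_t inode_mask = ((uint64_t)1 << (64-POOL_ID_BITS)) - 1;
    printf("%016" PRIx64 "\n", inode_mask); // 0000ffffffffffff
}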


@@ -26,9 +26,8 @@
#include "blockstore.h"
#include "epoll_manager.h"
#include "fio_headers.h"
#include "json11/json11.hpp"
#include "fio_headers.h"
struct bs_data
{
@@ -150,7 +149,6 @@ static int bs_init(struct thread_data *td)
static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
{
bs_data *bsd = (bs_data*)td->io_ops_data;
int n = bsd->op_n;
if (io->ddir == DDIR_SYNC && bsd->last_sync)
{
return FIO_Q_COMPLETED;
@@ -178,7 +176,7 @@ static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
op->version = UINT64_MAX; // last unstable
op->offset = io->offset % bsd->bs->get_block_size();
op->len = io->xfer_buflen;
op->callback = [io, n](blockstore_op_t *op)
op->callback = [io](blockstore_op_t *op)
{
io->error = op->retval < 0 ? -op->retval : 0;
bs_data *bsd = (bs_data*)io->engine_data;
@@ -200,7 +198,7 @@ static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
op->version = 0; // assign automatically
op->offset = io->offset % bsd->bs->get_block_size();
op->len = io->xfer_buflen;
op->callback = [io, n](blockstore_op_t *op)
op->callback = [io](blockstore_op_t *op)
{
io->error = op->retval < 0 ? -op->retval : 0;
bs_data *bsd = (bs_data*)io->engine_data;
@@ -215,7 +213,7 @@ static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
break;
case DDIR_SYNC:
op->opcode = BS_OP_SYNC_STAB_ALL;
op->callback = [io, n](blockstore_op_t *op)
op->callback = [io](blockstore_op_t *op)
{
bs_data *bsd = (bs_data*)io->engine_data;
io->error = op->retval < 0 ? -op->retval : 0;
@@ -230,6 +228,7 @@ static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
break;
default:
io->error = EINVAL;
delete op;
return FIO_Q_COMPLETED;
}
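The added `delete op` in the default branch closes a leak: the EINVAL path returned before the op reached any code that would free it. A common way to make such early-exit paths leak-proof (a sketch, not the fio engine itself):

#include <cstdio>
#include <memory>
#include <vector>

struct op_t { int opcode = 0; };
static std::vector<op_t*> queue; // owns ops after successful submission

static bool submit(int opcode)
{
    auto op = std::make_unique<op_t>();
    op->opcode = opcode;
    if (opcode < 0)
        return false;              // unique_ptr frees op automatically here
    queue.push_back(op.release()); // ownership handed to the queue
    return true;
}

int main()
{
    submit(-1); // invalid: no leak on the early return
    submit(1);
    printf("queued: %zu\n", queue.size());
    for (auto *op : queue)
        delete op;
}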


@@ -1,4 +1,3 @@
extern "C" {
// Kill atomics in fio headers
#define _STDATOMIC_H
#include "fio/arch/arch.h"
@@ -11,6 +10,7 @@ extern "C" {
#define CONFIG_HAVE_GETTID
#define CONFIG_SYNC_FILE_RANGE
#define CONFIG_PWRITEV2
extern "C" {
#include "fio/fio.h"
#include "fio/optgroup.h"
}


@@ -29,23 +29,22 @@
#include <unordered_map>
#include "addr_util.h"
#include "epoll_manager.h"
#include "ringloop.h"
#include "messenger.h"
#include "rw_blocking.h"
#include "osd_ops.h"
#include "fio_headers.h"
struct op_buf_t
{
osd_any_op_t buf;
io_u* fio_op;
};
struct sec_data
{
epoll_manager_t *epmgr;
ring_loop_t *ringloop;
osd_messenger_t *msgr;
ring_consumer_t looper;
void *bitmap_buf;
int connect_fd;
uint64_t op_id;
/* block_size = 1 << block_order (128KB by default) */
uint64_t block_order = 17, block_size = 1 << 17;
std::unordered_map<uint64_t, op_buf_t*> queue;
bool last_sync = false;
/* The list of completed io_u structs. */
std::vector<io_u*> completed;
@@ -60,6 +59,7 @@ struct sec_options
int single_primary = 0;
int trace = 0;
int block_order = 17;
int zerocopy_send = 0;
};
static struct fio_option options[] = {
@@ -110,6 +110,16 @@ static struct fio_option options[] = {
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{
.name = "zerocopy_send",
.lname = "Use zero-copy send",
.type = FIO_OPT_BOOL,
.off1 = offsetof(struct sec_options, zerocopy_send),
.help = "Use zero-copy send (MSG_ZEROCOPY)",
.def = "0",
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{
.name = NULL,
},
@@ -118,6 +128,9 @@ static struct fio_option options[] = {
static int sec_setup(struct thread_data *td)
{
sec_data *bsd;
//fio_file *f;
//int r;
//int64_t size;
bsd = new sec_data;
if (!bsd)
@@ -134,6 +147,8 @@ static int sec_setup(struct thread_data *td)
td->o.open_files++;
}
//f = td->files[0];
//f->real_file_size = size;
return 0;
}
@@ -142,10 +157,6 @@ static void sec_cleanup(struct thread_data *td)
sec_data *bsd = (sec_data*)td->io_ops_data;
if (bsd)
{
delete bsd->msgr;
delete bsd->epmgr;
delete bsd->ringloop;
free(bsd->bitmap_buf);
close(bsd->connect_fd);
delete bsd;
}
@@ -159,14 +170,14 @@ static int sec_init(struct thread_data *td)
bsd->block_order = o->block_order == 0 ? 17 : o->block_order;
bsd->block_size = 1 << o->block_order;
sockaddr addr;
sockaddr_storage addr;
if (!string_to_addr(std::string(o->host ? o->host : "127.0.0.1"), false, o->port > 0 ? o->port : 11203, &addr))
{
fprintf(stderr, "server address: %s is not valid\n", o->host ? o->host : "127.0.0.1");
return 1;
}
bsd->connect_fd = socket(addr.sa_family, SOCK_STREAM, 0);
bsd->connect_fd = socket(addr.ss_family, SOCK_STREAM, 0);
if (bsd->connect_fd < 0)
{
perror("socket");
@@ -179,45 +190,19 @@ static int sec_init(struct thread_data *td)
}
int one = 1;
setsockopt(bsd->connect_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
fcntl(bsd->connect_fd, F_SETFL, fcntl(bsd->connect_fd, F_GETFL, 0) | O_NONBLOCK);
json11::Json cfg = json11::Json::object{ { "use_rdma", 0 } };
bsd->bitmap_buf = malloc(4096);
bsd->ringloop = new ring_loop_t(512);
bsd->epmgr = new epoll_manager_t(bsd->ringloop);
bsd->msgr = new osd_messenger_t();
bsd->msgr->tfd = bsd->epmgr->tfd;
bsd->msgr->ringloop = bsd->ringloop;
bsd->msgr->repeer_pgs = [](osd_num_t){};
bsd->msgr->parse_config(cfg);
bsd->msgr->init();
bsd->looper.loop = [bsd]()
if (o->zerocopy_send)
{
bsd->msgr->read_requests();
bsd->msgr->send_replies();
bsd->ringloop->submit();
};
bsd->ringloop->register_consumer(&bsd->looper);
int peer_fd = bsd->connect_fd;
bsd->msgr->clients[peer_fd] = new osd_client_t();
bsd->msgr->clients[peer_fd]->peer_addr = addr;
bsd->msgr->clients[peer_fd]->peer_port = ntohs(((sockaddr_in*)&addr)->sin_port);
bsd->msgr->clients[peer_fd]->peer_fd = peer_fd;
bsd->msgr->clients[peer_fd]->peer_state = PEER_CONNECTED;
bsd->msgr->clients[peer_fd]->connect_timeout_id = -1;
bsd->msgr->clients[peer_fd]->osd_num = 1;
bsd->msgr->clients[peer_fd]->in_buf = malloc_or_die(bsd->msgr->receive_buffer_size);
bsd->epmgr->tfd->set_fd_handler(peer_fd, true, [msgr = bsd->msgr](int peer_fd, int epoll_events)
{
// Either OUT (connected) or HUP
msgr->handle_peer_epoll(peer_fd, epoll_events);
});
bsd->msgr->osd_peer_fds[1] = peer_fd;
#ifndef SO_ZEROCOPY
perror("zerocopy send not supported on your system (socket.h misses SO_ZEROCOPY)");
return 1;
#else
if (setsockopt(bsd->connect_fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)) < 0)
{
perror("setsockopt zerocopy");
return 1;
}
#endif
}
// FIXME: read config (block size) from OSD
@@ -238,12 +223,9 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
}
io->engine_data = bsd;
osd_op_t *oo = new osd_op_t();
oo->op_type = OSD_OP_OUT;
oo->peer_fd = bsd->connect_fd;
osd_any_op_t & op = oo->req;
op_buf_t *op_buf = new op_buf_t;
op_buf->fio_op = io;
osd_any_op_t &op = op_buf->buf;
op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
op.hdr.id = n;
@@ -260,9 +242,6 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
op.sec_rw.version = UINT64_MAX; // last unstable
op.sec_rw.offset = io->offset % bsd->block_size;
op.sec_rw.len = io->xfer_buflen;
op.sec_rw.attr_len = 4;
oo->bitmap = bsd->bitmap_buf;
oo->bitmap_len = 4;
}
else
{
@@ -271,7 +250,6 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
op.rw.offset = io->offset;
op.rw.len = io->xfer_buflen;
}
oo->iov.push_back(io->xfer_buf, io->xfer_buflen);
bsd->last_sync = false;
break;
case DDIR_WRITE:
@@ -293,7 +271,6 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
op.rw.offset = io->offset;
op.rw.len = io->xfer_buflen;
}
oo->iov.push_back(io->xfer_buf, io->xfer_buflen);
bsd->last_sync = false;
break;
case DDIR_SYNC:
@@ -312,24 +289,10 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
break;
default:
io->error = EINVAL;
delete op_buf;
return FIO_Q_COMPLETED;
}
oo->callback = [td, io](osd_op_t *oo)
{
sec_options *opt = (sec_options*)td->eo;
sec_data *bsd = (sec_data*)td->io_ops_data;
if (opt->trace)
{
printf("--- %s # %ld %ld\n", io->ddir == DDIR_READ ? "READ" :
(io->ddir == DDIR_WRITE ? "WRITE" : "SYNC"), oo->reply.hdr.id, oo->reply.hdr.retval);
}
io->error = oo->reply.hdr.retval < 0 ? -oo->reply.hdr.retval : 0;
bsd->completed.push_back(io);
delete oo;
};
bsd->msgr->outbox_push(oo);
if (opt->trace)
{
printf("+++ %s # %d\n", io->ddir == DDIR_READ ? "READ" :
@@ -339,6 +302,26 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
io->error = 0;
bsd->inflight++;
bsd->op_n++;
bsd->queue[n] = op_buf;
iovec iov[2] = { { .iov_base = op.buf, .iov_len = OSD_PACKET_SIZE } };
int iovcnt = 1, wtotal = OSD_PACKET_SIZE;
if (io->ddir == DDIR_WRITE)
{
iov[iovcnt++] = { .iov_base = io->xfer_buf, .iov_len = io->xfer_buflen };
wtotal += io->xfer_buflen;
}
if (sendv_blocking(bsd->connect_fd, iov, iovcnt,
#ifdef SO_ZEROCOPY
opt->zerocopy_send ? MSG_ZEROCOPY : 0
#else
0
#endif
) != wtotal)
{
perror("sendmsg");
exit(1);
}
if (io->error != 0)
return FIO_Q_COMPLETED;
@@ -347,13 +330,74 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int max, const struct timespec *t)
{
sec_options *opt = (sec_options*)td->eo;
sec_data *bsd = (sec_data*)td->io_ops_data;
// FIXME timeout, at least poll. Now it's the stupidest implementation possible
osd_any_reply_t reply;
while (bsd->completed.size() < min)
{
bsd->ringloop->loop();
if (bsd->completed.size() >= min)
break;
bsd->ringloop->wait();
read_blocking(bsd->connect_fd, reply.buf, OSD_PACKET_SIZE);
if (reply.hdr.magic != SECONDARY_OSD_REPLY_MAGIC)
{
fprintf(stderr, "bad reply: magic = %lx instead of %lx\n", reply.hdr.magic, SECONDARY_OSD_REPLY_MAGIC);
exit(1);
}
auto it = bsd->queue.find(reply.hdr.id);
if (it == bsd->queue.end())
{
fprintf(stderr, "bad reply: op id %lx missing in local queue\n", reply.hdr.id);
exit(1);
}
io_u* io = it->second->fio_op;
delete it->second;
bsd->queue.erase(it);
if (io->ddir == DDIR_READ)
{
if (reply.hdr.retval != io->xfer_buflen)
{
fprintf(stderr, "Short read: retval = %ld instead of %lu\n", reply.hdr.retval, (uint64_t)io->xfer_buflen);
exit(1);
}
// Support bitmap
uint64_t bitmap = 0;
int iovcnt = 0;
iovec iov[2];
if (reply.sec_rw.attr_len > 0)
{
if (reply.sec_rw.attr_len <= 8)
iov[iovcnt++] = { .iov_base = &bitmap, .iov_len = reply.sec_rw.attr_len };
else
iov[iovcnt++] = { .iov_base = (void*)(bitmap = (uint64_t)malloc(reply.sec_rw.attr_len)), .iov_len = reply.sec_rw.attr_len };
}
iov[iovcnt++] = { .iov_base = io->xfer_buf, .iov_len = io->xfer_buflen };
readv_blocking(bsd->connect_fd, iov, iovcnt);
if (reply.sec_rw.attr_len > 8)
{
free((void*)bitmap);
}
}
else if (io->ddir == DDIR_WRITE)
{
if (reply.hdr.retval != io->xfer_buflen)
{
fprintf(stderr, "Short write: retval = %ld instead of %lu\n", reply.hdr.retval, (uint64_t)io->xfer_buflen);
exit(1);
}
}
else if (io->ddir == DDIR_SYNC)
{
if (reply.hdr.retval != 0)
{
fprintf(stderr, "Sync failed: retval = %ld\n", reply.hdr.retval);
exit(1);
}
}
if (opt->trace)
{
printf("--- %s # %ld\n", io->ddir == DDIR_READ ? "READ" :
(io->ddir == DDIR_WRITE ? "WRITE" : "SYNC"), reply.hdr.id);
}
bsd->completed.push_back(io);
}
return bsd->completed.size();
}
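The zerocopy_send option above is guarded by #ifdef SO_ZEROCOPY because the constant only exists in reasonably recent kernel headers (MSG_ZEROCOPY landed around Linux 4.14). A minimal compile-time probe of the same kind:

#include <sys/socket.h>
#include <cstdio>

int main()
{
#ifdef SO_ZEROCOPY
    printf("SO_ZEROCOPY available: zerocopy_send can be enabled\n");
#else
    printf("socket.h lacks SO_ZEROCOPY: zerocopy_send unsupported\n");
#endif
}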


@@ -4,9 +4,7 @@
#include <netinet/tcp.h>
#include <sys/epoll.h>
#include <net/if.h>
#include <arpa/inet.h>
#include <ifaddrs.h>
#include <ctype.h>
#include <unistd.h>
@@ -25,11 +23,12 @@
static std::string trim(const std::string & in);
static std::string ws_format_frame(int type, uint64_t size);
static bool ws_parse_frame(std::string & buf, int & type, std::string & res);
static void parse_http_headers(std::string & res, http_response_t *parsed);
// FIXME: Use keepalive
struct http_co_t
{
timerfd_manager_t *tfd;
std::function<void(const http_response_t*)> response_callback;
int request_timeout = 0;
std::string host;
@@ -37,11 +36,12 @@ struct http_co_t
std::string ws_outbox;
std::string response;
bool want_streaming;
bool keepalive;
http_response_t parsed;
uint64_t target_response_size = 0;
std::vector<std::function<void()>> keepalive_queue;
int state = 0;
std::string connected_host;
int peer_fd = -1;
int timeout_id = -1;
int epoll_events = 0;
@@ -49,10 +49,8 @@ struct http_co_t
std::vector<char> rbuf;
iovec read_iov, send_iov;
msghdr read_msg = { 0 }, send_msg = { 0 };
std::function<void(const http_response_t*)> callback;
websocket_t ws;
http_response_t parsed;
uint64_t target_response_size = 0;
int onstack = 0;
bool ended = false;
@@ -61,66 +59,41 @@ struct http_co_t
inline void stackin() { onstack++; }
inline void stackout() { onstack--; if (!onstack && ended) end(); }
inline void end() { ended = true; if (!onstack) { delete this; } }
void run_cb_and_clear();
void start_connection();
void close_connection();
void next_request();
void handle_events();
void handle_connect_result();
void submit_read();
void submit_read(bool check_timeout);
void submit_send();
bool handle_read();
void post_message(int type, const std::string & msg);
void send_request(const std::string & host, const std::string & request,
const http_options_t & options, std::function<void(const http_response_t *response)> response_callback);
};
#define HTTP_CO_CLOSED 0
#define HTTP_CO_CONNECTING 1
#define HTTP_CO_SENDING_REQUEST 2
#define HTTP_CO_REQUEST_SENT 3
#define HTTP_CO_HEADERS_RECEIVED 4
#define HTTP_CO_WEBSOCKET 5
#define HTTP_CO_CHUNKED 6
#define HTTP_CO_KEEPALIVE 7
#define DEFAULT_TIMEOUT 5000
void http_request(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
const http_options_t & options, std::function<void(const http_response_t *response)> callback)
http_co_t *http_init(timerfd_manager_t *tfd)
{
http_co_t *handler = new http_co_t();
handler->request_timeout = options.timeout < 0 ? 0 : (options.timeout == 0 ? DEFAULT_TIMEOUT : options.timeout);
handler->want_streaming = options.want_streaming;
handler->tfd = tfd;
handler->host = host;
handler->request = request;
handler->callback = callback;
handler->ws.co = handler;
handler->start_connection();
handler->state = HTTP_CO_CLOSED;
return handler;
}
void http_request_json(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
int timeout, std::function<void(std::string, json11::Json r)> callback)
{
http_request(tfd, host, request, { .timeout = timeout }, [callback](const http_response_t* res)
{
if (res->error_code != 0)
{
callback("Error code: "+std::to_string(res->error_code)+" ("+std::string(strerror(res->error_code))+")", json11::Json());
return;
}
if (res->status_code != 200)
{
callback("HTTP "+std::to_string(res->status_code)+" "+res->status_line+" body: "+trim(res->body), json11::Json());
return;
}
std::string json_err;
json11::Json data = json11::Json::parse(res->body, json_err);
if (json_err != "")
{
callback("Bad JSON: "+json_err+" (response: "+trim(res->body)+")", json11::Json());
return;
}
callback(std::string(), data);
});
}
websocket_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, const std::string & path,
int timeout, std::function<void(const http_response_t *msg)> callback)
http_co_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, const std::string & path,
int timeout, std::function<void(const http_response_t *msg)> response_callback)
{
std::string request = "GET "+path+" HTTP/1.1\r\n"
"Host: "+host+"\r\n"
@@ -130,28 +103,166 @@ websocket_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, co
"Sec-WebSocket-Version: 13\r\n"
"\r\n";
http_co_t *handler = new http_co_t();
handler->tfd = tfd;
handler->state = HTTP_CO_CLOSED;
handler->host = host;
handler->request_timeout = timeout < 0 ? -1 : (timeout == 0 ? DEFAULT_TIMEOUT : timeout);
handler->want_streaming = false;
handler->tfd = tfd;
handler->host = host;
handler->keepalive = false;
handler->request = request;
handler->callback = callback;
handler->ws.co = handler;
handler->response_callback = response_callback;
handler->start_connection();
return &handler->ws;
return handler;
}
void websocket_t::post_message(int type, const std::string & msg)
void http_request(http_co_t *handler, const std::string & host, const std::string & request,
const http_options_t & options, std::function<void(const http_response_t *response)> response_callback)
{
co->post_message(type, msg);
handler->send_request(host, request, options, response_callback);
}
void websocket_t::close()
void http_co_t::run_cb_and_clear()
{
co->end();
parsed.eof = true;
std::function<void(const http_response_t*)> cb;
cb.swap(response_callback);
// Call callback after clearing it because otherwise we may hit reenterability problems
if (cb != NULL)
cb(&parsed);
next_request();
}
void http_co_t::send_request(const std::string & host, const std::string & request,
const http_options_t & options, std::function<void(const http_response_t *response)> response_callback)
{
stackin();
if (state == HTTP_CO_WEBSOCKET)
{
stackout();
throw std::runtime_error("Attempt to send HTTP request into a websocket or chunked stream");
}
else if (state != HTTP_CO_KEEPALIVE && state != HTTP_CO_CLOSED)
{
keepalive_queue.push_back([this, host, request, options, response_callback]()
{
this->send_request(host, request, options, response_callback);
});
stackout();
return;
}
if (state == HTTP_CO_KEEPALIVE && connected_host != host)
{
close_connection();
}
this->request_timeout = options.timeout < 0 ? 0 : (options.timeout == 0 ? DEFAULT_TIMEOUT : options.timeout);
this->want_streaming = options.want_streaming;
this->keepalive = options.keepalive;
this->host = host;
this->request = request;
this->response = "";
this->sent = 0;
this->response_callback = response_callback;
this->parsed = {};
if (state == HTTP_CO_KEEPALIVE)
{
state = HTTP_CO_SENDING_REQUEST;
submit_send();
}
else
{
start_connection();
}
// Do it _after_ state assignment because set_timer() can actually trigger
// other timers and requests (reenterability is our friend)
if (request_timeout > 0)
{
timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
{
stackin();
if (state == HTTP_CO_REQUEST_SENT)
{
// Under high CPU load we may fail to handle etcd responses in time,
// so first check the socket and only then terminate the request with a timeout
submit_read(true);
}
else
{
close_connection();
parsed = { .error = "HTTP request timed out" };
run_cb_and_clear();
}
stackout();
});
}
stackout();
}
void http_post_message(http_co_t *handler, int type, const std::string & msg)
{
handler->post_message(type, msg);
}
void http_co_t::post_message(int type, const std::string & msg)
{
stackin();
if (state == HTTP_CO_WEBSOCKET)
{
request += ws_format_frame(type, msg.size());
request += msg;
submit_send();
}
else if (state == HTTP_CO_KEEPALIVE || state == HTTP_CO_CHUNKED)
{
throw std::runtime_error("Attempt to send websocket message on a regular HTTP connection");
}
else
{
ws_outbox += ws_format_frame(type, msg.size());
ws_outbox += msg;
}
stackout();
}
void http_close(http_co_t *handler)
{
handler->end();
}
void http_response_t::parse_json_response(std::string & error, json11::Json & r) const
{
if (this->error != "")
{
error = this->error;
r = json11::Json();
}
else if (status_code != 200)
{
error = "HTTP "+std::to_string(status_code)+" "+status_line+" body: "+trim(body);
r = json11::Json();
}
else
{
std::string json_err;
json11::Json data = json11::Json::parse(body, json_err);
if (json_err != "")
{
error = "Bad JSON: "+json_err+" (response: "+trim(body)+")";
r = json11::Json();
}
else
{
error = "";
r = data;
}
}
}
http_co_t::~http_co_t()
{
close_connection();
}
void http_co_t::close_connection()
{
if (timeout_id >= 0)
{
@@ -164,63 +275,43 @@ http_co_t::~http_co_t()
close(peer_fd);
peer_fd = -1;
}
if (parsed.headers["transfer-encoding"] == "chunked")
{
int prev = 0, pos = 0;
while ((pos = response.find("\r\n", prev)) >= prev)
{
uint64_t len = strtoull(response.c_str()+prev, NULL, 16);
parsed.body += response.substr(pos+2, len);
prev = pos+2+len+2;
}
}
else
{
std::swap(parsed.body, response);
}
parsed.eof = true;
callback(&parsed);
state = HTTP_CO_CLOSED;
connected_host = "";
response = "";
epoll_events = 0;
}
void http_co_t::start_connection()
{
stackin();
struct sockaddr addr;
struct sockaddr_storage addr;
if (!string_to_addr(host.c_str(), 1, 80, &addr))
{
parsed.error_code = ENXIO;
close_connection();
parsed = { .error = "Invalid address: "+host };
run_cb_and_clear();
stackout();
end();
return;
}
peer_fd = socket(addr.sa_family, SOCK_STREAM, 0);
peer_fd = socket(addr.ss_family, SOCK_STREAM, 0);
if (peer_fd < 0)
{
parsed.error_code = errno;
close_connection();
parsed = { .error = std::string("socket: ")+strerror(errno) };
run_cb_and_clear();
stackout();
end();
return;
}
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
if (request_timeout > 0)
{
timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
{
if (response.length() == 0)
{
parsed.error_code = ETIME;
}
end();
});
}
epoll_events = 0;
// Finally call connect
int r = ::connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
if (r < 0 && errno != EINPROGRESS)
{
parsed.error_code = errno;
close_connection();
parsed = { .error = std::string("connect: ")+strerror(errno) };
run_cb_and_clear();
stackout();
end();
return;
}
tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
@@ -228,6 +319,7 @@ void http_co_t::start_connection()
this->epoll_events |= epoll_events;
handle_events();
});
connected_host = host;
state = HTTP_CO_CONNECTING;
stackout();
}
@@ -246,11 +338,14 @@ void http_co_t::handle_events()
epoll_events &= ~EPOLLOUT;
if (epoll_events & EPOLLIN)
{
submit_read();
submit_read(false);
}
else if (epoll_events & (EPOLLRDHUP|EPOLLERR))
{
end();
if (state == HTTP_CO_HEADERS_RECEIVED)
std::swap(parsed.body, response);
close_connection();
run_cb_and_clear();
break;
}
}
@@ -269,9 +364,10 @@ void http_co_t::handle_connect_result()
}
if (result != 0)
{
parsed.error_code = result;
close_connection();
parsed = { .error = std::string("connect: ")+strerror(result) };
run_cb_and_clear();
stackout();
end();
return;
}
int one = 1;
@@ -286,41 +382,6 @@ void http_co_t::handle_connect_result()
stackout();
}
void http_co_t::submit_read()
{
stackin();
int res;
if (rbuf.size() != READ_BUFFER_SIZE)
{
rbuf.resize(READ_BUFFER_SIZE);
}
read_iov = { .iov_base = rbuf.data(), .iov_len = READ_BUFFER_SIZE };
read_msg.msg_iov = &read_iov;
read_msg.msg_iovlen = 1;
res = recvmsg(peer_fd, &read_msg, 0);
if (res < 0)
{
res = -errno;
}
if (res == -EAGAIN)
{
epoll_events = epoll_events & ~EPOLLIN;
}
else if (res <= 0)
{
// < 0 means error, 0 means EOF
if (!res)
epoll_events = epoll_events & ~EPOLLIN;
end();
}
else
{
response += std::string(rbuf.data(), res);
handle_read();
}
stackout();
}
void http_co_t::submit_send()
{
stackin();
@@ -336,23 +397,23 @@ again:
{
res = -errno;
}
if (res == -EAGAIN)
if (res == -EAGAIN || res == -EINTR)
{
res = 0;
}
else if (res < 0)
{
close_connection();
parsed = { .error = std::string("sendmsg: ")+strerror(errno) };
run_cb_and_clear();
stackout();
end();
return;
}
sent += res;
if (state == HTTP_CO_SENDING_REQUEST)
{
if (sent >= request.size())
{
state = HTTP_CO_REQUEST_SENT;
}
else
goto again;
}
@@ -366,6 +427,61 @@ again:
stackout();
}
void http_co_t::submit_read(bool check_timeout)
{
stackin();
int res;
again:
if (rbuf.size() != READ_BUFFER_SIZE)
{
rbuf.resize(READ_BUFFER_SIZE);
}
read_iov = { .iov_base = rbuf.data(), .iov_len = READ_BUFFER_SIZE };
read_msg.msg_iov = &read_iov;
read_msg.msg_iovlen = 1;
res = recvmsg(peer_fd, &read_msg, 0);
if (res < 0)
{
res = -errno;
}
if (res == -EAGAIN || res == -EINTR)
{
if (check_timeout)
{
if (res == -EINTR)
goto again;
else
{
// Timeout happened and there is no data to read
close_connection();
parsed = { .error = "HTTP request timed out" };
run_cb_and_clear();
}
}
else
{
epoll_events = epoll_events & ~EPOLLIN;
}
}
else if (res <= 0)
{
// < 0 means error, 0 means EOF
epoll_events = epoll_events & ~EPOLLIN;
if (state == HTTP_CO_HEADERS_RECEIVED)
std::swap(parsed.body, response);
close_connection();
if (res < 0)
parsed = { .error = std::string("recvmsg: ")+strerror(-res) };
run_cb_and_clear();
}
else
{
response += std::string(rbuf.data(), res);
handle_read();
}
stackout();
}
bool http_co_t::handle_read()
{
stackin();
@@ -376,6 +492,7 @@ bool http_co_t::handle_read()
{
if (timeout_id >= 0)
{
// Timeout is cleared when headers are received
tfd->clear_timer(timeout_id);
timeout_id = -1;
}
@@ -403,20 +520,29 @@ bool http_co_t::handle_read()
if (!target_response_size)
{
// Sorry, unsupported response
close_connection();
parsed = { .error = "Response has neither Connection: close, nor Transfer-Encoding: chunked nor Content-Length headers" };
run_cb_and_clear();
stackout();
end();
return false;
}
}
else
{
keepalive = false;
}
}
}
if (state == HTTP_CO_HEADERS_RECEIVED && target_response_size > 0 && response.size() >= target_response_size)
{
stackout();
end();
return false;
std::swap(parsed.body, response);
if (!keepalive)
close_connection();
else
state = HTTP_CO_KEEPALIVE;
run_cb_and_clear();
}
if (state == HTTP_CO_CHUNKED && response.size() > 0)
else if (state == HTTP_CO_CHUNKED && response.size() > 0)
{
int prev = 0, pos = 0;
while ((pos = response.find("\r\n", prev)) >= prev)
@@ -439,31 +565,27 @@ bool http_co_t::handle_read()
{
response = response.substr(prev);
}
if (parsed.eof)
if (want_streaming)
{
stackout();
end();
return false;
}
if (want_streaming && parsed.body.size() > 0)
{
if (!ended)
{
// Don't deliver additional events after close()
callback(&parsed);
}
// Streaming response
response_callback(&parsed);
parsed.body = "";
}
else if (parsed.eof)
{
// Normal response
if (!keepalive)
close_connection();
else
state = HTTP_CO_KEEPALIVE;
run_cb_and_clear();
}
}
if (state == HTTP_CO_WEBSOCKET && response.size() > 0)
else if (state == HTTP_CO_WEBSOCKET && response.size() > 0)
{
while (ws_parse_frame(response, parsed.ws_msg_type, parsed.body))
{
if (!ended)
{
// Don't deliver additional events after close()
callback(&parsed);
}
response_callback(&parsed);
parsed.body = "";
}
}
@@ -471,21 +593,14 @@ bool http_co_t::handle_read()
return true;
}
void http_co_t::post_message(int type, const std::string & msg)
void http_co_t::next_request()
{
stackin();
if (state == HTTP_CO_WEBSOCKET)
if (keepalive_queue.size() > 0)
{
request += ws_format_frame(type, msg.size());
request += msg;
submit_send();
auto next = keepalive_queue[0];
keepalive_queue.erase(keepalive_queue.begin(), keepalive_queue.begin()+1);
next();
}
else
{
ws_outbox += ws_format_frame(type, msg.size());
ws_outbox += msg;
}
stackout();
}
uint64_t stoull_full(const std::string & str, int base)
@@ -503,7 +618,7 @@ uint64_t stoull_full(const std::string & str, int base)
return r;
}
void parse_http_headers(std::string & res, http_response_t *parsed)
static void parse_http_headers(std::string & res, http_response_t *parsed)
{
int pos = res.find("\r\n");
pos = pos < 0 ? res.length() : pos+2;
@@ -552,13 +667,13 @@ static std::string ws_format_frame(int type, uint64_t size)
res[p++] = size | /*mask*/0x80;
else if (size < 65536)
{
res[p++] = 126 | /*mask*/0x80;
res[p++] = (char)(126 | /*mask*/0x80);
res[p++] = (size >> 8) & 0xFF;
res[p++] = (size >> 0) & 0xFF;
}
else
{
res[p++] = 127 | /*mask*/0x80;
res[p++] = (char)(127 | /*mask*/0x80);
res[p++] = (size >> 56) & 0xFF;
res[p++] = (size >> 48) & 0xFF;
res[p++] = (size >> 40) & 0xFF;
@@ -625,136 +740,6 @@ static bool ws_parse_frame(std::string & buf, int & type, std::string & res)
return true;
}
static bool cidr_match(const in_addr &addr, const in_addr &net, uint8_t bits)
{
if (bits == 0)
{
// C99 6.5.7 (3): u32 << 32 is undefined behaviour
return true;
}
return !((addr.s_addr ^ net.s_addr) & htonl(0xFFFFFFFFu << (32 - bits)));
}
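// Example: for 10.0.1.5 vs 10.0.0.0/16 the XOR leaves difference bits only in
// the host part, the /16 mask zeroes them out, so the addresses match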
static bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_t bits)
{
const uint32_t *a = address.s6_addr32;
const uint32_t *n = network.s6_addr32;
int bits_whole, bits_incomplete;
bits_whole = bits >> 5; // number of whole u32
bits_incomplete = bits & 0x1F; // number of bits in incomplete u32
if (bits_whole && memcmp(a, n, bits_whole << 2))
return false;
if (bits_incomplete)
{
uint32_t mask = htonl((0xFFFFFFFFu) << (32 - bits_incomplete));
if ((a[bits_whole] ^ n[bits_whole]) & mask)
return false;
}
return true;
}
struct addr_mask_t
{
sa_family_t family;
in_addr ipv4;
in6_addr ipv6;
uint8_t bits;
};
std::vector<std::string> getifaddr_list(json11::Json mask_cfg, bool include_v6)
{
std::vector<addr_mask_t> masks;
if (mask_cfg.is_string())
{
mask_cfg = json11::Json::array{ mask_cfg };
}
for (auto mask_json: mask_cfg.array_items())
{
std::string mask = mask_json.string_value();
unsigned bits = 0;
int p = mask.find('/');
if (p != std::string::npos)
{
char null_byte = 0;
if (sscanf(mask.c_str()+p+1, "%u%c", &bits, &null_byte) != 1 || bits > 128)
{
throw std::runtime_error((include_v6 ? "Invalid IPv4 address mask: " : "Invalid IP address mask: ") + mask);
}
mask = mask.substr(0, p);
}
in_addr ipv4;
in6_addr ipv6;
if (inet_pton(AF_INET, mask.c_str(), &ipv4) == 1)
{
if (bits > 32)
{
throw std::runtime_error((include_v6 ? "Invalid IPv4 address mask: " : "Invalid IP address mask: ") + mask);
}
masks.push_back((addr_mask_t){ .family = AF_INET, .ipv4 = ipv4, .bits = (uint8_t)bits });
}
else if (include_v6 && inet_pton(AF_INET6, mask.c_str(), &ipv6) == 1)
{
masks.push_back((addr_mask_t){ .family = AF_INET6, .ipv6 = ipv6, .bits = (uint8_t)bits });
}
else
{
throw std::runtime_error((include_v6 ? "Invalid IPv4 address mask: " : "Invalid IP address mask: ") + mask);
}
}
std::vector<std::string> addresses;
ifaddrs *list, *ifa;
if (getifaddrs(&list) == -1)
{
throw std::runtime_error(std::string("getifaddrs: ") + strerror(errno));
}
for (ifa = list; ifa != NULL; ifa = ifa->ifa_next)
{
if (!ifa->ifa_addr)
{
continue;
}
int family = ifa->ifa_addr->sa_family;
if ((family == AF_INET || (family == AF_INET6 && include_v6)) &&
(ifa->ifa_flags & (IFF_UP | IFF_RUNNING | IFF_LOOPBACK)) == (IFF_UP | IFF_RUNNING))
{
void *addr_ptr;
if (family == AF_INET)
{
addr_ptr = &((sockaddr_in *)ifa->ifa_addr)->sin_addr;
}
else
{
addr_ptr = &((sockaddr_in6 *)ifa->ifa_addr)->sin6_addr;
}
if (masks.size() > 0)
{
int i;
for (i = 0; i < masks.size(); i++)
{
if (masks[i].family == family && (family == AF_INET
? cidr_match(*(in_addr*)addr_ptr, masks[i].ipv4, masks[i].bits)
: cidr6_match(*(in6_addr*)addr_ptr, masks[i].ipv6, masks[i].bits)))
{
break;
}
}
if (i >= masks.size())
{
continue;
}
}
char addr[INET6_ADDRSTRLEN];
if (!inet_ntop(family, addr_ptr, addr, INET6_ADDRSTRLEN))
{
throw std::runtime_error(std::string("inet_ntop: ") + strerror(errno));
}
addresses.push_back(std::string(addr));
}
}
freeifaddrs(list);
return addresses;
}
std::string strtolower(const std::string & in)
{
std::string s = in;

View File

@@ -21,41 +21,34 @@ struct http_options_t
{
int timeout;
bool want_streaming;
bool keepalive;
};
struct http_response_t
{
std::string error;
bool eof = false;
int error_code = 0;
int status_code = 0;
std::string status_line;
std::map<std::string, std::string> headers;
int ws_msg_type = -1;
std::string body;
void parse_json_response(std::string & error, json11::Json & r) const;
};
// Opened websocket or keepalive HTTP connection
struct http_co_t;
struct websocket_t
{
http_co_t *co;
void post_message(int type, const std::string & msg);
void close();
};
void parse_http_headers(std::string & res, http_response_t *parsed);
std::vector<std::string> getifaddr_list(json11::Json mask_cfg = json11::Json(), bool include_v6 = true);
http_co_t* http_init(timerfd_manager_t *tfd);
http_co_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, const std::string & path,
int timeout, std::function<void(const http_response_t *msg)> on_message);
void http_request(http_co_t *handler, const std::string & host, const std::string & request,
const http_options_t & options, std::function<void(const http_response_t *response)> response_callback);
void http_post_message(http_co_t *handler, int type, const std::string & msg);
void http_close(http_co_t *co);
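// A minimal usage sketch of the keepalive API (hypothetical host and request text):
//   http_co_t *co = http_init(tfd);
//   http_request(co, "10.0.0.1:2379", req_text, { .timeout = 5000, .keepalive = true },
//       [co](const http_response_t *res)
//   {
//       std::string err;
//       json11::Json data;
//       res->parse_json_response(err, data);
//       http_close(co); // or keep `co` around to reuse the connection
//   });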
// Utils
uint64_t stoull_full(const std::string & str, int base = 10);
std::string strtolower(const std::string & in);
void http_request(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
const http_options_t & options, std::function<void(const http_response_t *response)> callback);
void http_request_json(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
int timeout, std::function<void(std::string, json11::Json r)> callback);
websocket_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, const std::string & path,
int timeout, std::function<void(const http_response_t *msg)> callback);

View File

@@ -4,6 +4,7 @@
#include <unistd.h>
#include <fcntl.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/epoll.h>
#include <netinet/tcp.h>
#include <stdexcept>
@@ -221,13 +222,13 @@ void osd_messenger_t::try_connect_peer(uint64_t peer_osd)
void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port)
{
assert(peer_osd != this->osd_num);
struct sockaddr addr;
struct sockaddr_storage addr;
if (!string_to_addr(peer_host, 0, peer_port, &addr))
{
on_connect_peer(peer_osd, -EINVAL);
return;
}
int peer_fd = socket(addr.sa_family, SOCK_STREAM, 0);
int peer_fd = socket(addr.ss_family, SOCK_STREAM, 0);
if (peer_fd < 0)
{
on_connect_peer(peer_osd, -errno);
@@ -483,10 +484,10 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
void osd_messenger_t::accept_connections(int listen_fd)
{
// Accept new connections
sockaddr addr;
sockaddr_storage addr;
socklen_t peer_addr_size = sizeof(addr);
int peer_fd;
while ((peer_fd = accept(listen_fd, &addr, &peer_addr_size)) >= 0)
while ((peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size)) >= 0)
{
assert(peer_fd != 0);
fprintf(stderr, "[OSD %lu] new client %d: connection from %s\n", this->osd_num, peer_fd,
@@ -544,7 +545,7 @@ json11::Json osd_messenger_t::read_config(const json11::Json & config)
int done = 0;
while (done < st.st_size)
{
int r = read(fd, (void*)buf.data()+done, st.st_size-done);
int r = read(fd, (uint8_t*)buf.data()+done, st.st_size-done);
if (r < 0)
{
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));

View File

@@ -49,7 +49,7 @@ struct osd_client_t
{
int refs = 0;
sockaddr peer_addr;
sockaddr_storage peer_addr;
int peer_port;
int peer_fd;
int peer_state;
@@ -120,6 +120,7 @@ struct osd_messenger_t
protected:
int keepalive_timer_id = -1;
uint32_t receive_buffer_size = 0;
int peer_connect_interval = 0;
int peer_connect_timeout = 0;
int osd_idle_timeout = 0;
@@ -141,8 +142,6 @@ protected:
std::vector<std::function<void()>> set_immediate;
public:
uint32_t receive_buffer_size = 0;
timerfd_manager_t *tfd;
ring_loop_t *ringloop;
// osd_num_t is only for logging and asserts
@@ -173,11 +172,10 @@ public:
bool connect_rdma(int peer_fd, std::string rdma_address, uint64_t client_max_msg);
#endif
void handle_peer_epoll(int peer_fd, int epoll_events);
protected:
void try_connect_peer(uint64_t osd_num);
void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
void handle_peer_epoll(int peer_fd, int epoll_events);
void handle_connect_epoll(int peer_fd);
void on_connect_peer(osd_num_t peer_osd, int peer_fd);
void check_peer_config(osd_client_t *cl);

View File

@@ -141,7 +141,7 @@ struct osd_op_buf_list_t
else
{
iov.iov_len -= result;
iov.iov_base += result;
iov.iov_base = (uint8_t*)iov.iov_base + result;
break;
}
}

View File

@@ -58,7 +58,9 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
msgr_rdma_context_t *ctx = new msgr_rdma_context_t();
ctx->mtu = mtu;
srand48(time(NULL));
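// Seed with nanosecond-resolution time so that processes started within the
// same second don't share the same pseudo-random sequence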
timespec tv;
clock_gettime(CLOCK_REALTIME, &tv);
srand48(tv.tv_sec*1000000000 + tv.tv_nsec);
dev_list = ibv_get_device_list(NULL);
if (!dev_list)
{
@@ -389,7 +391,7 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
uint32_t len = (uint32_t)(op_size+iov.iov_len-rc->send_buf_pos < rc->max_msg
? iov.iov_len-rc->send_buf_pos : rc->max_msg-op_size);
sge[op_sge++] = {
.addr = (uintptr_t)(iov.iov_base+rc->send_buf_pos),
.addr = (uintptr_t)((uint8_t*)iov.iov_base+rc->send_buf_pos),
.length = len,
.lkey = rc->ctx->mr->lkey,
};
@@ -519,7 +521,7 @@ void osd_messenger_t::handle_rdma_events()
}
if (cl->rdma_conn->send_buf_pos > 0)
{
cl->send_list[0].iov_base += cl->rdma_conn->send_buf_pos;
cl->send_list[0].iov_base = (uint8_t*)cl->send_list[0].iov_base + cl->rdma_conn->send_buf_pos;
cl->send_list[0].iov_len -= cl->rdma_conn->send_buf_pos;
cl->rdma_conn->send_buf_pos = 0;
}

View File

@@ -67,7 +67,7 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
}
return false;
}
if (result <= 0 && result != -EAGAIN)
if (result <= 0 && result != -EAGAIN && result != -EINTR)
{
// this is a client socket, so don't panic on error. just disconnect it
if (result != 0)
@@ -77,7 +77,7 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
stop_client(cl->peer_fd);
return false;
}
if (result == -EAGAIN || result < cl->read_iov.iov_len)
if (result == -EAGAIN || result == -EINTR || result < cl->read_iov.iov_len)
{
cl->read_ready--;
if (cl->read_ready > 0)
@@ -142,13 +142,13 @@ bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int rem
memcpy(cur->iov_base, curbuf, remain);
cl->read_remaining -= remain;
cur->iov_len -= remain;
cur->iov_base += remain;
cur->iov_base = (uint8_t*)cur->iov_base + remain;
remain = 0;
}
else
{
memcpy(cur->iov_base, curbuf, cur->iov_len);
curbuf += cur->iov_len;
curbuf = (uint8_t*)curbuf + cur->iov_len;
cl->read_remaining -= cur->iov_len;
remain -= cur->iov_len;
cur->iov_len = 0;
@@ -390,7 +390,7 @@ void osd_messenger_t::handle_reply_ready(osd_op_t *op)
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
);
set_immediate.push_back([this, op]()
set_immediate.push_back([op]()
{
// Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(op->callback)(op);

View File

@@ -224,7 +224,7 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
}
return;
}
if (result < 0 && result != -EAGAIN)
if (result < 0 && result != -EAGAIN && result != -EINTR)
{
// this is a client socket, so don't panic. just disconnect it
fprintf(stderr, "Client %d socket write error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
@@ -250,7 +250,7 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
else
{
iov.iov_len -= result;
iov.iov_base += result;
iov.iov_base = (uint8_t*)iov.iov_base + result;
break;
}
}

View File

@@ -111,6 +111,10 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
{
delete cl->read_op;
}
else
{
cancel_op(cl->read_op);
}
cl->read_op = NULL;
}
if (cl->osd_num)

View File

@@ -30,6 +30,9 @@ protected:
std::string image_name;
uint64_t inode = 0;
uint64_t device_size = 0;
int nbd_timeout = 30;
int nbd_max_devices = 64;
int nbd_max_part = 3;
inode_watch_t *watch = NULL;
ring_loop_t *ringloop = NULL;
@@ -51,7 +54,18 @@ protected:
msghdr read_msg = { 0 }, send_msg = { 0 };
iovec read_iov = { 0 };
std::string logfile = "/dev/null";
public:
~nbd_proxy()
{
if (recv_buf)
{
free(recv_buf);
recv_buf = NULL;
}
}
static json11::Json::object parse_args(int narg, const char *args[])
{
json11::Json::object cfg;
@@ -117,9 +131,18 @@ public:
"Vitastor NBD proxy\n"
"(c) Vitaliy Filippov, 2020-2021 (VNPL-1.1)\n\n"
"USAGE:\n"
" %s map [--etcd_address <etcd_address>] (--image <image> | --pool <pool> --inode <inode> --size <size in bytes>)\n"
" %s map [OPTIONS] (--image <image> | --pool <pool> --inode <inode> --size <size in bytes>)\n"
" %s unmap /dev/nbd0\n"
" %s ls [--json]\n",
" %s ls [--json]\n"
"OPTIONS:\n"
" All usual Vitastor config options like --etcd_address <etcd_address> plus NBD-specific:\n"
" --nbd_timeout 30\n"
" timeout in seconds after which the kernel will stop the device\n"
" you can set it to 0, but beware that you won't be able to stop the device at all\n"
" if vitastor-nbd process dies\n"
" --nbd_max_devices 64 --nbd_max_part 3\n"
" options for the \"nbd\" kernel module when modprobing it (nbds_max and max_part).\n"
" note that maximum allowed (nbds_max)*(1+max_part) is 256.\n",
exe_name, exe_name, exe_name
);
exit(0);
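// Example invocation (hypothetical addresses and image name):
//   vitastor-nbd map --etcd_address 10.0.0.1:2379/v3 --image testimg \
//       --nbd_timeout 60 --nbd_max_devices 128 --nbd_max_part 1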
@@ -166,7 +189,7 @@ public:
uint64_t pool = cfg["pool"].uint64_value();
if (pool)
{
inode = (inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (pool << (64-POOL_ID_BITS));
inode = (inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (pool << (64-POOL_ID_BITS));
}
if (!(inode >> (64-POOL_ID_BITS)))
{
@@ -174,6 +197,18 @@ public:
exit(1);
}
}
if (cfg["nbd_max_devices"].is_number() || cfg["nbd_max_devices"].is_string())
{
nbd_max_devices = cfg["nbd_max_devices"].uint64_value();
}
if (cfg["nbd_max_part"].is_number() || cfg["nbd_max_part"].is_string())
{
nbd_max_part = cfg["nbd_max_part"].uint64_value();
}
if (cfg["nbd_timeout"].is_number() || cfg["nbd_timeout"].is_string())
{
nbd_timeout = cfg["nbd_timeout"].uint64_value();
}
// Create client
ringloop = new ring_loop_t(512);
epmgr = new epoll_manager_t(ringloop);
@@ -190,6 +225,12 @@ public:
}
watch = cli->st_cli.watch_inode(image_name);
device_size = watch->cfg.size;
if (!watch->cfg.num || !device_size)
{
// Image does not exist
fprintf(stderr, "Image %s does not exist\n", image_name.c_str());
exit(1);
}
}
// Initialize NBD
int sockfd[2];
@@ -204,7 +245,7 @@ public:
bool bg = cfg["foreground"].is_null();
if (!cfg["dev_num"].is_null())
{
if (run_nbd(sockfd, cfg["dev_num"].int64_value(), device_size, NBD_FLAG_SEND_FLUSH, 30, bg) < 0)
if (run_nbd(sockfd, cfg["dev_num"].int64_value(), device_size, NBD_FLAG_SEND_FLUSH, nbd_timeout, bg) < 0)
{
perror("run_nbd");
exit(1);
@@ -239,6 +280,10 @@ public:
}
}
}
if (cfg["logfile"].is_string())
{
logfile = cfg["logfile"].string_value();
}
if (bg)
{
daemonize();
@@ -278,7 +323,7 @@ public:
stop = false;
cluster_op_t *close_sync = new cluster_op_t;
close_sync->opcode = OSD_OP_SYNC;
close_sync->callback = [this, &stop](cluster_op_t *op)
close_sync->callback = [&stop](cluster_op_t *op)
{
stop = true;
delete op;
@@ -292,6 +337,9 @@ public:
delete cli;
delete epmgr;
delete ringloop;
cli = NULL;
epmgr = NULL;
ringloop = NULL;
}
void load_module()
@@ -301,7 +349,10 @@ public:
return;
}
int r;
if ((r = system("modprobe nbd")) != 0)
// The kernel built-in default is 16 devices with up to 16 partitions per device, which is far too low
// 64 also isn't too high, but the possible maximum is nbds_max=256 max_part=0, which doesn't reserve
// any block device minor numbers for partitions
if ((r = system(("modprobe nbd nbds_max="+std::to_string(nbd_max_devices)+" max_part="+std::to_string(nbd_max_part)).c_str())) != 0)
{
if (r < 0)
perror("Failed to load NBD kernel module");
@@ -318,13 +369,14 @@ public:
setsid();
if (fork())
exit(0);
chdir("/");
close(0);
close(1);
close(2);
open("/dev/null", O_RDONLY);
open("/dev/null", O_WRONLY);
open("/dev/null", O_WRONLY);
open(logfile.c_str(), O_WRONLY|O_APPEND|O_CREAT, 0666);
open(logfile.c_str(), O_WRONLY|O_APPEND|O_CREAT, 0666);
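// fd 0 (stdin) is reopened from /dev/null, fds 1 and 2 (stdout/stderr) now
// point at the log file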
if (chdir("/") != 0)
fprintf(stderr, "Warning: Failed to chdir into /\n");
}
json11::Json::object list_mapped()
@@ -465,7 +517,7 @@ protected:
goto end_unmap;
}
ioctl(nbd, NBD_SET_FLAGS, flags);
if (timeout >= 0)
if (timeout > 0)
{
r = ioctl(nbd, NBD_SET_TIMEOUT, (unsigned long)timeout);
if (r < 0)
@@ -480,7 +532,11 @@ protected:
{
goto end_unmap;
}
write(qd_fd, "32768", 5);
r = write(qd_fd, "32768", 5);
if (r != 5)
{
fprintf(stderr, "Warning: Failed to configure max_sectors_kb\n");
}
close(qd_fd);
if (!fork())
{
@@ -553,7 +609,7 @@ protected:
}
else
{
send_list[to_eat].iov_base += result;
send_list[to_eat].iov_base = (uint8_t*)send_list[to_eat].iov_base + result;
send_list[to_eat].iov_len -= result;
break;
}
@@ -627,13 +683,14 @@ protected:
memcpy(cur_buf, b, inc);
cur_left -= inc;
result -= inc;
cur_buf += inc;
b += inc;
cur_buf = (uint8_t*)cur_buf + inc;
b = (uint8_t*)b + inc;
}
else
{
assert(result <= cur_left);
cur_left -= result;
cur_buf = (uint8_t*)cur_buf + result;
result = 0;
}
if (cur_left <= 0)
@@ -648,6 +705,12 @@ protected:
if (read_state == CL_READ_HDR)
{
int req_type = be32toh(cur_req.type);
if (be32toh(cur_req.magic) == NBD_REQUEST_MAGIC && req_type == NBD_CMD_DISC)
{
// Disconnect
close(nbd_fd);
exit(0);
}
if (be32toh(cur_req.magic) != NBD_REQUEST_MAGIC ||
req_type != NBD_CMD_READ && req_type != NBD_CMD_WRITE && req_type != NBD_CMD_FLUSH)
{
@@ -667,7 +730,7 @@ protected:
op->offset = be64toh(cur_req.from);
op->len = be32toh(cur_req.len);
buf = malloc_or_die(sizeof(nbd_reply) + op->len);
op->iov.push_back(buf + sizeof(nbd_reply), op->len);
op->iov.push_back((uint8_t*)buf + sizeof(nbd_reply), op->len);
}
else if (req_type == NBD_CMD_FLUSH)
{
@@ -695,7 +758,7 @@ protected:
if (req_type == NBD_CMD_WRITE)
{
cur_op = op;
cur_buf = buf + sizeof(nbd_reply);
cur_buf = (uint8_t*)buf + sizeof(nbd_reply);
cur_left = op->len;
read_state = CL_READ_DATA;
}
@@ -734,5 +797,6 @@ int main(int narg, const char *args[])
exe_name = args[0];
nbd_proxy *p = new nbd_proxy();
p->exec(nbd_proxy::parse_args(narg, args));
delete p;
return 0;
}

748
src/nfs_conn.cpp Normal file
View File

@@ -0,0 +1,748 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
//
// NFS connection handler for NFS proxy
#include <sys/time.h>
#include "libnfs-raw-mount.h"
#include "libnfs-raw-nfs.h"
#include "base64.h"
#include "nfs_proxy.h"
static unsigned len_pad4(unsigned len)
{
return len + (len&3 ? 4-(len&3) : 0);
}
static int nfs3_null_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
rpc_send_reply(rpc, call, NULL, (zdrproc_t)zdr_void, 0);
return 0;
}
static fattr3 get_dir_attributes(nfs_client_t *self, std::string dir)
{
return (fattr3){
.type = NF3DIR,
.mode = 0755,
.nlink = 1,
.uid = 0,
.gid = 0,
.size = 4096,
.used = 4096,
.rdev = (specdata3){ 0 },
.fsid = self->parent->fsid,
.fileid = dir == "" ? 1 : self->parent->dir_ids.at(dir),
//.atime = (nfstime3){ .seconds = now.tv_sec, .nseconds = now.tv_nsec },
//.mtime = (nfstime3){ .seconds = now.tv_sec, .nseconds = now.tv_nsec },
//.ctime = (nfstime3){ .seconds = now.tv_sec, .nseconds = now.tv_nsec },
};
}
static int nfs3_getattr_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs_client_t *self = (nfs_client_t*)opaque;
GETATTR3args *args = (GETATTR3args*)call->body.cbody.args;
GETATTR3res reply;
std::string dirhash = std::string(args->object.data.data_val, args->object.data.data_len);
bool is_dir = false;
std::string dir;
if (dirhash == "roothandle")
is_dir = true;
else
{
auto dir_it = self->parent->dir_by_hash.find(dirhash);
if (dir_it != self->parent->dir_by_hash.end())
{
is_dir = true;
dir = dir_it->second;
}
}
if (is_dir)
{
// Directory info
reply.status = NFS3_OK;
reply.GETATTR3res_u.resok.obj_attributes = get_dir_attributes(self, dir);
}
else
{
uint64_t inode_num = 0;
auto inode_num_it = self->parent->inode_by_hash.find(dirhash);
if (inode_num_it != self->parent->inode_by_hash.end())
inode_num = inode_num_it->second;
auto inode_it = self->parent->cli->st_cli.inode_config.find(inode_num);
if (inode_it != self->parent->cli->st_cli.inode_config.end())
{
// File info
auto & inode_cfg = inode_it->second;
reply.status = NFS3_OK;
reply.GETATTR3res_u.resok.obj_attributes = {
.type = NF3REG,
.mode = 0644,
.nlink = 1,
.uid = 0,
.gid = 0,
.size = inode_cfg.size,
.used = inode_cfg.size,
.rdev = (specdata3){ 0 },
.fsid = self->parent->fsid,
.fileid = inode_it->first,
//.atime = (nfstime3){ .seconds = now.tv_sec, .nseconds = now.tv_nsec },
//.mtime = (nfstime3){ .seconds = now.tv_sec, .nseconds = now.tv_nsec },
//.ctime = (nfstime3){ .seconds = now.tv_sec, .nseconds = now.tv_nsec },
};
}
else
{
// File does not exist
reply.status = NFS3ERR_NOENT;
}
}
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_GETATTR3res, sizeof(GETATTR3res));
return 0;
}
static int nfs3_setattr_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs_client_t *self = (nfs_client_t*)opaque;
SETATTR3args *args = (SETATTR3args*)call->body.cbody.args;
SETATTR3res reply;
// Not supported yet
reply.status = NFS3ERR_NOTSUPP;
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_SETATTR3res, sizeof(SETATTR3res));
return 0;
}
static int nfs3_lookup_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs_client_t *self = (nfs_client_t*)opaque;
LOOKUP3args *args = (LOOKUP3args*)call->body.cbody.args;
LOOKUP3res reply;
std::string dirhash = std::string(args->what.dir.data.data_val, args->what.dir.data.data_len);
std::string dir;
if (dirhash != "roothandle")
{
auto dir_it = self->parent->dir_by_hash.find(dirhash);
if (dir_it != self->parent->dir_by_hash.end())
dir = dir_it->second;
}
std::string full_name = self->parent->name_prefix;
if (dir != "")
{
full_name += dir+"/";
}
full_name += std::string(args->what.name);
for (auto & ic: self->parent->cli->st_cli.inode_config)
{
if (ic.second.name == full_name)
{
std::string fh = "S"+base64_encode(sha256(full_name.substr(self->parent->name_prefix.size())));
reply.status = NFS3_OK;
reply.LOOKUP3res_u.resok.object.data.data_len = fh.size();
reply.LOOKUP3res_u.resok.object.data.data_val = (char*)fh.c_str();
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_LOOKUP3res, sizeof(LOOKUP3res));
return 0;
}
}
reply.status = NFS3ERR_NOENT;
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_LOOKUP3res, sizeof(LOOKUP3res));
return 0;
}
static int nfs3_access_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs_client_t *self = (nfs_client_t*)opaque;
ACCESS3args *args = (ACCESS3args*)call->body.cbody.args;
ACCESS3res reply = {
.status = NFS3_OK,
.ACCESS3res_u = { .resok = {
.access = args->access,
} },
};
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_ACCESS3res, sizeof(ACCESS3res));
return 0;
}
static int nfs3_readlink_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs_client_t *self = (nfs_client_t*)opaque;
READLINK3args *args = (READLINK3args*)call->body.cbody.args;
READLINK3res reply = {};
// Not supported yet
reply.status = NFS3ERR_NOTSUPP;
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_READLINK3res, sizeof(READLINK3res));
return 0;
}
#define MAX_REQUEST_SIZE (128*1024*1024)
static int nfs3_read_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs_client_t *self = (nfs_client_t*)opaque;
READ3args *args = (READ3args*)call->body.cbody.args;
std::string handle = std::string(args->file.data.data_val, args->file.data.data_len);
auto ino_it = self->parent->inode_by_hash.find(handle);
if (ino_it == self->parent->inode_by_hash.end())
{
READ3res reply = { .status = NFS3ERR_NOENT };
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_READ3res, sizeof(READ3res));
return 0;
}
if (args->count > MAX_REQUEST_SIZE)
{
READ3res reply = { .status = NFS3ERR_INVAL };
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_READ3res, sizeof(READ3res));
return 0;
}
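// Translate the NFS READ into an asynchronous Vitastor cluster read;
// the RPC reply is sent from the completion callback below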
void *buf = malloc_or_die(args->count);
cluster_op_t *op = new cluster_op_t;
op->opcode = OSD_OP_READ;
op->inode = ino_it->second;
op->offset = args->offset;
op->len = args->count;
op->iov.push_back(buf, args->count);
op->callback = [rpc, call](cluster_op_t *op)
{
void *buf = op->iov.buf[0].iov_base;
READ3res reply = {};
if (op->retval != op->len)
{
if (op->retval == -EINVAL)
reply.status = NFS3ERR_INVAL;
else if (op->retval == -ENOSPC)
reply.status = NFS3ERR_NOSPC;
else
reply.status = NFS3ERR_IO;
}
else
{
reply.status = NFS3_OK;
auto & reply_ok = reply.READ3res_u.resok;
reply_ok.count = op->retval;
reply_ok.eof = FALSE;
reply_ok.data.data_len = reply_ok.count;
reply_ok.data.data_val = (char*)buf;
}
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_READ3res, sizeof(READ3res));
delete op;
free(buf);
};
self->parent->cli->execute(op);
return 0;
}
static int nfs3_write_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs_client_t *self = (nfs_client_t*)opaque;
WRITE3args *args = (WRITE3args*)call->body.cbody.args;
WRITE3res reply;
// Not supported yet
reply.status = NFS3ERR_NOTSUPP;
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_WRITE3res, sizeof(WRITE3res));
return 0;
}
static int nfs3_create_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs_client_t *self = (nfs_client_t*)opaque;
CREATE3args *args = (CREATE3args*)call->body.cbody.args;
CREATE3res reply;
// Not supported yet
reply.status = NFS3ERR_NOTSUPP;
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_CREATE3res, sizeof(CREATE3res));
return 0;
}
static int nfs3_mkdir_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs_client_t *self = (nfs_client_t*)opaque;
MKDIR3args *args = (MKDIR3args*)call->body.cbody.args;
MKDIR3res reply;
// Not supported yet
reply.status = NFS3ERR_NOTSUPP;
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_MKDIR3res, sizeof(MKDIR3res));
return 0;
}
static int nfs3_symlink_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs_client_t *self = (nfs_client_t*)opaque;
SYMLINK3args *args = (SYMLINK3args*)call->body.cbody.args;
SYMLINK3res reply;
// Not supported yet
reply.status = NFS3ERR_NOTSUPP;
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_SYMLINK3res, sizeof(SYMLINK3res));
return 0;
}
static int nfs3_mknod_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs_client_t *self = (nfs_client_t*)opaque;
MKNOD3args *args = (MKNOD3args*)call->body.cbody.args;
MKNOD3res reply;
// Not supported yet
reply.status = NFS3ERR_NOTSUPP;
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_MKNOD3res, sizeof(MKNOD3res));
return 0;
}
static int nfs3_remove_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
REMOVE3args *args = (REMOVE3args*)call->body.cbody.args;
REMOVE3res reply;
// Not supported yet
reply.status = NFS3ERR_NOTSUPP;
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_REMOVE3res, sizeof(REMOVE3res));
return 0;
}
static int nfs3_rmdir_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs_client_t *self = (nfs_client_t*)opaque;
RMDIR3args *args = (RMDIR3args*)call->body.cbody.args;
RMDIR3res reply;
// Not supported yet
reply.status = NFS3ERR_NOTSUPP;
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_RMDIR3res, sizeof(RMDIR3res));
return 0;
}
static int nfs3_rename_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs_client_t *self = (nfs_client_t*)opaque;
RENAME3args *args = (RENAME3args*)call->body.cbody.args;
RENAME3res reply;
// Not supported yet
reply.status = NFS3ERR_NOTSUPP;
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_RENAME3res, sizeof(RENAME3res));
return 0;
}
static int nfs3_link_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs_client_t *self = (nfs_client_t*)opaque;
LINK3args *args = (LINK3args*)call->body.cbody.args;
// We don't support hard links
LINK3res reply = { NFS3ERR_NOTSUPP };
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_LINK3res, sizeof(LINK3res));
return 0;
}
static void nfs3_readdir_common(struct rpc_context *rpc, struct rpc_msg *call, void *opaque, bool is_plus)
{
nfs_client_t *self = (nfs_client_t*)opaque;
READDIRPLUS3args plus_args;
READDIRPLUS3args *args = NULL;
if (is_plus)
args = ((READDIRPLUS3args*)call->body.cbody.args);
else
{
args = &plus_args;
READDIR3args *in_args = ((READDIR3args*)call->body.cbody.args);
args->dir = in_args->dir;
args->cookie = in_args->cookie;
*((uint64_t*)args->cookieverf) = *((uint64_t*)in_args->cookieverf);
args->dircount = 512;
args->maxcount = in_args->count;
}
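// Plain READDIR arguments were converted to READDIRPLUS arguments above, so
// both procedures share one code path; attribute and handle fields are simply
// not filled in when is_plus is false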
std::string dirhash = std::string(args->dir.data.data_val, args->dir.data.data_len);
std::string dir;
if (dirhash != "roothandle")
{
auto dir_it = self->parent->dir_by_hash.find(dirhash);
if (dir_it != self->parent->dir_by_hash.end())
dir = dir_it->second;
}
std::string prefix = self->parent->name_prefix;
if (dir != "")
{
prefix += dir+"/";
}
//struct timespec now;
//clock_gettime(CLOCK_REALTIME, &now);
std::map<std::string, struct entryplus3> entries;
std::vector<std::string> handles;
for (auto & ic: self->parent->cli->st_cli.inode_config)
{
auto & inode_cfg = ic.second;
if (prefix != "" && inode_cfg.name.substr(0, prefix.size()) != prefix)
continue;
std::string subname = inode_cfg.name.substr(prefix.size());
int p = 0;
while (p < subname.size() && subname[p] == '/')
p++;
if (p > 0)
subname = subname.substr(p);
if (subname.size() == 0)
continue;
p = 0;
while (p < subname.size() && subname[p] != '/')
p++;
if (p >= subname.size())
{
entries[subname] = (struct entryplus3){
// fileid will change when the user creates snapshots
// however, we hope that clients tolerate it well
// Linux does, even though it complains about "fileid changed" in dmesg
.fileid = ic.first,
};
if (is_plus)
{
handles.push_back("S"+base64_encode(sha256(inode_cfg.name)));
entries[subname].name_attributes = {
.attributes_follow = TRUE,
.post_op_attr_u = { .attributes = {
.type = NF3REG,
.mode = 0644,
.nlink = 1,
.uid = 0,
.gid = 0,
.size = inode_cfg.size,
.used = inode_cfg.size, // FIXME take from statistics
.rdev = (specdata3){ 0 },
.fsid = self->parent->fsid,
.fileid = ic.first,
//.atime = (nfstime3){ .seconds = now.tv_sec, .nseconds = now.tv_nsec },
//.mtime = (nfstime3){ .seconds = now.tv_sec, .nseconds = now.tv_nsec },
//.ctime = (nfstime3){ .seconds = now.tv_sec, .nseconds = now.tv_nsec },
} },
};
entries[subname].name_handle = {
.handle_follows = TRUE,
.post_op_fh3_u = { .handle = {
.data = {
// FIXME: I really want ZDR with std::string
.data_len = (unsigned)handles[handles.size()-1].size(),
.data_val = (char*)handles[handles.size()-1].c_str(),
},
} },
};
}
}
else
{
auto subdir = dir == "" ? subname.substr(0, p) : dir+"/"+subname.substr(0, p);
entries[subdir] = (struct entryplus3){
// for directories, fileid will change when the user restarts the proxy
.fileid = self->parent->dir_ids.at(subdir),
};
if (is_plus)
{
handles.push_back("S"+base64_encode(sha256(subdir)));
entries[subdir].name_attributes = {
.attributes_follow = TRUE,
.post_op_attr_u = { .attributes = get_dir_attributes(self, subdir) },
};
entries[subdir].name_handle = {
.handle_follows = TRUE,
.post_op_fh3_u = { .handle = {
.data = {
// FIXME: I really want ZDR with std::string
.data_len = (unsigned)handles[handles.size()-1].size(),
.data_val = (char*)handles[handles.size()-1].c_str(),
},
} },
};
}
}
}
// Offset results by the continuation cookie (equal to index in the listing)
uint64_t idx = 1;
void *prev = NULL;
for (auto it = entries.begin(); it != entries.end(); it++)
{
entryplus3 *entry = &it->second;
// First fields of entry3 and entryplus3 are the same: fileid, name, cookie
entry->name = (char*)it->first.c_str();
entry->cookie = idx++;
if (prev)
{
if (is_plus)
((entryplus3*)prev)->nextentry = entry;
else
((entry3*)prev)->nextentry = (entry3*)entry;
}
prev = entry;
if (args->cookie > 0 && entry->cookie == args->cookie+1)
{
entries.erase(entries.begin(), it);
}
}
// Now limit results based on maximum reply size
// Sadly we have to calculate reply size by hand
// reply without entries is 4+4+(dir_attributes ? sizeof(fattr3) : 0)+8+4 bytes
int reply_size = 20;
if (reply_size > args->maxcount)
{
// Error: the requested maximum reply size is too small
if (is_plus)
{
READDIRPLUS3res reply = { .status = NFS3ERR_TOOSMALL };
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_READDIRPLUS3res, sizeof(READDIRPLUS3res));
}
else
{
READDIR3res reply = { .status = NFS3ERR_TOOSMALL };
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_READDIR3res, sizeof(READDIR3res));
}
return;
}
// 1 entry3 is (8+4+(filename_len+3)/4*4+8) bytes
// 1 entryplus3 is (8+4+(filename_len+3)/4*4+8
// + 4+(name_attributes ? (sizeof(fattr3) = 84) : 0)
// + 4+(name_handle ? 4+(handle_len+3)/4*4 : 0)) bytes
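// Example: a plain entry3 for the 5-byte name "image" takes
// 8 (fileid) + 4 (name length) + 8 (padded name) + 8 (cookie) = 28 bytes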
bool eof = true;
for (auto it = entries.begin(); it != entries.end(); it++)
{
reply_size += 20+len_pad4(it->first.size())+(is_plus
? 8+84+len_pad4(it->second.name_handle.post_op_fh3_u.handle.data.data_len) : 0);
if (reply_size > args->maxcount)
{
// Stop
entries.erase(it, entries.end());
eof = false;
break;
}
}
if (entries.end() != entries.begin())
{
auto last_it = entries.end();
last_it--;
if (is_plus)
((entryplus3*)&last_it->second)->nextentry = NULL;
else
((entry3*)&last_it->second)->nextentry = NULL;
}
// Send reply
if (is_plus)
{
READDIRPLUS3res reply = { .status = NFS3_OK };
*(uint64_t*)(reply.READDIRPLUS3res_u.resok.cookieverf) = self->parent->dir_mod_rev.at(dir);
reply.READDIRPLUS3res_u.resok.reply.entries = &entries.begin()->second;
reply.READDIRPLUS3res_u.resok.reply.eof = eof;
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_READDIRPLUS3res, sizeof(READDIRPLUS3res));
}
else
{
READDIR3res reply = { .status = NFS3_OK };
*(uint64_t*)(reply.READDIR3res_u.resok.cookieverf) = self->parent->dir_mod_rev.at(dir);
reply.READDIR3res_u.resok.reply.entries = (entry3*)&entries.begin()->second;
reply.READDIR3res_u.resok.reply.eof = eof;
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_READDIR3res, sizeof(READDIR3res));
}
}
static int nfs3_readdir_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs3_readdir_common(rpc, call, opaque, false);
return 0;
}
static int nfs3_readdirplus_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs3_readdir_common(rpc, call, opaque, true);
return 0;
}
// Get file system statistics
static int nfs3_fsstat_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs_client_t *self = (nfs_client_t*)opaque;
FSSTAT3args *args = (FSSTAT3args*)call->body.cbody.args;
FSSTAT3res reply;
reply.status = NFS3_OK;
reply.FSSTAT3res_u.resok.obj_attributes.attributes_follow = TRUE;
reply.FSSTAT3res_u.resok.obj_attributes.post_op_attr_u.attributes = get_dir_attributes(self, "");
reply.FSSTAT3res_u.resok.tbytes = 4096; // total bytes
reply.FSSTAT3res_u.resok.fbytes = 4096; // free bytes
reply.FSSTAT3res_u.resok.abytes = 4096; // available bytes
reply.FSSTAT3res_u.resok.tfiles = 1 << 31; // total files
reply.FSSTAT3res_u.resok.ffiles = 1 << 31; // free files
reply.FSSTAT3res_u.resok.afiles = 1 << 31; // available files
reply.FSSTAT3res_u.resok.invarsec = 0;
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_FSSTAT3res, sizeof(FSSTAT3res));
return 0;
}
static int nfs3_fsinfo_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs_client_t *self = (nfs_client_t*)opaque;
FSINFO3args *args = (FSINFO3args*)call->body.cbody.args;
FSINFO3res reply;
if (args->fsroot.data.data_len != 10)
{
// Example error
reply.status = NFS3ERR_INVAL;
}
else
{
// Fill info
reply.status = NFS3_OK;
reply.FSINFO3res_u.resok.obj_attributes.attributes_follow = TRUE;
reply.FSINFO3res_u.resok.obj_attributes.post_op_attr_u.attributes = get_dir_attributes(self, "");
reply.FSINFO3res_u.resok.rtmax = 128*1024*1024;
reply.FSINFO3res_u.resok.rtpref = 128*1024*1024;
reply.FSINFO3res_u.resok.rtmult = 4096;
reply.FSINFO3res_u.resok.wtmax = 128*1024*1024;
reply.FSINFO3res_u.resok.wtpref = 128*1024*1024;
reply.FSINFO3res_u.resok.wtmult = 4096;
reply.FSINFO3res_u.resok.dtpref = 128;
reply.FSINFO3res_u.resok.maxfilesize = 0x7fffffffffffffff;
reply.FSINFO3res_u.resok.time_delta.seconds = 1;
reply.FSINFO3res_u.resok.time_delta.nseconds = 0;
reply.FSINFO3res_u.resok.properties = FSF3_SYMLINK | FSF3_HOMOGENEOUS;
}
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_FSINFO3res, sizeof(FSINFO3res));
return 0;
}
static int nfs3_pathconf_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs_client_t *self = (nfs_client_t*)opaque;
PATHCONF3args *args = (PATHCONF3args*)call->body.cbody.args;
PATHCONF3res reply;
if (args->object.data.data_len != 10)
{
// Example error
reply.status = NFS3ERR_INVAL;
}
else
{
// Fill info
reply.status = NFS3_OK;
reply.PATHCONF3res_u.resok.obj_attributes.attributes_follow = FALSE;
reply.PATHCONF3res_u.resok.linkmax = 0;
reply.PATHCONF3res_u.resok.name_max = 255;
reply.PATHCONF3res_u.resok.no_trunc = TRUE;
reply.PATHCONF3res_u.resok.chown_restricted = FALSE;
reply.PATHCONF3res_u.resok.case_insensitive = FALSE;
reply.PATHCONF3res_u.resok.case_preserving = TRUE;
}
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_PATHCONF3res, sizeof(PATHCONF3res));
return 0;
}
static int nfs3_commit_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs_client_t *self = (nfs_client_t*)opaque;
COMMIT3args *args = (COMMIT3args*)call->body.cbody.args;
COMMIT3res reply = {};
// Just pretend we did fsync :-)
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_COMMIT3res, sizeof(COMMIT3res));
return 0;
}
static int mount3_mnt_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs_client_t *self = (nfs_client_t*)opaque;
dirpath *arg = (dirpath*)call->body.cbody.args;
int flavor = AUTH_NONE;
mountres3 reply;
reply.fhs_status = MNT3_OK;
reply.mountres3_u.mountinfo.fhandle.fhandle3_len = 10;
reply.mountres3_u.mountinfo.fhandle.fhandle3_val = "roothandle";
reply.mountres3_u.mountinfo.auth_flavors.auth_flavors_len = 1;
reply.mountres3_u.mountinfo.auth_flavors.auth_flavors_val = &flavor;
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_mountres3, sizeof(mountres3));
return 0;
}
static int mount3_dump_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs_client_t *self = (nfs_client_t*)opaque;
mountlist reply;
reply = (struct mountbody*)malloc(sizeof(struct mountbody));
reply->ml_hostname = (dirpath)"127.0.0.1";
reply->ml_directory = (dirpath)"/test";
reply->ml_next = NULL;
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_mountlist, sizeof(mountlist));
free(reply);
return 0;
}
static int mount3_umnt_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs_client_t *self = (nfs_client_t*)opaque;
dirpath *arg = (dirpath*)call->body.cbody.args;
// do nothing
rpc_send_reply(rpc, call, NULL, (zdrproc_t)zdr_void, 0);
return 0;
}
static int mount3_umntall_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
// do nothing
rpc_send_reply(rpc, call, NULL, (zdrproc_t)zdr_void, 0);
return 0;
}
static int mount3_export_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
nfs_client_t *self = (nfs_client_t*)opaque;
exports reply;
reply = (struct exportnode*)malloc(sizeof(struct exportnode) + sizeof(struct groupnode));
reply->ex_dir = (dirpath)"/test";
reply->ex_groups = (struct groupnode*)(reply+1);
reply->ex_groups->gr_name = (dirpath)"127.0.0.1";
reply->ex_groups->gr_next = NULL;
reply->ex_next = NULL;
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_exports, sizeof(exports));
free(reply);
return 0;
}
nfs_client_t::nfs_client_t()
{
struct service_proc nfs3_pt_a[22] = {
{NFS3_NULL, nfs3_null_proc, (zdrproc_t)zdr_void, 0, this},
{NFS3_GETATTR, nfs3_getattr_proc, (zdrproc_t)zdr_GETATTR3args, sizeof(GETATTR3args), this},
{NFS3_SETATTR, nfs3_setattr_proc, (zdrproc_t)zdr_SETATTR3args, sizeof(SETATTR3args), this},
{NFS3_LOOKUP, nfs3_lookup_proc, (zdrproc_t)zdr_LOOKUP3args, sizeof(LOOKUP3args), this},
{NFS3_ACCESS, nfs3_access_proc, (zdrproc_t)zdr_ACCESS3args, sizeof(ACCESS3args), this},
{NFS3_READLINK, nfs3_readlink_proc, (zdrproc_t)zdr_READLINK3args, sizeof(READLINK3args), this},
{NFS3_READ, nfs3_read_proc, (zdrproc_t)zdr_READ3args, sizeof(READ3args), this},
{NFS3_WRITE, nfs3_write_proc, (zdrproc_t)zdr_WRITE3args, sizeof(WRITE3args), this},
{NFS3_CREATE, nfs3_create_proc, (zdrproc_t)zdr_CREATE3args, sizeof(CREATE3args), this},
{NFS3_MKDIR, nfs3_mkdir_proc, (zdrproc_t)zdr_MKDIR3args, sizeof(MKDIR3args), this},
{NFS3_SYMLINK, nfs3_symlink_proc, (zdrproc_t)zdr_SYMLINK3args, sizeof(SYMLINK3args), this},
{NFS3_MKNOD, nfs3_mknod_proc, (zdrproc_t)zdr_MKNOD3args, sizeof(MKNOD3args), this},
{NFS3_REMOVE, nfs3_remove_proc, (zdrproc_t)zdr_REMOVE3args, sizeof(REMOVE3args), this},
{NFS3_RMDIR, nfs3_rmdir_proc, (zdrproc_t)zdr_RMDIR3args, sizeof(RMDIR3args), this},
{NFS3_RENAME, nfs3_rename_proc, (zdrproc_t)zdr_RENAME3args, sizeof(RENAME3args), this},
{NFS3_LINK, nfs3_link_proc, (zdrproc_t)zdr_LINK3args, sizeof(LINK3args), this},
{NFS3_READDIR, nfs3_readdir_proc, (zdrproc_t)zdr_READDIR3args, sizeof(READDIR3args), this},
{NFS3_READDIRPLUS, nfs3_readdirplus_proc, (zdrproc_t)zdr_READDIRPLUS3args, sizeof(READDIRPLUS3args), this},
{NFS3_FSSTAT, nfs3_fsstat_proc, (zdrproc_t)zdr_FSSTAT3args, sizeof(FSSTAT3args), this},
{NFS3_FSINFO, nfs3_fsinfo_proc, (zdrproc_t)zdr_FSINFO3args, sizeof(FSINFO3args), this},
{NFS3_PATHCONF, nfs3_pathconf_proc, (zdrproc_t)zdr_PATHCONF3args, sizeof(PATHCONF3args), this},
{NFS3_COMMIT, nfs3_commit_proc, (zdrproc_t)zdr_COMMIT3args, sizeof(COMMIT3args), this},
};
for (int i = 0; i < sizeof(nfs3_pt_a)/sizeof(service_proc); i++)
{
nfs3_pt.push_back(nfs3_pt_a[i]);
}
struct service_proc nfs3_mount_pt_a[6] = {
{MOUNT3_NULL, nfs3_null_proc, (zdrproc_t)zdr_void, 0, this},
{MOUNT3_MNT, mount3_mnt_proc, (zdrproc_t)zdr_dirpath, sizeof(dirpath), this},
{MOUNT3_DUMP, mount3_dump_proc, (zdrproc_t)zdr_void, 0, this},
{MOUNT3_UMNT, mount3_umnt_proc, (zdrproc_t)zdr_dirpath, sizeof(dirpath), this},
{MOUNT3_UMNTALL, mount3_umntall_proc, (zdrproc_t)zdr_void, 0, this},
{MOUNT3_EXPORT, mount3_export_proc, (zdrproc_t)zdr_void, 0, this},
};
for (int i = 0; i < sizeof(nfs3_mount_pt_a)/sizeof(service_proc); i++)
{
nfs3_mount_pt.push_back(nfs3_mount_pt_a[i]);
}
}
nfs_client_t::~nfs_client_t()
{
if (rpc)
{
rpc_disconnect(rpc, NULL);
rpc_destroy_context(rpc);
}
}

172
src/nfs_portmap.cpp Normal file
View File

@@ -0,0 +1,172 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
//
// Portmap service for NFS proxy
#include <netinet/in.h>
#include <string.h>
#include "nfs_portmap.h"
#include "libnfs-raw-portmap.h"
#include "sha256.h"
#include "base64.h"
/*
* The NULL procedure. All protocols/versions must provide a NULL procedure
* as index 0.
* It is used by clients, and rpcinfo, to "ping" a service and verify that
* the service is available and that it does support the indicated version.
*/
static int pmap2_null_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
rpc_send_reply(rpc, call, NULL, (zdrproc_t)zdr_void, 0);
return 0;
}
/*
* v2 GETPORT.
* This is the lookup function for portmapper version 2.
* A client provides program, version and protocol (tcp or udp)
* and portmapper returns which port that service is available on,
* (or 0 if no such program is registered.)
*/
static int pmap2_getport_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
portmap_service_t *self = (portmap_service_t *)opaque;
PMAP2GETPORTargs *args = (PMAP2GETPORTargs *)call->body.cbody.args;
uint32_t port = 0;
auto it = self->reg_ports.lower_bound((portmap_id_t){
.prog = args->prog,
.vers = args->vers,
.udp = args->prot == IPPROTO_UDP,
.ipv6 = false,
});
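// reg_ports is an ordered set, so lower_bound() lands on the first entry not less
// than the requested (prog, vers, proto) key; the check below verifies an exact match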
if (it != self->reg_ports.end() &&
it->prog == args->prog && it->vers == args->vers &&
it->udp == (args->prot == IPPROTO_UDP))
{
port = it->port;
}
rpc_send_reply(rpc, call, &port, (zdrproc_t)zdr_uint32_t, sizeof(uint32_t));
return 0;
}
/*
* v2 DUMP.
* This RPC returns a list of all endpoints that are registered with
* portmapper.
*/
static int pmap2_dump_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
portmap_service_t *self = (portmap_service_t *)opaque;
pmap2_mapping_list *list = new pmap2_mapping_list[self->reg_ports.size()];
int i = 0;
for (auto it = self->reg_ports.begin(); it != self->reg_ports.end(); it++)
{
if (it->ipv6)
continue;
list[i] = {
.map = {
.prog = it->prog,
.vers = it->vers,
.prot = it->udp ? IPPROTO_UDP : IPPROTO_TCP,
.port = it->port,
},
.next = list+i+1,
};
i++;
}
if (i > 0)
list[i-1].next = NULL;
// Send reply (an empty registration list is encoded as a NULL pointer)
PMAP2DUMPres reply;
reply.list = i > 0 ? list : NULL;
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_PMAP2DUMPres, sizeof(PMAP2DUMPres));
reply.list = NULL;
delete[] list;
return 0;
}
/*
* v3 GETADDR.
* This is the lookup function for portmapper version 3.
*/
static int pmap3_getaddr_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
portmap_service_t *self = (portmap_service_t *)opaque;
PMAP3GETADDRargs *args = (PMAP3GETADDRargs *)call->body.cbody.args;
portmap_id_t ref = (portmap_id_t){
.prog = args->prog,
.vers = args->vers,
.udp = !strcmp(args->netid, "udp") || !strcmp(args->netid, "udp6"),
.ipv6 = !strcmp(args->netid, "tcp6") || !strcmp(args->netid, "udp6"),
};
auto it = self->reg_ports.lower_bound(ref);
PMAP3GETADDRres reply;
if (it != self->reg_ports.end() &&
it->prog == ref.prog && it->vers == ref.vers &&
it->udp == ref.udp && it->ipv6 == ref.ipv6)
{
reply.addr = (char*)it->addr.c_str();
}
else
{
// Not registered - reply with an empty universal address instead of
// leaving reply.addr uninitialized
reply.addr = (char*)"";
}
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_PMAP3GETADDRres, sizeof(PMAP3GETADDRres));
return 0;
}
/*
* v3 DUMP.
* This RPC returns a list of all endpoints that are registered with
* portmapper.
*/
static int pmap3_dump_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
{
portmap_service_t *self = (portmap_service_t *)opaque;
pmap3_mapping_list *list = new pmap3_mapping_list[self->reg_ports.size()];
int i = 0;
for (auto it = self->reg_ports.begin(); it != self->reg_ports.end(); it++)
{
list[i] = (pmap3_mapping_list){
.map = {
.prog = it->prog,
.vers = it->vers,
.netid = (char*)(it->ipv6
? (it->udp ? "udp6" : "tcp6")
: (it->udp ? "udp" : "tcp")),
.addr = (char*)it->addr.c_str(), // universal address (a.b.c.d.p1.p2)
.owner = (char*)it->owner.c_str(),
},
.next = list+i+1,
};
i++;
}
if (i > 0)
list[i-1].next = NULL;
// Send reply (an empty registration list is encoded as a NULL pointer)
PMAP3DUMPres reply;
reply.list = i > 0 ? list : NULL;
rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_PMAP3DUMPres, sizeof(PMAP3DUMPres));
reply.list = NULL;
delete[] list;
return 0;
}
portmap_service_t::portmap_service_t()
{
pmap2_pt.push_back((service_proc){PMAP2_NULL, pmap2_null_proc, (zdrproc_t)zdr_void, 0, this});
pmap2_pt.push_back((service_proc){PMAP2_GETPORT, pmap2_getport_proc, (zdrproc_t)zdr_PMAP2GETPORTargs, sizeof(PMAP2GETPORTargs), this});
pmap2_pt.push_back((service_proc){PMAP2_DUMP, pmap2_dump_proc, (zdrproc_t)zdr_void, 0, this});
pmap3_pt.push_back((service_proc){PMAP3_NULL, pmap2_null_proc, (zdrproc_t)zdr_void, 0, this});
pmap3_pt.push_back((service_proc){PMAP3_GETADDR, pmap3_getaddr_proc, (zdrproc_t)zdr_PMAP3GETADDRargs, sizeof(PMAP3GETADDRargs), this});
pmap3_pt.push_back((service_proc){PMAP3_DUMP, pmap3_dump_proc, (zdrproc_t)zdr_void, 0, this});
}
std::string sha256(const std::string & str)
{
std::string hash;
hash.resize(32);
SHA256_CTX ctx;
sha256_init(&ctx);
sha256_update(&ctx, (uint8_t*)str.data(), str.size());
sha256_final(&ctx, (uint8_t*)hash.data());
return hash;
}
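The GETPORT and GETADDR lookups above rely on reg_ports being an ordered std::set: lower_bound() with a partially filled key lands on the first candidate, and the handler then re-checks that prog/vers/protocol match exactly. A standalone sketch of that pattern with a simplified key type (the names here are illustrative):

#include <cstdio>
#include <set>

struct port_key
{
    unsigned prog, vers;
    bool udp;
};

inline bool operator < (const port_key & a, const port_key & b)
{
    return a.prog < b.prog ||
        (a.prog == b.prog && a.vers < b.vers) ||
        (a.prog == b.prog && a.vers == b.vers && a.udp < b.udp);
}

int main()
{
    std::set<port_key> reg;
    reg.insert((port_key){ .prog = 100003, .vers = 3, .udp = false }); // NFSv3 over TCP
    // lower_bound() returns the first element not less than the key,
    // so the caller still has to verify it's an exact match
    auto it = reg.lower_bound((port_key){ .prog = 100003, .vers = 3, .udp = false });
    bool found = it != reg.end() && it->prog == 100003 && it->vers == 3 && !it->udp;
    printf("NFSv3/tcp registered: %s\n", found ? "yes" : "no");
    return 0;
}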

src/nfs_portmap.h (new file, +41 lines)

@@ -0,0 +1,41 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
//
// Portmap service for NFS proxy
#pragma once
#include <string>
#include <set>
#include <vector>
#include "nfsc/libnfs.h"
#include "nfsc/libnfs-raw.h"
struct portmap_id_t
{
unsigned prog, vers;
bool udp;
bool ipv6;
unsigned port;
std::string owner;
std::string addr;
};
class portmap_service_t
{
public:
std::set<portmap_id_t> reg_ports;
std::vector<service_proc> pmap2_pt;
std::vector<service_proc> pmap3_pt;
portmap_service_t();
};
inline bool operator < (const portmap_id_t &a, const portmap_id_t &b)
{
return a.prog < b.prog ||
(a.prog == b.prog && a.vers < b.vers) ||
(a.prog == b.prog && a.vers == b.vers && a.udp < b.udp) ||
(a.prog == b.prog && a.vers == b.vers && a.udp == b.udp && a.ipv6 < b.ipv6);
}
std::string sha256(const std::string & str);

src/nfs_proxy.cpp (new file, +301 lines)

@@ -0,0 +1,301 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
//
// Simplified NFS proxy
// Presents all images as files, stores small files directly in etcd
// Keeps image list in memory and thus is unsuitable for a lot of files
#include <netinet/tcp.h>
#include <sys/epoll.h>
#include <sys/poll.h>
#include <unistd.h>
#include <fcntl.h>
//#include <signal.h>
#include "libnfs-raw-mount.h"
#include "libnfs-raw-nfs.h"
#include "libnfs-raw-portmap.h"
#include "addr_util.h"
#include "base64.h"
#include "nfs_proxy.h"
const char *exe_name = NULL;
nfs_proxy_t::~nfs_proxy_t()
{
if (cli)
delete cli;
if (epmgr)
delete epmgr;
if (ringloop)
delete ringloop;
}
json11::Json::object nfs_proxy_t::parse_args(int narg, const char *args[])
{
json11::Json::object cfg;
for (int i = 1; i < narg; i++)
{
if (!strcmp(args[i], "-h") || !strcmp(args[i], "--help"))
{
printf(
"Vitastor NFS 3.0 proxy\n"
"(c) Vitaliy Filippov, 2021-2022 (VNPL-1.1)\n\n"
"USAGE:\n"
" %s [--etcd_address ADDR] [OTHER OPTIONS]\n",
exe_name
);
exit(0);
}
else if (args[i][0] == '-' && args[i][1] == '-')
{
const char *opt = args[i]+2;
cfg[opt] = !strcmp(opt, "json") || i == narg-1 ? "1" : args[++i];
}
}
return cfg;
}
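// Example (illustrative): "vitastor-nfs --etcd_address 10.0.0.1:2379 --json"
// parses to { "etcd_address": "10.0.0.1:2379", "json": "1" }; "--json" is a
// boolean flag, any other "--opt" consumes the following argument as its value.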
void nfs_proxy_t::run(json11::Json cfg)
{
bind_address = cfg["bind_address"].string_value();
if (bind_address == "")
bind_address = "0.0.0.0";
// Create client
ringloop = new ring_loop_t(512);
epmgr = new epoll_manager_t(ringloop);
cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
// We need inode name hashes for NFS handles to remain stateless and <= 64 bytes long
dir_mod_rev[""] = 0;
dir_ids[""] = 1;
assert(cli->st_cli.on_inode_change_hook == NULL);
cli->st_cli.on_inode_change_hook = [this](inode_t changed_inode, bool removed)
{
if (removed)
{
auto ino_it = hash_by_inode.find(changed_inode);
if (ino_it != hash_by_inode.end())
{
inode_by_hash.erase(ino_it->second);
hash_by_inode.erase(ino_it);
}
// FIXME also calculate dir_mod_rev
}
else
{
auto & inode_cfg = cli->st_cli.inode_config.at(changed_inode);
std::string name = inode_cfg.name;
if (name_prefix != "")
{
if (name.substr(0, name_prefix.size()) != name_prefix)
return;
name = name.substr(name_prefix.size());
}
dir_mod_rev[""] = dir_mod_rev[""] < inode_cfg.mod_revision ? inode_cfg.mod_revision : dir_mod_rev[""];
std::string hash = "S"+base64_encode(sha256(name));
int pos = name.find('/');
while (pos >= 0)
{
std::string dir = name.substr(0, pos);
if (dir_ids.find(dir) == dir_ids.end())
dir_ids[dir] = next_dir_id++;
dir_mod_rev[dir] = dir_mod_rev[dir] < inode_cfg.mod_revision ? inode_cfg.mod_revision : dir_mod_rev[dir];
dir_by_hash["S"+base64_encode(sha256(dir))] = dir;
int next = name.substr(pos+1).find('/');
pos = next < 0 ? -1 : pos+1+next;
}
auto hbi_it = hash_by_inode.find(changed_inode);
if (hbi_it != hash_by_inode.end() && hbi_it->second != hash)
{
// inode had a different name, remove old hash=>inode pointer
inode_by_hash.erase(hbi_it->second);
}
inode_by_hash[hash] = changed_inode;
hash_by_inode[changed_inode] = hash;
}
};
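// Example (illustrative): with name_prefix "nfs/", an image "nfs/a/b/img"
// registers directories "a" and "a/b" in dir_ids/dir_by_hash and maps
// "S"+base64(sha256("a/b/img")) => inode, so an NFS file handle can be
// resolved later without keeping any per-client state.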
// Load image metadata
while (!cli->is_ready())
{
ringloop->loop();
if (cli->is_ready())
break;
ringloop->wait();
}
// Create portmap socket
int portmap_socket = create_and_bind_socket(bind_address, 111, 128, NULL);
fcntl(portmap_socket, F_SETFL, fcntl(portmap_socket, F_GETFL, 0) | O_NONBLOCK);
// Create NFS socket
int nfs_socket = create_and_bind_socket(bind_address, 2049, 128, NULL);
fcntl(nfs_socket, F_SETFL, fcntl(nfs_socket, F_GETFL, 0) | O_NONBLOCK);
// Self-register portmap and NFS
pmap.reg_ports.insert((portmap_id_t){
.prog = PMAP_PROGRAM,
.vers = PMAP_V2,
.port = 111,
.owner = "portmapper-service",
.addr = "0.0.0.0.0.111",
});
pmap.reg_ports.insert((portmap_id_t){
.prog = PMAP_PROGRAM,
.vers = PMAP_V3,
.port = 111,
.owner = "portmapper-service",
.addr = "0.0.0.0.0.111",
});
pmap.reg_ports.insert((portmap_id_t){
.prog = NFS_PROGRAM,
.vers = NFS_V3,
.port = 2049,
.owner = "nfs-server",
.addr = "0.0.0.0.0.2049",
});
pmap.reg_ports.insert((portmap_id_t){
.prog = MOUNT_PROGRAM,
.vers = MOUNT_V3,
.port = 2049,
.owner = "rpc.mountd",
.addr = "0.0.0.0.0.2049",
});
// Add FDs to epoll
epmgr->tfd->set_fd_handler(portmap_socket, false, [this](int portmap_socket, int epoll_events)
{
if (epoll_events & EPOLLRDHUP)
{
fprintf(stderr, "Listening portmap socket disconnected, exiting\n");
exit(1);
}
else
{
do_accept(portmap_socket);
}
});
epmgr->tfd->set_fd_handler(nfs_socket, false, [this](int nfs_socket, int epoll_events)
{
if (epoll_events & EPOLLRDHUP)
{
fprintf(stderr, "Listening portmap socket disconnected, exiting\n");
exit(1);
}
else
{
do_accept(nfs_socket);
}
});
if (cfg["foreground"].is_null())
{
daemonize();
}
while (true)
{
ringloop->loop();
ringloop->wait();
}
/*// Sync at the end
cluster_op_t *close_sync = new cluster_op_t;
close_sync->opcode = OSD_OP_SYNC;
close_sync->callback = [&stop](cluster_op_t *op)
{
stop = true;
delete op;
};
cli->execute(close_sync);*/
// Destroy the client
delete cli;
delete epmgr;
delete ringloop;
cli = NULL;
epmgr = NULL;
ringloop = NULL;
}
void nfs_proxy_t::do_accept(int listen_fd)
{
struct sockaddr_storage addr;
socklen_t addr_size = sizeof(addr);
int nfs_fd = 0;
while ((nfs_fd = accept(listen_fd, (struct sockaddr *)&addr, &addr_size)) >= 0)
{
fprintf(stderr, "New client %d: connection from %s\n", nfs_fd, addr_to_string(addr).c_str());
fcntl(nfs_fd, F_SETFL, fcntl(nfs_fd, F_GETFL, 0) | O_NONBLOCK);
int one = 1;
setsockopt(nfs_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
auto cli = new nfs_client_t();
cli->parent = this;
cli->nfs_fd = nfs_fd;
cli->rpc = rpc_init_server_context(nfs_fd);
if (!cli->rpc)
{
delete cli;
close(nfs_fd);
fprintf(stderr, "Failed to init libnfs server context\n");
exit(1);
}
// Use both portmap and NFS everywhere
rpc_register_service(cli->rpc, PMAP_PROGRAM, PMAP_V2, pmap.pmap2_pt.data(), pmap.pmap2_pt.size());
rpc_register_service(cli->rpc, PMAP_PROGRAM, PMAP_V3, pmap.pmap3_pt.data(), pmap.pmap3_pt.size());
rpc_register_service(cli->rpc, NFS_PROGRAM, NFS_V3, cli->nfs3_pt.data(), cli->nfs3_pt.size());
rpc_register_service(cli->rpc, MOUNT_PROGRAM, MOUNT_V3, cli->nfs3_mount_pt.data(), cli->nfs3_mount_pt.size());
epmgr->tfd->set_fd_handler(nfs_fd, true, [this, cli](int nfs_fd, int epoll_events)
{
// Handle incoming event
if (epoll_events & EPOLLRDHUP)
{
fprintf(stderr, "Client %d disconnected\n", nfs_fd);
epmgr->tfd->set_fd_handler(cli->nfs_fd, true, NULL);
delete cli;
close(nfs_fd);
return;
}
int revents = 0;
if (epoll_events & EPOLLIN)
revents |= POLLIN;
if (epoll_events & EPOLLOUT)
revents |= POLLOUT;
// Let libnfs process the event
if (rpc_service(cli->rpc, revents) < 0)
{
fprintf(stderr, "libnfs error: %s, disconnecting client %d\n", rpc_get_error(cli->rpc), nfs_fd);
epmgr->tfd->set_fd_handler(cli->nfs_fd, true, NULL);
delete cli;
close(nfs_fd);
return;
}
// FIXME Add/remove events based on rpc_which_events(rpc) ?
});
}
if (nfs_fd < 0 && errno != EAGAIN)
{
fprintf(stderr, "Failed to accept connection: %s\n", strerror(errno));
exit(1);
}
}
void nfs_proxy_t::daemonize()
{
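// Classic double fork: the first fork() plus setsid() detaches from the
// controlling terminal, the second fork() ensures the daemon can never
// reacquire one.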
if (fork())
exit(0);
setsid();
if (fork())
exit(0);
if (chdir("/") != 0)
fprintf(stderr, "Warning: Failed to chdir into /\n");
close(0);
close(1);
close(2);
open("/dev/null", O_RDONLY);
open("/dev/null", O_WRONLY);
open("/dev/null", O_WRONLY);
}
int main(int narg, const char *args[])
{
setvbuf(stdout, NULL, _IONBF, 0);
setvbuf(stderr, NULL, _IONBF, 0);
exe_name = args[0];
nfs_proxy_t *p = new nfs_proxy_t();
p->run(nfs_proxy_t::parse_args(narg, args));
delete p;
return 0;
}

src/nfs_proxy.h (new file, +47 lines)

@@ -0,0 +1,47 @@
#pragma once
#include "cluster_client.h"
#include "epoll_manager.h"
#include "nfs_portmap.h"
#include "nfsc/libnfs-raw.h"
class nfs_proxy_t
{
public:
std::string bind_address;
std::string name_prefix;
int fsid = 1;
portmap_service_t pmap;
ring_loop_t *ringloop = NULL;
epoll_manager_t *epmgr = NULL;
cluster_client_t *cli = NULL;
uint64_t next_dir_id = 2;
std::map<std::string, std::string> dir_by_hash;
std::map<std::string, uint64_t> dir_ids;
std::map<std::string, uint64_t> dir_mod_rev;
std::map<inode_t, std::string> hash_by_inode;
std::map<std::string, inode_t> inode_by_hash;
~nfs_proxy_t();
static json11::Json::object parse_args(int narg, const char *args[]);
void run(json11::Json cfg);
void do_accept(int listen_fd);
void daemonize();
};
class nfs_client_t
{
public:
nfs_proxy_t *parent = NULL;
int nfs_fd;
struct rpc_context *rpc = NULL;
std::vector<service_proc> nfs3_pt;
std::vector<service_proc> nfs3_mount_pt;
nfs_client_t();
~nfs_client_t();
};

@@ -3,6 +3,7 @@
#include <sys/socket.h>
#include <sys/poll.h>
#include <sys/mman.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>
@@ -53,6 +54,20 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
autosync_writes = max_autosync;
}
if (this->config["osd_memlock"] == "true" || this->config["osd_memlock"] == "1" || this->config["osd_memlock"] == "yes")
{
// Lock all OSD memory if requested
if (mlockall(MCL_CURRENT|MCL_FUTURE
#ifdef MCL_ONFAULT
| MCL_ONFAULT
#endif
) != 0)
{
fprintf(stderr, "osd_memlock is set to true, but mlockall() failed: %s\n", strerror(errno));
exit(-1);
}
}
this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
{
print_stats();
@@ -185,46 +200,7 @@ void osd_t::bind_socket()
// FIXME Support multiple listening sockets
sockaddr addr;
if (!string_to_addr(bind_address, 0, bind_port, &addr))
{
throw std::runtime_error("bind address "+bind_address+" is not valid");
}
listen_fd = socket(addr.sa_family, SOCK_STREAM, 0);
if (listen_fd < 0)
{
throw std::runtime_error(std::string("socket: ") + strerror(errno));
}
int enable = 1;
setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
if (bind(listen_fd, &addr, sizeof(addr)) < 0)
{
close(listen_fd);
throw std::runtime_error(std::string("bind: ") + strerror(errno));
}
if (bind_port == 0)
{
socklen_t len = sizeof(addr);
if (getsockname(listen_fd, (sockaddr *)&addr, &len) == -1)
{
close(listen_fd);
throw std::runtime_error(std::string("getsockname: ") + strerror(errno));
}
listening_port = ntohs(((sockaddr_in*)&addr)->sin_port);
}
else
{
listening_port = bind_port;
}
if (listen(listen_fd, listen_backlog) < 0)
{
close(listen_fd);
throw std::runtime_error(std::string("listen: ") + strerror(errno));
}
listen_fd = create_and_bind_socket(bind_address, bind_port, listen_backlog, &listening_port);
fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
epmgr->set_fd_handler(listen_fd, false, [this](int fd, int events)
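The inline socket setup removed above is essentially what the new addr_util helper has to encapsulate. A minimal sketch of such a helper, assuming the create_and_bind_socket(addr, port, backlog, &listening_port) signature used here — the real implementation may differ (it likely also handles IPv6):

#include <arpa/inet.h>
#include <errno.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>
#include <stdexcept>
#include <string>

// Sketch: bind <address>:<port>, start listening and report the actual
// port (useful when port == 0 and the kernel picks one). IPv4-only for brevity.
static int create_and_bind_socket_sketch(const std::string & bind_address,
    int bind_port, int listen_backlog, int *listening_port)
{
    sockaddr_in addr = {};
    addr.sin_family = AF_INET;
    addr.sin_port = htons(bind_port);
    if (inet_pton(AF_INET, bind_address.c_str(), &addr.sin_addr) != 1)
        throw std::runtime_error("bind address "+bind_address+" is not valid");
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    if (fd < 0)
        throw std::runtime_error(std::string("socket: ") + strerror(errno));
    int enable = 1;
    setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
    if (bind(fd, (sockaddr*)&addr, sizeof(addr)) < 0)
    {
        close(fd);
        throw std::runtime_error(std::string("bind: ") + strerror(errno));
    }
    if (listening_port)
    {
        // With port 0 the kernel chooses a free port - read it back
        socklen_t len = sizeof(addr);
        getsockname(fd, (sockaddr*)&addr, &len);
        *listening_port = ntohs(addr.sin_port);
    }
    if (listen(fd, listen_backlog) < 0)
    {
        close(fd);
        throw std::runtime_error(std::string("listen: ") + strerror(errno));
    }
    return fd;
}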
@@ -331,8 +307,8 @@ void osd_t::exec_op(osd_op_t *cur_op)
void osd_t::reset_stats()
{
msgr.stats = { 0 };
prev_stats = { 0 };
msgr.stats = {};
prev_stats = {};
memset(recovery_stat_count, 0, sizeof(recovery_stat_count));
memset(recovery_stat_bytes, 0, sizeof(recovery_stat_bytes));
}
@@ -447,7 +423,7 @@ void osd_t::print_slow()
{
for (uint64_t i = 0; i < op->req.sec_stab.len; i += sizeof(obj_ver_id))
{
obj_ver_id *ov = (obj_ver_id*)(op->buf + i);
obj_ver_id *ov = (obj_ver_id*)((uint8_t*)op->buf + i);
bufprintf(i == 0 ? " %lx:%lx v%lu" : ", %lx:%lx v%lu", ov->oid.inode, ov->oid.stripe, ov->version);
}
}

@@ -102,7 +102,7 @@ class osd_t
bool no_rebalance = false;
bool no_recovery = false;
std::string bind_address;
int bind_port, listen_backlog;
int bind_port, listen_backlog = 128;
// FIXME: Implement client queue depth limit
int client_queue_depth = 128;
bool allow_test_ops = false;
@@ -166,8 +166,8 @@ class osd_t
osd_op_stats_t prev_stats;
std::map<uint64_t, inode_stats_t> inode_stats;
const char* recovery_stat_names[2] = { "degraded", "misplaced" };
uint64_t recovery_stat_count[2][2] = { 0 };
uint64_t recovery_stat_bytes[2][2] = { 0 };
uint64_t recovery_stat_count[2][2] = {};
uint64_t recovery_stat_bytes[2][2] = {};
// cluster connection
void parse_config(const json11::Json & config);
@@ -211,7 +211,7 @@ class osd_t
// flushing, recovery and backfill
void submit_pg_flush_ops(pg_t & pg);
void handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval);
void submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data);
bool submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data);
bool pick_next_recovery(osd_recovery_op_t &op);
void submit_recovery_op(osd_recovery_op_t *op);
bool continue_recovery();

@@ -6,6 +6,7 @@
#include "etcd_state_client.h"
#include "http_client.h"
#include "osd_rmw.h"
#include "addr_util.h"
// Startup sequence:
// Start etcd watcher -> Load global OSD configuration -> Bind socket -> Acquire lease -> Report&lock OSD state
@@ -188,7 +189,7 @@ void osd_t::report_statistics()
for (auto kv: bs->get_inode_space_stats())
{
pool_id_t pool_id = INODE_POOL(kv.first);
uint64_t only_inode_num = (kv.first & ((1l << (64-POOL_ID_BITS)) - 1));
uint64_t only_inode_num = (kv.first & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1));
if (!last_pool || pool_id != last_pool)
{
if (last_pool)
@@ -206,7 +207,7 @@ void osd_t::report_statistics()
for (auto kv: inode_stats)
{
pool_id_t pool_id = INODE_POOL(kv.first);
uint64_t only_inode_num = (kv.first & ((1l << (64-POOL_ID_BITS)) - 1));
uint64_t only_inode_num = (kv.first & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1));
if (!last_pool || pool_id != last_pool)
{
if (last_pool)
@@ -276,14 +277,14 @@ void osd_t::report_statistics()
} }
});
}
st_cli.etcd_txn(json11::Json::object { { "success", txn } }, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json res)
st_cli.etcd_txn_slow(json11::Json::object { { "success", txn } }, [this](std::string err, json11::Json res)
{
etcd_reporting_stats = false;
if (err != "")
{
printf("[OSD %lu] Error reporting state to etcd: %s\n", this->osd_num, err.c_str());
// Retry indefinitely
tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
tfd->set_timer(st_cli.etcd_slow_timeout, false, [this](int timer_id)
{
report_statistics();
});
@@ -354,13 +355,13 @@ void osd_t::acquire_lease()
{
// Maximum lease TTL is (report interval) + retries * (timeout + repeat interval)
st_cli.etcd_call("/lease/grant", json11::Json::object {
{ "TTL", etcd_report_interval+(MAX_ETCD_ATTEMPTS*(2*ETCD_QUICK_TIMEOUT)+999)/1000 }
}, ETCD_QUICK_TIMEOUT, [this](std::string err, json11::Json data)
{ "TTL", etcd_report_interval+(st_cli.max_etcd_attempts*(2*st_cli.etcd_quick_timeout)+999)/1000 }
}, st_cli.etcd_quick_timeout, 0, 0, [this](std::string err, json11::Json data)
{
if (err != "" || data["ID"].string_value() == "")
{
printf("Error acquiring a lease from etcd: %s\n", err.c_str());
tfd->set_timer(ETCD_QUICK_TIMEOUT, false, [this](int timer_id)
printf("Error acquiring a lease from etcd: %s, retrying\n", err.c_str());
tfd->set_timer(st_cli.etcd_quick_timeout, false, [this](int timer_id)
{
acquire_lease();
});
@@ -407,19 +408,19 @@ void osd_t::create_osd_state()
} }
},
} },
}, ETCD_QUICK_TIMEOUT, [this](std::string err, json11::Json data)
}, st_cli.etcd_quick_timeout, 0, 0, [this](std::string err, json11::Json data)
{
if (err != "")
{
etcd_failed_attempts++;
printf("Error creating OSD state key: %s\n", err.c_str());
if (etcd_failed_attempts > MAX_ETCD_ATTEMPTS)
if (etcd_failed_attempts > st_cli.max_etcd_attempts)
{
// Die
throw std::runtime_error("Cluster connection failed");
}
// Retry
tfd->set_timer(ETCD_QUICK_TIMEOUT, false, [this](int timer_id)
tfd->set_timer(st_cli.etcd_quick_timeout, false, [this](int timer_id)
{
create_osd_state();
});
@@ -451,24 +452,26 @@ void osd_t::renew_lease()
{
st_cli.etcd_call("/lease/keepalive", json11::Json::object {
{ "ID", etcd_lease_id }
}, ETCD_QUICK_TIMEOUT, [this](std::string err, json11::Json data)
}, st_cli.etcd_quick_timeout, 0, 0, [this](std::string err, json11::Json data)
{
if (err == "" && data["result"]["TTL"].string_value() == "")
{
// Die
throw std::runtime_error("etcd lease has expired");
fprintf(stderr, "Error refreshing etcd lease\n");
force_stop(1);
}
if (err != "")
{
etcd_failed_attempts++;
printf("Error renewing etcd lease: %s\n", err.c_str());
if (etcd_failed_attempts > MAX_ETCD_ATTEMPTS)
if (etcd_failed_attempts > st_cli.max_etcd_attempts)
{
// Die
throw std::runtime_error("Cluster connection failed");
fprintf(stderr, "Cluster connection failed\n");
force_stop(1);
}
// Retry
tfd->set_timer(ETCD_QUICK_TIMEOUT, false, [this](int timer_id)
tfd->set_timer(st_cli.etcd_quick_timeout, false, [this](int timer_id)
{
renew_lease();
});
@@ -487,7 +490,7 @@ void osd_t::force_stop(int exitcode)
{
st_cli.etcd_call("/kv/lease/revoke", json11::Json::object {
{ "ID", etcd_lease_id }
}, ETCD_QUICK_TIMEOUT, [this, exitcode](std::string err, json11::Json data)
}, st_cli.etcd_quick_timeout, st_cli.max_etcd_attempts, 0, [this, exitcode](std::string err, json11::Json data)
{
if (err != "")
{
@@ -825,7 +828,7 @@ void osd_t::report_pg_states()
etcd_reporting_pg_state = true;
st_cli.etcd_txn(json11::Json::object {
{ "compare", checks }, { "success", success }, { "failure", failure }
}, ETCD_QUICK_TIMEOUT, [this, reporting_pgs](std::string err, json11::Json data)
}, st_cli.etcd_quick_timeout, 0, 0, [this, reporting_pgs](std::string err, json11::Json data)
{
etcd_reporting_pg_state = false;
if (!data["succeeded"].bool_value())
@@ -857,10 +860,13 @@ void osd_t::report_pg_states()
if (null_byte == 0)
{
auto pg_it = pgs.find({ .pool_id = pool_id, .pg_num = pg_num });
if (pg_it != pgs.end() && pg_it->second.state != PG_OFFLINE && pg_it->second.state != PG_STARTING)
if (pg_it != pgs.end() && pg_it->second.state != PG_OFFLINE && pg_it->second.state != PG_STARTING &&
kv.value["primary"].uint64_value() != 0 &&
kv.value["primary"].uint64_value() != this->osd_num)
{
// Live PG state update failed
printf("Failed to report state of pool %u PG %u which is live. Race condition detected, exiting\n", pool_id, pg_num);
// PG is somehow captured by another OSD
printf("BUG: OSD %lu captured our PG %u/%u. Race condition detected, exiting\n",
kv.value["primary"].uint64_value(), pool_id, pg_num);
force_stop(1);
return;
}

@@ -47,7 +47,8 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
if (l.second.size() > 0)
{
fb->flush_ops++;
submit_flush_op(pg.pool_id, pg.pg_num, fb, true, l.first, l.second.size(), l.second.data());
if (!submit_flush_op(pg.pool_id, pg.pg_num, fb, true, l.first, l.second.size(), l.second.data()))
return;
}
}
for (auto & l: fb->stable_lists)
@@ -55,7 +56,8 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
if (l.second.size() > 0)
{
fb->flush_ops++;
submit_flush_op(pg.pool_id, pg.pg_num, fb, false, l.first, l.second.size(), l.second.data());
if (!submit_flush_op(pg.pool_id, pg.pg_num, fb, false, l.first, l.second.size(), l.second.data()))
return;
}
}
}
@@ -160,7 +162,7 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
}
}
void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data)
bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data)
{
osd_op_t *op = new osd_op_t();
// Copy buffer so it gets freed along with the operation
@@ -188,10 +190,8 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
else
{
// Peer
int peer_fd = msgr.osd_peer_fds[peer_osd];
op->op_type = OSD_OP_OUT;
op->iov.push_back(op->buf, count * sizeof(obj_ver_id));
op->peer_fd = peer_fd;
op->req = (osd_any_op_t){
.sec_stab = {
.header = {
@@ -207,8 +207,21 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
handle_flush_op(op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK, pool_id, pg_num, fb, peer_osd, op->reply.hdr.retval);
delete op;
};
msgr.outbox_push(op);
auto peer_fd_it = msgr.osd_peer_fds.find(peer_osd);
if (peer_fd_it != msgr.osd_peer_fds.end())
{
op->peer_fd = peer_fd_it->second;
msgr.outbox_push(op);
}
else
{
// Fail it immediately
op->reply.hdr.retval = -EPIPE;
op->callback(op);
return false;
}
}
return true;
}
bool osd_t::pick_next_recovery(osd_recovery_op_t &op)

@@ -9,7 +9,7 @@
#define POOL_ID_MAX 0x10000
#define POOL_ID_BITS 16
#define INODE_POOL(inode) (pool_id_t)((inode) >> (64 - POOL_ID_BITS))
#define INODE_NO_POOL(inode) (inode_t)(inode & ((1l << (64-POOL_ID_BITS)) - 1))
#define INODE_NO_POOL(inode) (inode_t)(inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1))
#define INODE_WITH_POOL(pool_id, inode) (((inode_t)(pool_id) << (64-POOL_ID_BITS)) | INODE_NO_POOL(inode))
// Pool ID is 16 bits long
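The (uint64_t)1 form matters for 32-bit builds: there long is 32 bits wide, so 1l << (64-POOL_ID_BITS) shifts past the width of the type, which is undefined behavior. A small illustration of the portable variant:

#include <stdint.h>
#include <stdio.h>

#define POOL_ID_BITS 16

int main(void)
{
    // (uint64_t)1 is 64 bits wide on every platform, so shifting it by 48
    // is always defined; 1l is only safe where long happens to be 64 bits
    uint64_t inode_mask = ((uint64_t)1 << (64 - POOL_ID_BITS)) - 1;
    printf("inode mask = %016llx\n", (unsigned long long)inode_mask);
    return 0;
}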

@@ -27,10 +27,12 @@ void osd_t::handle_peers()
misplaced_objects += p.second.misplaced_objects.size();
// FIXME: degraded objects may currently include misplaced, too! Report them separately?
degraded_objects += p.second.degraded_objects.size();
if ((p.second.state & (PG_ACTIVE | PG_HAS_UNCLEAN)) == (PG_ACTIVE | PG_HAS_UNCLEAN))
if (p.second.state & PG_HAS_UNCLEAN)
peering_state = peering_state | OSD_FLUSHING_PGS;
else if (p.second.state & PG_ACTIVE)
else if (p.second.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED))
peering_state = peering_state | OSD_RECOVERING;
ringloop->wakeup();
return;
}
else
{
@@ -176,6 +178,17 @@ void osd_t::start_pg_peering(pg_t & pg)
msgr.stop_client(peer_fd);
}
}
// Try to connect with current peers if they're up, but we don't have connections to them
// Otherwise we may erroneously decide that the pg is incomplete :-)
for (auto pg_osd: pg.all_peers)
{
if (pg_osd != this->osd_num &&
msgr.osd_peer_fds.find(pg_osd) == msgr.osd_peer_fds.end() &&
msgr.wanted_peers.find(pg_osd) == msgr.wanted_peers.end())
{
msgr.connect_peer(pg_osd, st_cli.peer_states[pg_osd]);
}
}
// Calculate current write OSD set
pg.pg_cursize = 0;
pg.cur_set.resize(pg.target_set.size());
@@ -207,10 +220,6 @@ void osd_t::start_pg_peering(pg_t & pg)
{
cur_peers.insert(pg_osd);
}
else if (msgr.wanted_peers.find(pg_osd) == msgr.wanted_peers.end())
{
msgr.connect_peer(pg_osd, st_cli.peer_states[pg_osd]);
}
}
if (pg.target_history.size())
{
@@ -333,7 +342,7 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
else
{
// Peer
auto & cl = msgr.clients.at(msgr.osd_peer_fds[role_osd]);
auto & cl = msgr.clients.at(msgr.osd_peer_fds.at(role_osd));
osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT;
op->peer_fd = cl->peer_fd;
@@ -387,7 +396,9 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
{
if (op->bs_op->retval < 0)
{
throw std::runtime_error("local OP_LIST failed");
printf("Local OP_LIST failed: retval=%d\n", op->bs_op->retval);
force_stop(1);
return;
}
add_bs_subop_stats(op);
printf(
@@ -412,7 +423,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
// Peer
osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT;
op->peer_fd = msgr.osd_peer_fds[role_osd];
op->peer_fd = msgr.osd_peer_fds.at(role_osd);
op->req = (osd_any_op_t){
.sec_list = {
.header = {

@@ -437,7 +437,7 @@ void pg_t::calc_object_states(int log_level)
st.walk();
if (this->state & (PG_DEGRADED|PG_LEFT_ON_DEAD))
{
assert(epoch != ((1ul << PG_EPOCH_BITS)-1));
assert(epoch != (((uint64_t)1 << PG_EPOCH_BITS)-1));
epoch++;
}
}

@@ -96,11 +96,11 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
(pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 0 : pg_it->second.pg_size)
)
);
void *data_buf = ((void*)op_data) + sizeof(osd_primary_op_data_t);
void *data_buf = (uint8_t*)op_data + sizeof(osd_primary_op_data_t);
op_data->pg_num = pg_num;
op_data->oid = oid;
op_data->stripes = (osd_rmw_stripe_t*)data_buf;
data_buf += sizeof(osd_rmw_stripe_t) * stripe_count;
data_buf = (uint8_t*)data_buf + sizeof(osd_rmw_stripe_t) * stripe_count;
op_data->scheme = pool_cfg.scheme;
op_data->pg_data_size = pg_data_size;
op_data->pg_size = pg_it->second.pg_size;
@@ -110,17 +110,17 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
for (int i = 0; i < stripe_count; i++)
{
op_data->stripes[i].bmp_buf = data_buf;
data_buf += clean_entry_bitmap_size;
data_buf = (uint8_t*)data_buf + clean_entry_bitmap_size;
}
op_data->chain_size = chain_size;
if (chain_size > 0)
{
op_data->read_chain = (inode_t*)data_buf;
data_buf += sizeof(inode_t) * chain_size;
data_buf = (uint8_t*)data_buf + sizeof(inode_t) * chain_size;
op_data->snapshot_bitmaps = data_buf;
data_buf += chain_size * stripe_count * clean_entry_bitmap_size;
data_buf = (uint8_t*)data_buf + chain_size * stripe_count * clean_entry_bitmap_size;
op_data->missing_flags = (uint8_t*)data_buf;
data_buf += chain_size * (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 0 : pg_it->second.pg_size);
data_buf = (uint8_t*)data_buf + chain_size * (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 0 : pg_it->second.pg_size);
// Copy chain
int chain_num = 0;
op_data->read_chain[chain_num++] = cur_op->req.rw.inode;
@@ -194,18 +194,22 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
// Determine version
auto vo_it = pg.ver_override.find(op_data->oid);
op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
op_data->prev_set = pg.cur_set.data();
if (pg.state != PG_ACTIVE)
{
// PG may be degraded or have misplaced objects
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
}
if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
{
// Fast happy-path
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0);
submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, pg.cur_set.data(), cur_op);
submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);
op_data->st = 1;
}
else
{
// PG may be degraded or have misplaced objects
uint64_t* cur_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
if (extend_missing_stripes(op_data->stripes, cur_set, op_data->pg_data_size, pg.pg_size) < 0)
if (extend_missing_stripes(op_data->stripes, op_data->prev_set, op_data->pg_data_size, pg.pg_size) < 0)
{
finish_op(cur_op, -EIO);
return;
@@ -215,7 +219,7 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
op_data->scheme = pg.scheme;
op_data->degraded = 1;
cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_size, 0);
submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, cur_set, cur_op);
submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);
op_data->st = 1;
}
}
@@ -248,7 +252,7 @@ resume_2:
{
// Send buffer in parts to avoid copying
cur_op->iov.push_back(
stripes[role].read_buf + (stripes[role].req_start - stripes[role].read_start),
(uint8_t*)stripes[role].read_buf + (stripes[role].req_start - stripes[role].read_start),
stripes[role].req_end - stripes[role].req_start
);
}

Some files were not shown because too many files have changed in this diff.