Compare commits
73 Commits
| SHA1 |
|---|
| 06f4e0fcce |
| f285cfc483 |
| 12b50b421d |
| 9f6d09428d |
| 580025cfc9 |
| 13e2d3ce7c |
| c5b00f897a |
| e847e26912 |
| 3393463466 |
| bd96a6194a |
| 601fe10c28 |
| 63dbc9ca85 |
| aa0c363c39 |
| ce52c5589e |
| aee20ab1ee |
| bb81992fac |
| a28f401aff |
| 4ac7e096fd |
| b6171a4599 |
| 28045f230c |
| 10e867880f |
| 012462171a |
| 904793cdab |
| 45c01db2de |
| 8c9206cecd |
| e8c46ededa |
| e9b321a0e0 |
| 09a77991ae |
| 29d8c9b6f3 |
| 20321aaaef |
| 987b005356 |
| 41754b748b |
| 31913256f3 |
| 0ee36baed7 |
| 19e2d9d6fa |
| bfc7e61909 |
| 7da4868b37 |
| b5c020ce0b |
| 6b33ae973d |
| cf36445359 |
| 3fd873d263 |
| a00e8ae9ed |
| 75674545dc |
| 225eb2fe3d |
| 7e82573ed0 |
| 12a6bed2d5 |
| 5524dbdab7 |
| cd3dec06ac |
| 371d79e059 |
| 0e888e6c60 |
| 408c21d8f0 |
| 43cb9ae212 |
| e15b6e7805 |
| 31017d8412 |
| 4819854064 |
| 1f509cca77 |
| aa8e8e8271 |
| 4d79e531c5 |
| 30dff8893f |
| becf14a705 |
| 64388788c1 |
| 37653abe4b |
| 7c054c6f10 |
| bb7709e824 |
| ebeace5a2d |
| a378789f10 |
| 1fe678e57b |
| 2e592a2f22 |
| b92f644e3a |
| 890ea3dbc0 |
| 06630369bf |
| b4740acf62 |
| eae81bbda6 |
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
 
 project(vitastor)
 
-set(VERSION "1.1.0")
+set(VERSION "1.2.0")
 
 add_subdirectory(src)
@@ -50,6 +50,7 @@ Vitastor поддерживает QEMU-драйвер, протоколы NBD и
 - Параметры
 - [Общие](docs/config/common.ru.md)
 - [Сетевые](docs/config/network.ru.md)
+- [Клиентский код](docs/config/client.en.md)
 - [Глобальные дисковые параметры](docs/config/layout-cluster.ru.md)
 - [Дисковые параметры OSD](docs/config/layout-osd.ru.md)
 - [Прочие параметры OSD](docs/config/osd.ru.md)
@@ -50,6 +50,7 @@ Read more details below in the documentation.
 - Parameter Reference
 - [Common](docs/config/common.en.md)
 - [Network](docs/config/network.en.md)
+- [Client](docs/config/client.en.md)
 - [Global Disk Layout](docs/config/layout-cluster.en.md)
 - [OSD Disk Layout](docs/config/layout-osd.en.md)
 - [OSD Runtime Parameters](docs/config/osd.en.md)
@@ -1,4 +1,4 @@
-VERSION ?= v1.1.0
+VERSION ?= v1.2.0
 
 all: build push
@@ -49,7 +49,7 @@ spec:
         capabilities:
           add: ["SYS_ADMIN"]
         allowPrivilegeEscalation: true
-        image: vitalif/vitastor-csi:v1.1.0
+        image: vitalif/vitastor-csi:v1.2.0
         args:
           - "--node=$(NODE_ID)"
           - "--endpoint=$(CSI_ENDPOINT)"
@@ -35,10 +35,13 @@ rules:
     verbs: ["get", "list", "watch"]
   - apiGroups: ["snapshot.storage.k8s.io"]
     resources: ["volumesnapshots"]
-    verbs: ["get", "list"]
+    verbs: ["get", "list", "patch"]
+  - apiGroups: ["snapshot.storage.k8s.io"]
+    resources: ["volumesnapshots/status"]
+    verbs: ["get", "list", "patch"]
   - apiGroups: ["snapshot.storage.k8s.io"]
     resources: ["volumesnapshotcontents"]
-    verbs: ["create", "get", "list", "watch", "update", "delete"]
+    verbs: ["create", "get", "list", "watch", "update", "delete", "patch"]
   - apiGroups: ["snapshot.storage.k8s.io"]
     resources: ["volumesnapshotclasses"]
     verbs: ["get", "list", "watch"]
@@ -53,7 +56,7 @@ rules:
     verbs: ["get", "list", "watch"]
   - apiGroups: ["snapshot.storage.k8s.io"]
     resources: ["volumesnapshotcontents/status"]
-    verbs: ["update"]
+    verbs: ["update", "patch"]
  - apiGroups: [""]
    resources: ["configmaps"]
    verbs: ["get"]
@@ -23,6 +23,11 @@ metadata:
   name: csi-vitastor-provisioner
 spec:
   replicas: 3
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxUnavailable: 1
+      maxSurge: 0
   selector:
     matchLabels:
       app: csi-vitastor-provisioner
@@ -46,7 +51,7 @@ spec:
       priorityClassName: system-cluster-critical
       containers:
         - name: csi-provisioner
-          image: k8s.gcr.io/sig-storage/csi-provisioner:v2.2.0
+          image: k8s.gcr.io/sig-storage/csi-provisioner:v3.0.0
          args:
            - "--csi-address=$(ADDRESS)"
            - "--v=5"
@@ -116,7 +121,7 @@ spec:
           privileged: true
           capabilities:
             add: ["SYS_ADMIN"]
-          image: vitalif/vitastor-csi:v1.1.0
+          image: vitalif/vitastor-csi:v1.2.0
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
@@ -17,3 +17,4 @@ parameters:
   # multiple etcdUrls may be specified, delimited by comma
   #etcdUrl: "http://192.168.7.2:2379"
   #etcdPrefix: "/vitastor"
+allowVolumeExpansion: true
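With `allowVolumeExpansion: true` set on the storage class, an existing claim can be grown in place. A minimal usage sketch, assuming the `test-vitastor-pvc` claim from the bundled examples and a hypothetical 20Gi target size (the controller rounds the requested size up to a 4096-byte boundary and reports OFFLINE expansion, so no node-side resize is needed):

```
# Raise the storage request; the CSI controller resizes the Vitastor image
kubectl patch pvc test-vitastor-pvc --type merge \
  -p '{"spec":{"resources":{"requests":{"storage":"20Gi"}}}}'

# Watch the new capacity show up on the claim
kubectl get pvc test-vitastor-pvc -w
```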
csi/deploy/example-snapshot-class.yaml (new file, +7)
@@ -0,0 +1,7 @@
+apiVersion: snapshot.storage.k8s.io/v1
+kind: VolumeSnapshotClass
+metadata:
+  name: vitastor-snapclass
+driver: csi.vitastor.io
+deletionPolicy: Delete
+parameters:
csi/deploy/example-snapshot-clone.yaml (new file, +16)
@@ -0,0 +1,16 @@
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: test-vitastor-clone
+spec:
+  storageClassName: vitastor
+  dataSource:
+    name: snap1
+    kind: VolumeSnapshot
+    apiGroup: snapshot.storage.k8s.io
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 10Gi
csi/deploy/example-snapshot.yaml (new file, +8)
@@ -0,0 +1,8 @@
+apiVersion: snapshot.storage.k8s.io/v1
+kind: VolumeSnapshot
+metadata:
+  name: snap1
+spec:
+  volumeSnapshotClassName: vitastor-snapclass
+  source:
+    persistentVolumeClaimName: test-vitastor-pvc
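Together the three new manifests form a complete snapshot-and-clone workflow. A usage sketch, assuming the snapshot controller and CRDs are already installed and the `test-vitastor-pvc` claim exists:

```
kubectl apply -f csi/deploy/example-snapshot-class.yaml
kubectl apply -f csi/deploy/example-snapshot.yaml

# Wait for READYTOUSE=true on the snapshot
kubectl get volumesnapshot snap1

# Provision a new volume from the snapshot
kubectl apply -f csi/deploy/example-snapshot-clone.yaml
kubectl get pvc test-vitastor-clone
```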
@@ -9,7 +9,8 @@ require (
     golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb
     golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
     google.golang.org/grpc v1.33.1
-    k8s.io/klog v1.1.0
+    google.golang.org/protobuf v1.24.0
+    k8s.io/klog v1.0.0
     k8s.io/utils v0.0.0-20210305010621-2afb4311ab10
 )
csi/go.sum (+17 −17)
@@ -6,9 +6,9 @@ cloud.google.com/go v0.45.1/go.mod h1:RpBamKRgapWJb87xiFSdk4g1CME7QZg3uwTez+TSTj
 cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg0=
 cloud.google.com/go v0.51.0/go.mod h1:hWtGJ6gnXH+KgDv+V0zFGDvpi07n3z8ZNj3T1RW0Gcw=
 cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o=
-cloud.google.com/go/datastore v1.1.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE=
+cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE=
 cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I=
-cloud.google.com/go/storage v1.1.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw=
+cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw=
 dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
 github.com/Azure/go-ansiterm v0.0.0-20170929234023-d6e3b3328b78/go.mod h1:LmzpDX56iTiv29bbRTIsUNlaFfuhWRQBWjQdVyAevI8=
 github.com/Azure/go-autorest/autorest v0.9.0/go.mod h1:xyHB1BMZT0cuDHU7I0+g046+BFDTQ8rEZB0s4Yfa6bI=
@@ -25,14 +25,14 @@ github.com/Azure/go-autorest/tracing v0.5.0/go.mod h1:r/s2XiOKccPW3HrqB+W0TQzfbt
 github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
 github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
 github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ=
-github.com/PuerkitoBio/purell v1.1.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0=
+github.com/PuerkitoBio/purell v1.0.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0=
 github.com/PuerkitoBio/urlesc v0.0.0-20160726150825-5bd2802263f2/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE=
 github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
 github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
 github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
 github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
 github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
-github.com/beorn7/perks v1.1.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
+github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
 github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
 github.com/blang/semver v3.5.0+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk=
 github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
@@ -92,13 +92,13 @@ github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QD
 github.com/golang/protobuf v1.4.2 h1:+Z5KGCizgyZCbGh1KZqA0fcLLkwbsjIzS4aV2v7wJX0=
 github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
 github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
-github.com/google/btree v1.1.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
+github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
 github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
 github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
 github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
 github.com/google/go-cmp v0.4.0 h1:xsAVV57WRhGj6kEIi8ReJzQlHHqcBYCElAvkovg3B/4=
 github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
-github.com/google/gofuzz v1.1.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
+github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
 github.com/google/gofuzz v1.1.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
 github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs=
 github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc=
@@ -112,7 +112,7 @@ github.com/googleapis/gnostic v0.4.1/go.mod h1:LRhVm6pbyptWbWbuZ38d1eyptfvIytN3i
 github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA=
 github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
 github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
-github.com/hpcloud/tail v1.1.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
+github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
 github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
 github.com/imdario/mergo v0.3.5/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA=
 github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU=
@@ -121,7 +121,7 @@ github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1
 github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk=
 github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w=
 github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00=
-github.com/kisielk/gotool v1.1.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
+github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
 github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
 github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
 github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
@@ -153,10 +153,10 @@ github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR
 github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
-github.com/pmezard/go-difflib v1.1.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
-github.com/pmezard/go-difflib v1.1.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
-github.com/prometheus/client_golang v1.1.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
+github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
 github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M=
 github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
 github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
@@ -326,13 +326,13 @@ google.golang.org/protobuf v1.24.0 h1:UhZDfRO8JRQru4/+LlLE0BRKGF8L+PICnvYZmx/fEG
 google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4=
 gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/check.v1 v1.1.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/check.v1 v1.1.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
-gopkg.in/check.v1 v1.1.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
+gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
 gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys=
 gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
-gopkg.in/tomb.v1 v1.1.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
+gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
 gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
 gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
 gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
@@ -351,8 +351,8 @@ k8s.io/apimachinery v0.19.0/go.mod h1:DnPGDnARWFvYa3pMHgSxtbZb7gpzzAZ1pTfaUNDVlm
 k8s.io/client-go v0.19.0/go.mod h1:H9E/VT95blcFQnlyShFgnFT9ZnJOAceiUHM3MlRC+mU=
 k8s.io/component-base v0.19.0/go.mod h1:dKsY8BxkA+9dZIAh2aWJLL/UdASFDNtGYTCItL4LM7Y=
 k8s.io/gengo v0.0.0-20200413195148-3a45101e95ac/go.mod h1:ezvh/TsK7cY6rbqRK0oQQ8IAqLxYwwyPxAX1Pzy0ii0=
-k8s.io/klog v1.1.0 h1:Pt+yjF5aB1xDSVbau4VsWe+dQNzA0qv1LlXdC2dF6Q8=
-k8s.io/klog v1.1.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I=
+k8s.io/klog v1.0.0 h1:Pt+yjF5aB1xDSVbau4VsWe+dQNzA0qv1LlXdC2dF6Q8=
+k8s.io/klog v1.0.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I=
 k8s.io/klog/v2 v2.0.0/go.mod h1:PBfzABfn139FHAV07az/IF9Wp1bkk3vpT2XSJ76fSDE=
 k8s.io/klog/v2 v2.2.0 h1:XRvcwJozkgZ1UQJmfMGpvRthQHOvihEhYtDfAaxMz/A=
 k8s.io/klog/v2 v2.2.0/go.mod h1:Od+F08eJP+W3HUb4pSrPpgp9DGU4GzlpG/TmITuYh/Y=
@@ -5,7 +5,7 @@ package vitastor
 
 const (
     vitastorCSIDriverName = "csi.vitastor.io"
-    vitastorCSIDriverVersion = "1.1.0"
+    vitastorCSIDriverVersion = "1.2.0"
 )
 
 // Config struct fills the parameters of request or user input
@@ -20,6 +20,7 @@ import (
 
     "google.golang.org/grpc/codes"
     "google.golang.org/grpc/status"
+    "google.golang.org/protobuf/types/known/timestamppb"
 
     "github.com/container-storage-interface/spec/lib/go/csi"
 )
@@ -45,6 +46,7 @@ type InodeConfig struct
     ParentPool uint64 `json:"parent_pool,omitempty"`
     ParentId   uint64 `json:"parent_id,omitempty"`
     Readonly   bool   `json:"readonly,omitempty"`
+    CreateTs   uint64 `json:"create_ts,omitempty"`
 }
 
 type ControllerServer struct
@@ -178,27 +180,43 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
         return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
     }
 
+    args := []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) }
+
+    // Support creation from snapshot
+    var src *csi.VolumeContentSource
+    if (req.VolumeContentSource.GetSnapshot() != nil)
+    {
+        snapId := req.VolumeContentSource.GetSnapshot().GetSnapshotId()
+        if (snapId != "")
+        {
+            snapVars := make(map[string]string)
+            err := json.Unmarshal([]byte(snapId), &snapVars)
+            if (err != nil)
+            {
+                return nil, status.Error(codes.Internal, "volume ID not in JSON format")
+            }
+            args = append(args, "--parent", snapVars["name"]+"@"+snapVars["snapshot"])
+            src = &csi.VolumeContentSource{
+                Type: &csi.VolumeContentSource_Snapshot{
+                    Snapshot: &csi.VolumeContentSource_SnapshotSource{
+                        SnapshotId: snapId,
+                    },
+                },
+            }
+        }
+    }
+
     // Create image using vitastor-cli
-    _, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) })
+    _, err := invokeCLI(ctxVars, args)
     if (err != nil)
     {
         if (strings.Index(err.Error(), "already exists") > 0)
         {
-            stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", volName })
+            inodeCfg, err := invokeList(ctxVars, volName, true)
             if (err != nil)
             {
                 return nil, err
             }
-            var inodeCfg []InodeConfig
-            err = json.Unmarshal(stat, &inodeCfg)
-            if (err != nil)
-            {
-                return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
-            }
             if (len(inodeCfg) == 0)
             {
                 return nil, status.Error(codes.Internal, "vitastor-cli create said that image already exists, but ls can't find it")
             }
             if (inodeCfg[0].Size < uint64(volSize))
             {
                 return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
@@ -217,6 +235,7 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
             // Ugly, but VolumeContext isn't passed to DeleteVolume :-(
             VolumeId: string(volumeIdJson),
             CapacityBytes: volSize,
+            ContentSource: src,
         },
     }, nil
 }
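The effect of the new content-source branch is easiest to see as the vitastor-cli command line it assembles. A sketch with hypothetical image and snapshot names (the real values come from the JSON-encoded volume and snapshot IDs):

```
# Without a content source CreateVolume still runs a plain create
vitastor-cli create pvc-example -s 10737418240 --pool 1

# With a snapshot source it appends --parent <image>@<snapshot>
vitastor-cli create pvc-example-clone -s 10737418240 --pool 1 --parent pvc-example@snap1
```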
@@ -230,15 +249,15 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
         return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
     }
 
-    ctxVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
+    volVars := make(map[string]string)
+    err := json.Unmarshal([]byte(req.VolumeId), &volVars)
     if (err != nil)
     {
         return nil, status.Error(codes.Internal, "volume ID not in JSON format")
     }
-    volName := ctxVars["name"]
+    volName := volVars["name"]
 
-    ctxVars, _, _ = GetConnectionParams(ctxVars)
+    ctxVars, _, _ := GetConnectionParams(volVars)
 
     _, err = invokeCLI(ctxVars, []string{ "rm", volName })
     if (err != nil)
@@ -344,6 +363,8 @@ func (cs *ControllerServer) ControllerGetCapabilities(ctx context.Context, req *
         csi.ControllerServiceCapability_RPC_LIST_VOLUMES,
         csi.ControllerServiceCapability_RPC_EXPAND_VOLUME,
+        csi.ControllerServiceCapability_RPC_CREATE_DELETE_SNAPSHOT,
+        csi.ControllerServiceCapability_RPC_LIST_SNAPSHOTS,
         // TODO: csi.ControllerServiceCapability_RPC_CLONE_VOLUME,
     } {
         controllerServerCapabilities = append(controllerServerCapabilities, functionControllerServerCapabilities(capability))
     }
@@ -353,28 +374,214 @@ func (cs *ControllerServer) ControllerGetCapabilities(ctx context.Context, req *
     }, nil
 }
 
+func invokeList(ctxVars map[string]string, pattern string, expectExist bool) ([]InodeConfig, error)
+{
+    stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", pattern })
+    if (err != nil)
+    {
+        return nil, err
+    }
+    var inodeCfg []InodeConfig
+    err = json.Unmarshal(stat, &inodeCfg)
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
+    }
+    if (expectExist && len(inodeCfg) == 0)
+    {
+        return nil, status.Error(codes.Internal, "Can't find expected image "+pattern+" via vitastor-cli ls")
+    }
+    return inodeCfg, nil
+}
+
 // CreateSnapshot create snapshot of an existing PV
 func (cs *ControllerServer) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequest) (*csi.CreateSnapshotResponse, error)
 {
-    return nil, status.Error(codes.Unimplemented, "")
+    klog.Infof("received controller create snapshot request %+v", protosanitizer.StripSecrets(req))
+    if (req == nil)
+    {
+        return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
+    }
+    if (req.SourceVolumeId == "" || req.Name == "")
+    {
+        return nil, status.Error(codes.InvalidArgument, "source volume ID and snapshot name are required fields")
+    }
+
+    // snapshot name
+    snapName := req.Name
+
+    // req.VolumeId is an ugly json string in our case :)
+    ctxVars := make(map[string]string)
+    err := json.Unmarshal([]byte(req.SourceVolumeId), &ctxVars)
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
+    }
+    volName := ctxVars["name"]
+
+    // Create image using vitastor-cli
+    _, err = invokeCLI(ctxVars, []string{ "create", "--snapshot", snapName, volName })
+    if (err != nil && strings.Index(err.Error(), "already exists") <= 0)
+    {
+        return nil, err
+    }
+
+    // Check created snapshot
+    inodeCfg, err := invokeList(ctxVars, volName+"@"+snapName, true)
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    // Use ugly JSON snapshot ID again, DeleteSnapshot doesn't have context :-(
+    ctxVars["snapshot"] = snapName
+    snapIdJson, _ := json.Marshal(ctxVars)
+    return &csi.CreateSnapshotResponse{
+        Snapshot: &csi.Snapshot{
+            SizeBytes: int64(inodeCfg[0].Size),
+            SnapshotId: string(snapIdJson),
+            SourceVolumeId: req.SourceVolumeId,
+            CreationTime: &timestamppb.Timestamp{ Seconds: int64(inodeCfg[0].CreateTs) },
+            ReadyToUse: true,
+        },
+    }, nil
 }
 
 // DeleteSnapshot delete provided snapshot of a PV
 func (cs *ControllerServer) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequest) (*csi.DeleteSnapshotResponse, error)
 {
-    return nil, status.Error(codes.Unimplemented, "")
+    klog.Infof("received controller delete snapshot request %+v", protosanitizer.StripSecrets(req))
+    if (req == nil)
+    {
+        return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
+    }
+    if (req.SnapshotId == "")
+    {
+        return nil, status.Error(codes.InvalidArgument, "snapshot ID is a required field")
+    }
+
+    volVars := make(map[string]string)
+    err := json.Unmarshal([]byte(req.SnapshotId), &volVars)
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "snapshot ID not in JSON format")
+    }
+    volName := volVars["name"]
+    snapName := volVars["snapshot"]
+
+    ctxVars, _, _ := GetConnectionParams(volVars)
+
+    _, err = invokeCLI(ctxVars, []string{ "rm", volName+"@"+snapName })
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    return &csi.DeleteSnapshotResponse{}, nil
 }
 
 // ListSnapshots list the snapshots of a PV
 func (cs *ControllerServer) ListSnapshots(ctx context.Context, req *csi.ListSnapshotsRequest) (*csi.ListSnapshotsResponse, error)
 {
-    return nil, status.Error(codes.Unimplemented, "")
+    klog.Infof("received controller list snapshots request %+v", protosanitizer.StripSecrets(req))
+    if (req == nil)
+    {
+        return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
+    }
+
+    volVars := make(map[string]string)
+    err := json.Unmarshal([]byte(req.SourceVolumeId), &volVars)
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
+    }
+    volName := volVars["name"]
+    ctxVars, _, _ := GetConnectionParams(volVars)
+
+    inodeCfg, err := invokeList(ctxVars, volName+"@*", false)
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    resp := &csi.ListSnapshotsResponse{}
+    for _, ino := range inodeCfg
+    {
+        snapName := ino.Name[len(volName)+1:]
+        if (len(req.StartingToken) > 0 && snapName < req.StartingToken)
+        {
+        }
+        else if (req.MaxEntries == 0 || len(resp.Entries) < int(req.MaxEntries))
+        {
+            volVars["snapshot"] = snapName
+            snapIdJson, _ := json.Marshal(volVars)
+            resp.Entries = append(resp.Entries, &csi.ListSnapshotsResponse_Entry{
+                Snapshot: &csi.Snapshot{
+                    SizeBytes: int64(ino.Size),
+                    SnapshotId: string(snapIdJson),
+                    SourceVolumeId: req.SourceVolumeId,
+                    CreationTime: &timestamppb.Timestamp{ Seconds: int64(ino.CreateTs) },
+                    ReadyToUse: true,
+                },
+            })
+        }
+        else
+        {
+            resp.NextToken = snapName
+            break
+        }
+    }
+
+    return resp, nil
 }
 
-// ControllerExpandVolume resizes a volume
+// ControllerExpandVolume increases the size of a volume
 func (cs *ControllerServer) ControllerExpandVolume(ctx context.Context, req *csi.ControllerExpandVolumeRequest) (*csi.ControllerExpandVolumeResponse, error)
 {
-    return nil, status.Error(codes.Unimplemented, "")
+    klog.Infof("received controller expand volume request %+v", protosanitizer.StripSecrets(req))
+    if (req == nil)
+    {
+        return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
+    }
+    if (req.VolumeId == "" || req.CapacityRange == nil || req.CapacityRange.RequiredBytes == 0)
+    {
+        return nil, status.Error(codes.InvalidArgument, "VolumeId, CapacityRange and RequiredBytes are required fields")
+    }
+
+    volVars := make(map[string]string)
+    err := json.Unmarshal([]byte(req.VolumeId), &volVars)
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
+    }
+    volName := volVars["name"]
+    ctxVars, _, _ := GetConnectionParams(volVars)
+
+    inodeCfg, err := invokeList(ctxVars, volName, true)
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    if (req.CapacityRange.RequiredBytes > 0 && inodeCfg[0].Size < uint64(req.CapacityRange.RequiredBytes))
+    {
+        sz := ((req.CapacityRange.RequiredBytes+4095)/4096)*4096
+        _, err := invokeCLI(ctxVars, []string{ "modify", "--inc_size", "1", "--resize", fmt.Sprintf("%d", sz), volName })
+        if (err != nil)
+        {
+            return nil, err
+        }
+        inodeCfg, err = invokeList(ctxVars, volName, true)
+        if (err != nil)
+        {
+            return nil, err
+        }
+    }
+
+    return &csi.ControllerExpandVolumeResponse{
+        CapacityBytes: int64(inodeCfg[0].Size),
+        NodeExpansionRequired: false,
+    }, nil
 }
 
 // ControllerGetVolume get volume info
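The new controller methods are thin wrappers over vitastor-cli; the flags below are exactly the ones the Go code passes, while the image and snapshot names are hypothetical:

```
# CreateSnapshot
vitastor-cli create --snapshot snap1 pvc-example

# invokeList, used by ListSnapshots: "@*" matches every snapshot of the image
vitastor-cli ls --json 'pvc-example@*'

# DeleteSnapshot
vitastor-cli rm pvc-example@snap1

# ControllerExpandVolume first rounds the request up to a 4096-byte boundary:
echo $(( (10000000000 + 4095) / 4096 * 4096 ))   # prints 10000003072
vitastor-cli modify --inc_size 1 --resize 10000003072 pvc-example
```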
@@ -49,6 +49,13 @@ func (is *IdentityServer) GetPluginCapabilities(ctx context.Context, req *csi.Ge
             },
         },
     },
+    {
+        Type: &csi.PluginCapability_VolumeExpansion_{
+            VolumeExpansion: &csi.PluginCapability_VolumeExpansion{
+                Type: csi.PluginCapability_VolumeExpansion_OFFLINE,
+            },
+        },
+    },
     }, nil
 }
@@ -70,10 +70,10 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
     isBlock := req.GetVolumeCapability().GetBlock() != nil
 
     // Check that it's not already mounted
-    _, error := mount.IsNotMountPoint(ns.mounter, targetPath)
-    if (error != nil)
+    _, err := mount.IsNotMountPoint(ns.mounter, targetPath)
+    if (err != nil)
     {
-        if (os.IsNotExist(error))
+        if (os.IsNotExist(err))
         {
             if (isBlock)
             {
@@ -102,12 +102,12 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
         }
         else
         {
-            return nil, status.Error(codes.Internal, error.Error())
+            return nil, status.Error(codes.Internal, err.Error())
         }
     }
 
     ctxVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
+    err = json.Unmarshal([]byte(req.VolumeId), &ctxVars)
     if (err != nil)
     {
         return nil, status.Error(codes.Internal, "volume ID not in JSON format")
@@ -147,70 +147,74 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
     }
     devicePath := strings.TrimSpace(stdoutStr)
 
-    // Check existing format
     diskMounter := &mount.SafeFormatAndMount{Interface: ns.mounter, Exec: utilexec.New()}
-    existingFormat, err := diskMounter.GetDiskFormat(devicePath)
-    if (err != nil)
-    {
-        klog.Errorf("failed to get disk format for path %s, error: %v", err)
-        // unmap NBD device
-        unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
-        if (unmapErr != nil)
-        {
-            klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
-        }
-        return nil, err
-    }
-
-    // Format the device (ext4 or xfs)
-    fsType := req.GetVolumeCapability().GetMount().GetFsType()
-    opt := req.GetVolumeCapability().GetMount().GetMountFlags()
-    opt = append(opt, "_netdev")
-    if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
-        req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY) &&
-        !Contains(opt, "ro"))
-    {
-        opt = append(opt, "ro")
-    }
-    if (fsType == "xfs")
-    {
-        opt = append(opt, "nouuid")
-    }
-    readOnly := Contains(opt, "ro")
-    if (existingFormat == "" && !readOnly)
-    {
-        args := []string{}
-        switch fsType
-        {
-        case "ext4":
-            args = []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
-        case "xfs":
-            args = []string{"-K", devicePath}
-        }
-        if (len(args) > 0)
-        {
-            cmdOut, cmdErr := diskMounter.Exec.Command("mkfs."+fsType, args...).CombinedOutput()
-            if (cmdErr != nil)
-            {
-                klog.Errorf("failed to run mkfs error: %v, output: %v", cmdErr, string(cmdOut))
-                // unmap NBD device
-                unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
-                if (unmapErr != nil)
-                {
-                    klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
-                }
-                return nil, status.Error(codes.Internal, cmdErr.Error())
-            }
-        }
-    }
     if (isBlock)
     {
-        opt = append(opt, "bind")
-        err = diskMounter.Mount(devicePath, targetPath, fsType, opt)
+        err = diskMounter.Mount(devicePath, targetPath, "", []string{"bind"})
     }
     else
     {
+        // Check existing format
+        existingFormat, err := diskMounter.GetDiskFormat(devicePath)
+        if (err != nil)
+        {
+            klog.Errorf("failed to get disk format for path %s, error: %v", err)
+            goto unmap
+        }
+
+        // Format the device (ext4 or xfs)
+        fsType := req.GetVolumeCapability().GetMount().GetFsType()
+        opt := req.GetVolumeCapability().GetMount().GetMountFlags()
+        opt = append(opt, "_netdev")
+        if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
+            req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY) &&
+            !Contains(opt, "ro"))
+        {
+            opt = append(opt, "ro")
+        }
+        if (fsType == "xfs")
+        {
+            opt = append(opt, "nouuid")
+        }
+        readOnly := Contains(opt, "ro")
+        if (existingFormat == "" && !readOnly)
+        {
+            var cmdOut []byte
+            switch fsType
+            {
+            case "ext4":
+                args := []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
+                cmdOut, err = diskMounter.Exec.Command("mkfs.ext4", args...).CombinedOutput()
+            case "xfs":
+                cmdOut, err = diskMounter.Exec.Command("mkfs.xfs", "-K", devicePath).CombinedOutput()
+            }
+            if (err != nil)
+            {
+                klog.Errorf("failed to run mkfs error: %v, output: %v", err, string(cmdOut))
+                goto unmap
+            }
+        }
+
+        err = diskMounter.FormatAndMount(devicePath, targetPath, fsType, opt)
+
+        // Try to run online resize on mount.
+        // FIXME: Implement online resize. It requires online resize support in vitastor-nbd.
+        if (err == nil && existingFormat != "" && !readOnly)
+        {
+            var cmdOut []byte
+            switch (fsType)
+            {
+            case "ext4":
+                cmdOut, err = diskMounter.Exec.Command("resize2fs", devicePath).CombinedOutput()
+            case "xfs":
+                cmdOut, err = diskMounter.Exec.Command("xfs_growfs", devicePath).CombinedOutput()
+            }
+            if (err != nil)
+            {
+                klog.Errorf("failed to run resizefs error: %v, output: %v", err, string(cmdOut))
+                goto unmap
+            }
+        }
     }
     if (err != nil)
     {
@@ -218,15 +222,18 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
             "failed to mount device path (%s) to path (%s) for volume (%s) error: %s",
             devicePath, targetPath, volName, err,
         )
-        // unmap NBD device
-        unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
-        if (unmapErr != nil)
-        {
-            klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
-        }
-        return nil, status.Error(codes.Internal, err.Error())
+        goto unmap
     }
     return &csi.NodePublishVolumeResponse{}, nil
+
+unmap:
+    // unmap NBD device
+    unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
+    if (unmapErr != nil)
+    {
+        klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
+    }
+    return nil, status.Error(codes.Internal, err.Error())
 }
 
 // NodeUnpublishVolume unmounts the volume from the target path
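If the node plugin dies between mapping and mounting, an NBD device can be left attached; the cleanup that the new `unmap` label performs can also be run by hand. A sketch, assuming the leaked device turned out to be /dev/nbd0:

```
# The same call the error paths above make
/usr/bin/vitastor-nbd unmap /dev/nbd0
```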
debian/changelog (vendored, +2 −2)
@@ -1,10 +1,10 @@
-vitastor (1.1.0-1) unstable; urgency=medium
+vitastor (1.2.0-1) unstable; urgency=medium
 
   * Bugfixes
 
  -- Vitaliy Filippov <vitalif@yourcmc.ru>  Fri, 03 Jun 2022 02:09:44 +0300
 
-vitastor (1.1.0-1) unstable; urgency=medium
+vitastor (1.2.0-1) unstable; urgency=medium
 
   * Implement NFS proxy
   * Add documentation
debian/control (vendored, +1 −1)
@@ -2,7 +2,7 @@ Source: vitastor
 Section: admin
 Priority: optional
 Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
-Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev
+Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev, cmake, pkg-config
 Standards-Version: 4.5.0
 Homepage: https://vitastor.io/
 Rules-Requires-Root: no
debian/vitastor.Dockerfile (vendored, +4 −4)
@@ -35,8 +35,8 @@ RUN set -e -x; \
     mkdir -p /root/packages/vitastor-$REL; \
     rm -rf /root/packages/vitastor-$REL/*; \
     cd /root/packages/vitastor-$REL; \
-    cp -r /root/vitastor vitastor-1.1.0; \
-    cd vitastor-1.1.0; \
+    cp -r /root/vitastor vitastor-1.2.0; \
+    cd vitastor-1.2.0; \
     ln -s /root/fio-build/fio-*/ ./fio; \
     FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
     ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -49,8 +49,8 @@ RUN set -e -x; \
     rm -rf a b; \
     echo "dep:fio=$FIO" > debian/fio_version; \
     cd /root/packages/vitastor-$REL; \
-    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.1.0.orig.tar.xz vitastor-1.1.0; \
-    cd vitastor-1.1.0; \
+    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.2.0.orig.tar.xz vitastor-1.2.0; \
+    cd vitastor-1.2.0; \
     V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
     DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
     DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
@@ -67,8 +67,8 @@
 явно не разрешена клиентом, т.к. если клиент не знает, что запросы записи
 буферизуются, это может приводить к потере данных. Поэтому в старых версиях
 клиентских драйверов буферизация записи не включается вообще, в новых
-версиях QEMU-драйвера включается только если разрешена опцией диска
-`-blockdev cache.direct=false`, а в fio - только если нет опция `-direct=1`.
+версиях QEMU-драйвера включается, только если разрешена опцией диска
+`-blockdev cache.direct=false`, а в fio - только если нет опции `-direct=1`.
 В NBD и NFS драйверах буферизация записи разрешена по умолчанию.
 
 Можно обойти и это ограничение с помощью параметра `client_writeback_allowed`,
@@ -20,6 +20,7 @@ between clients, OSDs and etcd.
 - [rdma_max_msg](#rdma_max_msg)
 - [rdma_max_recv](#rdma_max_recv)
 - [rdma_max_send](#rdma_max_send)
+- [rdma_odp](#rdma_odp)
 - [peer_connect_interval](#peer_connect_interval)
 - [peer_connect_timeout](#peer_connect_timeout)
 - [osd_idle_timeout](#osd_idle_timeout)
@@ -68,11 +69,14 @@ but they are not connected to the cluster.
 - Type: string
 
 RDMA device name to use for Vitastor OSD communications (for example,
-"rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
-Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
-to work. For example, Mellanox ConnectX-3 and older adapters don't have
-Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
-root to list available RDMA devices and their features.
+"rocep5s0f0"). Now Vitastor supports all adapters, even ones without
+ODP support, like Mellanox ConnectX-3 and non-Mellanox cards.
+
+Versions up to Vitastor 1.2.0 required ODP which is only present in
+Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp).
+
+Run `ibv_devinfo -v` as root to list available RDMA devices and their
+features.
 
 Remember that you also have to configure your network switches if you use
 RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
@@ -147,6 +151,28 @@ less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
 Doesn't affect memory usage - additional memory isn't allocated for send
 operations.
 
+## rdma_odp
+
+- Type: boolean
+- Default: false
+
+Use RDMA with On-Demand Paging. ODP is currently only available on Mellanox
+ConnectX-4 and newer adapters. ODP allows to not register memory explicitly
+for RDMA adapter to be able to use it. This, in turn, allows to skip memory
+copying during sending. One would think this should improve performance, but
+**in reality** RDMA performance with ODP is **drastically** worse. Example
+3-node cluster with 8 NVMe in each node and 2*25 GBit/s ConnectX-6 RDMA network
+without ODP pushes 3950000 read iops, but only 239000 iops with ODP...
+
+This happens because Mellanox ODP implementation seems to be based on
+message retransmissions when the adapter doesn't know about the buffer yet -
+it likely uses standard "RNR retransmissions" (RNR = receiver not ready)
+which is generally slow in RDMA/RoCE networks. Here's a presentation about
+it from ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
+
+ODP support is retained in the code just in case a good ODP implementation
+appears one day.
+
 ## peer_connect_interval
 
 - Type: seconds
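A minimal sketch of how the RDMA options fit into the cluster configuration, assuming the JSON format and the default /etc/vitastor/vitastor.conf path, with the adapter name and etcd URL taken from the examples above as placeholders:

```
cat > /etc/vitastor/vitastor.conf <<'EOF'
{
  "etcd_address": "http://192.168.7.2:2379",
  "rdma_device": "rocep5s0f0",
  "rdma_odp": false
}
EOF
```

Given the benchmark numbers above, leaving `rdma_odp` at its `false` default is the sensible choice; enable it only to experiment on ODP-capable ConnectX-4+ adapters.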
@@ -20,6 +20,7 @@
 - [rdma_max_msg](#rdma_max_msg)
 - [rdma_max_recv](#rdma_max_recv)
 - [rdma_max_send](#rdma_max_send)
+- [rdma_odp](#rdma_odp)
 - [peer_connect_interval](#peer_connect_interval)
 - [peer_connect_timeout](#peer_connect_timeout)
 - [osd_idle_timeout](#osd_idle_timeout)
@@ -71,12 +72,15 @@ RDMA может быть нужно только если у клиентов е
 - Тип: строка
 
 Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
-Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
-Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
-адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
-потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
-суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
-параметры и возможности.
+Сейчас Vitastor поддерживает все модели адаптеров, включая те, у которых
+нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и
+картами производства не Mellanox.
+
+Версии Vitastor до 1.2.0 включительно требовали ODP, который есть только
+на Mellanox ConnectX 4 и более новых. См. также [rdma_odp](#rdma_odp).
+
+Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть
+список доступных RDMA-устройств, их параметры и возможности.
 
 Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
 правильно настроить для него коммутаторы, иначе вы можете столкнуться с
@@ -155,6 +159,29 @@ OSD в любом случае согласовывают реальное зн
 у принимающей стороны в процессе работы не заканчивались буферы на приём.
 Не влияет на потребление памяти - дополнительная память на операции отправки
 не выделяется.
 
+## rdma_odp
+
+- Тип: булево (да/нет)
+- Значение по умолчанию: false
+
+Использовать RDMA с On-Demand Paging. ODP - функция, доступная пока что
+исключительно на адаптерах Mellanox ConnectX-4 и более новых. ODP позволяет
+не регистрировать память для её использования RDMA-картой. Благодаря этому
+можно не копировать данные при отправке их в сеть и, казалось бы, это должно
+улучшать производительность - но **по факту** получается так, что
+производительность только ухудшается, причём сильно. Пример - на 3-узловом
+кластере с 8 NVMe в каждом узле и сетью 2*25 Гбит/с на чтение с RDMA без ODP
+удаётся снять 3950000 iops, а с ODP - всего 239000 iops...
+
+Это происходит из-за того, что реализация ODP у Mellanox неоптимальная и
+основана на повторной передаче сообщений, когда карте не известен буфер -
+вероятно, на стандартных "RNR retransmission" (RNR = receiver not ready).
+А данные повторные передачи в RDMA/RoCE - всегда очень медленная штука.
+Презентация на эту тему с конференции ISPASS-2021: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
+
+Возможность использования ODP сохранена в коде на случай, если вдруг в один
+прекрасный день появится хорошая реализация ODP.
+
 ## peer_connect_interval
 
 - Тип: секунды
@@ -87,8 +87,8 @@
 явно не разрешена клиентом, т.к. если клиент не знает, что запросы записи
 буферизуются, это может приводить к потере данных. Поэтому в старых версиях
 клиентских драйверов буферизация записи не включается вообще, в новых
-версиях QEMU-драйвера включается только если разрешена опцией диска
-`-blockdev cache.direct=false`, а в fio - только если нет опция `-direct=1`.
+версиях QEMU-драйвера включается, только если разрешена опцией диска
+`-blockdev cache.direct=false`, а в fio - только если нет опции `-direct=1`.
 В NBD и NFS драйверах буферизация записи разрешена по умолчанию.
 
 Можно обойти и это ограничение с помощью параметра `client_writeback_allowed`,
@@ -48,11 +48,14 @@
   type: string
   info: |
     RDMA device name to use for Vitastor OSD communications (for example,
-    "rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
-    Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
-    to work. For example, Mellanox ConnectX-3 and older adapters don't have
-    Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
-    root to list available RDMA devices and their features.
+    "rocep5s0f0"). Now Vitastor supports all adapters, even ones without
+    ODP support, like Mellanox ConnectX-3 and non-Mellanox cards.
+
+    Versions up to Vitastor 1.2.0 required ODP which is only present in
+    Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp).
+
+    Run `ibv_devinfo -v` as root to list available RDMA devices and their
+    features.
 
     Remember that you also have to configure your network switches if you use
     RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
@@ -61,12 +64,15 @@
     PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
   info_ru: |
     Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
-    Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
-    Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
-    адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
-    потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
-    суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
-    параметры и возможности.
+    Сейчас Vitastor поддерживает все модели адаптеров, включая те, у которых
+    нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и
+    картами производства не Mellanox.
+
+    Версии Vitastor до 1.2.0 включительно требовали ODP, который есть только
+    на Mellanox ConnectX 4 и более новых. См. также [rdma_odp](#rdma_odp).
+
+    Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть
+    список доступных RDMA-устройств, их параметры и возможности.
 
     Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
     правильно настроить для него коммутаторы, иначе вы можете столкнуться с
@@ -160,6 +166,45 @@
     у принимающей стороны в процессе работы не заканчивались буферы на приём.
     Не влияет на потребление памяти - дополнительная память на операции отправки
     не выделяется.
+- name: rdma_odp
+  type: bool
+  default: false
+  online: false
+  info: |
+    Use RDMA with On-Demand Paging. ODP is currently only available on Mellanox
+    ConnectX-4 and newer adapters. ODP allows to not register memory explicitly
+    for RDMA adapter to be able to use it. This, in turn, allows to skip memory
+    copying during sending. One would think this should improve performance, but
+    **in reality** RDMA performance with ODP is **drastically** worse. Example
+    3-node cluster with 8 NVMe in each node and 2*25 GBit/s ConnectX-6 RDMA network
+    without ODP pushes 3950000 read iops, but only 239000 iops with ODP...
+
+    This happens because Mellanox ODP implementation seems to be based on
+    message retransmissions when the adapter doesn't know about the buffer yet -
+    it likely uses standard "RNR retransmissions" (RNR = receiver not ready)
+    which is generally slow in RDMA/RoCE networks. Here's a presentation about
+    it from ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
+
+    ODP support is retained in the code just in case a good ODP implementation
+    appears one day.
+  info_ru: |
+    Использовать RDMA с On-Demand Paging. ODP - функция, доступная пока что
+    исключительно на адаптерах Mellanox ConnectX-4 и более новых. ODP позволяет
+    не регистрировать память для её использования RDMA-картой. Благодаря этому
+    можно не копировать данные при отправке их в сеть и, казалось бы, это должно
+    улучшать производительность - но **по факту** получается так, что
+    производительность только ухудшается, причём сильно. Пример - на 3-узловом
+    кластере с 8 NVMe в каждом узле и сетью 2*25 Гбит/с на чтение с RDMA без ODP
+    удаётся снять 3950000 iops, а с ODP - всего 239000 iops...
+
+    Это происходит из-за того, что реализация ODP у Mellanox неоптимальная и
+    основана на повторной передаче сообщений, когда карте не известен буфер -
+    вероятно, на стандартных "RNR retransmission" (RNR = receiver not ready).
+    А данные повторные передачи в RDMA/RoCE - всегда очень медленная штука.
+    Презентация на эту тему с конференции ISPASS-2021: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
+
+    Возможность использования ODP сохранена в коде на случай, если вдруг в один
+    прекрасный день появится хорошая реализация ODP.
 - name: peer_connect_interval
   type: sec
   min: 1
@@ -17,4 +17,15 @@ and apply all `NNN-*.yaml` manifests to your Kubernetes installation:
 for i in ./???-*.yaml; do kubectl apply -f $i; done
 ```
 
-After that you'll be able to create PersistentVolumes. See example in [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).
+After that you'll be able to create PersistentVolumes.
+
+## Features
+
+Vitastor CSI supports:
+- Kubernetes starting with 1.20 (or 1.17 for older vitastor-csi <= 1.1.0)
+- Filesystem RWO (ReadWriteOnce) volumes. Example: [PVC](../../csi/deploy/example-pvc.yaml), [pod](../../csi/deploy/example-test-pod.yaml)
+- Raw block RWX (ReadWriteMany) volumes. Example: [PVC](../../csi/deploy/example-pvc-block.yaml), [pod](../../csi/deploy/example-test-pod-block.yaml)
+- Volume expansion
+- Volume snapshots. Example: [snapshot class](../../csi/deploy/example-snapshot-class.yaml), [snapshot](../../csi/deploy/example-snapshot.yaml), [clone](../../csi/deploy/example-snapshot-clone.yaml)
+
+Remember that to use snapshots with CSI you also have to install [Snapshot Controller and CRDs](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
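A quick pre-flight check before creating the first snapshot: the standard CRDs installed by the snapshot controller must be present, otherwise VolumeSnapshot objects are rejected:

```
kubectl get crd \
  volumesnapshotclasses.snapshot.storage.k8s.io \
  volumesnapshots.snapshot.storage.k8s.io \
  volumesnapshotcontents.snapshot.storage.k8s.io
```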
@@ -17,4 +17,15 @@
 for i in ./???-*.yaml; do kubectl apply -f $i; done
 ```
 
-После этого вы сможете создавать PersistentVolume. Пример смотрите в файле [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).
+После этого вы сможете создавать PersistentVolume.
+
+## Возможности
+
+CSI-плагин Vitastor поддерживает:
+- Версии Kubernetes, начиная с 1.20 (или с 1.17 для более старых vitastor-csi <= 1.1.0)
+- Файловые RWO (ReadWriteOnce) тома. Пример: [PVC](../../csi/deploy/example-pvc.yaml), [под](../../csi/deploy/example-test-pod.yaml)
+- Сырые блочные RWX (ReadWriteMany) тома. Пример: [PVC](../../csi/deploy/example-pvc-block.yaml), [под](../../csi/deploy/example-test-pod-block.yaml)
+- Расширение размера томов
+- Снимки томов. Пример: [класс снимков](../../csi/deploy/example-snapshot-class.yaml), [снимок](../../csi/deploy/example-snapshot.yaml), [клон снимка](../../csi/deploy/example-snapshot-clone.yaml)
+
+Не забывайте, что для использования снимков нужно сначала установить [контроллер снимков и CRD](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
@@ -51,13 +51,15 @@
 The following features are planned for the future:
 
 - File system
+- Control plane optimisation
+- Other administrative tools
 - Web GUI
 - OpenNebula plugin
-- iSCSI proxy
+- iSCSI and NVMeoF gateways
 - Multi-threaded client
 - Faster failover
 - S3
 - Tiered storage (SSD caching)
 - NVDIMM support
 - Compression (possibly)
 - Read caching using system page cache (possibly)
@@ -51,12 +51,15 @@
 ## Планы развития
 
 - Файловая система
+- Оптимизация слоя управления
+- Другие инструменты администрирования
 - Web-интерфейс
 - Плагин для OpenNebula
-- iSCSI-прокси
+- iSCSI и NVMeoF прокси
 - Многопоточный клиент
 - Более быстрое переключение при отказах
 - S3
 - Поддержка SSD-кэширования (tiered storage)
 - Поддержка NVDIMM
 - Возможно, сжатие
@@ -127,19 +127,46 @@ Linux kernel, starting with version 5.15, supports a new interface for attaching
 to the host - VDUSE (vDPA Device in Userspace). QEMU, starting with 7.2, has support for
 exporting QEMU block devices over this protocol using qemu-storage-daemon.
 
-VDUSE has the same problem as other FUSE-like interfaces in Linux: if a userspace process hangs,
-for example, if it loses connectivity with Vitastor cluster - active processes doing I/O may
-hang in the D state (uninterruptible sleep) and you won't be able to kill them even with kill -9.
-In this case reboot will be the only way to remove VDUSE devices from system.
+VDUSE is currently the best interface to attach Vitastor disks as kernel devices because:
+- It avoids data copies and thus achieves much better performance than [NBD](nbd.en.md)
+- It doesn't have NBD timeout problem - the device doesn't die if an operation executes for too long
+- It doesn't have hung device problem - if the userspace process dies it can be restarted (!)
+  and block device will continue operation
+- It doesn't seem to have the device number limit
 
-On the other hand, VDUSE is faster than [NBD](nbd.en.md), so you may prefer to use it if
-performance is important for you. Approximate performance numbers:
-direct fio benchmark - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
+Example performance comparison:
+
+|                      | direct fio  | NBD         | VDUSE       |
+|----------------------|-------------|-------------|-------------|
+| linear write         | 3.85 GB/s   | 1.12 GB/s   | 3.85 GB/s   |
+| 4k random write Q128 | 240000 iops | 120000 iops | 178000 iops |
+| 4k random write Q1   | 9500 iops   | 7620 iops   | 7640 iops   |
+| linear read          | 4.3 GB/s    | 1.8 GB/s    | 2.85 GB/s   |
+| 4k random read Q128  | 287000 iops | 140000 iops | 189000 iops |
+| 4k random read Q1    | 9600 iops   | 7640 iops   | 7780 iops   |
 
 To try VDUSE you need at least Linux 5.15, built with VDUSE support
-(CONFIG_VIRTIO_VDPA=m and CONFIG_VDPA_USER=m). Debian Linux kernels have these options
-disabled by now, so if you want to try it on Debian, use a kernel from Ubuntu
-[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) or Proxmox.
+(CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
+
+Debian Linux kernels have these options disabled by now, so if you want to try it on Debian,
+use a kernel from Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/), Proxmox,
+or build modules for Debian kernel manually:
+
+```
+mkdir build
+cd build
+apt-get install linux-headers-`uname -r`
+apt-get build-dep linux-image-`uname -r`-unsigned
+apt-get source linux-image-`uname -r`-unsigned
+cd linux*/drivers/vdpa
+make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
+cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
+cd ../virtio
+make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
+depmod -a
+```
 
 You also need `vdpa` tool from the `iproute2` package.
 
 Commands to attach Vitastor image as a VDUSE device:
 
@@ -152,7 +179,7 @@ qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitas
 vdpa dev add name test1 mgmtdev vduse
 ```
 
-After running these commands /dev/vda device will appear in the system and you'll be able to
+After running these commands, `/dev/vda` device will appear in the system and you'll be able to
 use it as a normal disk.
 
 To remove the device:
@@ -129,19 +129,47 @@ qemu-system-x86_64 -enable-kvm -m 2048 -M accel=kvm,memory-backend=mem \
to the host - VDUSE (vDPA Device in Userspace), and QEMU, starting with version 7.2, supports
exporting QEMU block devices over this protocol using qemu-storage-daemon.

VDUSE suffers from the common problem of FUSE-like interfaces in Linux: if the userspace process
hangs, for example, if it loses connectivity with the Vitastor cluster - processes reading from or
writing to the cluster may get stuck in the D state (uninterruptible sleep) and become impossible
to kill even with kill -9. In this case the device can only be removed by rebooting the system.
VDUSE is currently the best interface for attaching Vitastor disks as kernel block
devices, because:
- VDUSE doesn't copy data and therefore achieves much better performance than [NBD](nbd.ru.md)
- It also doesn't have the NBD timeout problem - the device doesn't die if an operation takes too long
- It also doesn't have the hung device problem - if the handler process dies, it can be
  restarted (!) and the block device will continue to work
- It apparently has no limit on the number of devices attached to the system

On the other hand, VDUSE is faster than [NBD](nbd.ru.md), so it may be preferable
where performance matters more. Approximate numbers:
direct fio test - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
Example performance comparison:

To use VDUSE you need a Linux kernel of at least version 5.15, built with VDUSE support
(CONFIG_VIRTIO_VDPA=m and CONFIG_VDPA_USER=m). In Debian Linux kernels the support is
disabled for now - if you want to try this feature on Debian, install a kernel from Ubuntu
[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) or from Proxmox.
| | direct fio | NBD | VDUSE |
|----------------------|-------------|-------------|-------------|
| linear write | 3.85 GB/s | 1.12 GB/s | 3.85 GB/s |
| 4k random write Q128 | 240000 iops | 120000 iops | 178000 iops |
| 4k random write Q1 | 9500 iops | 7620 iops | 7640 iops |
| linear read | 4.3 GB/s | 1.8 GB/s | 2.85 GB/s |
| 4k random read Q128 | 287000 iops | 140000 iops | 189000 iops |
| 4k random read Q1 | 9600 iops | 7640 iops | 7780 iops |

To try VDUSE you need a Linux kernel of at least version 5.15, built with VDUSE support
(CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).

In Debian Linux kernels this support is disabled by default for now, so to try VDUSE
on Debian, install a kernel from Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/),
from Proxmox, or build the modules for the Debian kernel manually:

```
mkdir build
cd build
apt-get install linux-headers-`uname -r`
apt-get build-dep linux-image-`uname -r`-unsigned
apt-get source linux-image-`uname -r`-unsigned
cd linux*/drivers/vdpa
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
cd ../virtio
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
depmod -a
```

You'll also need the `vdpa` command-line tool from the `iproute2` package.

Commands to attach a virtual disk via VDUSE:

@@ -154,7 +182,7 @@ qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitas
vdpa dev add name test1 mgmtdev vduse
```

After that, the /dev/vda device will appear in the system and can be used as
After that, the `/dev/vda` device will appear in the system and can be used as
a normal disk.

To remove the device from the system:

@@ -3,5 +3,5 @@ SUBSYSTEM=="block", ENV{ID_PART_ENTRY_TYPE}=="e7009fac-a5a1-4d72-af72-53de130599
IMPORT{program}="/usr/bin/vitastor-disk udev $devnode", \
SYMLINK+="vitastor/$env{VITASTOR_ALIAS}"

ENV{VITASTOR_OSD_NUM}!="", ACTION=="add", RUN{program}+="/usr/bin/systemctl enable --now vitastor-osd@$env{VITASTOR_OSD_NUM}"
ENV{VITASTOR_OSD_NUM}!="", ACTION=="remove", RUN{program}+="/usr/bin/systemctl disable --now vitastor-osd@$env{VITASTOR_OSD_NUM}"
ENV{VITASTOR_OSD_NUM}!="", ACTION=="add", RUN{program}+="/usr/bin/systemctl enable --now --no-block vitastor-osd@$env{VITASTOR_OSD_NUM}"
ENV{VITASTOR_OSD_NUM}!="", ACTION=="remove", RUN{program}+="/usr/bin/systemctl disable --now --no-block vitastor-osd@$env{VITASTOR_OSD_NUM}"
142 mon/mon.js
@@ -397,12 +397,13 @@ class Mon
        this.etcd_prefix = this.etcd_prefix.replace(/\/\/+/g, '/').replace(/^\/?(.*[^\/])\/?$/, '/$1');
        this.etcd_start_timeout = (config.etcd_start_timeout || 5) * 1000;
        this.state = JSON.parse(JSON.stringify(this.constructor.etcd_tree));
        this.prev_stats = { osd_stats: {}, osd_diff: {} };
        this.signals_set = false;
        this.stat_time = Date.now();
        this.ws = null;
        this.ws_alive = false;
        this.ws_keepalive_timer = null;
        this.on_stop_cb = () => this.on_stop(0).catch(console.error);
        this.recheck_pgs_active = false;
    }

    parse_etcd_addresses(addrs)
@@ -552,9 +553,9 @@ class Mon
        const cur_addr = this.pick_next_etcd();
        const base = 'ws'+cur_addr.substr(4);
        let now = Date.now();
        if (tried[base] && now-tried[base] < timeout)
        if (tried[base] && now-tried[base] < this.etcd_start_timeout)
        {
            await new Promise(ok => setTimeout(ok, timeout-(now-tried[base])));
            await new Promise(ok => setTimeout(ok, this.etcd_start_timeout-(now-tried[base])));
            now = Date.now();
        }
        tried[base] = now;
@@ -692,8 +693,27 @@ class Mon
        });
    }

    // Schedule save_last_clean() to run after a small timeout (1s) to avoid spamming etcd
    schedule_save_last_clean()
    {
        if (!this.save_last_clean_timer)
        {
            this.save_last_clean_timer = setTimeout(() =>
            {
                this.save_last_clean_timer = null;
                this.save_last_clean().catch(this.die);
            }, this.config.mon_change_timeout || 1000);
        }
    }

    async save_last_clean()
    {
        if (this.save_last_clean_running)
        {
            this.schedule_save_last_clean();
            return;
        }
        this.save_last_clean_running = true;
        // last_clean_pgs is used to avoid extra data move when observing a series of changes in the cluster
        const new_clean_pgs = { items: {} };
    next_pool:
@@ -730,6 +750,7 @@ class Mon
                    value: b64(JSON.stringify(this.state.history.last_clean_pgs))
                } } ],
            }, this.etcd_start_timeout, 0);
        this.save_last_clean_running = false;
    }

    get_mon_state()
@@ -1203,6 +1224,12 @@ class Mon

    async recheck_pgs()
    {
        if (this.recheck_pgs_active)
        {
            this.schedule_recheck();
            return;
        }
        this.recheck_pgs_active = true;
        // Take configuration and state, check it against the stored configuration hash
        // Recalculate PGs and save them to etcd if the configuration is changed
        // FIXME: Do not change anything if the distribution is good and random enough and no PGs are degraded
@@ -1224,6 +1251,7 @@ class Mon
                // Pool deleted. Delete all PGs, but first stop them.
                if (!await this.stop_all_pgs(pool_id))
                {
                    this.recheck_pgs_active = false;
                    this.schedule_recheck();
                    return;
                }
@@ -1292,9 +1320,16 @@ class Mon
                    // PG count changed. Need to bring all PGs down.
                    if (!await this.stop_all_pgs(pool_id))
                    {
                        this.recheck_pgs_active = false;
                        this.schedule_recheck();
                        return;
                    }
                }
                if (prev_pgs.length != pool_cfg.pg_count)
                {
                    // Scale PG count
                    // Do it even if old_pg_count is already equal to pool_cfg.pg_count,
                    // because last_clean_pgs may still contain the old number of PGs
                    const new_pg_history = [];
                    PGUtil.scale_pg_count(prev_pgs, real_prev_pgs, pg_history, new_pg_history, pool_cfg.pg_count);
                    pg_history = new_pg_history;
@@ -1396,6 +1431,7 @@ class Mon
                await this.save_pg_config(new_config_pgs);
            }
        }
        this.recheck_pgs_active = false;
    }

    async save_pg_config(new_config_pgs, etcd_request = { compare: [], success: [] })
@@ -1445,7 +1481,6 @@ class Mon
    }

    // Schedule a recheck to run after a small timeout (1s)
    // If already scheduled, cancel previous timer and schedule it again
    // This is required for multiple change events to trigger at most 1 recheck in 1s
    schedule_recheck()
    {
@@ -1459,15 +1494,15 @@ class Mon
        }
    }

    derive_osd_stats(st, prev)
    derive_osd_stats(st, prev, prev_diff)
    {
        const zero_stats = { op: { bps: 0n, iops: 0n, lat: 0n }, subop: { iops: 0n, lat: 0n }, recovery: { bps: 0n, iops: 0n } };
        const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
        if (!st || !st.time || prev && (prev.time || this.stat_time/1000) >= st.time)
        const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
        if (!st || !st.time || !prev || !prev.time || prev.time >= st.time)
        {
            return diff;
            return prev_diff || diff;
        }
        const timediff = BigInt(st.time*1000 - (prev && prev.time*1000 || this.stat_time));
        const timediff = BigInt(st.time*1000 - prev.time*1000);
        for (const op in st.op_stats||{})
        {
            const pr = prev && prev.op_stats && prev.op_stats[op];
@@ -1499,25 +1534,47 @@ class Mon
            if (n > 0)
                diff.recovery_stats[op] = { ...c, bps: b*1000n/timediff, iops: n*1000n/timediff };
        }
        for (const pool_id in st.inode_stats||{})
        {
            const pool_diff = diff.inode_stats[pool_id] = {};
            for (const inode_num in st.inode_stats[pool_id])
            {
                const inode_diff = diff.inode_stats[pool_id][inode_num] = {};
                for (const op of [ 'read', 'write', 'delete' ])
                {
                    const c = st.inode_stats[pool_id][inode_num][op];
                    const pr = prev && prev.inode_stats && prev.inode_stats[pool_id] &&
                        prev.inode_stats[pool_id][inode_num] && prev.inode_stats[pool_id][inode_num][op];
                    const n = BigInt(c.count||0) - BigInt(pr && pr.count||0);
                    inode_diff[op] = {
                        bps: (BigInt(c.bytes||0) - BigInt(pr && pr.bytes||0))*1000n/timediff,
                        iops: n*1000n/timediff,
                        lat: (BigInt(c.usec||0) - BigInt(pr && pr.usec||0))/(n || 1n),
                    };
                }
            }
        }
        return diff;
    }

    sum_op_stats(timestamp, prev_stats)
    sum_op_stats()
    {
        const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
        if (!prev_stats || prev_stats.timestamp >= timestamp)
        for (const osd in this.state.osd.stats)
        {
            return sum_diff;
            const cur = { ...this.state.osd.stats[osd], inode_stats: this.state.osd.inodestats[osd]||{} };
            this.prev_stats.osd_diff[osd] = this.derive_osd_stats(
                cur, this.prev_stats.osd_stats[osd], this.prev_stats.osd_diff[osd]
            );
            this.prev_stats.osd_stats[osd] = cur;
        }
        const tm = BigInt(timestamp - (prev_stats.timestamp || 0));
        const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
        // Sum derived values instead of deriving summed
        for (const osd in this.state.osd.stats)
        {
            const derived = this.derive_osd_stats(this.state.osd.stats[osd],
                this.prev_stats && this.prev_stats.osd_stats && this.prev_stats.osd_stats[osd]);
            for (const type in derived)
            const derived = this.prev_stats.osd_diff[osd];
            for (const type in sum_diff)
            {
                for (const op in derived[type])
                for (const op in derived[type]||{})
                {
                    for (const k in derived[type][op])
                    {
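The formulas in the hunk above are the heart of the monitor's statistics pipeline: every per-interval rate is derived from two consecutive cumulative counter snapshots, and the per-OSD diffs are then summed instead of deriving rates from summed counters. The same arithmetic as a standalone sketch (hypothetical names; plain 64-bit integers stand in for the monitor's BigInt values, which it uses to avoid overflow on large byte counters):

```
// Sketch of the rate derivation performed by derive_osd_stats() for each
// op type ('read', 'write', 'delete'): rates are deltas of cumulative
// counters divided by the elapsed time between two stats snapshots.
#include <cstdint>

struct op_counters { uint64_t bytes = 0, count = 0, usec = 0; }; // cumulative totals
struct op_rates { uint64_t bps = 0, iops = 0, lat = 0; };        // per-interval rates

op_rates derive_rates(const op_counters & cur, const op_counters & prev, uint64_t timediff_ms)
{
    op_rates r;
    uint64_t n = cur.count - prev.count;                   // ops completed in the interval
    r.bps = (cur.bytes - prev.bytes) * 1000 / timediff_ms; // bytes per second
    r.iops = n * 1000 / timediff_ms;                       // operations per second
    r.lat = (cur.usec - prev.usec) / (n ? n : 1);          // mean latency in usec per op
    return r;
}
```

As in the JS code, an idle interval (n = 0) divides the latency delta by 1 instead of failing, and stale snapshots are rejected before this point, so timediff_ms is always positive.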
@@ -1574,14 +1631,14 @@ class Mon
        return { object_counts, object_bytes };
    }

    sum_inode_stats(prev_stats, timestamp, prev_timestamp)
    sum_inode_stats()
    {
        const inode_stats = {};
        const inode_stub = () => ({
            raw_used: 0n,
            read: { count: 0n, usec: 0n, bytes: 0n },
            write: { count: 0n, usec: 0n, bytes: 0n },
            delete: { count: 0n, usec: 0n, bytes: 0n },
            read: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
            write: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
            delete: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
        });
        const seen_pools = {};
        for (const pool_id in this.state.config.pools)
@@ -1633,11 +1690,25 @@ class Mon
                }
            }
        }
        if (prev_stats && prev_timestamp >= timestamp)
        for (const osd in this.prev_stats.osd_diff)
        {
            prev_stats = null;
            for (const pool_id in this.prev_stats.osd_diff[osd].inode_stats)
            {
                for (const inode_num in this.prev_stats.osd_diff[osd].inode_stats[pool_id])
                {
                    inode_stats[pool_id][inode_num] = inode_stats[pool_id][inode_num] || inode_stub();
                    for (const op of [ 'read', 'write', 'delete' ])
                    {
                        const op_diff = this.prev_stats.osd_diff[osd].inode_stats[pool_id][inode_num][op] || {};
                        const op_st = inode_stats[pool_id][inode_num][op];
                        op_st.bps += op_diff.bps;
                        op_st.iops += op_diff.iops;
                        op_st.lat += op_diff.lat;
                        op_st.n_osd = (op_st.n_osd || 0) + 1;
                    }
                }
            }
        }
        const tm = prev_stats ? BigInt(timestamp - prev_timestamp) : 0;
        for (const pool_id in inode_stats)
        {
            for (const inode_num in inode_stats[pool_id])
@@ -1646,11 +1717,12 @@ class Mon
                for (const op of [ 'read', 'write', 'delete' ])
                {
                    const op_st = inode_stats[pool_id][inode_num][op];
                    const prev_st = prev_stats && prev_stats[pool_id] && prev_stats[pool_id][inode_num] && prev_stats[pool_id][inode_num][op];
                    op_st.bps = prev_st ? (op_st.bytes - prev_st.bytes) * 1000n / tm : 0;
                    op_st.iops = prev_st ? (op_st.count - prev_st.count) * 1000n / tm : 0;
                    op_st.lat = prev_st ? (op_st.usec - prev_st.usec) / ((op_st.count - prev_st.count) || 1n) : 0;
                    if (op_st.bps > 0 || op_st.iops > 0 || op_st.lat > 0)
                    if (op_st.n_osd)
                    {
                        op_st.lat /= BigInt(op_st.n_osd);
                        delete op_st.n_osd;
                    }
                    if (op_st.bps > 0 || op_st.iops > 0)
                        nonzero = true;
                }
                if (!nonzero && (!this.state.config.inode[pool_id] || !this.state.config.inode[pool_id][inode_num]))
@@ -1683,15 +1755,9 @@ class Mon
    async update_total_stats()
    {
        const txn = [];
        const timestamp = Date.now();
        const { object_counts, object_bytes } = this.sum_object_counts();
        let stats = this.sum_op_stats(timestamp, this.prev_stats);
        let { inode_stats, seen_pools } = this.sum_inode_stats(
            this.prev_stats ? this.prev_stats.inode_stats : null,
            timestamp, this.prev_stats ? this.prev_stats.timestamp : null
        );
        this.prev_stats = { timestamp, inode_stats, osd_stats: { ...this.state.osd.stats } };
        this.stat_time = Date.now();
        let stats = this.sum_op_stats();
        let { inode_stats, seen_pools } = this.sum_inode_stats();
        stats.object_counts = object_counts;
        stats.object_bytes = object_bytes;
        stats = this.serialize_bigints(stats);
@@ -1,6 +1,6 @@
{
  "name": "vitastor-mon",
  "version": "1.1.0",
  "version": "1.2.0",
  "description": "Vitastor SDS monitor service",
  "main": "mon-main.js",
  "scripts": {
@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils

VERSION = '1.1.0'
VERSION = '1.2.0'

LOG = logging.getLogger(__name__)

@@ -24,4 +24,4 @@ rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-1.1.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.1.0$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-1.2.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.2.0$(rpm --eval '%dist').tar.gz *
@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
    cp /root/vitastor-1.1.0.el7.tar.gz ~/rpmbuild/SOURCES; \
    cp /root/vitastor-1.2.0.el7.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
@@ -1,11 +1,11 @@
Name:           vitastor
Version:        1.1.0
Version:        1.2.0
Release:        1%{?dist}
Summary:        Vitastor, a fast software-defined clustered block storage

License:        Vitastor Network Public License 1.1
URL:            https://vitastor.io/
Source0:        vitastor-1.1.0.el7.tar.gz
Source0:        vitastor-1.2.0.el7.tar.gz

BuildRequires:  liburing-devel >= 0.6
BuildRequires:  gperftools-devel
@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
    cp /root/vitastor-1.1.0.el8.tar.gz ~/rpmbuild/SOURCES; \
    cp /root/vitastor-1.2.0.el8.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
@@ -1,11 +1,11 @@
Name:           vitastor
Version:        1.1.0
Version:        1.2.0
Release:        1%{?dist}
Summary:        Vitastor, a fast software-defined clustered block storage

License:        Vitastor Network Public License 1.1
URL:            https://vitastor.io/
Source0:        vitastor-1.1.0.el8.tar.gz
Source0:        vitastor-1.2.0.el8.tar.gz

BuildRequires:  liburing-devel >= 0.6
BuildRequires:  gperftools-devel
@@ -18,7 +18,7 @@ ADD . /root/vitastor
RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
    cp /root/vitastor-1.1.0.el9.tar.gz ~/rpmbuild/SOURCES; \
    cp /root/vitastor-1.2.0.el9.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
@@ -1,11 +1,11 @@
Name:           vitastor
Version:        1.1.0
Version:        1.2.0
Release:        1%{?dist}
Summary:        Vitastor, a fast software-defined clustered block storage

License:        Vitastor Network Public License 1.1
URL:            https://vitastor.io/
Source0:        vitastor-1.1.0.el9.tar.gz
Source0:        vitastor-1.2.0.el9.tar.gz

BuildRequires:  liburing-devel >= 0.6
BuildRequires:  gperftools-devel
@@ -16,10 +16,11 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
    set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()

add_definitions(-DVERSION="1.1.0")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
add_definitions(-DVERSION="1.2.0")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
add_link_options(-fno-omit-frame-pointer)
if (${WITH_ASAN})
    add_definitions(-fsanitize=address -fno-omit-frame-pointer)
    add_definitions(-fsanitize=address)
    add_link_options(-fsanitize=address -fno-omit-frame-pointer)
endif (${WITH_ASAN})

@@ -180,6 +181,25 @@ target_link_libraries(vitastor-nbd
    vitastor_client
)

# vitastor-kv
add_executable(vitastor-kv
    kv_cli.cpp
    kv_db.cpp
    kv_db.h
)
target_link_libraries(vitastor-kv
    vitastor_client
)

add_executable(vitastor-kv-stress
    kv_stress.cpp
    kv_db.cpp
    kv_db.h
)
target_link_libraries(vitastor-kv-stress
    vitastor_client
)

# vitastor-nfs
add_executable(vitastor-nfs
    nfs_proxy.cpp
@@ -1372,7 +1372,8 @@ bool journal_flusher_co::trim_journal(int wait_base)
            ? (uint32_t)JE_START_V1_SIZE : (uint32_t)JE_START_V2_SIZE),
        .reserved = 0,
        .journal_start = new_trim_pos,
        .version = JOURNAL_VERSION_V2,
        .version = (uint64_t)(!bs->dsk.data_csum_type && ((journal_entry_start*)flusher->journal_superblock)->version == JOURNAL_VERSION_V1
            ? JOURNAL_VERSION_V1 : JOURNAL_VERSION_V2),
        .data_csum_type = bs->dsk.data_csum_type,
        .csum_block_size = bs->dsk.csum_block_size,
    };
@@ -274,7 +274,7 @@ class blockstore_impl_t
    blockstore_dirty_db_t dirty_db;
    std::vector<blockstore_op_t*> submit_queue;
    std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
    int unsynced_big_write_count = 0;
    int unsynced_big_write_count = 0, unstable_unsynced = 0;
    int unsynced_queued_ops = 0;
    allocator *data_alloc = NULL;
    uint8_t *zero_object;
@@ -553,7 +553,7 @@ resume_1:
    }
    if (je_start->size == JE_START_V0_SIZE ||
        (je_start->version != JOURNAL_VERSION_V1 || je_start->size != JE_START_V1_SIZE) &&
        (je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE))
        (je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE && je_start->size != JE_START_V1_SIZE))
    {
        fprintf(
            stderr, "The code only supports journal versions 2 and 1, but it is %lu on disk."
@@ -562,7 +562,8 @@ resume_1:
        );
        exit(1);
    }
    if (je_start->version == JOURNAL_VERSION_V1)
    if (je_start->version == JOURNAL_VERSION_V1 ||
        je_start->version == JOURNAL_VERSION_V2 && je_start->size == JE_START_V1_SIZE)
    {
        je_start->data_csum_type = 0;
        je_start->csum_block_size = 0;
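The nested negations in this check are hard to read. An equivalent positive form of the new acceptance rule (a reading aid only, assuming the V0, V1 and V2 superblock sizes are all distinct): V1 is accepted with the V1 size, and V2 with either the V2 size or, after this change, the legacy pre-checksum V1 size, whose checksum fields are then zeroed as if it were a V1 superblock.

```
// Positive form of the journal superblock validity check
// (reading aid, not the actual blockstore source):
static bool je_start_valid(uint64_t version, uint32_t size)
{
    return version == JOURNAL_VERSION_V1 && size == JE_START_V1_SIZE
        || version == JOURNAL_VERSION_V2 && (size == JE_START_V1_SIZE || size == JE_START_V2_SIZE);
}
```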
@@ -145,6 +145,7 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
        journal.sector_info[journal.cur_sector].offset = journal.next_free;
        journal.in_sector_pos = 0;
        journal.next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
        assert(journal.next_free != journal.used_start);
        memset(journal.inmemory
            ? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
            : (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector, 0, journal.block_size);
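For context, the guarded line implements the journal's ring-buffer advance, and the added assert documents its key invariant. A sketch of the rule with names taken from the diff (the helper function itself is illustrative, not part of the source):

```
// Journal sector advance as a standalone function (reading aid): the write
// position wraps from the end of the journal back to block_size, because
// block 0 holds the journal superblock. The new assert above checks that
// the advance never lands on used_start, the oldest not-yet-flushed
// position, which would mean overwriting live journal data.
static uint64_t next_journal_sector(uint64_t next_free, uint64_t block_size, uint64_t len)
{
    return (next_free + block_size) < len ? next_free + block_size : block_size;
}
```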
@@ -13,12 +13,6 @@
#define JOURNAL_BUFFER_SIZE 4*1024*1024
#define JOURNAL_ENTRY_HEADER_SIZE 16

// We reserve some extra space for future stabilize requests during writes
// FIXME: This value should be dynamic i.e. Blockstore ideally shouldn't allow
// writing more than can be stabilized afterwards
#define JOURNAL_STABILIZE_RESERVATION 65536
#define JOURNAL_INSTANT_RESERVATION 131072

// Journal entries
// Journal entries are linked to each other by their crc32 value
// The journal is almost a blockchain, because object versions constantly increase
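This removal resolves the FIXME above: instead of the fixed 64 KB / 128 KB reservations, 1.2.0 sizes the reservation from the number of writes that may still need stabilize entries, as the check_available() calls in the following hunks show. A minimal sketch of the new rule (the helper function and its signature are illustrative, not part of the source):

```
// Dynamic journal space reservation: every unstable or not-yet-synced write
// may still need its own stabilize entry, so reserving one journal block per
// such write is a safe upper bound on the space stabilization can consume.
static uint64_t stabilize_reservation(size_t unstable_writes_count, int unstable_unsynced, uint64_t journal_block_size)
{
    return (unstable_writes_count + unstable_unsynced) * journal_block_size;
}
```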
@@ -86,14 +86,15 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            auto & dirty_entry = dirty_db.at(sbw);
            uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
            if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
                left == 0 ? JOURNAL_STABILIZE_RESERVATION : 0))
                (unstable_writes.size()+unstable_unsynced)*journal.block_size))
            {
                return 0;
            }
        }
    }
    else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
        sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
        sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
        (unstable_writes.size()+unstable_unsynced)*journal.block_size))
    {
        return 0;
    }
@@ -184,6 +185,11 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
            {
                mark_stable(dirty_it->first);
            }
            else
            {
                unstable_unsynced--;
                assert(unstable_unsynced >= 0);
            }
            dirty_it++;
            while (dirty_it != dirty_db.end() && dirty_it->first.oid == it->oid)
            {
@@ -214,6 +220,11 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
            {
                mark_stable(*it);
            }
            else
            {
                unstable_unsynced--;
                assert(unstable_unsynced >= 0);
            }
        }
    }
    op->retval = 0;
@@ -21,7 +21,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
        dyn = calloc_or_die(1, dyn_size+sizeof(int));
        *((int*)dyn) = 1;
    }
    uint8_t *dyn_ptr = (uint8_t*)(alloc_dyn_data ? dyn+sizeof(int) : &dyn);
    uint8_t *dyn_ptr = (alloc_dyn_data ? (uint8_t*)dyn+sizeof(int) : (uint8_t*)&dyn);
    uint64_t version = 1;
    if (dirty_db.size() > 0)
    {
@@ -320,7 +320,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        blockstore_journal_check_t space_check(this);
        if (!space_check.check_available(op, unsynced_big_write_count + 1,
            sizeof(journal_entry_big_write) + dsk.clean_dyn_size,
            (dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION))
            (unstable_writes.size()+unstable_unsynced)*journal.block_size))
        {
            return 0;
        }
@@ -386,6 +386,10 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
            sqe, dsk.data_fd, PRIV(op)->iov_zerofill, vcnt, dsk.data_offset + (loc << dsk.block_order) + op->offset - stripe_offset
        );
        PRIV(op)->pending_ops = 1;
        if (immediate_commit != IMMEDIATE_ALL && !(dirty_it->second.state & BS_ST_INSTANT))
        {
            unstable_unsynced++;
        }
        if (immediate_commit != IMMEDIATE_ALL)
        {
            // Increase the counter, but don't save into unsynced_writes yet (can't sync until the write is finished)
@@ -408,7 +412,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
            sizeof(journal_entry_big_write) + dsk.clean_dyn_size, 0)
            || !space_check.check_available(op, 1,
                sizeof(journal_entry_small_write) + dyn_size,
                op->len + ((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
                (unstable_writes.size()+unstable_unsynced)*journal.block_size))
        {
            return 0;
        }
@@ -499,6 +503,11 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        if (journal.next_free >= journal.len)
        {
            journal.next_free = dsk.journal_block_size;
            assert(journal.next_free != journal.used_start);
        }
        if (immediate_commit == IMMEDIATE_NONE && !(dirty_it->second.state & BS_ST_INSTANT))
        {
            unstable_unsynced++;
        }
        if (!PRIV(op)->pending_ops)
        {
@@ -538,7 +547,7 @@ resume_2:
        uint64_t dyn_size = dsk.dirty_dyn_size(op->offset, op->len);
        blockstore_journal_check_t space_check(this);
        if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
            ((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
            (unstable_writes.size()+unstable_unsynced)*journal.block_size))
        {
            return 0;
        }
@@ -582,14 +591,20 @@ resume_4:
#endif
    bool is_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE;
    bool imm = is_big ? (immediate_commit == IMMEDIATE_ALL) : (immediate_commit != IMMEDIATE_NONE);
    bool is_instant = ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT));
    if (imm)
    {
        auto & unstab = unstable_writes[op->oid];
        unstab = unstab < op->version ? op->version : unstab;
    }
    else if (!is_instant)
    {
        unstable_unsynced--;
        assert(unstable_unsynced >= 0);
    }
    dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK)
        | (imm ? BS_ST_SYNCED : BS_ST_WRITTEN);
    if (imm && ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT)))
    if (imm && is_instant)
    {
        // Deletions and 'instant' operations are treated as immediately stable
        mark_stable(dirty_it->first);
@@ -735,7 +750,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
    });
    assert(dirty_it != dirty_db.end());
    blockstore_journal_check_t space_check(this);
    if (!space_check.check_available(op, 1, sizeof(journal_entry_del), JOURNAL_INSTANT_RESERVATION))
    if (!space_check.check_available(op, 1, sizeof(journal_entry_del), (unstable_writes.size()+unstable_unsynced)*journal.block_size))
    {
        return 0;
    }
@@ -17,7 +17,7 @@
static const char *exe_name = NULL;

static const char* help_text =
    "Vitastor command-line tool\n"
    "Vitastor command-line tool " VERSION "\n"
    "(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
    "\n"
    "COMMANDS:\n"
@@ -331,7 +331,7 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
{
    // Create client
    json11::Json cfg_j = cfg;
    p->ringloop = new ring_loop_t(512);
    p->ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
    p->epmgr = new epoll_manager_t(p->ringloop);
    p->cli = new cluster_client_t(p->ringloop, p->epmgr->tfd, cfg_j);
    // Smaller timeout by default for more interactiveness
@@ -109,7 +109,7 @@ resume_1:
    }
    for (auto pg_per_pair: pg_per_osd)
    {
        uint64_t pg_free = osd_free[pg_per_pair.first] * pool_cfg.pg_count / pg_per_pair.second;
        uint64_t pg_free = osd_free[pg_per_pair.first] * pool_cfg.real_pg_count / pg_per_pair.second;
        if (pool_avail > pg_free)
        {
            pool_avail = pg_free;
@@ -124,8 +124,10 @@ resume_1:
        pool_avail *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
    }
    pool_stats[pool_cfg.id] = json11::Json::object {
        { "id", (uint64_t)pool_cfg.id },
        { "name", pool_cfg.name },
        { "pg_count", pool_cfg.pg_count },
        { "real_pg_count", pool_cfg.real_pg_count },
        { "scheme", pool_cfg.scheme == POOL_SCHEME_REPLICATED ? "replicated" : "ec" },
        { "scheme_name", pool_cfg.scheme == POOL_SCHEME_REPLICATED
            ? std::to_string(pool_cfg.pg_size)+"/"+std::to_string(pool_cfg.pg_minsize)
@@ -176,7 +178,7 @@ resume_1:
        { "title", "SCHEME" },
    });
    cols.push_back(json11::Json::object{
        { "key", "pg_count" },
        { "key", "pg_count_fmt" },
        { "title", "PGS" },
    });
    cols.push_back(json11::Json::object{
@@ -205,6 +207,9 @@ resume_1:
        double raw_to = kv.second["raw_to_usable"].number_value();
        if (raw_to < 0.000001 && raw_to > -0.000001)
            raw_to = 1;
        kv.second["pg_count_fmt"] = kv.second["real_pg_count"] == kv.second["pg_count"]
            ? kv.second["real_pg_count"].as_string()
            : kv.second["real_pg_count"].as_string()+"->"+kv.second["pg_count"].as_string();
        kv.second["total_fmt"] = format_size(kv.second["total_raw"].uint64_value() / raw_to);
        kv.second["used_fmt"] = format_size(kv.second["used_raw"].uint64_value() / raw_to);
        kv.second["max_avail_fmt"] = format_size(kv.second["max_available"].uint64_value());
@@ -158,12 +158,7 @@ resume_2:
    for (auto & pool_pair: parent->cli->st_cli.pool_config)
    {
        auto & pool_cfg = pool_pair.second;
        bool active = true;
        if (pool_cfg.pg_config.size() != pool_cfg.pg_count)
        {
            active = false;
            pgs_by_state["offline"] += pool_cfg.pg_count-pool_cfg.pg_config.size();
        }
        bool active = pool_cfg.real_pg_count > 0;
        pool_count++;
        for (auto pg_it = pool_cfg.pg_config.begin(); pg_it != pool_cfg.pg_config.end(); pg_it++)
        {
@@ -6,7 +6,7 @@
#include "cluster_client_impl.h"
#include "http_client.h" // json_is_true

cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json config)
{
    wb = new writeback_cache_t();

@@ -64,7 +64,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd

cluster_client_t::~cluster_client_t()
{
    msgr.repeer_pgs = [this](osd_num_t){};
    msgr.repeer_pgs = [](osd_num_t){};
    if (ringloop)
    {
        ringloop->unregister_consumer(&consumer);
@@ -454,7 +454,7 @@ bool cluster_client_t::flush()
        wb->start_writebacks(this, 0);
        cluster_op_t *sync = new cluster_op_t;
        sync->opcode = OSD_OP_SYNC;
        sync->callback = [this](cluster_op_t *sync)
        sync->callback = [](cluster_op_t *sync)
        {
            delete sync;
        };
@@ -465,7 +465,7 @@ bool cluster_client_t::flush()
    bool sync_done = false;
    cluster_op_t *sync = new cluster_op_t;
    sync->opcode = OSD_OP_SYNC;
    sync->callback = [this, &sync_done](cluster_op_t *sync)
    sync->callback = [&sync_done](cluster_op_t *sync)
    {
        delete sync;
        sync_done = true;
@@ -532,7 +532,7 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
        return;
    }
    if (op->opcode == OSD_OP_WRITE && enable_writeback && !(op->flags & OP_FLUSH_BUFFER) &&
        !op->version /* FIXME no CAS writeback */)
        !op->version /* no CAS writeback */)
    {
        if (wb->writebacks_active >= client_max_writeback_iodepth)
        {
@@ -553,7 +553,7 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
    }
    if (op->opcode == OSD_OP_WRITE && !(op->flags & OP_IMMEDIATE_COMMIT))
    {
        if (!(op->flags & OP_FLUSH_BUFFER))
        if (!(op->flags & OP_FLUSH_BUFFER) && !op->version /* no CAS write-repeat */)
        {
            wb->copy_write(op, CACHE_WRITTEN);
        }
@@ -1152,7 +1152,7 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
            osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
        );
    }
    else
    else if (log_level > 0)
    {
        fprintf(
            stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d)\n",
@@ -121,7 +121,7 @@ public:
    json11::Json::object cli_config, file_config, etcd_global_config;
    json11::Json::object config;

    cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
    cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json config);
    ~cluster_client_t();
    void execute(cluster_op_t *op);
    void execute_raw(osd_num_t osd_num, osd_op_t *op);
@@ -263,7 +263,7 @@ void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from
    }
    assert(calc_len == op->len);
    writebacks_active++;
    op->callback = [this, cli, flush_id](cluster_op_t* op)
    op->callback = [this, flush_id](cluster_op_t* op)
    {
        // Buffer flushes should be always retried, regardless of the error,
        // so they should never result in an error here
@@ -383,7 +383,7 @@ static void copy_to_op(cluster_op_t *op, uint64_t offset, uint8_t *buf, uint64_t
        auto begin = (cur_offset < offset ? offset : cur_offset);
        auto end = (cur_offset+v.iov_len > offset+len ? offset+len : cur_offset+v.iov_len);
        memcpy(
            v.iov_base + begin - cur_offset,
            (uint8_t*)v.iov_base + begin - cur_offset,
            buf + (cur_offset <= offset ? 0 : cur_offset-offset),
            end - begin
        );
@@ -5,7 +5,7 @@
#include "str_util.h"

static const char *help_text =
    "Vitastor disk management tool\n"
    "Vitastor disk management tool " VERSION "\n"
    "(c) Vitaliy Filippov, 2022+ (VNPL-1.1)\n"
    "\n"
    "COMMANDS:\n"
@@ -229,7 +229,7 @@ int main(int argc, char *argv[])
        {
            self.options["allow_data_loss"] = "1";
        }
        else if (argv[i][0] == '-' && argv[i][1] == '-')
        else if (argv[i][0] == '-' && argv[i][1] == '-' && i < argc-1)
        {
            char *key = argv[i]+2;
            self.options[key] = argv[++i];
@@ -320,7 +320,7 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
        if (journal_calc_data_pos != sw.data_offset)
        {
            printf(json ? ",\"bad_loc\":true,\"calc_loc\":\"0x%lx\""
                : " (mismatched, calculated = %lu)", journal_pos);
                : " (mismatched, calculated = %08lx)", journal_pos);
        }
        uint32_t data_csum_size = (!je_start.csum_block_size
            ? 0
@@ -245,7 +245,7 @@ int disk_tool_t::resize_copy_data()
    {
        iodepth = 32;
    }
    ringloop = new ring_loop_t(iodepth < 512 ? 512 : iodepth);
    ringloop = new ring_loop_t(iodepth < RINGLOOP_DEFAULT_SIZE ? RINGLOOP_DEFAULT_SIZE : iodepth);
    dsk.data_fd = open(dsk.data_device.c_str(), O_DIRECT|O_RDWR);
    if (dsk.data_fd < 0)
    {
@@ -130,7 +130,7 @@ static int bs_init(struct thread_data *td)
            config[p.first] = p.second.dump();
        }
    }
    bsd->ringloop = new ring_loop_t(512);
    bsd->ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
    bsd->epmgr = new epoll_manager_t(bsd->ringloop);
    bsd->bs = new blockstore_t(config, bsd->ringloop, bsd->epmgr->tfd);
    while (1)
401 src/kv_cli.cpp (new file)
@@ -0,0 +1,401 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
//
// Vitastor shared key/value database test CLI

#define _XOPEN_SOURCE
#include <limits.h>

#include <netinet/tcp.h>
#include <sys/epoll.h>
#include <unistd.h>
#include <fcntl.h>
//#include <signal.h>

#include "epoll_manager.h"
#include "str_util.h"
#include "kv_db.h"

const char *exe_name = NULL;

class kv_cli_t
{
public:
    kv_dbw_t *db = NULL;
    ring_loop_t *ringloop = NULL;
    epoll_manager_t *epmgr = NULL;
    cluster_client_t *cli = NULL;
    bool interactive = false;
    int in_progress = 0;
    char *cur_cmd = NULL;
    int cur_cmd_size = 0, cur_cmd_alloc = 0;
    bool finished = false, eof = false;
    json11::Json::object cfg;

    ~kv_cli_t();

    static json11::Json::object parse_args(int narg, const char *args[]);
    void run(const json11::Json::object & cfg);
    void read_cmd();
    void next_cmd();
    void handle_cmd(const std::string & cmd, std::function<void()> cb);
};

kv_cli_t::~kv_cli_t()
{
    if (cur_cmd)
    {
        free(cur_cmd);
        cur_cmd = NULL;
    }
    cur_cmd_alloc = 0;
    if (db)
        delete db;
    if (cli)
    {
        cli->flush();
        delete cli;
    }
    if (epmgr)
        delete epmgr;
    if (ringloop)
        delete ringloop;
}

json11::Json::object kv_cli_t::parse_args(int narg, const char *args[])
{
    json11::Json::object cfg;
    for (int i = 1; i < narg; i++)
    {
        if (!strcmp(args[i], "-h") || !strcmp(args[i], "--help"))
        {
            printf(
                "Vitastor Key/Value CLI\n"
                "(c) Vitaliy Filippov, 2023+ (VNPL-1.1)\n"
                "\n"
                "USAGE: %s [--etcd_address ADDR] [OTHER OPTIONS]\n",
                exe_name
            );
            exit(0);
        }
        else if (args[i][0] == '-' && args[i][1] == '-')
        {
            const char *opt = args[i]+2;
            cfg[opt] = !strcmp(opt, "json") || i == narg-1 ? "1" : args[++i];
        }
    }
    return cfg;
}

void kv_cli_t::run(const json11::Json::object & cfg)
{
    // Create client
    ringloop = new ring_loop_t(512);
    epmgr = new epoll_manager_t(ringloop);
    cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
    db = new kv_dbw_t(cli);
    // Wait until the client is ready
    while (!cli->is_ready())
    {
        ringloop->loop();
        if (cli->is_ready())
            break;
        ringloop->wait();
    }
    // Run
    fcntl(0, F_SETFL, fcntl(0, F_GETFL, 0) | O_NONBLOCK);
    try
    {
        epmgr->tfd->set_fd_handler(0, false, [this](int fd, int events)
        {
            if (events & EPOLLIN)
            {
                read_cmd();
            }
            if (events & EPOLLRDHUP)
            {
                epmgr->tfd->set_fd_handler(0, false, NULL);
                finished = true;
            }
        });
        interactive = true;
        printf("> ");
    }
    catch (std::exception & e)
    {
        // Can't add to epoll, STDIN is probably a file
        read_cmd();
    }
    while (!finished)
    {
        ringloop->loop();
        if (!finished)
            ringloop->wait();
    }
    // Destroy the client
    delete db;
    db = NULL;
    cli->flush();
    delete cli;
    delete epmgr;
    delete ringloop;
    cli = NULL;
    epmgr = NULL;
    ringloop = NULL;
}

void kv_cli_t::read_cmd()
{
    if (!cur_cmd_alloc)
    {
        cur_cmd_alloc = 65536;
        cur_cmd = (char*)malloc_or_die(cur_cmd_alloc);
    }
    while (cur_cmd_size < cur_cmd_alloc)
    {
        int r = read(0, cur_cmd+cur_cmd_size, cur_cmd_alloc-cur_cmd_size);
        if (r < 0 && errno != EAGAIN)
            fprintf(stderr, "Error reading from stdin: %s\n", strerror(errno));
        if (r > 0)
            cur_cmd_size += r;
        if (r == 0)
            eof = true;
        if (r <= 0)
            break;
    }
    next_cmd();
}

void kv_cli_t::next_cmd()
{
    if (in_progress > 0)
    {
        return;
    }
    int pos = 0;
    for (; pos < cur_cmd_size; pos++)
    {
        if (cur_cmd[pos] == '\n' || cur_cmd[pos] == '\r')
        {
            auto cmd = trim(std::string(cur_cmd, pos));
            pos++;
            memmove(cur_cmd, cur_cmd+pos, cur_cmd_size-pos);
            cur_cmd_size -= pos;
            in_progress++;
            handle_cmd(cmd, [this]()
            {
                in_progress--;
                if (interactive)
                    printf("> ");
                next_cmd();
                if (!in_progress)
                    read_cmd();
            });
            break;
        }
    }
    if (eof && !in_progress)
    {
        finished = true;
    }
}

void kv_cli_t::handle_cmd(const std::string & cmd, std::function<void()> cb)
{
    if (cmd == "")
    {
        cb();
        return;
    }
    auto pos = cmd.find_first_of(" \t");
    if (pos != std::string::npos)
    {
        while (pos < cmd.size()-1 && (cmd[pos+1] == ' ' || cmd[pos+1] == '\t'))
            pos++;
    }
    auto opname = strtolower(pos == std::string::npos ? cmd : cmd.substr(0, pos));
    if (opname == "open")
    {
        uint64_t pool_id = 0;
        inode_t inode_id = 0;
        uint32_t kv_block_size = 0;
        int scanned = sscanf(cmd.c_str() + pos+1, "%lu %lu %u", &pool_id, &inode_id, &kv_block_size);
        if (scanned == 2)
        {
            kv_block_size = 4096;
        }
        if (scanned < 2 || !pool_id || !inode_id || !kv_block_size || (kv_block_size & (kv_block_size-1)) != 0)
        {
            fprintf(stderr, "Usage: open <pool_id> <inode_id> [block_size]. Block size must be a power of 2. Default is 4096.\n");
            cb();
            return;
        }
        cfg["kv_block_size"] = (uint64_t)kv_block_size;
        db->open(INODE_WITH_POOL(pool_id, inode_id), cfg, [=](int res)
        {
            if (res < 0)
                fprintf(stderr, "Error opening index: %s (code %d)\n", strerror(-res), res);
            else
                printf("Index opened. Current size: %lu bytes\n", db->get_size());
            cb();
        });
    }
    else if (opname == "config")
    {
        auto pos2 = cmd.find_first_of(" \t", pos+1);
        if (pos2 == std::string::npos)
        {
            fprintf(stderr, "Usage: config <property> <value>\n");
            cb();
            return;
        }
        auto key = trim(cmd.substr(pos+1, pos2-pos-1));
        auto value = parse_size(trim(cmd.substr(pos2+1)));
        if (key != "kv_memory_limit" &&
            key != "kv_allocate_blocks" &&
            key != "kv_evict_max_misses" &&
            key != "kv_evict_attempts_per_level" &&
            key != "kv_evict_unused_age" &&
            key != "kv_log_level")
        {
            fprintf(
                stderr, "Allowed properties: kv_memory_limit, kv_allocate_blocks,"
                " kv_evict_max_misses, kv_evict_attempts_per_level, kv_evict_unused_age, kv_log_level\n"
            );
        }
        else
        {
            cfg[key] = value;
            db->set_config(cfg);
        }
        cb();
    }
    else if (opname == "get" || opname == "set" || opname == "del")
    {
        if (opname == "get" || opname == "del")
        {
            if (pos == std::string::npos)
            {
                fprintf(stderr, "Usage: %s <key>\n", opname.c_str());
                cb();
                return;
            }
            auto key = trim(cmd.substr(pos+1));
            if (opname == "get")
            {
                db->get(key, [this, cb](int res, const std::string & value)
                {
                    if (res < 0)
                        fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
                    else
                    {
                        write(1, value.c_str(), value.size());
                        write(1, "\n", 1);
                    }
                    cb();
                });
            }
            else
            {
                db->del(key, [this, cb](int res)
                {
                    if (res < 0)
                        fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
                    else
                        printf("OK\n");
                    cb();
                });
            }
        }
        else
        {
            auto pos2 = cmd.find_first_of(" \t", pos+1);
            if (pos2 == std::string::npos)
            {
                fprintf(stderr, "Usage: set <key> <value>\n");
                cb();
                return;
            }
            auto key = trim(cmd.substr(pos+1, pos2-pos-1));
            auto value = trim(cmd.substr(pos2+1));
            db->set(key, value, [this, cb](int res)
            {
                if (res < 0)
                    fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
                else
                    printf("OK\n");
                cb();
            });
        }
    }
    else if (opname == "list")
    {
        std::string start, end;
        if (pos != std::string::npos)
        {
            auto pos2 = cmd.find_first_of(" \t", pos+1);
            if (pos2 != std::string::npos)
            {
                start = trim(cmd.substr(pos+1, pos2-pos-1));
                end = trim(cmd.substr(pos2+1));
            }
            else
            {
                start = trim(cmd.substr(pos+1));
            }
        }
        void *handle = db->list_start(start);
        db->list_next(handle, [=](int res, const std::string & key, const std::string & value)
        {
            if (res < 0)
            {
                if (res != -ENOENT)
                {
                    fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
                }
                db->list_close(handle);
                cb();
            }
            else
            {
                printf("%s = %s\n", key.c_str(), value.c_str());
                db->list_next(handle, NULL);
            }
        });
    }
    else if (opname == "close")
    {
        db->close([=]()
        {
            printf("Index closed\n");
            cb();
        });
    }
    else if (opname == "quit" || opname == "q")
    {
        ::close(0);
        finished = true;
    }
    else
    {
        fprintf(
            stderr, "Unknown operation: %s. Supported operations:\n"
            "open <pool_id> <inode_id> [block_size]\n"
            "config <property> <value>\n"
            "get <key>\nset <key> <value>\ndel <key>\nlist [<start> [end]]\n"
            "close\nquit\n", opname.c_str()
        );
        cb();
    }
}

int main(int narg, const char *args[])
{
    setvbuf(stdout, NULL, _IONBF, 0);
    setvbuf(stderr, NULL, _IONBF, 0);
    exe_name = args[0];
    kv_cli_t *p = new kv_cli_t();
    p->run(kv_cli_t::parse_args(narg, args));
    delete p;
    return 0;
}
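An example interactive session with this CLI, assembled from the commands and messages defined in handle_cmd() above (the etcd address, pool/inode numbers and the reported index size are made-up illustrations):

```
$ vitastor-kv --etcd_address 10.0.0.1:2379
> open 1 123 4096
Index opened. Current size: 0 bytes
> set mykey myvalue
OK
> get mykey
myvalue
> del mykey
OK
> close
Index closed
> quit
```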
2064 src/kv_db.cpp (new file). File diff suppressed because it is too large.
36 src/kv_db.h (new file)
@@ -0,0 +1,36 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
//
// Vitastor shared key/value database
// Parallel optimistic B-Tree O:-)

#pragma once

#include "cluster_client.h"

struct kv_db_t;

struct kv_dbw_t
{
    kv_dbw_t(cluster_client_t *cli);
    ~kv_dbw_t();

    void open(inode_t inode_id, json11::Json cfg, std::function<void(int)> cb);
    void set_config(json11::Json cfg);
    void close(std::function<void()> cb);

    uint64_t get_size();

    void get(const std::string & key, std::function<void(int res, const std::string & value)> cb,
        bool allow_old_cached = false);
    void set(const std::string & key, const std::string & value, std::function<void(int res)> cb,
        std::function<bool(int res, const std::string & value)> cas_compare = NULL);
    void del(const std::string & key, std::function<void(int res)> cb,
        std::function<bool(int res, const std::string & value)> cas_compare = NULL);

    void* list_start(const std::string & start);
    void list_next(void *handle, std::function<void(int res, const std::string & key, const std::string & value)> cb);
    void list_close(void *handle);

    kv_db_t *db;
};
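A minimal usage sketch of the wrapper declared above, using only the methods from this header (assumes an already-connected cluster_client_t; INODE_WITH_POOL comes from the client headers, as used by kv_cli.cpp; error handling beyond the result codes is omitted):

```
#include "kv_db.h"

// Open the database stored in inode 123 of pool 1, write one key, read it
// back, then close. All kv_dbw_t operations are asynchronous, so the steps
// are chained through callbacks.
void kv_dbw_demo(cluster_client_t *cli)
{
    auto db = new kv_dbw_t(cli);
    db->open(INODE_WITH_POOL(1, 123), json11::Json::object{ { "kv_block_size", 4096 } }, [=](int res)
    {
        if (res < 0)
            return; // failed to open the index
        db->set("hello", "world", [=](int res)
        {
            db->get("hello", [=](int res, const std::string & value)
            {
                // on success res == 0 and value == "world"
                db->close([=]() { delete db; });
            });
        });
    });
}
```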
697 src/kv_stress.cpp (new file)
@@ -0,0 +1,697 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
//
|
||||
// Vitastor shared key/value database stress tester / benchmark
|
||||
|
||||
#define _XOPEN_SOURCE
|
||||
#include <limits.h>
|
||||
|
||||
#include <netinet/tcp.h>
|
||||
#include <sys/epoll.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
//#include <signal.h>
|
||||
|
||||
#include "epoll_manager.h"
|
||||
#include "str_util.h"
|
||||
#include "kv_db.h"
|
||||
|
||||
const char *exe_name = NULL;
|
||||
|
||||
struct kv_test_listing_t
|
||||
{
|
||||
uint64_t count = 0, done = 0;
|
||||
void *handle = NULL;
|
||||
std::string next_after;
|
||||
std::set<std::string> inflights;
|
||||
timespec tv_begin;
|
||||
bool error = false;
|
||||
};
|
||||
|
||||
struct kv_test_lat_t
|
||||
{
|
||||
const char *name = NULL;
|
||||
uint64_t usec = 0, count = 0;
|
||||
};
|
||||
|
||||
struct kv_test_stat_t
|
||||
{
|
||||
kv_test_lat_t get, add, update, del, list;
|
||||
uint64_t list_keys = 0;
|
||||
};

class kv_test_t
{
public:
    // Config
    json11::Json::object kv_cfg;
    std::string key_prefix, key_suffix;
    uint64_t inode_id = 0;
    uint64_t op_count = 1000000;
    uint64_t runtime_sec = 0;
    uint64_t parallelism = 4;
    uint64_t reopen_prob = 1;
    uint64_t get_prob = 30000;
    uint64_t add_prob = 20000;
    uint64_t update_prob = 20000;
    uint64_t del_prob = 5000;
    uint64_t list_prob = 300;
    uint64_t min_key_len = 10;
    uint64_t max_key_len = 70;
    uint64_t min_value_len = 50;
    uint64_t max_value_len = 300;
    uint64_t min_list_count = 10;
    uint64_t max_list_count = 1000;
    uint64_t print_stats_interval = 1;
    bool json_output = false;
    uint64_t log_level = 1;
    bool trace = false;
    bool stop_on_error = false;
    // FIXME: Multiple clients
    kv_test_stat_t stat, prev_stat;
    timespec prev_stat_time, start_stat_time;

    // State
    kv_dbw_t *db = NULL;
    ring_loop_t *ringloop = NULL;
    epoll_manager_t *epmgr = NULL;
    cluster_client_t *cli = NULL;
    ring_consumer_t consumer;
    bool finished = false;
    uint64_t total_prob = 0;
    uint64_t ops_sent = 0, ops_done = 0;
    int stat_timer_id = -1;
    int in_progress = 0;
    bool reopening = false;
    std::set<kv_test_listing_t*> listings;
    std::set<std::string> changing_keys;
    std::map<std::string, std::string> values;

    ~kv_test_t();

    static json11::Json::object parse_args(int narg, const char *args[]);
    void parse_config(json11::Json cfg);
    void run(json11::Json cfg);
    void loop();
    void print_stats(kv_test_stat_t & prev_stat, timespec & prev_stat_time);
    void print_total_stats();
    void start_change(const std::string & key);
    void stop_change(const std::string & key);
    void add_stat(kv_test_lat_t & stat, timespec tv_begin);
};

kv_test_t::~kv_test_t()
{
    if (db)
        delete db;
    if (cli)
    {
        cli->flush();
        delete cli;
    }
    if (epmgr)
        delete epmgr;
    if (ringloop)
        delete ringloop;
}

json11::Json::object kv_test_t::parse_args(int narg, const char *args[])
{
    json11::Json::object cfg;
    for (int i = 1; i < narg; i++)
    {
        if (!strcmp(args[i], "-h") || !strcmp(args[i], "--help"))
        {
            printf(
                "Vitastor Key/Value DB stress tester / benchmark\n"
                "(c) Vitaliy Filippov, 2023+ (VNPL-1.1)\n"
                "\n"
                "USAGE: %s --pool_id POOL_ID --inode_id INODE_ID [OPTIONS]\n"
                "  --op_count 1000000\n"
                "    Total operations to run during test. 0 means unlimited\n"
                "  --key_prefix \"\"\n"
                "    Prefix for all keys read or written (to avoid collisions)\n"
                "  --key_suffix \"\"\n"
                "    Suffix for all keys read or written (to avoid collisions, but scan all DB)\n"
                "  --runtime 0\n"
                "    Run for this number of seconds. 0 means unlimited\n"
                "  --parallelism 4\n"
                "    Run this number of operations in parallel\n"
                "  --get_prob 30000\n"
                "    Fraction of key retrieval operations\n"
                "  --add_prob 20000\n"
                "    Fraction of key addition operations\n"
                "  --update_prob 20000\n"
                "    Fraction of key update operations\n"
                "  --del_prob 5000\n"
                "    Fraction of key deletion operations\n"
                "  --list_prob 300\n"
                "    Fraction of listing operations\n"
                "  --min_key_len 10\n"
                "    Minimum key size in bytes\n"
                "  --max_key_len 70\n"
                "    Maximum key size in bytes\n"
                "  --min_value_len 50\n"
                "    Minimum value size in bytes\n"
                "  --max_value_len 300\n"
                "    Maximum value size in bytes\n"
                "  --min_list_count 10\n"
                "    Minimum number of keys read in listing (0 = all keys)\n"
                "  --max_list_count 1000\n"
                "    Maximum number of keys read in listing\n"
                "  --print_stats 1\n"
                "    Print operation statistics every this many seconds\n"
                "  --json\n"
                "    JSON output\n"
                "  --stop_on_error 0\n"
                "    Stop on first execution error, mismatch, lost key or extra key during listing\n"
                "  --kv_memory_limit 128M\n"
                "    Maximum memory to use for vitastor-kv index cache\n"
                "  --kv_allocate_blocks 4\n"
                "    Number of PG blocks used for new tree block allocation in parallel\n"
                "  --kv_evict_max_misses 10\n"
                "    Eviction algorithm parameter: retry eviction from another random spot\n"
                "    if this number of keys is used currently or was used recently\n"
                "  --kv_evict_attempts_per_level 3\n"
                "    Retry eviction at most this number of times per tree level, starting\n"
                "    with bottom-most levels\n"
                "  --kv_evict_unused_age 1000\n"
                "    Evict only keys unused during this number of last operations\n"
                "  --kv_log_level 1\n"
                "    Log level. 0 = errors, 1 = warnings, 10 = trace operations\n",
                exe_name
            );
            exit(0);
        }
        else if (args[i][0] == '-' && args[i][1] == '-')
        {
            const char *opt = args[i]+2;
            cfg[opt] = !strcmp(opt, "json") || i == narg-1 ? "1" : args[++i];
        }
    }
    return cfg;
}
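For orientation, a typical run combines a pool/inode target with the probability weights above; something like the following (the binary name and the pool/inode IDs here are hypothetical, chosen only for illustration — the file itself does not fix the installed name):

    vitastor-kv-stress --pool_id 1 --inode_id 123 --op_count 100000 --parallelism 16 --json

Note that the *_prob options are weights, not percentages: each operation is drawn against their running sum (see loop() below).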

void kv_test_t::parse_config(json11::Json cfg)
{
    inode_id = INODE_WITH_POOL(cfg["pool_id"].uint64_value(), cfg["inode_id"].uint64_value());
    if (cfg["op_count"].uint64_value() > 0)
        op_count = cfg["op_count"].uint64_value();
    key_prefix = cfg["key_prefix"].string_value();
    key_suffix = cfg["key_suffix"].string_value();
    if (cfg["runtime"].uint64_value() > 0)
        runtime_sec = cfg["runtime"].uint64_value();
    if (cfg["parallelism"].uint64_value() > 0)
        parallelism = cfg["parallelism"].uint64_value();
    if (!cfg["reopen_prob"].is_null())
        reopen_prob = cfg["reopen_prob"].uint64_value();
    if (!cfg["get_prob"].is_null())
        get_prob = cfg["get_prob"].uint64_value();
    if (!cfg["add_prob"].is_null())
        add_prob = cfg["add_prob"].uint64_value();
    if (!cfg["update_prob"].is_null())
        update_prob = cfg["update_prob"].uint64_value();
    if (!cfg["del_prob"].is_null())
        del_prob = cfg["del_prob"].uint64_value();
    if (!cfg["list_prob"].is_null())
        list_prob = cfg["list_prob"].uint64_value();
    if (!cfg["min_key_len"].is_null())
        min_key_len = cfg["min_key_len"].uint64_value();
    if (cfg["max_key_len"].uint64_value() > 0)
        max_key_len = cfg["max_key_len"].uint64_value();
    if (!cfg["min_value_len"].is_null())
        min_value_len = cfg["min_value_len"].uint64_value();
    if (cfg["max_value_len"].uint64_value() > 0)
        max_value_len = cfg["max_value_len"].uint64_value();
    if (!cfg["min_list_count"].is_null())
        min_list_count = cfg["min_list_count"].uint64_value();
    if (!cfg["max_list_count"].is_null())
        max_list_count = cfg["max_list_count"].uint64_value();
    if (!cfg["print_stats"].is_null())
        print_stats_interval = cfg["print_stats"].uint64_value();
    if (!cfg["json"].is_null())
        json_output = true;
    if (!cfg["stop_on_error"].is_null())
        stop_on_error = cfg["stop_on_error"].bool_value();
    if (!cfg["kv_memory_limit"].is_null())
        kv_cfg["kv_memory_limit"] = cfg["kv_memory_limit"];
    if (!cfg["kv_allocate_blocks"].is_null())
        kv_cfg["kv_allocate_blocks"] = cfg["kv_allocate_blocks"];
    if (!cfg["kv_evict_max_misses"].is_null())
        kv_cfg["kv_evict_max_misses"] = cfg["kv_evict_max_misses"];
    if (!cfg["kv_evict_attempts_per_level"].is_null())
        kv_cfg["kv_evict_attempts_per_level"] = cfg["kv_evict_attempts_per_level"];
    if (!cfg["kv_evict_unused_age"].is_null())
        kv_cfg["kv_evict_unused_age"] = cfg["kv_evict_unused_age"];
    if (!cfg["kv_log_level"].is_null())
    {
        log_level = cfg["kv_log_level"].uint64_value();
        trace = log_level >= 10;
        kv_cfg["kv_log_level"] = cfg["kv_log_level"];
    }
    total_prob = reopen_prob+get_prob+add_prob+update_prob+del_prob+list_prob;
    stat.get.name = "get";
    stat.add.name = "add";
    stat.update.name = "update";
    stat.del.name = "del";
    stat.list.name = "list";
}

void kv_test_t::run(json11::Json cfg)
{
    srand48(time(NULL));
    parse_config(cfg);
    // Create client
    ringloop = new ring_loop_t(512);
    epmgr = new epoll_manager_t(ringloop);
    cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
    db = new kv_dbw_t(cli);
    // Load image metadata
    while (!cli->is_ready())
    {
        ringloop->loop();
        if (cli->is_ready())
            break;
        ringloop->wait();
    }
    // Run
    reopening = true;
    db->open(inode_id, kv_cfg, [this](int res)
    {
        reopening = false;
        if (res < 0)
        {
            fprintf(stderr, "ERROR: Open index: %d (%s)\n", res, strerror(-res));
            exit(1);
        }
        if (trace)
            printf("Index opened\n");
        ringloop->wakeup();
    });
    consumer.loop = [this]() { loop(); };
    ringloop->register_consumer(&consumer);
    if (print_stats_interval)
        stat_timer_id = epmgr->tfd->set_timer(print_stats_interval*1000, true, [this](int) { print_stats(prev_stat, prev_stat_time); });
    clock_gettime(CLOCK_REALTIME, &start_stat_time);
    prev_stat_time = start_stat_time;
    while (!finished)
    {
        ringloop->loop();
        if (!finished)
            ringloop->wait();
    }
    if (stat_timer_id >= 0)
        epmgr->tfd->clear_timer(stat_timer_id);
    ringloop->unregister_consumer(&consumer);
    // Print total stats
    print_total_stats();
    // Destroy the client
    delete db;
    db = NULL;
    cli->flush();
    delete cli;
    delete epmgr;
    delete ringloop;
    cli = NULL;
    epmgr = NULL;
    ringloop = NULL;
}
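A note on the control flow here (a reading of the code, not new behaviour): ringloop->wait() parks the thread until an io_uring completion arrives, registered consumers such as loop() are re-run on every ringloop->loop() pass, and the ringloop->wakeup() calls in the completion callbacks force an extra pass so freshly freed parallelism slots are refilled promptly.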

static const char *base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789@+/";

std::string random_str(int len)
{
    std::string str;
    str.resize(len);
    for (int i = 0; i < len; i++)
    {
        str[i] = base64_chars[lrand48() % 64];
    }
    return str;
}

void kv_test_t::loop()
{
    if (reopening)
    {
        return;
    }
    if (ops_done >= op_count)
    {
        finished = true;
    }
    while (!finished && ops_sent < op_count && in_progress < parallelism)
    {
        uint64_t dice = (lrand48() % total_prob);
        if (dice < reopen_prob)
        {
            reopening = true;
            db->close([this]()
            {
                if (trace)
                    printf("Index closed\n");
                db->open(inode_id, kv_cfg, [this](int res)
                {
                    reopening = false;
                    if (res < 0)
                    {
                        fprintf(stderr, "ERROR: Reopen index: %d (%s)\n", res, strerror(-res));
                        finished = true;
                        return;
                    }
                    if (trace)
                        printf("Index reopened\n");
                    ringloop->wakeup();
                });
            });
            return;
        }
        else if (dice < reopen_prob+get_prob)
        {
            // get existing
            auto key = random_str(max_key_len);
            auto k_it = values.lower_bound(key);
            if (k_it == values.end())
                continue;
            key = k_it->first;
            if (changing_keys.find(key) != changing_keys.end())
                continue;
            in_progress++;
            ops_sent++;
            if (trace)
                printf("get %s\n", key.c_str());
            timespec tv_begin;
            clock_gettime(CLOCK_REALTIME, &tv_begin);
            db->get(key, [this, key, tv_begin](int res, const std::string & value)
            {
                add_stat(stat.get, tv_begin);
                ops_done++;
                in_progress--;
                auto it = values.find(key);
                if (res != (it == values.end() ? -ENOENT : 0))
                {
                    fprintf(stderr, "ERROR: get %s: %d (%s)\n", key.c_str(), res, strerror(-res));
                    if (stop_on_error)
                        exit(1);
                }
                else if (it != values.end() && value != it->second)
                {
                    fprintf(stderr, "ERROR: get %s: mismatch: %s vs %s\n", key.c_str(), value.c_str(), it->second.c_str());
                    if (stop_on_error)
                        exit(1);
                }
                ringloop->wakeup();
            });
        }
        else if (dice < reopen_prob+get_prob+add_prob+update_prob)
        {
            bool is_add = false;
            std::string key;
            if (dice < reopen_prob+get_prob+add_prob)
            {
                // add
                is_add = true;
                uint64_t key_len = min_key_len + (max_key_len > min_key_len ? lrand48() % (max_key_len-min_key_len) : 0);
                key = key_prefix + random_str(key_len) + key_suffix;
            }
            else
            {
                // update
                key = random_str(max_key_len);
                auto k_it = values.lower_bound(key);
                if (k_it == values.end())
                    continue;
                key = k_it->first;
            }
            if (changing_keys.find(key) != changing_keys.end())
                continue;
            uint64_t value_len = min_value_len + (max_value_len > min_value_len ? lrand48() % (max_value_len-min_value_len) : 0);
            auto value = random_str(value_len);
            start_change(key);
            ops_sent++;
            in_progress++;
            if (trace)
                printf("set %s = %s\n", key.c_str(), value.c_str());
            timespec tv_begin;
            clock_gettime(CLOCK_REALTIME, &tv_begin);
            db->set(key, value, [this, key, value, tv_begin, is_add](int res)
            {
                add_stat(is_add ? stat.add : stat.update, tv_begin);
                stop_change(key);
                ops_done++;
                in_progress--;
                if (res != 0)
                {
                    fprintf(stderr, "ERROR: set %s = %s: %d (%s)\n", key.c_str(), value.c_str(), res, strerror(-res));
                    if (stop_on_error)
                        exit(1);
                }
                else
                {
                    values[key] = value;
                }
                ringloop->wakeup();
            }, NULL);
        }
        else if (dice < reopen_prob+get_prob+add_prob+update_prob+del_prob)
        {
            // delete
            auto key = random_str(max_key_len);
            auto k_it = values.lower_bound(key);
            if (k_it == values.end())
                continue;
            key = k_it->first;
            if (changing_keys.find(key) != changing_keys.end())
                continue;
            start_change(key);
            ops_sent++;
            in_progress++;
            if (trace)
                printf("del %s\n", key.c_str());
            timespec tv_begin;
            clock_gettime(CLOCK_REALTIME, &tv_begin);
            db->del(key, [this, key, tv_begin](int res)
            {
                add_stat(stat.del, tv_begin);
                stop_change(key);
                ops_done++;
                in_progress--;
                if (res != 0)
                {
                    fprintf(stderr, "ERROR: del %s: %d (%s)\n", key.c_str(), res, strerror(-res));
                    if (stop_on_error)
                        exit(1);
                }
                else
                {
                    values.erase(key);
                }
                ringloop->wakeup();
            }, NULL);
        }
        else if (dice < reopen_prob+get_prob+add_prob+update_prob+del_prob+list_prob)
        {
            // list
            ops_sent++;
            in_progress++;
            auto key = random_str(max_key_len);
            auto lst = new kv_test_listing_t;
            auto k_it = values.lower_bound(key);
            lst->count = min_list_count + (max_list_count > min_list_count ? lrand48() % (max_list_count-min_list_count) : 0);
            lst->handle = db->list_start(k_it == values.begin() ? key_prefix : key);
            lst->next_after = k_it == values.begin() ? key_prefix : key;
            lst->inflights = changing_keys;
            listings.insert(lst);
            if (trace)
                printf("list from %s\n", key.c_str());
            clock_gettime(CLOCK_REALTIME, &lst->tv_begin);
            db->list_next(lst->handle, [this, lst](int res, const std::string & key, const std::string & value)
            {
                if (log_level >= 11)
                    printf("list: %s = %s\n", key.c_str(), value.c_str());
                if (res >= 0 && key_prefix.size() && (key.size() < key_prefix.size() ||
                    key.substr(0, key_prefix.size()) != key_prefix))
                {
                    // stop at this key
                    res = -ENOENT;
                }
                if (res < 0 || (lst->count > 0 && lst->done >= lst->count))
                {
                    add_stat(stat.list, lst->tv_begin);
                    if (res == 0)
                    {
                        // ok (done >= count)
                    }
                    else if (res != -ENOENT)
                    {
                        fprintf(stderr, "ERROR: list: %d (%s)\n", res, strerror(-res));
                        lst->error = true;
                    }
                    else
                    {
                        auto k_it = lst->next_after == "" ? values.begin() : values.upper_bound(lst->next_after);
                        while (k_it != values.end())
                        {
                            while (k_it != values.end() && lst->inflights.find(k_it->first) != lst->inflights.end())
                                k_it++;
                            if (k_it != values.end())
                            {
                                fprintf(stderr, "ERROR: list: missing key %s\n", (k_it++)->first.c_str());
                                lst->error = true;
                            }
                        }
                    }
                    if (lst->error && stop_on_error)
                        exit(1);
                    ops_done++;
                    in_progress--;
                    db->list_close(lst->handle);
                    // Erase before delete: using the pointer value of a freed object as a set key is unsafe
                    listings.erase(lst);
                    delete lst;
                    ringloop->wakeup();
                }
                else
                {
                    stat.list_keys++;
                    // Do not check modified keys in listing:
                    // listing may return their old or new state
                    if ((!key_suffix.size() || (key.size() >= key_suffix.size() &&
                        key.substr(key.size()-key_suffix.size()) == key_suffix)) &&
                        lst->inflights.find(key) == lst->inflights.end())
                    {
                        lst->done++;
                        auto k_it = lst->next_after == "" ? values.begin() : values.upper_bound(lst->next_after);
                        while (true)
                        {
                            while (k_it != values.end() && lst->inflights.find(k_it->first) != lst->inflights.end())
                            {
                                k_it++;
                            }
                            if (k_it == values.end() || k_it->first > key)
                            {
                                fprintf(stderr, "ERROR: list: extra key %s\n", key.c_str());
                                lst->error = true;
                                break;
                            }
                            else if (k_it->first < key)
                            {
                                fprintf(stderr, "ERROR: list: missing key %s\n", k_it->first.c_str());
                                lst->error = true;
                                lst->next_after = k_it->first;
                                k_it++;
                            }
                            else
                            {
                                if (k_it->second != value)
                                {
                                    fprintf(stderr, "ERROR: list: mismatch: %s = %s but should be %s\n",
                                        key.c_str(), value.c_str(), k_it->second.c_str());
                                    lst->error = true;
                                }
                                lst->next_after = k_it->first;
                                break;
                            }
                        }
                    }
                    db->list_next(lst->handle, NULL);
                }
            });
        }
    }
}
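Operation mix, for the record: dice is drawn uniformly from [0, total_prob) and compared against running sums of the weights, so the *_prob options are relative weights rather than percentages. With the defaults (reopen 1, get 30000, add 20000, update 20000, del 5000, list 300, total 75301), a get is chosen with probability 30000/75301 ≈ 39.8% and a listing with 300/75301 ≈ 0.4%.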

void kv_test_t::add_stat(kv_test_lat_t & stat, timespec tv_begin)
{
    timespec tv_end;
    clock_gettime(CLOCK_REALTIME, &tv_end);
    int64_t usec = (tv_end.tv_sec - tv_begin.tv_sec)*1000000 +
        (tv_end.tv_nsec - tv_begin.tv_nsec)/1000;
    if (usec > 0)
    {
        stat.usec += usec;
        stat.count++;
    }
}

void kv_test_t::print_stats(kv_test_stat_t & prev_stat, timespec & prev_stat_time)
{
    timespec cur_stat_time;
    clock_gettime(CLOCK_REALTIME, &cur_stat_time);
    int64_t usec = (cur_stat_time.tv_sec - prev_stat_time.tv_sec)*1000000 +
        (cur_stat_time.tv_nsec - prev_stat_time.tv_nsec)/1000;
    if (usec > 0)
    {
        kv_test_lat_t *lats[] = { &stat.get, &stat.add, &stat.update, &stat.del, &stat.list };
        kv_test_lat_t *prev[] = { &prev_stat.get, &prev_stat.add, &prev_stat.update, &prev_stat.del, &prev_stat.list };
        if (!json_output)
        {
            char buf[128] = { 0 };
            for (int i = 0; i < sizeof(lats)/sizeof(lats[0]); i++)
            {
                snprintf(buf, sizeof(buf)-1, "%.1f %s/s (%lu us)", (lats[i]->count-prev[i]->count)*1000000.0/usec,
                    lats[i]->name, (lats[i]->usec-prev[i]->usec)/(lats[i]->count-prev[i]->count > 0 ? lats[i]->count-prev[i]->count : 1));
                int k;
                for (k = strlen(buf); k < strlen(lats[i]->name)+21; k++)
                    buf[k] = ' ';
                buf[k] = 0;
                printf("%s", buf);
            }
            printf("\n");
        }
        else
        {
            int64_t runtime = (cur_stat_time.tv_sec - start_stat_time.tv_sec)*1000000 +
                (cur_stat_time.tv_nsec - start_stat_time.tv_nsec)/1000;
            printf("{\"runtime\":%.1f", (double)runtime/1000000.0);
            for (int i = 0; i < sizeof(lats)/sizeof(lats[0]); i++)
            {
                if (lats[i]->count > prev[i]->count)
                {
                    printf(
                        ",\"%s\":{\"avg\":{\"iops\":%.1f,\"usec\":%lu},\"total\":{\"count\":%lu,\"usec\":%lu}}",
                        lats[i]->name, (lats[i]->count-prev[i]->count)*1000000.0/usec,
                        (lats[i]->usec-prev[i]->usec)/(lats[i]->count-prev[i]->count),
                        lats[i]->count, lats[i]->usec
                    );
                }
            }
            printf("}\n");
        }
    }
    prev_stat = stat;
    prev_stat_time = cur_stat_time;
}

void kv_test_t::print_total_stats()
{
    if (!json_output)
        printf("Total:\n");
    kv_test_stat_t start_stats;
    timespec start_stat_time = this->start_stat_time;
    print_stats(start_stats, start_stat_time);
}

void kv_test_t::start_change(const std::string & key)
{
    changing_keys.insert(key);
    for (auto lst: listings)
    {
        lst->inflights.insert(key);
    }
}

void kv_test_t::stop_change(const std::string & key)
{
    changing_keys.erase(key);
}

int main(int narg, const char *args[])
{
    setvbuf(stdout, NULL, _IONBF, 0);
    setvbuf(stderr, NULL, _IONBF, 0);
    exe_name = args[0];
    kv_test_t *p = new kv_test_t();
    p->run(kv_test_t::parse_args(narg, args));
    delete p;
    return 0;
}

@@ -22,7 +22,7 @@ void osd_messenger_t::init()
    {
        rdma_context = msgr_rdma_context_t::create(
            rdma_device != "" ? rdma_device.c_str() : NULL,
            rdma_port_num, rdma_gid_index, rdma_mtu, log_level
            rdma_port_num, rdma_gid_index, rdma_mtu, rdma_odp, log_level
        );
        if (!rdma_context)
        {
@@ -167,6 +167,7 @@ void osd_messenger_t::parse_config(const json11::Json & config)
    this->rdma_max_msg = config["rdma_max_msg"].uint64_value();
    if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
        this->rdma_max_msg = 129*1024;
    this->rdma_odp = config["rdma_odp"].bool_value();
#endif
    this->receive_buffer_size = (uint32_t)config["tcp_header_buffer_size"].uint64_value();
    if (!this->receive_buffer_size || this->receive_buffer_size > 1024*1024*1024)
@@ -490,7 +491,14 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
            fprintf(stderr, "Connected to OSD %lu using RDMA\n", cl->osd_num);
        }
        cl->peer_state = PEER_RDMA;
        tfd->set_fd_handler(cl->peer_fd, false, NULL);
        tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
        {
            // Do not miss the disconnection!
            if (epoll_events & EPOLLRDHUP)
            {
                handle_peer_epoll(peer_fd, epoll_events);
            }
        });
        // Add the initial receive request
        try_recv_rdma(cl);
    }
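What the replaced set_fd_handler() call changes: once a peer switches to RDMA its TCP socket no longer carries data, but dropping the epoll handler entirely (the old NULL) meant a peer disconnect went unnoticed. The new handler keeps the fd registered solely to catch hangups. As a generic illustration of the event being watched (plain epoll, not the Vitastor API; epfd and peer_fd are placeholders):

    #include <sys/epoll.h>

    // EPOLLRDHUP is reported when the peer closes or shuts down its
    // writing half -- exactly the "disconnection" the handler reacts to.
    epoll_event ev = {};
    ev.events = EPOLLRDHUP;
    ev.data.fd = peer_fd;
    epoll_ctl(epfd, EPOLL_CTL_MOD, peer_fd, &ev);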

@@ -131,6 +131,7 @@ protected:
    msgr_rdma_context_t *rdma_context = NULL;
    uint64_t rdma_max_sge = 0, rdma_max_send = 0, rdma_max_recv = 0;
    uint64_t rdma_max_msg = 0;
    bool rdma_odp = false;
#endif

    std::vector<int> read_ready_clients;
@@ -197,7 +198,9 @@ protected:
    void handle_reply_ready(osd_op_t *op);

#ifdef WITH_RDMA
    bool try_send_rdma(osd_client_t *cl);
    void try_send_rdma(osd_client_t *cl);
    void try_send_rdma_odp(osd_client_t *cl);
    void try_send_rdma_nodp(osd_client_t *cl);
    bool try_recv_rdma(osd_client_t *cl);
    void handle_rdma_events();
#endif

@@ -47,11 +47,29 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
    if (qp)
        ibv_destroy_qp(qp);
    if (recv_buffers.size())
    {
        for (auto b: recv_buffers)
            free(b);
        {
            if (b.mr)
                ibv_dereg_mr(b.mr);
            free(b.buf);
        }
        recv_buffers.clear();
    }
    if (send_out.mr)
    {
        ibv_dereg_mr(send_out.mr);
        send_out.mr = NULL;
    }
    if (send_out.buf)
    {
        free(send_out.buf);
        send_out.buf = NULL;
    }
    send_out_size = 0;
}

msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, int log_level)
msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level)
{
    int res;
    ibv_device **dev_list = NULL;
@@ -136,21 +154,27 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
        fprintf(stderr, "Couldn't query RDMA device for its features\n");
        goto cleanup;
    }
    if (!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
    ctx->odp = odp;
    if (ctx->odp &&
        (!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
        !(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT_IMPLICIT) ||
        !(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_SEND) ||
        !(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV))
        !(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV)))
    {
        fprintf(stderr, "The RDMA device isn't implicit ODP (On-Demand Paging) capable or does not support RC send and receive with ODP\n");
        goto cleanup;
        ctx->odp = false;
        if (log_level > 0)
            fprintf(stderr, "The RDMA device isn't implicit ODP (On-Demand Paging) capable, disabling it\n");
    }
    }

    ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
    if (!ctx->mr)
    if (ctx->odp)
    {
        fprintf(stderr, "Couldn't register RDMA memory region\n");
        goto cleanup;
        ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
        if (!ctx->mr)
        {
            fprintf(stderr, "Couldn't register RDMA memory region\n");
            goto cleanup;
        }
    }

    ctx->channel = ibv_create_comp_channel(ctx->context);

@@ -365,12 +389,34 @@ static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
    cl->rdma_conn->cur_send++;
}

bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
static int try_send_rdma_copy(osd_client_t *cl, uint8_t *dst, int dst_len)
{
    auto rc = cl->rdma_conn;
    int total_dst_len = dst_len;
    while (dst_len > 0 && rc->send_pos < cl->send_list.size())
    {
        iovec & iov = cl->send_list[rc->send_pos];
        uint32_t len = (uint32_t)(iov.iov_len-rc->send_buf_pos < dst_len
            ? iov.iov_len-rc->send_buf_pos : dst_len);
        memcpy(dst, iov.iov_base+rc->send_buf_pos, len);
        dst += len;
        dst_len -= len;
        rc->send_buf_pos += len;
        if (rc->send_buf_pos >= iov.iov_len)
        {
            rc->send_pos++;
            rc->send_buf_pos = 0;
        }
    }
    return total_dst_len-dst_len;
}

void osd_messenger_t::try_send_rdma_odp(osd_client_t *cl)
{
    auto rc = cl->rdma_conn;
    if (!cl->send_list.size() || rc->cur_send >= rc->max_send)
    {
        return true;
        return;
    }
    uint64_t op_size = 0, op_sge = 0;
    ibv_sge sge[rc->max_sge];
@@ -408,15 +454,70 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
        rc->send_sizes.push_back(op_size);
        try_send_rdma_wr(cl, sge, op_sge);
    }
    return true;
}

static void try_recv_rdma_wr(osd_client_t *cl, void *buf)
void osd_messenger_t::try_send_rdma_nodp(osd_client_t *cl)
{
    auto rc = cl->rdma_conn;
    if (!rc->send_out_size)
    {
        // Allocate send ring buffer, if not yet
        rc->send_out_size = rc->max_msg*rdma_max_send;
        rc->send_out.buf = malloc_or_die(rc->send_out_size);
        if (!rdma_context->odp)
        {
            rc->send_out.mr = ibv_reg_mr(rdma_context->pd, rc->send_out.buf, rc->send_out_size, 0);
            if (!rc->send_out.mr)
            {
                fprintf(stderr, "Failed to register RDMA memory region: %s\n", strerror(errno));
                exit(1);
            }
        }
    }
    // Copy data into the buffer and send it
    uint8_t *dst = NULL;
    int dst_len = 0;
    int copied = 1;
    while (!rc->send_out_full && copied > 0 && rc->cur_send < rc->max_send)
    {
        dst = (uint8_t*)rc->send_out.buf + rc->send_out_pos;
        // Write up to the consumer position (send_done_pos) or, if not wrapped, up to the buffer end
        dst_len = (rc->send_out_pos < rc->send_done_pos ? rc->send_done_pos-rc->send_out_pos : rc->send_out_size-rc->send_out_pos);
        if (dst_len > rc->max_msg)
            dst_len = rc->max_msg;
        copied = try_send_rdma_copy(cl, dst, dst_len);
        if (copied > 0)
        {
            rc->send_out_pos += copied;
            if (rc->send_out_pos == rc->send_out_size)
                rc->send_out_pos = 0;
            assert(rc->send_out_pos < rc->send_out_size);
            // The ring is full when the producer catches up with the consumer
            if (rc->send_out_pos == rc->send_done_pos)
                rc->send_out_full = true;
            ibv_sge sge = {
                .addr = (uintptr_t)dst,
                .length = (uint32_t)copied,
                .lkey = rdma_context->odp ? rdma_context->mr->lkey : rc->send_out.mr->lkey,
            };
            try_send_rdma_wr(cl, &sge, 1);
            rc->send_sizes.push_back(copied);
        }
    }
}

void osd_messenger_t::try_send_rdma(osd_client_t *cl)
{
    if (rdma_context->odp)
        try_send_rdma_odp(cl);
    else
        try_send_rdma_nodp(cl);
}
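For readers of the non-ODP path above: send_out is a single producer/consumer ring per connection. try_send_rdma_nodp() produces contiguous chunks at send_out_pos, handle_rdma_events() frees them in completion order by advancing send_done_pos, and send_out_full blocks the producer when it laps the consumer. A standalone model of that invariant (a sketch with illustrative names, not Vitastor code):

    #include <assert.h>
    #include <stdint.h>

    // Model of the send ring: contiguous chunks are produced at out_pos
    // and freed at done_pos as send completions arrive, in order.
    struct send_ring
    {
        uint64_t size = 0, out_pos = 0, done_pos = 0;
        bool full = false;
        // Largest contiguous chunk writable right now
        uint64_t writable() const
        {
            if (full)
                return 0;
            return out_pos < done_pos ? done_pos-out_pos : size-out_pos;
        }
        void produce(uint64_t len)
        {
            assert(len > 0 && len <= writable());
            out_pos = (out_pos+len) % size;
            if (out_pos == done_pos)
                full = true;
        }
        void complete(uint64_t len) // called in completion order
        {
            done_pos = (done_pos+len) % size;
            full = false;
        }
    };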

static void try_recv_rdma_wr(osd_client_t *cl, msgr_rdma_buf_t b)
{
    ibv_sge sge = {
        .addr = (uintptr_t)buf,
        .addr = (uintptr_t)b.buf,
        .length = (uint32_t)cl->rdma_conn->max_msg,
        .lkey = cl->rdma_conn->ctx->mr->lkey,
        .lkey = cl->rdma_conn->ctx->odp ? cl->rdma_conn->ctx->mr->lkey : b.mr->lkey,
    };
    ibv_recv_wr *bad_wr = NULL;
    ibv_recv_wr wr = {
@@ -438,9 +539,19 @@ bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
    auto rc = cl->rdma_conn;
    while (rc->cur_recv < rc->max_recv)
    {
        void *buf = malloc_or_die(rc->max_msg);
        rc->recv_buffers.push_back(buf);
        try_recv_rdma_wr(cl, buf);
        msgr_rdma_buf_t b;
        b.buf = malloc_or_die(rc->max_msg);
        if (!rdma_context->odp)
        {
            b.mr = ibv_reg_mr(rdma_context->pd, b.buf, rc->max_msg, IBV_ACCESS_LOCAL_WRITE);
            if (!b.mr)
            {
                fprintf(stderr, "Failed to register RDMA memory region: %s\n", strerror(errno));
                exit(1);
            }
        }
        rc->recv_buffers.push_back(b);
        try_recv_rdma_wr(cl, b);
    }
    return true;
}
@@ -492,7 +603,7 @@ void osd_messenger_t::handle_rdma_events()
    if (!is_send)
    {
        rc->cur_recv--;
        if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf], wc[i].byte_len))
        if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
        {
            // handle_read_buffer may stop the client
            continue;
@@ -505,6 +616,14 @@ void osd_messenger_t::handle_rdma_events()
    rc->cur_send--;
    uint64_t sent_size = rc->send_sizes.at(0);
    rc->send_sizes.erase(rc->send_sizes.begin(), rc->send_sizes.begin()+1);
    if (!rdma_context->odp)
    {
        rc->send_done_pos += sent_size;
        rc->send_out_full = false;
        if (rc->send_done_pos == rc->send_out_size)
            rc->send_done_pos = 0;
        assert(rc->send_done_pos < rc->send_out_size);
    }
    int send_pos = 0, send_buf_pos = 0;
    while (sent_size > 0)
    {

@@ -23,6 +23,7 @@ struct msgr_rdma_context_t
    ibv_device *dev = NULL;
    ibv_device_attr_ex attrx;
    ibv_pd *pd = NULL;
    bool odp = false;
    ibv_mr *mr = NULL;
    ibv_comp_channel *channel = NULL;
    ibv_cq *cq = NULL;
@@ -35,10 +36,16 @@ struct msgr_rdma_context_t
    int max_cqe = 0;
    int used_max_cqe = 0;

    static msgr_rdma_context_t *create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, int log_level);
    static msgr_rdma_context_t *create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level);
    ~msgr_rdma_context_t();
};

struct msgr_rdma_buf_t
{
    void *buf = NULL;
    ibv_mr *mr = NULL;
};

struct msgr_rdma_connection_t
{
    msgr_rdma_context_t *ctx = NULL;
@@ -50,8 +57,11 @@ struct msgr_rdma_connection_t

    int send_pos = 0, send_buf_pos = 0;
    int next_recv_buf = 0;
    std::vector<void*> recv_buffers;
    std::vector<msgr_rdma_buf_t> recv_buffers;
    std::vector<uint64_t> send_sizes;
    msgr_rdma_buf_t send_out;
    int send_out_pos = 0, send_done_pos = 0, send_out_size = 0;
    bool send_out_full = false;

    ~msgr_rdma_connection_t();
    static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);
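The new msgr_rdma_buf_t exists so each buffer can carry its own memory region: with ODP a single implicitly-registered region covers everything and mr stays NULL, while without ODP every buffer is registered individually and must be deregistered before being freed. A hedged sketch of that per-buffer lifecycle (pd and len stand in for the context's protection domain and buffer size):

    msgr_rdma_buf_t b;
    b.buf = malloc(len);
    b.mr = ibv_reg_mr(pd, b.buf, len, IBV_ACCESS_LOCAL_WRITE);
    // ... post work requests using b.mr->lkey ...
    if (b.mr)
        ibv_dereg_mr(b.mr);
    free(b.buf);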

@@ -3,6 +3,7 @@

#define _XOPEN_SOURCE
#include <limits.h>
#include <sys/epoll.h>

#include "messenger.h"

@@ -119,9 +120,9 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
            try_send(cl);
        }
    }
    else if (cl->write_msg.msg_iovlen > 0 || !try_send(cl))
    else
    {
        if (cl->write_state == 0)
        if ((cl->write_msg.msg_iovlen > 0 || !try_send(cl)) && (cl->write_state == 0))
        {
            cl->write_state = CL_WRITE_READY;
            write_ready_clients.push_back(cur_op->peer_fd);
@@ -283,7 +284,14 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
            fprintf(stderr, "Successfully connected with client %d using RDMA\n", cl->peer_fd);
        }
        cl->peer_state = PEER_RDMA;
        tfd->set_fd_handler(cl->peer_fd, false, NULL);
        tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
        {
            // Do not miss the disconnection!
            if (epoll_events & EPOLLRDHUP)
            {
                handle_peer_epoll(peer_fd, epoll_events);
            }
        });
        // Add the initial receive request
        try_recv_rdma(cl);
    }
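The outbox_push() change is behaviour-preserving restructuring: as before, the client is marked CL_WRITE_READY only when a send is already pending (msg_iovlen > 0) or an immediate try_send() could not complete, and only if write_state was still 0. Folding both tests into one condition inside the else branch keeps the short-circuit order (try_send() is still attempted only when no send is pending) while making the precedence explicit.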

@@ -225,7 +225,7 @@ public:
        cfg = obj;
    }
    // Create client
    ringloop = new ring_loop_t(512);
    ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
    epmgr = new epoll_manager_t(ringloop);
    cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
    if (!inode)

@@ -124,7 +124,7 @@ void nfs_proxy_t::run(json11::Json cfg)
        cfg = obj;
    }
    // Create client
    ringloop = new ring_loop_t(512);
    ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
    epmgr = new epoll_manager_t(ringloop);
    cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
    cmd = new cli_tool_t();

@@ -541,11 +541,15 @@ void osd_t::print_slow()
    }
    else if (op->req.hdr.opcode == OSD_OP_SEC_STABILIZE || op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)
    {
        for (uint64_t i = 0; i < op->req.sec_stab.len; i += sizeof(obj_ver_id))
        for (uint64_t i = 0; i < op->req.sec_stab.len && i < sizeof(obj_ver_id)*12; i += sizeof(obj_ver_id))
        {
            obj_ver_id *ov = (obj_ver_id*)((uint8_t*)op->buf + i);
            bufprintf(i == 0 ? " %lx:%lx v%lu" : ", %lx:%lx v%lu", ov->oid.inode, ov->oid.stripe, ov->version);
        }
        if (op->req.sec_stab.len > sizeof(obj_ver_id)*12)
        {
            bufprintf(", ... (%lu items)", op->req.sec_stab.len/sizeof(obj_ver_id));
        }
    }
    else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
    {

@@ -19,6 +19,14 @@ static void handle_sigint(int sig)
    exit(0);
}

static const char* help_text =
    "Vitastor OSD (block object storage daemon) " VERSION "\n"
    "(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
    "\n"
    "OSDs are usually started by vitastor-disk.\n"
    "Manual usage: vitastor-osd [--option value] ...\n"
;

int main(int narg, char *args[])
{
    setvbuf(stdout, NULL, _IONBF, 0);
@@ -37,10 +45,20 @@ int main(int narg, char *args[])
            char *opt = args[i]+2;
            config[std::string(opt)] = std::string(args[++i]);
        }
        else if (!strcmp(args[i], "--help"))
        {
            printf("%s", help_text);
            return 0;
        }
    }
    if (!config.size())
    {
        printf("%s", help_text);
        return 1;
    }
    signal(SIGINT, handle_sigint);
    signal(SIGTERM, handle_sigint);
    ring_loop_t *ringloop = new ring_loop_t(512);
    ring_loop_t *ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
    osd = new osd_t(config, ringloop);
    while (1)
    {

@@ -239,8 +239,9 @@ static void* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size
{
    int edd = 0;
    int erased[pg_size];
    // we should distinguish stripes which are not read at all from missing stripes
    for (int i = 0; i < pg_size; i++)
        erased[i] = (stripes[i].read_end == 0 || stripes[i].missing ? 1 : 0);
        erased[i] = (stripes[i].read_end == 0 ? 2 : (stripes[i].missing ? 1 : 0));
    for (int i = 0; i < pg_minsize; i++)
        if (stripes[i].read_end != 0 && stripes[i].missing)
            edd++;
@@ -253,7 +254,7 @@ static void* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size
#ifdef WITH_ISAL
    int smrow = 0;
    uint8_t *submatrix = (uint8_t*)malloc_or_die(pg_minsize*pg_minsize*2);
    for (int i = 0; i < pg_size; i++)
    for (int i = 0; i < pg_size && smrow < pg_minsize; i++)
    {
        if (!erased[i])
        {
@@ -279,7 +280,7 @@ static void* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size
    smrow = 0;
    for (int i = 0; i < pg_minsize; i++)
    {
        if (erased[i])
        if (erased[i] == 1)
        {
            memcpy(submatrix + pg_minsize*smrow, submatrix + (pg_minsize+i)*pg_minsize, pg_minsize);
            smrow++;
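A reading of the erased[] change (explanatory note, not part of the patch): the array now distinguishes three stripe states — 0 = read and usable, 1 = missing and to be reconstructed, 2 = not read at all. Only stripes with erased[i] == 1 contribute recovery rows to the decoding submatrix, and the new smrow < pg_minsize bound stops collecting source rows once enough are found. For the 5+3 test added below (osd_set { 1, 2, 3, 0, 0, 6, 7, 8 }, only the 5th data block requested), the resulting array would be { 0, 0, 0, 2, 1, 0, 0, 2 }: block 4 is lost but unread (2), block 5 is lost and wanted (1).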

@@ -29,6 +29,7 @@ void test15(bool second);
void test16();
void test_recover_22_d2();
void test_ec43_error_bruteforce();
void test_recover_53_d5();

int main(int narg, char *args[])
{
@@ -67,6 +68,8 @@ int main(int narg, char *args[])
    test_recover_22_d2();
    // Error bruteforce
    test_ec43_error_bruteforce();
    // Test 19
    test_recover_53_d5();
    // End
    printf("all ok\n");
    return 0;
@@ -1112,7 +1115,7 @@ void test_recover_22_d2()

/***

EC 4+2 error location bruteforce
18. EC 4+2 error location bruteforce

***/

@@ -1178,3 +1181,66 @@ void test_ec43_error_bruteforce()
    free(write_buf);
    use_ec(7, 4, false);
}

/***

19. EC 5+3 recover 5th data block but not 4th

***/

void test_recover_53_d5()
{
    const int bmp = 128*1024 / 4096 / 8;
    use_ec(8, 5, true);
    osd_num_t osd_set[8] = { 1, 2, 3, 0, 0, 6, 7, 8 };
    osd_rmw_stripe_t stripes[8] = {};
    unsigned bitmaps[8] = { 0 };
    // Read 512+128K
    split_stripes(5, 128*1024, 512*1024, 128*1024, stripes);
    assert(stripes[0].req_start == 0 && stripes[0].req_end == 0);
    assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
    assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
    assert(stripes[4].req_start == 0 && stripes[4].req_end == 128*1024);
    uint8_t *data_buf = (uint8_t*)malloc_or_die(128*1024*8);
    for (int i = 0; i < 8; i++)
    {
        stripes[i].read_start = stripes[i].req_start;
        stripes[i].read_end = stripes[i].req_end;
        stripes[i].read_buf = data_buf + i*128*1024;
        stripes[i].bmp_buf = bitmaps + i;
    }
    // Read using parity
    assert(extend_missing_stripes(stripes, osd_set, 5, 8) == 0);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
    assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
    assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
    assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
    assert(stripes[4].read_start == 0 && stripes[4].read_end == 128*1024);
    assert(stripes[5].read_start == 0 && stripes[5].read_end == 128*1024);
    assert(stripes[6].read_start == 0 && stripes[6].read_end == 128*1024);
    assert(stripes[7].read_start == 0 && stripes[7].read_end == 0);
    bitmaps[0] = 0xffffffff;
    bitmaps[1] = 0xffffffff;
    bitmaps[2] = 0xffffffff;
    bitmaps[3] = 0;
    bitmaps[4] = 0;
    bitmaps[5] = 0xffffffff;
    bitmaps[6] = 0x64646464;
    bitmaps[7] = 0;
    set_pattern(stripes[0].read_buf, 128*1024, 0x70a549add9a2280a);
    set_pattern(stripes[1].read_buf, 128*1024, 0xa70a549add9a2280);
    set_pattern(stripes[2].read_buf, 128*1024, 0x0a70a549add9a228);
    set_pattern(stripes[3].read_buf, 128*1024, 0); // 0x80a70a549add9a22
    set_pattern(stripes[4].read_buf, 128*1024, 0); // 0x280a70a549add9a2
    set_pattern(stripes[5].read_buf, 128*1024, 0x7572c28f7a91eb22); // xor
    set_pattern(stripes[6].read_buf, 128*1024, 0xb4542b32a560fe26); // 2nd EC chunk
    set_pattern(stripes[7].read_buf, 128*1024, 0);
    // Reconstruct
    reconstruct_stripes_ec(stripes, 8, 5, bmp);
    check_pattern(stripes[4].read_buf, 128*1024, 0x280a70a549add9a2);
    assert(bitmaps[4] == 0xFFFFFFFF);
    free(data_buf);
    // Done
    use_ec(8, 5, false);
}
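What this test pins down: with EC 5+3 and data blocks 4 and 5 both lost (OSDs 4 and 5 absent from osd_set), a read touching only the 5th block must reconstruct it without also demanding the unread 4th block. Before the erased[] fix above, a not-read stripe and a missing stripe were conflated, so the decoder could build a wrong submatrix for exactly this case.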

@@ -17,7 +17,7 @@ ring_loop_t::ring_loop_t(int qd)
    {
        throw std::runtime_error(std::string("io_uring_queue_init: ") + strerror(-ret));
    }
    free_ring_data_ptr = *ring.cq.kring_entries;
    free_ring_data_ptr = *ring.sq.kring_entries;
    ring_datas = (struct ring_data_t*)calloc(free_ring_data_ptr, sizeof(ring_data_t));
    free_ring_data = (int*)malloc(sizeof(int) * free_ring_data_ptr);
    if (!ring_datas || !free_ring_data)

@@ -15,6 +15,8 @@
#include <functional>
#include <vector>

#define RINGLOOP_DEFAULT_SIZE 1024

static inline void my_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, const void *addr, unsigned len, off_t offset)
{
    // Prepare a read/write operation without clearing user_data
@@ -139,11 +141,9 @@ public:
        if (free_ring_data_ptr == 0)
            return NULL;
        struct io_uring_sqe* sqe = io_uring_get_sqe(&ring);
        if (sqe)
        {
            *sqe = { 0 };
            io_uring_sqe_set_data(sqe, ring_datas + free_ring_data[--free_ring_data_ptr]);
        }
        assert(sqe);
        *sqe = { 0 };
        io_uring_sqe_set_data(sqe, ring_datas + free_ring_data[--free_ring_data_ptr]);
        return sqe;
    }
    inline void set_immediate(const std::function<void()> cb)
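Why the NULL check could become an assert (explanatory note): free_ring_data_ptr is now sized from the submission queue (sq.kring_entries) instead of the completion queue, which io_uring typically makes larger. With the pool of ring_data_t slots exactly matching the number of SQEs, free_ring_data_ptr > 0 implies io_uring_get_sqe() must succeed, so a NULL there can only indicate a bookkeeping bug; the assert documents that invariant. RINGLOOP_DEFAULT_SIZE then replaces the hard-coded 512 queue depth at the call sites below.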

@@ -30,7 +30,7 @@ void stub_exec_op(osd_messenger_t *msgr, osd_op_t *op);
int main(int narg, char *args[])
{
    ring_consumer_t looper;
    ring_loop_t *ringloop = new ring_loop_t(512);
    ring_loop_t *ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
    epoll_manager_t *epmgr = new epoll_manager_t(ringloop);
    osd_messenger_t *msgr = new osd_messenger_t();
    msgr->osd_num = 1351;

@@ -11,7 +11,7 @@ int main(int narg, char *args[])
    config["meta_device"] = "./test_meta.bin";
    config["journal_device"] = "./test_journal.bin";
    config["data_device"] = "./test_data.bin";
    ring_loop_t *ringloop = new ring_loop_t(512);
    ring_loop_t *ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
    epoll_manager_t *epmgr = new epoll_manager_t(ringloop);
    blockstore_t *bs = new blockstore_t(config, ringloop, epmgr->tfd);

@@ -68,7 +68,7 @@ int main(int narg, char *args[])
        | cfg["inode_id"].uint64_value();
    uint64_t base_ver = 0;
    // Create client
    auto ringloop = new ring_loop_t(512);
    auto ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
    auto epmgr = new epoll_manager_t(ringloop);
    auto cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
    cli->on_ready([&]()

@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@

Name: Vitastor
Description: Vitastor client library
Version: 1.1.0
Version: 1.2.0
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}

@@ -114,7 +114,7 @@ vitastor_c *vitastor_c_create_qemu_uring(QEMUSetFDHandler *aio_set_fd_handler, v
    ring_loop_t *ringloop = NULL;
    try
    {
        ringloop = new ring_loop_t(512);
        ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
    }
    catch (std::exception & e)
    {
@@ -136,7 +136,7 @@ vitastor_c *vitastor_c_create_uring(const char *config_path, const char *etcd_ho
    ring_loop_t *ringloop = NULL;
    try
    {
        ringloop = new ring_loop_t(512);
        ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
    }
    catch (std::exception & e)
    {
@@ -167,7 +167,7 @@ vitastor_c *vitastor_c_create_uring_json(const char **options, int options_len)
    ring_loop_t *ringloop = NULL;
    try
    {
        ringloop = new ring_loop_t(512);
        ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
    }
    catch (std::exception & e)
    {

@@ -29,7 +29,7 @@ start_osd_on()
{
    local i=$1
    local dev=$2
    build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL \
    build/src/vitastor-osd --osd_num $i --bind_address $ETCD_IP $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL \
        $(build/src/vitastor-disk simple-offsets --format options $OFFSET_ARGS $dev $OFFSET_ARGS 2>/dev/null) \
        >>./testdata/osd$i.log 2>&1 &
    eval OSD${i}_PID=$!