Compare commits: rdma-simpl ... v1.3.1
21 commits
Commit SHA1s (author and date columns were empty in this view):
a1c7cc3d8d, a5e3dfbc5a, 7972502eaf, e57b7203b8, c8a179dcda, 845454742d, d65512bd80,
53de2bbd0f, 628aa59574, 037cf64a47, 19e2d9d6fa, bfc7e61909, 7da4868b37, b5c020ce0b,
6b33ae973d, cf36445359, 3fd873d263, a00e8ae9ed, 75674545dc, 225eb2fe3d, 7e82573ed0
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
 
 project(vitastor)
 
-set(VERSION "1.2.0")
+set(VERSION "1.3.1")
 
 add_subdirectory(src)

@@ -1,14 +1,15 @@
 # Compile stage
-FROM golang:buster AS build
+FROM golang:bookworm AS build
 
 ADD go.sum go.mod /app/
 RUN cd /app; CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go mod download -x
 ADD . /app
-RUN perl -i -e '$/ = undef; while(<>) { s/\n\s*(\{\s*\n)/$1\n/g; s/\}(\s*\n\s*)else\b/$1} else/g; print; }' `find /app -name '*.go'`
-RUN cd /app; CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o vitastor-csi
+RUN perl -i -e '$/ = undef; while(<>) { s/\n\s*(\{\s*\n)/$1\n/g; s/\}(\s*\n\s*)else\b/$1} else/g; print; }' `find /app -name '*.go'` && \
+    cd /app && \
+    CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o vitastor-csi
 
 # Final stage
-FROM debian:buster
+FROM debian:bookworm
 
 LABEL maintainers="Vitaliy Filippov <vitalif@yourcmc.ru>"
 LABEL description="Vitastor CSI Driver"

@@ -18,19 +19,30 @@ ENV CSI_ENDPOINT=""
 
 RUN apt-get update && \
     apt-get install -y wget && \
-    (echo deb http://deb.debian.org/debian buster-backports main > /etc/apt/sources.list.d/backports.list) && \
     (echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
     apt-get update && \
-    apt-get install -y e2fsprogs xfsprogs kmod && \
+    apt-get install -y e2fsprogs xfsprogs kmod iproute2 \
+        # dependencies of qemu-storage-daemon
+        libnuma1 liburing2 libglib2.0-0 libfuse3-3 libaio1 libzstd1 libnettle8 \
+        libgmp10 libhogweed6 libp11-kit0 libidn2-0 libunistring2 libtasn1-6 libpcre2-8-0 libffi8 && \
    apt-get clean && \
    (echo options nbd nbds_max=128 > /etc/modprobe.d/nbd.conf)
 
 COPY --from=build /app/vitastor-csi /bin/
 
-RUN (echo deb http://vitastor.io/debian buster main > /etc/apt/sources.list.d/vitastor.list) && \
+RUN (echo deb http://vitastor.io/debian bookworm main > /etc/apt/sources.list.d/vitastor.list) && \
+    ((echo 'Package: *'; echo 'Pin: origin "vitastor.io"'; echo 'Pin-Priority: 1000') > /etc/apt/preferences.d/vitastor.pref) && \
     wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg && \
     apt-get update && \
     apt-get install -y vitastor-client && \
+    apt-get download qemu-system-common && \
+    apt-get download qemu-block-extra && \
+    dpkg -x qemu-system-common*.deb tmp1 && \
+    dpkg -x qemu-block-extra*.deb tmp1 && \
+    cp -a tmp1/usr/bin/qemu-storage-daemon /usr/bin/ && \
+    mkdir -p /usr/lib/x86_64-linux-gnu/qemu && \
+    cp -a tmp1/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so /usr/lib/x86_64-linux-gnu/qemu/ && \
+    rm -rf tmp1 *.deb && \
     apt-get clean
 
 ENTRYPOINT ["/bin/vitastor-csi"]

@@ -1,4 +1,4 @@
-VERSION ?= v1.2.0
+VERSION ?= v1.3.1
 
 all: build push
 

@@ -2,6 +2,7 @@
 apiVersion: v1
 kind: ConfigMap
 data:
+  # You can add multiple configuration files here to use a multi-cluster setup
   vitastor.conf: |-
     {"etcd_address":"http://192.168.7.2:2379","etcd_prefix":"/vitastor"}
 metadata:

@@ -49,7 +49,7 @@ spec:
   capabilities:
     add: ["SYS_ADMIN"]
   allowPrivilegeEscalation: true
-image: vitalif/vitastor-csi:v1.2.0
+image: vitalif/vitastor-csi:v1.3.1
 args:
 - "--node=$(NODE_ID)"
 - "--endpoint=$(CSI_ENDPOINT)"

@@ -82,6 +82,8 @@ spec:
   name: host-sys
 - mountPath: /run/mount
   name: host-mount
+- mountPath: /run/vitastor-csi
+  name: run-vitastor-csi
 - mountPath: /lib/modules
   name: lib-modules
   readOnly: true

@@ -132,6 +134,9 @@ spec:
 - name: host-mount
   hostPath:
     path: /run/mount
+- name: run-vitastor-csi
+  hostPath:
+    path: /run/vitastor-csi
 - name: lib-modules
   hostPath:
     path: /lib/modules

@@ -121,7 +121,7 @@ spec:
 privileged: true
 capabilities:
   add: ["SYS_ADMIN"]
-image: vitalif/vitastor-csi:v1.2.0
+image: vitalif/vitastor-csi:v1.3.1
 args:
 - "--node=$(NODE_ID)"
 - "--endpoint=$(CSI_ENDPOINT)"

@@ -12,9 +12,6 @@ parameters:
   etcdVolumePrefix: ""
   poolId: "1"
   # you can choose other configuration file if you have it in the config map
+  # different etcd URLs and prefixes should also be put in the config
   #configPath: "/etc/vitastor/vitastor.conf"
-  # you can also specify etcdUrl here, maybe to connect to another Vitastor cluster
-  # multiple etcdUrls may be specified, delimited by comma
-  #etcdUrl: "http://192.168.7.2:2379"
-  #etcdPrefix: "/vitastor"
 allowVolumeExpansion: true

@@ -5,7 +5,7 @@ package vitastor
 
 const (
     vitastorCSIDriverName = "csi.vitastor.io"
-    vitastorCSIDriverVersion = "1.2.0"
+    vitastorCSIDriverVersion = "1.3.1"
 )
 
 // Config struct fills the parameters of request or user input

@@ -62,7 +62,7 @@ func NewControllerServer(driver *Driver) *ControllerServer
     }
 }
 
-func GetConnectionParams(params map[string]string) (map[string]string, []string, string)
+func GetConnectionParams(params map[string]string) (map[string]string, error)
 {
     ctxVars := make(map[string]string)
     configPath := params["configPath"]

@@ -75,71 +75,58 @@ func GetConnectionParams(params map[string]string) (map[string]string, []string,
         ctxVars["configPath"] = configPath
     }
     config := make(map[string]interface{})
-    if configFD, err := os.Open(configPath); err == nil
-    {
-        defer configFD.Close()
-        data, _ := ioutil.ReadAll(configFD)
-        json.Unmarshal(data, &config)
-    }
-    // Try to load prefix & etcd URL from the config
+    configFD, err := os.Open(configPath)
+    if (err != nil)
+    {
+        return nil, err
+    }
+    defer configFD.Close()
+    data, _ := ioutil.ReadAll(configFD)
+    json.Unmarshal(data, &config)
+    // Check etcd URL in the config, but do not use the explicit etcdUrl
+    // parameter for CLI calls, otherwise users won't be able to later
+    // change them - storage class parameters are saved in volume IDs
     var etcdUrl []string
-    if (params["etcdUrl"] != "")
-    {
-        ctxVars["etcdUrl"] = params["etcdUrl"]
-        etcdUrl = strings.Split(params["etcdUrl"], ",")
-    }
-    if (len(etcdUrl) == 0)
-    {
-        switch config["etcd_address"].(type)
-        {
-        case string:
-            etcdUrl = strings.Split(config["etcd_address"].(string), ",")
-        case []string:
-            etcdUrl = config["etcd_address"].([]string)
-        }
-    }
-    etcdPrefix := params["etcdPrefix"]
-    if (etcdPrefix == "")
-    {
-        etcdPrefix, _ = config["etcd_prefix"].(string)
-        if (etcdPrefix == "")
-        {
-            etcdPrefix = "/vitastor"
-        }
-    }
-    else
-    {
-        ctxVars["etcdPrefix"] = etcdPrefix
-    }
-    return ctxVars, etcdUrl, etcdPrefix
+    switch config["etcd_address"].(type)
+    {
+    case string:
+        url := strings.TrimSpace(config["etcd_address"].(string))
+        if (url != "")
+        {
+            etcdUrl = strings.Split(url, ",")
+        }
+    case []string:
+        etcdUrl = config["etcd_address"].([]string)
+    }
+    if (len(etcdUrl) == 0)
+    {
+        return nil, status.Error(codes.InvalidArgument, "etcd_address is missing in "+configPath)
+    }
+    return ctxVars, nil
+}
+
+func system(program string, args ...string) ([]byte, error)
+{
+    c := exec.Command(program, args...)
+    var stdout, stderr bytes.Buffer
+    c.Stdout, c.Stderr = &stdout, &stderr
+    err := c.Run()
+    if (err != nil)
+    {
+        stdoutStr, stderrStr := string(stdout.Bytes()), string(stderr.Bytes())
+        klog.Errorf(program+" "+strings.Join(args, " ")+" failed: %s, status %s\n", stdoutStr+stderrStr, err)
+        return nil, status.Error(codes.Internal, stdoutStr+stderrStr+" (status "+err.Error()+")")
+    }
+    return stdout.Bytes(), nil
 }
 
 func invokeCLI(ctxVars map[string]string, args []string) ([]byte, error)
 {
-    if (ctxVars["etcdUrl"] != "")
-    {
-        args = append(args, "--etcd_address", ctxVars["etcdUrl"])
-    }
-    if (ctxVars["etcdPrefix"] != "")
-    {
-        args = append(args, "--etcd_prefix", ctxVars["etcdPrefix"])
-    }
     if (ctxVars["configPath"] != "")
     {
         args = append(args, "--config_path", ctxVars["configPath"])
     }
-    c := exec.Command("/usr/bin/vitastor-cli", args...)
-    var stdout, stderr bytes.Buffer
-    c.Stdout = &stdout
-    c.Stderr = &stderr
-    err := c.Run()
-    stderrStr := string(stderr.Bytes())
-    if (err != nil)
-    {
-        klog.Errorf("vitastor-cli %s failed: %s, status %s\n", strings.Join(args, " "), stderrStr, err)
-        return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
-    }
-    return stdout.Bytes(), nil
+    return system("/usr/bin/vitastor-cli", args...)
 }
 
 // Create the volume

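For reference, here is a small self-contained sketch (not part of the change set) of the string-or-list `etcd_address` handling that the new `GetConnectionParams` performs. The helper name is made up for illustration, and the sample configuration value is the one from the ConfigMap earlier in this comparison; note that `encoding/json` decodes JSON arrays into `[]interface{}` when unmarshalling into `interface{}`.

```go
package main

import (
	"encoding/json"
	"fmt"
	"strings"
)

// etcdUrlsFromConfig mirrors the config parsing rule shown above:
// "etcd_address" may be a comma-separated string or a list of strings.
func etcdUrlsFromConfig(data []byte) ([]string, error) {
	config := make(map[string]interface{})
	if err := json.Unmarshal(data, &config); err != nil {
		return nil, err
	}
	var etcdUrl []string
	switch addr := config["etcd_address"].(type) {
	case string:
		if url := strings.TrimSpace(addr); url != "" {
			etcdUrl = strings.Split(url, ",")
		}
	case []interface{}: // JSON arrays decode to []interface{}, not []string
		for _, v := range addr {
			if s, ok := v.(string); ok {
				etcdUrl = append(etcdUrl, s)
			}
		}
	}
	if len(etcdUrl) == 0 {
		return nil, fmt.Errorf("etcd_address is missing")
	}
	return etcdUrl, nil
}

func main() {
	sample := []byte(`{"etcd_address":"http://192.168.7.2:2379","etcd_prefix":"/vitastor"}`)
	urls, err := etcdUrlsFromConfig(sample)
	fmt.Println(urls, err)
}
```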
@@ -174,10 +161,10 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
         volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
     }
 
-    ctxVars, etcdUrl, _ := GetConnectionParams(req.Parameters)
-    if (len(etcdUrl) == 0)
+    ctxVars, err := GetConnectionParams(req.Parameters)
+    if (err != nil)
     {
-        return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
+        return nil, err
     }
 
     args := []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) }

@@ -207,7 +194,7 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
     }
 
     // Create image using vitastor-cli
-    _, err := invokeCLI(ctxVars, args)
+    _, err = invokeCLI(ctxVars, args)
     if (err != nil)
     {
         if (strings.Index(err.Error(), "already exists") > 0)

@@ -257,7 +244,11 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
     }
     volName := volVars["name"]
 
-    ctxVars, _, _ := GetConnectionParams(volVars)
+    ctxVars, err := GetConnectionParams(volVars)
+    if (err != nil)
+    {
+        return nil, err
+    }
 
     _, err = invokeCLI(ctxVars, []string{ "rm", volName })
     if (err != nil)

@@ -469,7 +460,11 @@ func (cs *ControllerServer) DeleteSnapshot(ctx context.Context, req *csi.DeleteS
     volName := volVars["name"]
     snapName := volVars["snapshot"]
 
-    ctxVars, _, _ := GetConnectionParams(volVars)
+    ctxVars, err := GetConnectionParams(volVars)
+    if (err != nil)
+    {
+        return nil, err
+    }
 
     _, err = invokeCLI(ctxVars, []string{ "rm", volName+"@"+snapName })
     if (err != nil)

@@ -496,7 +491,11 @@ func (cs *ControllerServer) ListSnapshots(ctx context.Context, req *csi.ListSnap
         return nil, status.Error(codes.Internal, "volume ID not in JSON format")
     }
     volName := volVars["name"]
-    ctxVars, _, _ := GetConnectionParams(volVars)
+    ctxVars, err := GetConnectionParams(volVars)
+    if (err != nil)
+    {
+        return nil, err
+    }
 
     inodeCfg, err := invokeList(ctxVars, volName+"@*", false)
     if (err != nil)

@@ -555,7 +554,11 @@ func (cs *ControllerServer) ControllerExpandVolume(ctx context.Context, req *csi
         return nil, status.Error(codes.Internal, "volume ID not in JSON format")
     }
     volName := volVars["name"]
-    ctxVars, _, _ := GetConnectionParams(volVars)
+    ctxVars, err := GetConnectionParams(volVars)
+    if (err != nil)
+    {
+        return nil, err
+    }
 
     inodeCfg, err := invokeList(ctxVars, volName, true)
     if (err != nil)

@@ -5,11 +5,14 @@ package vitastor
 
 import (
     "context"
+    "errors"
+    "encoding/json"
     "os"
     "os/exec"
-    "encoding/json"
+    "path/filepath"
+    "strconv"
     "strings"
-    "bytes"
+    "syscall"
 
     "google.golang.org/grpc/codes"
     "google.golang.org/grpc/status"

@@ -25,16 +28,91 @@ import (
 type NodeServer struct
 {
     *Driver
+    useVduse bool
+    stateDir string
     mounter mount.Interface
 }
 
+type DeviceState struct
+{
+    ConfigPath string `json:"configPath"`
+    VdpaId string `json:"vdpaId"`
+    Image string `json:"image"`
+    Blockdev string `json:"blockdev"`
+    Readonly bool `json:"readonly"`
+    PidFile string `json:"pidFile"`
+}
+
 // NewNodeServer create new instance node
 func NewNodeServer(driver *Driver) *NodeServer
 {
-    return &NodeServer{
+    stateDir := os.Getenv("STATE_DIR")
+    if (stateDir == "")
+    {
+        stateDir = "/run/vitastor-csi"
+    }
+    if (stateDir[len(stateDir)-1] != '/')
+    {
+        stateDir += "/"
+    }
+    ns := &NodeServer{
         Driver: driver,
+        useVduse: checkVduseSupport(),
+        stateDir: stateDir,
         mounter: mount.New(""),
     }
+    if (ns.useVduse)
+    {
+        ns.restoreVduseDaemons()
+    }
+    return ns
+}
+
+func checkVduseSupport() bool
+{
+    // Check VDUSE support (vdpa, vduse, virtio-vdpa kernel modules)
+    vduse := true
+    for _, mod := range []string{"vdpa", "vduse", "virtio-vdpa"}
+    {
+        _, err := os.Stat("/sys/module/"+mod)
+        if (err != nil)
+        {
+            if (!errors.Is(err, os.ErrNotExist))
+            {
+                klog.Errorf("failed to check /sys/module/%s: %v", mod, err)
+            }
+            c := exec.Command("/sbin/modprobe", mod)
+            c.Stdout = os.Stderr
+            c.Stderr = os.Stderr
+            err := c.Run()
+            if (err != nil)
+            {
+                klog.Errorf("/sbin/modprobe %s failed: %v", mod, err)
+                vduse = false
+                break
+            }
+        }
+    }
+    // Check that vdpa tool functions
+    if (vduse)
+    {
+        c := exec.Command("/sbin/vdpa", "-j", "dev")
+        c.Stderr = os.Stderr
+        err := c.Run()
+        if (err != nil)
+        {
+            klog.Errorf("/sbin/vdpa -j dev failed: %v", err)
+            vduse = false
+        }
+    }
+    if (!vduse)
+    {
+        klog.Errorf(
+            "Your host apparently has no VDUSE support. VDUSE support disabled, NBD will be used to map devices."+
+            " For VDUSE you need at least Linux 5.15 and the following kernel modules: vdpa, virtio-vdpa, vduse.",
+        )
+    }
+    return vduse
 }
 
 // NodeStageVolume mounts the volume to a staging path on the node.

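As an illustration of the per-device state introduced here, the sketch below (not part of the change set) marshals a `DeviceState` with hypothetical values to show the JSON that ends up in the state directory (default `/run/vitastor-csi`, overridable via `STATE_DIR` as seen above).

```go
package main

import (
	"encoding/json"
	"fmt"
)

// DeviceState matches the struct added in the hunk above.
type DeviceState struct {
	ConfigPath string `json:"configPath"`
	VdpaId     string `json:"vdpaId"`
	Image      string `json:"image"`
	Blockdev   string `json:"blockdev"`
	Readonly   bool   `json:"readonly"`
	PidFile    string `json:"pidFile"`
}

func main() {
	// All values below are hypothetical examples, not taken from a real cluster.
	state := DeviceState{
		ConfigPath: "/etc/vitastor/vitastor.conf",
		VdpaId:     "vitastor-vduse-123456",
		Image:      "pvc-example",
		Blockdev:   "/dev/vda",
		Readonly:   false,
		PidFile:    "/run/vitastor-csi/vitastor-vduse-123456.pid",
	}
	out, _ := json.MarshalIndent(state, "", "  ")
	fmt.Println(string(out))
}
```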
@@ -61,6 +139,303 @@ func Contains(list []string, s string) bool
     return false
 }
 
+func (ns *NodeServer) mapNbd(volName string, ctxVars map[string]string, readonly bool) (string, error)
+{
+    // Map NBD device
+    // FIXME: Check if already mapped
+    args := []string{
+        "map", "--image", volName,
+    }
+    if (ctxVars["configPath"] != "")
+    {
+        args = append(args, "--config_path", ctxVars["configPath"])
+    }
+    if (readonly)
+    {
+        args = append(args, "--readonly", "1")
+    }
+    dev, err := system("/usr/bin/vitastor-nbd", args...)
+    return strings.TrimSpace(string(dev)), err
+}
+
+func (ns *NodeServer) unmapNbd(devicePath string)
+{
+    // unmap NBD device
+    unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
+    if (unmapErr != nil)
+    {
+        klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
+    }
+}
+
+func findByPidFile(pidFile string) (*os.Process, error)
+{
+    pidBuf, err := os.ReadFile(pidFile)
+    if (err != nil)
+    {
+        return nil, err
+    }
+    pid, err := strconv.ParseInt(strings.TrimSpace(string(pidBuf)), 0, 64)
+    if (err != nil)
+    {
+        return nil, err
+    }
+    proc, err := os.FindProcess(int(pid))
+    if (err != nil)
+    {
+        return nil, err
+    }
+    return proc, nil
+}
+
+func killByPidFile(pidFile string) error
+{
+    proc, err := findByPidFile(pidFile)
+    if (err != nil)
+    {
+        return err
+    }
+    return proc.Signal(syscall.SIGTERM)
+}
+
+func startStorageDaemon(vdpaId, volName, pidFile, configPath string, readonly bool) error
+{
+    // Start qemu-storage-daemon
+    blockSpec := map[string]interface{}{
+        "node-name": "disk1",
+        "driver": "vitastor",
+        "image": volName,
+        "cache": map[string]bool{
+            "direct": true,
+            "no-flush": false,
+        },
+        "discard": "unmap",
+    }
+    if (configPath != "")
+    {
+        blockSpec["config-path"] = configPath
+    }
+    blockSpecJson, _ := json.Marshal(blockSpec)
+    writable := "true"
+    if (readonly)
+    {
+        writable = "false"
+    }
+    _, err := system(
+        "/usr/bin/qemu-storage-daemon", "--daemonize", "--pidfile", pidFile, "--blockdev", string(blockSpecJson),
+        "--export", "vduse-blk,id="+vdpaId+",node-name=disk1,name="+vdpaId+",num-queues=16,queue-size=128,writable="+writable,
+    )
+    return err
+}
+
+func (ns *NodeServer) mapVduse(volName string, ctxVars map[string]string, readonly bool) (string, string, error)
+{
+    // Generate state file
+    stateFd, err := os.CreateTemp(ns.stateDir, "vitastor-vduse-*.json")
+    if (err != nil)
+    {
+        return "", "", status.Error(codes.Internal, err.Error())
+    }
+    stateFile := stateFd.Name()
+    stateFd.Close()
+    vdpaId := filepath.Base(stateFile)
+    vdpaId = vdpaId[0:len(vdpaId)-5] // remove ".json"
+    pidFile := ns.stateDir + vdpaId + ".pid"
+    // Map VDUSE device via qemu-storage-daemon
+    err = startStorageDaemon(vdpaId, volName, pidFile, ctxVars["configPath"], readonly)
+    if (err == nil)
+    {
+        // Add device to VDPA bus
+        _, err = system("/sbin/vdpa", "-j", "dev", "add", "name", vdpaId, "mgmtdev", "vduse")
+        if (err == nil)
+        {
+            // Find block device name
+            matches, err := filepath.Glob("/sys/bus/vdpa/devices/"+vdpaId+"/virtio*/block/*")
+            if (err == nil && len(matches) == 0)
+            {
+                err = errors.New("/sys/bus/vdpa/devices/"+vdpaId+"/virtio*/block/* is not found")
+            }
+            if (err == nil)
+            {
+                blockdev := "/dev/"+filepath.Base(matches[0])
+                _, err = os.Stat(blockdev)
+                if (err == nil)
+                {
+                    // Generate state file
+                    stateJSON, _ := json.Marshal(&DeviceState{
+                        ConfigPath: ctxVars["configPath"],
+                        VdpaId: vdpaId,
+                        Image: volName,
+                        Blockdev: blockdev,
+                        Readonly: readonly,
+                        PidFile: pidFile,
+                    })
+                    err = os.WriteFile(stateFile, stateJSON, 0600)
+                    if (err == nil)
+                    {
+                        return blockdev, vdpaId, nil
+                    }
+                }
+            }
+            if (err != nil)
+            {
+                err = status.Error(codes.Internal, err.Error())
+            }
+        }
+        if (err != nil)
+        {
+            killErr := killByPidFile(pidFile)
+            if (killErr != nil)
+            {
+                klog.Errorf("Failed to kill started qemu-storage-daemon: %v", killErr)
+            }
+            os.Remove(stateFile)
+            os.Remove(pidFile)
+        }
+    }
+    return "", "", err
+}
+
+func (ns *NodeServer) unmapVduse(devicePath string)
+{
+    if (len(devicePath) < 6 || devicePath[0:6] != "/dev/v")
+    {
+        klog.Errorf("%s does not start with /dev/v", devicePath)
+        return
+    }
+    vduseDev, err := os.Readlink("/sys/block/"+devicePath[5:])
+    if (err != nil)
+    {
+        klog.Errorf("%s is not a symbolic link to VDUSE device (../devices/virtual/vduse/xxx): %v", devicePath, err)
+        return
+    }
+    vdpaId := ""
+    p := strings.Index(vduseDev, "/vduse/")
+    if (p >= 0)
+    {
+        vduseDev = vduseDev[p+7:]
+        p = strings.Index(vduseDev, "/")
+        if (p >= 0)
+        {
+            vdpaId = vduseDev[0:p]
+        }
+    }
+    if (vdpaId == "")
+    {
+        klog.Errorf("%s is not a symbolic link to VDUSE device (../devices/virtual/vduse/xxx), but is %v", devicePath, vduseDev)
+        return
+    }
+    ns.unmapVduseById(vdpaId)
+}
+
+func (ns *NodeServer) unmapVduseById(vdpaId string)
+{
+    _, err := os.Stat("/sys/bus/vdpa/devices/"+vdpaId)
+    if (err != nil)
+    {
+        klog.Errorf("failed to stat /sys/bus/vdpa/devices/"+vdpaId+": %v", err)
+    }
+    else
+    {
+        _, _ = system("/sbin/vdpa", "-j", "dev", "del", vdpaId)
+    }
+    stateFile := ns.stateDir + vdpaId + ".json"
+    os.Remove(stateFile)
+    pidFile := ns.stateDir + vdpaId + ".pid"
+    _, err = os.Stat(pidFile)
+    if (os.IsNotExist(err))
+    {
+        // ok, already killed
+    }
+    else if (err != nil)
+    {
+        klog.Errorf("Failed to stat %v: %v", pidFile, err)
+        return
+    }
+    else
+    {
+        err = killByPidFile(pidFile)
+        if (err != nil)
+        {
+            klog.Errorf("Failed to kill started qemu-storage-daemon: %v", err)
+        }
+        os.Remove(pidFile)
+    }
+}
+
+func (ns *NodeServer) restoreVduseDaemons()
+{
+    pattern := ns.stateDir+"vitastor-vduse-*.json"
+    matches, err := filepath.Glob(pattern)
+    if (err != nil)
+    {
+        klog.Errorf("failed to list %s: %v", pattern, err)
+    }
+    if (len(matches) == 0)
+    {
+        return
+    }
+    devList := make(map[string]interface{})
+    // example output: {"dev":{"test1":{"type":"block","mgmtdev":"vduse","vendor_id":0,"max_vqs":16,"max_vq_size":128}}}
+    devListJSON, err := system("/sbin/vdpa", "-j", "dev", "list")
+    if (err != nil)
+    {
+        return
+    }
+    err = json.Unmarshal(devListJSON, &devList)
+    devs, ok := devList["dev"].(map[string]interface{})
+    if (err != nil || !ok)
+    {
+        klog.Errorf("/sbin/vdpa -j dev list returned bad JSON (error %v): %v", err, string(devListJSON))
+        return
+    }
+    for _, stateFile := range matches
+    {
+        vdpaId := filepath.Base(stateFile)
+        vdpaId = vdpaId[0:len(vdpaId)-5]
+        // Check if VDPA device is still added to the bus
+        if (devs[vdpaId] != nil)
+        {
+            // Check if the storage daemon is still active
+            pidFile := ns.stateDir + vdpaId + ".pid"
+            exists := false
+            proc, err := findByPidFile(pidFile)
+            if (err == nil)
+            {
+                exists = proc.Signal(syscall.Signal(0)) == nil
+            }
+            if (!exists)
+            {
+                // Restart daemon
+                stateJSON, err := os.ReadFile(stateFile)
+                if (err != nil)
+                {
+                    klog.Warningf("error reading state file %v: %v", stateFile, err)
+                }
+                else
+                {
+                    var state DeviceState
+                    err := json.Unmarshal(stateJSON, &state)
+                    if (err != nil)
+                    {
+                        klog.Warningf("state file %v contains invalid JSON (error %v): %v", stateFile, err, string(stateJSON))
+                    }
+                    else
+                    {
+                        klog.Warningf("restarting storage daemon for volume %v (VDPA ID %v)", state.Image, vdpaId)
+                        _ = startStorageDaemon(vdpaId, state.Image, pidFile, state.ConfigPath, state.Readonly)
+                    }
+                }
+            }
+        }
+        else
+        {
+            // Unused, clean it up
+            ns.unmapVduseById(vdpaId)
+        }
+    }
+}
+
 // NodePublishVolume mounts the volume mounted to the staging path to the target path
 func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (*csi.NodePublishVolumeResponse, error)
 {

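For reference, a standalone sketch (not part of the change set) of how the `vdpa -j dev list` JSON quoted in the comment inside `restoreVduseDaemons()` can be decoded to check whether a VDPA device is still registered on the bus; the JSON literal and the device name `test1` come from that example output.

```go
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// Example output taken from the comment in restoreVduseDaemons() above.
	devListJSON := []byte(`{"dev":{"test1":{"type":"block","mgmtdev":"vduse","vendor_id":0,"max_vqs":16,"max_vq_size":128}}}`)
	devList := make(map[string]interface{})
	if err := json.Unmarshal(devListJSON, &devList); err != nil {
		panic(err)
	}
	devs, ok := devList["dev"].(map[string]interface{})
	if !ok {
		panic("unexpected JSON shape")
	}
	vdpaId := "test1"
	if devs[vdpaId] != nil {
		fmt.Println(vdpaId, "is still added to the VDPA bus")
	} else {
		fmt.Println(vdpaId, "is gone, its state file can be cleaned up")
	}
}
```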
@@ -114,38 +489,25 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
     }
     volName := ctxVars["name"]
 
-    _, etcdUrl, etcdPrefix := GetConnectionParams(ctxVars)
-    if (len(etcdUrl) == 0)
-    {
-        return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
-    }
-
-    // Map NBD device
-    // FIXME: Check if already mapped
-    args := []string{
-        "map", "--etcd_address", strings.Join(etcdUrl, ","),
-        "--etcd_prefix", etcdPrefix,
-        "--image", volName,
-    };
-    if (ctxVars["configPath"] != "")
-    {
-        args = append(args, "--config_path", ctxVars["configPath"])
-    }
-    if (req.GetReadonly())
-    {
-        args = append(args, "--readonly", "1")
-    }
-    c := exec.Command("/usr/bin/vitastor-nbd", args...)
-    var stdout, stderr bytes.Buffer
-    c.Stdout, c.Stderr = &stdout, &stderr
-    err = c.Run()
-    stdoutStr, stderrStr := string(stdout.Bytes()), string(stderr.Bytes())
+    _, err = GetConnectionParams(ctxVars)
     if (err != nil)
     {
-        klog.Errorf("vitastor-nbd map failed: %s, status %s\n", stdoutStr+stderrStr, err)
-        return nil, status.Error(codes.Internal, stdoutStr+stderrStr+" (status "+err.Error()+")")
+        return nil, err
+    }
+
+    var devicePath, vdpaId string
+    if (!ns.useVduse)
+    {
+        devicePath, err = ns.mapNbd(volName, ctxVars, req.GetReadonly())
+    }
+    else
+    {
+        devicePath, vdpaId, err = ns.mapVduse(volName, ctxVars, req.GetReadonly())
+    }
+    if (err != nil)
+    {
+        return nil, err
     }
-    devicePath := strings.TrimSpace(stdoutStr)
 
     diskMounter := &mount.SafeFormatAndMount{Interface: ns.mounter, Exec: utilexec.New()}
     if (isBlock)

@@ -227,11 +589,13 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
     return &csi.NodePublishVolumeResponse{}, nil
 
 unmap:
-    // unmap NBD device
-    unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
-    if (unmapErr != nil)
+    if (!ns.useVduse || len(devicePath) >= 8 && devicePath[0:8] == "/dev/nbd")
     {
-        klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
+        ns.unmapNbd(devicePath)
+    }
+    else
+    {
+        ns.unmapVduseById(vdpaId)
     }
     return nil, status.Error(codes.Internal, err.Error())
 }

@@ -252,7 +616,10 @@ func (ns *NodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpu
     }
     if (devicePath == "")
     {
-        return nil, status.Error(codes.NotFound, "Volume not mounted")
+        // volume not mounted
+        klog.Warningf("%s is not a mountpoint, deleting", targetPath)
+        os.Remove(targetPath)
+        return &csi.NodeUnpublishVolumeResponse{}, nil
     }
     // unmount
     err = mount.CleanupMountPoint(targetPath, ns.mounter, false)

@@ -263,10 +630,13 @@ func (ns *NodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpu
     // unmap NBD device
     if (refCount == 1)
     {
-        unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
-        if (unmapErr != nil)
+        if (!ns.useVduse)
         {
-            klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
+            ns.unmapNbd(devicePath)
+        }
+        else
+        {
+            ns.unmapVduse(devicePath)
         }
     }
     return &csi.NodeUnpublishVolumeResponse{}, nil

debian/changelog (vendored)
@@ -1,10 +1,10 @@
-vitastor (1.2.0-1) unstable; urgency=medium
+vitastor (1.3.1-1) unstable; urgency=medium
 
   * Bugfixes
 
  -- Vitaliy Filippov <vitalif@yourcmc.ru>  Fri, 03 Jun 2022 02:09:44 +0300
 
-vitastor (1.2.0-1) unstable; urgency=medium
+vitastor (0.7.0-1) unstable; urgency=medium
 
   * Implement NFS proxy
   * Add documentation

debian/vitastor.Dockerfile (vendored)
@@ -35,8 +35,8 @@ RUN set -e -x; \
     mkdir -p /root/packages/vitastor-$REL; \
     rm -rf /root/packages/vitastor-$REL/*; \
     cd /root/packages/vitastor-$REL; \
-    cp -r /root/vitastor vitastor-1.2.0; \
-    cd vitastor-1.2.0; \
+    cp -r /root/vitastor vitastor-1.3.1; \
+    cd vitastor-1.3.1; \
     ln -s /root/fio-build/fio-*/ ./fio; \
     FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
     ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \

@@ -49,8 +49,8 @@ RUN set -e -x; \
     rm -rf a b; \
     echo "dep:fio=$FIO" > debian/fio_version; \
     cd /root/packages/vitastor-$REL; \
-    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.2.0.orig.tar.xz vitastor-1.2.0; \
-    cd vitastor-1.2.0; \
+    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.3.1.orig.tar.xz vitastor-1.3.1; \
+    cd vitastor-1.3.1; \
     V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
     DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
     DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

@@ -15,6 +15,9 @@ the cluster.
 - [client_max_buffered_bytes](#client_max_buffered_bytes)
 - [client_max_buffered_ops](#client_max_buffered_ops)
 - [client_max_writeback_iodepth](#client_max_writeback_iodepth)
+- [nbd_timeout](#nbd_timeout)
+- [nbd_max_devices](#nbd_max_devices)
+- [nbd_max_part](#nbd_max_part)
 
 ## client_max_dirty_bytes
 

@@ -101,3 +104,34 @@ Multiple consecutive modified data regions are counted as 1 write here.
 - Can be changed online: yes
 
 Maximum number of parallel writes when flushing buffered data to the server.
+
+## nbd_timeout
+
+- Type: seconds
+- Default: 300
+
+Timeout for I/O operations for [NBD](../usage/nbd.en.md). If an operation
+executes for longer than this timeout, including when your cluster is just
+temporarily down for more than timeout, the NBD device will detach by itself
+(and possibly break the mounted file system).
+
+You can set timeout to 0 to never detach, but in that case you won't be
+able to remove the kernel device at all if the NBD process dies - you'll have
+to reboot the host.
+
+## nbd_max_devices
+
+- Type: integer
+- Default: 64
+
+Maximum number of NBD devices in the system. This value is passed as
+`nbds_max` parameter for the nbd kernel module when vitastor-nbd autoloads it.
+
+## nbd_max_part
+
+- Type: integer
+- Default: 3
+
+Maximum number of partitions per NBD device. This value is passed as
+`max_part` parameter for the nbd kernel module when vitastor-nbd autoloads it.
+Note that (nbds_max)*(1+max_part) usually can't exceed 256.

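Worth noting: with the defaults documented above, nbds_max*(1+max_part) = 64*(1+3) = 256, so the default values exactly fit the 256 limit mentioned in the last paragraph.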
@@ -15,6 +15,9 @@
 - [client_max_buffered_bytes](#client_max_buffered_bytes)
 - [client_max_buffered_ops](#client_max_buffered_ops)
 - [client_max_writeback_iodepth](#client_max_writeback_iodepth)
+- [nbd_timeout](#nbd_timeout)
+- [nbd_max_devices](#nbd_max_devices)
+- [nbd_max_part](#nbd_max_part)
 
 ## client_max_dirty_bytes
 

@@ -101,3 +104,34 @@
 - Можно менять на лету: да
 
 Максимальное число параллельных операций записи при сбросе буферов на сервер.
+
+## nbd_timeout
+
+- Тип: секунды
+- Значение по умолчанию: 300
+
+Таймаут для операций чтения/записи через [NBD](../usage/nbd.ru.md). Если
+операция выполняется дольше таймаута, включая временную недоступность
+кластера на время, большее таймаута, NBD-устройство отключится само собой
+(и, возможно, сломает примонтированную ФС).
+
+Вы можете установить таймаут в 0, чтобы никогда не отключать устройство по
+таймауту, но в этом случае вы вообще не сможете удалить устройство, если
+процесс NBD умрёт - вам придётся перезагружать сервер.
+
+## nbd_max_devices
+
+- Тип: целое число
+- Значение по умолчанию: 64
+
+Максимальное число NBD-устройств в системе. Данное значение передаётся
+модулю ядра nbd как параметр `nbds_max`, когда его загружает vitastor-nbd.
+
+## nbd_max_part
+
+- Тип: целое число
+- Значение по умолчанию: 3
+
+Максимальное число разделов на одном NBD-устройстве. Данное значение передаётся
+модулю ядра nbd как параметр `max_part`, когда его загружает vitastor-nbd.
+Имейте в виду, что (nbds_max)*(1+max_part) обычно не может превышать 256.

@@ -20,6 +20,7 @@ between clients, OSDs and etcd.
 - [rdma_max_msg](#rdma_max_msg)
 - [rdma_max_recv](#rdma_max_recv)
 - [rdma_max_send](#rdma_max_send)
+- [rdma_odp](#rdma_odp)
 - [peer_connect_interval](#peer_connect_interval)
 - [peer_connect_timeout](#peer_connect_timeout)
 - [osd_idle_timeout](#osd_idle_timeout)

@@ -68,11 +69,14 @@ but they are not connected to the cluster.
 - Type: string
 
 RDMA device name to use for Vitastor OSD communications (for example,
-"rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
-Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
-to work. For example, Mellanox ConnectX-3 and older adapters don't have
-Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
-root to list available RDMA devices and their features.
+"rocep5s0f0"). Now Vitastor supports all adapters, even ones without
+ODP support, like Mellanox ConnectX-3 and non-Mellanox cards.
+
+Versions up to Vitastor 1.2.0 required ODP which is only present in
+Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp).
+
+Run `ibv_devinfo -v` as root to list available RDMA devices and their
+features.
 
 Remember that you also have to configure your network switches if you use
 RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to

@@ -147,6 +151,28 @@ less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
 Doesn't affect memory usage - additional memory isn't allocated for send
 operations.
 
+## rdma_odp
+
+- Type: boolean
+- Default: false
+
+Use RDMA with On-Demand Paging. ODP is currently only available on Mellanox
+ConnectX-4 and newer adapters. ODP allows to not register memory explicitly
+for RDMA adapter to be able to use it. This, in turn, allows to skip memory
+copying during sending. One would think this should improve performance, but
+**in reality** RDMA performance with ODP is **drastically** worse. Example
+3-node cluster with 8 NVMe in each node and 2*25 GBit/s ConnectX-6 RDMA network
+without ODP pushes 3950000 read iops, but only 239000 iops with ODP...
+
+This happens because Mellanox ODP implementation seems to be based on
+message retransmissions when the adapter doesn't know about the buffer yet -
+it likely uses standard "RNR retransmissions" (RNR = receiver not ready)
+which is generally slow in RDMA/RoCE networks. Here's a presentation about
+it from ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
+
+ODP support is retained in the code just in case a good ODP implementation
+appears one day.
+
 ## peer_connect_interval
 
 - Type: seconds

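For scale, the figures quoted above (3950000 read iops without ODP versus 239000 with it) amount to roughly a 16x slowdown.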
@@ -20,6 +20,7 @@
 - [rdma_max_msg](#rdma_max_msg)
 - [rdma_max_recv](#rdma_max_recv)
 - [rdma_max_send](#rdma_max_send)
+- [rdma_odp](#rdma_odp)
 - [peer_connect_interval](#peer_connect_interval)
 - [peer_connect_timeout](#peer_connect_timeout)
 - [osd_idle_timeout](#osd_idle_timeout)

@@ -71,12 +72,15 @@ RDMA может быть нужно только если у клиентов е
 - Тип: строка
 
 Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
-Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
-Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
-адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
-потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
-суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
-параметры и возможности.
+Сейчас Vitastor поддерживает все модели адаптеров, включая те, у которых
+нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и
+картами производства не Mellanox.
+
+Версии Vitastor до 1.2.0 включительно требовали ODP, который есть только
+на Mellanox ConnectX 4 и более новых. См. также [rdma_odp](#rdma_odp).
+
+Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть
+список доступных RDMA-устройств, их параметры и возможности.
 
 Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
 правильно настроить для него коммутаторы, иначе вы можете столкнуться с

@@ -155,6 +159,29 @@ OSD в любом случае согласовывают реальное зн
 Не влияет на потребление памяти - дополнительная память на операции отправки
 не выделяется.
 
+## rdma_odp
+
+- Тип: булево (да/нет)
+- Значение по умолчанию: false
+
+Использовать RDMA с On-Demand Paging. ODP - функция, доступная пока что
+исключительно на адаптерах Mellanox ConnectX-4 и более новых. ODP позволяет
+не регистрировать память для её использования RDMA-картой. Благодаря этому
+можно не копировать данные при отправке их в сеть и, казалось бы, это должно
+улучшать производительность - но **по факту** получается так, что
+производительность только ухудшается, причём сильно. Пример - на 3-узловом
+кластере с 8 NVMe в каждом узле и сетью 2*25 Гбит/с на чтение с RDMA без ODP
+удаётся снять 3950000 iops, а с ODP - всего 239000 iops...
+
+Это происходит из-за того, что реализация ODP у Mellanox неоптимальная и
+основана на повторной передаче сообщений, когда карте не известен буфер -
+вероятно, на стандартных "RNR retransmission" (RNR = receiver not ready).
+А данные повторные передачи в RDMA/RoCE - всегда очень медленная штука.
+Презентация на эту тему с конференции ISPASS-2021: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
+
+Возможность использования ODP сохранена в коде на случай, если вдруг в один
+прекрасный день появится хорошая реализация ODP.
+
 ## peer_connect_interval
 
 - Тип: секунды

@@ -1,4 +1,4 @@
 # Client Parameters
 
-These parameters apply only to clients and affect their interaction with
-the cluster.
+These parameters apply only to Vitastor clients (QEMU, fio, NBD and so on) and
+affect their interaction with the cluster.

@@ -1,4 +1,4 @@
 # Параметры клиентского кода
 
-Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD) и
+Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD и т.п.) и
 затрагивают логику их работы с кластером.

@@ -122,3 +122,47 @@
     Maximum number of parallel writes when flushing buffered data to the server.
   info_ru: |
     Максимальное число параллельных операций записи при сбросе буферов на сервер.
+- name: nbd_timeout
+  type: sec
+  default: 300
+  online: false
+  info: |
+    Timeout for I/O operations for [NBD](../usage/nbd.en.md). If an operation
+    executes for longer than this timeout, including when your cluster is just
+    temporarily down for more than timeout, the NBD device will detach by itself
+    (and possibly break the mounted file system).
+
+    You can set timeout to 0 to never detach, but in that case you won't be
+    able to remove the kernel device at all if the NBD process dies - you'll have
+    to reboot the host.
+  info_ru: |
+    Таймаут для операций чтения/записи через [NBD](../usage/nbd.ru.md). Если
+    операция выполняется дольше таймаута, включая временную недоступность
+    кластера на время, большее таймаута, NBD-устройство отключится само собой
+    (и, возможно, сломает примонтированную ФС).
+
+    Вы можете установить таймаут в 0, чтобы никогда не отключать устройство по
+    таймауту, но в этом случае вы вообще не сможете удалить устройство, если
+    процесс NBD умрёт - вам придётся перезагружать сервер.
+- name: nbd_max_devices
+  type: int
+  default: 64
+  online: false
+  info: |
+    Maximum number of NBD devices in the system. This value is passed as
+    `nbds_max` parameter for the nbd kernel module when vitastor-nbd autoloads it.
+  info_ru: |
+    Максимальное число NBD-устройств в системе. Данное значение передаётся
+    модулю ядра nbd как параметр `nbds_max`, когда его загружает vitastor-nbd.
+- name: nbd_max_part
+  type: int
+  default: 3
+  online: false
+  info: |
+    Maximum number of partitions per NBD device. This value is passed as
+    `max_part` parameter for the nbd kernel module when vitastor-nbd autoloads it.
+    Note that (nbds_max)*(1+max_part) usually can't exceed 256.
+  info_ru: |
+    Максимальное число разделов на одном NBD-устройстве. Данное значение передаётся
+    модулю ядра nbd как параметр `max_part`, когда его загружает vitastor-nbd.
+    Имейте в виду, что (nbds_max)*(1+max_part) обычно не может превышать 256.

@@ -48,11 +48,14 @@
   type: string
   info: |
     RDMA device name to use for Vitastor OSD communications (for example,
-    "rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
-    Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
-    to work. For example, Mellanox ConnectX-3 and older adapters don't have
-    Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
-    root to list available RDMA devices and their features.
+    "rocep5s0f0"). Now Vitastor supports all adapters, even ones without
+    ODP support, like Mellanox ConnectX-3 and non-Mellanox cards.
+
+    Versions up to Vitastor 1.2.0 required ODP which is only present in
+    Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp).
+
+    Run `ibv_devinfo -v` as root to list available RDMA devices and their
+    features.
 
     Remember that you also have to configure your network switches if you use
     RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to

@@ -61,12 +64,15 @@
     PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
   info_ru: |
     Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
-    Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
-    Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
-    адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
-    потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
-    суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
-    параметры и возможности.
+    Сейчас Vitastor поддерживает все модели адаптеров, включая те, у которых
+    нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и
+    картами производства не Mellanox.
+
+    Версии Vitastor до 1.2.0 включительно требовали ODP, который есть только
+    на Mellanox ConnectX 4 и более новых. См. также [rdma_odp](#rdma_odp).
+
+    Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть
+    список доступных RDMA-устройств, их параметры и возможности.
 
     Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
     правильно настроить для него коммутаторы, иначе вы можете столкнуться с

@@ -160,6 +166,45 @@
     у принимающей стороны в процессе работы не заканчивались буферы на приём.
     Не влияет на потребление памяти - дополнительная память на операции отправки
     не выделяется.
+- name: rdma_odp
+  type: bool
+  default: false
+  online: false
+  info: |
+    Use RDMA with On-Demand Paging. ODP is currently only available on Mellanox
+    ConnectX-4 and newer adapters. ODP allows to not register memory explicitly
+    for RDMA adapter to be able to use it. This, in turn, allows to skip memory
+    copying during sending. One would think this should improve performance, but
+    **in reality** RDMA performance with ODP is **drastically** worse. Example
+    3-node cluster with 8 NVMe in each node and 2*25 GBit/s ConnectX-6 RDMA network
+    without ODP pushes 3950000 read iops, but only 239000 iops with ODP...
+
+    This happens because Mellanox ODP implementation seems to be based on
+    message retransmissions when the adapter doesn't know about the buffer yet -
+    it likely uses standard "RNR retransmissions" (RNR = receiver not ready)
+    which is generally slow in RDMA/RoCE networks. Here's a presentation about
+    it from ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
+
+    ODP support is retained in the code just in case a good ODP implementation
+    appears one day.
+  info_ru: |
+    Использовать RDMA с On-Demand Paging. ODP - функция, доступная пока что
+    исключительно на адаптерах Mellanox ConnectX-4 и более новых. ODP позволяет
+    не регистрировать память для её использования RDMA-картой. Благодаря этому
+    можно не копировать данные при отправке их в сеть и, казалось бы, это должно
+    улучшать производительность - но **по факту** получается так, что
+    производительность только ухудшается, причём сильно. Пример - на 3-узловом
+    кластере с 8 NVMe в каждом узле и сетью 2*25 Гбит/с на чтение с RDMA без ODP
+    удаётся снять 3950000 iops, а с ODP - всего 239000 iops...
+
+    Это происходит из-за того, что реализация ODP у Mellanox неоптимальная и
+    основана на повторной передаче сообщений, когда карте не известен буфер -
+    вероятно, на стандартных "RNR retransmission" (RNR = receiver not ready).
+    А данные повторные передачи в RDMA/RoCE - всегда очень медленная штука.
+    Презентация на эту тему с конференции ISPASS-2021: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
+
+    Возможность использования ODP сохранена в коде на случай, если вдруг в один
+    прекрасный день появится хорошая реализация ODP.
 - name: peer_connect_interval
   type: sec
   min: 1

@@ -19,6 +19,14 @@ for i in ./???-*.yaml; do kubectl apply -f $i; done
|
|||||||
|
|
||||||
After that you'll be able to create PersistentVolumes.
|
After that you'll be able to create PersistentVolumes.
|
||||||
|
|
||||||
|
**Important:** For best experience, use Linux kernel at least 5.15 with [VDUSE](../usage/qemu.en.md#vduse)
|
||||||
|
kernel modules enabled (vdpa, vduse, virtio-vdpa). If your distribution doesn't
|
||||||
|
have them pre-built - build them yourself ([instructions](../usage/qemu.en.md#vduse)),
|
||||||
|
I promise it's worth it :-). When VDUSE is unavailable, CSI driver uses [NBD](../usage/nbd.en.md)
|
||||||
|
to map Vitastor devices. NBD is slower and prone to timeout issues: if Vitastor
|
||||||
|
cluster becomes unresponsible for more than [nbd_timeout](../config/client.en.md#nbd_timeout),
|
||||||
|
the NBD device detaches and breaks pods using it.
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
Vitastor CSI supports:
|
Vitastor CSI supports:
|
||||||
@@ -27,5 +35,8 @@ Vitastor CSI supports:
|
|||||||
- Raw block RWX (ReadWriteMany) volumes. Example: [PVC](../../csi/deploy/example-pvc-block.yaml), [pod](../../csi/deploy/example-test-pod-block.yaml)
|
- Raw block RWX (ReadWriteMany) volumes. Example: [PVC](../../csi/deploy/example-pvc-block.yaml), [pod](../../csi/deploy/example-test-pod-block.yaml)
|
||||||
- Volume expansion
|
- Volume expansion
|
||||||
- Volume snapshots. Example: [snapshot class](../../csi/deploy/example-snapshot-class.yaml), [snapshot](../../csi/deploy/example-snapshot.yaml), [clone](../../csi/deploy/example-snapshot-clone.yaml)
|
- Volume snapshots. Example: [snapshot class](../../csi/deploy/example-snapshot-class.yaml), [snapshot](../../csi/deploy/example-snapshot.yaml), [clone](../../csi/deploy/example-snapshot-clone.yaml)
|
||||||
|
- [VDUSE](../usage/qemu.en.md#vduse) (preferred) and [NBD](../usage/nbd.en.md) device mapping methods
|
||||||
|
- Upgrades with VDUSE - new handler processes are restarted when CSI pods are restarted themselves
|
||||||
|
- Multiple clusters by using multiple configuration files in ConfigMap.
|
||||||
|
|
||||||
Remember that to use snapshots with CSI you also have to install [Snapshot Controller and CRDs](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
|
Remember that to use snapshots with CSI you also have to install [Snapshot Controller and CRDs](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
|
||||||
|
@@ -19,6 +19,14 @@ for i in ./???-*.yaml; do kubectl apply -f $i; done
|
|||||||
|
|
||||||
После этого вы сможете создавать PersistentVolume.
|
После этого вы сможете создавать PersistentVolume.
|
||||||
|
|
||||||
|
**Важно:** Лучше всего использовать ядро Linux версии не менее 5.15 с включёнными модулями
|
||||||
|
[VDUSE](../usage/qemu.ru.md#vduse) (vdpa, vduse, virtio-vdpa). Если в вашем дистрибутиве
|
||||||
|
они не собраны из коробки - соберите их сами, обещаю, что это стоит того ([инструкция](../usage/qemu.ru.md#vduse)) :-).
|
||||||
|
Когда VDUSE недоступно, CSI-плагин использует [NBD](../usage/nbd.ru.md) для подключения
|
||||||
|
дисков, а NBD медленнее и имеет проблему таймаута - если кластер остаётся недоступным
|
||||||
|
дольше, чем [nbd_timeout](../config/client.ru.md#nbd_timeout), NBD-устройство отключается
|
||||||
|
и ломает поды, использующие его.
|
||||||
|
|
||||||
## Возможности
|
## Возможности
|
||||||
|
|
||||||
CSI-плагин Vitastor поддерживает:
|
CSI-плагин Vitastor поддерживает:
|
||||||
@@ -27,5 +35,8 @@ CSI-плагин Vitastor поддерживает:
|
|||||||
- Сырые блочные RWX (ReadWriteMany) тома. Пример: [PVC](../../csi/deploy/example-pvc-block.yaml), [под](../../csi/deploy/example-test-pod-block.yaml)
|
- Сырые блочные RWX (ReadWriteMany) тома. Пример: [PVC](../../csi/deploy/example-pvc-block.yaml), [под](../../csi/deploy/example-test-pod-block.yaml)
|
||||||
- Расширение размера томов
|
- Расширение размера томов
|
||||||
- Снимки томов. Пример: [класс снимков](../../csi/deploy/example-snapshot-class.yaml), [снимок](../../csi/deploy/example-snapshot.yaml), [клон снимка](../../csi/deploy/example-snapshot-clone.yaml)
|
- Снимки томов. Пример: [класс снимков](../../csi/deploy/example-snapshot-class.yaml), [снимок](../../csi/deploy/example-snapshot.yaml), [клон снимка](../../csi/deploy/example-snapshot-clone.yaml)
|
||||||
|
- Способы подключения устройств [VDUSE](../usage/qemu.ru.md#vduse) (предпочитаемый) и [NBD](../usage/nbd.ru.md)
|
||||||
|
- Обновление при использовании VDUSE - новые процессы-обработчики устройств успешно перезапускаются вместе с самими подами CSI
|
||||||
|
- Несколько кластеров через задание нескольких файлов конфигурации в ConfigMap.
|
||||||
|
|
||||||
Не забывайте, что для использования снимков нужно сначала установить [контроллер снимков и CRD](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
|
Не забывайте, что для использования снимков нужно сначала установить [контроллер снимков и CRD](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
|
||||||
|
@@ -6,10 +6,10 @@
|
|||||||
|
|
||||||
# Proxmox VE
|
# Proxmox VE
|
||||||
|
|
||||||
To enable Vitastor support in Proxmox Virtual Environment (6.4-8.0 are supported):
|
To enable Vitastor support in Proxmox Virtual Environment (6.4-8.1 are supported):
|
||||||
|
|
||||||
- Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts:
|
- Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts:
|
||||||
bookworm for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
|
bookworm for 8.1, pve8.0 for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
|
||||||
- Install vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* or see note) packages from Vitastor repository
|
- Install vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* or see note) packages from Vitastor repository
|
||||||
- Define storage in `/etc/pve/storage.cfg` (see below)
|
- Define storage in `/etc/pve/storage.cfg` (see below)
|
||||||
- Block network access from VMs to Vitastor network (to OSDs and etcd),
|
- Block network access from VMs to Vitastor network (to OSDs and etcd),
|
||||||
|
@@ -6,10 +6,10 @@
|
|||||||
|
|
||||||
# Proxmox VE
|
# Proxmox VE
|
||||||
|
|
||||||
Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-8.0):
|
Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-8.1):
|
||||||
|
|
||||||
- Добавьте соответствующий Debian-репозиторий Vitastor в sources.list на хостах Proxmox:
|
- Добавьте соответствующий Debian-репозиторий Vitastor в sources.list на хостах Proxmox:
|
||||||
bookworm для 8.0, bullseye для 7.4, pve7.3 для 7.3, pve7.2 для 7.2, pve7.1 для 7.1, buster для 6.4
|
bookworm для 8.1, pve8.0 для 8.0, bullseye для 7.4, pve7.3 для 7.3, pve7.2 для 7.2, pve7.1 для 7.1, buster для 6.4
|
||||||
- Установите пакеты vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* или см. сноску) из репозитория Vitastor
|
- Установите пакеты vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* или см. сноску) из репозитория Vitastor
|
||||||
- Определите тип хранилища в `/etc/pve/storage.cfg` (см. ниже)
|
- Определите тип хранилища в `/etc/pve/storage.cfg` (см. ниже)
|
||||||
- Обязательно заблокируйте доступ от виртуальных машин к сети Vitastor (OSD и etcd), т.к. Vitastor (пока) не поддерживает аутентификацию
|
- Обязательно заблокируйте доступ от виртуальных машин к сети Vitastor (OSD и etcd), т.к. Vitastor (пока) не поддерживает аутентификацию
|
||||||
|
@@ -54,7 +54,8 @@
|
|||||||
виртуальные диски, их снимки и клоны.
|
виртуальные диски, их снимки и клоны.
|
||||||
- **Драйвер QEMU** — подключаемый модуль QEMU, позволяющий QEMU/KVM виртуальным машинам работать
|
- **Драйвер QEMU** — подключаемый модуль QEMU, позволяющий QEMU/KVM виртуальным машинам работать
|
||||||
с виртуальными дисками Vitastor напрямую из пространства пользователя с помощью клиентской
|
с виртуальными дисками Vitastor напрямую из пространства пользователя с помощью клиентской
|
||||||
библиотеки, без необходимости отображения дисков в виде блочных устройств.
|
библиотеки, без необходимости отображения дисков в виде блочных устройств. Тот же драйвер
|
||||||
|
позволяет подключать диски в систему через [VDUSE](../usage/qemu.ru.md#vduse).
|
||||||
- **vitastor-nbd** — утилита, позволяющая монтировать образы Vitastor в виде блочных устройств
|
- **vitastor-nbd** — утилита, позволяющая монтировать образы Vitastor в виде блочных устройств
|
||||||
с помощью NBD (Network Block Device), на самом деле скорее работающего как "BUSE"
|
с помощью NBD (Network Block Device), на самом деле скорее работающего как "BUSE"
|
||||||
(Block Device In Userspace). Модуля ядра Linux для выполнения той же задачи в Vitastor нет
|
(Block Device In Userspace). Модуля ядра Linux для выполнения той же задачи в Vitastor нет
|
||||||
|
@@ -28,7 +28,8 @@ It supports the following commands:
|
|||||||
Global options:
|
Global options:
|
||||||
|
|
||||||
```
|
```
|
||||||
--etcd_address ADDR Etcd connection address
|
--config_file FILE Path to Vitastor configuration file
|
||||||
|
--etcd_address URL Etcd connection address
|
||||||
--iodepth N Send N operations in parallel to each OSD when possible (default 32)
|
--iodepth N Send N operations in parallel to each OSD when possible (default 32)
|
||||||
--parallel_osds M Work with M osds in parallel when possible (default 4)
|
--parallel_osds M Work with M osds in parallel when possible (default 4)
|
||||||
--progress 1|0 Report progress (default 1)
|
--progress 1|0 Report progress (default 1)
|
||||||
|
@@ -27,7 +27,8 @@ vitastor-cli - интерфейс командной строки для адм
|
|||||||
Глобальные опции:
|
Глобальные опции:
|
||||||
|
|
||||||
```
|
```
|
||||||
--etcd_address ADDR Адрес соединения с etcd
|
--config_file FILE Путь к файлу конфигурации Vitastor
|
||||||
|
--etcd_address URL Адрес соединения с etcd
|
||||||
--iodepth N Отправлять параллельно N операций на каждый OSD (по умолчанию 32)
|
--iodepth N Отправлять параллельно N операций на каждый OSD (по умолчанию 32)
|
||||||
--parallel_osds M Работать параллельно с M OSD (по умолчанию 4)
|
--parallel_osds M Работать параллельно с M OSD (по умолчанию 4)
|
||||||
--progress 1|0 Печатать прогресс выполнения (по умолчанию 1)
|
--progress 1|0 Печатать прогресс выполнения (по умолчанию 1)
|
||||||
|
@@ -11,25 +11,25 @@ NBD stands for "Network Block Device", but in fact it also functions as "BUSE"
|
|||||||
NBD slighly lowers the performance due to additional overhead, but performance still
|
NBD slighly lowers the performance due to additional overhead, but performance still
|
||||||
remains decent (see an example [here](../performance/comparison1.en.md#vitastor-0-4-0-nbd)).
|
remains decent (see an example [here](../performance/comparison1.en.md#vitastor-0-4-0-nbd)).
|
||||||
|
|
||||||
Vitastor Kubernetes CSI driver is based on NBD.
|
See also [VDUSE](qemu.en.md#vduse) as a better alternative to NBD.
|
||||||
|
|
||||||
See also [VDUSE](qemu.en.md#vduse).
|
Vitastor Kubernetes CSI driver uses NBD when VDUSE is unavailable.
|
||||||
|
|
||||||
## Map image
|
## Map image
|
||||||
|
|
||||||
To create a local block device for a Vitastor image run:
|
To create a local block device for a Vitastor image run:
|
||||||
|
|
||||||
```
|
```
|
||||||
vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
|
vitastor-nbd map --image testimg
|
||||||
```
|
```
|
||||||
|
|
||||||
It will output a block device name like /dev/nbd0 which you can then use as a normal disk.
|
It will output a block device name like /dev/nbd0 which you can then use as a normal disk.
|
||||||
|
|
||||||
You can also use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.
|
You can also use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.
|
||||||
|
|
||||||
Additional options for map command:
|
vitastor-nbd supports all usual Vitastor configuration options like `--config_file <path_to_config>` plus NBD-specific:
|
||||||
|
|
||||||
* `--nbd_timeout 30` \
|
* `--nbd_timeout 300` \
|
||||||
Timeout for I/O operations in seconds after exceeding which the kernel stops
|
Timeout for I/O operations in seconds after exceeding which the kernel stops
|
||||||
the device. You can set it to 0 to disable the timeout, but beware that you
|
the device. You can set it to 0 to disable the timeout, but beware that you
|
||||||
won't be able to stop the device at all if vitastor-nbd process dies.
|
won't be able to stop the device at all if vitastor-nbd process dies.
|
||||||
@@ -44,6 +44,9 @@ Additional options for map command:
|
|||||||
* `--foreground 1` \
|
* `--foreground 1` \
|
||||||
Stay in foreground, do not daemonize.
|
Stay in foreground, do not daemonize.
|
||||||
|
|
||||||
|
Note that `nbd_timeout`, `nbd_max_devices` and `nbd_max_part` options may also be specified
|
||||||
|
in `/etc/vitastor/vitastor.conf` or in other configuration file specified with `--config_file`.
|
||||||
|
|
||||||
## Unmap image
|
## Unmap image
|
||||||
|
|
||||||
To unmap the device run:
|
To unmap the device run:
|
||||||
|
@@ -14,16 +14,16 @@ NBD на данный момент необходимо, чтобы монтир
|
|||||||
NBD немного снижает производительность из-за дополнительных копирований памяти,
|
NBD немного снижает производительность из-за дополнительных копирований памяти,
|
||||||
но она всё равно остаётся на неплохом уровне (см. для примера [тест](../performance/comparison1.ru.md#vitastor-0-4-0-nbd)).
|
но она всё равно остаётся на неплохом уровне (см. для примера [тест](../performance/comparison1.ru.md#vitastor-0-4-0-nbd)).
|
||||||
|
|
||||||
CSI-драйвер Kubernetes Vitastor основан на NBD.
|
Смотрите также [VDUSE](qemu.ru.md#vduse), как лучшую альтернативу NBD.
|
||||||
|
|
||||||
Смотрите также [VDUSE](qemu.ru.md#vduse).
|
CSI-драйвер Kubernetes Vitastor использует NBD, когда VDUSE недоступен.
|
||||||
|
|
||||||
## Подключить устройство
|
## Подключить устройство
|
||||||
|
|
||||||
Чтобы создать локальное блочное устройство для образа, выполните команду:
|
Чтобы создать локальное блочное устройство для образа, выполните команду:
|
||||||
|
|
||||||
```
|
```
|
||||||
vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
|
vitastor-nbd map --image testimg
|
||||||
```
|
```
|
||||||
|
|
||||||
Команда напечатает название блочного устройства вида /dev/nbd0, которое потом можно
|
Команда напечатает название блочного устройства вида /dev/nbd0, которое потом можно
|
||||||
@@ -32,7 +32,8 @@ vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
|
|||||||
Для обращения по номеру инода, аналогично другим командам, можно использовать опции
|
Для обращения по номеру инода, аналогично другим командам, можно использовать опции
|
||||||
`--pool <POOL> --inode <INODE> --size <SIZE>` вместо `--image testimg`.
|
`--pool <POOL> --inode <INODE> --size <SIZE>` вместо `--image testimg`.
|
||||||
|
|
||||||
Дополнительные опции для команды подключения NBD-устройства:
|
vitastor-nbd поддерживает все обычные опции Vitastor, например, `--config_file <path_to_config>`,
|
||||||
|
плюс специфичные для NBD:
|
||||||
|
|
||||||
* `--nbd_timeout 30` \
|
* `--nbd_timeout 30` \
|
||||||
Максимальное время выполнения любой операции чтения/записи в секундах, при
|
Максимальное время выполнения любой операции чтения/записи в секундах, при
|
||||||
@@ -53,6 +54,10 @@ vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
|
|||||||
* `--foreground 1` \
|
* `--foreground 1` \
|
||||||
Не уводить процесс в фоновый режим.
|
Не уводить процесс в фоновый режим.
|
||||||
|
|
||||||
|
Обратите внимание, что опции `nbd_timeout`, `nbd_max_devices` и `nbd_max_part` можно
|
||||||
|
также задавать в `/etc/vitastor/vitastor.conf` или в другом файле конфигурации,
|
||||||
|
заданном опцией `--config_file`.
|
||||||
|
|
||||||
## Отключить устройство
|
## Отключить устройство
|
||||||
|
|
||||||
Для отключения устройства выполните:
|
Для отключения устройства выполните:
|
||||||
|
@@ -23,7 +23,7 @@ balancer or any failover method you want to in that case.
|
|||||||
vitastor-nfs usage:
|
vitastor-nfs usage:
|
||||||
|
|
||||||
```
|
```
|
||||||
vitastor-nfs [--etcd_address ADDR] [OTHER OPTIONS]
|
vitastor-nfs [STANDARD OPTIONS] [OTHER OPTIONS]
|
||||||
|
|
||||||
--subdir <DIR> export images prefixed <DIR>/ (default empty - export all images)
|
--subdir <DIR> export images prefixed <DIR>/ (default empty - export all images)
|
||||||
--portmap 0 do not listen on port 111 (portmap/rpcbind, requires root)
|
--portmap 0 do not listen on port 111 (portmap/rpcbind, requires root)
|
||||||
|
@@ -22,7 +22,7 @@
|
|||||||
Использование vitastor-nfs:
|
Использование vitastor-nfs:
|
||||||
|
|
||||||
```
|
```
|
||||||
vitastor-nfs [--etcd_address ADDR] [ДРУГИЕ ОПЦИИ]
|
vitastor-nfs [СТАНДАРТНЫЕ ОПЦИИ] [ДРУГИЕ ОПЦИИ]
|
||||||
|
|
||||||
--subdir <DIR> экспортировать "поддиректорию" - образы с префиксом имени <DIR>/ (по умолчанию пусто - экспортировать все образы)
|
--subdir <DIR> экспортировать "поддиректорию" - образы с префиксом имени <DIR>/ (по умолчанию пусто - экспортировать все образы)
|
||||||
--portmap 0 отключить сервис portmap/rpcbind на порту 111 (по умолчанию включён и требует root привилегий)
|
--portmap 0 отключить сервис portmap/rpcbind на порту 111 (по умолчанию включён и требует root привилегий)
|
||||||
|
@@ -127,19 +127,46 @@ Linux kernel, starting with version 5.15, supports a new interface for attaching
|
|||||||
to the host - VDUSE (vDPA Device in Userspace). QEMU, starting with 7.2, has support for
|
to the host - VDUSE (vDPA Device in Userspace). QEMU, starting with 7.2, has support for
|
||||||
exporting QEMU block devices over this protocol using qemu-storage-daemon.
|
exporting QEMU block devices over this protocol using qemu-storage-daemon.
|
||||||
|
|
||||||
VDUSE has the same problem as other FUSE-like interfaces in Linux: if a userspace process hangs,
|
VDUSE is currently the best interface to attach Vitastor disks as kernel devices because:
|
||||||
for example, if it loses connectivity with Vitastor cluster - active processes doing I/O may
|
- It avoids data copies and thus achieves much better performance than [NBD](nbd.en.md)
|
||||||
hang in the D state (uninterruptible sleep) and you won't be able to kill them even with kill -9.
|
- It doesn't have NBD timeout problem - the device doesn't die if an operation executes for too long
|
||||||
In this case reboot will be the only way to remove VDUSE devices from system.
|
- It doesn't have hung device problem - if the userspace process dies it can be restarted (!)
|
||||||
|
and block device will continue operation
|
||||||
|
- It doesn't seem to have the device number limit
|
||||||
|
|
||||||
On the other hand, VDUSE is faster than [NBD](nbd.en.md), so you may prefer to use it if
|
Example performance comparison:
|
||||||
performance is important for you. Approximate performance numbers:
|
|
||||||
direct fio benchmark - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
|
| | direct fio | NBD | VDUSE |
|
||||||
|
|----------------------|-------------|-------------|-------------|
|
||||||
|
| linear write | 3.85 GB/s | 1.12 GB/s | 3.85 GB/s |
|
||||||
|
| 4k random write Q128 | 240000 iops | 120000 iops | 178000 iops |
|
||||||
|
| 4k random write Q1 | 9500 iops | 7620 iops | 7640 iops |
|
||||||
|
| linear read | 4.3 GB/s | 1.8 GB/s | 2.85 GB/s |
|
||||||
|
| 4k random read Q128 | 287000 iops | 140000 iops | 189000 iops |
|
||||||
|
| 4k random read Q1 | 9600 iops | 7640 iops | 7780 iops |
|
||||||
|
|
||||||
To try VDUSE you need at least Linux 5.15, built with VDUSE support
|
To try VDUSE you need at least Linux 5.15, built with VDUSE support
|
||||||
(CONFIG_VIRTIO_VDPA=m and CONFIG_VDPA_USER=m). Debian Linux kernels have these options
|
(CONFIG_VIRTIO_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
|
||||||
disabled by now, so if you want to try it on Debian, use a kernel from Ubuntu
|
|
||||||
[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) or Proxmox.
|
Debian Linux kernels have these options disabled by now, so if you want to try it on Debian,
|
||||||
|
use a kernel from Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/), Proxmox,
|
||||||
|
or build modules for Debian kernel manually:
|
||||||
|
|
||||||
|
```
|
||||||
|
mkdir build
|
||||||
|
cd build
|
||||||
|
apt-get install linux-headers-`uname -r`
|
||||||
|
apt-get build-dep linux-image-`uname -r`-unsigned
|
||||||
|
apt-get source linux-image-`uname -r`-unsigned
|
||||||
|
cd linux*/drivers/vdpa
|
||||||
|
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
|
||||||
|
cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
|
||||||
|
cd ../virtio
|
||||||
|
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
|
||||||
|
depmod -a
|
||||||
|
```
|
||||||
|
|
||||||
|
You also need `vdpa` tool from the `iproute2` package.
|
||||||
|
|
||||||
Commands to attach Vitastor image as a VDUSE device:
|
Commands to attach Vitastor image as a VDUSE device:
|
||||||
|
|
||||||
@@ -152,7 +179,7 @@ qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitas
|
|||||||
vdpa dev add name test1 mgmtdev vduse
|
vdpa dev add name test1 mgmtdev vduse
|
||||||
```
|
```
|
||||||
|
|
||||||
After running these commands /dev/vda device will appear in the system and you'll be able to
|
After running these commands, `/dev/vda` device will appear in the system and you'll be able to
|
||||||
use it as a normal disk.
|
use it as a normal disk.
|
||||||
|
|
||||||
To remove the device:
|
To remove the device:
|
||||||
|
@@ -129,19 +129,47 @@ qemu-system-x86_64 -enable-kvm -m 2048 -M accel=kvm,memory-backend=mem \
|
|||||||
к системе - VDUSE (vDPA Device in Userspace), а в QEMU, начиная с версии 7.2, есть поддержка
|
к системе - VDUSE (vDPA Device in Userspace), а в QEMU, начиная с версии 7.2, есть поддержка
|
||||||
экспорта блочных устройств QEMU по этому протоколу через qemu-storage-daemon.
|
экспорта блочных устройств QEMU по этому протоколу через qemu-storage-daemon.
|
||||||
|
|
||||||
VDUSE страдает общей проблемой FUSE-подобных интерфейсов в Linux: если пользовательский процесс
|
VDUSE - на данный момент лучший интерфейс для подключения дисков Vitastor в виде блочных
|
||||||
подвиснет, например, если будет потеряна связь с кластером Vitastor - читающие/пишущие в кластер
|
устройств на уровне ядра, ибо:
|
||||||
процессы могут "залипнуть" в состоянии D (непрерываемый сон) и их будет невозможно убить даже
|
- VDUSE не копирует данные и поэтому достигает значительно лучшей производительности, чем [NBD](nbd.ru.md)
|
||||||
через kill -9. В этом случае удалить из системы устройство можно только перезагрузившись.
|
- Также оно не имеет проблемы NBD-таймаута - устройство не умирает, если операция выполняется слишком долго
|
||||||
|
- Также оно не имеет проблемы подвисающих устройств - если процесс-обработчик умирает, его можно
|
||||||
|
перезапустить (!) и блочное устройство продолжит работать
|
||||||
|
- По-видимому, у него нет предела числа подключаемых в систему устройств
|
||||||
|
|
||||||
С другой стороны, VDUSE быстрее по сравнению с [NBD](nbd.ru.md), поэтому его может
|
Пример сравнения производительности:
|
||||||
быть предпочтительно использовать там, где производительность важнее. Порядок показателей:
|
|
||||||
прямое тестирование через fio - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
|
|
||||||
|
|
||||||
Чтобы использовать VDUSE, вам нужно ядро Linux версии хотя бы 5.15, собранное с поддержкой
|
| | Прямой fio | NBD | VDUSE |
|
||||||
VDUSE (CONFIG_VIRTIO_VDPA=m и CONFIG_VDPA_USER=m). В ядрах в Debian Linux поддержка пока
|
|--------------------------|-------------|-------------|-------------|
|
||||||
отключена - если хотите попробовать эту функцию на Debian, поставьте ядро из Ubuntu
|
| линейная запись | 3.85 GB/s | 1.12 GB/s | 3.85 GB/s |
|
||||||
[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) или из Proxmox.
|
| 4k случайная запись Q128 | 240000 iops | 120000 iops | 178000 iops |
|
||||||
|
| 4k случайная запись Q1 | 9500 iops | 7620 iops | 7640 iops |
|
||||||
|
| линейное чтение | 4.3 GB/s | 1.8 GB/s | 2.85 GB/s |
|
||||||
|
| 4k случайное чтение Q128 | 287000 iops | 140000 iops | 189000 iops |
|
||||||
|
| 4k случайное чтение Q1 | 9600 iops | 7640 iops | 7780 iops |
|
||||||
|
|
||||||
|
Чтобы попробовать VDUSE, вам нужно ядро Linux как минимум версии 5.15, собранное с поддержкой
|
||||||
|
VDUSE (CONFIG_VIRTIO_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
|
||||||
|
|
||||||
|
В ядрах в Debian Linux поддержка пока отключена по умолчанию, так что чтобы попробовать VDUSE
|
||||||
|
на Debian, поставьте ядро из Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/),
|
||||||
|
из Proxmox или соберите модули для ядра Debian вручную:
|
||||||
|
|
||||||
|
```
|
||||||
|
mkdir build
|
||||||
|
cd build
|
||||||
|
apt-get install linux-headers-`uname -r`
|
||||||
|
apt-get build-dep linux-image-`uname -r`-unsigned
|
||||||
|
apt-get source linux-image-`uname -r`-unsigned
|
||||||
|
cd linux*/drivers/vdpa
|
||||||
|
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
|
||||||
|
cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
|
||||||
|
cd ../virtio
|
||||||
|
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
|
||||||
|
depmod -a
|
||||||
|
```
|
||||||
|
|
||||||
|
Также вам понадобится консольная утилита `vdpa` из пакета `iproute2`.
|
||||||
|
|
||||||
Команды для подключения виртуального диска через VDUSE:
|
Команды для подключения виртуального диска через VDUSE:
|
||||||
|
|
||||||
@@ -154,7 +182,7 @@ qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitas
|
|||||||
vdpa dev add name test1 mgmtdev vduse
|
vdpa dev add name test1 mgmtdev vduse
|
||||||
```
|
```
|
||||||
|
|
||||||
После этого в системе появится устройство /dev/vda, которое можно будет использовать как
|
После этого в системе появится устройство `/dev/vda`, которое можно будет использовать как
|
||||||
обычный диск.
|
обычный диск.
|
||||||
|
|
||||||
Для удаления устройства из системы:
|
Для удаления устройства из системы:
|
||||||
|
@@ -1498,7 +1498,7 @@ class Mon
|
|||||||
{
|
{
|
||||||
const zero_stats = { op: { bps: 0n, iops: 0n, lat: 0n }, subop: { iops: 0n, lat: 0n }, recovery: { bps: 0n, iops: 0n } };
|
const zero_stats = { op: { bps: 0n, iops: 0n, lat: 0n }, subop: { iops: 0n, lat: 0n }, recovery: { bps: 0n, iops: 0n } };
|
||||||
const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
|
const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
|
||||||
if (!st || !st.time || !prev || prev.time >= st.time)
|
if (!st || !st.time || !prev || !prev.time || prev.time >= st.time)
|
||||||
{
|
{
|
||||||
return prev_diff || diff;
|
return prev_diff || diff;
|
||||||
}
|
}
|
||||||
|
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "vitastor-mon",
|
"name": "vitastor-mon",
|
||||||
"version": "1.2.0",
|
"version": "1.3.1",
|
||||||
"description": "Vitastor SDS monitor service",
|
"description": "Vitastor SDS monitor service",
|
||||||
"main": "mon-main.js",
|
"main": "mon-main.js",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
|
@@ -50,7 +50,7 @@ from cinder.volume import configuration
|
|||||||
from cinder.volume import driver
|
from cinder.volume import driver
|
||||||
from cinder.volume import volume_utils
|
from cinder.volume import volume_utils
|
||||||
|
|
||||||
VERSION = '1.2.0'
|
VERSION = '1.3.1'
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
190
patches/pve-qemu-8.1-vitastor.patch
Normal file
190
patches/pve-qemu-8.1-vitastor.patch
Normal file
@@ -0,0 +1,190 @@
|
|||||||
|
Index: pve-qemu-kvm-8.1.2/block/meson.build
|
||||||
|
===================================================================
|
||||||
|
--- pve-qemu-kvm-8.1.2.orig/block/meson.build
|
||||||
|
+++ pve-qemu-kvm-8.1.2/block/meson.build
|
||||||
|
@@ -123,6 +123,7 @@ foreach m : [
|
||||||
|
[libnfs, 'nfs', files('nfs.c')],
|
||||||
|
[libssh, 'ssh', files('ssh.c')],
|
||||||
|
[rbd, 'rbd', files('rbd.c')],
|
||||||
|
+ [vitastor, 'vitastor', files('vitastor.c')],
|
||||||
|
]
|
||||||
|
if m[0].found()
|
||||||
|
module_ss = ss.source_set()
|
||||||
|
Index: pve-qemu-kvm-8.1.2/meson.build
|
||||||
|
===================================================================
|
||||||
|
--- pve-qemu-kvm-8.1.2.orig/meson.build
|
||||||
|
+++ pve-qemu-kvm-8.1.2/meson.build
|
||||||
|
@@ -1303,6 +1303,26 @@ if not get_option('rbd').auto() or have_
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
+vitastor = not_found
|
||||||
|
+if not get_option('vitastor').auto() or have_block
|
||||||
|
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
||||||
|
+ required: get_option('vitastor'))
|
||||||
|
+ if libvitastor_client.found()
|
||||||
|
+ if cc.links('''
|
||||||
|
+ #include <vitastor_c.h>
|
||||||
|
+ int main(void) {
|
||||||
|
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||||
|
+ return 0;
|
||||||
|
+ }''', dependencies: libvitastor_client)
|
||||||
|
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
||||||
|
+ elif get_option('vitastor').enabled()
|
||||||
|
+ error('could not link libvitastor_client')
|
||||||
|
+ else
|
||||||
|
+ warning('could not link libvitastor_client, disabling')
|
||||||
|
+ endif
|
||||||
|
+ endif
|
||||||
|
+endif
|
||||||
|
+
|
||||||
|
glusterfs = not_found
|
||||||
|
glusterfs_ftruncate_has_stat = false
|
||||||
|
glusterfs_iocb_has_stat = false
|
||||||
|
@@ -2123,6 +2143,7 @@ if numa.found()
|
||||||
|
endif
|
||||||
|
config_host_data.set('CONFIG_OPENGL', opengl.found())
|
||||||
|
config_host_data.set('CONFIG_RBD', rbd.found())
|
||||||
|
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
||||||
|
config_host_data.set('CONFIG_RDMA', rdma.found())
|
||||||
|
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
|
||||||
|
config_host_data.set('CONFIG_SDL', sdl.found())
|
||||||
|
@@ -4298,6 +4319,7 @@ summary_info += {'fdt support': fd
|
||||||
|
summary_info += {'libcap-ng support': libcap_ng}
|
||||||
|
summary_info += {'bpf support': libbpf}
|
||||||
|
summary_info += {'rbd support': rbd}
|
||||||
|
+summary_info += {'vitastor support': vitastor}
|
||||||
|
summary_info += {'smartcard support': cacard}
|
||||||
|
summary_info += {'U2F support': u2f}
|
||||||
|
summary_info += {'libusb': libusb}
|
||||||
|
Index: pve-qemu-kvm-8.1.2/meson_options.txt
|
||||||
|
===================================================================
|
||||||
|
--- pve-qemu-kvm-8.1.2.orig/meson_options.txt
|
||||||
|
+++ pve-qemu-kvm-8.1.2/meson_options.txt
|
||||||
|
@@ -186,6 +186,8 @@ option('lzo', type : 'feature', value :
|
||||||
|
description: 'lzo compression support')
|
||||||
|
option('rbd', type : 'feature', value : 'auto',
|
||||||
|
description: 'Ceph block device driver')
|
||||||
|
+option('vitastor', type : 'feature', value : 'auto',
|
||||||
|
+ description: 'Vitastor block device driver')
|
||||||
|
option('opengl', type : 'feature', value : 'auto',
|
||||||
|
description: 'OpenGL support')
|
||||||
|
option('rdma', type : 'feature', value : 'auto',
|
||||||
|
Index: pve-qemu-kvm-8.1.2/qapi/block-core.json
|
||||||
|
===================================================================
|
||||||
|
--- pve-qemu-kvm-8.1.2.orig/qapi/block-core.json
|
||||||
|
+++ pve-qemu-kvm-8.1.2/qapi/block-core.json
|
||||||
|
@@ -3403,7 +3403,7 @@
|
||||||
|
'raw', 'rbd',
|
||||||
|
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
||||||
|
'pbs',
|
||||||
|
- 'ssh', 'throttle', 'vdi', 'vhdx',
|
||||||
|
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
|
||||||
|
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
|
||||||
|
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
|
||||||
|
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
|
||||||
|
@@ -4465,6 +4465,28 @@
|
||||||
|
'*server': ['InetSocketAddressBase'] } }
|
||||||
|
|
||||||
|
##
|
||||||
|
+# @BlockdevOptionsVitastor:
|
||||||
|
+#
|
||||||
|
+# Driver specific block device options for vitastor
|
||||||
|
+#
|
||||||
|
+# @image: Image name
|
||||||
|
+# @inode: Inode number
|
||||||
|
+# @pool: Pool ID
|
||||||
|
+# @size: Desired image size in bytes
|
||||||
|
+# @config-path: Path to Vitastor configuration
|
||||||
|
+# @etcd-host: etcd connection address(es)
|
||||||
|
+# @etcd-prefix: etcd key/value prefix
|
||||||
|
+##
|
||||||
|
+{ 'struct': 'BlockdevOptionsVitastor',
|
||||||
|
+ 'data': { '*inode': 'uint64',
|
||||||
|
+ '*pool': 'uint64',
|
||||||
|
+ '*size': 'uint64',
|
||||||
|
+ '*image': 'str',
|
||||||
|
+ '*config-path': 'str',
|
||||||
|
+ '*etcd-host': 'str',
|
||||||
|
+ '*etcd-prefix': 'str' } }
|
||||||
|
+
|
||||||
|
+##
|
||||||
|
# @ReplicationMode:
|
||||||
|
#
|
||||||
|
# An enumeration of replication modes.
|
||||||
|
@@ -4923,6 +4945,7 @@
|
||||||
|
'throttle': 'BlockdevOptionsThrottle',
|
||||||
|
'vdi': 'BlockdevOptionsGenericFormat',
|
||||||
|
'vhdx': 'BlockdevOptionsGenericFormat',
|
||||||
|
+ 'vitastor': 'BlockdevOptionsVitastor',
|
||||||
|
'virtio-blk-vfio-pci':
|
||||||
|
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
|
||||||
|
'if': 'CONFIG_BLKIO' },
|
||||||
|
@@ -5360,6 +5383,17 @@
|
||||||
|
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
||||||
|
|
||||||
|
##
|
||||||
|
+# @BlockdevCreateOptionsVitastor:
|
||||||
|
+#
|
||||||
|
+# Driver specific image creation options for Vitastor.
|
||||||
|
+#
|
||||||
|
+# @size: Size of the virtual disk in bytes
|
||||||
|
+##
|
||||||
|
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||||
|
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
||||||
|
+ 'size': 'size' } }
|
||||||
|
+
|
||||||
|
+##
|
||||||
|
# @BlockdevVmdkSubformat:
|
||||||
|
#
|
||||||
|
# Subformat options for VMDK images
|
||||||
|
@@ -5581,6 +5615,7 @@
|
||||||
|
'ssh': 'BlockdevCreateOptionsSsh',
|
||||||
|
'vdi': 'BlockdevCreateOptionsVdi',
|
||||||
|
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||||
|
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
||||||
|
'vmdk': 'BlockdevCreateOptionsVmdk',
|
||||||
|
'vpc': 'BlockdevCreateOptionsVpc'
|
||||||
|
} }
|
||||||
|
Index: pve-qemu-kvm-8.1.2/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||||
|
===================================================================
|
||||||
|
--- pve-qemu-kvm-8.1.2.orig/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||||
|
+++ pve-qemu-kvm-8.1.2/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||||
|
@@ -30,7 +30,7 @@
|
||||||
|
--with-suffix="qemu-kvm" \
|
||||||
|
--firmwarepath=/usr/share/qemu-firmware \
|
||||||
|
--target-list="x86_64-softmmu" \
|
||||||
|
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||||
|
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||||
|
--audio-drv-list="" \
|
||||||
|
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
|
||||||
|
--with-coroutine=ucontext \
|
||||||
|
@@ -176,6 +176,7 @@
|
||||||
|
--enable-opengl \
|
||||||
|
--enable-pie \
|
||||||
|
--enable-rbd \
|
||||||
|
+--enable-vitastor \
|
||||||
|
--enable-rdma \
|
||||||
|
--enable-seccomp \
|
||||||
|
--enable-snappy \
|
||||||
|
Index: pve-qemu-kvm-8.1.2/scripts/meson-buildoptions.sh
|
||||||
|
===================================================================
|
||||||
|
--- pve-qemu-kvm-8.1.2.orig/scripts/meson-buildoptions.sh
|
||||||
|
+++ pve-qemu-kvm-8.1.2/scripts/meson-buildoptions.sh
|
||||||
|
@@ -153,6 +153,7 @@ meson_options_help() {
|
||||||
|
printf "%s\n" ' qed qed image format support'
|
||||||
|
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
|
||||||
|
printf "%s\n" ' rbd Ceph block device driver'
|
||||||
|
+ printf "%s\n" ' vitastor Vitastor block device driver'
|
||||||
|
printf "%s\n" ' rdma Enable RDMA-based migration'
|
||||||
|
printf "%s\n" ' replication replication support'
|
||||||
|
printf "%s\n" ' sdl SDL user interface'
|
||||||
|
@@ -416,6 +417,8 @@ _meson_option_parse() {
|
||||||
|
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
|
||||||
|
--enable-rbd) printf "%s" -Drbd=enabled ;;
|
||||||
|
--disable-rbd) printf "%s" -Drbd=disabled ;;
|
||||||
|
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
|
||||||
|
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
|
||||||
|
--enable-rdma) printf "%s" -Drdma=enabled ;;
|
||||||
|
--disable-rdma) printf "%s" -Drdma=disabled ;;
|
||||||
|
--enable-replication) printf "%s" -Dreplication=enabled ;;
|
190
patches/qemu-8.1-vitastor.patch
Normal file
190
patches/qemu-8.1-vitastor.patch
Normal file
@@ -0,0 +1,190 @@
|
|||||||
|
diff --git a/block/meson.build b/block/meson.build
|
||||||
|
index 529fc172c6..d542dc0609 100644
|
||||||
|
--- a/block/meson.build
|
||||||
|
+++ b/block/meson.build
|
||||||
|
@@ -110,6 +110,7 @@ foreach m : [
|
||||||
|
[libnfs, 'nfs', files('nfs.c')],
|
||||||
|
[libssh, 'ssh', files('ssh.c')],
|
||||||
|
[rbd, 'rbd', files('rbd.c')],
|
||||||
|
+ [vitastor, 'vitastor', files('vitastor.c')],
|
||||||
|
]
|
||||||
|
if m[0].found()
|
||||||
|
module_ss = ss.source_set()
|
||||||
|
diff --git a/meson.build b/meson.build
|
||||||
|
index a9c4f28247..8496cf13f1 100644
|
||||||
|
--- a/meson.build
|
||||||
|
+++ b/meson.build
|
||||||
|
@@ -1303,6 +1303,26 @@ if not get_option('rbd').auto() or have_block
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
+vitastor = not_found
|
||||||
|
+if not get_option('vitastor').auto() or have_block
|
||||||
|
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
||||||
|
+ required: get_option('vitastor'))
|
||||||
|
+ if libvitastor_client.found()
|
||||||
|
+ if cc.links('''
|
||||||
|
+ #include <vitastor_c.h>
|
||||||
|
+ int main(void) {
|
||||||
|
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||||
|
+ return 0;
|
||||||
|
+ }''', dependencies: libvitastor_client)
|
||||||
|
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
||||||
|
+ elif get_option('vitastor').enabled()
|
||||||
|
+ error('could not link libvitastor_client')
|
||||||
|
+ else
|
||||||
|
+ warning('could not link libvitastor_client, disabling')
|
||||||
|
+ endif
|
||||||
|
+ endif
|
||||||
|
+endif
|
||||||
|
+
|
||||||
|
glusterfs = not_found
|
||||||
|
glusterfs_ftruncate_has_stat = false
|
||||||
|
glusterfs_iocb_has_stat = false
|
||||||
|
@@ -2119,6 +2139,7 @@ if numa.found()
|
||||||
|
endif
|
||||||
|
config_host_data.set('CONFIG_OPENGL', opengl.found())
|
||||||
|
config_host_data.set('CONFIG_RBD', rbd.found())
|
||||||
|
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
||||||
|
config_host_data.set('CONFIG_RDMA', rdma.found())
|
||||||
|
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
|
||||||
|
config_host_data.set('CONFIG_SDL', sdl.found())
|
||||||
|
@@ -4286,6 +4307,7 @@ summary_info += {'fdt support': fdt_opt == 'disabled' ? false : fdt_opt}
|
||||||
|
summary_info += {'libcap-ng support': libcap_ng}
|
||||||
|
summary_info += {'bpf support': libbpf}
|
||||||
|
summary_info += {'rbd support': rbd}
|
||||||
|
+summary_info += {'vitastor support': vitastor}
|
||||||
|
summary_info += {'smartcard support': cacard}
|
||||||
|
summary_info += {'U2F support': u2f}
|
||||||
|
summary_info += {'libusb': libusb}
|
||||||
|
diff --git a/meson_options.txt b/meson_options.txt
|
||||||
|
index ae6d8f469d..e3d9f8404d 100644
|
||||||
|
--- a/meson_options.txt
|
||||||
|
+++ b/meson_options.txt
|
||||||
|
@@ -186,6 +186,8 @@ option('lzo', type : 'feature', value : 'auto',
|
||||||
|
description: 'lzo compression support')
|
||||||
|
option('rbd', type : 'feature', value : 'auto',
|
||||||
|
description: 'Ceph block device driver')
|
||||||
|
+option('vitastor', type : 'feature', value : 'auto',
|
||||||
|
+ description: 'Vitastor block device driver')
|
||||||
|
option('opengl', type : 'feature', value : 'auto',
|
||||||
|
description: 'OpenGL support')
|
||||||
|
option('rdma', type : 'feature', value : 'auto',
|
||||||
|
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
||||||
|
index 2b1d493d6e..90673fdbdc 100644
|
||||||
|
--- a/qapi/block-core.json
|
||||||
|
+++ b/qapi/block-core.json
|
||||||
|
@@ -3146,7 +3146,7 @@
|
||||||
|
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
|
||||||
|
'raw', 'rbd',
|
||||||
|
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
||||||
|
- 'ssh', 'throttle', 'vdi', 'vhdx',
|
||||||
|
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
|
||||||
|
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
|
||||||
|
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
|
||||||
|
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
|
||||||
|
@@ -4196,6 +4196,28 @@
|
||||||
|
'*key-secret': 'str',
|
||||||
|
'*server': ['InetSocketAddressBase'] } }
|
||||||
|
|
||||||
|
+##
|
||||||
|
+# @BlockdevOptionsVitastor:
|
||||||
|
+#
|
||||||
|
+# Driver specific block device options for vitastor
|
||||||
|
+#
|
||||||
|
+# @image: Image name
|
||||||
|
+# @inode: Inode number
|
||||||
|
+# @pool: Pool ID
|
||||||
|
+# @size: Desired image size in bytes
|
||||||
|
+# @config-path: Path to Vitastor configuration
|
||||||
|
+# @etcd-host: etcd connection address(es)
|
||||||
|
+# @etcd-prefix: etcd key/value prefix
|
||||||
|
+##
|
||||||
|
+{ 'struct': 'BlockdevOptionsVitastor',
|
||||||
|
+ 'data': { '*inode': 'uint64',
|
||||||
|
+ '*pool': 'uint64',
|
||||||
|
+ '*size': 'uint64',
|
||||||
|
+ '*image': 'str',
|
||||||
|
+ '*config-path': 'str',
|
||||||
|
+ '*etcd-host': 'str',
|
||||||
|
+ '*etcd-prefix': 'str' } }
|
||||||
|
+
|
||||||
|
##
|
||||||
|
# @ReplicationMode:
|
||||||
|
#
|
||||||
|
@@ -4654,6 +4676,7 @@
|
||||||
|
'throttle': 'BlockdevOptionsThrottle',
|
||||||
|
'vdi': 'BlockdevOptionsGenericFormat',
|
||||||
|
'vhdx': 'BlockdevOptionsGenericFormat',
|
||||||
|
+ 'vitastor': 'BlockdevOptionsVitastor',
|
||||||
|
'virtio-blk-vfio-pci':
|
||||||
|
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
|
||||||
|
'if': 'CONFIG_BLKIO' },
|
||||||
|
@@ -5089,6 +5112,17 @@
|
||||||
|
'*cluster-size' : 'size',
|
||||||
|
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
||||||
|
|
||||||
|
+##
|
||||||
|
+# @BlockdevCreateOptionsVitastor:
|
||||||
|
+#
|
||||||
|
+# Driver specific image creation options for Vitastor.
|
||||||
|
+#
|
||||||
|
+# @size: Size of the virtual disk in bytes
|
||||||
|
+##
|
||||||
|
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||||
|
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
||||||
|
+ 'size': 'size' } }
|
||||||
|
+
|
||||||
|
##
|
||||||
|
# @BlockdevVmdkSubformat:
|
||||||
|
#
|
||||||
|
@@ -5311,6 +5345,7 @@
|
||||||
|
'ssh': 'BlockdevCreateOptionsSsh',
|
||||||
|
'vdi': 'BlockdevCreateOptionsVdi',
|
||||||
|
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||||
|
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
||||||
|
'vmdk': 'BlockdevCreateOptionsVmdk',
|
||||||
|
'vpc': 'BlockdevCreateOptionsVpc'
|
||||||
|
} }
|
||||||
|
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||||
|
index d02b09a4b9..f0b5fbfef3 100755
|
||||||
|
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||||
|
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||||
|
@@ -30,7 +30,7 @@
|
||||||
|
--with-suffix="qemu-kvm" \
|
||||||
|
--firmwarepath=/usr/share/qemu-firmware \
|
||||||
|
--target-list="x86_64-softmmu" \
|
||||||
|
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||||
|
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||||
|
--audio-drv-list="" \
|
||||||
|
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
|
||||||
|
--with-coroutine=ucontext \
|
||||||
|
@@ -176,6 +176,7 @@
|
||||||
|
--enable-opengl \
|
||||||
|
--enable-pie \
|
||||||
|
--enable-rbd \
|
||||||
|
+--enable-vitastor \
|
||||||
|
--enable-rdma \
|
||||||
|
--enable-seccomp \
|
||||||
|
--enable-snappy \
|
||||||
|
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
|
||||||
|
index d7020af175..94958eb6fa 100644
|
||||||
|
--- a/scripts/meson-buildoptions.sh
|
||||||
|
+++ b/scripts/meson-buildoptions.sh
|
||||||
|
@@ -153,6 +153,7 @@ meson_options_help() {
|
||||||
|
printf "%s\n" ' qed qed image format support'
|
||||||
|
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
|
||||||
|
printf "%s\n" ' rbd Ceph block device driver'
|
||||||
|
+ printf "%s\n" ' vitastor Vitastor block device driver'
|
||||||
|
printf "%s\n" ' rdma Enable RDMA-based migration'
|
||||||
|
printf "%s\n" ' replication replication support'
|
||||||
|
printf "%s\n" ' sdl SDL user interface'
|
||||||
|
@@ -416,6 +417,8 @@ _meson_option_parse() {
|
||||||
|
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
|
||||||
|
--enable-rbd) printf "%s" -Drbd=enabled ;;
|
||||||
|
--disable-rbd) printf "%s" -Drbd=disabled ;;
|
||||||
|
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
|
||||||
|
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
|
||||||
|
--enable-rdma) printf "%s" -Drdma=enabled ;;
|
||||||
|
--disable-rdma) printf "%s" -Drdma=disabled ;;
|
||||||
|
--enable-replication) printf "%s" -Dreplication=enabled ;;
|
@@ -24,4 +24,4 @@ rm fio
|
|||||||
mv fio-copy fio
|
mv fio-copy fio
|
||||||
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||||
tar --transform 's#^#vitastor-1.2.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.2.0$(rpm --eval '%dist').tar.gz *
|
tar --transform 's#^#vitastor-1.3.1/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.3.1$(rpm --eval '%dist').tar.gz *
|
||||||
|
@@ -15,6 +15,7 @@ RUN yumdownloader --disablerepo=centos-sclo-rh --source fio
|
|||||||
RUN rpm --nomd5 -i fio*.src.rpm
|
RUN rpm --nomd5 -i fio*.src.rpm
|
||||||
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
|
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
|
||||||
RUN cd ~/rpmbuild/SPECS && yum-builddep -y fio.spec
|
RUN cd ~/rpmbuild/SPECS && yum-builddep -y fio.spec
|
||||||
|
RUN yum -y install cmake3
|
||||||
|
|
||||||
ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root
|
ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root
|
||||||
|
|
||||||
@@ -35,7 +36,7 @@ ADD . /root/vitastor
|
|||||||
RUN set -e; \
|
RUN set -e; \
|
||||||
cd /root/vitastor/rpm; \
|
cd /root/vitastor/rpm; \
|
||||||
sh build-tarball.sh; \
|
sh build-tarball.sh; \
|
||||||
cp /root/vitastor-1.2.0.el7.tar.gz ~/rpmbuild/SOURCES; \
|
cp /root/vitastor-1.3.1.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||||
cd ~/rpmbuild/SPECS/; \
|
cd ~/rpmbuild/SPECS/; \
|
||||||
rpmbuild -ba vitastor.spec; \
|
rpmbuild -ba vitastor.spec; \
|
||||||
|
@@ -1,11 +1,11 @@
|
|||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 1.2.0
|
Version: 1.3.1
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-1.2.0.el7.tar.gz
|
Source0: vitastor-1.3.1.el7.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
@@ -16,7 +16,7 @@ BuildRequires: jerasure-devel
|
|||||||
BuildRequires: libisa-l-devel
|
BuildRequires: libisa-l-devel
|
||||||
BuildRequires: gf-complete-devel
|
BuildRequires: gf-complete-devel
|
||||||
BuildRequires: libibverbs-devel
|
BuildRequires: libibverbs-devel
|
||||||
BuildRequires: cmake
|
BuildRequires: cmake3
|
||||||
Requires: vitastor-osd = %{version}-%{release}
|
Requires: vitastor-osd = %{version}-%{release}
|
||||||
Requires: vitastor-mon = %{version}-%{release}
|
Requires: vitastor-mon = %{version}-%{release}
|
||||||
Requires: vitastor-client = %{version}-%{release}
|
Requires: vitastor-client = %{version}-%{release}
|
||||||
@@ -94,7 +94,7 @@ Vitastor fio drivers for benchmarking.
|
|||||||
|
|
||||||
%build
|
%build
|
||||||
. /opt/rh/devtoolset-9/enable
|
. /opt/rh/devtoolset-9/enable
|
||||||
%cmake .
|
%cmake3 .
|
||||||
%make_build
|
%make_build
|
||||||
|
|
||||||
|
|
||||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
|||||||
RUN set -e; \
|
RUN set -e; \
|
||||||
cd /root/vitastor/rpm; \
|
cd /root/vitastor/rpm; \
|
||||||
sh build-tarball.sh; \
|
sh build-tarball.sh; \
|
||||||
cp /root/vitastor-1.2.0.el8.tar.gz ~/rpmbuild/SOURCES; \
|
cp /root/vitastor-1.3.1.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||||
cd ~/rpmbuild/SPECS/; \
|
cd ~/rpmbuild/SPECS/; \
|
||||||
rpmbuild -ba vitastor.spec; \
|
rpmbuild -ba vitastor.spec; \
|
||||||
|
@@ -1,11 +1,11 @@
|
|||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 1.2.0
|
Version: 1.3.1
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-1.2.0.el8.tar.gz
|
Source0: vitastor-1.3.1.el8.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
|
@@ -18,7 +18,7 @@ ADD . /root/vitastor
|
|||||||
RUN set -e; \
|
RUN set -e; \
|
||||||
cd /root/vitastor/rpm; \
|
cd /root/vitastor/rpm; \
|
||||||
sh build-tarball.sh; \
|
sh build-tarball.sh; \
|
||||||
cp /root/vitastor-1.2.0.el9.tar.gz ~/rpmbuild/SOURCES; \
|
cp /root/vitastor-1.3.1.el9.tar.gz ~/rpmbuild/SOURCES; \
|
||||||
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||||
cd ~/rpmbuild/SPECS/; \
|
cd ~/rpmbuild/SPECS/; \
|
||||||
rpmbuild -ba vitastor.spec; \
|
rpmbuild -ba vitastor.spec; \
|
||||||
|
@@ -1,11 +1,11 @@
|
|||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 1.2.0
|
Version: 1.3.1
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-1.2.0.el9.tar.gz
|
Source0: vitastor-1.3.1.el9.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
|
@@ -16,10 +16,11 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
|||||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
add_definitions(-DVERSION="1.2.0")
|
add_definitions(-DVERSION="1.3.1")
|
||||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
|
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
|
||||||
|
add_link_options(-fno-omit-frame-pointer)
|
||||||
if (${WITH_ASAN})
|
if (${WITH_ASAN})
|
||||||
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
add_definitions(-fsanitize=address)
|
||||||
add_link_options(-fsanitize=address -fno-omit-frame-pointer)
|
add_link_options(-fsanitize=address -fno-omit-frame-pointer)
|
||||||
endif (${WITH_ASAN})
|
endif (${WITH_ASAN})
|
||||||
|
|
||||||
|
@@ -274,7 +274,7 @@ class blockstore_impl_t
blockstore_dirty_db_t dirty_db;
std::vector<blockstore_op_t*> submit_queue;
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
-int unsynced_big_write_count = 0;
+int unsynced_big_write_count = 0, unstable_unsynced = 0;
int unsynced_queued_ops = 0;
allocator *data_alloc = NULL;
uint8_t *zero_object;
@@ -144,7 +144,10 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
journal.sector_info[journal.cur_sector].written = false;
journal.sector_info[journal.cur_sector].offset = journal.next_free;
journal.in_sector_pos = 0;
-journal.next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
+auto next_next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
+// double check that next_free doesn't cross used_start from the left
+assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
+journal.next_free = next_next_free;
memset(journal.inmemory
? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
: (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector, 0, journal.block_size);
@@ -13,12 +13,6 @@
#define JOURNAL_BUFFER_SIZE 4*1024*1024
#define JOURNAL_ENTRY_HEADER_SIZE 16

-// We reserve some extra space for future stabilize requests during writes
-// FIXME: This value should be dynamic i.e. Blockstore ideally shouldn't allow
-// writing more than can be stabilized afterwards
-#define JOURNAL_STABILIZE_RESERVATION 65536
-#define JOURNAL_INSTANT_RESERVATION 131072
-
// Journal entries
// Journal entries are linked to each other by their crc32 value
// The journal is almost a blockchain, because object versions constantly increase
@@ -86,14 +86,15 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
auto & dirty_entry = dirty_db.at(sbw);
uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
-left == 0 ? JOURNAL_STABILIZE_RESERVATION : 0))
+(unstable_writes.size()+unstable_unsynced)*journal.block_size))
{
return 0;
}
}
}
else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
-sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
+sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
+(unstable_writes.size()+unstable_unsynced)*journal.block_size))
{
return 0;
}
@@ -184,6 +185,11 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
{
mark_stable(dirty_it->first);
}
+else
+{
+unstable_unsynced--;
+assert(unstable_unsynced >= 0);
+}
dirty_it++;
while (dirty_it != dirty_db.end() && dirty_it->first.oid == it->oid)
{
@@ -214,6 +220,11 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
{
mark_stable(*it);
}
+else
+{
+unstable_unsynced--;
+assert(unstable_unsynced >= 0);
+}
}
}
op->retval = 0;
@@ -320,7 +320,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, unsynced_big_write_count + 1,
sizeof(journal_entry_big_write) + dsk.clean_dyn_size,
-(dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION))
+(unstable_writes.size()+unstable_unsynced)*journal.block_size))
{
return 0;
}
@@ -386,6 +386,10 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
sqe, dsk.data_fd, PRIV(op)->iov_zerofill, vcnt, dsk.data_offset + (loc << dsk.block_order) + op->offset - stripe_offset
);
PRIV(op)->pending_ops = 1;
+if (!(dirty_it->second.state & BS_ST_INSTANT))
+{
+unstable_unsynced++;
+}
if (immediate_commit != IMMEDIATE_ALL)
{
// Increase the counter, but don't save into unsynced_writes yet (can't sync until the write is finished)
@@ -408,7 +412,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
sizeof(journal_entry_big_write) + dsk.clean_dyn_size, 0)
|| !space_check.check_available(op, 1,
sizeof(journal_entry_small_write) + dyn_size,
-op->len + ((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
+op->len + (unstable_writes.size()+unstable_unsynced)*journal.block_size))
{
return 0;
}
@@ -458,6 +462,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
exit(1);
}
}
+// double check that next_free doesn't cross used_start from the left
+assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
journal.next_free = next_next_free;
je->oid = op->oid;
je->version = op->version;
@@ -495,10 +501,15 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
}
dirty_it->second.location = journal.next_free;
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED;
-journal.next_free += op->len;
-if (journal.next_free >= journal.len)
+next_next_free = journal.next_free + op->len;
+if (next_next_free >= journal.len)
+next_next_free = dsk.journal_block_size;
+// double check that next_free doesn't cross used_start from the left
+assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
+journal.next_free = next_next_free;
+if (!(dirty_it->second.state & BS_ST_INSTANT))
{
-journal.next_free = dsk.journal_block_size;
+unstable_unsynced++;
}
if (!PRIV(op)->pending_ops)
{
@@ -538,7 +549,7 @@ resume_2:
uint64_t dyn_size = dsk.dirty_dyn_size(op->offset, op->len);
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
-((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
+(unstable_writes.size()+unstable_unsynced)*journal.block_size))
{
return 0;
}
@@ -582,14 +593,20 @@ resume_4:
#endif
bool is_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE;
bool imm = is_big ? (immediate_commit == IMMEDIATE_ALL) : (immediate_commit != IMMEDIATE_NONE);
+bool is_instant = ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT));
if (imm)
{
auto & unstab = unstable_writes[op->oid];
unstab = unstab < op->version ? op->version : unstab;
+if (!is_instant)
+{
+unstable_unsynced--;
+assert(unstable_unsynced >= 0);
+}
}
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK)
| (imm ? BS_ST_SYNCED : BS_ST_WRITTEN);
-if (imm && ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT)))
+if (imm && is_instant)
{
// Deletions and 'instant' operations are treated as immediately stable
mark_stable(dirty_it->first);
@@ -735,7 +752,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
});
assert(dirty_it != dirty_db.end());
blockstore_journal_check_t space_check(this);
-if (!space_check.check_available(op, 1, sizeof(journal_entry_del), JOURNAL_INSTANT_RESERVATION))
+if (!space_check.check_available(op, 1, sizeof(journal_entry_del), (unstable_writes.size()+unstable_unsynced)*journal.block_size))
{
return 0;
}
@@ -116,7 +116,8 @@ static const char* help_text =
"Use vitastor-cli --help <command> for command details or vitastor-cli --help --all for all details.\n"
"\n"
"GLOBAL OPTIONS:\n"
-" --etcd_address <etcd_address>\n"
+" --config_file FILE Path to Vitastor configuration file\n"
+" --etcd_address URL Etcd connection address\n"
" --iodepth N Send N operations in parallel to each OSD when possible (default 32)\n"
" --parallel_osds M Work with M osds in parallel when possible (default 4)\n"
" --progress 1|0 Report progress (default 1)\n"
@@ -331,7 +332,7 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
{
// Create client
json11::Json cfg_j = cfg;
-p->ringloop = new ring_loop_t(512);
+p->ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
p->epmgr = new epoll_manager_t(p->ringloop);
p->cli = new cluster_client_t(p->ringloop, p->epmgr->tfd, cfg_j);
// Smaller timeout by default for more interactiveness
@@ -229,7 +229,7 @@ int main(int argc, char *argv[])
{
self.options["allow_data_loss"] = "1";
}
-else if (argv[i][0] == '-' && argv[i][1] == '-')
+else if (argv[i][0] == '-' && argv[i][1] == '-' && i < argc-1)
{
char *key = argv[i]+2;
self.options[key] = argv[++i];
@@ -320,7 +320,7 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
if (journal_calc_data_pos != sw.data_offset)
{
printf(json ? ",\"bad_loc\":true,\"calc_loc\":\"0x%lx\""
-: " (mismatched, calculated = %lu)", journal_pos);
+: " (mismatched, calculated = %08lx)", journal_pos);
}
uint32_t data_csum_size = (!je_start.csum_block_size
? 0
@@ -245,7 +245,7 @@ int disk_tool_t::resize_copy_data()
{
iodepth = 32;
}
-ringloop = new ring_loop_t(iodepth < 512 ? 512 : iodepth);
+ringloop = new ring_loop_t(iodepth < RINGLOOP_DEFAULT_SIZE ? RINGLOOP_DEFAULT_SIZE : iodepth);
dsk.data_fd = open(dsk.data_device.c_str(), O_DIRECT|O_RDWR);
if (dsk.data_fd < 0)
{
@@ -130,7 +130,7 @@ static int bs_init(struct thread_data *td)
config[p.first] = p.second.dump();
}
}
-bsd->ringloop = new ring_loop_t(512);
+bsd->ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
bsd->epmgr = new epoll_manager_t(bsd->ringloop);
bsd->bs = new blockstore_t(config, bsd->ringloop, bsd->epmgr->tfd);
while (1)
@@ -22,7 +22,7 @@ void osd_messenger_t::init()
{
rdma_context = msgr_rdma_context_t::create(
rdma_device != "" ? rdma_device.c_str() : NULL,
-rdma_port_num, rdma_gid_index, rdma_mtu, log_level
+rdma_port_num, rdma_gid_index, rdma_mtu, rdma_odp, log_level
);
if (!rdma_context)
{
@@ -167,6 +167,7 @@ void osd_messenger_t::parse_config(const json11::Json & config)
this->rdma_max_msg = config["rdma_max_msg"].uint64_value();
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
this->rdma_max_msg = 129*1024;
+this->rdma_odp = config["rdma_odp"].bool_value();
#endif
this->receive_buffer_size = (uint32_t)config["tcp_header_buffer_size"].uint64_value();
if (!this->receive_buffer_size || this->receive_buffer_size > 1024*1024*1024)
@@ -490,7 +491,14 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
fprintf(stderr, "Connected to OSD %lu using RDMA\n", cl->osd_num);
}
cl->peer_state = PEER_RDMA;
-tfd->set_fd_handler(cl->peer_fd, false, NULL);
+tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
+{
+// Do not miss the disconnection!
+if (epoll_events & EPOLLRDHUP)
+{
+handle_peer_epoll(peer_fd, epoll_events);
+}
+});
// Add the initial receive request
try_recv_rdma(cl);
}
@@ -131,6 +131,7 @@ protected:
msgr_rdma_context_t *rdma_context = NULL;
uint64_t rdma_max_sge = 0, rdma_max_send = 0, rdma_max_recv = 0;
uint64_t rdma_max_msg = 0;
+bool rdma_odp = false;
#endif

std::vector<int> read_ready_clients;
@@ -197,7 +198,9 @@ protected:
void handle_reply_ready(osd_op_t *op);

#ifdef WITH_RDMA
-bool try_send_rdma(osd_client_t *cl);
+void try_send_rdma(osd_client_t *cl);
+void try_send_rdma_odp(osd_client_t *cl);
+void try_send_rdma_nodp(osd_client_t *cl);
bool try_recv_rdma(osd_client_t *cl);
void handle_rdma_events();
#endif
@@ -47,11 +47,29 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
if (qp)
ibv_destroy_qp(qp);
if (recv_buffers.size())
+{
for (auto b: recv_buffers)
-free(b);
+{
+if (b.mr)
+ibv_dereg_mr(b.mr);
+free(b.buf);
+}
+recv_buffers.clear();
+}
+if (send_out.mr)
+{
+ibv_dereg_mr(send_out.mr);
+send_out.mr = NULL;
+}
+if (send_out.buf)
+{
+free(send_out.buf);
+send_out.buf = NULL;
+}
+send_out_size = 0;
}

-msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, int log_level)
+msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level)
{
int res;
ibv_device **dev_list = NULL;
@@ -136,21 +154,27 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
fprintf(stderr, "Couldn't query RDMA device for its features\n");
goto cleanup;
}
-if (!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
+ctx->odp = odp;
+if (ctx->odp &&
+(!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT_IMPLICIT) ||
!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_SEND) ||
-!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV))
+!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV)))
{
-fprintf(stderr, "The RDMA device isn't implicit ODP (On-Demand Paging) capable or does not support RC send and receive with ODP\n");
-goto cleanup;
+ctx->odp = false;
+if (log_level > 0)
+fprintf(stderr, "The RDMA device isn't implicit ODP (On-Demand Paging) capable, disabling it\n");
}
}

-ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
-if (!ctx->mr)
+if (ctx->odp)
{
-fprintf(stderr, "Couldn't register RDMA memory region\n");
-goto cleanup;
+ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
+if (!ctx->mr)
+{
+fprintf(stderr, "Couldn't register RDMA memory region\n");
+goto cleanup;
+}
}

ctx->channel = ibv_create_comp_channel(ctx->context);
@@ -365,12 +389,34 @@ static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
cl->rdma_conn->cur_send++;
}

-bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
+static int try_send_rdma_copy(osd_client_t *cl, uint8_t *dst, int dst_len)
+{
+auto rc = cl->rdma_conn;
+int total_dst_len = dst_len;
+while (dst_len > 0 && rc->send_pos < cl->send_list.size())
+{
+iovec & iov = cl->send_list[rc->send_pos];
+uint32_t len = (uint32_t)(iov.iov_len-rc->send_buf_pos < dst_len
+? iov.iov_len-rc->send_buf_pos : dst_len);
+memcpy(dst, iov.iov_base+rc->send_buf_pos, len);
+dst += len;
+dst_len -= len;
+rc->send_buf_pos += len;
+if (rc->send_buf_pos >= iov.iov_len)
+{
+rc->send_pos++;
+rc->send_buf_pos = 0;
+}
+}
+return total_dst_len-dst_len;
+}
+
+void osd_messenger_t::try_send_rdma_odp(osd_client_t *cl)
{
auto rc = cl->rdma_conn;
if (!cl->send_list.size() || rc->cur_send >= rc->max_send)
{
-return true;
+return;
}
uint64_t op_size = 0, op_sge = 0;
ibv_sge sge[rc->max_sge];
@@ -408,15 +454,70 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
rc->send_sizes.push_back(op_size);
try_send_rdma_wr(cl, sge, op_sge);
}
-return true;
}

-static void try_recv_rdma_wr(osd_client_t *cl, void *buf)
+void osd_messenger_t::try_send_rdma_nodp(osd_client_t *cl)
+{
+auto rc = cl->rdma_conn;
+if (!rc->send_out_size)
+{
+// Allocate send ring buffer, if not yet
+rc->send_out_size = rc->max_msg*rdma_max_send;
+rc->send_out.buf = malloc_or_die(rc->send_out_size);
+if (!rdma_context->odp)
+{
+rc->send_out.mr = ibv_reg_mr(rdma_context->pd, rc->send_out.buf, rc->send_out_size, 0);
+if (!rc->send_out.mr)
+{
+fprintf(stderr, "Failed to register RDMA memory region: %s\n", strerror(errno));
+exit(1);
+}
+}
+}
+// Copy data into the buffer and send it
+uint8_t *dst = NULL;
+int dst_len = 0;
+int copied = 1;
+while (!rc->send_out_full && copied > 0 && rc->cur_send < rc->max_send)
+{
+dst = (uint8_t*)rc->send_out.buf + rc->send_out_pos;
+dst_len = (rc->send_out_pos < rc->send_out_size ? rc->send_out_size-rc->send_out_pos : rc->send_done_pos-rc->send_out_pos);
+if (dst_len > rc->max_msg)
+dst_len = rc->max_msg;
+copied = try_send_rdma_copy(cl, dst, dst_len);
+if (copied > 0)
+{
+rc->send_out_pos += copied;
+if (rc->send_out_pos == rc->send_out_size)
+rc->send_out_pos = 0;
+assert(rc->send_out_pos < rc->send_out_size);
+if (rc->send_out_pos >= rc->send_done_pos)
+rc->send_out_full = true;
+ibv_sge sge = {
+.addr = (uintptr_t)dst,
+.length = (uint32_t)copied,
+.lkey = rdma_context->odp ? rdma_context->mr->lkey : rc->send_out.mr->lkey,
+};
+try_send_rdma_wr(cl, &sge, 1);
+rc->send_sizes.push_back(copied);
+}
+}
+}
+
+void osd_messenger_t::try_send_rdma(osd_client_t *cl)
+{
+if (rdma_context->odp)
+try_send_rdma_odp(cl);
+else
+try_send_rdma_nodp(cl);
+}
+
+static void try_recv_rdma_wr(osd_client_t *cl, msgr_rdma_buf_t b)
{
ibv_sge sge = {
-.addr = (uintptr_t)buf,
+.addr = (uintptr_t)b.buf,
.length = (uint32_t)cl->rdma_conn->max_msg,
-.lkey = cl->rdma_conn->ctx->mr->lkey,
+.lkey = cl->rdma_conn->ctx->odp ? cl->rdma_conn->ctx->mr->lkey : b.mr->lkey,
};
ibv_recv_wr *bad_wr = NULL;
ibv_recv_wr wr = {
@@ -438,9 +539,19 @@ bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
auto rc = cl->rdma_conn;
while (rc->cur_recv < rc->max_recv)
{
-void *buf = malloc_or_die(rc->max_msg);
-rc->recv_buffers.push_back(buf);
-try_recv_rdma_wr(cl, buf);
+msgr_rdma_buf_t b;
+b.buf = malloc_or_die(rc->max_msg);
+if (!rdma_context->odp)
+{
+b.mr = ibv_reg_mr(rdma_context->pd, b.buf, rc->max_msg, IBV_ACCESS_LOCAL_WRITE);
+if (!b.mr)
+{
+fprintf(stderr, "Failed to register RDMA memory region: %s\n", strerror(errno));
+exit(1);
+}
+}
+rc->recv_buffers.push_back(b);
+try_recv_rdma_wr(cl, b);
}
return true;
}
@@ -492,7 +603,7 @@ void osd_messenger_t::handle_rdma_events()
if (!is_send)
{
rc->cur_recv--;
-if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf], wc[i].byte_len))
+if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
{
// handle_read_buffer may stop the client
continue;
@@ -505,6 +616,14 @@ void osd_messenger_t::handle_rdma_events()
rc->cur_send--;
uint64_t sent_size = rc->send_sizes.at(0);
rc->send_sizes.erase(rc->send_sizes.begin(), rc->send_sizes.begin()+1);
+if (!rdma_context->odp)
+{
+rc->send_done_pos += sent_size;
+rc->send_out_full = false;
+if (rc->send_done_pos == rc->send_out_size)
+rc->send_done_pos = 0;
+assert(rc->send_done_pos < rc->send_out_size);
+}
int send_pos = 0, send_buf_pos = 0;
while (sent_size > 0)
{
@@ -23,6 +23,7 @@ struct msgr_rdma_context_t
ibv_device *dev = NULL;
ibv_device_attr_ex attrx;
ibv_pd *pd = NULL;
+bool odp = false;
ibv_mr *mr = NULL;
ibv_comp_channel *channel = NULL;
ibv_cq *cq = NULL;
@@ -35,10 +36,16 @@ struct msgr_rdma_context_t
int max_cqe = 0;
int used_max_cqe = 0;

-static msgr_rdma_context_t *create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, int log_level);
+static msgr_rdma_context_t *create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level);
~msgr_rdma_context_t();
};

+struct msgr_rdma_buf_t
+{
+void *buf = NULL;
+ibv_mr *mr = NULL;
+};
+
struct msgr_rdma_connection_t
{
msgr_rdma_context_t *ctx = NULL;
@@ -50,8 +57,11 @@ struct msgr_rdma_connection_t

int send_pos = 0, send_buf_pos = 0;
int next_recv_buf = 0;
-std::vector<void*> recv_buffers;
+std::vector<msgr_rdma_buf_t> recv_buffers;
std::vector<uint64_t> send_sizes;
+msgr_rdma_buf_t send_out;
+int send_out_pos = 0, send_done_pos = 0, send_out_size = 0;
+bool send_out_full = false;

~msgr_rdma_connection_t();
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);
@@ -3,6 +3,7 @@

#define _XOPEN_SOURCE
#include <limits.h>
+#include <sys/epoll.h>

#include "messenger.h"

@@ -283,7 +284,14 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
fprintf(stderr, "Successfully connected with client %d using RDMA\n", cl->peer_fd);
}
cl->peer_state = PEER_RDMA;
-tfd->set_fd_handler(cl->peer_fd, false, NULL);
+tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
+{
+// Do not miss the disconnection!
+if (epoll_events & EPOLLRDHUP)
+{
+handle_peer_epoll(peer_fd, epoll_events);
+}
+});
// Add the initial receive request
try_recv_rdma(cl);
}
@@ -30,7 +30,7 @@ protected:
std::string image_name;
uint64_t inode = 0;
uint64_t device_size = 0;
-int nbd_timeout = 30;
+int nbd_timeout = 300;
int nbd_max_devices = 64;
int nbd_max_part = 3;
inode_watch_t *watch = NULL;
@@ -135,14 +135,16 @@ public:
" %s unmap /dev/nbd0\n"
" %s ls [--json]\n"
"OPTIONS:\n"
-" All usual Vitastor config options like --etcd_address <etcd_address> plus NBD-specific:\n"
-" --nbd_timeout 30\n"
+" All usual Vitastor config options like --config_file <path_to_config> plus NBD-specific:\n"
+" --nbd_timeout 300\n"
" Timeout for I/O operations in seconds after exceeding which the kernel stops\n"
" the device. You can set it to 0 to disable the timeout, but beware that you\n"
" won't be able to stop the device at all if vitastor-nbd process dies.\n"
" --nbd_max_devices 64 --nbd_max_part 3\n"
" Options for the \"nbd\" kernel module when modprobing it (nbds_max and max_part).\n"
" note that maximum allowed (nbds_max)*(1+max_part) is 256.\n"
+" Note that nbd_timeout, nbd_max_devices and nbd_max_part options may also be specified\n"
+" in /etc/vitastor/vitastor.conf or in other configuration file specified with --config_file.\n"
" --logfile /path/to/log/file.txt\n"
" Wite log messages to the specified file instead of dropping them (in background mode)\n"
" or printing them to the standard output (in foreground mode).\n"
@@ -204,17 +206,18 @@ public:
exit(1);
}
}
-if (cfg["nbd_max_devices"].is_number() || cfg["nbd_max_devices"].is_string())
+auto file_config = osd_messenger_t::read_config(cfg);
+if (file_config["nbd_max_devices"].is_number() || file_config["nbd_max_devices"].is_string())
{
-nbd_max_devices = cfg["nbd_max_devices"].uint64_value();
+nbd_max_devices = file_config["nbd_max_devices"].uint64_value();
}
-if (cfg["nbd_max_part"].is_number() || cfg["nbd_max_part"].is_string())
+if (file_config["nbd_max_part"].is_number() || file_config["nbd_max_part"].is_string())
{
-nbd_max_part = cfg["nbd_max_part"].uint64_value();
+nbd_max_part = file_config["nbd_max_part"].uint64_value();
}
-if (cfg["nbd_timeout"].is_number() || cfg["nbd_timeout"].is_string())
+if (file_config["nbd_timeout"].is_number() || file_config["nbd_timeout"].is_string())
{
-nbd_timeout = cfg["nbd_timeout"].uint64_value();
+nbd_timeout = file_config["nbd_timeout"].uint64_value();
}
if (cfg["client_writeback_allowed"].is_null())
{
@@ -225,7 +228,7 @@ public:
cfg = obj;
}
// Create client
-ringloop = new ring_loop_t(512);
+ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
epmgr = new epoll_manager_t(ringloop);
cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
if (!inode)
@@ -272,7 +275,7 @@ public:
int i = 0;
while (true)
{
-int r = run_nbd(sockfd, i, device_size, NBD_FLAG_SEND_FLUSH, 30, bg);
+int r = run_nbd(sockfd, i, device_size, NBD_FLAG_SEND_FLUSH, nbd_timeout, bg);
if (r == 0)
{
printf("/dev/nbd%d\n", i);
@@ -56,7 +56,7 @@ json11::Json::object nfs_proxy_t::parse_args(int narg, const char *args[])
"(c) Vitaliy Filippov, 2021-2022 (VNPL-1.1)\n"
"\n"
"USAGE:\n"
-" %s [--etcd_address ADDR] [OTHER OPTIONS]\n"
+" %s [STANDARD OPTIONS] [OTHER OPTIONS]\n"
" --subdir <DIR> export images prefixed <DIR>/ (default empty - export all images)\n"
" --portmap 0 do not listen on port 111 (portmap/rpcbind, requires root)\n"
" --bind <IP> bind service to <IP> address (default 0.0.0.0)\n"
@@ -124,7 +124,7 @@ void nfs_proxy_t::run(json11::Json cfg)
cfg = obj;
}
// Create client
-ringloop = new ring_loop_t(512);
+ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
epmgr = new epoll_manager_t(ringloop);
cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
cmd = new cli_tool_t();
@@ -541,11 +541,15 @@ void osd_t::print_slow()
}
else if (op->req.hdr.opcode == OSD_OP_SEC_STABILIZE || op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)
{
-for (uint64_t i = 0; i < op->req.sec_stab.len; i += sizeof(obj_ver_id))
+for (uint64_t i = 0; i < op->req.sec_stab.len && i < sizeof(obj_ver_id)*12; i += sizeof(obj_ver_id))
{
obj_ver_id *ov = (obj_ver_id*)((uint8_t*)op->buf + i);
bufprintf(i == 0 ? " %lx:%lx v%lu" : ", %lx:%lx v%lu", ov->oid.inode, ov->oid.stripe, ov->version);
}
+if (op->req.sec_stab.len > sizeof(obj_ver_id)*12)
+{
+bufprintf(", ... (%lu items)", op->req.sec_stab.len/sizeof(obj_ver_id));
+}
}
else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
{
@@ -58,7 +58,7 @@ int main(int narg, char *args[])
}
signal(SIGINT, handle_sigint);
signal(SIGTERM, handle_sigint);
-ring_loop_t *ringloop = new ring_loop_t(512);
+ring_loop_t *ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
osd = new osd_t(config, ringloop);
while (1)
{
@@ -196,10 +196,11 @@ static void vitastor_parse_filename(const char *filename, QDict *options, Error
!strcmp(name, "rdma-gid-index") ||
!strcmp(name, "rdma-mtu"))
{
-unsigned long long num_val;
#if QEMU_VERSION_MAJOR < 8 || QEMU_VERSION_MAJOR == 8 && QEMU_VERSION_MINOR < 1
+unsigned long long num_val;
if (parse_uint_full(value, &num_val, 0))
#else
+uint64_t num_val;
if (parse_uint_full(value, 0, &num_val))
#endif
{
@@ -17,7 +17,7 @@ ring_loop_t::ring_loop_t(int qd)
{
throw std::runtime_error(std::string("io_uring_queue_init: ") + strerror(-ret));
}
-free_ring_data_ptr = *ring.cq.kring_entries;
+free_ring_data_ptr = *ring.sq.kring_entries;
ring_datas = (struct ring_data_t*)calloc(free_ring_data_ptr, sizeof(ring_data_t));
free_ring_data = (int*)malloc(sizeof(int) * free_ring_data_ptr);
if (!ring_datas || !free_ring_data)
@@ -15,6 +15,8 @@
#include <functional>
#include <vector>

+#define RINGLOOP_DEFAULT_SIZE 1024
+
static inline void my_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, const void *addr, unsigned len, off_t offset)
{
// Prepare a read/write operation without clearing user_data
@@ -139,11 +141,9 @@ public:
if (free_ring_data_ptr == 0)
return NULL;
struct io_uring_sqe* sqe = io_uring_get_sqe(&ring);
-if (sqe)
-{
-*sqe = { 0 };
-io_uring_sqe_set_data(sqe, ring_datas + free_ring_data[--free_ring_data_ptr]);
-}
+assert(sqe);
+*sqe = { 0 };
+io_uring_sqe_set_data(sqe, ring_datas + free_ring_data[--free_ring_data_ptr]);
return sqe;
}
inline void set_immediate(const std::function<void()> cb)
@@ -30,7 +30,7 @@ void stub_exec_op(osd_messenger_t *msgr, osd_op_t *op);
int main(int narg, char *args[])
{
ring_consumer_t looper;
-ring_loop_t *ringloop = new ring_loop_t(512);
+ring_loop_t *ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
epoll_manager_t *epmgr = new epoll_manager_t(ringloop);
osd_messenger_t *msgr = new osd_messenger_t();
msgr->osd_num = 1351;
@@ -11,7 +11,7 @@ int main(int narg, char *args[])
config["meta_device"] = "./test_meta.bin";
config["journal_device"] = "./test_journal.bin";
config["data_device"] = "./test_data.bin";
-ring_loop_t *ringloop = new ring_loop_t(512);
+ring_loop_t *ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
epoll_manager_t *epmgr = new epoll_manager_t(ringloop);
blockstore_t *bs = new blockstore_t(config, ringloop, epmgr->tfd);

@@ -68,7 +68,7 @@ int main(int narg, char *args[])
| cfg["inode_id"].uint64_value();
uint64_t base_ver = 0;
// Create client
-auto ringloop = new ring_loop_t(512);
+auto ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
auto epmgr = new epoll_manager_t(ringloop);
auto cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
cli->on_ready([&]()
@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@

Name: Vitastor
Description: Vitastor client library
-Version: 1.2.0
+Version: 1.3.1
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}

@@ -114,7 +114,7 @@ vitastor_c *vitastor_c_create_qemu_uring(QEMUSetFDHandler *aio_set_fd_handler, v
ring_loop_t *ringloop = NULL;
try
{
-ringloop = new ring_loop_t(512);
+ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
}
catch (std::exception & e)
{
@@ -136,7 +136,7 @@ vitastor_c *vitastor_c_create_uring(const char *config_path, const char *etcd_ho
ring_loop_t *ringloop = NULL;
try
{
-ringloop = new ring_loop_t(512);
+ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
}
catch (std::exception & e)
{
@@ -167,7 +167,7 @@ vitastor_c *vitastor_c_create_uring_json(const char **options, int options_len)
ring_loop_t *ringloop = NULL;
try
{
-ringloop = new ring_loop_t(512);
+ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
}
catch (std::exception & e)
{