Compare commits

..

1 Commits

Author SHA1 Message Date
64bbf121b6 Experiment: zero-copy TCP send 2023-11-04 01:34:18 +03:00
93 changed files with 559 additions and 1928 deletions

View File

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
project(vitastor)
set(VERSION "1.3.1")
set(VERSION "1.1.0")
add_subdirectory(src)

View File

@@ -1,15 +1,14 @@
# Compile stage
FROM golang:bookworm AS build
FROM golang:buster AS build
ADD go.sum go.mod /app/
RUN cd /app; CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go mod download -x
ADD . /app
RUN perl -i -e '$/ = undef; while(<>) { s/\n\s*(\{\s*\n)/$1\n/g; s/\}(\s*\n\s*)else\b/$1} else/g; print; }' `find /app -name '*.go'` && \
cd /app && \
CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o vitastor-csi
RUN perl -i -e '$/ = undef; while(<>) { s/\n\s*(\{\s*\n)/$1\n/g; s/\}(\s*\n\s*)else\b/$1} else/g; print; }' `find /app -name '*.go'`
RUN cd /app; CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o vitastor-csi
# Final stage
FROM debian:bookworm
FROM debian:buster
LABEL maintainers="Vitaliy Filippov <vitalif@yourcmc.ru>"
LABEL description="Vitastor CSI Driver"
@@ -19,30 +18,19 @@ ENV CSI_ENDPOINT=""
RUN apt-get update && \
apt-get install -y wget && \
(echo deb http://deb.debian.org/debian buster-backports main > /etc/apt/sources.list.d/backports.list) && \
(echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
apt-get update && \
apt-get install -y e2fsprogs xfsprogs kmod iproute2 \
# dependencies of qemu-storage-daemon
libnuma1 liburing2 libglib2.0-0 libfuse3-3 libaio1 libzstd1 libnettle8 \
libgmp10 libhogweed6 libp11-kit0 libidn2-0 libunistring2 libtasn1-6 libpcre2-8-0 libffi8 && \
apt-get install -y e2fsprogs xfsprogs kmod && \
apt-get clean && \
(echo options nbd nbds_max=128 > /etc/modprobe.d/nbd.conf)
COPY --from=build /app/vitastor-csi /bin/
RUN (echo deb http://vitastor.io/debian bookworm main > /etc/apt/sources.list.d/vitastor.list) && \
((echo 'Package: *'; echo 'Pin: origin "vitastor.io"'; echo 'Pin-Priority: 1000') > /etc/apt/preferences.d/vitastor.pref) && \
RUN (echo deb http://vitastor.io/debian buster main > /etc/apt/sources.list.d/vitastor.list) && \
wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg && \
apt-get update && \
apt-get install -y vitastor-client && \
wget https://vitastor.io/archive/qemu/qemu-bookworm-8.1.2%2Bds-1%2Bvitastor1/qemu-utils_8.1.2%2Bds-1%2Bvitastor1_amd64.deb && \
wget https://vitastor.io/archive/qemu/qemu-bookworm-8.1.2%2Bds-1%2Bvitastor1/qemu-block-extra_8.1.2%2Bds-1%2Bvitastor1_amd64.deb && \
dpkg -x qemu-utils*.deb tmp1 && \
dpkg -x qemu-block-extra*.deb tmp1 && \
cp -a tmp1/usr/bin/qemu-storage-daemon /usr/bin/ && \
mkdir -p /usr/lib/x86_64-linux-gnu/qemu && \
cp -a tmp1/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so /usr/lib/x86_64-linux-gnu/qemu/ && \
rm -rf tmp1 *.deb && \
apt-get clean
ENTRYPOINT ["/bin/vitastor-csi"]

View File

@@ -1,4 +1,4 @@
VERSION ?= v1.3.1
VERSION ?= v1.1.0
all: build push

View File

@@ -2,7 +2,6 @@
apiVersion: v1
kind: ConfigMap
data:
# You can add multiple configuration files here to use a multi-cluster setup
vitastor.conf: |-
{"etcd_address":"http://192.168.7.2:2379","etcd_prefix":"/vitastor"}
metadata:

View File

@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v1.3.1
image: vitalif/vitastor-csi:v1.1.0
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"
@@ -82,8 +82,6 @@ spec:
name: host-sys
- mountPath: /run/mount
name: host-mount
- mountPath: /run/vitastor-csi
name: run-vitastor-csi
- mountPath: /lib/modules
name: lib-modules
readOnly: true
@@ -134,9 +132,6 @@ spec:
- name: host-mount
hostPath:
path: /run/mount
- name: run-vitastor-csi
hostPath:
path: /run/vitastor-csi
- name: lib-modules
hostPath:
path: /lib/modules

View File

@@ -121,7 +121,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v1.3.1
image: vitalif/vitastor-csi:v1.1.0
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -12,6 +12,9 @@ parameters:
etcdVolumePrefix: ""
poolId: "1"
# you can choose other configuration file if you have it in the config map
# different etcd URLs and prefixes should also be put in the config
#configPath: "/etc/vitastor/vitastor.conf"
# you can also specify etcdUrl here, maybe to connect to another Vitastor cluster
# multiple etcdUrls may be specified, delimited by comma
#etcdUrl: "http://192.168.7.2:2379"
#etcdPrefix: "/vitastor"
allowVolumeExpansion: true

View File

@@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "1.3.1"
vitastorCSIDriverVersion = "1.1.0"
)
// Config struct fills the parameters of request or user input

View File

@@ -62,7 +62,7 @@ func NewControllerServer(driver *Driver) *ControllerServer
}
}
func GetConnectionParams(params map[string]string) (map[string]string, error)
func GetConnectionParams(params map[string]string) (map[string]string, []string, string)
{
ctxVars := make(map[string]string)
configPath := params["configPath"]
@@ -75,69 +75,71 @@ func GetConnectionParams(params map[string]string) (map[string]string, error)
ctxVars["configPath"] = configPath
}
config := make(map[string]interface{})
configFD, err := os.Open(configPath)
if (err != nil)
if configFD, err := os.Open(configPath); err == nil
{
return nil, err
defer configFD.Close()
data, _ := ioutil.ReadAll(configFD)
json.Unmarshal(data, &config)
}
defer configFD.Close()
data, _ := ioutil.ReadAll(configFD)
json.Unmarshal(data, &config)
// Check etcd URL in the config, but do not use the explicit etcdUrl
// parameter for CLI calls, otherwise users won't be able to later
// change them - storage class parameters are saved in volume IDs
// Try to load prefix & etcd URL from the config
var etcdUrl []string
switch config["etcd_address"].(type)
if (params["etcdUrl"] != "")
{
case string:
url := strings.TrimSpace(config["etcd_address"].(string))
if (url != "")
{
etcdUrl = strings.Split(url, ",")
}
case []string:
etcdUrl = config["etcd_address"].([]string)
case []interface{}:
for _, url := range config["etcd_address"].([]interface{})
{
s, ok := url.(string)
if (ok)
{
etcdUrl = append(etcdUrl, s)
}
}
ctxVars["etcdUrl"] = params["etcdUrl"]
etcdUrl = strings.Split(params["etcdUrl"], ",")
}
if (len(etcdUrl) == 0)
{
return nil, status.Error(codes.InvalidArgument, "etcd_address is missing in "+configPath)
switch config["etcd_address"].(type)
{
case string:
etcdUrl = strings.Split(config["etcd_address"].(string), ",")
case []string:
etcdUrl = config["etcd_address"].([]string)
}
}
return ctxVars, nil
}
func system(program string, args ...string) ([]byte, []byte, error)
{
klog.Infof("Running "+program+" "+strings.Join(args, " "))
c := exec.Command(program, args...)
var stdout, stderr bytes.Buffer
c.Stdout, c.Stderr = &stdout, &stderr
err := c.Run()
if (err != nil)
etcdPrefix := params["etcdPrefix"]
if (etcdPrefix == "")
{
stdoutStr, stderrStr := string(stdout.Bytes()), string(stderr.Bytes())
klog.Errorf(program+" "+strings.Join(args, " ")+" failed: %s, status %s\n", stdoutStr+stderrStr, err)
return nil, nil, status.Error(codes.Internal, stdoutStr+stderrStr+" (status "+err.Error()+")")
etcdPrefix, _ = config["etcd_prefix"].(string)
if (etcdPrefix == "")
{
etcdPrefix = "/vitastor"
}
}
return stdout.Bytes(), stderr.Bytes(), nil
else
{
ctxVars["etcdPrefix"] = etcdPrefix
}
return ctxVars, etcdUrl, etcdPrefix
}
func invokeCLI(ctxVars map[string]string, args []string) ([]byte, error)
{
if (ctxVars["etcdUrl"] != "")
{
args = append(args, "--etcd_address", ctxVars["etcdUrl"])
}
if (ctxVars["etcdPrefix"] != "")
{
args = append(args, "--etcd_prefix", ctxVars["etcdPrefix"])
}
if (ctxVars["configPath"] != "")
{
args = append(args, "--config_path", ctxVars["configPath"])
}
stdout, _, err := system("/usr/bin/vitastor-cli", args...)
return stdout, err
c := exec.Command("/usr/bin/vitastor-cli", args...)
var stdout, stderr bytes.Buffer
c.Stdout = &stdout
c.Stderr = &stderr
err := c.Run()
stderrStr := string(stderr.Bytes())
if (err != nil)
{
klog.Errorf("vitastor-cli %s failed: %s, status %s\n", strings.Join(args, " "), stderrStr, err)
return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
}
return stdout.Bytes(), nil
}
// Create the volume
@@ -172,10 +174,10 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
}
ctxVars, err := GetConnectionParams(req.Parameters)
if (err != nil)
ctxVars, etcdUrl, _ := GetConnectionParams(req.Parameters)
if (len(etcdUrl) == 0)
{
return nil, err
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
}
args := []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) }
@@ -205,7 +207,7 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
}
// Create image using vitastor-cli
_, err = invokeCLI(ctxVars, args)
_, err := invokeCLI(ctxVars, args)
if (err != nil)
{
if (strings.Index(err.Error(), "already exists") > 0)
@@ -255,11 +257,7 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
}
volName := volVars["name"]
ctxVars, err := GetConnectionParams(volVars)
if (err != nil)
{
return nil, err
}
ctxVars, _, _ := GetConnectionParams(volVars)
_, err = invokeCLI(ctxVars, []string{ "rm", volName })
if (err != nil)
@@ -471,11 +469,7 @@ func (cs *ControllerServer) DeleteSnapshot(ctx context.Context, req *csi.DeleteS
volName := volVars["name"]
snapName := volVars["snapshot"]
ctxVars, err := GetConnectionParams(volVars)
if (err != nil)
{
return nil, err
}
ctxVars, _, _ := GetConnectionParams(volVars)
_, err = invokeCLI(ctxVars, []string{ "rm", volName+"@"+snapName })
if (err != nil)
@@ -502,11 +496,7 @@ func (cs *ControllerServer) ListSnapshots(ctx context.Context, req *csi.ListSnap
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
volName := volVars["name"]
ctxVars, err := GetConnectionParams(volVars)
if (err != nil)
{
return nil, err
}
ctxVars, _, _ := GetConnectionParams(volVars)
inodeCfg, err := invokeList(ctxVars, volName+"@*", false)
if (err != nil)
@@ -565,11 +555,7 @@ func (cs *ControllerServer) ControllerExpandVolume(ctx context.Context, req *csi
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
volName := volVars["name"]
ctxVars, err := GetConnectionParams(volVars)
if (err != nil)
{
return nil, err
}
ctxVars, _, _ := GetConnectionParams(volVars)
inodeCfg, err := invokeList(ctxVars, volName, true)
if (err != nil)

View File

@@ -5,15 +5,11 @@ package vitastor
import (
"context"
"errors"
"encoding/json"
"fmt"
"os"
"os/exec"
"path/filepath"
"strconv"
"encoding/json"
"strings"
"syscall"
"bytes"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
@@ -29,91 +25,16 @@ import (
type NodeServer struct
{
*Driver
useVduse bool
stateDir string
mounter mount.Interface
}
// DeviceState describes one VDUSE-mapped volume. mapVduse serializes it as JSON
// into the per-device state file, and restoreVduseDaemons reads it back to
// restart the storage daemon with the same parameters.
type DeviceState struct
{
// Path to the Vitastor configuration file (may be empty)
ConfigPath string `json:"configPath"`
// VDPA device identifier; it is derived from the state file name
VdpaId string `json:"vdpaId"`
// Name of the mapped Vitastor image
Image string `json:"image"`
// Path of the resulting block device under /dev
Blockdev string `json:"blockdev"`
// Whether the device is exported read-only
Readonly bool `json:"readonly"`
// Path to the PID file of the qemu-storage-daemon serving this device
PidFile string `json:"pidFile"`
}
// NewNodeServer create new instance node
func NewNodeServer(driver *Driver) *NodeServer
{
stateDir := os.Getenv("STATE_DIR")
if (stateDir == "")
{
stateDir = "/run/vitastor-csi"
}
if (stateDir[len(stateDir)-1] != '/')
{
stateDir += "/"
}
ns := &NodeServer{
return &NodeServer{
Driver: driver,
useVduse: checkVduseSupport(),
stateDir: stateDir,
mounter: mount.New(""),
}
if (ns.useVduse)
{
ns.restoreVduseDaemons()
}
return ns
}
func checkVduseSupport() bool
{
// Check VDUSE support (vdpa, vduse, virtio-vdpa kernel modules)
vduse := true
for _, mod := range []string{"vdpa", "vduse", "virtio-vdpa"}
{
_, err := os.Stat("/sys/module/"+mod)
if (err != nil)
{
if (!errors.Is(err, os.ErrNotExist))
{
klog.Errorf("failed to check /sys/module/%s: %v", mod, err)
}
c := exec.Command("/sbin/modprobe", mod)
c.Stdout = os.Stderr
c.Stderr = os.Stderr
err := c.Run()
if (err != nil)
{
klog.Errorf("/sbin/modprobe %s failed: %v", mod, err)
vduse = false
break
}
}
}
// Check that vdpa tool functions
if (vduse)
{
c := exec.Command("/sbin/vdpa", "-j", "dev")
c.Stderr = os.Stderr
err := c.Run()
if (err != nil)
{
klog.Errorf("/sbin/vdpa -j dev failed: %v", err)
vduse = false
}
}
if (!vduse)
{
klog.Errorf(
"Your host apparently has no VDUSE support. VDUSE support disabled, NBD will be used to map devices."+
" For VDUSE you need at least Linux 5.15 and the following kernel modules: vdpa, virtio-vdpa, vduse.",
)
}
return vduse
}
// NodeStageVolume mounts the volume to a staging path on the node.
@@ -140,303 +61,6 @@ func Contains(list []string, s string) bool
return false
}
func (ns *NodeServer) mapNbd(volName string, ctxVars map[string]string, readonly bool) (string, error)
{
// Map NBD device
// FIXME: Check if already mapped
args := []string{
"map", "--image", volName,
}
if (ctxVars["configPath"] != "")
{
args = append(args, "--config_path", ctxVars["configPath"])
}
if (readonly)
{
args = append(args, "--readonly", "1")
}
stdout, stderr, err := system("/usr/bin/vitastor-nbd", args...)
dev := strings.TrimSpace(string(stdout))
if (dev == "")
{
return "", fmt.Errorf("vitastor-nbd did not return the name of NBD device. output: %s", stderr)
}
return dev, err
}
func (ns *NodeServer) unmapNbd(devicePath string)
{
// unmap NBD device
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
if (unmapErr != nil)
{
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
}
}
func findByPidFile(pidFile string) (*os.Process, error)
{
klog.Infof("killing process with PID from file %s", pidFile)
pidBuf, err := os.ReadFile(pidFile)
if (err != nil)
{
return nil, err
}
pid, err := strconv.ParseInt(strings.TrimSpace(string(pidBuf)), 0, 64)
if (err != nil)
{
return nil, err
}
proc, err := os.FindProcess(int(pid))
if (err != nil)
{
return nil, err
}
return proc, nil
}
func killByPidFile(pidFile string) error
{
proc, err := findByPidFile(pidFile)
if (err != nil)
{
return err
}
return proc.Signal(syscall.SIGTERM)
}
func startStorageDaemon(vdpaId, volName, pidFile, configPath string, readonly bool) error
{
// Start qemu-storage-daemon
blockSpec := map[string]interface{}{
"node-name": "disk1",
"driver": "vitastor",
"image": volName,
"cache": map[string]bool{
"direct": true,
"no-flush": false,
},
"discard": "unmap",
}
if (configPath != "")
{
blockSpec["config-path"] = configPath
}
blockSpecJson, _ := json.Marshal(blockSpec)
writable := "true"
if (readonly)
{
writable = "false"
}
_, _, err := system(
"/usr/bin/qemu-storage-daemon", "--daemonize", "--pidfile", pidFile, "--blockdev", string(blockSpecJson),
"--export", "vduse-blk,id="+vdpaId+",node-name=disk1,name="+vdpaId+",num-queues=16,queue-size=128,writable="+writable,
)
return err
}
func (ns *NodeServer) mapVduse(volName string, ctxVars map[string]string, readonly bool) (string, string, error)
{
// Generate state file
stateFd, err := os.CreateTemp(ns.stateDir, "vitastor-vduse-*.json")
if (err != nil)
{
return "", "", err
}
stateFile := stateFd.Name()
stateFd.Close()
vdpaId := filepath.Base(stateFile)
vdpaId = vdpaId[0:len(vdpaId)-5] // remove ".json"
pidFile := ns.stateDir + vdpaId + ".pid"
// Map VDUSE device via qemu-storage-daemon
err = startStorageDaemon(vdpaId, volName, pidFile, ctxVars["configPath"], readonly)
if (err == nil)
{
// Add device to VDPA bus
_, _, err = system("/sbin/vdpa", "-j", "dev", "add", "name", vdpaId, "mgmtdev", "vduse")
if (err == nil)
{
// Find block device name
var matches []string
matches, err = filepath.Glob("/sys/bus/vdpa/devices/"+vdpaId+"/virtio*/block/*")
if (err == nil && len(matches) == 0)
{
err = errors.New("/sys/bus/vdpa/devices/"+vdpaId+"/virtio*/block/* is not found")
}
if (err == nil)
{
blockdev := "/dev/"+filepath.Base(matches[0])
_, err = os.Stat(blockdev)
if (err == nil)
{
// Generate state file
stateJSON, _ := json.Marshal(&DeviceState{
ConfigPath: ctxVars["configPath"],
VdpaId: vdpaId,
Image: volName,
Blockdev: blockdev,
Readonly: readonly,
PidFile: pidFile,
})
err = os.WriteFile(stateFile, stateJSON, 0600)
if (err == nil)
{
return blockdev, vdpaId, nil
}
}
}
}
killErr := killByPidFile(pidFile)
if (killErr != nil)
{
klog.Errorf("Failed to kill started qemu-storage-daemon: %v", killErr)
}
os.Remove(stateFile)
os.Remove(pidFile)
}
return "", "", err
}
func (ns *NodeServer) unmapVduse(devicePath string)
{
if (len(devicePath) < 6 || devicePath[0:6] != "/dev/v")
{
klog.Errorf("%s does not start with /dev/v", devicePath)
return
}
vduseDev, err := os.Readlink("/sys/block/"+devicePath[5:])
if (err != nil)
{
klog.Errorf("%s is not a symbolic link to VDUSE device (../devices/virtual/vduse/xxx): %v", devicePath, err)
return
}
vdpaId := ""
p := strings.Index(vduseDev, "/vduse/")
if (p >= 0)
{
vduseDev = vduseDev[p+7:]
p = strings.Index(vduseDev, "/")
if (p >= 0)
{
vdpaId = vduseDev[0:p]
}
}
if (vdpaId == "")
{
klog.Errorf("%s is not a symbolic link to VDUSE device (../devices/virtual/vduse/xxx), but is %v", devicePath, vduseDev)
return
}
ns.unmapVduseById(vdpaId)
}
func (ns *NodeServer) unmapVduseById(vdpaId string)
{
_, err := os.Stat("/sys/bus/vdpa/devices/"+vdpaId)
if (err != nil)
{
klog.Errorf("failed to stat /sys/bus/vdpa/devices/"+vdpaId+": %v", err)
}
else
{
_, _, _ = system("/sbin/vdpa", "-j", "dev", "del", vdpaId)
}
stateFile := ns.stateDir + vdpaId + ".json"
os.Remove(stateFile)
pidFile := ns.stateDir + vdpaId + ".pid"
_, err = os.Stat(pidFile)
if (os.IsNotExist(err))
{
// ok, already killed
}
else if (err != nil)
{
klog.Errorf("Failed to stat %v: %v", pidFile, err)
return
}
else
{
err = killByPidFile(pidFile)
if (err != nil)
{
klog.Errorf("Failed to kill started qemu-storage-daemon: %v", err)
}
os.Remove(pidFile)
}
}
func (ns *NodeServer) restoreVduseDaemons()
{
pattern := ns.stateDir+"vitastor-vduse-*.json"
matches, err := filepath.Glob(pattern)
if (err != nil)
{
klog.Errorf("failed to list %s: %v", pattern, err)
}
if (len(matches) == 0)
{
return
}
devList := make(map[string]interface{})
// example output: {"dev":{"test1":{"type":"block","mgmtdev":"vduse","vendor_id":0,"max_vqs":16,"max_vq_size":128}}}
devListJSON, _, err := system("/sbin/vdpa", "-j", "dev", "list")
if (err != nil)
{
return
}
err = json.Unmarshal(devListJSON, &devList)
devs, ok := devList["dev"].(map[string]interface{})
if (err != nil || !ok)
{
klog.Errorf("/sbin/vdpa -j dev list returned bad JSON (error %v): %v", err, string(devListJSON))
return
}
for _, stateFile := range matches
{
vdpaId := filepath.Base(stateFile)
vdpaId = vdpaId[0:len(vdpaId)-5]
// Check if VDPA device is still added to the bus
if (devs[vdpaId] != nil)
{
// Check if the storage daemon is still active
pidFile := ns.stateDir + vdpaId + ".pid"
exists := false
proc, err := findByPidFile(pidFile)
if (err == nil)
{
exists = proc.Signal(syscall.Signal(0)) == nil
}
if (!exists)
{
// Restart daemon
stateJSON, err := os.ReadFile(stateFile)
if (err != nil)
{
klog.Warningf("error reading state file %v: %v", stateFile, err)
}
else
{
var state DeviceState
err := json.Unmarshal(stateJSON, &state)
if (err != nil)
{
klog.Warningf("state file %v contains invalid JSON (error %v): %v", stateFile, err, string(stateJSON))
}
else
{
klog.Warningf("restarting storage daemon for volume %v (VDPA ID %v)", state.Image, vdpaId)
_ = startStorageDaemon(vdpaId, state.Image, pidFile, state.ConfigPath, state.Readonly)
}
}
}
}
else
{
// Unused, clean it up
ns.unmapVduseById(vdpaId)
}
}
}
// NodePublishVolume mounts the volume mounted to the staging path to the target path
func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (*csi.NodePublishVolumeResponse, error)
{
@@ -457,13 +81,13 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
if (err != nil)
{
klog.Errorf("failed to create block device mount target %s with error: %v", targetPath, err)
return nil, err
return nil, status.Error(codes.Internal, err.Error())
}
err = pathFile.Close()
if (err != nil)
{
klog.Errorf("failed to close %s with error: %v", targetPath, err)
return nil, err
return nil, status.Error(codes.Internal, err.Error())
}
}
else
@@ -472,13 +96,13 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
if (err != nil)
{
klog.Errorf("failed to create fs mount target %s with error: %v", targetPath, err)
return nil, err
return nil, status.Error(codes.Internal, err.Error())
}
}
}
else
{
return nil, err
return nil, status.Error(codes.Internal, err.Error())
}
}
@@ -490,25 +114,38 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
}
volName := ctxVars["name"]
_, err = GetConnectionParams(ctxVars)
if (err != nil)
_, etcdUrl, etcdPrefix := GetConnectionParams(ctxVars)
if (len(etcdUrl) == 0)
{
return nil, err
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
}
var devicePath, vdpaId string
if (!ns.useVduse)
// Map NBD device
// FIXME: Check if already mapped
args := []string{
"map", "--etcd_address", strings.Join(etcdUrl, ","),
"--etcd_prefix", etcdPrefix,
"--image", volName,
};
if (ctxVars["configPath"] != "")
{
devicePath, err = ns.mapNbd(volName, ctxVars, req.GetReadonly())
args = append(args, "--config_path", ctxVars["configPath"])
}
else
if (req.GetReadonly())
{
devicePath, vdpaId, err = ns.mapVduse(volName, ctxVars, req.GetReadonly())
args = append(args, "--readonly", "1")
}
c := exec.Command("/usr/bin/vitastor-nbd", args...)
var stdout, stderr bytes.Buffer
c.Stdout, c.Stderr = &stdout, &stderr
err = c.Run()
stdoutStr, stderrStr := string(stdout.Bytes()), string(stderr.Bytes())
if (err != nil)
{
return nil, err
klog.Errorf("vitastor-nbd map failed: %s, status %s\n", stdoutStr+stderrStr, err)
return nil, status.Error(codes.Internal, stdoutStr+stderrStr+" (status "+err.Error()+")")
}
devicePath := strings.TrimSpace(stdoutStr)
diskMounter := &mount.SafeFormatAndMount{Interface: ns.mounter, Exec: utilexec.New()}
if (isBlock)
@@ -590,15 +227,13 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
return &csi.NodePublishVolumeResponse{}, nil
unmap:
if (!ns.useVduse || len(devicePath) >= 8 && devicePath[0:8] == "/dev/nbd")
// unmap NBD device
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
if (unmapErr != nil)
{
ns.unmapNbd(devicePath)
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
}
else
{
ns.unmapVduseById(vdpaId)
}
return nil, err
return nil, status.Error(codes.Internal, err.Error())
}
// NodeUnpublishVolume unmounts the volume from the target path
@@ -613,31 +248,25 @@ func (ns *NodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpu
{
return nil, status.Error(codes.NotFound, "Target path not found")
}
return nil, err
return nil, status.Error(codes.Internal, err.Error())
}
if (devicePath == "")
{
// volume not mounted
klog.Warningf("%s is not a mountpoint, deleting", targetPath)
os.Remove(targetPath)
return &csi.NodeUnpublishVolumeResponse{}, nil
return nil, status.Error(codes.NotFound, "Volume not mounted")
}
// unmount
err = mount.CleanupMountPoint(targetPath, ns.mounter, false)
if (err != nil)
{
return nil, err
return nil, status.Error(codes.Internal, err.Error())
}
// unmap NBD device
if (refCount == 1)
{
if (!ns.useVduse)
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
if (unmapErr != nil)
{
ns.unmapNbd(devicePath)
}
else
{
ns.unmapVduse(devicePath)
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
}
}
return &csi.NodeUnpublishVolumeResponse{}, nil

4
debian/changelog vendored
View File

@@ -1,10 +1,10 @@
vitastor (1.3.1-1) unstable; urgency=medium
vitastor (1.1.0-1) unstable; urgency=medium
* Bugfixes
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
vitastor (0.7.0-1) unstable; urgency=medium
vitastor (1.1.0-1) unstable; urgency=medium
* Implement NFS proxy
* Add documentation

View File

@@ -7,7 +7,7 @@ ARG REL=
WORKDIR /root
RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" -o "$REL" = "bookworm" ]; then \
RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" ]; then \
echo "deb http://deb.debian.org/debian $REL-backports main" >> /etc/apt/sources.list; \
echo >> /etc/apt/preferences; \
echo 'Package: *' >> /etc/apt/preferences; \
@@ -45,7 +45,7 @@ RUN set -e; \
rm -rf /root/packages/qemu-$REL/*; \
cd /root/packages/qemu-$REL; \
dpkg-source -x /root/qemu*.dsc; \
QEMU_VER=$(ls -d qemu*/ | perl -pe 's!^.*?(\d+\.\d+).*!$1!'); \
QEMU_VER=$(ls -d qemu*/ | perl -pe 's!^.*(\d+\.\d+).*!$1!'); \
D=$(ls -d qemu*/); \
cp /root/vitastor/patches/qemu-$QEMU_VER-vitastor.patch ./qemu-*/debian/patches; \
echo qemu-$QEMU_VER-vitastor.patch >> $D/debian/patches/series; \

View File

@@ -35,8 +35,8 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-1.3.1; \
cd vitastor-1.3.1; \
cp -r /root/vitastor vitastor-1.1.0; \
cd vitastor-1.1.0; \
ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -49,8 +49,8 @@ RUN set -e -x; \
rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.3.1.orig.tar.xz vitastor-1.3.1; \
cd vitastor-1.3.1; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.1.0.orig.tar.xz vitastor-1.1.0; \
cd vitastor-1.1.0; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

View File

@@ -15,9 +15,6 @@ the cluster.
- [client_max_buffered_bytes](#client_max_buffered_bytes)
- [client_max_buffered_ops](#client_max_buffered_ops)
- [client_max_writeback_iodepth](#client_max_writeback_iodepth)
- [nbd_timeout](#nbd_timeout)
- [nbd_max_devices](#nbd_max_devices)
- [nbd_max_part](#nbd_max_part)
## client_max_dirty_bytes
@@ -104,34 +101,3 @@ Multiple consecutive modified data regions are counted as 1 write here.
- Can be changed online: yes
Maximum number of parallel writes when flushing buffered data to the server.
## nbd_timeout
- Type: seconds
- Default: 300
Timeout for I/O operations on [NBD](../usage/nbd.en.md) devices. If an operation
takes longer than this timeout, including the case when your cluster is just
temporarily down for longer than the timeout, the NBD device will detach by
itself (and possibly break the mounted file system).
You can set timeout to 0 to never detach, but in that case you won't be
able to remove the kernel device at all if the NBD process dies - you'll have
to reboot the host.
## nbd_max_devices
- Type: integer
- Default: 64
Maximum number of NBD devices in the system. This value is passed as
`nbds_max` parameter for the nbd kernel module when vitastor-nbd autoloads it.
## nbd_max_part
- Type: integer
- Default: 3
Maximum number of partitions per NBD device. This value is passed as
`max_part` parameter for the nbd kernel module when vitastor-nbd autoloads it.
Note that (nbds_max)*(1+max_part) usually can't exceed 256.

View File

@@ -15,9 +15,6 @@
- [client_max_buffered_bytes](#client_max_buffered_bytes)
- [client_max_buffered_ops](#client_max_buffered_ops)
- [client_max_writeback_iodepth](#client_max_writeback_iodepth)
- [nbd_timeout](#nbd_timeout)
- [nbd_max_devices](#nbd_max_devices)
- [nbd_max_part](#nbd_max_part)
## client_max_dirty_bytes
@@ -104,34 +101,3 @@
- Можно менять на лету: да
Максимальное число параллельных операций записи при сбросе буферов на сервер.
## nbd_timeout
- Тип: секунды
- Значение по умолчанию: 300
Таймаут для операций чтения/записи через [NBD](../usage/nbd.ru.md). Если
операция выполняется дольше таймаута, включая временную недоступность
кластера на время, большее таймаута, NBD-устройство отключится само собой
(и, возможно, сломает примонтированную ФС).
Вы можете установить таймаут в 0, чтобы никогда не отключать устройство по
таймауту, но в этом случае вы вообще не сможете удалить устройство, если
процесс NBD умрёт - вам придётся перезагружать сервер.
## nbd_max_devices
- Тип: целое число
- Значение по умолчанию: 64
Максимальное число NBD-устройств в системе. Данное значение передаётся
модулю ядра nbd как параметр `nbds_max`, когда его загружает vitastor-nbd.
## nbd_max_part
- Тип: целое число
- Значение по умолчанию: 3
Максимальное число разделов на одном NBD-устройстве. Данное значение передаётся
модулю ядра nbd как параметр `max_part`, когда его загружает vitastor-nbd.
Имейте в виду, что (nbds_max)*(1+max_part) обычно не может превышать 256.

View File

@@ -20,7 +20,6 @@ between clients, OSDs and etcd.
- [rdma_max_msg](#rdma_max_msg)
- [rdma_max_recv](#rdma_max_recv)
- [rdma_max_send](#rdma_max_send)
- [rdma_odp](#rdma_odp)
- [peer_connect_interval](#peer_connect_interval)
- [peer_connect_timeout](#peer_connect_timeout)
- [osd_idle_timeout](#osd_idle_timeout)
@@ -69,14 +68,11 @@ but they are not connected to the cluster.
- Type: string
RDMA device name to use for Vitastor OSD communications (for example,
"rocep5s0f0"). Now Vitastor supports all adapters, even ones without
ODP support, like Mellanox ConnectX-3 and non-Mellanox cards.
Versions up to Vitastor 1.2.0 required ODP which is only present in
Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp).
Run `ibv_devinfo -v` as root to list available RDMA devices and their
features.
"rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
to work. For example, Mellanox ConnectX-3 and older adapters don't have
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
root to list available RDMA devices and their features.
Remember that you also have to configure your network switches if you use
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
@@ -151,28 +147,6 @@ less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
Doesn't affect memory usage - additional memory isn't allocated for send
operations.
## rdma_odp
- Type: boolean
- Default: false
Use RDMA with On-Demand Paging. ODP is currently only available on Mellanox
ConnectX-4 and newer adapters. With ODP, memory does not have to be registered
explicitly for the RDMA adapter to be able to use it. This, in turn, makes it
possible to skip memory copying during sending. One would think this should
improve performance, but **in reality** RDMA performance with ODP is
**drastically** worse. For example, a 3-node cluster with 8 NVMe drives in each
node and a 2*25 GBit/s ConnectX-6 RDMA network pushes 3950000 read iops without
ODP, but only 239000 iops with ODP...
This happens because the Mellanox ODP implementation seems to be based on
message retransmissions when the adapter doesn't know about the buffer yet -
it likely uses standard "RNR retransmissions" (RNR = receiver not ready),
which are generally slow in RDMA/RoCE networks. Here's a presentation about
it from the ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
ODP support is retained in the code just in case a good ODP implementation
appears one day.
## peer_connect_interval
- Type: seconds

View File

@@ -20,7 +20,6 @@
- [rdma_max_msg](#rdma_max_msg)
- [rdma_max_recv](#rdma_max_recv)
- [rdma_max_send](#rdma_max_send)
- [rdma_odp](#rdma_odp)
- [peer_connect_interval](#peer_connect_interval)
- [peer_connect_timeout](#peer_connect_timeout)
- [osd_idle_timeout](#osd_idle_timeout)
@@ -72,15 +71,12 @@ RDMA может быть нужно только если у клиентов е
- Тип: строка
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
Сейчас Vitastor поддерживает все модели адаптеров, включая те, у которых
нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и
картами производства не Mellanox.
Версии Vitastor до 1.2.0 включительно требовали ODP, который есть только
на Mellanox ConnectX 4 и более новых. См. также [rdma_odp](#rdma_odp).
Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть
список доступных RDMA-устройств, их параметры и возможности.
Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
параметры и возможности.
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
@@ -159,29 +155,6 @@ OSD в любом случае согласовывают реальное зн
Не влияет на потребление памяти - дополнительная память на операции отправки
не выделяется.
## rdma_odp
- Тип: булево (да/нет)
- Значение по умолчанию: false
Использовать RDMA с On-Demand Paging. ODP - функция, доступная пока что
исключительно на адаптерах Mellanox ConnectX-4 и более новых. ODP позволяет
не регистрировать память для её использования RDMA-картой. Благодаря этому
можно не копировать данные при отправке их в сеть и, казалось бы, это должно
улучшать производительность - но **по факту** получается так, что
производительность только ухудшается, причём сильно. Пример - на 3-узловом
кластере с 8 NVMe в каждом узле и сетью 2*25 Гбит/с на чтение с RDMA без ODP
удаётся снять 3950000 iops, а с ODP - всего 239000 iops...
Это происходит из-за того, что реализация ODP у Mellanox неоптимальная и
основана на повторной передаче сообщений, когда карте не известен буфер -
вероятно, на стандартных "RNR retransmission" (RNR = receiver not ready).
А данные повторные передачи в RDMA/RoCE - всегда очень медленная штука.
Презентация на эту тему с конференции ISPASS-2021: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
Возможность использования ODP сохранена в коде на случай, если вдруг в один
прекрасный день появится хорошая реализация ODP.
## peer_connect_interval
- Тип: секунды

View File

@@ -1,4 +1,4 @@
# Client Parameters
These parameters apply only to Vitastor clients (QEMU, fio, NBD and so on) and
affect their interaction with the cluster.
These parameters apply only to clients and affect their interaction with
the cluster.

View File

@@ -1,4 +1,4 @@
# Параметры клиентского кода
Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD и т.п.) и
Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD) и
затрагивают логику их работы с кластером.

View File

@@ -122,47 +122,3 @@
Maximum number of parallel writes when flushing buffered data to the server.
info_ru: |
Максимальное число параллельных операций записи при сбросе буферов на сервер.
- name: nbd_timeout
type: sec
default: 300
online: false
info: |
Timeout for I/O operations for [NBD](../usage/nbd.en.md). If an operation
executes for longer than this timeout, including when your cluster is just
temporarily down for longer than the timeout, the NBD device will detach by itself
(and possibly break the mounted file system).
You can set timeout to 0 to never detach, but in that case you won't be
able to remove the kernel device at all if the NBD process dies - you'll have
to reboot the host.
info_ru: |
Таймаут для операций чтения/записи через [NBD](../usage/nbd.ru.md). Если
операция выполняется дольше таймаута, включая временную недоступность
кластера на время, большее таймаута, NBD-устройство отключится само собой
(и, возможно, сломает примонтированную ФС).
Вы можете установить таймаут в 0, чтобы никогда не отключать устройство по
таймауту, но в этом случае вы вообще не сможете удалить устройство, если
процесс NBD умрёт - вам придётся перезагружать сервер.
- name: nbd_max_devices
type: int
default: 64
online: false
info: |
Maximum number of NBD devices in the system. This value is passed as
`nbds_max` parameter for the nbd kernel module when vitastor-nbd autoloads it.
info_ru: |
Максимальное число NBD-устройств в системе. Данное значение передаётся
модулю ядра nbd как параметр `nbds_max`, когда его загружает vitastor-nbd.
- name: nbd_max_part
type: int
default: 3
online: false
info: |
Maximum number of partitions per NBD device. This value is passed as
`max_part` parameter for the nbd kernel module when vitastor-nbd autoloads it.
Note that (nbds_max)*(1+max_part) usually can't exceed 256.
info_ru: |
Максимальное число разделов на одном NBD-устройстве. Данное значение передаётся
модулю ядра nbd как параметр `max_part`, когда его загружает vitastor-nbd.
Имейте в виду, что (nbds_max)*(1+max_part) обычно не может превышать 256.

View File

@@ -30,6 +30,18 @@
будут использоваться обычные синхронные системные вызовы send/recv. Для OSD
это бессмысленно, так как OSD в любом случае нуждается в io_uring, но, в
принципе, это может применяться для клиентов со старыми версиями ядра.
- name: use_zerocopy_send
type: bool
default: false
info: |
If true, OSDs and clients will attempt to use TCP zero-copy send
(MSG_ZEROCOPY) for big buffers. It's recommended to raise net.ipv4.tcp_wmem
and net.core.wmem_max sysctls when using this mode.
info_ru: |
Если установлено в true, то OSD и клиенты будут стараться использовать
TCP-отправку без копирования (MSG_ZEROCOPY) для больших буферов данных.
Рекомендуется поднять значения sysctl net.ipv4.tcp_wmem и net.core.wmem_max
при использовании этого режима.
- name: use_rdma
type: bool
default: true
@@ -48,14 +60,11 @@
type: string
info: |
RDMA device name to use for Vitastor OSD communications (for example,
"rocep5s0f0"). Now Vitastor supports all adapters, even ones without
ODP support, like Mellanox ConnectX-3 and non-Mellanox cards.
Versions up to Vitastor 1.2.0 required ODP which is only present in
Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp).
Run `ibv_devinfo -v` as root to list available RDMA devices and their
features.
"rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
to work. For example, Mellanox ConnectX-3 and older adapters don't have
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
root to list available RDMA devices and their features.
Remember that you also have to configure your network switches if you use
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
@@ -64,15 +73,12 @@
PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
info_ru: |
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
Сейчас Vitastor поддерживает все модели адаптеров, включая те, у которых
нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и
картами производства не Mellanox.
Версии Vitastor до 1.2.0 включительно требовали ODP, который есть только
на Mellanox ConnectX 4 и более новых. См. также [rdma_odp](#rdma_odp).
Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть
список доступных RDMA-устройств, их параметры и возможности.
Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
параметры и возможности.
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
@@ -166,45 +172,6 @@
у принимающей стороны в процессе работы не заканчивались буферы на приём.
Не влияет на потребление памяти - дополнительная память на операции отправки
не выделяется.
- name: rdma_odp
type: bool
default: false
online: false
info: |
Use RDMA with On-Demand Paging. ODP is currently only available on Mellanox
ConnectX-4 and newer adapters. With ODP, memory doesn't have to be registered
explicitly before the RDMA adapter can use it. This, in turn, makes it possible
to skip memory copying during sending. One would think this should improve performance, but
**in reality** RDMA performance with ODP is **drastically** worse. For example,
a 3-node cluster with 8 NVMe in each node and 2*25 GBit/s ConnectX-6 RDMA network
pushes 3950000 read iops without ODP, but only 239000 iops with ODP...
This happens because Mellanox ODP implementation seems to be based on
message retransmissions when the adapter doesn't know about the buffer yet -
it likely uses standard "RNR retransmissions" (RNR = receiver not ready)
which is generally slow in RDMA/RoCE networks. Here's a presentation about
it from ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
ODP support is retained in the code just in case a good ODP implementation
appears one day.
info_ru: |
Использовать RDMA с On-Demand Paging. ODP - функция, доступная пока что
исключительно на адаптерах Mellanox ConnectX-4 и более новых. ODP позволяет
не регистрировать память для её использования RDMA-картой. Благодаря этому
можно не копировать данные при отправке их в сеть и, казалось бы, это должно
улучшать производительность - но **по факту** получается так, что
производительность только ухудшается, причём сильно. Пример - на 3-узловом
кластере с 8 NVMe в каждом узле и сетью 2*25 Гбит/с на чтение с RDMA без ODP
удаётся снять 3950000 iops, а с ODP - всего 239000 iops...
Это происходит из-за того, что реализация ODP у Mellanox неоптимальная и
основана на повторной передаче сообщений, когда карте не известен буфер -
вероятно, на стандартных "RNR retransmission" (RNR = receiver not ready).
А данные повторные передачи в RDMA/RoCE - всегда очень медленная штука.
Презентация на эту тему с конференции ISPASS-2021: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
Возможность использования ODP сохранена в коде на случай, если вдруг в один
прекрасный день появится хорошая реализация ODP.
- name: peer_connect_interval
type: sec
min: 1

View File

@@ -17,26 +17,4 @@ and apply all `NNN-*.yaml` manifests to your Kubernetes installation:
for i in ./???-*.yaml; do kubectl apply -f $i; done
```
After that you'll be able to create PersistentVolumes.
**Important:** For best experience, use Linux kernel at least 5.15 with [VDUSE](../usage/qemu.en.md#vduse)
kernel modules enabled (vdpa, vduse, virtio-vdpa). If your distribution doesn't
have them pre-built - build them yourself ([instructions](../usage/qemu.en.md#vduse)),
I promise it's worth it :-). When VDUSE is unavailable, CSI driver uses [NBD](../usage/nbd.en.md)
to map Vitastor devices. NBD is slower and prone to timeout issues: if Vitastor
cluster becomes unresponsive for more than [nbd_timeout](../config/client.en.md#nbd_timeout),
the NBD device detaches and breaks pods using it.
## Features
Vitastor CSI supports:
- Kubernetes starting with 1.20 (or 1.17 for older vitastor-csi <= 1.1.0)
- Filesystem RWO (ReadWriteOnce) volumes. Example: [PVC](../../csi/deploy/example-pvc.yaml), [pod](../../csi/deploy/example-test-pod.yaml)
- Raw block RWX (ReadWriteMany) volumes. Example: [PVC](../../csi/deploy/example-pvc-block.yaml), [pod](../../csi/deploy/example-test-pod-block.yaml)
- Volume expansion
- Volume snapshots. Example: [snapshot class](../../csi/deploy/example-snapshot-class.yaml), [snapshot](../../csi/deploy/example-snapshot.yaml), [clone](../../csi/deploy/example-snapshot-clone.yaml)
- [VDUSE](../usage/qemu.en.md#vduse) (preferred) and [NBD](../usage/nbd.en.md) device mapping methods
- Upgrades with VDUSE - new handler processes are restarted when CSI pods are restarted themselves
- Multiple clusters by using multiple configuration files in ConfigMap.
Remember that to use snapshots with CSI you also have to install [Snapshot Controller and CRDs](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
After that you'll be able to create PersistentVolumes. See example in [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).

View File

@@ -17,26 +17,4 @@
for i in ./???-*.yaml; do kubectl apply -f $i; done
```
После этого вы сможете создавать PersistentVolume.
**Важно:** Лучше всего использовать ядро Linux версии не менее 5.15 с включёнными модулями
[VDUSE](../usage/qemu.ru.md#vduse) (vdpa, vduse, virtio-vdpa). Если в вашем дистрибутиве
они не собраны из коробки - соберите их сами, обещаю, что это стоит того ([инструкция](../usage/qemu.ru.md#vduse)) :-).
Когда VDUSE недоступно, CSI-плагин использует [NBD](../usage/nbd.ru.md) для подключения
дисков, а NBD медленнее и имеет проблему таймаута - если кластер остаётся недоступным
дольше, чем [nbd_timeout](../config/client.ru.md#nbd_timeout), NBD-устройство отключается
и ломает поды, использующие его.
## Возможности
CSI-плагин Vitastor поддерживает:
- Версии Kubernetes, начиная с 1.20 (или с 1.17 для более старых vitastor-csi <= 1.1.0)
- Файловые RWO (ReadWriteOnce) тома. Пример: [PVC](../../csi/deploy/example-pvc.yaml), [под](../../csi/deploy/example-test-pod.yaml)
- Сырые блочные RWX (ReadWriteMany) тома. Пример: [PVC](../../csi/deploy/example-pvc-block.yaml), [под](../../csi/deploy/example-test-pod-block.yaml)
- Расширение размера томов
- Снимки томов. Пример: [класс снимков](../../csi/deploy/example-snapshot-class.yaml), [снимок](../../csi/deploy/example-snapshot.yaml), [клон снимка](../../csi/deploy/example-snapshot-clone.yaml)
- Способы подключения устройств [VDUSE](../usage/qemu.ru.md#vduse) (предпочитаемый) и [NBD](../usage/nbd.ru.md)
- Обновление при использовании VDUSE - новые процессы-обработчики устройств успешно перезапускаются вместе с самими подами CSI
- Несколько кластеров через задание нескольких файлов конфигурации в ConfigMap.
Не забывайте, что для использования снимков нужно сначала установить [контроллер снимков и CRD](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
После этого вы сможете создавать PersistentVolume. Пример смотрите в файле [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).

View File

@@ -18,7 +18,7 @@
stable version from 0.9.x branch instead of 1.x
- For Debian 10 (Buster) also enable backports repository:
`deb http://deb.debian.org/debian buster-backports main`
- Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`
- Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu`
## CentOS

View File

@@ -18,7 +18,7 @@
установить последнюю стабильную версию из ветки 0.9.x вместо 1.x
- Для Debian 10 (Buster) также включите репозиторий backports:
`deb http://deb.debian.org/debian buster-backports main`
- Установите пакеты: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`
- Установите пакеты: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu`
## CentOS

View File

@@ -6,10 +6,10 @@
# Proxmox VE
To enable Vitastor support in Proxmox Virtual Environment (6.4-8.1 are supported):
To enable Vitastor support in Proxmox Virtual Environment (6.4-8.0 are supported):
- Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts:
bookworm for 8.1, pve8.0 for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
bookworm for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
- Install vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* or see note) packages from Vitastor repository
- Define storage in `/etc/pve/storage.cfg` (see below)
- Block network access from VMs to Vitastor network (to OSDs and etcd),

View File

@@ -6,10 +6,10 @@
# Proxmox VE
Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-8.1):
Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-8.0):
- Добавьте соответствующий Debian-репозиторий Vitastor в sources.list на хостах Proxmox:
bookworm для 8.1, pve8.0 для 8.0, bullseye для 7.4, pve7.3 для 7.3, pve7.2 для 7.2, pve7.1 для 7.1, buster для 6.4
bookworm для 8.0, bullseye для 7.4, pve7.3 для 7.3, pve7.2 для 7.2, pve7.1 для 7.1, buster для 6.4
- Установите пакеты vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* или см. сноску) из репозитория Vitastor
- Определите тип хранилища в `/etc/pve/storage.cfg` (см. ниже)
- Обязательно заблокируйте доступ от виртуальных машин к сети Vitastor (OSD и etcd), т.к. Vitastor (пока) не поддерживает аутентификацию

View File

@@ -54,8 +54,7 @@
виртуальные диски, их снимки и клоны.
- **Драйвер QEMU** — подключаемый модуль QEMU, позволяющий QEMU/KVM виртуальным машинам работать
с виртуальными дисками Vitastor напрямую из пространства пользователя с помощью клиентской
библиотеки, без необходимости отображения дисков в виде блочных устройств. Тот же драйвер
позволяет подключать диски в систему через [VDUSE](../usage/qemu.ru.md#vduse).
библиотеки, без необходимости отображения дисков в виде блочных устройств.
- **vitastor-nbd** — утилита, позволяющая монтировать образы Vitastor в виде блочных устройств
с помощью NBD (Network Block Device), на самом деле скорее работающего как "BUSE"
(Block Device In Userspace). Модуля ядра Linux для выполнения той же задачи в Vitastor нет

View File

@@ -28,8 +28,7 @@ It supports the following commands:
Global options:
```
--config_file FILE Path to Vitastor configuration file
--etcd_address URL Etcd connection address
--etcd_address ADDR Etcd connection address
--iodepth N Send N operations in parallel to each OSD when possible (default 32)
--parallel_osds M Work with M osds in parallel when possible (default 4)
--progress 1|0 Report progress (default 1)

View File

@@ -27,8 +27,7 @@ vitastor-cli - интерфейс командной строки для адм
Глобальные опции:
```
--config_file FILE Путь к файлу конфигурации Vitastor
--etcd_address URL Адрес соединения с etcd
--etcd_address ADDR Адрес соединения с etcd
--iodepth N Отправлять параллельно N операций на каждый OSD (по умолчанию 32)
--parallel_osds M Работать параллельно с M OSD (по умолчанию 4)
--progress 1|0 Печатать прогресс выполнения (по умолчанию 1)

View File

@@ -17,7 +17,6 @@ It supports the following commands:
- [purge](#purge)
- [read-sb](#read-sb)
- [write-sb](#write-sb)
- [update-sb](#update-sb)
- [udev](#udev)
- [exec-osd](#exec-osd)
- [pre-exec](#pre-exec)
@@ -183,14 +182,6 @@ Try to read Vitastor OSD superblock from `<device>` and print it in JSON format.
Read JSON from STDIN and write it into Vitastor OSD superblock on `<device>`.
## update-sb
`vitastor-disk update-sb <device> [--force] [--<parameter> <value>] [...]`
Read Vitastor OSD superblock from `<device>`, update parameters in it and write it back.
`--force` allows ignoring validation errors.
## udev
`vitastor-disk udev <device>`

View File

@@ -17,7 +17,6 @@ vitastor-disk - инструмент командной строки для уп
- [purge](#purge)
- [read-sb](#read-sb)
- [write-sb](#write-sb)
- [update-sb](#update-sb)
- [udev](#udev)
- [exec-osd](#exec-osd)
- [pre-exec](#pre-exec)
@@ -188,15 +187,6 @@ throttle_target_mbs, throttle_target_parallelism, throttle_threshold_us.
Прочитать JSON со стандартного ввода и записать его в суперблок OSD на диск `<device>`.
## update-sb
`vitastor-disk update-sb <device> [--force] [--<параметр> <значение>] [...]`
Прочитать суперблок OSD с диска `<device>`, изменить в нём заданные параметры и записать обратно.
Опция `--force` позволяет читать суперблок, даже если он считается некорректным
из-за ошибок валидации.
## udev
`vitastor-disk udev <device>`

View File

@@ -11,25 +11,25 @@ NBD stands for "Network Block Device", but in fact it also functions as "BUSE"
NBD slightly lowers the performance due to additional overhead, but performance still
remains decent (see an example [here](../performance/comparison1.en.md#vitastor-0-4-0-nbd)).
See also [VDUSE](qemu.en.md#vduse) as a better alternative to NBD.
Vitastor Kubernetes CSI driver is based on NBD.
Vitastor Kubernetes CSI driver uses NBD when VDUSE is unavailable.
See also [VDUSE](qemu.en.md#vduse).
## Map image
To create a local block device for a Vitastor image run:
```
vitastor-nbd map --image testimg
vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
```
It will output a block device name like /dev/nbd0 which you can then use as a normal disk.
You can also use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.
vitastor-nbd supports all usual Vitastor configuration options like `--config_file <path_to_config>` plus NBD-specific:
Additional options for map command:
* `--nbd_timeout 300` \
* `--nbd_timeout 30` \
Timeout for I/O operations in seconds after exceeding which the kernel stops
the device. You can set it to 0 to disable the timeout, but beware that you
won't be able to stop the device at all if vitastor-nbd process dies.
@@ -44,9 +44,6 @@ vitastor-nbd supports all usual Vitastor configuration options like `--config_fi
* `--foreground 1` \
Stay in foreground, do not daemonize.
Note that `nbd_timeout`, `nbd_max_devices` and `nbd_max_part` options may also be specified
in `/etc/vitastor/vitastor.conf` or in other configuration file specified with `--config_file`.
## Unmap image
To unmap the device run:

View File

@@ -14,16 +14,16 @@ NBD на данный момент необходимо, чтобы монтир
NBD немного снижает производительность из-за дополнительных копирований памяти,
но она всё равно остаётся на неплохом уровне (см. для примера [тест](../performance/comparison1.ru.md#vitastor-0-4-0-nbd)).
Смотрите также [VDUSE](qemu.ru.md#vduse), как лучшую альтернативу NBD.
CSI-драйвер Kubernetes Vitastor основан на NBD.
CSI-драйвер Kubernetes Vitastor использует NBD, когда VDUSE недоступен.
Смотрите также [VDUSE](qemu.ru.md#vduse).
## Подключить устройство
Чтобы создать локальное блочное устройство для образа, выполните команду:
```
vitastor-nbd map --image testimg
vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
```
Команда напечатает название блочного устройства вида /dev/nbd0, которое потом можно
@@ -32,8 +32,7 @@ vitastor-nbd map --image testimg
Для обращения по номеру инода, аналогично другим командам, можно использовать опции
`--pool <POOL> --inode <INODE> --size <SIZE>` вместо `--image testimg`.
vitastor-nbd поддерживает все обычные опции Vitastor, например, `--config_file <path_to_config>`,
плюс специфичные для NBD:
Дополнительные опции для команды подключения NBD-устройства:
* `--nbd_timeout 30` \
Максимальное время выполнения любой операции чтения/записи в секундах, при
@@ -54,10 +53,6 @@ vitastor-nbd поддерживает все обычные опции Vitastor,
* `--foreground 1` \
Не уводить процесс в фоновый режим.
Обратите внимание, что опции `nbd_timeout`, `nbd_max_devices` и `nbd_max_part` можно
также задавать в `/etc/vitastor/vitastor.conf` или в другом файле конфигурации,
заданном опцией `--config_file`.
## Отключить устройство
Для отключения устройства выполните:

View File

@@ -23,7 +23,7 @@ balancer or any failover method you want to in that case.
vitastor-nfs usage:
```
vitastor-nfs [STANDARD OPTIONS] [OTHER OPTIONS]
vitastor-nfs [--etcd_address ADDR] [OTHER OPTIONS]
--subdir <DIR> export images prefixed <DIR>/ (default empty - export all images)
--portmap 0 do not listen on port 111 (portmap/rpcbind, requires root)

View File

@@ -22,7 +22,7 @@
Использование vitastor-nfs:
```
vitastor-nfs [СТАНДАРТНЫЕ ОПЦИИ] [ДРУГИЕ ОПЦИИ]
vitastor-nfs [--etcd_address ADDR] [ДРУГИЕ ОПЦИИ]
--subdir <DIR> экспортировать "поддиректорию" - образы с префиксом имени <DIR>/ (по умолчанию пусто - экспортировать все образы)
--portmap 0 отключить сервис portmap/rpcbind на порту 111 (по умолчанию включён и требует root привилегий)

View File

@@ -127,46 +127,19 @@ Linux kernel, starting with version 5.15, supports a new interface for attaching
to the host - VDUSE (vDPA Device in Userspace). QEMU, starting with 7.2, has support for
exporting QEMU block devices over this protocol using qemu-storage-daemon.
VDUSE is currently the best interface to attach Vitastor disks as kernel devices because:
- It avoids data copies and thus achieves much better performance than [NBD](nbd.en.md)
- It doesn't have NBD timeout problem - the device doesn't die if an operation executes for too long
- It doesn't have hung device problem - if the userspace process dies it can be restarted (!)
and block device will continue operation
- It doesn't seem to have the device number limit
VDUSE has the same problem as other FUSE-like interfaces in Linux: if a userspace process hangs,
for example, if it loses connectivity with Vitastor cluster - active processes doing I/O may
hang in the D state (uninterruptible sleep) and you won't be able to kill them even with kill -9.
In this case reboot will be the only way to remove VDUSE devices from system.
Example performance comparison:
| | direct fio | NBD | VDUSE |
|----------------------|-------------|-------------|-------------|
| linear write | 3.85 GB/s | 1.12 GB/s | 3.85 GB/s |
| 4k random write Q128 | 240000 iops | 120000 iops | 178000 iops |
| 4k random write Q1 | 9500 iops | 7620 iops | 7640 iops |
| linear read | 4.3 GB/s | 1.8 GB/s | 2.85 GB/s |
| 4k random read Q128 | 287000 iops | 140000 iops | 189000 iops |
| 4k random read Q1 | 9600 iops | 7640 iops | 7780 iops |
On the other hand, VDUSE is faster than [NBD](nbd.en.md), so you may prefer to use it if
performance is important for you. Approximate performance numbers:
direct fio benchmark - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
To try VDUSE you need at least Linux 5.15, built with VDUSE support
(CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
Debian Linux kernels have these options disabled for now, so if you want to try it on Debian,
use a kernel from Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/), Proxmox,
or build modules for Debian kernel manually:
```
mkdir build
cd build
apt-get install linux-headers-`uname -r`
apt-get build-dep linux-image-`uname -r`-unsigned
apt-get source linux-image-`uname -r`-unsigned
cd linux*/drivers/vdpa
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
cd ../virtio
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
depmod -a
```
You also need `vdpa` tool from the `iproute2` package.
(CONFIG_VIRTIO_VDPA=m and CONFIG_VDPA_USER=m). Debian Linux kernels have these options
disabled for now, so if you want to try it on Debian, use a kernel from Ubuntu
[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) or Proxmox.
Commands to attach Vitastor image as a VDUSE device:
@@ -179,7 +152,7 @@ qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitas
vdpa dev add name test1 mgmtdev vduse
```
After running these commands, `/dev/vda` device will appear in the system and you'll be able to
After running these commands /dev/vda device will appear in the system and you'll be able to
use it as a normal disk.
To remove the device:

View File

@@ -129,47 +129,19 @@ qemu-system-x86_64 -enable-kvm -m 2048 -M accel=kvm,memory-backend=mem \
к системе - VDUSE (vDPA Device in Userspace), а в QEMU, начиная с версии 7.2, есть поддержка
экспорта блочных устройств QEMU по этому протоколу через qemu-storage-daemon.
VDUSE - на данный момент лучший интерфейс для подключения дисков Vitastor в виде блочных
устройств на уровне ядра, ибо:
- VDUSE не копирует данные и поэтому достигает значительно лучшей производительности, чем [NBD](nbd.ru.md)
- Также оно не имеет проблемы NBD-таймаута - устройство не умирает, если операция выполняется слишком долго
- Также оно не имеет проблемы подвисающих устройств - если процесс-обработчик умирает, его можно
перезапустить (!) и блочное устройство продолжит работать
- По-видимому, у него нет предела числа подключаемых в систему устройств
VDUSE страдает общей проблемой FUSE-подобных интерфейсов в Linux: если пользовательский процесс
подвиснет, например, если будет потеряна связь с кластером Vitastor - читающие/пишущие в кластер
процессы могут "залипнуть" в состоянии D (непрерываемый сон) и их будет невозможно убить даже
через kill -9. В этом случае удалить из системы устройство можно только перезагрузившись.
Пример сравнения производительности:
С другой стороны, VDUSE быстрее по сравнению с [NBD](nbd.ru.md), поэтому его может
быть предпочтительно использовать там, где производительность важнее. Порядок показателей:
прямое тестирование через fio - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
| | Прямой fio | NBD | VDUSE |
|--------------------------|-------------|-------------|-------------|
| линейная запись | 3.85 GB/s | 1.12 GB/s | 3.85 GB/s |
| 4k случайная запись Q128 | 240000 iops | 120000 iops | 178000 iops |
| 4k случайная запись Q1 | 9500 iops | 7620 iops | 7640 iops |
| линейное чтение | 4.3 GB/s | 1.8 GB/s | 2.85 GB/s |
| 4k случайное чтение Q128 | 287000 iops | 140000 iops | 189000 iops |
| 4k случайное чтение Q1 | 9600 iops | 7640 iops | 7780 iops |
Чтобы попробовать VDUSE, вам нужно ядро Linux как минимум версии 5.15, собранное с поддержкой
VDUSE (CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
В ядрах в Debian Linux поддержка пока отключена по умолчанию, так что чтобы попробовать VDUSE
на Debian, поставьте ядро из Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/),
из Proxmox или соберите модули для ядра Debian вручную:
```
mkdir build
cd build
apt-get install linux-headers-`uname -r`
apt-get build-dep linux-image-`uname -r`-unsigned
apt-get source linux-image-`uname -r`-unsigned
cd linux*/drivers/vdpa
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
cd ../virtio
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
depmod -a
```
Также вам понадобится консольная утилита `vdpa` из пакета `iproute2`.
Чтобы использовать VDUSE, вам нужно ядро Linux версии хотя бы 5.15, собранное с поддержкой
VDUSE (CONFIG_VIRTIO_VDPA=m и CONFIG_VDPA_USER=m). В ядрах в Debian Linux поддержка пока
отключена - если хотите попробовать эту функцию на Debian, поставьте ядро из Ubuntu
[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) или из Proxmox.
Команды для подключения виртуального диска через VDUSE:
@@ -182,7 +154,7 @@ qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitas
vdpa dev add name test1 mgmtdev vduse
```
После этого в системе появится устройство `/dev/vda`, которое можно будет использовать как
После этого в системе появится устройство /dev/vda, которое можно будет использовать как
обычный диск.
Для удаления устройства из системы:

2
json11

Submodule json11 updated: fd37016cf8...52a3af664f

View File

@@ -65,6 +65,7 @@ const etcd_tree = {
// client and osd
tcp_header_buffer_size: 65536,
use_sync_send_recv: false,
use_zerocopy_send: false,
use_rdma: true,
rdma_device: null, // for example, "rocep5s0f0"
rdma_port_num: 1,
@@ -110,15 +111,7 @@ const etcd_tree = {
autosync_interval: 5,
autosync_writes: 128,
client_queue_depth: 128, // unused
recovery_queue_depth: 1,
recovery_sleep_us: 0,
recovery_tune_min_util: 0.1,
recovery_tune_min_client_util: 0,
recovery_tune_max_util: 1.0,
recovery_tune_max_client_util: 0.5,
recovery_tune_interval: 1,
recovery_tune_ewma_rate: 0.5,
recovery_tune_sleep_min_us: 10, // 10 microseconds
recovery_queue_depth: 4,
recovery_pg_switch: 128,
recovery_sync_batch: 16,
no_recovery: false,
@@ -411,7 +404,6 @@ class Mon
this.ws_alive = false;
this.ws_keepalive_timer = null;
this.on_stop_cb = () => this.on_stop(0).catch(console.error);
this.recheck_pgs_active = false;
}
parse_etcd_addresses(addrs)
@@ -701,27 +693,8 @@ class Mon
});
}
// Schedule save_last_clean() to run after a small timeout (to not spam etcd).
// The delay is this.config.mon_change_timeout milliseconds, or 1000 ms (1s) when
// that option is unset or zero.
// If a timer is already pending, this call does nothing: the pending run is
// neither cancelled nor postponed.
schedule_save_last_clean()
{
// Only arm a new timer when none is pending
if (!this.save_last_clean_timer)
{
this.save_last_clean_timer = setTimeout(() =>
{
// Clear the handle first so that later calls can schedule again
this.save_last_clean_timer = null;
// save_last_clean() is async; a rejection is passed to this.die
this.save_last_clean().catch(this.die);
}, this.config.mon_change_timeout || 1000);
}
}
async save_last_clean()
{
if (this.save_last_clean_running)
{
this.schedule_save_last_clean();
return;
}
this.save_last_clean_running = true;
// last_clean_pgs is used to avoid extra data move when observing a series of changes in the cluster
const new_clean_pgs = { items: {} };
next_pool:
@@ -758,7 +731,6 @@ class Mon
value: b64(JSON.stringify(this.state.history.last_clean_pgs))
} } ],
}, this.etcd_start_timeout, 0);
this.save_last_clean_running = false;
}
get_mon_state()
@@ -1232,12 +1204,6 @@ class Mon
async recheck_pgs()
{
if (this.recheck_pgs_active)
{
this.schedule_recheck();
return;
}
this.recheck_pgs_active = true;
// Take configuration and state, check it against the stored configuration hash
// Recalculate PGs and save them to etcd if the configuration is changed
// FIXME: Do not change anything if the distribution is good and random enough and no PGs are degraded
@@ -1259,7 +1225,6 @@ class Mon
// Pool deleted. Delete all PGs, but first stop them.
if (!await this.stop_all_pgs(pool_id))
{
this.recheck_pgs_active = false;
this.schedule_recheck();
return;
}
@@ -1328,16 +1293,9 @@ class Mon
// PG count changed. Need to bring all PGs down.
if (!await this.stop_all_pgs(pool_id))
{
this.recheck_pgs_active = false;
this.schedule_recheck();
return;
}
}
if (prev_pgs.length != pool_cfg.pg_count)
{
// Scale PG count
// Do it even if old_pg_count is already equal to pool_cfg.pg_count,
// because last_clean_pgs may still contain the old number of PGs
const new_pg_history = [];
PGUtil.scale_pg_count(prev_pgs, real_prev_pgs, pg_history, new_pg_history, pool_cfg.pg_count);
pg_history = new_pg_history;
@@ -1439,7 +1397,6 @@ class Mon
await this.save_pg_config(new_config_pgs);
}
}
this.recheck_pgs_active = false;
}
async save_pg_config(new_config_pgs, etcd_request = { compare: [], success: [] })
@@ -1489,6 +1446,7 @@ class Mon
}
// Schedule a recheck to run after a small timeout (1s)
// If already scheduled, cancel previous timer and schedule it again
// This is required for multiple change events to trigger at most 1 recheck in 1s
schedule_recheck()
{
@@ -1506,7 +1464,7 @@ class Mon
{
const zero_stats = { op: { bps: 0n, iops: 0n, lat: 0n }, subop: { iops: 0n, lat: 0n }, recovery: { bps: 0n, iops: 0n } };
const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
if (!st || !st.time || !prev || !prev.time || prev.time >= st.time)
if (!st || !st.time || !prev || prev.time >= st.time)
{
return prev_diff || diff;
}

View File

@@ -1,6 +1,6 @@
{
"name": "vitastor-mon",
"version": "1.3.1",
"version": "1.1.0",
"description": "Vitastor SDS monitor service",
"main": "mon-main.js",
"scripts": {

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VERSION = '1.3.1'
VERSION = '1.1.0'
LOG = logging.getLogger(__name__)

View File

@@ -1,190 +0,0 @@
Index: pve-qemu-kvm-8.1.2/block/meson.build
===================================================================
--- pve-qemu-kvm-8.1.2.orig/block/meson.build
+++ pve-qemu-kvm-8.1.2/block/meson.build
@@ -123,6 +123,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
Index: pve-qemu-kvm-8.1.2/meson.build
===================================================================
--- pve-qemu-kvm-8.1.2.orig/meson.build
+++ pve-qemu-kvm-8.1.2/meson.build
@@ -1303,6 +1303,26 @@ if not get_option('rbd').auto() or have_
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'))
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -2123,6 +2143,7 @@ if numa.found()
endif
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
config_host_data.set('CONFIG_SDL', sdl.found())
@@ -4298,6 +4319,7 @@ summary_info += {'fdt support': fd
summary_info += {'libcap-ng support': libcap_ng}
summary_info += {'bpf support': libbpf}
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
Index: pve-qemu-kvm-8.1.2/meson_options.txt
===================================================================
--- pve-qemu-kvm-8.1.2.orig/meson_options.txt
+++ pve-qemu-kvm-8.1.2/meson_options.txt
@@ -186,6 +186,8 @@ option('lzo', type : 'feature', value :
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
Index: pve-qemu-kvm-8.1.2/qapi/block-core.json
===================================================================
--- pve-qemu-kvm-8.1.2.orig/qapi/block-core.json
+++ pve-qemu-kvm-8.1.2/qapi/block-core.json
@@ -3403,7 +3403,7 @@
'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
'pbs',
- 'ssh', 'throttle', 'vdi', 'vhdx',
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
@@ -4465,6 +4465,28 @@
'*server': ['InetSocketAddressBase'] } }
##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
# @ReplicationMode:
#
# An enumeration of replication modes.
@@ -4923,6 +4945,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'virtio-blk-vfio-pci':
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
'if': 'CONFIG_BLKIO' },
@@ -5360,6 +5383,17 @@
'*encrypt' : 'RbdEncryptionCreateOptions' } }
##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
# @BlockdevVmdkSubformat:
#
# Subformat options for VMDK images
@@ -5581,6 +5615,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
Index: pve-qemu-kvm-8.1.2/scripts/ci/org.centos/stream/8/x86_64/configure
===================================================================
--- pve-qemu-kvm-8.1.2.orig/scripts/ci/org.centos/stream/8/x86_64/configure
+++ pve-qemu-kvm-8.1.2/scripts/ci/org.centos/stream/8/x86_64/configure
@@ -30,7 +30,7 @@
--with-suffix="qemu-kvm" \
--firmwarepath=/usr/share/qemu-firmware \
--target-list="x86_64-softmmu" \
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
--audio-drv-list="" \
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
--with-coroutine=ucontext \
@@ -176,6 +176,7 @@
--enable-opengl \
--enable-pie \
--enable-rbd \
+--enable-vitastor \
--enable-rdma \
--enable-seccomp \
--enable-snappy \
Index: pve-qemu-kvm-8.1.2/scripts/meson-buildoptions.sh
===================================================================
--- pve-qemu-kvm-8.1.2.orig/scripts/meson-buildoptions.sh
+++ pve-qemu-kvm-8.1.2/scripts/meson-buildoptions.sh
@@ -153,6 +153,7 @@ meson_options_help() {
printf "%s\n" ' qed qed image format support'
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
printf "%s\n" ' rdma Enable RDMA-based migration'
printf "%s\n" ' replication replication support'
printf "%s\n" ' sdl SDL user interface'
@@ -416,6 +417,8 @@ _meson_option_parse() {
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
--enable-rbd) printf "%s" -Drbd=enabled ;;
--disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
--enable-rdma) printf "%s" -Drdma=enabled ;;
--disable-rdma) printf "%s" -Drdma=disabled ;;
--enable-replication) printf "%s" -Dreplication=enabled ;;

View File

@@ -1,190 +0,0 @@
diff --git a/block/meson.build b/block/meson.build
index 529fc172c6..d542dc0609 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -110,6 +110,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
diff --git a/meson.build b/meson.build
index a9c4f28247..8496cf13f1 100644
--- a/meson.build
+++ b/meson.build
@@ -1303,6 +1303,26 @@ if not get_option('rbd').auto() or have_block
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'))
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -2119,6 +2139,7 @@ if numa.found()
endif
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
config_host_data.set('CONFIG_SDL', sdl.found())
@@ -4286,6 +4307,7 @@ summary_info += {'fdt support': fdt_opt == 'disabled' ? false : fdt_opt}
summary_info += {'libcap-ng support': libcap_ng}
summary_info += {'bpf support': libbpf}
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
diff --git a/meson_options.txt b/meson_options.txt
index ae6d8f469d..e3d9f8404d 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -186,6 +186,8 @@ option('lzo', type : 'feature', value : 'auto',
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 2b1d493d6e..90673fdbdc 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3146,7 +3146,7 @@
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
- 'ssh', 'throttle', 'vdi', 'vhdx',
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
@@ -4196,6 +4196,28 @@
'*key-secret': 'str',
'*server': ['InetSocketAddressBase'] } }
+##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
##
# @ReplicationMode:
#
@@ -4654,6 +4676,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'virtio-blk-vfio-pci':
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
'if': 'CONFIG_BLKIO' },
@@ -5089,6 +5112,17 @@
'*cluster-size' : 'size',
'*encrypt' : 'RbdEncryptionCreateOptions' } }
+##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
##
# @BlockdevVmdkSubformat:
#
@@ -5311,6 +5345,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
index d02b09a4b9..f0b5fbfef3 100755
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
@@ -30,7 +30,7 @@
--with-suffix="qemu-kvm" \
--firmwarepath=/usr/share/qemu-firmware \
--target-list="x86_64-softmmu" \
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
--audio-drv-list="" \
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
--with-coroutine=ucontext \
@@ -176,6 +176,7 @@
--enable-opengl \
--enable-pie \
--enable-rbd \
+--enable-vitastor \
--enable-rdma \
--enable-seccomp \
--enable-snappy \
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index d7020af175..94958eb6fa 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -153,6 +153,7 @@ meson_options_help() {
printf "%s\n" ' qed qed image format support'
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
printf "%s\n" ' rdma Enable RDMA-based migration'
printf "%s\n" ' replication replication support'
printf "%s\n" ' sdl SDL user interface'
@@ -416,6 +417,8 @@ _meson_option_parse() {
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
--enable-rbd) printf "%s" -Drbd=enabled ;;
--disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
--enable-rdma) printf "%s" -Drdma=enabled ;;
--disable-rdma) printf "%s" -Drdma=disabled ;;
--enable-replication) printf "%s" -Dreplication=enabled ;;

View File

@@ -24,4 +24,4 @@ rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-1.3.1/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.3.1$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-1.1.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.1.0$(rpm --eval '%dist').tar.gz *

View File

@@ -15,7 +15,6 @@ RUN yumdownloader --disablerepo=centos-sclo-rh --source fio
RUN rpm --nomd5 -i fio*.src.rpm
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
RUN cd ~/rpmbuild/SPECS && yum-builddep -y fio.spec
RUN yum -y install cmake3
ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root
@@ -36,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-1.3.1.el7.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.1.0.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 1.3.1
Version: 1.1.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.3.1.el7.tar.gz
Source0: vitastor-1.1.0.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
@@ -16,7 +16,7 @@ BuildRequires: jerasure-devel
BuildRequires: libisa-l-devel
BuildRequires: gf-complete-devel
BuildRequires: libibverbs-devel
BuildRequires: cmake3
BuildRequires: cmake
Requires: vitastor-osd = %{version}-%{release}
Requires: vitastor-mon = %{version}-%{release}
Requires: vitastor-client = %{version}-%{release}
@@ -94,7 +94,7 @@ Vitastor fio drivers for benchmarking.
%build
. /opt/rh/devtoolset-9/enable
%cmake3 .
%cmake .
%make_build

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-1.3.1.el8.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.1.0.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 1.3.1
Version: 1.1.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.3.1.el8.tar.gz
Source0: vitastor-1.1.0.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -18,7 +18,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-1.3.1.el9.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.1.0.el9.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 1.3.1
Version: 1.1.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.3.1.el9.tar.gz
Source0: vitastor-1.1.0.el9.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -16,11 +16,10 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVERSION="1.3.1")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
add_link_options(-fno-omit-frame-pointer)
add_definitions(-DVERSION="1.1.0")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
if (${WITH_ASAN})
add_definitions(-fsanitize=address)
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
add_link_options(-fsanitize=address -fno-omit-frame-pointer)
endif (${WITH_ASAN})

View File

@@ -1372,8 +1372,7 @@ bool journal_flusher_co::trim_journal(int wait_base)
? (uint32_t)JE_START_V1_SIZE : (uint32_t)JE_START_V2_SIZE),
.reserved = 0,
.journal_start = new_trim_pos,
.version = (uint64_t)(!bs->dsk.data_csum_type && ((journal_entry_start*)flusher->journal_superblock)->version == JOURNAL_VERSION_V1
? JOURNAL_VERSION_V1 : JOURNAL_VERSION_V2),
.version = JOURNAL_VERSION_V2,
.data_csum_type = bs->dsk.data_csum_type,
.csum_block_size = bs->dsk.csum_block_size,
};

View File

@@ -274,7 +274,7 @@ class blockstore_impl_t
blockstore_dirty_db_t dirty_db;
std::vector<blockstore_op_t*> submit_queue;
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
int unsynced_big_write_count = 0, unstable_unsynced = 0;
int unsynced_big_write_count = 0;
int unsynced_queued_ops = 0;
allocator *data_alloc = NULL;
uint8_t *zero_object;

View File

@@ -553,7 +553,7 @@ resume_1:
}
if (je_start->size == JE_START_V0_SIZE ||
(je_start->version != JOURNAL_VERSION_V1 || je_start->size != JE_START_V1_SIZE) &&
(je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE && je_start->size != JE_START_V1_SIZE))
(je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE))
{
fprintf(
stderr, "The code only supports journal versions 2 and 1, but it is %lu on disk."
@@ -562,8 +562,7 @@ resume_1:
);
exit(1);
}
if (je_start->version == JOURNAL_VERSION_V1 ||
je_start->version == JOURNAL_VERSION_V2 && je_start->size == JE_START_V1_SIZE)
if (je_start->version == JOURNAL_VERSION_V1)
{
je_start->data_csum_type = 0;
je_start->csum_block_size = 0;
@@ -732,9 +731,8 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
resume:
while (pos < bs->journal.block_size)
{
auto buf_pos = proc_pos - done_pos + pos;
journal_entry *je = (journal_entry*)((uint8_t*)buf + buf_pos);
if (je->magic != JOURNAL_MAGIC || buf_pos+je->size > len || je_crc32(je) != je->crc32 ||
journal_entry *je = (journal_entry*)((uint8_t*)buf + proc_pos - done_pos + pos);
if (je->magic != JOURNAL_MAGIC || je_crc32(je) != je->crc32 ||
je->type < JE_MIN || je->type > JE_MAX || started && je->crc32_prev != crc32_last)
{
if (pos == 0)

View File

@@ -144,10 +144,7 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
journal.sector_info[journal.cur_sector].written = false;
journal.sector_info[journal.cur_sector].offset = journal.next_free;
journal.in_sector_pos = 0;
auto next_next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
// double check that next_free doesn't cross used_start from the left
assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
journal.next_free = next_next_free;
journal.next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
memset(journal.inmemory
? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
: (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector, 0, journal.block_size);

View File

@@ -13,6 +13,12 @@
#define JOURNAL_BUFFER_SIZE 4*1024*1024
#define JOURNAL_ENTRY_HEADER_SIZE 16
// We reserve some extra space for future stabilize requests during writes
// FIXME: This value should be dynamic i.e. Blockstore ideally shouldn't allow
// writing more than can be stabilized afterwards
#define JOURNAL_STABILIZE_RESERVATION 65536
#define JOURNAL_INSTANT_RESERVATION 131072
// Journal entries
// Journal entries are linked to each other by their crc32 value
// The journal is almost a blockchain, because object versions constantly increase

View File

@@ -86,15 +86,14 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
auto & dirty_entry = dirty_db.at(sbw);
uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
left == 0 ? JOURNAL_STABILIZE_RESERVATION : 0))
{
return 0;
}
}
}
else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
{
return 0;
}
@@ -185,11 +184,6 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
{
mark_stable(dirty_it->first);
}
else
{
unstable_unsynced--;
assert(unstable_unsynced >= 0);
}
dirty_it++;
while (dirty_it != dirty_db.end() && dirty_it->first.oid == it->oid)
{
@@ -220,11 +214,6 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
{
mark_stable(*it);
}
else
{
unstable_unsynced--;
assert(unstable_unsynced >= 0);
}
}
}
op->retval = 0;

View File

@@ -21,7 +21,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
dyn = calloc_or_die(1, dyn_size+sizeof(int));
*((int*)dyn) = 1;
}
uint8_t *dyn_ptr = (alloc_dyn_data ? (uint8_t*)dyn+sizeof(int) : (uint8_t*)&dyn);
uint8_t *dyn_ptr = (uint8_t*)(alloc_dyn_data ? dyn+sizeof(int) : &dyn);
uint64_t version = 1;
if (dirty_db.size() > 0)
{
@@ -320,7 +320,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, unsynced_big_write_count + 1,
sizeof(journal_entry_big_write) + dsk.clean_dyn_size,
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
(dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION))
{
return 0;
}
@@ -386,10 +386,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
sqe, dsk.data_fd, PRIV(op)->iov_zerofill, vcnt, dsk.data_offset + (loc << dsk.block_order) + op->offset - stripe_offset
);
PRIV(op)->pending_ops = 1;
if (!(dirty_it->second.state & BS_ST_INSTANT))
{
unstable_unsynced++;
}
if (immediate_commit != IMMEDIATE_ALL)
{
// Increase the counter, but don't save into unsynced_writes yet (can't sync until the write is finished)
@@ -412,7 +408,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
sizeof(journal_entry_big_write) + dsk.clean_dyn_size, 0)
|| !space_check.check_available(op, 1,
sizeof(journal_entry_small_write) + dyn_size,
op->len + (unstable_writes.size()+unstable_unsynced)*journal.block_size))
op->len + ((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
{
return 0;
}
@@ -462,8 +458,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
exit(1);
}
}
// double check that next_free doesn't cross used_start from the left
assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
journal.next_free = next_next_free;
je->oid = op->oid;
je->version = op->version;
@@ -501,15 +495,10 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
}
dirty_it->second.location = journal.next_free;
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED;
next_next_free = journal.next_free + op->len;
if (next_next_free >= journal.len)
next_next_free = dsk.journal_block_size;
// double check that next_free doesn't cross used_start from the left
assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
journal.next_free = next_next_free;
if (!(dirty_it->second.state & BS_ST_INSTANT))
journal.next_free += op->len;
if (journal.next_free >= journal.len)
{
unstable_unsynced++;
journal.next_free = dsk.journal_block_size;
}
if (!PRIV(op)->pending_ops)
{
@@ -549,7 +538,7 @@ resume_2:
uint64_t dyn_size = dsk.dirty_dyn_size(op->offset, op->len);
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
{
return 0;
}
@@ -593,20 +582,14 @@ resume_4:
#endif
bool is_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE;
bool imm = is_big ? (immediate_commit == IMMEDIATE_ALL) : (immediate_commit != IMMEDIATE_NONE);
bool is_instant = ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT));
if (imm)
{
auto & unstab = unstable_writes[op->oid];
unstab = unstab < op->version ? op->version : unstab;
if (!is_instant)
{
unstable_unsynced--;
assert(unstable_unsynced >= 0);
}
}
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK)
| (imm ? BS_ST_SYNCED : BS_ST_WRITTEN);
if (imm && is_instant)
if (imm && ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT)))
{
// Deletions and 'instant' operations are treated as immediately stable
mark_stable(dirty_it->first);
@@ -752,7 +735,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
});
assert(dirty_it != dirty_db.end());
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, 1, sizeof(journal_entry_del), (unstable_writes.size()+unstable_unsynced)*journal.block_size))
if (!space_check.check_available(op, 1, sizeof(journal_entry_del), JOURNAL_INSTANT_RESERVATION))
{
return 0;
}

View File

@@ -17,7 +17,7 @@
static const char *exe_name = NULL;
static const char* help_text =
"Vitastor command-line tool " VERSION "\n"
"Vitastor command-line tool\n"
"(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
"\n"
"COMMANDS:\n"
@@ -116,8 +116,7 @@ static const char* help_text =
"Use vitastor-cli --help <command> for command details or vitastor-cli --help --all for all details.\n"
"\n"
"GLOBAL OPTIONS:\n"
" --config_file FILE Path to Vitastor configuration file\n"
" --etcd_address URL Etcd connection address\n"
" --etcd_address <etcd_address>\n"
" --iodepth N Send N operations in parallel to each OSD when possible (default 32)\n"
" --parallel_osds M Work with M osds in parallel when possible (default 4)\n"
" --progress 1|0 Report progress (default 1)\n"
@@ -332,7 +331,7 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
{
// Create client
json11::Json cfg_j = cfg;
p->ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
p->ringloop = new ring_loop_t(512);
p->epmgr = new epoll_manager_t(p->ringloop);
p->cli = new cluster_client_t(p->ringloop, p->epmgr->tfd, cfg_j);
// Smaller timeout by default for more interactiveness

View File

@@ -109,7 +109,7 @@ resume_1:
}
for (auto pg_per_pair: pg_per_osd)
{
uint64_t pg_free = osd_free[pg_per_pair.first] * pool_cfg.real_pg_count / pg_per_pair.second;
uint64_t pg_free = osd_free[pg_per_pair.first] * pool_cfg.pg_count / pg_per_pair.second;
if (pool_avail > pg_free)
{
pool_avail = pg_free;
@@ -127,7 +127,6 @@ resume_1:
{ "id", (uint64_t)pool_cfg.id },
{ "name", pool_cfg.name },
{ "pg_count", pool_cfg.pg_count },
{ "real_pg_count", pool_cfg.real_pg_count },
{ "scheme", pool_cfg.scheme == POOL_SCHEME_REPLICATED ? "replicated" : "ec" },
{ "scheme_name", pool_cfg.scheme == POOL_SCHEME_REPLICATED
? std::to_string(pool_cfg.pg_size)+"/"+std::to_string(pool_cfg.pg_minsize)
@@ -178,7 +177,7 @@ resume_1:
{ "title", "SCHEME" },
});
cols.push_back(json11::Json::object{
{ "key", "pg_count_fmt" },
{ "key", "pg_count" },
{ "title", "PGS" },
});
cols.push_back(json11::Json::object{
@@ -207,9 +206,6 @@ resume_1:
double raw_to = kv.second["raw_to_usable"].number_value();
if (raw_to < 0.000001 && raw_to > -0.000001)
raw_to = 1;
kv.second["pg_count_fmt"] = kv.second["real_pg_count"] == kv.second["pg_count"]
? kv.second["real_pg_count"].as_string()
: kv.second["real_pg_count"].as_string()+"->"+kv.second["pg_count"].as_string();
kv.second["total_fmt"] = format_size(kv.second["total_raw"].uint64_value() / raw_to);
kv.second["used_fmt"] = format_size(kv.second["used_raw"].uint64_value() / raw_to);
kv.second["max_avail_fmt"] = format_size(kv.second["max_available"].uint64_value());

View File

@@ -158,7 +158,12 @@ resume_2:
for (auto & pool_pair: parent->cli->st_cli.pool_config)
{
auto & pool_cfg = pool_pair.second;
bool active = pool_cfg.real_pg_count > 0;
bool active = true;
if (pool_cfg.pg_config.size() != pool_cfg.pg_count)
{
active = false;
pgs_by_state["offline"] += pool_cfg.pg_count-pool_cfg.pg_config.size();
}
pool_count++;
for (auto pg_it = pool_cfg.pg_config.begin(); pg_it != pool_cfg.pg_config.end(); pg_it++)
{

View File

@@ -64,7 +64,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
cluster_client_t::~cluster_client_t()
{
msgr.repeer_pgs = [](osd_num_t){};
msgr.repeer_pgs = [this](osd_num_t){};
if (ringloop)
{
ringloop->unregister_consumer(&consumer);
@@ -454,7 +454,7 @@ bool cluster_client_t::flush()
wb->start_writebacks(this, 0);
cluster_op_t *sync = new cluster_op_t;
sync->opcode = OSD_OP_SYNC;
sync->callback = [](cluster_op_t *sync)
sync->callback = [this](cluster_op_t *sync)
{
delete sync;
};
@@ -465,7 +465,7 @@ bool cluster_client_t::flush()
bool sync_done = false;
cluster_op_t *sync = new cluster_op_t;
sync->opcode = OSD_OP_SYNC;
sync->callback = [&sync_done](cluster_op_t *sync)
sync->callback = [this, &sync_done](cluster_op_t *sync)
{
delete sync;
sync_done = true;

View File

@@ -263,7 +263,7 @@ void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from
}
assert(calc_len == op->len);
writebacks_active++;
op->callback = [this, flush_id](cluster_op_t* op)
op->callback = [this, cli, flush_id](cluster_op_t* op)
{
// Buffer flushes should be always retried, regardless of the error,
// so they should never result in an error here
@@ -383,7 +383,7 @@ static void copy_to_op(cluster_op_t *op, uint64_t offset, uint8_t *buf, uint64_t
auto begin = (cur_offset < offset ? offset : cur_offset);
auto end = (cur_offset+v.iov_len > offset+len ? offset+len : cur_offset+v.iov_len);
memcpy(
(uint8_t*)v.iov_base + begin - cur_offset,
v.iov_base + begin - cur_offset,
buf + (cur_offset <= offset ? 0 : cur_offset-offset),
end - begin
);

View File

@@ -5,7 +5,7 @@
#include "str_util.h"
static const char *help_text =
"Vitastor disk management tool " VERSION "\n"
"Vitastor disk management tool\n"
"(c) Vitaliy Filippov, 2022+ (VNPL-1.1)\n"
"\n"
"COMMANDS:\n"
@@ -127,10 +127,6 @@ static const char *help_text =
"vitastor-disk write-sb <device>\n"
" Read JSON from STDIN and write it into Vitastor OSD superblock on <device>.\n"
"\n"
"vitastor-disk update-sb <device> [--force] [--<parameter> <value>] [...]\n"
" Read Vitastor OSD superblock from <device>, update parameters in it and write it back.\n"
" --force allows to ignore validation errors.\n"
"\n"
"vitastor-disk udev <device>\n"
" Try to read Vitastor OSD superblock from <device> and print variables for udev.\n"
"\n"
@@ -233,7 +229,7 @@ int main(int argc, char *argv[])
{
self.options["allow_data_loss"] = "1";
}
else if (argv[i][0] == '-' && argv[i][1] == '-' && i < argc-1)
else if (argv[i][0] == '-' && argv[i][1] == '-')
{
char *key = argv[i]+2;
self.options[key] = argv[++i];
@@ -367,15 +363,6 @@ int main(int argc, char *argv[])
}
return self.write_sb(cmd[1]);
}
else if (!strcmp(cmd[0], "update-sb"))
{
if (cmd.size() != 2)
{
fprintf(stderr, "Exactly 1 device path argument is required\n");
return 1;
}
return self.update_sb(cmd[1]);
}
else if (!strcmp(cmd[0], "start") || !strcmp(cmd[0], "stop") ||
!strcmp(cmd[0], "restart") || !strcmp(cmd[0], "enable") || !strcmp(cmd[0], "disable"))
{

View File

@@ -109,7 +109,6 @@ struct disk_tool_t
int udev_import(std::string device);
int read_sb(std::string device);
int write_sb(std::string device);
int update_sb(std::string device);
int exec_osd(std::string device);
int systemd_start_stop_osds(const std::vector<std::string> & cmd, const std::vector<std::string> & devices);
int pre_exec_osd(std::string device);

View File

@@ -320,7 +320,7 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
if (journal_calc_data_pos != sw.data_offset)
{
printf(json ? ",\"bad_loc\":true,\"calc_loc\":\"0x%lx\""
: " (mismatched, calculated = %08lx)", journal_pos);
: " (mismatched, calculated = %lu)", journal_pos);
}
uint32_t data_csum_size = (!je_start.csum_block_size
? 0

View File

@@ -245,7 +245,7 @@ int disk_tool_t::resize_copy_data()
{
iodepth = 32;
}
ringloop = new ring_loop_t(iodepth < RINGLOOP_DEFAULT_SIZE ? RINGLOOP_DEFAULT_SIZE : iodepth);
ringloop = new ring_loop_t(iodepth < 512 ? 512 : iodepth);
dsk.data_fd = open(dsk.data_device.c_str(), O_DIRECT|O_RDWR);
if (dsk.data_fd < 0)
{

View File

@@ -86,24 +86,6 @@ int disk_tool_t::write_sb(std::string device)
return !write_osd_superblock(device, params);
}
int disk_tool_t::update_sb(std::string device)
{
json11::Json sb = read_osd_superblock(device, true, options.find("force") != options.end());
if (sb.is_null())
{
return 1;
}
auto sb_obj = sb["params"].object_items();
for (auto & kv: options)
{
if (kv.first != "force")
{
sb_obj[kv.first] = kv.second;
}
}
return !write_osd_superblock(device, sb_obj);
}
uint32_t disk_tool_t::write_osd_superblock(std::string device, json11::Json params)
{
std::string json_data = params.dump();

View File

@@ -135,8 +135,8 @@ void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int t
{
if (this->log_level > 0)
{
fprintf(
stderr, "Warning: etcd request failed: %s, retrying %d more times\n",
printf(
"Warning: etcd request failed: %s, retrying %d more times\n",
err.c_str(), retries
);
}
@@ -333,7 +333,7 @@ void etcd_state_client_t::start_etcd_watcher()
etcd_watch_ws = NULL;
}
if (this->log_level > 1)
fprintf(stderr, "Trying to connect to etcd websocket at %s\n", etcd_address.c_str());
printf("Trying to connect to etcd websocket at %s\n", etcd_address.c_str());
etcd_watch_ws = open_websocket(tfd, etcd_address, etcd_api_path+"/watch", etcd_slow_timeout,
[this, cur_addr = selected_etcd_address](const http_response_t *msg)
{

View File

@@ -130,7 +130,7 @@ static int bs_init(struct thread_data *td)
config[p.first] = p.second.dump();
}
}
bsd->ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
bsd->ringloop = new ring_loop_t(512);
bsd->epmgr = new epoll_manager_t(bsd->ringloop);
bsd->bs = new blockstore_t(config, bsd->ringloop, bsd->epmgr->tfd);
while (1)

View File

@@ -22,7 +22,7 @@ void osd_messenger_t::init()
{
rdma_context = msgr_rdma_context_t::create(
rdma_device != "" ? rdma_device.c_str() : NULL,
rdma_port_num, rdma_gid_index, rdma_mtu, rdma_odp, log_level
rdma_port_num, rdma_gid_index, rdma_mtu, log_level
);
if (!rdma_context)
{
@@ -42,6 +42,12 @@ void osd_messenger_t::init()
handle_rdma_events();
}
}
#endif
#ifndef SO_ZEROCOPY
if (log_level > 0)
{
fprintf(stderr, "Zero-copy TCP send is not supported in this build, ignoring\n");
}
#endif
keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
{
@@ -167,13 +173,14 @@ void osd_messenger_t::parse_config(const json11::Json & config)
this->rdma_max_msg = config["rdma_max_msg"].uint64_value();
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
this->rdma_max_msg = 129*1024;
this->rdma_odp = config["rdma_odp"].bool_value();
#endif
this->receive_buffer_size = (uint32_t)config["tcp_header_buffer_size"].uint64_value();
if (!this->receive_buffer_size || this->receive_buffer_size > 1024*1024*1024)
this->receive_buffer_size = 65536;
this->use_sync_send_recv = config["use_sync_send_recv"].bool_value() ||
config["use_sync_send_recv"].uint64_value();
this->use_zerocopy_send = config["use_zerocopy_send"].bool_value() ||
config["use_zerocopy_send"].uint64_value();
this->peer_connect_interval = config["peer_connect_interval"].uint64_value();
if (!this->peer_connect_interval)
this->peer_connect_interval = 5;
@@ -304,8 +311,7 @@ void osd_messenger_t::handle_connect_epoll(int peer_fd)
on_connect_peer(peer_osd, -result);
return;
}
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
set_socket_options(cl);
cl->peer_state = PEER_CONNECTED;
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
{
@@ -315,6 +321,23 @@ void osd_messenger_t::handle_connect_epoll(int peer_fd)
check_peer_config(cl);
}
// Apply per-connection socket options to a peer socket:
// TCP_NODELAY always, and SO_ZEROCOPY when zero-copy send is enabled in the
// configuration and the build headers provide it.
void osd_messenger_t::set_socket_options(osd_client_t *cl)
{
    int enable = 1;
    setsockopt(cl->peer_fd, SOL_TCP, TCP_NODELAY, &enable, sizeof(enable));
#ifdef SO_ZEROCOPY
    if (use_zerocopy_send)
    {
        if (setsockopt(cl->peer_fd, SOL_SOCKET, SO_ZEROCOPY, &enable, sizeof(enable)) == 0)
        {
            cl->zerocopy_send = true;
        }
        else if (log_level > 0)
        {
            // Not fatal: cl->zerocopy_send is simply left untouched
            fprintf(stderr, "[OSD %lu] Failed to enable zero-copy send for client %d: %s\n", this->osd_num, cl->peer_fd, strerror(errno));
        }
    }
    else
    {
        cl->zerocopy_send = false;
    }
#endif
}
void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
{
// Mark client as ready (i.e. some data is available)
@@ -491,14 +514,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
fprintf(stderr, "Connected to OSD %lu using RDMA\n", cl->osd_num);
}
cl->peer_state = PEER_RDMA;
tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
{
// Do not miss the disconnection!
if (epoll_events & EPOLLRDHUP)
{
handle_peer_epoll(peer_fd, epoll_events);
}
});
tfd->set_fd_handler(cl->peer_fd, false, NULL);
// Add the initial receive request
try_recv_rdma(cl);
}
@@ -523,14 +539,13 @@ void osd_messenger_t::accept_connections(int listen_fd)
fprintf(stderr, "[OSD %lu] new client %d: connection from %s\n", this->osd_num, peer_fd,
addr_to_string(addr).c_str());
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
clients[peer_fd] = new osd_client_t();
clients[peer_fd]->peer_addr = addr;
clients[peer_fd]->peer_port = ntohs(((sockaddr_in*)&addr)->sin_port);
clients[peer_fd]->peer_fd = peer_fd;
clients[peer_fd]->peer_state = PEER_CONNECTED;
clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
auto cl = clients[peer_fd] = new osd_client_t();
cl->peer_addr = addr;
cl->peer_port = ntohs(((sockaddr_in*)&addr)->sin_port);
cl->peer_fd = peer_fd;
cl->peer_state = PEER_CONNECTED;
cl->in_buf = malloc_or_die(receive_buffer_size);
set_socket_options(cl);
// Add FD to epoll
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
{

View File

@@ -45,6 +45,12 @@ struct msgr_rdma_connection_t;
struct msgr_rdma_context_t;
#endif
// Entry of osd_client_t::zerocopy_sent: an operation whose reply was sent with
// MSG_ZEROCOPY and which is kept allocated until the send completion
// notification is read from the socket error queue.
struct msgr_zc_not_t
{
// Operation to delete once the notification covering <nsend> arrives
osd_op_t *op;
// Value of the client's zerocopy_notification_idx at the moment the entry was queued
uint32_t nsend;
};
struct osd_client_t
{
int refs = 0;
@@ -57,6 +63,7 @@ struct osd_client_t
int ping_time_remaining = 0;
int idle_time_remaining = 0;
osd_num_t osd_num = 0;
bool zerocopy_send = false;
void *in_buf = NULL;
@@ -87,6 +94,12 @@ struct osd_client_t
int write_state = 0;
std::vector<iovec> send_list, next_send_list;
std::vector<msgr_sendp_t> outbox, next_outbox;
std::vector<msgr_zc_not_t> zerocopy_sent;
uint64_t outbox_size = 0, next_outbox_size = 0;
uint32_t zerocopy_notification_idx = 0;
uint32_t zerocopy_notification_prev = 0;
uint8_t zerocopy_notification_buf[256];
struct msghdr zerocopy_notification_msg;
~osd_client_t();
};
@@ -123,6 +136,7 @@ protected:
int osd_ping_timeout = 0;
int log_level = 0;
bool use_sync_send_recv = false;
bool use_zerocopy_send = false;
#ifdef WITH_RDMA
bool use_rdma = true;
@@ -131,7 +145,6 @@ protected:
msgr_rdma_context_t *rdma_context = NULL;
uint64_t rdma_max_sge = 0, rdma_max_send = 0, rdma_max_recv = 0;
uint64_t rdma_max_msg = 0;
bool rdma_odp = false;
#endif
std::vector<int> read_ready_clients;
@@ -186,9 +199,11 @@ protected:
void check_peer_config(osd_client_t *cl);
void cancel_osd_ops(osd_client_t *cl);
void cancel_op(osd_op_t *op);
void set_socket_options(osd_client_t *cl);
bool try_send(osd_client_t *cl);
void handle_send(int result, osd_client_t *cl);
void handle_zerocopy_notification(osd_client_t *cl, int res);
bool handle_read(int result, osd_client_t *cl);
bool handle_read_buffer(osd_client_t *cl, void *curbuf, int remain);
@@ -198,9 +213,7 @@ protected:
void handle_reply_ready(osd_op_t *op);
#ifdef WITH_RDMA
void try_send_rdma(osd_client_t *cl);
void try_send_rdma_odp(osd_client_t *cl);
void try_send_rdma_nodp(osd_client_t *cl);
bool try_send_rdma(osd_client_t *cl);
bool try_recv_rdma(osd_client_t *cl);
void handle_rdma_events();
#endif

View File

@@ -47,29 +47,11 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
if (qp)
ibv_destroy_qp(qp);
if (recv_buffers.size())
{
for (auto b: recv_buffers)
{
if (b.mr)
ibv_dereg_mr(b.mr);
free(b.buf);
}
recv_buffers.clear();
}
if (send_out.mr)
{
ibv_dereg_mr(send_out.mr);
send_out.mr = NULL;
}
if (send_out.buf)
{
free(send_out.buf);
send_out.buf = NULL;
}
send_out_size = 0;
free(b);
}
msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level)
msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, int log_level)
{
int res;
ibv_device **dev_list = NULL;
@@ -154,27 +136,21 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
fprintf(stderr, "Couldn't query RDMA device for its features\n");
goto cleanup;
}
ctx->odp = odp;
if (ctx->odp &&
(!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
if (!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT_IMPLICIT) ||
!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_SEND) ||
!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV)))
!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV))
{
ctx->odp = false;
if (log_level > 0)
fprintf(stderr, "The RDMA device isn't implicit ODP (On-Demand Paging) capable, disabling it\n");
fprintf(stderr, "The RDMA device isn't implicit ODP (On-Demand Paging) capable or does not support RC send and receive with ODP\n");
goto cleanup;
}
}
if (ctx->odp)
ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
if (!ctx->mr)
{
ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
if (!ctx->mr)
{
fprintf(stderr, "Couldn't register RDMA memory region\n");
goto cleanup;
}
fprintf(stderr, "Couldn't register RDMA memory region\n");
goto cleanup;
}
ctx->channel = ibv_create_comp_channel(ctx->context);
@@ -389,34 +365,12 @@ static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
cl->rdma_conn->cur_send++;
}
// Copy up to <dst_len> bytes of pending data from cl->send_list into <dst>,
// starting at the position tracked by the RDMA connection
// (send_pos = current iovec index, send_buf_pos = offset inside that iovec)
// and advancing that position past everything copied.
// Returns the number of bytes actually copied (0 when nothing is pending).
static int try_send_rdma_copy(osd_client_t *cl, uint8_t *dst, int dst_len)
{
    auto rc = cl->rdma_conn;
    int total_dst_len = dst_len;
    while (dst_len > 0 && (size_t)rc->send_pos < cl->send_list.size())
    {
        iovec & iov = cl->send_list[rc->send_pos];
        // Bytes left in the current iovec vs. free space left in the destination
        size_t iov_left = iov.iov_len - rc->send_buf_pos;
        uint32_t len = (uint32_t)(iov_left < (size_t)dst_len ? iov_left : (size_t)dst_len);
        // iov_base is void*, so cast before doing pointer arithmetic:
        // arithmetic on void* is a compiler extension, not standard C++
        memcpy(dst, (uint8_t*)iov.iov_base + rc->send_buf_pos, len);
        dst += len;
        dst_len -= len;
        rc->send_buf_pos += len;
        if ((size_t)rc->send_buf_pos >= iov.iov_len)
        {
            // Current iovec is exhausted, move on to the next one
            rc->send_pos++;
            rc->send_buf_pos = 0;
        }
    }
    return total_dst_len-dst_len;
}
void osd_messenger_t::try_send_rdma_odp(osd_client_t *cl)
bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
{
auto rc = cl->rdma_conn;
if (!cl->send_list.size() || rc->cur_send >= rc->max_send)
{
return;
return true;
}
uint64_t op_size = 0, op_sge = 0;
ibv_sge sge[rc->max_sge];
@@ -454,70 +408,15 @@ void osd_messenger_t::try_send_rdma_odp(osd_client_t *cl)
rc->send_sizes.push_back(op_size);
try_send_rdma_wr(cl, sge, op_sge);
}
return true;
}
// Send pending data of <cl> over RDMA through an intermediate send buffer:
// data from the send list is copied into a per-connection buffer
// (rc->send_out) and each copied chunk is posted as a single-SGE work request.
void osd_messenger_t::try_send_rdma_nodp(osd_client_t *cl)
{
auto rc = cl->rdma_conn;
if (!rc->send_out_size)
{
// Allocate send ring buffer, if not yet
// Its size is max_msg bytes per each of rdma_max_send requests
rc->send_out_size = rc->max_msg*rdma_max_send;
rc->send_out.buf = malloc_or_die(rc->send_out_size);
if (!rdma_context->odp)
{
// Without ODP the buffer gets its own memory region; failure terminates the process
rc->send_out.mr = ibv_reg_mr(rdma_context->pd, rc->send_out.buf, rc->send_out_size, 0);
if (!rc->send_out.mr)
{
fprintf(stderr, "Failed to register RDMA memory region: %s\n", strerror(errno));
exit(1);
}
}
}
// Copy data into the buffer and send it
uint8_t *dst = NULL;
int dst_len = 0;
// Starts at 1 only so that the loop condition passes on the first iteration
int copied = 1;
// Stop when the buffer is marked full, when nothing more could be copied,
// or when the limit of outstanding send requests is reached
while (!rc->send_out_full && copied > 0 && rc->cur_send < rc->max_send)
{
dst = (uint8_t*)rc->send_out.buf + rc->send_out_pos;
// NOTE(review): send_out_pos < send_out_size looks always true here (see the assert below),
// so the send_done_pos branch appears unreachable - confirm whether the comparison
// was meant to be against send_done_pos
dst_len = (rc->send_out_pos < rc->send_out_size ? rc->send_out_size-rc->send_out_pos : rc->send_done_pos-rc->send_out_pos);
// A single work request never carries more than max_msg bytes
if (dst_len > rc->max_msg)
dst_len = rc->max_msg;
copied = try_send_rdma_copy(cl, dst, dst_len);
if (copied > 0)
{
// Advance the write position, wrapping to the start at the end of the buffer
rc->send_out_pos += copied;
if (rc->send_out_pos == rc->send_out_size)
rc->send_out_pos = 0;
assert(rc->send_out_pos < rc->send_out_size);
// NOTE(review): this marks the buffer full whenever the write position is at or past
// send_done_pos, not only when it catches up with it - confirm this is intended
if (rc->send_out_pos >= rc->send_done_pos)
rc->send_out_full = true;
ibv_sge sge = {
.addr = (uintptr_t)dst,
.length = (uint32_t)copied,
.lkey = rdma_context->odp ? rdma_context->mr->lkey : rc->send_out.mr->lkey,
};
try_send_rdma_wr(cl, &sge, 1);
// Remember the size of the posted request
rc->send_sizes.push_back(copied);
}
}
}
// Dispatch an RDMA send to the implementation matching the context mode:
// the ODP variant when rdma_context->odp is set, the non-ODP variant otherwise.
void osd_messenger_t::try_send_rdma(osd_client_t *cl)
{
    if (!rdma_context->odp)
    {
        try_send_rdma_nodp(cl);
        return;
    }
    try_send_rdma_odp(cl);
}
static void try_recv_rdma_wr(osd_client_t *cl, msgr_rdma_buf_t b)
static void try_recv_rdma_wr(osd_client_t *cl, void *buf)
{
ibv_sge sge = {
.addr = (uintptr_t)b.buf,
.addr = (uintptr_t)buf,
.length = (uint32_t)cl->rdma_conn->max_msg,
.lkey = cl->rdma_conn->ctx->odp ? cl->rdma_conn->ctx->mr->lkey : b.mr->lkey,
.lkey = cl->rdma_conn->ctx->mr->lkey,
};
ibv_recv_wr *bad_wr = NULL;
ibv_recv_wr wr = {
@@ -539,19 +438,9 @@ bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
auto rc = cl->rdma_conn;
while (rc->cur_recv < rc->max_recv)
{
msgr_rdma_buf_t b;
b.buf = malloc_or_die(rc->max_msg);
if (!rdma_context->odp)
{
b.mr = ibv_reg_mr(rdma_context->pd, b.buf, rc->max_msg, IBV_ACCESS_LOCAL_WRITE);
if (!b.mr)
{
fprintf(stderr, "Failed to register RDMA memory region: %s\n", strerror(errno));
exit(1);
}
}
rc->recv_buffers.push_back(b);
try_recv_rdma_wr(cl, b);
void *buf = malloc_or_die(rc->max_msg);
rc->recv_buffers.push_back(buf);
try_recv_rdma_wr(cl, buf);
}
return true;
}
@@ -603,7 +492,7 @@ void osd_messenger_t::handle_rdma_events()
if (!is_send)
{
rc->cur_recv--;
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf], wc[i].byte_len))
{
// handle_read_buffer may stop the client
continue;
@@ -616,14 +505,6 @@ void osd_messenger_t::handle_rdma_events()
rc->cur_send--;
uint64_t sent_size = rc->send_sizes.at(0);
rc->send_sizes.erase(rc->send_sizes.begin(), rc->send_sizes.begin()+1);
if (!rdma_context->odp)
{
rc->send_done_pos += sent_size;
rc->send_out_full = false;
if (rc->send_done_pos == rc->send_out_size)
rc->send_done_pos = 0;
assert(rc->send_done_pos < rc->send_out_size);
}
int send_pos = 0, send_buf_pos = 0;
while (sent_size > 0)
{

View File

@@ -23,7 +23,6 @@ struct msgr_rdma_context_t
ibv_device *dev = NULL;
ibv_device_attr_ex attrx;
ibv_pd *pd = NULL;
bool odp = false;
ibv_mr *mr = NULL;
ibv_comp_channel *channel = NULL;
ibv_cq *cq = NULL;
@@ -36,16 +35,10 @@ struct msgr_rdma_context_t
int max_cqe = 0;
int used_max_cqe = 0;
static msgr_rdma_context_t *create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level);
static msgr_rdma_context_t *create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, int log_level);
~msgr_rdma_context_t();
};
// A buffer used for RDMA transfers together with its optional memory region
struct msgr_rdma_buf_t
{
// Buffer memory (allocated with malloc_or_die() and released with free() by its users)
void *buf = NULL;
// Memory region registered for <buf> with ibv_reg_mr(); stays NULL when
// no per-buffer registration was made (users check it before ibv_dereg_mr())
ibv_mr *mr = NULL;
};
struct msgr_rdma_connection_t
{
msgr_rdma_context_t *ctx = NULL;
@@ -57,11 +50,8 @@ struct msgr_rdma_connection_t
int send_pos = 0, send_buf_pos = 0;
int next_recv_buf = 0;
std::vector<msgr_rdma_buf_t> recv_buffers;
std::vector<void*> recv_buffers;
std::vector<uint64_t> send_sizes;
msgr_rdma_buf_t send_out;
int send_out_pos = 0, send_done_pos = 0, send_out_size = 0;
bool send_out_full = false;
~msgr_rdma_connection_t();
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);

View File

@@ -3,10 +3,15 @@
#define _XOPEN_SOURCE
#include <limits.h>
#include <sys/epoll.h>
#include "messenger.h"
#include <linux/errqueue.h>
#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY 0
#endif
void osd_messenger_t::outbox_push(osd_op_t *cur_op)
{
assert(cur_op->peer_fd);
@@ -37,6 +42,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
}
auto & to_send_list = cl->write_msg.msg_iovlen ? cl->next_send_list : cl->send_list;
auto & to_outbox = cl->write_msg.msg_iovlen ? cl->next_outbox : cl->outbox;
auto & to_size = cl->write_msg.msg_iovlen ? cl->next_outbox_size : cl->outbox_size;
if (cur_op->op_type == OSD_OP_IN)
{
measure_exec(cur_op);
@@ -47,6 +53,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
to_send_list.push_back((iovec){ .iov_base = cur_op->req.buf, .iov_len = OSD_PACKET_SIZE });
cl->sent_ops[cur_op->req.hdr.id] = cur_op;
}
to_size += OSD_PACKET_SIZE;
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = MSGR_SENDP_HDR });
// Bitmap
if (cur_op->op_type == OSD_OP_IN &&
@@ -58,6 +65,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
.iov_len = cur_op->reply.sec_rw.attr_len,
});
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
to_size += cur_op->reply.sec_rw.attr_len;
}
else if (cur_op->op_type == OSD_OP_OUT &&
(cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE || cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) &&
@@ -68,6 +76,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
.iov_len = cur_op->req.sec_rw.attr_len,
});
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
to_size += cur_op->req.sec_rw.attr_len;
}
// Operation data
if ((cur_op->op_type == OSD_OP_IN
@@ -90,15 +99,22 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
assert(cur_op->iov.buf[i].iov_base);
to_send_list.push_back(cur_op->iov.buf[i]);
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
to_size += cur_op->iov.buf[i].iov_len;
}
}
}
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
{
if (cur_op->op_type == OSD_OP_IN && cur_op->reply.hdr.retval > 0)
{
to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->reply.hdr.retval });
to_size += cur_op->reply.hdr.retval;
}
else if (cur_op->op_type == OSD_OP_OUT && cur_op->req.sec_read_bmp.len > 0)
{
to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->req.sec_read_bmp.len });
to_size += cur_op->req.sec_read_bmp.len;
}
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
}
if (cur_op->op_type == OSD_OP_IN)
@@ -184,17 +200,19 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
}
cl->write_msg.msg_iov = cl->send_list.data();
cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
cl->write_msg.msg_flags = (cl->zerocopy_send && (cl->outbox_size/cl->send_list.size()) >= 4096 ? MSG_ZEROCOPY : 0);
cl->refs++;
ring_data_t* data = ((ring_data_t*)sqe->user_data);
data->callback = [this, cl](ring_data_t *data) { handle_send(data->res, cl); };
my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, 0);
my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, cl->write_msg.msg_flags);
}
else
{
cl->write_msg.msg_iov = cl->send_list.data();
cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
cl->write_msg.msg_flags = (cl->zerocopy_send && (cl->outbox_size/cl->send_list.size()) >= 4096 ? MSG_ZEROCOPY : 0);
cl->refs++;
int result = sendmsg(peer_fd, &cl->write_msg, MSG_NOSIGNAL);
int result = sendmsg(peer_fd, &cl->write_msg, MSG_NOSIGNAL | cl->write_msg.msg_flags);
if (result < 0)
{
result = -errno;
@@ -204,6 +222,62 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
return true;
}
// Process the result of a recvmsg(MSG_ERRQUEUE) issued to collect zero-copy
// send completion notifications for client <cl>.
//
// <res> is the recvmsg() result (or -errno). The control data is expected in
// cl->zerocopy_notification_msg. Every notification reports a closed range of
// completed zero-copy sends [ee_info .. ee_data]; operations queued in
// cl->zerocopy_sent whose send number falls into that range are deleted,
// because their buffers are no longer referenced by the kernel.
void osd_messenger_t::handle_zerocopy_notification(osd_client_t *cl, int res)
{
    // Drop the reference taken when the notification read was scheduled
    cl->refs--;
    if (cl->peer_state == PEER_STOPPED)
    {
        if (cl->refs <= 0)
        {
            delete cl;
        }
        return;
    }
    if (res != 0)
    {
        // Error (for example, -EAGAIN when nothing is queued yet) or unexpected payload
        return;
    }
    if (cl->zerocopy_notification_msg.msg_flags & MSG_CTRUNC)
    {
        fprintf(stderr, "zero-copy send notification truncated on client socket %d\n", cl->peer_fd);
        return;
    }
    for (struct cmsghdr *cm = CMSG_FIRSTHDR(&cl->zerocopy_notification_msg); cm; cm = CMSG_NXTHDR(&cl->zerocopy_notification_msg, cm))
    {
        // The kernel reports notifications as IP_RECVERR on IPv4 sockets and as
        // IPV6_RECVERR on IPv6 sockets. Both must be accepted, otherwise operations
        // sent over IPv6 connections would never be released.
        bool is_recverr = (cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR);
#if defined(SOL_IPV6) && defined(IPV6_RECVERR)
        is_recverr = is_recverr || (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR);
#endif
        if (!is_recverr)
        {
            continue;
        }
        struct sock_extended_err *serr = (struct sock_extended_err*)CMSG_DATA(cm);
        if (serr->ee_errno != 0 || serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
        {
            continue;
        }
        // completed sends numbered serr->ee_info .. serr->ee_data
        auto & sent = cl->zerocopy_sent;
        size_t start = 0;
        while (start < sent.size() && sent[start].nsend < serr->ee_info)
            start++;
        size_t end = start;
        if (serr->ee_data < serr->ee_info)
        {
            // counter has wrapped around: first take the tail with numbers >= ee_info...
            while (end < sent.size() && sent[end].nsend >= sent[start].nsend)
                end++;
        }
        // ...then everything up to and including ee_data
        while (end < sent.size() && sent[end].nsend <= serr->ee_data)
            end++;
        if (end > start)
        {
            for (size_t i = start; i < end; i++)
            {
                delete sent[i].op;
            }
            sent.erase(sent.begin() + start, sent.begin() + end);
        }
    }
}
void osd_messenger_t::send_replies()
{
for (int i = 0; i < write_ready_clients.size(); i++)
@@ -231,16 +305,19 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
}
return;
}
if (result < 0 && result != -EAGAIN && result != -EINTR)
if (result < 0 && result != -EAGAIN && result != -EINTR && result != -ENOBUFS)
{
// this is a client socket, so don't panic. just disconnect it
fprintf(stderr, "Client %d socket write error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
stop_client(cl->peer_fd);
return;
}
bool used_zerocopy = false;
if (result >= 0)
{
used_zerocopy = (cl->write_msg.msg_flags & MSG_ZEROCOPY) ? true : false;
int done = 0;
int bytes_written = result;
while (result > 0 && done < cl->send_list.size())
{
iovec & iov = cl->send_list[done];
@@ -249,7 +326,19 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
if (cl->outbox[done].flags & MSGR_SENDP_FREE)
{
// Reply fully sent
delete cl->outbox[done].op;
if (!used_zerocopy)
{
delete cl->outbox[done].op;
}
else
{
// With zero-copy send the difference is that we must keep the buffer (i.e. the operation)
// allocated until we get send notification from MSG_ERRQUEUE
cl->zerocopy_sent.push_back((msgr_zc_not_t){
.op = cl->outbox[done].op,
.nsend = cl->zerocopy_notification_idx,
});
}
}
result -= iov.iov_len;
done++;
@@ -261,6 +350,11 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
break;
}
}
if (used_zerocopy)
{
cl->zerocopy_notification_idx++;
}
cl->outbox_size -= bytes_written;
if (done > 0)
{
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+done);
@@ -270,8 +364,10 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
{
cl->send_list.insert(cl->send_list.end(), cl->next_send_list.begin(), cl->next_send_list.end());
cl->outbox.insert(cl->outbox.end(), cl->next_outbox.begin(), cl->next_outbox.end());
cl->outbox_size += cl->next_outbox_size;
cl->next_send_list.clear();
cl->next_outbox.clear();
cl->next_outbox_size = 0;
}
cl->write_state = cl->outbox.size() > 0 ? CL_WRITE_READY : 0;
#ifdef WITH_RDMA
@@ -284,14 +380,7 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
fprintf(stderr, "Successfully connected with client %d using RDMA\n", cl->peer_fd);
}
cl->peer_state = PEER_RDMA;
tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
{
// Do not miss the disconnection!
if (epoll_events & EPOLLRDHUP)
{
handle_peer_epoll(peer_fd, epoll_events);
}
});
tfd->set_fd_handler(cl->peer_fd, false, NULL);
// Add the initial receive request
try_recv_rdma(cl);
}
@@ -301,4 +390,34 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
{
write_ready_clients.push_back(cl->peer_fd);
}
if (used_zerocopy && (cl->zerocopy_notification_idx-cl->zerocopy_notification_prev) >= 16 &&
cl->zerocopy_sent.size() > 0)
{
cl->zerocopy_notification_prev = cl->zerocopy_notification_idx;
cl->zerocopy_notification_msg = {
.msg_control = cl->zerocopy_notification_buf,
.msg_controllen = sizeof(cl->zerocopy_notification_buf),
};
cl->refs++;
io_uring_sqe* sqe = NULL;
if (ringloop && !use_sync_send_recv)
{
sqe = ringloop->get_sqe();
}
if (!sqe)
{
int res = recvmsg(cl->peer_fd, &cl->zerocopy_notification_msg, MSG_ERRQUEUE|MSG_DONTWAIT);
if (res < 0)
{
res = -errno;
}
handle_zerocopy_notification(cl, res);
}
else
{
ring_data_t* data = ((ring_data_t*)sqe->user_data);
data->callback = [this, cl](ring_data_t *data) { handle_zerocopy_notification(cl, data->res); };
my_uring_prep_recvmsg(sqe, cl->peer_fd, &cl->zerocopy_notification_msg, MSG_ERRQUEUE);
}
}
}

View File

@@ -30,7 +30,7 @@ protected:
std::string image_name;
uint64_t inode = 0;
uint64_t device_size = 0;
int nbd_timeout = 300;
int nbd_timeout = 30;
int nbd_max_devices = 64;
int nbd_max_part = 3;
inode_watch_t *watch = NULL;
@@ -135,16 +135,14 @@ public:
" %s unmap /dev/nbd0\n"
" %s ls [--json]\n"
"OPTIONS:\n"
" All usual Vitastor config options like --config_file <path_to_config> plus NBD-specific:\n"
" --nbd_timeout 300\n"
" All usual Vitastor config options like --etcd_address <etcd_address> plus NBD-specific:\n"
" --nbd_timeout 30\n"
" Timeout for I/O operations in seconds after exceeding which the kernel stops\n"
" the device. You can set it to 0 to disable the timeout, but beware that you\n"
" won't be able to stop the device at all if vitastor-nbd process dies.\n"
" --nbd_max_devices 64 --nbd_max_part 3\n"
" Options for the \"nbd\" kernel module when modprobing it (nbds_max and max_part).\n"
" note that maximum allowed (nbds_max)*(1+max_part) is 256.\n"
" Note that nbd_timeout, nbd_max_devices and nbd_max_part options may also be specified\n"
" in /etc/vitastor/vitastor.conf or in other configuration file specified with --config_file.\n"
" --logfile /path/to/log/file.txt\n"
" Wite log messages to the specified file instead of dropping them (in background mode)\n"
" or printing them to the standard output (in foreground mode).\n"
@@ -206,18 +204,17 @@ public:
exit(1);
}
}
auto file_config = osd_messenger_t::read_config(cfg);
if (file_config["nbd_max_devices"].is_number() || file_config["nbd_max_devices"].is_string())
if (cfg["nbd_max_devices"].is_number() || cfg["nbd_max_devices"].is_string())
{
nbd_max_devices = file_config["nbd_max_devices"].uint64_value();
nbd_max_devices = cfg["nbd_max_devices"].uint64_value();
}
if (file_config["nbd_max_part"].is_number() || file_config["nbd_max_part"].is_string())
if (cfg["nbd_max_part"].is_number() || cfg["nbd_max_part"].is_string())
{
nbd_max_part = file_config["nbd_max_part"].uint64_value();
nbd_max_part = cfg["nbd_max_part"].uint64_value();
}
if (file_config["nbd_timeout"].is_number() || file_config["nbd_timeout"].is_string())
if (cfg["nbd_timeout"].is_number() || cfg["nbd_timeout"].is_string())
{
nbd_timeout = file_config["nbd_timeout"].uint64_value();
nbd_timeout = cfg["nbd_timeout"].uint64_value();
}
if (cfg["client_writeback_allowed"].is_null())
{
@@ -228,7 +225,7 @@ public:
cfg = obj;
}
// Create client
ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
ringloop = new ring_loop_t(512);
epmgr = new epoll_manager_t(ringloop);
cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
if (!inode)
@@ -275,7 +272,7 @@ public:
int i = 0;
while (true)
{
int r = run_nbd(sockfd, i, device_size, NBD_FLAG_SEND_FLUSH, nbd_timeout, bg);
int r = run_nbd(sockfd, i, device_size, NBD_FLAG_SEND_FLUSH, 30, bg);
if (r == 0)
{
printf("/dev/nbd%d\n", i);

View File

@@ -56,7 +56,7 @@ json11::Json::object nfs_proxy_t::parse_args(int narg, const char *args[])
"(c) Vitaliy Filippov, 2021-2022 (VNPL-1.1)\n"
"\n"
"USAGE:\n"
" %s [STANDARD OPTIONS] [OTHER OPTIONS]\n"
" %s [--etcd_address ADDR] [OTHER OPTIONS]\n"
" --subdir <DIR> export images prefixed <DIR>/ (default empty - export all images)\n"
" --portmap 0 do not listen on port 111 (portmap/rpcbind, requires root)\n"
" --bind <IP> bind service to <IP> address (default 0.0.0.0)\n"
@@ -124,7 +124,7 @@ void nfs_proxy_t::run(json11::Json cfg)
cfg = obj;
}
// Create client
ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
ringloop = new ring_loop_t(512);
epmgr = new epoll_manager_t(ringloop);
cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
cmd = new cli_tool_t();

View File

@@ -68,21 +68,14 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
}
}
if (print_stats_timer_id == -1)
print_stats_timer_id = this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
{
print_stats_timer_id = this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
{
print_stats();
});
}
if (slow_log_timer_id == -1)
print_stats();
});
slow_log_timer_id = this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
{
slow_log_timer_id = this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
{
print_slow();
});
}
apply_recovery_tune_interval();
print_slow();
});
msgr.tfd = this->tfd;
msgr.ringloop = this->ringloop;
@@ -104,11 +97,6 @@ osd_t::~osd_t()
tfd->clear_timer(slow_log_timer_id);
slow_log_timer_id = -1;
}
if (rtune_timer_id >= 0)
{
tfd->clear_timer(rtune_timer_id);
rtune_timer_id = -1;
}
if (print_stats_timer_id >= 0)
{
tfd->clear_timer(print_stats_timer_id);
@@ -208,22 +196,6 @@ void osd_t::parse_config(bool init)
recovery_queue_depth = config["recovery_queue_depth"].uint64_value();
if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
recovery_sleep_us = config["recovery_sleep_us"].uint64_value();
recovery_tune_min_util = config["recovery_tune_min_util"].is_null()
? 0.1 : config["recovery_tune_min_util"].number_value();
recovery_tune_max_util = config["recovery_tune_max_util"].is_null()
? 1.0 : config["recovery_tune_max_util"].number_value();
recovery_tune_min_client_util = config["recovery_tune_min_client_util"].is_null()
? 0 : config["recovery_tune_min_client_util"].number_value();
recovery_tune_max_client_util = config["recovery_tune_max_client_util"].is_null()
? 0.5 : config["recovery_tune_max_client_util"].number_value();
auto old_recovery_tune_interval = recovery_tune_interval;
recovery_tune_interval = config["recovery_tune_interval"].is_null()
? 1 : config["recovery_tune_interval"].uint64_value();
recovery_tune_ewma_rate = config["recovery_tune_ewma_rate"].is_null()
? 0.5 : config["recovery_tune_ewma_rate"].number_value();
recovery_tune_sleep_min_us = config["recovery_tune_sleep_min_us"].is_null()
? 10 : config["recovery_tune_sleep_min_us"].uint64_value();
recovery_pg_switch = config["recovery_pg_switch"].uint64_value();
if (recovery_pg_switch < 1)
recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
@@ -302,10 +274,6 @@ void osd_t::parse_config(bool init)
print_slow();
});
}
if (old_recovery_tune_interval != recovery_tune_interval)
{
apply_recovery_tune_interval();
}
}
void osd_t::bind_socket()
@@ -453,6 +421,14 @@ void osd_t::exec_op(osd_op_t *cur_op)
}
}
// Drop all accumulated statistics: both generations of the recovery
// counters (index [0] holds the running totals, index [1] the snapshot
// taken at the last print_stats() report) as well as the messenger's
// operation statistics and their previous snapshot.
void osd_t::reset_stats()
{
    for (auto & generation: recovery_stat_bytes)
    {
        for (auto & value: generation)
        {
            value = 0;
        }
    }
    for (auto & generation: recovery_stat_count)
    {
        for (auto & value: generation)
        {
            value = 0;
        }
    }
    prev_stats = {};
    msgr.stats = {};
}
void osd_t::print_stats()
{
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
@@ -490,19 +466,19 @@ void osd_t::print_stats()
}
for (int i = 0; i < 2; i++)
{
if (recovery_stat[i].count > recovery_print_prev[i].count)
if (recovery_stat_count[0][i] != recovery_stat_count[1][i])
{
uint64_t bw = (recovery_stat[i].bytes - recovery_print_prev[i].bytes) / print_stats_interval;
uint64_t bw = (recovery_stat_bytes[0][i] - recovery_stat_bytes[1][i]) / print_stats_interval;
printf(
"[OSD %lu] %s recovery: %.1f op/s, B/W: %.2f %s, avg lat %ld us\n", osd_num, recovery_stat_names[i],
(recovery_stat[i].count - recovery_print_prev[i].count) * 1.0 / print_stats_interval,
"[OSD %lu] %s recovery: %.1f op/s, B/W: %.2f %s\n", osd_num, recovery_stat_names[i],
(recovery_stat_count[0][i] - recovery_stat_count[1][i]) * 1.0 / print_stats_interval,
(bw > 1024*1024*1024 ? bw/1024.0/1024/1024 : (bw > 1024*1024 ? bw/1024.0/1024 : bw/1024.0)),
(bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s")),
(recovery_stat[i].usec - recovery_print_prev[i].usec) / (recovery_stat[i].count - recovery_print_prev[i].count)
(bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s"))
);
recovery_stat_count[1][i] = recovery_stat_count[0][i];
recovery_stat_bytes[1][i] = recovery_stat_bytes[0][i];
}
}
memcpy(recovery_print_prev, recovery_stat, sizeof(recovery_stat));
if (corrupted_objects > 0)
{
printf("[OSD %lu] %lu object(s) corrupted\n", osd_num, corrupted_objects);
@@ -565,15 +541,11 @@ void osd_t::print_slow()
}
else if (op->req.hdr.opcode == OSD_OP_SEC_STABILIZE || op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)
{
for (uint64_t i = 0; i < op->req.sec_stab.len && i < sizeof(obj_ver_id)*12; i += sizeof(obj_ver_id))
for (uint64_t i = 0; i < op->req.sec_stab.len; i += sizeof(obj_ver_id))
{
obj_ver_id *ov = (obj_ver_id*)((uint8_t*)op->buf + i);
bufprintf(i == 0 ? " %lx:%lx v%lu" : ", %lx:%lx v%lu", ov->oid.inode, ov->oid.stripe, ov->version);
}
if (op->req.sec_stab.len > sizeof(obj_ver_id)*12)
{
bufprintf(", ... (%lu items)", op->req.sec_stab.len/sizeof(obj_ver_id));
}
}
else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
{

View File

@@ -34,7 +34,7 @@
#define DEFAULT_AUTOSYNC_INTERVAL 5
#define DEFAULT_AUTOSYNC_WRITES 128
#define MAX_RECOVERY_QUEUE 2048
#define DEFAULT_RECOVERY_QUEUE 1
#define DEFAULT_RECOVERY_QUEUE 4
#define DEFAULT_RECOVERY_PG_SWITCH 128
#define DEFAULT_RECOVERY_BATCH 16
@@ -87,11 +87,6 @@ struct osd_chain_read_t
struct osd_rmw_stripe_t;
// Cumulative counters for one kind of recovery (degraded or misplaced).
// Kept as a plain aggregate because the OSD code copies and clears
// arrays of it with memcpy()/memset().
struct recovery_stat_t
{
    // Number of completed recovery operations
    uint64_t count;
    // Total time spent in those operations, in microseconds
    uint64_t usec;
    // Total number of bytes written by those operations
    uint64_t bytes;
};
class osd_t
{
// config
@@ -116,15 +111,7 @@ class osd_t
int immediate_commit = IMMEDIATE_NONE;
int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // "emergency" sync every 5 seconds
int autosync_writes = DEFAULT_AUTOSYNC_WRITES;
uint64_t recovery_queue_depth = 1;
uint64_t recovery_sleep_us = 0;
double recovery_tune_min_util = 0.1;
double recovery_tune_min_client_util = 0;
double recovery_tune_max_util = 1.0;
double recovery_tune_max_client_util = 0.5;
int recovery_tune_interval = 1;
double recovery_tune_ewma_rate = 0.5;
int recovery_tune_sleep_min_us = 10;
int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
int recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
int inode_vanish_time = 60;
@@ -202,17 +189,8 @@ class osd_t
std::map<uint64_t, inode_stats_t> inode_stats;
std::map<uint64_t, timespec> vanishing_inodes;
const char* recovery_stat_names[2] = { "degraded", "misplaced" };
recovery_stat_t recovery_stat[2];
recovery_stat_t recovery_print_prev[2];
// recovery auto-tuning
int rtune_timer_id = -1;
uint64_t rtune_avg_lat = 0;
double rtune_client_util = 0, rtune_target_util = 1;
osd_op_stats_t rtune_prev_stats;
recovery_stat_t rtune_prev_recovery[2];
uint64_t recovery_target_queue_depth = 1;
uint64_t recovery_target_sleep_us = 0;
uint64_t recovery_stat_count[2][2] = {};
uint64_t recovery_stat_bytes[2][2] = {};
// cluster connection
void parse_config(bool init);
@@ -230,9 +208,8 @@ class osd_t
void create_osd_state();
void renew_lease(bool reload);
void print_stats();
void tune_recovery();
void apply_recovery_tune_interval();
void print_slow();
void reset_stats();
json11::Json get_statistics();
void report_statistics();
void report_pg_state(pg_t & pg);
@@ -261,7 +238,6 @@ class osd_t
bool submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data);
bool pick_next_recovery(osd_recovery_op_t &op);
void submit_recovery_op(osd_recovery_op_t *op);
void finish_recovery_op(osd_recovery_op_t *op);
bool continue_recovery();
pg_osd_set_state_t* change_osd_set(pg_osd_set_state_t *st, pg_t *pg);

View File

@@ -213,14 +213,12 @@ json11::Json osd_t::get_statistics()
st["subop_stats"] = subop_stats;
st["recovery_stats"] = json11::Json::object {
{ recovery_stat_names[0], json11::Json::object {
{ "count", recovery_stat[0].count },
{ "bytes", recovery_stat[0].bytes },
{ "usec", recovery_stat[0].usec },
{ "count", recovery_stat_count[0][0] },
{ "bytes", recovery_stat_bytes[0][0] },
} },
{ recovery_stat_names[1], json11::Json::object {
{ "count", recovery_stat[1].count },
{ "bytes", recovery_stat[1].bytes },
{ "usec", recovery_stat[1].usec },
{ "count", recovery_stat_count[0][1] },
{ "bytes", recovery_stat_bytes[0][1] },
} },
};
return st;

View File

@@ -325,113 +325,30 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
{
printf("Recovery operation done for %lx:%lx\n", op->oid.inode, op->oid.stripe);
}
if (recovery_target_sleep_us)
// CAREFUL! op = &recovery_ops[op->oid]. Don't access op->* after recovery_ops.erase()
op->osd_op = NULL;
recovery_ops.erase(op->oid);
delete osd_op;
if (immediate_commit != IMMEDIATE_ALL)
{
this->tfd->set_timer_us(recovery_target_sleep_us, false, [this, op](int timer_id)
recovery_done++;
if (recovery_done >= recovery_sync_batch)
{
finish_recovery_op(op);
});
}
else
{
finish_recovery_op(op);
// Force sync every <recovery_sync_batch> operations
// This is required not to pile up an excessive amount of delete operations
autosync();
recovery_done = 0;
}
}
continue_recovery();
};
exec_op(op->osd_op);
}
// (Re)arm the recovery auto-tuning timer according to recovery_tune_interval.
// Any previously armed timer is cancelled first. A zero interval turns
// auto-tuning off, in which case the configured queue depth and sleep time
// are used as the effective recovery limits directly.
void osd_t::apply_recovery_tune_interval()
{
    if (rtune_timer_id >= 0)
    {
        tfd->clear_timer(rtune_timer_id);
        rtune_timer_id = -1;
    }
    if (recovery_tune_interval == 0)
    {
        // No tuning: the targets simply mirror the static configuration
        recovery_target_queue_depth = recovery_queue_depth;
        recovery_target_sleep_us = recovery_sleep_us;
        return;
    }
    // Repeating timer; the interval is multiplied by 1000 in the same way as
    // for the other periodic OSD timers (presumably seconds -> milliseconds)
    rtune_timer_id = this->tfd->set_timer(recovery_tune_interval*1000, true, [this](int timer_id)
    {
        tune_recovery();
    });
}
// Finalize a completed recovery operation: free its OSD op, remove it from
// the set of active recovery operations, trigger a batched sync when needed
// and then try to start more recovery work.
void osd_t::finish_recovery_op(osd_recovery_op_t *op)
{
    // CAREFUL! op points into recovery_ops (op = &recovery_ops[op->oid]),
    // so op->* must not be accessed after the erase() below
    delete op->osd_op;
    op->osd_op = NULL;
    recovery_ops.erase(op->oid);
    // Force sync every <recovery_sync_batch> operations, unless every write
    // is already committed immediately. This is required not to pile up
    // an excessive amount of delete operations
    if (immediate_commit != IMMEDIATE_ALL && ++recovery_done >= recovery_sync_batch)
    {
        autosync();
        recovery_done = 0;
    }
    continue_recovery();
}
// Periodic recovery auto-tuning step.
//
// Measures how busy the OSD was with client operations and how long recovery
// operations took since the previous call, smooths both values with an
// exponentially weighted moving average (recovery_tune_ewma_rate), and derives
// from them the recovery queue depth and the pause inserted after each
// recovery operation (recovery_target_queue_depth / recovery_target_sleep_us).
// Does nothing except refreshing the "previous" snapshots when no recovery
// operation completed during the interval.
void osd_t::tune_recovery()
{
    // Opcodes whose accumulated execution time is counted as client load
    static const int total_client_ops[] = { OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE };
    uint64_t total_client_usec = 0;
    for (const int opcode: total_client_ops)
    {
        total_client_usec += (msgr.stats.op_stat_sum[opcode] - rtune_prev_stats.op_stat_sum[opcode]);
        rtune_prev_stats.op_stat_sum[opcode] = msgr.stats.op_stat_sum[opcode];
    }
    // Recovery time and operation count accumulated since the previous call,
    // summed over both recovery kinds
    uint64_t total_recovery_usec = 0, recovery_count = 0;
    total_recovery_usec += recovery_stat[0].usec-rtune_prev_recovery[0].usec;
    total_recovery_usec += recovery_stat[1].usec-rtune_prev_recovery[1].usec;
    recovery_count += recovery_stat[0].count-rtune_prev_recovery[0].count;
    recovery_count += recovery_stat[1].count-rtune_prev_recovery[1].count;
    memcpy(rtune_prev_recovery, recovery_stat, sizeof(recovery_stat));
    if (recovery_count == 0)
    {
        // Nothing was recovered during this interval, keep the current targets
        return;
    }
    rtune_avg_lat = total_recovery_usec/recovery_count*recovery_tune_ewma_rate +
        rtune_avg_lat*(1-recovery_tune_ewma_rate);
    // client_util = count/interval * usec/1000000.0/count = usec/1000000.0/interval :-)
    double client_util = total_client_usec/1000000.0/recovery_tune_interval;
    rtune_client_util = rtune_client_util*(1-recovery_tune_ewma_rate) + client_util*recovery_tune_ewma_rate;
    // Target recovery utilisation: max_util while the client load is below
    // min_client_util, min_util once it reaches max_client_util, and a linear
    // interpolation between the two in between
    rtune_target_util = (rtune_client_util < recovery_tune_min_client_util
        ? recovery_tune_max_util
        : recovery_tune_min_util + (rtune_client_util >= recovery_tune_max_client_util
            ? 0 : (recovery_tune_max_util-recovery_tune_min_util)*
                (recovery_tune_max_client_util-rtune_client_util)/(recovery_tune_max_client_util-recovery_tune_min_client_util)
        )
    );
    // Round the utilisation up to a whole queue depth, but never go below 1
    // and ignore fractional parts smaller than 0.1
    recovery_target_queue_depth = (int)rtune_target_util + (rtune_target_util < 1 || rtune_target_util-(int)rtune_target_util >= 0.1 ? 1 : 0);
    // ideal_iops = 1s / real_latency
    // ;; target_iops = target_util * ideal_iops
    // => target_lat = target_queue * 1s / target_iops
    // => target_lat = target_queue / target_util * real_latency
    // NOTE(review): rtune_target_util can be 0 here when recovery_tune_min_util
    // (or recovery_tune_max_util) is configured as 0, which makes this a division
    // by zero in floating point followed by an out-of-range conversion to
    // uint64_t - confirm whether the configuration is validated elsewhere
    uint64_t target_lat = recovery_target_queue_depth/rtune_target_util * rtune_avg_lat;
    // Only pause when the extra delay exceeds the configured minimum sleep
    recovery_target_sleep_us = target_lat > rtune_avg_lat+recovery_tune_sleep_min_us ? target_lat-rtune_avg_lat : 0;
    if (log_level > 3)
    {
        printf(
            "recovery tune: client util %.2f (ewma %.2f), target util %.2f -> queue %lu, lat %lu us, real %lu us, pause %lu us\n",
            client_util, rtune_client_util, rtune_target_util, recovery_target_queue_depth, target_lat, rtune_avg_lat, recovery_target_sleep_us
        );
    }
}
// Just trigger write requests for degraded objects. They'll be recovered during writing
bool osd_t::continue_recovery()
{
while (recovery_ops.size() < recovery_target_queue_depth)
while (recovery_ops.size() < recovery_queue_depth)
{
osd_recovery_op_t op;
if (pick_next_recovery(op))

View File

@@ -19,14 +19,6 @@ static void handle_sigint(int sig)
exit(0);
}
// Usage text of the vitastor-osd binary. Both the pointer and the text
// are constant so that it cannot be reassigned by accident.
static const char* const help_text =
    "Vitastor OSD (block object storage daemon) " VERSION "\n"
    "(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
    "\n"
    "OSDs are usually started by vitastor-disk.\n"
    "Manual usage: vitastor-osd [--option value] ...\n"
;
int main(int narg, char *args[])
{
setvbuf(stdout, NULL, _IONBF, 0);
@@ -45,20 +37,10 @@ int main(int narg, char *args[])
char *opt = args[i]+2;
config[std::string(opt)] = std::string(args[++i]);
}
else if (!strcmp(args[i], "--help"))
{
printf("%s", help_text);
return 0;
}
}
if (!config.size())
{
printf("%s", help_text);
return 1;
}
signal(SIGINT, handle_sigint);
signal(SIGTERM, handle_sigint);
ring_loop_t *ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
ring_loop_t *ringloop = new ring_loop_t(512);
osd = new osd_t(config, ringloop);
while (1)
{

View File

@@ -3,15 +3,13 @@
#include "osd_primary.h"
#define SELF_FD -1
void osd_t::autosync()
{
if (immediate_commit != IMMEDIATE_ALL && !autosync_op)
{
autosync_op = new osd_op_t();
autosync_op->op_type = OSD_OP_IN;
autosync_op->peer_fd = SELF_FD;
autosync_op->peer_fd = -1;
autosync_op->req = (osd_any_op_t){
.sync = {
.header = {
@@ -87,13 +85,9 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
cur_op->reply.hdr.id = cur_op->req.hdr.id;
cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
cur_op->reply.hdr.retval = retval;
if (cur_op->peer_fd == SELF_FD)
if (cur_op->peer_fd == -1)
{
// Do not include internal primary writes (recovery/rebalance) into client op statistics
if (cur_op->req.hdr.opcode != OSD_OP_WRITE)
{
msgr.measure_exec(cur_op);
}
msgr.measure_exec(cur_op);
// Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(cur_op->callback)(cur_op);
}

View File

@@ -292,27 +292,16 @@ resume_7:
{
{
int recovery_type = op_data->object_state->state & (OBJ_DEGRADED|OBJ_INCOMPLETE) ? 0 : 1;
recovery_stat[recovery_type].count++;
if (!recovery_stat[recovery_type].count) // wrapped
recovery_stat_count[0][recovery_type]++;
if (!recovery_stat_count[0][recovery_type])
{
memset(&recovery_print_prev[recovery_type], 0, sizeof(recovery_print_prev[recovery_type]));
memset(&rtune_prev_recovery[recovery_type], 0, sizeof(rtune_prev_recovery[recovery_type]));
memset(&recovery_stat[recovery_type], 0, sizeof(recovery_stat[recovery_type]));
recovery_stat[recovery_type].count++;
recovery_stat_count[0][recovery_type]++;
recovery_stat_bytes[0][recovery_type] = 0;
}
for (int role = 0; role < (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size); role++)
{
recovery_stat[recovery_type].bytes += op_data->stripes[role].write_end - op_data->stripes[role].write_start;
recovery_stat_bytes[0][recovery_type] += op_data->stripes[role].write_end - op_data->stripes[role].write_start;
}
if (!cur_op->tv_end.tv_sec)
{
clock_gettime(CLOCK_REALTIME, &cur_op->tv_end);
}
uint64_t usec = (
(cur_op->tv_end.tv_sec - cur_op->tv_begin.tv_sec)*1000000 +
(cur_op->tv_end.tv_nsec - cur_op->tv_begin.tv_nsec)/1000
);
recovery_stat[recovery_type].usec += usec;
}
// Any kind of a non-clean object can have extra chunks, because we don't record objects
// as degraded & misplaced or incomplete & misplaced at the same time. So try to remove extra chunks

View File

@@ -196,11 +196,10 @@ static void vitastor_parse_filename(const char *filename, QDict *options, Error
!strcmp(name, "rdma-gid-index") ||
!strcmp(name, "rdma-mtu"))
{
#if QEMU_VERSION_MAJOR < 8 || QEMU_VERSION_MAJOR == 8 && QEMU_VERSION_MINOR < 1
unsigned long long num_val;
#if QEMU_VERSION_MAJOR < 8 || QEMU_VERSION_MAJOR == 8 && QEMU_VERSION_MINOR < 1
if (parse_uint_full(value, &num_val, 0))
#else
uint64_t num_val;
if (parse_uint_full(value, 0, &num_val))
#endif
{

View File

@@ -17,7 +17,7 @@ ring_loop_t::ring_loop_t(int qd)
{
throw std::runtime_error(std::string("io_uring_queue_init: ") + strerror(-ret));
}
free_ring_data_ptr = *ring.sq.kring_entries;
free_ring_data_ptr = *ring.cq.kring_entries;
ring_datas = (struct ring_data_t*)calloc(free_ring_data_ptr, sizeof(ring_data_t));
free_ring_data = (int*)malloc(sizeof(int) * free_ring_data_ptr);
if (!ring_datas || !free_ring_data)

View File

@@ -15,8 +15,6 @@
#include <functional>
#include <vector>
#define RINGLOOP_DEFAULT_SIZE 1024
static inline void my_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, const void *addr, unsigned len, off_t offset)
{
// Prepare a read/write operation without clearing user_data
@@ -141,9 +139,11 @@ public:
if (free_ring_data_ptr == 0)
return NULL;
struct io_uring_sqe* sqe = io_uring_get_sqe(&ring);
assert(sqe);
*sqe = { 0 };
io_uring_sqe_set_data(sqe, ring_datas + free_ring_data[--free_ring_data_ptr]);
if (sqe)
{
*sqe = { 0 };
io_uring_sqe_set_data(sqe, ring_datas + free_ring_data[--free_ring_data_ptr]);
}
return sqe;
}
inline void set_immediate(const std::function<void()> cb)

View File

@@ -30,7 +30,7 @@ void stub_exec_op(osd_messenger_t *msgr, osd_op_t *op);
int main(int narg, char *args[])
{
ring_consumer_t looper;
ring_loop_t *ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
ring_loop_t *ringloop = new ring_loop_t(512);
epoll_manager_t *epmgr = new epoll_manager_t(ringloop);
osd_messenger_t *msgr = new osd_messenger_t();
msgr->osd_num = 1351;

View File

@@ -11,7 +11,7 @@ int main(int narg, char *args[])
config["meta_device"] = "./test_meta.bin";
config["journal_device"] = "./test_journal.bin";
config["data_device"] = "./test_data.bin";
ring_loop_t *ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
ring_loop_t *ringloop = new ring_loop_t(512);
epoll_manager_t *epmgr = new epoll_manager_t(ringloop);
blockstore_t *bs = new blockstore_t(config, ringloop, epmgr->tfd);

View File

@@ -68,7 +68,7 @@ int main(int narg, char *args[])
| cfg["inode_id"].uint64_value();
uint64_t base_ver = 0;
// Create client
auto ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
auto ringloop = new ring_loop_t(512);
auto epmgr = new epoll_manager_t(ringloop);
auto cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
cli->on_ready([&]()

View File

@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: Vitastor
Description: Vitastor client library
Version: 1.3.1
Version: 1.1.0
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}

View File

@@ -114,7 +114,7 @@ vitastor_c *vitastor_c_create_qemu_uring(QEMUSetFDHandler *aio_set_fd_handler, v
ring_loop_t *ringloop = NULL;
try
{
ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
ringloop = new ring_loop_t(512);
}
catch (std::exception & e)
{
@@ -136,7 +136,7 @@ vitastor_c *vitastor_c_create_uring(const char *config_path, const char *etcd_ho
ring_loop_t *ringloop = NULL;
try
{
ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
ringloop = new ring_loop_t(512);
}
catch (std::exception & e)
{
@@ -167,7 +167,7 @@ vitastor_c *vitastor_c_create_uring_json(const char **options, int options_len)
ring_loop_t *ringloop = NULL;
try
{
ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
ringloop = new ring_loop_t(512);
}
catch (std::exception & e)
{