Compare commits

..

14 Commits

Author SHA1 Message Date
fc83e3821c Fix write-over-delete failing for the very first entry in dirty_db 2023-10-21 17:08:32 +03:00
0d707fc83b Fix possible segfault in vitastor-cli ls -l 2023-10-21 17:08:32 +03:00
4f99f78430 Fix possible OSD crash during sync due to missing min_flushed_journal_sector reset 2023-09-17 00:36:46 +03:00
f926f8c2e0 Remove unused bs_sync fields 2023-09-17 00:36:46 +03:00
d73ad12c56 Fix fio_sec_osd attr_len 2023-09-17 00:36:46 +03:00
d68cec10e2 Remove erroneous block_size mismatch warnings on pools without matching PGs 2023-09-17 00:36:46 +03:00
9afa200a33 Flush STDOUT and STDERR before exiting from cli to fix Proxmox "Unexpected result" 2023-09-17 00:36:46 +03:00
aff6f3e970 Fix sscanf validation usage (field count instead of null_byte == 0) 2023-09-07 11:34:42 +03:00
49fca80f1c Add supported_truncate_flags 2023-09-07 11:34:42 +03:00
a028b4fa4c Make QEMU driver compatible with QEMU 8.1 2023-08-24 19:09:00 +03:00
da2bfd0b1e Fix co_truncate size division by BDRV_SECTOR_SIZE 2023-08-24 19:02:24 +03:00
cec5ceab77 Fix buffer insert in cluster_client 2023-08-24 19:02:24 +03:00
dc90faec7e Fix incorrect marking op parts as done with snapshots (could probably lead to client hangs) 2023-08-24 19:02:24 +03:00
20c62a4244 Fix monitor retrying failed etcd connection in an infinite loop without pauses 2023-08-24 19:02:24 +03:00
124 changed files with 930 additions and 6302 deletions

View File

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
project(vitastor)
set(VERSION "1.2.0")
set(VERSION "1.0.0")
add_subdirectory(src)

View File

@@ -50,7 +50,6 @@ Vitastor поддерживает QEMU-драйвер, протоколы NBD и
- Параметры
- [Общие](docs/config/common.ru.md)
- [Сетевые](docs/config/network.ru.md)
- [Клиентский код](docs/config/client.ru.md)
- [Глобальные дисковые параметры](docs/config/layout-cluster.ru.md)
- [Дисковые параметры OSD](docs/config/layout-osd.ru.md)
- [Прочие параметры OSD](docs/config/osd.ru.md)

View File

@@ -50,7 +50,6 @@ Read more details below in the documentation.
- Parameter Reference
- [Common](docs/config/common.en.md)
- [Network](docs/config/network.en.md)
- [Client](docs/config/client.en.md)
- [Global Disk Layout](docs/config/layout-cluster.en.md)
- [OSD Disk Layout](docs/config/layout-osd.en.md)
- [OSD Runtime Parameters](docs/config/osd.en.md)

View File

@@ -1,4 +1,4 @@
VERSION ?= v1.2.0
VERSION ?= v1.0.0
all: build push

View File

@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v1.2.0
image: vitalif/vitastor-csi:v1.0.0
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -35,13 +35,10 @@ rules:
verbs: ["get", "list", "watch"]
- apiGroups: ["snapshot.storage.k8s.io"]
resources: ["volumesnapshots"]
verbs: ["get", "list", "patch"]
- apiGroups: ["snapshot.storage.k8s.io"]
resources: ["volumesnapshots/status"]
verbs: ["get", "list", "patch"]
verbs: ["get", "list"]
- apiGroups: ["snapshot.storage.k8s.io"]
resources: ["volumesnapshotcontents"]
verbs: ["create", "get", "list", "watch", "update", "delete", "patch"]
verbs: ["create", "get", "list", "watch", "update", "delete"]
- apiGroups: ["snapshot.storage.k8s.io"]
resources: ["volumesnapshotclasses"]
verbs: ["get", "list", "watch"]
@@ -56,7 +53,7 @@ rules:
verbs: ["get", "list", "watch"]
- apiGroups: ["snapshot.storage.k8s.io"]
resources: ["volumesnapshotcontents/status"]
verbs: ["update", "patch"]
verbs: ["update"]
- apiGroups: [""]
resources: ["configmaps"]
verbs: ["get"]

View File

@@ -23,11 +23,6 @@ metadata:
name: csi-vitastor-provisioner
spec:
replicas: 3
strategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 1
maxSurge: 0
selector:
matchLabels:
app: csi-vitastor-provisioner
@@ -51,7 +46,7 @@ spec:
priorityClassName: system-cluster-critical
containers:
- name: csi-provisioner
image: k8s.gcr.io/sig-storage/csi-provisioner:v3.0.0
image: k8s.gcr.io/sig-storage/csi-provisioner:v2.2.0
args:
- "--csi-address=$(ADDRESS)"
- "--v=5"
@@ -121,7 +116,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v1.2.0
image: vitalif/vitastor-csi:v1.0.0
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -17,4 +17,3 @@ parameters:
# multiple etcdUrls may be specified, delimited by comma
#etcdUrl: "http://192.168.7.2:2379"
#etcdPrefix: "/vitastor"
allowVolumeExpansion: true

View File

@@ -1,7 +0,0 @@
apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshotClass
metadata:
name: vitastor-snapclass
driver: csi.vitastor.io
deletionPolicy: Delete
parameters:

View File

@@ -1,16 +0,0 @@
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: test-vitastor-clone
spec:
storageClassName: vitastor
dataSource:
name: snap1
kind: VolumeSnapshot
apiGroup: snapshot.storage.k8s.io
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 10Gi

View File

@@ -1,8 +0,0 @@
apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshot
metadata:
name: snap1
spec:
volumeSnapshotClassName: vitastor-snapclass
source:
persistentVolumeClaimName: test-vitastor-pvc

View File

@@ -9,7 +9,6 @@ require (
golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
google.golang.org/grpc v1.33.1
google.golang.org/protobuf v1.24.0
k8s.io/klog v1.0.0
k8s.io/utils v0.0.0-20210305010621-2afb4311ab10
)

View File

@@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "1.2.0"
vitastorCSIDriverVersion = "1.0.0"
)
// Config struct fills the parameters of request or user input

View File

@@ -20,7 +20,6 @@ import (
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"google.golang.org/protobuf/types/known/timestamppb"
"github.com/container-storage-interface/spec/lib/go/csi"
)
@@ -46,7 +45,6 @@ type InodeConfig struct
ParentPool uint64 `json:"parent_pool,omitempty"`
ParentId uint64 `json:"parent_id,omitempty"`
Readonly bool `json:"readonly,omitempty"`
CreateTs uint64 `json:"create_ts,omitempty"`
}
type ControllerServer struct
@@ -180,43 +178,27 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
}
args := []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) }
// Support creation from snapshot
var src *csi.VolumeContentSource
if (req.VolumeContentSource.GetSnapshot() != nil)
{
snapId := req.VolumeContentSource.GetSnapshot().GetSnapshotId()
if (snapId != "")
{
snapVars := make(map[string]string)
err := json.Unmarshal([]byte(snapId), &snapVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
args = append(args, "--parent", snapVars["name"]+"@"+snapVars["snapshot"])
src = &csi.VolumeContentSource{
Type: &csi.VolumeContentSource_Snapshot{
Snapshot: &csi.VolumeContentSource_SnapshotSource{
SnapshotId: snapId,
},
},
}
}
}
// Create image using vitastor-cli
_, err := invokeCLI(ctxVars, args)
_, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) })
if (err != nil)
{
if (strings.Index(err.Error(), "already exists") > 0)
{
inodeCfg, err := invokeList(ctxVars, volName, true)
stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", volName })
if (err != nil)
{
return nil, err
}
var inodeCfg []InodeConfig
err = json.Unmarshal(stat, &inodeCfg)
if (err != nil)
{
return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
}
if (len(inodeCfg) == 0)
{
return nil, status.Error(codes.Internal, "vitastor-cli create said that image already exists, but ls can't find it")
}
if (inodeCfg[0].Size < uint64(volSize))
{
return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
@@ -235,7 +217,6 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
// Ugly, but VolumeContext isn't passed to DeleteVolume :-(
VolumeId: string(volumeIdJson),
CapacityBytes: volSize,
ContentSource: src,
},
}, nil
}
@@ -249,15 +230,15 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
}
volVars := make(map[string]string)
err := json.Unmarshal([]byte(req.VolumeId), &volVars)
ctxVars := make(map[string]string)
err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
volName := volVars["name"]
volName := ctxVars["name"]
ctxVars, _, _ := GetConnectionParams(volVars)
ctxVars, _, _ = GetConnectionParams(ctxVars)
_, err = invokeCLI(ctxVars, []string{ "rm", volName })
if (err != nil)
@@ -363,8 +344,6 @@ func (cs *ControllerServer) ControllerGetCapabilities(ctx context.Context, req *
csi.ControllerServiceCapability_RPC_LIST_VOLUMES,
csi.ControllerServiceCapability_RPC_EXPAND_VOLUME,
csi.ControllerServiceCapability_RPC_CREATE_DELETE_SNAPSHOT,
csi.ControllerServiceCapability_RPC_LIST_SNAPSHOTS,
// TODO: csi.ControllerServiceCapability_RPC_CLONE_VOLUME,
} {
controllerServerCapabilities = append(controllerServerCapabilities, functionControllerServerCapabilities(capability))
}
@@ -374,214 +353,28 @@ func (cs *ControllerServer) ControllerGetCapabilities(ctx context.Context, req *
}, nil
}
// invokeList passes the arguments "ls", "--json" and pattern to invokeCLI
// (defined elsewhere in this package) and decodes the returned bytes as a
// JSON array of InodeConfig.
//
// ctxVars is forwarded to invokeCLI unchanged. pattern is the image name or
// pattern to list. When expectExist is true, an empty listing is reported as
// an error instead of being returned as an empty slice.
//
// Returns the decoded entries, or: the invokeCLI error as-is, a
// codes.Internal status if the output is not valid JSON, or a codes.Internal
// status if expectExist is set and nothing was listed.
func invokeList(ctxVars map[string]string, pattern string, expectExist bool) ([]InodeConfig, error)
{
stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", pattern })
if (err != nil)
{
// Propagate the CLI failure without wrapping it
return nil, err
}
var inodeCfg []InodeConfig
err = json.Unmarshal(stat, &inodeCfg)
if (err != nil)
{
return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
}
// An empty result is only an error when the caller requires the image to exist
if (expectExist && len(inodeCfg) == 0)
{
return nil, status.Error(codes.Internal, "Can't find expected image "+pattern+" via vitastor-cli ls")
}
return inodeCfg, nil
}
// CreateSnapshot create snapshot of an existing PV
func (cs *ControllerServer) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequest) (*csi.CreateSnapshotResponse, error)
{
klog.Infof("received controller create snapshot request %+v", protosanitizer.StripSecrets(req))
if (req == nil)
{
return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
}
if (req.SourceVolumeId == "" || req.Name == "")
{
return nil, status.Error(codes.InvalidArgument, "source volume ID and snapshot name are required fields")
}
// snapshot name
snapName := req.Name
// req.VolumeId is an ugly json string in our case :)
ctxVars := make(map[string]string)
err := json.Unmarshal([]byte(req.SourceVolumeId), &ctxVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
volName := ctxVars["name"]
// Create image using vitastor-cli
_, err = invokeCLI(ctxVars, []string{ "create", "--snapshot", snapName, volName })
if (err != nil && strings.Index(err.Error(), "already exists") <= 0)
{
return nil, err
}
// Check created snapshot
inodeCfg, err := invokeList(ctxVars, volName+"@"+snapName, true)
if (err != nil)
{
return nil, err
}
// Use ugly JSON snapshot ID again, DeleteSnapshot doesn't have context :-(
ctxVars["snapshot"] = snapName
snapIdJson, _ := json.Marshal(ctxVars)
return &csi.CreateSnapshotResponse{
Snapshot: &csi.Snapshot{
SizeBytes: int64(inodeCfg[0].Size),
SnapshotId: string(snapIdJson),
SourceVolumeId: req.SourceVolumeId,
CreationTime: &timestamppb.Timestamp{ Seconds: int64(inodeCfg[0].CreateTs) },
ReadyToUse: true,
},
}, nil
return nil, status.Error(codes.Unimplemented, "")
}
// DeleteSnapshot delete provided snapshot of a PV
func (cs *ControllerServer) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequest) (*csi.DeleteSnapshotResponse, error)
{
klog.Infof("received controller delete snapshot request %+v", protosanitizer.StripSecrets(req))
if (req == nil)
{
return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
}
if (req.SnapshotId == "")
{
return nil, status.Error(codes.InvalidArgument, "snapshot ID is a required field")
}
volVars := make(map[string]string)
err := json.Unmarshal([]byte(req.SnapshotId), &volVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "snapshot ID not in JSON format")
}
volName := volVars["name"]
snapName := volVars["snapshot"]
ctxVars, _, _ := GetConnectionParams(volVars)
_, err = invokeCLI(ctxVars, []string{ "rm", volName+"@"+snapName })
if (err != nil)
{
return nil, err
}
return &csi.DeleteSnapshotResponse{}, nil
return nil, status.Error(codes.Unimplemented, "")
}
// ListSnapshots list the snapshots of a PV
func (cs *ControllerServer) ListSnapshots(ctx context.Context, req *csi.ListSnapshotsRequest) (*csi.ListSnapshotsResponse, error)
{
klog.Infof("received controller list snapshots request %+v", protosanitizer.StripSecrets(req))
if (req == nil)
{
return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
}
volVars := make(map[string]string)
err := json.Unmarshal([]byte(req.SourceVolumeId), &volVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
volName := volVars["name"]
ctxVars, _, _ := GetConnectionParams(volVars)
inodeCfg, err := invokeList(ctxVars, volName+"@*", false)
if (err != nil)
{
return nil, err
}
resp := &csi.ListSnapshotsResponse{}
for _, ino := range inodeCfg
{
snapName := ino.Name[len(volName)+1:]
if (len(req.StartingToken) > 0 && snapName < req.StartingToken)
{
}
else if (req.MaxEntries == 0 || len(resp.Entries) < int(req.MaxEntries))
{
volVars["snapshot"] = snapName
snapIdJson, _ := json.Marshal(volVars)
resp.Entries = append(resp.Entries, &csi.ListSnapshotsResponse_Entry{
Snapshot: &csi.Snapshot{
SizeBytes: int64(ino.Size),
SnapshotId: string(snapIdJson),
SourceVolumeId: req.SourceVolumeId,
CreationTime: &timestamppb.Timestamp{ Seconds: int64(ino.CreateTs) },
ReadyToUse: true,
},
})
}
else
{
resp.NextToken = snapName
break
}
}
return resp, nil
return nil, status.Error(codes.Unimplemented, "")
}
// ControllerExpandVolume increases the size of a volume
// ControllerExpandVolume resizes a volume
func (cs *ControllerServer) ControllerExpandVolume(ctx context.Context, req *csi.ControllerExpandVolumeRequest) (*csi.ControllerExpandVolumeResponse, error)
{
klog.Infof("received controller expand volume request %+v", protosanitizer.StripSecrets(req))
if (req == nil)
{
return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
}
if (req.VolumeId == "" || req.CapacityRange == nil || req.CapacityRange.RequiredBytes == 0)
{
return nil, status.Error(codes.InvalidArgument, "VolumeId, CapacityRange and RequiredBytes are required fields")
}
volVars := make(map[string]string)
err := json.Unmarshal([]byte(req.VolumeId), &volVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
volName := volVars["name"]
ctxVars, _, _ := GetConnectionParams(volVars)
inodeCfg, err := invokeList(ctxVars, volName, true)
if (err != nil)
{
return nil, err
}
if (req.CapacityRange.RequiredBytes > 0 && inodeCfg[0].Size < uint64(req.CapacityRange.RequiredBytes))
{
sz := ((req.CapacityRange.RequiredBytes+4095)/4096)*4096
_, err := invokeCLI(ctxVars, []string{ "modify", "--inc_size", "1", "--resize", fmt.Sprintf("%d", sz), volName })
if (err != nil)
{
return nil, err
}
inodeCfg, err = invokeList(ctxVars, volName, true)
if (err != nil)
{
return nil, err
}
}
return &csi.ControllerExpandVolumeResponse{
CapacityBytes: int64(inodeCfg[0].Size),
NodeExpansionRequired: false,
}, nil
return nil, status.Error(codes.Unimplemented, "")
}
// ControllerGetVolume get volume info

View File

@@ -49,13 +49,6 @@ func (is *IdentityServer) GetPluginCapabilities(ctx context.Context, req *csi.Ge
},
},
},
{
Type: &csi.PluginCapability_VolumeExpansion_{
VolumeExpansion: &csi.PluginCapability_VolumeExpansion{
Type: csi.PluginCapability_VolumeExpansion_OFFLINE,
},
},
},
},
}, nil
}

View File

@@ -70,10 +70,10 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
isBlock := req.GetVolumeCapability().GetBlock() != nil
// Check that it's not already mounted
_, err := mount.IsNotMountPoint(ns.mounter, targetPath)
if (err != nil)
_, error := mount.IsNotMountPoint(ns.mounter, targetPath)
if (error != nil)
{
if (os.IsNotExist(err))
if (os.IsNotExist(error))
{
if (isBlock)
{
@@ -102,12 +102,12 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
}
else
{
return nil, status.Error(codes.Internal, err.Error())
return nil, status.Error(codes.Internal, error.Error())
}
}
ctxVars := make(map[string]string)
err = json.Unmarshal([]byte(req.VolumeId), &ctxVars)
err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
@@ -147,74 +147,70 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
}
devicePath := strings.TrimSpace(stdoutStr)
// Check existing format
diskMounter := &mount.SafeFormatAndMount{Interface: ns.mounter, Exec: utilexec.New()}
existingFormat, err := diskMounter.GetDiskFormat(devicePath)
if (err != nil)
{
klog.Errorf("failed to get disk format for path %s, error: %v", err)
// unmap NBD device
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
if (unmapErr != nil)
{
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
}
return nil, err
}
// Format the device (ext4 or xfs)
fsType := req.GetVolumeCapability().GetMount().GetFsType()
opt := req.GetVolumeCapability().GetMount().GetMountFlags()
opt = append(opt, "_netdev")
if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY) &&
!Contains(opt, "ro"))
{
opt = append(opt, "ro")
}
if (fsType == "xfs")
{
opt = append(opt, "nouuid")
}
readOnly := Contains(opt, "ro")
if (existingFormat == "" && !readOnly)
{
args := []string{}
switch fsType
{
case "ext4":
args = []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
case "xfs":
args = []string{"-K", devicePath}
}
if (len(args) > 0)
{
cmdOut, cmdErr := diskMounter.Exec.Command("mkfs."+fsType, args...).CombinedOutput()
if (cmdErr != nil)
{
klog.Errorf("failed to run mkfs error: %v, output: %v", cmdErr, string(cmdOut))
// unmap NBD device
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
if (unmapErr != nil)
{
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
}
return nil, status.Error(codes.Internal, cmdErr.Error())
}
}
}
if (isBlock)
{
err = diskMounter.Mount(devicePath, targetPath, "", []string{"bind"})
opt = append(opt, "bind")
err = diskMounter.Mount(devicePath, targetPath, fsType, opt)
}
else
{
// Check existing format
existingFormat, err := diskMounter.GetDiskFormat(devicePath)
if (err != nil)
{
klog.Errorf("failed to get disk format for path %s, error: %v", err)
goto unmap
}
// Format the device (ext4 or xfs)
fsType := req.GetVolumeCapability().GetMount().GetFsType()
opt := req.GetVolumeCapability().GetMount().GetMountFlags()
opt = append(opt, "_netdev")
if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY) &&
!Contains(opt, "ro"))
{
opt = append(opt, "ro")
}
if (fsType == "xfs")
{
opt = append(opt, "nouuid")
}
readOnly := Contains(opt, "ro")
if (existingFormat == "" && !readOnly)
{
var cmdOut []byte
switch fsType
{
case "ext4":
args := []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
cmdOut, err = diskMounter.Exec.Command("mkfs.ext4", args...).CombinedOutput()
case "xfs":
cmdOut, err = diskMounter.Exec.Command("mkfs.xfs", "-K", devicePath).CombinedOutput()
}
if (err != nil)
{
klog.Errorf("failed to run mkfs error: %v, output: %v", err, string(cmdOut))
goto unmap
}
}
err = diskMounter.FormatAndMount(devicePath, targetPath, fsType, opt)
// Try to run online resize on mount.
// FIXME: Implement online resize. It requires online resize support in vitastor-nbd.
if (err == nil && existingFormat != "" && !readOnly)
{
var cmdOut []byte
switch (fsType)
{
case "ext4":
cmdOut, err = diskMounter.Exec.Command("resize2fs", devicePath).CombinedOutput()
case "xfs":
cmdOut, err = diskMounter.Exec.Command("xfs_growfs", devicePath).CombinedOutput()
}
if (err != nil)
{
klog.Errorf("failed to run resizefs error: %v, output: %v", err, string(cmdOut))
goto unmap
}
}
}
if (err != nil)
{
@@ -222,18 +218,15 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
"failed to mount device path (%s) to path (%s) for volume (%s) error: %s",
devicePath, targetPath, volName, err,
)
goto unmap
// unmap NBD device
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
if (unmapErr != nil)
{
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
}
return nil, status.Error(codes.Internal, err.Error())
}
return &csi.NodePublishVolumeResponse{}, nil
unmap:
// unmap NBD device
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
if (unmapErr != nil)
{
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
}
return nil, status.Error(codes.Internal, err.Error())
}
// NodeUnpublishVolume unmounts the volume from the target path

4
debian/changelog vendored
View File

@@ -1,10 +1,10 @@
vitastor (1.2.0-1) unstable; urgency=medium
vitastor (1.0.0-1) unstable; urgency=medium
* Bugfixes
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
vitastor (1.2.0-1) unstable; urgency=medium
vitastor (1.0.0-1) unstable; urgency=medium
* Implement NFS proxy
* Add documentation

2
debian/control vendored
View File

@@ -2,7 +2,7 @@ Source: vitastor
Section: admin
Priority: optional
Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev, cmake, pkg-config
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev
Standards-Version: 4.5.0
Homepage: https://vitastor.io/
Rules-Requires-Root: no

View File

@@ -54,8 +54,7 @@ RUN set -e; \
quilt add block/vitastor.c; \
cp /root/vitastor/src/qemu_driver.c block/vitastor.c; \
quilt refresh; \
V=$(head -n1 debian/changelog | perl -pe 's/5\.2\+dfsg-9/5.2+dfsg-11/; s/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor4; \
if [ "$REL" = bullseye ]; then V=${V}bullseye; fi; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor3; \
DEBEMAIL="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
rm -rf /root/packages/qemu-$REL/qemu-*/

View File

@@ -35,8 +35,8 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-1.2.0; \
cd vitastor-1.2.0; \
cp -r /root/vitastor vitastor-1.0.0; \
cd vitastor-1.0.0; \
ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -49,8 +49,8 @@ RUN set -e -x; \
rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.2.0.orig.tar.xz vitastor-1.2.0; \
cd vitastor-1.2.0; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.0.0.orig.tar.xz vitastor-1.0.0; \
cd vitastor-1.0.0; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

View File

@@ -33,7 +33,6 @@ In the future, additional configuration methods may be added:
- [Common](config/common.en.md)
- [Network](config/network.en.md)
- [Client](config/client.en.md)
- [Global Disk Layout](config/layout-cluster.en.md)
- [OSD Disk Layout](config/layout-osd.en.md)
- [OSD Runtime Parameters](config/osd.en.md)

View File

@@ -36,7 +36,6 @@
- [Общие](config/common.ru.md)
- [Сеть](config/network.ru.md)
- [Клиентский код](config/client.ru.md)
- [Глобальные дисковые параметры](config/layout-cluster.ru.md)
- [Дисковые параметры OSD](config/layout-osd.ru.md)
- [Прочие параметры OSD](config/osd.ru.md)

View File

@@ -1,103 +0,0 @@
[Documentation](../../README.md#documentation) → [Configuration](../config.en.md) → Client Parameters
-----
[Читать на русском](client.ru.md)
# Client Parameters
These parameters apply only to clients and affect their interaction with
the cluster.
- [client_max_dirty_bytes](#client_max_dirty_bytes)
- [client_max_dirty_ops](#client_max_dirty_ops)
- [client_enable_writeback](#client_enable_writeback)
- [client_max_buffered_bytes](#client_max_buffered_bytes)
- [client_max_buffered_ops](#client_max_buffered_ops)
- [client_max_writeback_iodepth](#client_max_writeback_iodepth)
## client_max_dirty_bytes
- Type: integer
- Default: 33554432
- Can be changed online: yes
Without [immediate_commit](layout-cluster.en.md#immediate_commit)=all this parameter sets the limit of "dirty"
(not committed by fsync) data allowed by the client before forcing an
additional fsync and committing the data. Also note that the client always
holds a copy of uncommitted data in memory so this setting also affects
RAM usage of clients.
## client_max_dirty_ops
- Type: integer
- Default: 1024
- Can be changed online: yes
Same as client_max_dirty_bytes, but instead of total size, limits the number
of uncommitted write operations.
## client_enable_writeback
- Type: boolean
- Default: false
- Can be changed online: yes
This parameter enables client-side write buffering. This means that write
requests are accumulated in memory for a short time before being sent to
a Vitastor cluster, which allows them to be sent in parallel and increases
the performance of some applications. Writes are buffered until the client
forces a flush with fsync() or until the amount of buffered writes exceeds
the limit.
Write buffering significantly increases performance of some applications,
for example, CrystalDiskMark under Windows (LOL :-D), but also any other
applications if they do writes in one of two non-optimal ways: either if
they do a lot of small (4 kb or so) sequential writes, or if they do a lot
of small random writes, but without any parallelism or asynchrony, and also
without calling fsync().
With write buffering enabled, you can expect around 22000 T1Q1 random write
iops in QEMU more or less regardless of the quality of your SSDs, and this
number is in fact bound by QEMU itself rather than Vitastor (check it
yourself by adding a "driver=null-co" disk in QEMU). Without write
buffering, the current record is 9900 iops, but the number is usually
even lower with non-ideal hardware, for example, it may be 5000 iops.
Even when this parameter is enabled, write buffering isn't activated until
the client explicitly allows it, because buffering writes without the client
being aware of it may lead to data loss. Because of this, older versions of
clients don't support write buffering at all; newer versions of the QEMU
driver allow write buffering only if it's enabled in disk settings with
`-blockdev cache.direct=false`, and newer versions of FIO only allow write
buffering if you don't specify `-direct=1`. NBD and NFS drivers allow write
buffering by default.
You can overcome this restriction too with the `client_writeback_allowed`
parameter, but you shouldn't do that unless you **really** know what you
are doing.
## client_max_buffered_bytes
- Type: integer
- Default: 33554432
- Can be changed online: yes
Maximum total size of buffered writes which triggers write-back when reached.
## client_max_buffered_ops
- Type: integer
- Default: 1024
- Can be changed online: yes
Maximum number of buffered writes which triggers write-back when reached.
Multiple consecutive modified data regions are counted as 1 write here.
## client_max_writeback_iodepth
- Type: integer
- Default: 256
- Can be changed online: yes
Maximum number of parallel writes when flushing buffered data to the server.

View File

@@ -1,103 +0,0 @@
[Документация](../../README-ru.md#документация) → [Конфигурация](../config.ru.md) → Параметры клиентского кода
-----
[Read in English](client.en.md)
# Параметры клиентского кода
Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD) и
затрагивают логику их работы с кластером.
- [client_max_dirty_bytes](#client_max_dirty_bytes)
- [client_max_dirty_ops](#client_max_dirty_ops)
- [client_enable_writeback](#client_enable_writeback)
- [client_max_buffered_bytes](#client_max_buffered_bytes)
- [client_max_buffered_ops](#client_max_buffered_ops)
- [client_max_writeback_iodepth](#client_max_writeback_iodepth)
## client_max_dirty_bytes
- Тип: целое число
- Значение по умолчанию: 33554432
- Можно менять на лету: да
При работе без [immediate_commit](layout-cluster.ru.md#immediate_commit)=all - это лимит объёма "грязных" (не
зафиксированных fsync-ом) данных, при достижении которого клиент будет
принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
что в этом случае до момента fsync клиент хранит копию незафиксированных
данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
## client_max_dirty_ops
- Тип: целое число
- Значение по умолчанию: 1024
- Можно менять на лету: да
Аналогично client_max_dirty_bytes, но ограничивает количество
незафиксированных операций записи вместо их общего объёма.
## client_enable_writeback
- Тип: булево (да/нет)
- Значение по умолчанию: false
- Можно менять на лету: да
Данный параметр разрешает включать буферизацию записи в памяти. Буферизация
означает, что операции записи отправляются на кластер Vitastor не сразу, а
могут небольшое время накапливаться в памяти и сбрасываться сразу пакетами,
до тех пор, пока либо не будет превышен лимит неотправленных записей, либо
пока клиент не вызовет fsync.
Буферизация значительно повышает производительность некоторых приложений,
например, CrystalDiskMark в Windows (ха-ха :-D), но также и любых других,
которые пишут на диск неоптимально: либо последовательно, но мелкими блоками
(например, по 4 кб), либо случайно, но без параллелизма и без fsync - то
есть, например, отправляя 128 операций записи в разные места диска, но не
все сразу с помощью асинхронного I/O, а по одной.
В QEMU с буферизацией записи можно ожидать показателя примерно 22000
операций случайной записи в секунду в 1 поток и с глубиной очереди 1 (T1Q1)
без fsync, почти вне зависимости от того, насколько хороши ваши диски - эта
цифра упирается в сам QEMU. Без буферизации рекорд пока что - 9900 операций
в секунду, но на железе похуже может быть и поменьше, например, 5000 операций
в секунду.
При этом, даже если данный параметр включён, буферизация не включается, если
явно не разрешена клиентом, т.к. если клиент не знает, что запросы записи
буферизуются, это может приводить к потере данных. Поэтому в старых версиях
клиентских драйверов буферизация записи не включается вообще, в новых
версиях QEMU-драйвера включается, только если разрешена опцией диска
`-blockdev cache.direct=false`, а в fio - только если нет опции `-direct=1`.
В NBD и NFS драйверах буферизация записи разрешена по умолчанию.
Можно обойти и это ограничение с помощью параметра `client_writeback_allowed`,
но делать так не надо, если только вы не уверены в том, что делаете, на все
100%. :-)
## client_max_buffered_bytes
- Тип: целое число
- Значение по умолчанию: 33554432
- Можно менять на лету: да
Максимальный общий размер буферизованных записей, при достижении которого
начинается процесс сброса данных на сервер.
## client_max_buffered_ops
- Тип: целое число
- Значение по умолчанию: 1024
- Можно менять на лету: да
Максимальное количество буферизованных записей, при достижении которого
начинается процесс сброса данных на сервер. При этом несколько
последовательных изменённых областей здесь считаются 1 записью.
## client_max_writeback_iodepth
- Тип: целое число
- Значение по умолчанию: 256
- Можно менять на лету: да
Максимальное число параллельных операций записи при сбросе буферов на сервер.

View File

@@ -96,9 +96,8 @@ SSD cache or "media-cache" - for example, a lot of Seagate EXOS drives have
it (they have internal SSD cache even though it's not stated in datasheets).
Setting this parameter to "all" or "small" in OSD parameters requires enabling
[disable_journal_fsync](layout-osd.en.md#disable_journal_fsync) and
[disable_meta_fsync](layout-osd.en.md#disable_meta_fsync), setting it to
"all" also requires enabling [disable_data_fsync](layout-osd.en.md#disable_data_fsync).
disable_journal_fsync and disable_meta_fsync, setting it to "all" also requires
enabling disable_data_fsync.
TLDR: For optimal performance, set immediate_commit to "all" if you only use
SSDs with supercapacitor-based power loss protection (nonvolatile

View File

@@ -103,9 +103,8 @@ HDD-дисках с внутренним SSD или "медиа" кэшем - н
указано в спецификациях).
Указание "all" или "small" в настройках / командной строке OSD требует
включения [disable_journal_fsync](layout-osd.ru.md#disable_journal_fsync) и
[disable_meta_fsync](layout-osd.ru.md#disable_meta_fsync), значение "all"
также требует включения [disable_data_fsync](layout-osd.ru.md#disable_data_fsync).
включения disable_journal_fsync и disable_meta_fsync, значение "all" также
требует включения disable_data_fsync.
Итого, вкратце: для оптимальной производительности установите
immediate_commit в значение "all", если вы используете в кластере только SSD

View File

@@ -213,6 +213,6 @@ Thus, recommended setups are:
3. Hybrid HDD+SSD: csum_block_size=4k + inmemory_metadata=false
4. HDD-only, faster random read: csum_block_size=32k
5. HDD-only, faster random write: csum_block_size=4k +
inmemory_metadata=false + meta_io=cached
inmemory_metadata=false + cached_io_meta=true
See also [meta_io](osd.en.md#meta_io).
See also [cached_io_meta](osd.en.md#cached_io_meta).

View File

@@ -226,6 +226,6 @@ csum_block_size данных.
3. Гибридные HDD+SSD: csum_block_size=4k + inmemory_metadata=false
4. Только HDD, быстрее случайное чтение: csum_block_size=32k
5. Только HDD, быстрее случайная запись: csum_block_size=4k +
inmemory_metadata=false + meta_io=cached
inmemory_metadata=false + cached_io_meta=true
Смотрите также [meta_io](osd.ru.md#meta_io).
Смотрите также [cached_io_meta](osd.ru.md#cached_io_meta).

View File

@@ -20,7 +20,6 @@ between clients, OSDs and etcd.
- [rdma_max_msg](#rdma_max_msg)
- [rdma_max_recv](#rdma_max_recv)
- [rdma_max_send](#rdma_max_send)
- [rdma_odp](#rdma_odp)
- [peer_connect_interval](#peer_connect_interval)
- [peer_connect_timeout](#peer_connect_timeout)
- [osd_idle_timeout](#osd_idle_timeout)
@@ -31,6 +30,7 @@ between clients, OSDs and etcd.
- [etcd_slow_timeout](#etcd_slow_timeout)
- [etcd_keepalive_timeout](#etcd_keepalive_timeout)
- [etcd_ws_keepalive_timeout](#etcd_ws_keepalive_timeout)
- [client_dirty_limit](#client_dirty_limit)
## tcp_header_buffer_size
@@ -69,14 +69,11 @@ but they are not connected to the cluster.
- Type: string
RDMA device name to use for Vitastor OSD communications (for example,
"rocep5s0f0"). Now Vitastor supports all adapters, even ones without
ODP support, like Mellanox ConnectX-3 and non-Mellanox cards.
Versions up to Vitastor 1.2.0 required ODP which is only present in
Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp).
Run `ibv_devinfo -v` as root to list available RDMA devices and their
features.
"rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
to work. For example, Mellanox ConnectX-3 and older adapters don't have
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
root to list available RDMA devices and their features.
Remember that you also have to configure your network switches if you use
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
@@ -151,28 +148,6 @@ less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
Doesn't affect memory usage - additional memory isn't allocated for send
operations.
## rdma_odp
- Type: boolean
- Default: false
Use RDMA with On-Demand Paging. ODP is currently only available on Mellanox
ConnectX-4 and newer adapters. ODP removes the need to register memory explicitly
before the RDMA adapter can use it. This, in turn, makes it possible to skip memory
copying during sending. One would think this should improve performance, but
**in reality** RDMA performance with ODP is **drastically** worse. Example
3-node cluster with 8 NVMe in each node and 2*25 GBit/s ConnectX-6 RDMA network
without ODP pushes 3950000 read iops, but only 239000 iops with ODP...
This happens because Mellanox ODP implementation seems to be based on
message retransmissions when the adapter doesn't know about the buffer yet -
it likely uses standard "RNR retransmissions" (RNR = receiver not ready)
which is generally slow in RDMA/RoCE networks. Here's a presentation about
it from ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
ODP support is retained in the code just in case a good ODP implementation
appears one day.
## peer_connect_interval
- Type: seconds
@@ -265,3 +240,17 @@ etcd_report_interval to guarantee that keepalive actually works.
etcd websocket ping interval required to keep the connection alive and
detect disconnections quickly.
## client_dirty_limit
- Type: integer
- Default: 33554432
- Can be changed online: yes
Without immediate_commit=all this parameter sets the limit of "dirty"
(not committed by fsync) data allowed by the client before forcing an
additional fsync and committing the data. Also note that the client always
holds a copy of uncommitted data in memory so this setting also affects
RAM usage of clients.
This parameter doesn't affect OSDs themselves.

View File

@@ -20,7 +20,6 @@
- [rdma_max_msg](#rdma_max_msg)
- [rdma_max_recv](#rdma_max_recv)
- [rdma_max_send](#rdma_max_send)
- [rdma_odp](#rdma_odp)
- [peer_connect_interval](#peer_connect_interval)
- [peer_connect_timeout](#peer_connect_timeout)
- [osd_idle_timeout](#osd_idle_timeout)
@@ -31,6 +30,7 @@
- [etcd_slow_timeout](#etcd_slow_timeout)
- [etcd_keepalive_timeout](#etcd_keepalive_timeout)
- [etcd_ws_keepalive_timeout](#etcd_ws_keepalive_timeout)
- [client_dirty_limit](#client_dirty_limit)
## tcp_header_buffer_size
@@ -72,15 +72,12 @@ RDMA может быть нужно только если у клиентов е
- Тип: строка
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
Сейчас Vitastor поддерживает все модели адаптеров, включая те, у которых
нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и
картами производства не Mellanox.
Версии Vitastor до 1.2.0 включительно требовали ODP, который есть только
на Mellanox ConnectX 4 и более новых. См. также [rdma_odp](#rdma_odp).
Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть
список доступных RDMA-устройств, их параметры и возможности.
Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
параметры и возможности.
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
@@ -159,29 +156,6 @@ OSD в любом случае согласовывают реальное зн
Не влияет на потребление памяти - дополнительная память на операции отправки
не выделяется.
## rdma_odp
- Тип: булево (да/нет)
- Значение по умолчанию: false
Использовать RDMA с On-Demand Paging. ODP - функция, доступная пока что
исключительно на адаптерах Mellanox ConnectX-4 и более новых. ODP позволяет
не регистрировать память для её использования RDMA-картой. Благодаря этому
можно не копировать данные при отправке их в сеть и, казалось бы, это должно
улучшать производительность - но **по факту** получается так, что
производительность только ухудшается, причём сильно. Пример - на 3-узловом
кластере с 8 NVMe в каждом узле и сетью 2*25 Гбит/с на чтение с RDMA без ODP
удаётся снять 3950000 iops, а с ODP - всего 239000 iops...
Это происходит из-за того, что реализация ODP у Mellanox неоптимальная и
основана на повторной передаче сообщений, когда карте не известен буфер -
вероятно, на стандартных "RNR retransmission" (RNR = receiver not ready).
А данные повторные передачи в RDMA/RoCE - всегда очень медленная штука.
Презентация на эту тему с конференции ISPASS-2021: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
Возможность использования ODP сохранена в коде на случай, если вдруг в один
прекрасный день появится хорошая реализация ODP.
## peer_connect_interval
- Тип: секунды
@@ -277,3 +251,17 @@ etcd_report_interval, чтобы keepalive гарантированно рабо
- Можно менять на лету: да
Интервал проверки живости вебсокет-подключений к etcd.
## client_dirty_limit
- Тип: целое число
- Значение по умолчанию: 33554432
- Можно менять на лету: да
При работе без immediate_commit=all - это лимит объёма "грязных" (не
зафиксированных fsync-ом) данных, при достижении которого клиент будет
принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
что в этом случае до момента fsync клиент хранит копию незафиксированных
данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
Параметр не влияет на сами OSD.

View File

@@ -11,7 +11,6 @@ initialization and can be changed - either with an OSD restart or, for some of
them, even without restarting by updating configuration in etcd.
- [etcd_report_interval](#etcd_report_interval)
- [etcd_stats_interval](#etcd_stats_interval)
- [run_primary](#run_primary)
- [osd_network](#osd_network)
- [bind_address](#bind_address)
@@ -32,9 +31,9 @@ them, even without restarting by updating configuration in etcd.
- [max_flusher_count](#max_flusher_count)
- [inmemory_metadata](#inmemory_metadata)
- [inmemory_journal](#inmemory_journal)
- [data_io](#data_io)
- [meta_io](#meta_io)
- [journal_io](#journal_io)
- [cached_io_data](#cached_io_data)
- [cached_io_meta](#cached_io_meta)
- [cached_io_journal](#cached_io_journal)
- [journal_sector_buffer_count](#journal_sector_buffer_count)
- [journal_no_same_sector_overwrites](#journal_no_same_sector_overwrites)
- [throttle_small_writes](#throttle_small_writes)
@@ -57,21 +56,11 @@ them, even without restarting by updating configuration in etcd.
- Type: seconds
- Default: 5
Interval at which OSDs report their liveness to etcd. Affects OSD lease time
Interval at which OSDs report their state to etcd. Affects OSD lease time
and thus the failover speed. Lease time is equal to this parameter value
plus max_etcd_attempts * etcd_quick_timeout because it should be guaranteed
that every OSD always refreshes its lease in time.
## etcd_stats_interval
- Type: seconds
- Default: 30
Interval at which OSDs report their statistics to etcd. Highly affects the
imposed load on etcd, because statistics include a key for every OSD and
for every PG. At the same time, low statistic intervals make `vitastor-cli`
statistics more responsive.
## run_primary
- Type: boolean
@@ -269,59 +258,47 @@ is typically very small because it's sufficient to have 16-32 MB journal
for SSD OSDs. However, in theory it's possible that you'll want to turn it
off for hybrid (HDD+SSD) OSDs with large journals on quick devices.
## data_io
## cached_io_data
- Type: string
- Default: direct
- Type: boolean
- Default: false
I/O mode for *data*. One of "direct", "cached" or "directsync". Corresponds
to O_DIRECT, O_SYNC and O_DIRECT|O_SYNC, respectively.
Read and write *data* through Linux page cache, i.e. use a file descriptor
opened with O_SYNC, but without O_DIRECT for I/O. May improve read
performance for hot data and slower disks - HDDs and maybe SATA SSDs.
Not recommended for desktop SSDs without capacitors because O_SYNC flushes
disk cache on every write.
Choose "cached" to use Linux page cache. This may improve read performance
for hot data and slower disks - HDDs and maybe SATA SSDs - but will slightly
decrease write performance for fast disks because page cache is an overhead
itself.
## cached_io_meta
Choose "directsync" to use [immediate_commit](layout-cluster.en.md#immediate_commit)
(which requires disable_data_fsync) with drives having write-back cache
which can't be turned off, for example, Intel Optane. Also note that *some*
desktop SSDs (for example, HP EX950) may ignore O_SYNC thus making
disable_data_fsync unsafe even with "directsync".
- Type: boolean
- Default: false
## meta_io
Read and write *metadata* through Linux page cache. May improve read
performance only if your drives are relatively slow (HDD, SATA SSD), and
only if checksums are enabled and [inmemory_metadata](#inmemory_metadata)
is disabled, because in this case metadata blocks are read from disk
on every read request to verify checksums and caching them may reduce this
extra read load.
- Type: string
- Default: direct
I/O mode for *metadata*. One of "direct", "cached" or "directsync".
"cached" may improve read performance, but only under the following conditions:
1. your drives are relatively slow (HDD, SATA SSD), and
2. checksums are enabled, and
3. [inmemory_metadata](#inmemory_metadata) is disabled.
Under all these conditions, metadata blocks are read from disk on every
read request to verify checksums and caching them may reduce this extra
read load. Without (3) metadata is never read from the disk after starting,
and without (2) metadata blocks are read from disk only during journal
Absolutely pointless to enable with enabled inmemory_metadata because all
metadata is kept in memory anyway, and likely pointless without checksums,
because in that case, metadata blocks are read from disk only during journal
flushing.
"directsync" is the same as above.
If the same device is used for data and metadata, enabling [cached_io_data](#cached_io_data)
also enables this parameter, given that it isn't turned off explicitly.
If the same device is used for data and metadata, meta_io by default is set
to the same value as [data_io](#data_io).
## cached_io_journal
## journal_io
- Type: boolean
- Default: false
- Type: string
- Default: direct
Read and write *journal* through Linux page cache. May improve read
performance if [inmemory_journal](#inmemory_journal) is turned off.
I/O mode for *journal*. One of "direct", "cached" or "directsync".
Here, "cached" may only improve read performance for recent writes and
only if [inmemory_journal](#inmemory_journal) is turned off.
If the same device is used for metadata and journal, journal_io by default
is set to the same value as [meta_io](#meta_io).
If the same device is used for metadata and journal, enabling [cached_io_meta](#cached_io_meta)
also enables this parameter, given that it isn't turned off explicitly.
## journal_sector_buffer_count

View File

@@ -12,7 +12,6 @@
изменения конфигурации в etcd.
- [etcd_report_interval](#etcd_report_interval)
- [etcd_stats_interval](#etcd_stats_interval)
- [run_primary](#run_primary)
- [osd_network](#osd_network)
- [bind_address](#bind_address)
@@ -33,9 +32,9 @@
- [max_flusher_count](#max_flusher_count)
- [inmemory_metadata](#inmemory_metadata)
- [inmemory_journal](#inmemory_journal)
- [data_io](#data_io)
- [meta_io](#meta_io)
- [journal_io](#journal_io)
- [cached_io_data](#cached_io_data)
- [cached_io_meta](#cached_io_meta)
- [cached_io_journal](#cached_io_journal)
- [journal_sector_buffer_count](#journal_sector_buffer_count)
- [journal_no_same_sector_overwrites](#journal_no_same_sector_overwrites)
- [throttle_small_writes](#throttle_small_writes)
@@ -58,21 +57,11 @@
- Тип: секунды
- Значение по умолчанию: 5
Интервал, с которым OSD сообщает о том, что жив, в etcd. Значение параметра
влияет на время резервации (lease) OSD и поэтому - на скорость переключения
Интервал, с которым OSD обновляет своё состояние в etcd. Значение параметра
влияет на время резервации (lease) OSD и поэтому на скорость переключения
при падении OSD. Время lease равняется значению этого параметра плюс
max_etcd_attempts * etcd_quick_timeout.
## etcd_stats_interval
- Тип: секунды
- Значение по умолчанию: 30
Интервал, с которым OSD обновляет свою статистику в etcd. Сильно влияет на
создаваемую нагрузку на etcd, потому что статистика содержит по ключу на
каждый OSD и на каждую PG. В то же время низкий интервал делает
статистику, печатаемую `vitastor-cli`, отзывчивей.
## run_primary
- Тип: булево (да/нет)
@@ -277,62 +266,51 @@ Flusher - это микро-поток (корутина), которая коп
параметра может оказаться полезным для гибридных OSD (HDD+SSD) с большими
журналами, расположенными на быстром по сравнению с HDD устройстве.
## data_io
## cached_io_data
- Тип: строка
- Значение по умолчанию: direct
- Тип: булево (да/нет)
- Значение по умолчанию: false
Режим ввода-вывода для *данных*. Одно из значений "direct", "cached" или
"directsync", означающих O_DIRECT, O_SYNC и O_DIRECT|O_SYNC, соответственно.
Читать и записывать *данные* через системный кэш Linux (page cache), то есть,
использовать для данных файловый дескриптор, открытый без флага O_DIRECT, но
с флагом O_SYNC. Может улучшить скорость чтения для относительно медленных
дисков - HDD и, возможно, SATA SSD. Не рекомендуется для потребительских
SSD без конденсаторов, так как O_SYNC сбрасывает кэш диска при каждой записи.
Выберите "cached", чтобы использовать системный кэш Linux (page cache) при
чтении и записи. Это может улучшить скорость чтения горячих данных с
относительно медленных дисков - HDD и, возможно, SATA SSD - но немного
снижает производительность записи для быстрых дисков, так как кэш сам по
себе тоже добавляет накладные расходы.
## cached_io_meta
Выберите "directsync", если хотите задействовать
[immediate_commit](layout-cluster.ru.md#immediate_commit) (требующий
включения disable_data_fsync) на дисках с неотключаемым кэшем. Пример таких
дисков - Intel Optane. При этом также стоит иметь в виду, что *некоторые*
настольные SSD (например, HP EX950) игнорируют флаг O_SYNC, делая отключение
fsync небезопасным даже с режимом "directsync".
- Тип: булево (да/нет)
- Значение по умолчанию: false
## meta_io
- Тип: строка
- Значение по умолчанию: direct
Режим ввода-вывода для *метаданных*. Одно из значений "direct", "cached" или
"directsync".
"cached" может улучшить скорость чтения, если:
1. у вас медленные диски (HDD, SATA SSD)
2. контрольные суммы включены
3. параметр [inmemory_metadata](#inmemory_metadata) отключён.
При этих условиях блоки метаданных читаются с диска при каждом запросе чтения
Читать и записывать *метаданные* через системный кэш Linux. Может улучшить
скорость чтения, если у вас медленные диски, и только если контрольные суммы
включены, а параметр [inmemory_metadata](#inmemory_metadata) отключён, так
как в этом случае блоки метаданных читаются с диска при каждом запросе чтения
для проверки контрольных сумм и их кэширование может снизить дополнительную
нагрузку на диск. Без (3) метаданные никогда не читаются с диска после
запуска OSD, а без (2) блоки метаданных читаются только при сбросе журнала.
нагрузку на диск.
Если одно и то же устройство используется для данных и метаданных, режим
ввода-вывода метаданных по умолчанию устанавливается равным [data_io](#data_io).
Абсолютно бессмысленно включать данный параметр, если параметр
inmemory_metadata включён (по умолчанию это так), и также вероятно
бессмысленно включать его, если не включены контрольные суммы, так как в
этом случае блоки метаданных читаются с диска только во время сброса
журнала.
## journal_io
Если одно и то же устройство используется для данных и метаданных, включение
[cached_io_data](#cached_io_data) также включает данный параметр, при
условии, что он не отключён явным образом.
- Тип: строка
- Значение по умолчанию: direct
## cached_io_journal
Режим ввода-вывода для *журнала*. Одно из значений "direct", "cached" или
"directsync".
- Тип: булево (да/нет)
- Значение по умолчанию: false
Здесь "cached" может улучшить скорость чтения только недавно записанных
данных и только если параметр [inmemory_journal](#inmemory_journal)
Читать и записывать *журнал* через системный кэш Linux. Может улучшить
скорость чтения, если параметр [inmemory_journal](#inmemory_journal)
отключён.
Если одно и то же устройство используется для метаданных и журнала,
режим ввода-вывода журнала по умолчанию устанавливается равным
[meta_io](#meta_io).
включение [cached_io_meta](#cached_io_meta) также включает данный
параметр, при условии, что он не отключён явным образом.
## journal_sector_buffer_count

View File

@@ -205,8 +205,9 @@ This parameter usually doesn't require to be changed.
- Default: 131072
Block size for this pool. The value from /vitastor/config/global is used when
unspecified. Only OSDs with matching block_size are used for each pool. If you
want to further restrict OSDs for the pool, use [osd_tags](#osd_tags).
unspecified. If your cluster has OSDs with different block sizes then pool must
be restricted by [osd_tags](#osd_tags) to only include OSDs with matching block
size.
Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-cluster.en.md#block_size).
@@ -215,9 +216,10 @@ Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-c
- Type: integer
- Default: 4096
"Sector" size of virtual disks in this pool. The value from /vitastor/config/global
is used when unspecified. Similarly to block_size, only OSDs with matching
bitmap_granularity are used for each pool.
"Sector" size of virtual disks in this pool. The value from
/vitastor/config/global is used when unspecified. Similar to block_size, the
pool must be restricted by [osd_tags](#osd_tags) to only include OSDs with
matching bitmap_granularity.
Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-cluster.en.md#bitmap_granularity).
@@ -227,11 +229,10 @@ Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-c
- Default: none
Immediate commit setting for this pool. The value from /vitastor/config/global
is used when unspecified. Similarly to block_size, only OSDs with compatible
immediate_commit are used for each pool. "Compatible" means that a pool with
non-immediate commit will use OSDs with immediate commit enabled, but not vice
versa. I.e., pools with "none" use all OSDs, pools with "small" only use OSDs
with "all" or "small", and pools with "all" only use OSDs with "all".
is used when unspecified. Similar to block_size, the pool must be restricted by
[osd_tags](#osd_tags) to only include OSDs with compatible immediate_commit.
Compatible means that a pool with non-immediate commit will work with OSDs with
immediate commit enabled, but not vice versa.
Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-cluster.en.md#immediate_commit).

View File

@@ -208,9 +208,8 @@ PG в Vitastor эферемерны, то есть вы можете менят
Размер блока для данного пула. Если не задан, используется значение из
/vitastor/config/global. Если в вашем кластере есть OSD с разными размерами
блока, пул будет использовать только OSD с размером блока, равным размеру блока
пула. Если вы хотите сильнее ограничить набор используемых для пула OSD -
используйте [osd_tags](#osd_tags).
блока, пул должен быть ограничен только OSD, блок которых равен блоку пула,
с помощью [osd_tags](#osd_tags).
О самом параметре читайте в разделе [Дисковые параметры уровня кластера](layout-cluster.ru.md#block_size).
@@ -220,8 +219,9 @@ PG в Vitastor эферемерны, то есть вы можете менят
- По умолчанию: 4096
Размер "сектора" виртуальных дисков в данном пуле. Если не задан, используется
значение из /vitastor/config/global. Аналогично block_size, каждый пул будет
использовать только OSD с совпадающей с пулом настройкой bitmap_granularity.
значение из /vitastor/config/global. Аналогично block_size, пул должен быть
ограничен OSD со значением bitmap_granularity, равным значению пула, с помощью
[osd_tags](#osd_tags).
О самом параметре читайте в разделе [Дисковые параметры уровня кластера](layout-cluster.ru.md#bitmap_granularity).
@@ -231,13 +231,11 @@ PG в Vitastor эферемерны, то есть вы можете менят
- По умолчанию: none
Настройка мгновенного коммита для данного пула. Если не задана, используется
значение из /vitastor/config/global. Аналогично block_size, каждый пул будет
использовать только OSD с *совместимыми* настройками immediate_commit.
"Совместимыми" означает, что пул с отключенным мгновенным коммитом будет
использовать OSD с включённым мгновенным коммитом, но не наоборот. То есть,
пул со значением "none" будет использовать все OSD, пул со "small" будет
использовать OSD с "all" или "small", а пул с "all" будет использовать только
OSD с "all".
значение из /vitastor/config/global. Аналогично block_size, пул должен быть
ограничен OSD со значением immediate_commit, совместимым со значением пула, с
помощью [osd_tags](#osd_tags). Совместимость означает, что пул с отключенным
мгновенным коммитом может работать на OSD с включённым мгновенным коммитом, но
не наоборот.
О самом параметре читайте в разделе [Дисковые параметры уровня кластера](layout-cluster.ru.md#immediate_commit).

View File

@@ -1,4 +0,0 @@
# Client Parameters
These parameters apply only to clients and affect their interaction with
the cluster.

View File

@@ -1,4 +0,0 @@
# Параметры клиентского кода
Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD) и
затрагивают логику их работы с кластером.

View File

@@ -1,124 +0,0 @@
- name: client_max_dirty_bytes
type: int
default: 33554432
online: true
info: |
Without [immediate_commit](layout-cluster.en.md#immediate_commit)=all this parameter sets the limit of "dirty"
(not committed by fsync) data allowed by the client before forcing an
additional fsync and committing the data. Also note that the client always
holds a copy of uncommitted data in memory so this setting also affects
RAM usage of clients.
info_ru: |
При работе без [immediate_commit](layout-cluster.ru.md#immediate_commit)=all - это лимит объёма "грязных" (не
зафиксированных fsync-ом) данных, при достижении которого клиент будет
принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
что в этом случае до момента fsync клиент хранит копию незафиксированных
данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
- name: client_max_dirty_ops
type: int
default: 1024
online: true
info: |
Same as client_max_dirty_bytes, but instead of total size, limits the number
of uncommitted write operations.
info_ru: |
Аналогично client_max_dirty_bytes, но ограничивает количество
незафиксированных операций записи вместо их общего объёма.
- name: client_enable_writeback
type: bool
default: false
online: true
info: |
This parameter enables client-side write buffering. This means that write
requests are accumulated in memory for a short time before being sent to
a Vitastor cluster, which allows sending them in parallel and increases the
performance of some applications. Writes are buffered until client forces
a flush with fsync() or until the amount of buffered writes exceeds the
limit.
Write buffering significantly increases performance of some applications,
for example, CrystalDiskMark under Windows (LOL :-D), but also any other
applications if they do writes in one of two non-optimal ways: either if
they do a lot of small (4 kb or so) sequential writes, or if they do a lot
of small random writes, but without any parallelism or asynchrony, and also
without calling fsync().
With write buffering enabled, you can expect around 22000 T1Q1 random write
iops in QEMU more or less regardless of the quality of your SSDs, and this
number is in fact bound by QEMU itself rather than Vitastor (check it
yourself by adding a "driver=null-co" disk in QEMU). Without write
buffering, the current record is 9900 iops, but the number is usually
even lower with non-ideal hardware, for example, it may be 5000 iops.
Even when this parameter is enabled, write buffering isn't enabled until
the client explicitly allows it, because enabling it without the client
being aware of the fact that his writes may be buffered may lead to data
loss. Because of this, older versions of clients don't support write
buffering at all, newer versions of the QEMU driver allow write buffering
only if it's enabled in disk settings with `-blockdev cache.direct=false`,
and newer versions of FIO only allow write buffering if you don't specify
`-direct=1`. NBD and NFS drivers allow write buffering by default.
You can overcome this restriction too with the `client_writeback_allowed`
parameter, but you shouldn't do that unless you **really** know what you
are doing.
info_ru: |
Данный параметр разрешает включать буферизацию записи в памяти. Буферизация
означает, что операции записи отправляются на кластер Vitastor не сразу, а
могут небольшое время накапливаться в памяти и сбрасываться сразу пакетами,
до тех пор, пока либо не будет превышен лимит неотправленных записей, либо
пока клиент не вызовет fsync.
Буферизация значительно повышает производительность некоторых приложений,
например, CrystalDiskMark в Windows (ха-ха :-D), но также и любых других,
которые пишут на диск неоптимально: либо последовательно, но мелкими блоками
(например, по 4 кб), либо случайно, но без параллелизма и без fsync - то
есть, например, отправляя 128 операций записи в разные места диска, но не
все сразу с помощью асинхронного I/O, а по одной.
В QEMU с буферизацией записи можно ожидать показателя примерно 22000
операций случайной записи в секунду в 1 поток и с глубиной очереди 1 (T1Q1)
без fsync, почти вне зависимости от того, насколько хороши ваши диски - эта
цифра упирается в сам QEMU. Без буферизации рекорд пока что - 9900 операций
в секунду, но на железе похуже может быть и поменьше, например, 5000 операций
в секунду.
При этом, даже если данный параметр включён, буферизация не включается, если
явно не разрешена клиентом, т.к. если клиент не знает, что запросы записи
буферизуются, это может приводить к потере данных. Поэтому в старых версиях
клиентских драйверов буферизация записи не включается вообще, в новых
версиях QEMU-драйвера включается, только если разрешена опцией диска
`-blockdev cache.direct=false`, а в fio - только если нет опции `-direct=1`.
В NBD и NFS драйверах буферизация записи разрешена по умолчанию.
Можно обойти и это ограничение с помощью параметра `client_writeback_allowed`,
но делать так не надо, если только вы не уверены в том, что делаете, на все
100%. :-)
- name: client_max_buffered_bytes
type: int
default: 33554432
online: true
info: |
Maximum total size of buffered writes which triggers write-back when reached.
info_ru: |
Максимальный общий размер буферизованных записей, при достижении которого
начинается процесс сброса данных на сервер.
- name: client_max_buffered_ops
type: int
default: 1024
online: true
info: |
Maximum number of buffered writes which triggers write-back when reached.
Multiple consecutive modified data regions are counted as 1 write here.
info_ru: |
Максимальное количество буферизованных записей, при достижении которого
начинается процесс сброса данных на сервер. При этом несколько
последовательных изменённых областей здесь считаются 1 записью.
- name: client_max_writeback_iodepth
type: int
default: 256
online: true
info: |
Maximum number of parallel writes when flushing buffered data to the server.
info_ru: |
Максимальное число параллельных операций записи при сбросе буферов на сервер.

View File

@@ -28,8 +28,6 @@
{{../../config/network.en.md|indent=2}}
{{../../config/client.en.md|indent=2}}
{{../../config/layout-cluster.en.md|indent=2}}
{{../../config/layout-osd.en.md|indent=2}}

View File

@@ -28,8 +28,6 @@
{{../../config/network.ru.md|indent=2}}
{{../../config/client.ru.md|indent=2}}
{{../../config/layout-cluster.ru.md|indent=2}}
{{../../config/layout-osd.ru.md|indent=2}}

View File

@@ -87,9 +87,8 @@
it (they have internal SSD cache even though it's not stated in datasheets).
Setting this parameter to "all" or "small" in OSD parameters requires enabling
[disable_journal_fsync](layout-osd.en.md#disable_journal_fsync) and
[disable_meta_fsync](layout-osd.en.md#disable_meta_fsync), setting it to
"all" also requires enabling [disable_data_fsync](layout-osd.en.md#disable_data_fsync).
disable_journal_fsync and disable_meta_fsync, setting it to "all" also requires
enabling disable_data_fsync.
TLDR: For optimal performance, set immediate_commit to "all" if you only use
SSDs with supercapacitor-based power loss protection (nonvolatile
@@ -141,9 +140,8 @@
указано в спецификациях).
Указание "all" или "small" в настройках / командной строке OSD требует
включения [disable_journal_fsync](layout-osd.ru.md#disable_journal_fsync) и
[disable_meta_fsync](layout-osd.ru.md#disable_meta_fsync), значение "all"
также требует включения [disable_data_fsync](layout-osd.ru.md#disable_data_fsync).
включения disable_journal_fsync и disable_meta_fsync, значение "all" также
требует включения disable_data_fsync.
Итого, вкратце: для оптимальной производительности установите
immediate_commit в значение "all", если вы используете в кластере только SSD

View File

@@ -244,9 +244,9 @@
3. Hybrid HDD+SSD: csum_block_size=4k + inmemory_metadata=false
4. HDD-only, faster random read: csum_block_size=32k
5. HDD-only, faster random write: csum_block_size=4k +
inmemory_metadata=false + meta_io=cached
inmemory_metadata=false + cached_io_meta=true
See also [meta_io](osd.en.md#meta_io).
See also [cached_io_meta](osd.en.md#cached_io_meta).
info_ru: |
Размер блока расчёта контрольных сумм.
@@ -271,6 +271,6 @@
3. Гибридные HDD+SSD: csum_block_size=4k + inmemory_metadata=false
4. Только HDD, быстрее случайное чтение: csum_block_size=32k
5. Только HDD, быстрее случайная запись: csum_block_size=4k +
inmemory_metadata=false + meta_io=cached
inmemory_metadata=false + cached_io_meta=true
Смотрите также [meta_io](osd.ru.md#meta_io).
Смотрите также [cached_io_meta](osd.ru.md#cached_io_meta).

View File

@@ -48,14 +48,11 @@
type: string
info: |
RDMA device name to use for Vitastor OSD communications (for example,
"rocep5s0f0"). Now Vitastor supports all adapters, even ones without
ODP support, like Mellanox ConnectX-3 and non-Mellanox cards.
Versions up to Vitastor 1.2.0 required ODP which is only present in
Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp).
Run `ibv_devinfo -v` as root to list available RDMA devices and their
features.
"rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
to work. For example, Mellanox ConnectX-3 and older adapters don't have
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
root to list available RDMA devices and their features.
Remember that you also have to configure your network switches if you use
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
@@ -64,15 +61,12 @@
PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
info_ru: |
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
Сейчас Vitastor поддерживает все модели адаптеров, включая те, у которых
нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и
картами производства не Mellanox.
Версии Vitastor до 1.2.0 включительно требовали ODP, который есть только
на Mellanox ConnectX 4 и более новых. См. также [rdma_odp](#rdma_odp).
Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть
список доступных RDMA-устройств, их параметры и возможности.
Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
параметры и возможности.
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
@@ -166,45 +160,6 @@
у принимающей стороны в процессе работы не заканчивались буферы на приём.
Не влияет на потребление памяти - дополнительная память на операции отправки
не выделяется.
- name: rdma_odp
type: bool
default: false
online: false
info: |
Use RDMA with On-Demand Paging. ODP is currently only available on Mellanox
ConnectX-4 and newer adapters. ODP allows to not register memory explicitly
for RDMA adapter to be able to use it. This, in turn, allows to skip memory
copying during sending. One would think this should improve performance, but
**in reality** RDMA performance with ODP is **drastically** worse. Example
3-node cluster with 8 NVMe in each node and 2*25 GBit/s ConnectX-6 RDMA network
without ODP pushes 3950000 read iops, but only 239000 iops with ODP...
This happens because Mellanox ODP implementation seems to be based on
message retransmissions when the adapter doesn't know about the buffer yet -
it likely uses standard "RNR retransmissions" (RNR = receiver not ready)
which is generally slow in RDMA/RoCE networks. Here's a presentation about
it from ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
ODP support is retained in the code just in case a good ODP implementation
appears one day.
info_ru: |
Использовать RDMA с On-Demand Paging. ODP - функция, доступная пока что
исключительно на адаптерах Mellanox ConnectX-4 и более новых. ODP позволяет
не регистрировать память для её использования RDMA-картой. Благодаря этому
можно не копировать данные при отправке их в сеть и, казалось бы, это должно
улучшать производительность - но **по факту** получается так, что
производительность только ухудшается, причём сильно. Пример - на 3-узловом
кластере с 8 NVMe в каждом узле и сетью 2*25 Гбит/с на чтение с RDMA без ODP
удаётся снять 3950000 iops, а с ODP - всего 239000 iops...
Это происходит из-за того, что реализация ODP у Mellanox неоптимальная и
основана на повторной передаче сообщений, когда карте не известен буфер -
вероятно, на стандартных "RNR retransmission" (RNR = receiver not ready).
А данные повторные передачи в RDMA/RoCE - всегда очень медленная штука.
Презентация на эту тему с конференции ISPASS-2021: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
Возможность использования ODP сохранена в коде на случай, если вдруг в один
прекрасный день появится хорошая реализация ODP.
- name: peer_connect_interval
type: sec
min: 1
@@ -304,3 +259,23 @@
detect disconnections quickly.
info_ru: |
Интервал проверки живости вебсокет-подключений к etcd.
- name: client_dirty_limit
type: int
default: 33554432
online: true
info: |
Without immediate_commit=all this parameter sets the limit of "dirty"
(not committed by fsync) data allowed by the client before forcing an
additional fsync and committing the data. Also note that the client always
holds a copy of uncommitted data in memory so this setting also affects
RAM usage of clients.
This parameter doesn't affect OSDs themselves.
info_ru: |
При работе без immediate_commit=all - это лимит объёма "грязных" (не
зафиксированных fsync-ом) данных, при достижении которого клиент будет
принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
что в этом случае до момента fsync клиент хранит копию незафиксированных
данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
Параметр не влияет на сами OSD.

View File

@@ -2,28 +2,15 @@
type: sec
default: 5
info: |
Interval at which OSDs report their liveness to etcd. Affects OSD lease time
Interval at which OSDs report their state to etcd. Affects OSD lease time
and thus the failover speed. Lease time is equal to this parameter value
plus max_etcd_attempts * etcd_quick_timeout because it should be guaranteed
that every OSD always refreshes its lease in time.
info_ru: |
Интервал, с которым OSD сообщает о том, что жив, в etcd. Значение параметра
влияет на время резервации (lease) OSD и поэтому - на скорость переключения
Интервал, с которым OSD обновляет своё состояние в etcd. Значение параметра
влияет на время резервации (lease) OSD и поэтому на скорость переключения
при падении OSD. Время lease равняется значению этого параметра плюс
max_etcd_attempts * etcd_quick_timeout.
- name: etcd_stats_interval
type: sec
default: 30
info: |
Interval at which OSDs report their statistics to etcd. Highly affects the
imposed load on etcd, because statistics include a key for every OSD and
for every PG. At the same time, low statistic intervals make `vitastor-cli`
statistics more responsive.
info_ru: |
Интервал, с которым OSD обновляет свою статистику в etcd. Сильно влияет на
создаваемую нагрузку на etcd, потому что статистика содержит по ключу на
каждый OSD и на каждую PG. В то же время низкий интервал делает
статистику, печатаемую `vitastor-cli`, отзывчивей.
- name: run_primary
type: bool
default: true
@@ -273,96 +260,73 @@
достаточно 16- или 32-мегабайтного журнала. Однако в теории отключение
параметра может оказаться полезным для гибридных OSD (HDD+SSD) с большими
журналами, расположенными на быстром по сравнению с HDD устройстве.
- name: data_io
type: string
default: direct
- name: cached_io_data
type: bool
default: false
info: |
I/O mode for *data*. One of "direct", "cached" or "directsync". Corresponds
to O_DIRECT, O_SYNC and O_DIRECT|O_SYNC, respectively.
Choose "cached" to use Linux page cache. This may improve read performance
for hot data and slower disks - HDDs and maybe SATA SSDs - but will slightly
decrease write performance for fast disks because page cache is an overhead
itself.
Choose "directsync" to use [immediate_commit](layout-cluster.en.md#immediate_commit)
(which requires disable_data_fsync) with drives having write-back cache
which can't be turned off, for example, Intel Optane. Also note that *some*
desktop SSDs (for example, HP EX950) may ignore O_SYNC thus making
disable_data_fsync unsafe even with "directsync".
Read and write *data* through Linux page cache, i.e. use a file descriptor
opened with O_SYNC, but without O_DIRECT for I/O. May improve read
performance for hot data and slower disks - HDDs and maybe SATA SSDs.
Not recommended for desktop SSDs without capacitors because O_SYNC flushes
disk cache on every write.
info_ru: |
Режим ввода-вывода для *данных*. Одно из значений "direct", "cached" или
"directsync", означающих O_DIRECT, O_SYNC и O_DIRECT|O_SYNC, соответственно.
Выберите "cached", чтобы использовать системный кэш Linux (page cache) при
чтении и записи. Это может улучшить скорость чтения горячих данных с
относительно медленных дисков - HDD и, возможно, SATA SSD - но немного
снижает производительность записи для быстрых дисков, так как кэш сам по
себе тоже добавляет накладные расходы.
Выберите "directsync", если хотите задействовать
[immediate_commit](layout-cluster.ru.md#immediate_commit) (требующий
включения disable_data_fsync) на дисках с неотключаемым кэшем. Пример таких
дисков - Intel Optane. При этом также стоит иметь в виду, что *некоторые*
настольные SSD (например, HP EX950) игнорируют флаг O_SYNC, делая отключение
fsync небезопасным даже с режимом "directsync".
- name: meta_io
type: string
default: direct
Читать и записывать *данные* через системный кэш Linux (page cache), то есть,
использовать для данных файловый дескриптор, открытый без флага O_DIRECT, но
с флагом O_SYNC. Может улучшить скорость чтения для относительно медленных
дисков - HDD и, возможно, SATA SSD. Не рекомендуется для потребительских
SSD без конденсаторов, так как O_SYNC сбрасывает кэш диска при каждой записи.
- name: cached_io_meta
type: bool
default: false
info: |
I/O mode for *metadata*. One of "direct", "cached" or "directsync".
Read and write *metadata* through Linux page cache. May improve read
performance only if your drives are relatively slow (HDD, SATA SSD), and
only if checksums are enabled and [inmemory_metadata](#inmemory_metadata)
is disabled, because in this case metadata blocks are read from disk
on every read request to verify checksums and caching them may reduce this
extra read load.
"cached" may improve read performance, but only under the following conditions:
1. your drives are relatively slow (HDD, SATA SSD), and
2. checksums are enabled, and
3. [inmemory_metadata](#inmemory_metadata) is disabled.
Under all these conditions, metadata blocks are read from disk on every
read request to verify checksums and caching them may reduce this extra
read load. Without (3) metadata is never read from the disk after starting,
and without (2) metadata blocks are read from disk only during journal
Absolutely pointless to enable with enabled inmemory_metadata because all
metadata is kept in memory anyway, and likely pointless without checksums,
because in that case, metadata blocks are read from disk only during journal
flushing.
"directsync" is the same as above.
If the same device is used for data and metadata, meta_io by default is set
to the same value as [data_io](#data_io).
If the same device is used for data and metadata, enabling [cached_io_data](#cached_io_data)
also enables this parameter, given that it isn't turned off explicitly.
info_ru: |
Режим ввода-вывода для *метаданных*. Одно из значений "direct", "cached" или
"directsync".
"cached" может улучшить скорость чтения, если:
1. у вас медленные диски (HDD, SATA SSD)
2. контрольные суммы включены
3. параметр [inmemory_metadata](#inmemory_metadata) отключён.
При этих условиях блоки метаданных читаются с диска при каждом запросе чтения
Читать и записывать *метаданные* через системный кэш Linux. Может улучшить
скорость чтения, если у вас медленные диски, и только если контрольные суммы
включены, а параметр [inmemory_metadata](#inmemory_metadata) отключён, так
как в этом случае блоки метаданных читаются с диска при каждом запросе чтения
для проверки контрольных сумм и их кэширование может снизить дополнительную
нагрузку на диск. Без (3) метаданные никогда не читаются с диска после
запуска OSD, а без (2) блоки метаданных читаются только при сбросе журнала.
нагрузку на диск.
Если одно и то же устройство используется для данных и метаданных, режим
ввода-вывода метаданных по умолчанию устанавливается равным [data_io](#data_io).
- name: journal_io
type: string
default: direct
Абсолютно бессмысленно включать данный параметр, если параметр
inmemory_metadata включён (по умолчанию это так), и также вероятно
бессмысленно включать его, если не включены контрольные суммы, так как в
этом случае блоки метаданных читаются с диска только во время сброса
журнала.
Если одно и то же устройство используется для данных и метаданных, включение
[cached_io_data](#cached_io_data) также включает данный параметр, при
условии, что он не отключён явным образом.
- name: cached_io_journal
type: bool
default: false
info: |
I/O mode for *journal*. One of "direct", "cached" or "directsync".
Read and write *journal* through Linux page cache. May improve read
performance if [inmemory_journal](#inmemory_journal) is turned off.
Here, "cached" may only improve read performance for recent writes and
only if [inmemory_journal](#inmemory_journal) is turned off.
If the same device is used for metadata and journal, journal_io by default
is set to the same value as [meta_io](#meta_io).
If the same device is used for metadata and journal, enabling [cached_io_meta](#cached_io_meta)
also enables this parameter, given that it isn't turned off explicitly.
info_ru: |
Режим ввода-вывода для *журнала*. Одно из значений "direct", "cached" или
"directsync".
Здесь "cached" может улучшить скорость чтения только недавно записанных
данных и только если параметр [inmemory_journal](#inmemory_journal)
Читать и записывать *журнал* через системный кэш Linux. Может улучшить
скорость чтения, если параметр [inmemory_journal](#inmemory_journal)
отключён.
Если одно и то же устройство используется для метаданных и журнала,
режим ввода-вывода журнала по умолчанию устанавливается равным
[meta_io](#meta_io).
включение [cached_io_meta](#cached_io_meta) также включает данный
параметр, при условии, что он не отключён явным образом.
- name: journal_sector_buffer_count
type: int
default: 32

View File

@@ -17,15 +17,4 @@ and apply all `NNN-*.yaml` manifests to your Kubernetes installation:
for i in ./???-*.yaml; do kubectl apply -f $i; done
```
After that you'll be able to create PersistentVolumes.
## Features
Vitastor CSI supports:
- Kubernetes starting with 1.20 (or 1.17 for older vitastor-csi <= 1.1.0)
- Filesystem RWO (ReadWriteOnce) volumes. Example: [PVC](../../csi/deploy/example-pvc.yaml), [pod](../../csi/deploy/example-test-pod.yaml)
- Raw block RWX (ReadWriteMany) volumes. Example: [PVC](../../csi/deploy/example-pvc-block.yaml), [pod](../../csi/deploy/example-test-pod-block.yaml)
- Volume expansion
- Volume snapshots. Example: [snapshot class](../../csi/deploy/example-snapshot-class.yaml), [snapshot](../../csi/deploy/example-snapshot.yaml), [clone](../../csi/deploy/example-snapshot-clone.yaml)
Remember that to use snapshots with CSI you also have to install [Snapshot Controller and CRDs](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
After that you'll be able to create PersistentVolumes. See example in [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).

View File

@@ -17,15 +17,4 @@
for i in ./???-*.yaml; do kubectl apply -f $i; done
```
После этого вы сможете создавать PersistentVolume.
## Возможности
CSI-плагин Vitastor поддерживает:
- Версии Kubernetes, начиная с 1.20 (или с 1.17 для более старых vitastor-csi <= 1.1.0)
- Файловые RWO (ReadWriteOnce) тома. Пример: [PVC](../../csi/deploy/example-pvc.yaml), [под](../../csi/deploy/example-test-pod.yaml)
- Сырые блочные RWX (ReadWriteMany) тома. Пример: [PVC](../../csi/deploy/example-pvc-block.yaml), [под](../../csi/deploy/example-test-pod-block.yaml)
- Расширение размера томов
- Снимки томов. Пример: [класс снимков](../../csi/deploy/example-snapshot-class.yaml), [снимок](../../csi/deploy/example-snapshot.yaml), [клон снимка](../../csi/deploy/example-snapshot-clone.yaml)
Не забывайте, что для использования снимков нужно сначала установить [контроллер снимков и CRD](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
После этого вы сможете создавать PersistentVolume. Пример смотрите в файле [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).

View File

@@ -31,7 +31,6 @@
- [RDMA/RoCEv2 support via libibverbs](../config/network.en.md#rdma_device)
- [Scrubbing](../config/osd.en.md#auto_scrub) (verification of copies)
- [Checksums](../config/layout-osd.en.md#data_csum_type)
- [Client write-back cache](../config/client.en.md#client_enable_writeback)
## Plugins and tools
@@ -51,15 +50,13 @@
The following features are planned for the future:
- File system
- Control plane optimisation
- Other administrative tools
- Web GUI
- OpenNebula plugin
- iSCSI and NVMeoF gateways
- iSCSI proxy
- Multi-threaded client
- Faster failover
- S3
- Tiered storage (SSD caching)
- NVDIMM support
- Compression (possibly)
- Read caching using system page cache (possibly)

View File

@@ -33,7 +33,6 @@
- [Поддержка RDMA/RoCEv2 через libibverbs](../config/network.ru.md#rdma_device)
- [Фоновая проверка целостности](../config/osd.ru.md#auto_scrub) (сверка копий)
- [Контрольные суммы](../config/layout-osd.ru.md#data_csum_type)
- [Буферизация записи на стороне клиента](../config/client.ru.md#client_enable_writeback)
## Драйверы и инструменты
@@ -51,15 +50,12 @@
## Планы развития
- Файловая система
- Оптимизация слоя управления
- Другие инструменты администрирования
- Web-интерфейс
- Плагин для OpenNebula
- iSCSI и NVMeoF прокси
- iSCSI-прокси
- Многопоточный клиент
- Более быстрое переключение при отказах
- S3
- Поддержка SSD-кэширования (tiered storage)
- Поддержка NVDIMM
- Возможно, сжатие

View File

@@ -34,20 +34,6 @@ qemu-system-x86_64 -enable-kvm -m 1024 \
-vnc 0.0.0.0:0
```
With a separate I/O thread:
```
qemu-system-x86_64 -enable-kvm -m 1024 \
-object iothread,id=vitastor1 \
-blockdev '{"node-name":"drive-virtio-disk0","driver":"vitastor","image":"debian9",
"cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
-device 'virtio-blk-pci,iothread=vitastor1,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,
id=virtio-disk0,bootindex=1,write-cache=off' \
-vnc 0.0.0.0:0
```
You can also specify inode ID, pool and size manually instead of `:image=<IMAGE>` option: `:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
## qemu-img
For qemu-img, you should use `vitastor:etcd_host=<HOST>:image=<IMAGE>` as filename.
@@ -98,75 +84,25 @@ This can be used for backups. Just note that exporting an image that is currentl
is of course unsafe and doesn't produce a consistent result, so only export snapshots if you do this
on a live VM.
## vhost-user-blk
QEMU, starting with 6.0, includes support for attaching disks via a separate
userspace worker process, called `vhost-user-blk`. It usually has slightly (20-30 us)
lower latency.
Example commands to use it with Vitastor:
```
qemu-storage-daemon \
--daemonize \
--blockdev '{"node-name":"drive-virtio-disk1","driver":"vitastor","image":"testosd1","cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
--export type=vhost-user-blk,id=vitastor1,node-name=drive-virtio-disk1,addr.type=unix,addr.path=/run/vitastor1-user-blk.sock,writable=on,num-queues=1
qemu-system-x86_64 -enable-kvm -m 2048 -M accel=kvm,memory-backend=mem \
-object memory-backend-memfd,id=mem,size=2G,share=on \
-chardev socket,id=vitastor1,reconnect=1,path=/run/vitastor1-user-blk.sock \
-device vhost-user-blk-pci,chardev=vitastor1,num-queues=1,config-wce=off \
-vnc 0.0.0.0:0
```
memfd memory-backend is crucial, vhost-user-blk does not work without it.
## VDUSE
Linux kernel, starting with version 5.15, supports a new interface for attaching virtual disks
to the host - VDUSE (vDPA Device in Userspace). QEMU, starting with 7.2, has support for
exporting QEMU block devices over this protocol using qemu-storage-daemon.
VDUSE is currently the best interface to attach Vitastor disks as kernel devices because:
- It avoids data copies and thus achieves much better performance than [NBD](nbd.en.md)
- It doesn't have NBD timeout problem - the device doesn't die if an operation executes for too long
- It doesn't have hung device problem - if the userspace process dies it can be restarted (!)
and block device will continue operation
- It doesn't seem to have the device number limit
VDUSE has the same problem as other FUSE-like interfaces in Linux: if a userspace process hangs,
for example, if it loses connectivity with Vitastor cluster - active processes doing I/O may
hang in the D state (uninterruptible sleep) and you won't be able to kill them even with kill -9.
In this case reboot will be the only way to remove VDUSE devices from system.
Example performance comparison:
| | direct fio | NBD | VDUSE |
|----------------------|-------------|-------------|-------------|
| linear write | 3.85 GB/s | 1.12 GB/s | 3.85 GB/s |
| 4k random write Q128 | 240000 iops | 120000 iops | 178000 iops |
| 4k random write Q1 | 9500 iops | 7620 iops | 7640 iops |
| linear read | 4.3 GB/s | 1.8 GB/s | 2.85 GB/s |
| 4k random read Q128 | 287000 iops | 140000 iops | 189000 iops |
| 4k random read Q1 | 9600 iops | 7640 iops | 7780 iops |
On the other hand, VDUSE is faster than [NBD](nbd.en.md), so you may prefer to use it if
performance is important for you. Approximate performance numbers:
direct fio benchmark - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
To try VDUSE you need at least Linux 5.15, built with VDUSE support
(CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
Debian Linux kernels have these options disabled by now, so if you want to try it on Debian,
use a kernel from Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/), Proxmox,
or build modules for Debian kernel manually:
```
mkdir build
cd build
apt-get install linux-headers-`uname -r`
apt-get build-dep linux-image-`uname -r`-unsigned
apt-get source linux-image-`uname -r`-unsigned
cd linux*/drivers/vdpa
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
cd ../virtio
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
depmod -a
```
You also need `vdpa` tool from the `iproute2` package.
(CONFIG_VIRTIO_VDPA=m and CONFIG_VDPA_USER=m). Debian Linux kernels have these options
disabled by now, so if you want to try it on Debian, use a kernel from Ubuntu
[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) or Proxmox.
Commands to attach Vitastor image as a VDUSE device:
@@ -179,7 +115,7 @@ qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitas
vdpa dev add name test1 mgmtdev vduse
```
After running these commands, `/dev/vda` device will appear in the system and you'll be able to
After running these commands /dev/vda device will appear in the system and you'll be able to
use it as a normal disk.
To remove the device:

View File

@@ -36,18 +36,6 @@ qemu-system-x86_64 -enable-kvm -m 1024 \
-vnc 0.0.0.0:0
```
С отдельным потоком ввода-вывода:
```
qemu-system-x86_64 -enable-kvm -m 1024 \
-object iothread,id=vitastor1 \
-blockdev '{"node-name":"drive-virtio-disk0","driver":"vitastor","image":"debian9",
"cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
-device 'virtio-blk-pci,iothread=vitastor1,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,
id=virtio-disk0,bootindex=1,write-cache=off' \
-vnc 0.0.0.0:0
```
Вместо `:image=<IMAGE>` также можно указывать номер инода, пул и размер: `:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
## qemu-img
@@ -100,76 +88,25 @@ qemu-img rebase -u -b '' testimg.qcow2
в то же время идёт запись, небезопасно - результат чтения не будет целостным. Так что если вы работаете
с активными виртуальными машинами, экспортируйте только их снимки, но не сам образ.
## vhost-user-blk
QEMU, начиная с 6.0, позволяет подключать диски через отдельный рабочий процесс.
Этот метод подключения называется `vhost-user-blk` и обычно имеет чуть меньшую
задержку (ниже на 20-30 микросекунд, чем при обычном методе).
Пример команд для использования vhost-user-blk с Vitastor:
```
qemu-storage-daemon \
--daemonize \
--blockdev '{"node-name":"drive-virtio-disk1","driver":"vitastor","image":"testosd1","cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
--export type=vhost-user-blk,id=vitastor1,node-name=drive-virtio-disk1,addr.type=unix,addr.path=/run/vitastor1-user-blk.sock,writable=on,num-queues=1
qemu-system-x86_64 -enable-kvm -m 2048 -M accel=kvm,memory-backend=mem \
-object memory-backend-memfd,id=mem,size=2G,share=on \
-chardev socket,id=vitastor1,reconnect=1,path=/run/vitastor1-user-blk.sock \
-device vhost-user-blk-pci,chardev=vitastor1,num-queues=1,config-wce=off \
-vnc 0.0.0.0:0
```
Здесь критична опция memory-backend-memfd, vhost-user-blk без неё не работает.
## VDUSE
В Linux, начиная с версии ядра 5.15, доступен новый интерфейс для подключения виртуальных дисков
к системе - VDUSE (vDPA Device in Userspace), а в QEMU, начиная с версии 7.2, есть поддержка
экспорта блочных устройств QEMU по этому протоколу через qemu-storage-daemon.
VDUSE - на данный момент лучший интерфейс для подключения дисков Vitastor в виде блочных
устройств на уровне ядра, ибо:
- VDUSE не копирует данные и поэтому достигает значительно лучшей производительности, чем [NBD](nbd.ru.md)
- Также оно не имеет проблемы NBD-таймаута - устройство не умирает, если операция выполняется слишком долго
- Также оно не имеет проблемы подвисающих устройств - если процесс-обработчик умирает, его можно
перезапустить (!) и блочное устройство продолжит работать
- По-видимому, у него нет предела числа подключаемых в систему устройств
VDUSE страдает общей проблемой FUSE-подобных интерфейсов в Linux: если пользовательский процесс
подвиснет, например, если будет потеряна связь с кластером Vitastor - читающие/пишущие в кластер
процессы могут "залипнуть" в состоянии D (непрерываемый сон) и их будет невозможно убить даже
через kill -9. В этом случае удалить из системы устройство можно только перезагрузившись.
Пример сравнения производительности:
С другой стороны, VDUSE быстрее по сравнению с [NBD](nbd.ru.md), поэтому его может
быть предпочтительно использовать там, где производительность важнее. Порядок показателей:
прямое тестирование через fio - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
| | Прямой fio | NBD | VDUSE |
|--------------------------|-------------|-------------|-------------|
| линейная запись | 3.85 GB/s | 1.12 GB/s | 3.85 GB/s |
| 4k случайная запись Q128 | 240000 iops | 120000 iops | 178000 iops |
| 4k случайная запись Q1 | 9500 iops | 7620 iops | 7640 iops |
| линейное чтение | 4.3 GB/s | 1.8 GB/s | 2.85 GB/s |
| 4k случайное чтение Q128 | 287000 iops | 140000 iops | 189000 iops |
| 4k случайное чтение Q1 | 9600 iops | 7640 iops | 7780 iops |
Чтобы попробовать VDUSE, вам нужно ядро Linux как минимум версии 5.15, собранное с поддержкой
VDUSE (CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
В ядрах в Debian Linux поддержка пока отключена по умолчанию, так что чтобы попробовать VDUSE
на Debian, поставьте ядро из Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/),
из Proxmox или соберите модули для ядра Debian вручную:
```
mkdir build
cd build
apt-get install linux-headers-`uname -r`
apt-get build-dep linux-image-`uname -r`-unsigned
apt-get source linux-image-`uname -r`-unsigned
cd linux*/drivers/vdpa
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
cd ../virtio
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
depmod -a
```
Также вам понадобится консольная утилита `vdpa` из пакета `iproute2`.
Чтобы использовать VDUSE, вам нужно ядро Linux версии хотя бы 5.15, собранное с поддержкой
VDUSE (CONFIG_VIRTIO_VDPA=m и CONFIG_VDPA_USER=m). В ядрах в Debian Linux поддержка пока
отключена - если хотите попробовать эту функцию на Debian, поставьте ядро из Ubuntu
[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) или из Proxmox.
Команды для подключения виртуального диска через VDUSE:
@@ -182,7 +119,7 @@ qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitas
vdpa dev add name test1 mgmtdev vduse
```
После этого в системе появится устройство `/dev/vda`, которое можно будет использовать как
После этого в системе появится устройство /dev/vda, которое можно будет использовать как
обычный диск.
Для удаления устройства из системы:

View File

@@ -3,5 +3,5 @@ SUBSYSTEM=="block", ENV{ID_PART_ENTRY_TYPE}=="e7009fac-a5a1-4d72-af72-53de130599
IMPORT{program}="/usr/bin/vitastor-disk udev $devnode", \
SYMLINK+="vitastor/$env{VITASTOR_ALIAS}"
ENV{VITASTOR_OSD_NUM}!="", ACTION=="add", RUN{program}+="/usr/bin/systemctl enable --now --no-block vitastor-osd@$env{VITASTOR_OSD_NUM}"
ENV{VITASTOR_OSD_NUM}!="", ACTION=="remove", RUN{program}+="/usr/bin/systemctl disable --now --no-block vitastor-osd@$env{VITASTOR_OSD_NUM}"
ENV{VITASTOR_OSD_NUM}!="", ACTION=="add", RUN{program}+="/usr/bin/systemctl enable --now vitastor-osd@$env{VITASTOR_OSD_NUM}"
ENV{VITASTOR_OSD_NUM}!="", ACTION=="remove", RUN{program}+="/usr/bin/systemctl disable --now vitastor-osd@$env{VITASTOR_OSD_NUM}"

View File

@@ -78,15 +78,9 @@ const etcd_tree = {
disk_alignment: 4096,
bitmap_granularity: 4096,
immediate_commit: false, // 'all' or 'small'
// client - configurable online
client_max_dirty_bytes: 33554432,
client_max_dirty_ops: 1024,
client_enable_writeback: false,
client_max_buffered_bytes: 33554432,
client_max_buffered_ops: 1024,
client_max_writeback_iodepth: 256,
// client and osd - configurable online
log_level: 0,
client_dirty_limit: 33554432,
peer_connect_interval: 5, // seconds. min: 1
peer_connect_timeout: 5, // seconds. min: 1
osd_idle_timeout: 5, // seconds. min: 1
@@ -99,7 +93,6 @@ const etcd_tree = {
etcd_ws_keepalive_interval: 30, // seconds
// osd
etcd_report_interval: 5, // seconds
etcd_stats_interval: 30, // seconds
run_primary: true,
osd_network: null, // "192.168.7.0/24" or an array of masks
bind_address: "0.0.0.0",
@@ -397,13 +390,12 @@ class Mon
this.etcd_prefix = this.etcd_prefix.replace(/\/\/+/g, '/').replace(/^\/?(.*[^\/])\/?$/, '/$1');
this.etcd_start_timeout = (config.etcd_start_timeout || 5) * 1000;
this.state = JSON.parse(JSON.stringify(this.constructor.etcd_tree));
this.prev_stats = { osd_stats: {}, osd_diff: {} };
this.signals_set = false;
this.stat_time = Date.now();
this.ws = null;
this.ws_alive = false;
this.ws_keepalive_timer = null;
this.on_stop_cb = () => this.on_stop(0).catch(console.error);
this.recheck_pgs_active = false;
}
parse_etcd_addresses(addrs)
@@ -553,9 +545,9 @@ class Mon
const cur_addr = this.pick_next_etcd();
const base = 'ws'+cur_addr.substr(4);
let now = Date.now();
if (tried[base] && now-tried[base] < this.etcd_start_timeout)
if (tried[base] && now-tried[base] < timeout)
{
await new Promise(ok => setTimeout(ok, this.etcd_start_timeout-(now-tried[base])));
await new Promise(ok => setTimeout(ok, timeout-(now-tried[base])));
now = Date.now();
}
tried[base] = now;
@@ -693,27 +685,8 @@ class Mon
});
}
// Schedule save_last_clean() to to run after a small timeout (1s) (to not spam etcd)
schedule_save_last_clean()
{
if (!this.save_last_clean_timer)
{
this.save_last_clean_timer = setTimeout(() =>
{
this.save_last_clean_timer = null;
this.save_last_clean().catch(this.die);
}, this.config.mon_change_timeout || 1000);
}
}
async save_last_clean()
{
if (this.save_last_clean_running)
{
this.schedule_save_last_clean();
return;
}
this.save_last_clean_running = true;
// last_clean_pgs is used to avoid extra data move when observing a series of changes in the cluster
const new_clean_pgs = { items: {} };
next_pool:
@@ -750,7 +723,6 @@ class Mon
value: b64(JSON.stringify(this.state.history.last_clean_pgs))
} } ],
}, this.etcd_start_timeout, 0);
this.save_last_clean_running = false;
}
get_mon_state()
@@ -1184,33 +1156,6 @@ class Mon
}
}
// Remove OSDs with an incompatible block layout from flat_tree (modified in place),
// then remove every host that has no OSDs left.
// An OSD is removed when its entry in this.state.osd.stats reports:
// - a non-empty bs_block_size different from block_size, or
// - a non-empty bitmap_granularity different from bitmap_granularity, or
// - immediate_commit 'small' while 'all' is requested, or
// - immediate_commit 'none' while anything other than 'none' is requested.
// OSDs without a stats entry are kept.
filter_osds_by_block_layout(flat_tree, block_size, bitmap_granularity, immediate_commit)
{
    const all_stats = this.state.osd.stats;
    for (const host in flat_tree)
    {
        const host_osds = flat_tree[host];
        let kept = 0;
        for (const osd in host_osds)
        {
            const st = all_stats[osd];
            let mismatch = false;
            if (st)
            {
                if (st.bs_block_size && st.bs_block_size != block_size)
                    mismatch = true;
                else if (st.bitmap_granularity && st.bitmap_granularity != bitmap_granularity)
                    mismatch = true;
                else if (st.immediate_commit == 'small' && immediate_commit == 'all')
                    mismatch = true;
                else if (st.immediate_commit == 'none' && immediate_commit != 'none')
                    mismatch = true;
            }
            if (mismatch)
            {
                delete host_osds[osd];
                continue;
            }
            kept++;
        }
        if (kept == 0)
        {
            delete flat_tree[host];
        }
    }
}
get_affinity_osds(pool_cfg, up_osds, osd_tree)
{
let aff_osds = up_osds;
@@ -1224,12 +1169,6 @@ class Mon
async recheck_pgs()
{
if (this.recheck_pgs_active)
{
this.schedule_recheck();
return;
}
this.recheck_pgs_active = true;
// Take configuration and state, check it against the stored configuration hash
// Recalculate PGs and save them to etcd if the configuration is changed
// FIXME: Do not change anything if the distribution is good and random enough and no PGs are degraded
@@ -1251,7 +1190,6 @@ class Mon
// Pool deleted. Delete all PGs, but first stop them.
if (!await this.stop_all_pgs(pool_id))
{
this.recheck_pgs_active = false;
this.schedule_recheck();
return;
}
@@ -1278,12 +1216,6 @@ class Mon
pool_tree = pool_tree ? pool_tree.children : [];
pool_tree = LPOptimizer.flatten_tree(pool_tree, levels, pool_cfg.failure_domain, 'osd');
this.filter_osds_by_tags(osd_tree, pool_tree, pool_cfg.osd_tags);
this.filter_osds_by_block_layout(
pool_tree,
pool_cfg.block_size || this.config.block_size || 131072,
pool_cfg.bitmap_granularity || this.config.bitmap_granularity || 4096,
pool_cfg.immediate_commit || this.config.immediate_commit || 'none'
);
// These are for the purpose of building history.osd_sets
const real_prev_pgs = [];
let pg_history = [];
@@ -1320,16 +1252,9 @@ class Mon
// PG count changed. Need to bring all PGs down.
if (!await this.stop_all_pgs(pool_id))
{
this.recheck_pgs_active = false;
this.schedule_recheck();
return;
}
}
if (prev_pgs.length != pool_cfg.pg_count)
{
// Scale PG count
// Do it even if old_pg_count is already equal to pool_cfg.pg_count,
// because last_clean_pgs may still contain the old number of PGs
const new_pg_history = [];
PGUtil.scale_pg_count(prev_pgs, real_prev_pgs, pg_history, new_pg_history, pool_cfg.pg_count);
pg_history = new_pg_history;
@@ -1431,7 +1356,6 @@ class Mon
await this.save_pg_config(new_config_pgs);
}
}
this.recheck_pgs_active = false;
}
async save_pg_config(new_config_pgs, etcd_request = { compare: [], success: [] })
@@ -1481,6 +1405,7 @@ class Mon
}
// Schedule a recheck to run after a small timeout (1s)
// If already scheduled, cancel previous timer and schedule it again
// This is required for multiple change events to trigger at most 1 recheck in 1s
schedule_recheck()
{
@@ -1494,15 +1419,15 @@ class Mon
}
}
derive_osd_stats(st, prev, prev_diff)
derive_osd_stats(st, prev)
{
const zero_stats = { op: { bps: 0n, iops: 0n, lat: 0n }, subop: { iops: 0n, lat: 0n }, recovery: { bps: 0n, iops: 0n } };
const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
if (!st || !st.time || !prev || !prev.time || prev.time >= st.time)
const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
if (!st || !st.time || prev && (prev.time || this.stat_time/1000) >= st.time)
{
return prev_diff || diff;
return diff;
}
const timediff = BigInt(st.time*1000 - prev.time*1000);
const timediff = BigInt(st.time*1000 - (prev && prev.time*1000 || this.stat_time));
for (const op in st.op_stats||{})
{
const pr = prev && prev.op_stats && prev.op_stats[op];
@@ -1534,47 +1459,25 @@ class Mon
if (n > 0)
diff.recovery_stats[op] = { ...c, bps: b*1000n/timediff, iops: n*1000n/timediff };
}
for (const pool_id in st.inode_stats||{})
{
const pool_diff = diff.inode_stats[pool_id] = {};
for (const inode_num in st.inode_stats[pool_id])
{
const inode_diff = diff.inode_stats[pool_id][inode_num] = {};
for (const op of [ 'read', 'write', 'delete' ])
{
const c = st.inode_stats[pool_id][inode_num][op];
const pr = prev && prev.inode_stats && prev.inode_stats[pool_id] &&
prev.inode_stats[pool_id][inode_num] && prev.inode_stats[pool_id][inode_num][op];
const n = BigInt(c.count||0) - BigInt(pr && pr.count||0);
inode_diff[op] = {
bps: (BigInt(c.bytes||0) - BigInt(pr && pr.bytes||0))*1000n/timediff,
iops: n*1000n/timediff,
lat: (BigInt(c.usec||0) - BigInt(pr && pr.usec||0))/(n || 1n),
};
}
}
}
return diff;
}
sum_op_stats()
sum_op_stats(timestamp, prev_stats)
{
for (const osd in this.state.osd.stats)
{
const cur = { ...this.state.osd.stats[osd], inode_stats: this.state.osd.inodestats[osd]||{} };
this.prev_stats.osd_diff[osd] = this.derive_osd_stats(
cur, this.prev_stats.osd_stats[osd], this.prev_stats.osd_diff[osd]
);
this.prev_stats.osd_stats[osd] = cur;
}
const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
if (!prev_stats || prev_stats.timestamp >= timestamp)
{
return sum_diff;
}
const tm = BigInt(timestamp - (prev_stats.timestamp || 0));
// Sum derived values instead of deriving summed
for (const osd in this.state.osd.stats)
{
const derived = this.prev_stats.osd_diff[osd];
for (const type in sum_diff)
const derived = this.derive_osd_stats(this.state.osd.stats[osd],
this.prev_stats && this.prev_stats.osd_stats && this.prev_stats.osd_stats[osd]);
for (const type in derived)
{
for (const op in derived[type]||{})
for (const op in derived[type])
{
for (const k in derived[type][op])
{
@@ -1631,14 +1534,14 @@ class Mon
return { object_counts, object_bytes };
}
sum_inode_stats()
sum_inode_stats(prev_stats, timestamp, prev_timestamp)
{
const inode_stats = {};
const inode_stub = () => ({
raw_used: 0n,
read: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
write: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
delete: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
read: { count: 0n, usec: 0n, bytes: 0n },
write: { count: 0n, usec: 0n, bytes: 0n },
delete: { count: 0n, usec: 0n, bytes: 0n },
});
const seen_pools = {};
for (const pool_id in this.state.config.pools)
@@ -1690,25 +1593,11 @@ class Mon
}
}
}
for (const osd in this.prev_stats.osd_diff)
if (prev_stats && prev_timestamp >= timestamp)
{
for (const pool_id in this.prev_stats.osd_diff[osd].inode_stats)
{
for (const inode_num in this.prev_stats.osd_diff[osd].inode_stats[pool_id])
{
inode_stats[pool_id][inode_num] = inode_stats[pool_id][inode_num] || inode_stub();
for (const op of [ 'read', 'write', 'delete' ])
{
const op_diff = this.prev_stats.osd_diff[osd].inode_stats[pool_id][inode_num][op] || {};
const op_st = inode_stats[pool_id][inode_num][op];
op_st.bps += op_diff.bps;
op_st.iops += op_diff.iops;
op_st.lat += op_diff.lat;
op_st.n_osd = (op_st.n_osd || 0) + 1;
}
}
}
prev_stats = null;
}
const tm = prev_stats ? BigInt(timestamp - prev_timestamp) : 0;
for (const pool_id in inode_stats)
{
for (const inode_num in inode_stats[pool_id])
@@ -1717,12 +1606,11 @@ class Mon
for (const op of [ 'read', 'write', 'delete' ])
{
const op_st = inode_stats[pool_id][inode_num][op];
if (op_st.n_osd)
{
op_st.lat /= BigInt(op_st.n_osd);
delete op_st.n_osd;
}
if (op_st.bps > 0 || op_st.iops > 0)
const prev_st = prev_stats && prev_stats[pool_id] && prev_stats[pool_id][inode_num] && prev_stats[pool_id][inode_num][op];
op_st.bps = prev_st ? (op_st.bytes - prev_st.bytes) * 1000n / tm : 0;
op_st.iops = prev_st ? (op_st.count - prev_st.count) * 1000n / tm : 0;
op_st.lat = prev_st ? (op_st.usec - prev_st.usec) / ((op_st.count - prev_st.count) || 1n) : 0;
if (op_st.bps > 0 || op_st.iops > 0 || op_st.lat > 0)
nonzero = true;
}
if (!nonzero && (!this.state.config.inode[pool_id] || !this.state.config.inode[pool_id][inode_num]))
@@ -1755,9 +1643,15 @@ class Mon
async update_total_stats()
{
const txn = [];
const timestamp = Date.now();
const { object_counts, object_bytes } = this.sum_object_counts();
let stats = this.sum_op_stats();
let { inode_stats, seen_pools } = this.sum_inode_stats();
let stats = this.sum_op_stats(timestamp, this.prev_stats);
let { inode_stats, seen_pools } = this.sum_inode_stats(
this.prev_stats ? this.prev_stats.inode_stats : null,
timestamp, this.prev_stats ? this.prev_stats.timestamp : null
);
this.prev_stats = { timestamp, inode_stats, osd_stats: { ...this.state.osd.stats } };
this.stat_time = Date.now();
stats.object_counts = object_counts;
stats.object_bytes = object_bytes;
stats = this.serialize_bigints(stats);

View File

@@ -1,6 +1,6 @@
{
"name": "vitastor-mon",
"version": "1.2.0",
"version": "1.0.0",
"description": "Vitastor SDS monitor service",
"main": "mon-main.js",
"scripts": {

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VERSION = '1.2.0'
VERSION = '1.0.0'
LOG = logging.getLogger(__name__)

View File

@@ -24,4 +24,4 @@ rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-1.2.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.2.0$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-1.0.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.0.0$(rpm --eval '%dist').tar.gz *

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-1.2.0.el7.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.0.0.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 1.2.0
Version: 1.0.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.2.0.el7.tar.gz
Source0: vitastor-1.0.0.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-1.2.0.el8.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.0.0.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 1.2.0
Version: 1.0.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.2.0.el8.tar.gz
Source0: vitastor-1.0.0.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -18,7 +18,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-1.2.0.el9.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.0.0.el9.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 1.2.0
Version: 1.0.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.2.0.el9.tar.gz
Source0: vitastor-1.0.0.el9.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -16,11 +16,10 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVERSION="1.2.0")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
add_link_options(-fno-omit-frame-pointer)
add_definitions(-DVERSION="1.0.0")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
if (${WITH_ASAN})
add_definitions(-fsanitize=address)
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
add_link_options(-fsanitize=address -fno-omit-frame-pointer)
endif (${WITH_ASAN})
@@ -138,7 +137,6 @@ endif (${WITH_FIO})
add_library(vitastor_client SHARED
cluster_client.cpp
cluster_client_list.cpp
cluster_client_wb.cpp
vitastor_c.cpp
cli_common.cpp
cli_alloc_osd.cpp
@@ -181,25 +179,6 @@ target_link_libraries(vitastor-nbd
vitastor_client
)
# vitastor-kv
add_executable(vitastor-kv
kv_cli.cpp
kv_db.cpp
kv_db.h
)
target_link_libraries(vitastor-kv
vitastor_client
)
add_executable(vitastor-kv-stress
kv_stress.cpp
kv_db.cpp
kv_db.h
)
target_link_libraries(vitastor-kv-stress
vitastor_client
)
# vitastor-nfs
add_executable(vitastor-nfs
nfs_proxy.cpp
@@ -321,7 +300,7 @@ target_link_libraries(test_crc32
add_executable(test_cluster_client
EXCLUDE_FROM_ALL
test_cluster_client.cpp
pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp cluster_client_wb.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
etcd_state_client.cpp timerfd_manager.cpp str_util.cpp ../json11/json11.cpp
)
target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)

View File

@@ -45,31 +45,13 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
meta_block_size = parse_size(config["meta_block_size"]);
bitmap_granularity = parse_size(config["bitmap_granularity"]);
meta_format = stoull_full(config["meta_format"]);
if (config.find("data_io") == config.end() &&
config.find("meta_io") == config.end() &&
config.find("journal_io") == config.end())
{
bool cached_io_data = config["cached_io_data"] == "true" || config["cached_io_data"] == "yes" || config["cached_io_data"] == "1";
bool cached_io_meta = cached_io_data && (meta_device == data_device || meta_device == "") &&
config.find("cached_io_meta") == config.end() ||
config["cached_io_meta"] == "true" || config["cached_io_meta"] == "yes" || config["cached_io_meta"] == "1";
bool cached_io_journal = cached_io_meta && (journal_device == meta_device || journal_device == "") &&
config.find("cached_io_journal") == config.end() ||
config["cached_io_journal"] == "true" || config["cached_io_journal"] == "yes" || config["cached_io_journal"] == "1";
data_io = cached_io_data ? "cached" : "direct";
meta_io = cached_io_meta ? "cached" : "direct";
journal_io = cached_io_journal ? "cached" : "direct";
}
else
{
data_io = config.find("data_io") != config.end() ? config["data_io"] : "direct";
meta_io = config.find("meta_io") != config.end()
? config["meta_io"]
: (meta_device == data_device || meta_device == "" ? data_io : "direct");
journal_io = config.find("journal_io") != config.end()
? config["journal_io"]
: (journal_device == meta_device || journal_device == "" ? meta_io : "direct");
}
cached_io_data = config["cached_io_data"] == "true" || config["cached_io_data"] == "yes" || config["cached_io_data"] == "1";
cached_io_meta = cached_io_data && (meta_device == data_device || meta_device == "") &&
config.find("cached_io_meta") == config.end() ||
config["cached_io_meta"] == "true" || config["cached_io_meta"] == "yes" || config["cached_io_meta"] == "1";
cached_io_journal = cached_io_meta && (journal_device == meta_device || journal_device == "") &&
config.find("cached_io_journal") == config.end() ||
config["cached_io_journal"] == "true" || config["cached_io_journal"] == "yes" || config["cached_io_journal"] == "1";
if (config["data_csum_type"] == "crc32c")
{
data_csum_type = BLOCKSTORE_CSUM_CRC32C;
@@ -290,19 +272,9 @@ static void check_size(int fd, uint64_t *size, uint64_t *sectsize, std::string n
}
}
// Translate an I/O mode name into open(2) flags:
//   "directsync" -> O_DIRECT|O_SYNC
//   "cached"     -> O_SYNC
//   anything else (including "direct" and "") -> O_DIRECT
static int bs_openmode(const std::string & mode)
{
    const bool want_sync = (mode == "directsync" || mode == "cached");
    // Only the "cached" mode goes without O_DIRECT
    const bool want_direct = (mode != "cached");
    return (want_direct ? O_DIRECT : 0) | (want_sync ? O_SYNC : 0);
}
void blockstore_disk_t::open_data()
{
data_fd = open(data_device.c_str(), bs_openmode(data_io) | O_RDWR);
data_fd = open(data_device.c_str(), (cached_io_data ? O_SYNC : O_DIRECT) | O_RDWR);
if (data_fd == -1)
{
throw std::runtime_error("Failed to open data device "+data_device+": "+std::string(strerror(errno)));
@@ -327,9 +299,9 @@ void blockstore_disk_t::open_data()
void blockstore_disk_t::open_meta()
{
if (meta_device != data_device || meta_io != data_io)
if (meta_device != data_device || cached_io_meta != cached_io_data)
{
meta_fd = open(meta_device.c_str(), bs_openmode(meta_io) | O_RDWR);
meta_fd = open(meta_device.c_str(), (cached_io_meta ? O_SYNC : O_DIRECT) | O_RDWR);
if (meta_fd == -1)
{
throw std::runtime_error("Failed to open metadata device "+meta_device+": "+std::string(strerror(errno)));
@@ -365,9 +337,9 @@ void blockstore_disk_t::open_meta()
void blockstore_disk_t::open_journal()
{
if (journal_device != meta_device || journal_io != meta_io)
if (journal_device != meta_device || cached_io_journal != cached_io_meta)
{
journal_fd = open(journal_device.c_str(), bs_openmode(journal_io) | O_RDWR);
journal_fd = open(journal_device.c_str(), (cached_io_journal ? O_SYNC : O_DIRECT) | O_RDWR);
if (journal_fd == -1)
{
throw std::runtime_error("Failed to open journal device "+journal_device+": "+std::string(strerror(errno)));

View File

@@ -31,9 +31,8 @@ struct blockstore_disk_t
uint32_t csum_block_size = 4096;
// By default, Blockstore locks all opened devices exclusively. This option can be used to disable locking
bool disable_flock = false;
// I/O modes for data, metadata and journal: direct or "" = O_DIRECT, cached = O_SYNC, directsync = O_DIRECT|O_SYNC
// O_SYNC without O_DIRECT = use Linux page cache for reads and writes
std::string data_io, meta_io, journal_io;
// Use Linux page cache for reads and writes, i.e. open FDs with O_SYNC instead of O_DIRECT
bool cached_io_data = false, cached_io_meta = false, cached_io_journal = false;
int meta_fd = -1, data_fd = -1, journal_fd = -1;
uint64_t meta_offset, meta_device_sect, meta_device_size, meta_len, meta_format = 0;

View File

@@ -1372,8 +1372,7 @@ bool journal_flusher_co::trim_journal(int wait_base)
? (uint32_t)JE_START_V1_SIZE : (uint32_t)JE_START_V2_SIZE),
.reserved = 0,
.journal_start = new_trim_pos,
.version = (uint64_t)(!bs->dsk.data_csum_type && ((journal_entry_start*)flusher->journal_superblock)->version == JOURNAL_VERSION_V1
? JOURNAL_VERSION_V1 : JOURNAL_VERSION_V2),
.version = JOURNAL_VERSION_V2,
.data_csum_type = bs->dsk.data_csum_type,
.csum_block_size = bs->dsk.csum_block_size,
};

View File

@@ -384,10 +384,6 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
return;
}
if (op->opcode == BS_OP_SYNC)
{
unsynced_queued_ops = 0;
}
init_op(op);
submit_queue.push_back(op);
ringloop->wakeup();

View File

@@ -262,8 +262,6 @@ class blockstore_impl_t
int throttle_target_parallelism = 1;
// Minimum difference in microseconds between target and real execution times to throttle the response
int throttle_threshold_us = 50;
// Maximum writes between automatically added fsync operations
uint64_t autosync_writes = 128;
/******* END OF OPTIONS *******/
struct ring_consumer_t ring_consumer;
@@ -274,8 +272,7 @@ class blockstore_impl_t
blockstore_dirty_db_t dirty_db;
std::vector<blockstore_op_t*> submit_queue;
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
int unsynced_big_write_count = 0, unstable_unsynced = 0;
int unsynced_queued_ops = 0;
int unsynced_big_write_count = 0;
allocator *data_alloc = NULL;
uint8_t *zero_object;

View File

@@ -553,7 +553,7 @@ resume_1:
}
if (je_start->size == JE_START_V0_SIZE ||
(je_start->version != JOURNAL_VERSION_V1 || je_start->size != JE_START_V1_SIZE) &&
(je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE && je_start->size != JE_START_V1_SIZE))
(je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE))
{
fprintf(
stderr, "The code only supports journal versions 2 and 1, but it is %lu on disk."
@@ -562,8 +562,7 @@ resume_1:
);
exit(1);
}
if (je_start->version == JOURNAL_VERSION_V1 ||
je_start->version == JOURNAL_VERSION_V2 && je_start->size == JE_START_V1_SIZE)
if (je_start->version == JOURNAL_VERSION_V1)
{
je_start->data_csum_type = 0;
je_start->csum_block_size = 0;

View File

@@ -145,7 +145,6 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
journal.sector_info[journal.cur_sector].offset = journal.next_free;
journal.in_sector_pos = 0;
journal.next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
assert(journal.next_free != journal.used_start);
memset(journal.inmemory
? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
: (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector, 0, journal.block_size);

View File

@@ -13,6 +13,12 @@
#define JOURNAL_BUFFER_SIZE 4*1024*1024
#define JOURNAL_ENTRY_HEADER_SIZE 16
// We reserve some extra space for future stabilize requests during writes
// FIXME: This value should be dynamic i.e. Blockstore ideally shouldn't allow
// writing more than can be stabilized afterwards
#define JOURNAL_STABILIZE_RESERVATION 65536
#define JOURNAL_INSTANT_RESERVATION 131072
// Journal entries
// Journal entries are linked to each other by their crc32 value
// The journal is almost a blockchain, because object versions constantly increase

View File

@@ -19,10 +19,6 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
if (config.find("autosync_writes") != config.end())
{
autosync_writes = strtoull(config["autosync_writes"].c_str(), NULL, 10);
}
if (!max_flusher_count)
{
max_flusher_count = 256;

View File

@@ -86,15 +86,14 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
auto & dirty_entry = dirty_db.at(sbw);
uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
left == 0 ? JOURNAL_STABILIZE_RESERVATION : 0))
{
return 0;
}
}
}
else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
{
return 0;
}
@@ -185,11 +184,6 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
{
mark_stable(dirty_it->first);
}
else
{
unstable_unsynced--;
assert(unstable_unsynced >= 0);
}
dirty_it++;
while (dirty_it != dirty_db.end() && dirty_it->first.oid == it->oid)
{
@@ -220,11 +214,6 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
{
mark_stable(*it);
}
else
{
unstable_unsynced--;
assert(unstable_unsynced >= 0);
}
}
}
op->retval = 0;

View File

@@ -21,7 +21,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
dyn = calloc_or_die(1, dyn_size+sizeof(int));
*((int*)dyn) = 1;
}
uint8_t *dyn_ptr = (alloc_dyn_data ? (uint8_t*)dyn+sizeof(int) : (uint8_t*)&dyn);
uint8_t *dyn_ptr = (uint8_t*)(alloc_dyn_data ? dyn+sizeof(int) : &dyn);
uint64_t version = 1;
if (dirty_db.size() > 0)
{
@@ -127,9 +127,8 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
return false;
}
}
bool imm = (op->len < dsk.data_block_size ? (immediate_commit != IMMEDIATE_NONE) : (immediate_commit == IMMEDIATE_ALL));
if (wait_big && !is_del && !deleted && op->len < dsk.data_block_size && !imm ||
!imm && unsynced_queued_ops >= autosync_writes)
if (wait_big && !is_del && !deleted && op->len < dsk.data_block_size &&
immediate_commit != IMMEDIATE_ALL)
{
// Issue an additional sync so that the previous big write can reach the journal
blockstore_op_t *sync_op = new blockstore_op_t;
@@ -140,8 +139,6 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
};
enqueue_op(sync_op);
}
else if (!imm)
unsynced_queued_ops++;
#ifdef BLOCKSTORE_DEBUG
if (is_del)
printf("Delete %lx:%lx v%lu\n", op->oid.inode, op->oid.stripe, op->version);
@@ -320,7 +317,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, unsynced_big_write_count + 1,
sizeof(journal_entry_big_write) + dsk.clean_dyn_size,
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
(dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION))
{
return 0;
}
@@ -386,10 +383,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
sqe, dsk.data_fd, PRIV(op)->iov_zerofill, vcnt, dsk.data_offset + (loc << dsk.block_order) + op->offset - stripe_offset
);
PRIV(op)->pending_ops = 1;
if (immediate_commit != IMMEDIATE_ALL && !(dirty_it->second.state & BS_ST_INSTANT))
{
unstable_unsynced++;
}
if (immediate_commit != IMMEDIATE_ALL)
{
// Increase the counter, but don't save into unsynced_writes yet (can't sync until the write is finished)
@@ -412,7 +405,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
sizeof(journal_entry_big_write) + dsk.clean_dyn_size, 0)
|| !space_check.check_available(op, 1,
sizeof(journal_entry_small_write) + dyn_size,
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
op->len + ((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
{
return 0;
}
@@ -503,11 +496,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
if (journal.next_free >= journal.len)
{
journal.next_free = dsk.journal_block_size;
assert(journal.next_free != journal.used_start);
}
if (immediate_commit == IMMEDIATE_NONE && !(dirty_it->second.state & BS_ST_INSTANT))
{
unstable_unsynced++;
}
if (!PRIV(op)->pending_ops)
{
@@ -547,7 +535,7 @@ resume_2:
uint64_t dyn_size = dsk.dirty_dyn_size(op->offset, op->len);
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
{
return 0;
}
@@ -591,20 +579,14 @@ resume_4:
#endif
bool is_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE;
bool imm = is_big ? (immediate_commit == IMMEDIATE_ALL) : (immediate_commit != IMMEDIATE_NONE);
bool is_instant = ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT));
if (imm)
{
auto & unstab = unstable_writes[op->oid];
unstab = unstab < op->version ? op->version : unstab;
}
else if (!is_instant)
{
unstable_unsynced--;
assert(unstable_unsynced >= 0);
}
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK)
| (imm ? BS_ST_SYNCED : BS_ST_WRITTEN);
if (imm && is_instant)
if (imm && ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT)))
{
// Deletions and 'instant' operations are treated as immediately stable
mark_stable(dirty_it->first);
@@ -750,7 +732,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
});
assert(dirty_it != dirty_db.end());
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, 1, sizeof(journal_entry_del), (unstable_writes.size()+unstable_unsynced)*journal.block_size))
if (!space_check.check_available(op, 1, sizeof(journal_entry_del), JOURNAL_INSTANT_RESERVATION))
{
return 0;
}

View File

@@ -17,7 +17,7 @@
static const char *exe_name = NULL;
static const char* help_text =
"Vitastor command-line tool " VERSION "\n"
"Vitastor command-line tool\n"
"(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
"\n"
"COMMANDS:\n"
@@ -331,7 +331,7 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
{
// Create client
json11::Json cfg_j = cfg;
p->ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
p->ringloop = new ring_loop_t(512);
p->epmgr = new epoll_manager_t(p->ringloop);
p->cli = new cluster_client_t(p->ringloop, p->epmgr->tfd, cfg_j);
// Smaller timeout by default for more interactiveness
@@ -349,7 +349,6 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
p->ringloop->wait();
}
// Destroy the client
p->cli->flush();
delete p->cli;
delete p->epmgr;
delete p->ringloop;

View File

@@ -109,7 +109,7 @@ resume_1:
}
for (auto pg_per_pair: pg_per_osd)
{
uint64_t pg_free = osd_free[pg_per_pair.first] * pool_cfg.real_pg_count / pg_per_pair.second;
uint64_t pg_free = osd_free[pg_per_pair.first] * pool_cfg.pg_count / pg_per_pair.second;
if (pool_avail > pg_free)
{
pool_avail = pg_free;
@@ -124,10 +124,8 @@ resume_1:
pool_avail *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
}
pool_stats[pool_cfg.id] = json11::Json::object {
{ "id", (uint64_t)pool_cfg.id },
{ "name", pool_cfg.name },
{ "pg_count", pool_cfg.pg_count },
{ "real_pg_count", pool_cfg.real_pg_count },
{ "scheme", pool_cfg.scheme == POOL_SCHEME_REPLICATED ? "replicated" : "ec" },
{ "scheme_name", pool_cfg.scheme == POOL_SCHEME_REPLICATED
? std::to_string(pool_cfg.pg_size)+"/"+std::to_string(pool_cfg.pg_minsize)
@@ -178,7 +176,7 @@ resume_1:
{ "title", "SCHEME" },
});
cols.push_back(json11::Json::object{
{ "key", "pg_count_fmt" },
{ "key", "pg_count" },
{ "title", "PGS" },
});
cols.push_back(json11::Json::object{
@@ -207,9 +205,6 @@ resume_1:
double raw_to = kv.second["raw_to_usable"].number_value();
if (raw_to < 0.000001 && raw_to > -0.000001)
raw_to = 1;
kv.second["pg_count_fmt"] = kv.second["real_pg_count"] == kv.second["pg_count"]
? kv.second["real_pg_count"].as_string()
: kv.second["real_pg_count"].as_string()+"->"+kv.second["pg_count"].as_string();
kv.second["total_fmt"] = format_size(kv.second["total_raw"].uint64_value() / raw_to);
kv.second["used_fmt"] = format_size(kv.second["used_raw"].uint64_value() / raw_to);
kv.second["max_avail_fmt"] = format_size(kv.second["max_available"].uint64_value());

View File

@@ -53,7 +53,6 @@ struct snap_merger_t
std::map<inode_t, std::vector<uint64_t>> layer_lists;
std::map<inode_t, uint64_t> layer_block_size;
std::map<inode_t, uint64_t> layer_list_pos;
std::vector<snap_rw_op_t*> continue_rwo, continue_rwo2;
int in_flight = 0;
uint64_t last_fsync_offset = 0;
uint64_t last_written_offset = 0;
@@ -305,12 +304,6 @@ struct snap_merger_t
oit = merge_offsets.begin();
resume_5:
// Now read, overwrite and optionally delete offsets one by one
continue_rwo2.swap(continue_rwo);
for (auto rwo: continue_rwo2)
{
next_write(rwo);
}
continue_rwo2.clear();
while (in_flight < parent->iodepth*parent->parallel_osds &&
oit != merge_offsets.end() && !rwo_error.size())
{
@@ -471,8 +464,7 @@ struct snap_merger_t
rwo->error_offset = op->offset;
rwo->error_read = true;
}
continue_rwo.push_back(rwo);
parent->ringloop->wakeup();
next_write(rwo);
};
parent->cli->execute(op);
}
@@ -552,9 +544,11 @@ struct snap_merger_t
}
// Increment CAS version
rwo->op.version = subop->version;
if (use_cas)
next_write(rwo);
else
autofree_op(rwo);
delete subop;
continue_rwo.push_back(rwo);
parent->ringloop->wakeup();
};
parent->cli->execute(subop);
}

View File

@@ -158,7 +158,12 @@ resume_2:
for (auto & pool_pair: parent->cli->st_cli.pool_config)
{
auto & pool_cfg = pool_pair.second;
bool active = pool_cfg.real_pg_count > 0;
bool active = true;
if (pool_cfg.pg_config.size() != pool_cfg.pg_count)
{
active = false;
pgs_by_state["offline"] += pool_cfg.pg_count-pool_cfg.pg_config.size();
}
pool_count++;
for (auto pg_it = pool_cfg.pg_config.begin(); pg_it != pool_cfg.pg_config.end(); pg_it++)
{

View File

@@ -3,13 +3,21 @@
#include <stdexcept>
#include <assert.h>
#include "cluster_client_impl.h"
#include "http_client.h" // json_is_true
#include "cluster_client.h"
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json config)
#define SCRAP_BUFFER_SIZE 4*1024*1024
#define PART_SENT 1
#define PART_DONE 2
#define PART_ERROR 4
#define PART_RETRY 8
#define CACHE_DIRTY 1
#define CACHE_FLUSHING 2
#define CACHE_REPEATING 3
#define OP_FLUSH_BUFFER 0x02
#define OP_IMMEDIATE_COMMIT 0x04
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
{
wb = new writeback_cache_t();
cli_config = config.object_items();
file_config = osd_messenger_t::read_config(config);
config = osd_messenger_t::merge_configs(cli_config, file_config, etcd_global_config, {});
@@ -29,14 +37,20 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
continue_lists();
continue_raw_ops(peer_osd);
}
else
else if (dirty_buffers.size())
{
// peer_osd just dropped connection
// determine WHICH dirty_buffers are now obsolete and repeat them
if (wb->repeat_ops_for(this, peer_osd) > 0)
for (auto & wr: dirty_buffers)
{
continue_ops();
if (affects_osd(wr.first.inode, wr.first.stripe, wr.second.len, peer_osd) &&
wr.second.state != CACHE_REPEATING)
{
// FIXME: Flush in larger parts
flush_buffer(wr.first, &wr.second);
}
}
continue_ops();
}
};
msgr.exec_op = [this](osd_op_t *op)
@@ -64,14 +78,16 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
cluster_client_t::~cluster_client_t()
{
msgr.repeer_pgs = [](osd_num_t){};
for (auto bp: dirty_buffers)
{
free(bp.second.buf);
}
dirty_buffers.clear();
if (ringloop)
{
ringloop->unregister_consumer(&consumer);
}
free(scrap_buffer);
delete wb;
wb = NULL;
}
cluster_op_t::~cluster_op_t()
@@ -120,19 +136,6 @@ void cluster_client_t::init_msgr()
}
}
// Insert <op> at the very beginning of the operation queue and then call
// inc_wait() with +1 so that the operations queued after it can take the
// new predecessor into account in their prev_wait counters.
void cluster_client_t::unshift_op(cluster_op_t *op)
{
    cluster_op_t *old_head = op_queue_head;
    op->next = old_head;
    if (!old_head)
    {
        // The queue was empty: the new operation is both its head and its tail
        op_queue_head = op;
        op_queue_tail = op;
    }
    else
    {
        // Link the previous head behind the new operation
        old_head->prev = op;
        op_queue_head = op;
    }
    inc_wait(op->opcode, op->flags, op->next, 1);
}
void cluster_client_t::calc_wait(cluster_op_t *op)
{
op->prev_wait = 0;
@@ -153,7 +156,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
{
for (auto prev = op->prev; prev; prev = prev->prev)
{
if (prev->opcode == OSD_OP_SYNC || prev->opcode == OSD_OP_WRITE && (!(prev->flags & OP_IMMEDIATE_COMMIT) || enable_writeback))
if (prev->opcode == OSD_OP_SYNC || prev->opcode == OSD_OP_WRITE && !(prev->flags & OP_IMMEDIATE_COMMIT))
{
op->prev_wait++;
}
@@ -163,58 +166,68 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
}
else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) */
{
continue_rw(op);
for (auto prev = op_queue_head; prev && prev != op; prev = prev->next)
{
if (prev->opcode == OSD_OP_WRITE && (prev->flags & OP_FLUSH_BUFFER))
{
op->prev_wait++;
}
else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ ||
prev->opcode == OSD_OP_READ_BITMAP || prev->opcode == OSD_OP_READ_CHAIN_BITMAP)
{
// Flushes are always in the beginning (we're scanning from the beginning of the queue)
break;
}
}
if (!op->prev_wait)
continue_rw(op);
}
}
void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *next, int inc)
{
if (opcode != OSD_OP_WRITE && opcode != OSD_OP_SYNC)
if (opcode == OSD_OP_WRITE)
{
return;
}
cluster_op_t *bh_ops_local[32], **bh_ops = bh_ops_local;
int bh_op_count = 0, bh_op_max = 32;
while (next)
{
auto n2 = next->next;
if (opcode == OSD_OP_WRITE
? (next->opcode == OSD_OP_SYNC && (!(flags & OP_IMMEDIATE_COMMIT) || enable_writeback) ||
next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER))
: (next->opcode == OSD_OP_SYNC || next->opcode == OSD_OP_WRITE))
while (next)
{
next->prev_wait += inc;
assert(next->prev_wait >= 0);
if (!next->prev_wait)
auto n2 = next->next;
if (next->opcode == OSD_OP_SYNC && !(flags & OP_IMMEDIATE_COMMIT) ||
next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER) ||
(next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP ||
next->opcode == OSD_OP_READ_CHAIN_BITMAP) && (flags & OP_FLUSH_BUFFER))
{
// Kind of std::vector with local "small vector optimisation"
if (bh_op_count >= bh_op_max)
next->prev_wait += inc;
assert(next->prev_wait >= 0);
if (!next->prev_wait)
{
bh_op_max *= 2;
cluster_op_t **n = (cluster_op_t**)malloc_or_die(sizeof(cluster_op_t*) * bh_op_max);
memcpy(n, bh_ops, sizeof(cluster_op_t*) * bh_op_count);
if (bh_ops != bh_ops_local)
{
free(bh_ops);
}
bh_ops = n;
if (next->opcode == OSD_OP_SYNC)
continue_sync(next);
else
continue_rw(next);
}
bh_ops[bh_op_count++] = next;
}
next = n2;
}
next = n2;
}
for (int i = 0; i < bh_op_count; i++)
else if (opcode == OSD_OP_SYNC)
{
cluster_op_t *next = bh_ops[i];
if (next->opcode == OSD_OP_SYNC)
continue_sync(next);
else
continue_rw(next);
}
if (bh_ops != bh_ops_local)
{
free(bh_ops);
while (next)
{
auto n2 = next->next;
if (next->opcode == OSD_OP_SYNC || next->opcode == OSD_OP_WRITE)
{
next->prev_wait += inc;
assert(next->prev_wait >= 0);
if (!next->prev_wait)
{
if (next->opcode == OSD_OP_SYNC)
continue_sync(next);
else
continue_rw(next);
}
}
next = n2;
}
}
}
@@ -232,37 +245,13 @@ void cluster_client_t::erase_op(cluster_op_t *op)
op_queue_tail = op->prev;
op->next = op->prev = NULL;
if (flags & OP_FLUSH_BUFFER)
{
// Completed flushes change writeback buffer states,
// so the callback should be run before inc_wait()
// which may continue following SYNCs, but these SYNCs
// should know about the changed buffer state
// This is ugly but this is the way we do it
std::function<void(cluster_op_t*)>(op->callback)(op);
}
if (!(flags & OP_IMMEDIATE_COMMIT) || enable_writeback)
{
if (!(flags & OP_IMMEDIATE_COMMIT))
inc_wait(opcode, flags, next, -1);
}
// Call callback at the end to avoid inconsistencies in prev_wait
// if the callback adds more operations itself
if (!(flags & OP_FLUSH_BUFFER))
{
// Call callback at the end to avoid inconsistencies in prev_wait
// if the callback adds more operations itself
std::function<void(cluster_op_t*)>(op->callback)(op);
}
if (flags & OP_FLUSH_BUFFER)
{
int i = 0;
while (i < wb->writeback_overflow.size() && wb->writebacks_active < client_max_writeback_iodepth)
{
execute_internal(wb->writeback_overflow[i]);
i++;
}
if (i > 0)
{
wb->writeback_overflow.erase(wb->writeback_overflow.begin(), wb->writeback_overflow.begin()+i);
}
}
}
void cluster_client_t::continue_ops(bool up_retry)
@@ -306,7 +295,6 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co
{
this->etcd_global_config = etcd_global_config;
config = osd_messenger_t::merge_configs(cli_config, file_config, etcd_global_config, {});
// client_max_dirty_bytes/client_dirty_limit
if (config.find("client_max_dirty_bytes") != config.end())
{
client_max_dirty_bytes = config["client_max_dirty_bytes"].uint64_value();
@@ -322,34 +310,11 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co
{
client_max_dirty_bytes = DEFAULT_CLIENT_MAX_DIRTY_BYTES;
}
// client_max_dirty_ops
client_max_dirty_ops = config["client_max_dirty_ops"].uint64_value();
if (!client_max_dirty_ops)
{
client_max_dirty_ops = DEFAULT_CLIENT_MAX_DIRTY_OPS;
}
// client_enable_writeback
enable_writeback = json_is_true(config["client_enable_writeback"]) &&
json_is_true(config["client_writeback_allowed"]);
// client_max_buffered_bytes
client_max_buffered_bytes = config["client_max_buffered_bytes"].uint64_value();
if (!client_max_buffered_bytes)
{
client_max_buffered_bytes = DEFAULT_CLIENT_MAX_BUFFERED_BYTES;
}
// client_max_buffered_ops
client_max_buffered_ops = config["client_max_buffered_ops"].uint64_value();
if (!client_max_buffered_ops)
{
client_max_buffered_ops = DEFAULT_CLIENT_MAX_BUFFERED_OPS;
}
// client_max_writeback_iodepth
client_max_writeback_iodepth = config["client_max_writeback_iodepth"].uint64_value();
if (!client_max_writeback_iodepth)
{
client_max_writeback_iodepth = DEFAULT_CLIENT_MAX_WRITEBACK_IODEPTH;
}
// up_wait_retry_interval
up_wait_retry_interval = config["up_wait_retry_interval"].uint64_value();
if (!up_wait_retry_interval)
{
@@ -409,8 +374,6 @@ void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes
bool cluster_client_t::get_immediate_commit(uint64_t inode)
{
if (enable_writeback)
return false;
pool_id_t pool_id = INODE_POOL(inode);
if (!pool_id)
return true;
@@ -445,41 +408,6 @@ void cluster_client_t::on_ready(std::function<void(void)> fn)
}
}
bool cluster_client_t::flush()
{
if (!ringloop)
{
if (wb->writeback_queue.size())
{
wb->start_writebacks(this, 0);
cluster_op_t *sync = new cluster_op_t;
sync->opcode = OSD_OP_SYNC;
sync->callback = [](cluster_op_t *sync)
{
delete sync;
};
execute(sync);
}
return op_queue_head == NULL;
}
bool sync_done = false;
cluster_op_t *sync = new cluster_op_t;
sync->opcode = OSD_OP_SYNC;
sync->callback = [&sync_done](cluster_op_t *sync)
{
delete sync;
sync_done = true;
};
execute(sync);
while (!sync_done)
{
ringloop->loop();
if (!sync_done)
ringloop->wait();
}
return true;
}
/**
* How writes are synced when immediate_commit is false
*
@@ -500,9 +428,6 @@ bool cluster_client_t::flush()
* 3) if yes, send all SYNCs. otherwise, leave current SYNC as is.
* 4) if any of them fail due to disconnected peers, repeat SYNC after repeating all writes
* 5) if any of them fail due to other errors, fail the SYNC operation
*
* If writeback caching is turned on and writeback limit is not exhausted:
* data is just copied and the write is confirmed to the client.
*/
void cluster_client_t::execute(cluster_op_t *op)
{
@@ -518,73 +443,67 @@ void cluster_client_t::execute(cluster_op_t *op)
offline_ops.push_back(op);
return;
}
op->flags = op->flags & OSD_OP_IGNORE_READONLY; // the only allowed flag
execute_internal(op);
}
void cluster_client_t::execute_internal(cluster_op_t *op)
{
op->cur_inode = op->inode;
op->retval = 0;
// check alignment, readonly flag and so on
if (!check_rw(op))
op->flags = op->flags & OSD_OP_IGNORE_READONLY; // single allowed flag
if (op->opcode != OSD_OP_SYNC)
{
return;
}
if (op->opcode == OSD_OP_WRITE && enable_writeback && !(op->flags & OP_FLUSH_BUFFER) &&
!op->version /* no CAS writeback */)
{
if (wb->writebacks_active >= client_max_writeback_iodepth)
pool_id_t pool_id = INODE_POOL(op->cur_inode);
if (!pool_id)
{
// Writeback queue is full, postpone the operation
wb->writeback_overflow.push_back(op);
op->retval = -EINVAL;
std::function<void(cluster_op_t*)>(op->callback)(op);
return;
}
// Just copy and acknowledge the operation
wb->copy_write(op, CACHE_DIRTY);
while (wb->writeback_bytes + op->len > client_max_buffered_bytes || wb->writeback_queue_size > client_max_buffered_ops)
auto pool_it = st_cli.pool_config.find(pool_id);
if (pool_it == st_cli.pool_config.end() || pool_it->second.real_pg_count == 0)
{
// Initiate some writeback (asynchronously)
wb->start_writebacks(this, 1);
// Pools are loaded, but this one is unknown
op->retval = -EINVAL;
std::function<void(cluster_op_t*)>(op->callback)(op);
return;
}
// Check alignment
if (!op->len && (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP || op->opcode == OSD_OP_WRITE) ||
op->offset % pool_it->second.bitmap_granularity || op->len % pool_it->second.bitmap_granularity)
{
op->retval = -EINVAL;
std::function<void(cluster_op_t*)>(op->callback)(op);
return;
}
if (pool_it->second.immediate_commit == IMMEDIATE_ALL)
{
op->flags |= OP_IMMEDIATE_COMMIT;
}
op->retval = op->len;
std::function<void(cluster_op_t*)>(op->callback)(op);
return;
}
if (op->opcode == OSD_OP_WRITE && !(op->flags & OP_IMMEDIATE_COMMIT))
{
if (!(op->flags & OP_FLUSH_BUFFER) && !op->version /* no CAS write-repeat */)
{
wb->copy_write(op, CACHE_WRITTEN);
}
if (dirty_bytes >= client_max_dirty_bytes || dirty_ops >= client_max_dirty_ops)
{
// Push an extra SYNC operation to flush previous writes
cluster_op_t *sync_op = new cluster_op_t;
sync_op->opcode = OSD_OP_SYNC;
sync_op->flags = OP_FLUSH_BUFFER;
sync_op->callback = [](cluster_op_t* sync_op)
{
delete sync_op;
};
execute_internal(sync_op);
sync_op->prev = op_queue_tail;
if (op_queue_tail)
{
op_queue_tail->next = sync_op;
op_queue_tail = sync_op;
}
else
op_queue_tail = op_queue_head = sync_op;
dirty_bytes = 0;
dirty_ops = 0;
calc_wait(sync_op);
}
dirty_bytes += op->len;
dirty_ops++;
}
else if (op->opcode == OSD_OP_SYNC)
{
// Flush the whole write-back queue first
if (!(op->flags & OP_FLUSH_BUFFER) && wb->writeback_overflow.size() > 0)
{
// Writeback queue is full, postpone the operation
wb->writeback_overflow.push_back(op);
return;
}
if (wb->writeback_queue.size())
{
wb->start_writebacks(this, 0);
}
dirty_bytes = 0;
dirty_ops = 0;
}
@@ -596,7 +515,7 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
}
else
op_queue_tail = op_queue_head = op;
if (!(op->flags & OP_IMMEDIATE_COMMIT) || enable_writeback)
if (!(op->flags & OP_IMMEDIATE_COMMIT))
calc_wait(op);
else
{
@@ -607,52 +526,6 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
}
}
// Validate an operation before it is put into the queue.
//
// SYNC operations always pass. For everything else the pool of the inode must
// be known and have PGs, offset and length must be aligned to the pool's
// bitmap granularity, and writes/deletes must not target a read-only inode
// (unless OSD_OP_IGNORE_READONLY is set).
//
// As a side effect OP_IMMEDIATE_COMMIT is added to op->flags when the pool
// uses immediate_commit == IMMEDIATE_ALL.
//
// Returns true when the operation may proceed. Otherwise the operation is
// completed right here (retval = -EINVAL or -EROFS, callback invoked) and
// false is returned.
bool cluster_client_t::check_rw(cluster_op_t *op)
{
    if (op->opcode == OSD_OP_SYNC)
    {
        return true;
    }
    // Completes the operation with error code <err>; the callback is copied
    // into a temporary std::function before the call, exactly as elsewhere in the client
    auto reject = [op](int err)
    {
        op->retval = err;
        std::function<void(cluster_op_t*)>(op->callback)(op);
        return false;
    };
    pool_id_t pool_id = INODE_POOL(op->cur_inode);
    if (!pool_id)
    {
        return reject(-EINVAL);
    }
    auto pool_it = st_cli.pool_config.find(pool_id);
    if (pool_it == st_cli.pool_config.end() || pool_it->second.real_pg_count == 0)
    {
        // Pools are loaded, but this one is unknown
        return reject(-EINVAL);
    }
    auto & pool_cfg = pool_it->second;
    // Check alignment. Reads and writes additionally must not be empty
    const bool needs_length = op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ||
        op->opcode == OSD_OP_READ_CHAIN_BITMAP || op->opcode == OSD_OP_WRITE;
    if ((!op->len && needs_length) ||
        op->offset % pool_cfg.bitmap_granularity || op->len % pool_cfg.bitmap_granularity)
    {
        return reject(-EINVAL);
    }
    if (pool_cfg.immediate_commit == IMMEDIATE_ALL)
    {
        op->flags |= OP_IMMEDIATE_COMMIT;
    }
    // Modifications of read-only inodes are refused unless explicitly allowed
    const bool modifies = op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE;
    if (modifies && !(op->flags & OSD_OP_IGNORE_READONLY))
    {
        auto ino_it = st_cli.inode_config.find(op->inode);
        if (ino_it != st_cli.inode_config.end() && ino_it->second.readonly)
        {
            return reject(-EROFS);
        }
    }
    return true;
}
void cluster_client_t::execute_raw(osd_num_t osd_num, osd_op_t *op)
{
auto fd_it = msgr.osd_peer_fds.find(osd_num);
@@ -670,6 +543,114 @@ void cluster_client_t::execute_raw(osd_num_t osd_num, osd_op_t *op)
}
}
// Copy the data of write operation <op> into <dirty_buffers>.
//
// <dirty_buffers> is keyed by object_id{inode, stripe}, where <stripe> holds the
// byte offset at which the buffer starts. Existing buffers that overlap the written
// range are overwritten in place; the gaps between them are filled with newly
// allocated buffers. Every touched buffer ends up in the CACHE_DIRTY state.
void cluster_client_t::copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers)
{
// Save operation for replay when one of PGs goes out of sync
// (primary OSD drops our connection in this case)
// Find the first buffer starting at or after the write offset...
auto dirty_it = dirty_buffers.lower_bound((object_id){
.inode = op->inode,
.stripe = op->offset,
});
// ...and step back over preceding buffers of the same inode
// as long as they still reach past the beginning of the write
while (dirty_it != dirty_buffers.begin())
{
dirty_it--;
if (dirty_it->first.inode != op->inode ||
(dirty_it->first.stripe + dirty_it->second.len) <= op->offset)
{
dirty_it++;
break;
}
}
// <pos>/<len> is the part of the write that is not copied yet,
// <iov_idx>/<iov_pos> is the current read position inside op->iov
// NOTE(review): pos/len only advance while op->iov has data left. If the iovecs
// hold fewer than op->len bytes, this loop does not look like it terminates -
// presumably callers guarantee that they cover op->len; confirm.
uint64_t pos = op->offset, len = op->len, iov_idx = 0, iov_pos = 0;
while (len > 0)
{
// Size of the gap at <pos> that is not covered by any existing buffer
uint64_t new_len = 0;
if (dirty_it == dirty_buffers.end() || dirty_it->first.inode != op->inode)
{
// No more buffers for this inode: the whole remainder is a gap
new_len = len;
}
else if (dirty_it->first.stripe > pos)
{
// The gap lasts until the next buffer, but not beyond the end of the write
new_len = dirty_it->first.stripe - pos;
if (new_len > len)
{
new_len = len;
}
}
if (new_len > 0)
{
// Allocate a new buffer for the gap; its state is assigned right below
dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
.inode = op->inode,
.stripe = pos,
}, (cluster_buffer_t){
.buf = malloc_or_die(new_len),
.len = new_len,
});
}
// FIXME: Split big buffers into smaller ones on overwrites. But this will require refcounting
dirty_it->second.state = CACHE_DIRTY;
// Number of bytes to copy into the current buffer: from <pos> up to
// its end, limited by the remaining length of the write
uint64_t cur_len = (dirty_it->first.stripe + dirty_it->second.len - pos);
if (cur_len > len)
{
cur_len = len;
}
while (cur_len > 0 && iov_idx < op->iov.count)
{
// Bytes left in the current iovec
unsigned iov_len = (op->iov.buf[iov_idx].iov_len - iov_pos);
if (iov_len <= cur_len)
{
// The rest of the iovec fits into the buffer: consume it completely
memcpy((uint8_t*)dirty_it->second.buf + pos - dirty_it->first.stripe,
(uint8_t*)op->iov.buf[iov_idx].iov_base + iov_pos, iov_len);
pos += iov_len;
len -= iov_len;
cur_len -= iov_len;
iov_pos = 0;
iov_idx++;
}
else
{
// The iovec is larger than the space left in the buffer:
// fill the buffer and remember the position inside the iovec
memcpy((uint8_t*)dirty_it->second.buf + pos - dirty_it->first.stripe,
(uint8_t*)op->iov.buf[iov_idx].iov_base + iov_pos, cur_len);
pos += cur_len;
len -= cur_len;
iov_pos += cur_len;
cur_len = 0;
}
}
// Continue with the next buffer (or with the gap in front of it)
dirty_it++;
}
}
// Repeat the write of cached buffer <wr>, which starts at <oid>.
//
// The buffer is marked CACHE_REPEATING and an internal write operation
// (OP_FLUSH_BUFFER, ignoring the read-only flag of the inode) is created for it.
// The operation is inserted at the head of the queue and started immediately.
// When it completes, the buffer goes back to CACHE_DIRTY unless its state was
// changed in the meantime, and the operation object is deleted.
void cluster_client_t::flush_buffer(const object_id & oid, cluster_buffer_t *wr)
{
    wr->state = CACHE_REPEATING;
    cluster_op_t *repeat_op = new cluster_op_t;
    repeat_op->opcode = OSD_OP_WRITE;
    repeat_op->flags = OSD_OP_IGNORE_READONLY|OP_FLUSH_BUFFER;
    repeat_op->inode = oid.inode;
    repeat_op->cur_inode = repeat_op->inode;
    repeat_op->offset = oid.stripe;
    repeat_op->len = wr->len;
    // The data is not copied - the operation reads directly from the cached buffer
    repeat_op->iov.push_back(wr->buf, wr->len);
    repeat_op->callback = [wr](cluster_op_t *done_op)
    {
        if (wr->state == CACHE_REPEATING)
        {
            wr->state = CACHE_DIRTY;
        }
        delete done_op;
    };
    // Put the operation in front of everything that is already queued
    cluster_op_t *old_head = op_queue_head;
    repeat_op->next = old_head;
    if (!old_head)
    {
        op_queue_head = repeat_op;
        op_queue_tail = repeat_op;
    }
    else
    {
        old_head->prev = repeat_op;
        op_queue_head = repeat_op;
    }
    inc_wait(repeat_op->opcode, repeat_op->flags, repeat_op->next, 1);
    continue_rw(repeat_op);
}
int cluster_client_t::continue_rw(cluster_op_t *op)
{
if (op->state == 0)
@@ -678,7 +659,27 @@ int cluster_client_t::continue_rw(cluster_op_t *op)
goto resume_1;
else if (op->state == 2)
goto resume_2;
else if (op->state == 3)
goto resume_3;
resume_0:
if (op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE)
{
if (!(op->flags & OSD_OP_IGNORE_READONLY))
{
auto ino_it = st_cli.inode_config.find(op->inode);
if (ino_it != st_cli.inode_config.end() && ino_it->second.readonly)
{
op->retval = -EINVAL;
erase_op(op);
return 1;
}
}
if (op->opcode == OSD_OP_WRITE && !(op->flags & OP_IMMEDIATE_COMMIT) && !(op->flags & OP_FLUSH_BUFFER))
{
copy_write(op, dirty_buffers);
}
}
resume_1:
// Slice the operation into parts
slice_rw(op);
op->needs_reslice = false;
@@ -689,9 +690,9 @@ resume_0:
erase_op(op);
return 1;
}
resume_1:
resume_2:
// Send unsent parts, if they're not subject to change
op->state = 2;
op->state = 3;
if (op->needs_reslice)
{
for (int i = 0; i < op->parts.size(); i++)
@@ -701,7 +702,7 @@ resume_1:
op->retval = -EPIPE;
}
}
goto resume_2;
goto resume_3;
}
for (int i = 0; i < op->parts.size(); i++)
{
@@ -722,18 +723,18 @@ resume_1:
});
}
}
op->state = 1;
op->state = 2;
}
}
}
if (op->state == 1)
if (op->state == 2)
{
return 0;
}
resume_2:
resume_3:
if (op->inflight_count > 0)
{
op->state = 2;
op->state = 3;
return 0;
}
if (op->done_count >= op->parts.size())
@@ -761,7 +762,7 @@ resume_2:
op->cur_inode = ino_it->second.parent_id;
op->parts.clear();
op->done_count = 0;
goto resume_0;
goto resume_1;
}
}
op->retval = op->len;
@@ -773,8 +774,7 @@ resume_2:
erase_op(op);
return 1;
}
else if (op->retval != 0 && !(op->flags & OP_FLUSH_BUFFER) &&
op->retval != -EPIPE && op->retval != -EIO && op->retval != -ENOSPC)
else if (op->retval != 0 && op->retval != -EPIPE && op->retval != -EIO && op->retval != -ENOSPC)
{
// Fatal error (neither -EPIPE, -EIO nor -ENOSPC)
// FIXME: Add a parameter to allow to not wait for EIOs (incomplete or corrupted objects) to heal
@@ -789,7 +789,7 @@ resume_2:
{
op->parts.clear();
op->done_count = 0;
goto resume_0;
goto resume_1;
}
else
{
@@ -800,7 +800,7 @@ resume_2:
op->parts[i].flags = PART_RETRY;
}
}
goto resume_1;
goto resume_2;
}
}
return 0;
@@ -874,11 +874,6 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
int iov_idx = 0;
size_t iov_pos = 0;
int i = 0;
// We also have to return reads from CACHE_REPEATING buffers - they are not
// guaranteed to be present on target OSDs at the moment of repeating
// And we're also free to return data from other cached buffers just
// because it's faster
bool dirty_copied = wb->read_from_cache(op, pool_cfg.bitmap_granularity);
for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
{
pg_num_t pg_num = (stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1; // like map_to_pg()
@@ -887,7 +882,7 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
? (stripe + pg_block_size) : (op->offset + op->len);
op->parts[i].iov.reset();
op->parts[i].flags = 0;
if (op->cur_inode != op->inode || op->opcode == OSD_OP_READ && dirty_copied)
if (op->cur_inode != op->inode)
{
// Read remaining parts from upper layers
uint64_t prev = begin, cur = begin;
@@ -1050,7 +1045,13 @@ int cluster_client_t::continue_sync(cluster_op_t *op)
do_it++;
}
// Post sync to affected OSDs
wb->fsync_start();
for (auto & prev_op: dirty_buffers)
{
if (prev_op.second.state == CACHE_DIRTY)
{
prev_op.second.state = CACHE_FLUSHING;
}
}
op->parts.resize(dirty_osds.size());
op->retval = 0;
{
@@ -1075,7 +1076,13 @@ resume_1:
}
if (op->retval != 0)
{
wb->fsync_error();
for (auto uw_it = dirty_buffers.begin(); uw_it != dirty_buffers.end(); uw_it++)
{
if (uw_it->second.state == CACHE_FLUSHING)
{
uw_it->second.state = CACHE_DIRTY;
}
}
if (op->retval == -EPIPE || op->retval == -EIO || op->retval == -ENOSPC)
{
// Retry later
@@ -1089,7 +1096,16 @@ resume_1:
}
else
{
wb->fsync_ok();
for (auto uw_it = dirty_buffers.begin(); uw_it != dirty_buffers.end(); )
{
if (uw_it->second.state == CACHE_FLUSHING)
{
free(uw_it->second.buf);
dirty_buffers.erase(uw_it++);
}
else
uw_it++;
}
}
erase_op(op);
return 1;
@@ -1152,7 +1168,7 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
);
}
else if (log_level > 0)
else
{
fprintf(
stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d)\n",

View File

@@ -8,9 +8,6 @@
#define DEFAULT_CLIENT_MAX_DIRTY_BYTES 32*1024*1024
#define DEFAULT_CLIENT_MAX_DIRTY_OPS 1024
#define DEFAULT_CLIENT_MAX_BUFFERED_BYTES 32*1024*1024
#define DEFAULT_CLIENT_MAX_BUFFERED_OPS 1024
#define DEFAULT_CLIENT_MAX_WRITEBACK_IODEPTH 256
#define INODE_LIST_DONE 1
#define INODE_LIST_HAS_UNSTABLE 2
#define OSD_OP_READ_BITMAP OSD_OP_SEC_READ_BMP
@@ -67,12 +64,17 @@ protected:
cluster_op_t *prev = NULL, *next = NULL;
int prev_wait = 0;
friend class cluster_client_t;
friend class writeback_cache_t;
};
// A chunk of written data kept in cluster_client_t::dirty_buffers
// so that the write can be repeated later
struct cluster_buffer_t
{
// Copy of the written data; allocated with malloc_or_die() and released with free()
void *buf;
// Size of <buf> in bytes
uint64_t len;
// One of CACHE_DIRTY, CACHE_FLUSHING, CACHE_REPEATING
int state;
};
struct inode_list_t;
struct inode_list_osd_t;
class writeback_cache_t;
// FIXME: Split into public and private interfaces
class cluster_client_t
@@ -81,23 +83,16 @@ class cluster_client_t
ring_loop_t *ringloop;
std::map<pool_id_t, uint64_t> pg_counts;
// client_max_dirty_* is actually "max unsynced", for the case when immediate_commit is off
// FIXME: Implement inmemory_commit mode. Note that it requires to return overlapping reads from memory.
uint64_t client_max_dirty_bytes = 0;
uint64_t client_max_dirty_ops = 0;
// writeback improves (1) small consecutive writes and (2) Q1 writes without fsync
bool enable_writeback = false;
// client_max_buffered_* is the real "dirty limit" - maximum amount of writes buffered in memory
uint64_t client_max_buffered_bytes = 0;
uint64_t client_max_buffered_ops = 0;
uint64_t client_max_writeback_iodepth = 0;
int log_level;
int up_wait_retry_interval = 500; // ms
int retry_timeout_id = 0;
std::vector<cluster_op_t*> offline_ops;
cluster_op_t *op_queue_head = NULL, *op_queue_tail = NULL;
writeback_cache_t *wb = NULL;
std::map<object_id, cluster_buffer_t> dirty_buffers;
std::set<osd_num_t> dirty_osds;
uint64_t dirty_bytes = 0, dirty_ops = 0;
@@ -121,16 +116,16 @@ public:
json11::Json::object cli_config, file_config, etcd_global_config;
json11::Json::object config;
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json config);
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
~cluster_client_t();
void execute(cluster_op_t *op);
void execute_raw(osd_num_t osd_num, osd_op_t *op);
bool is_ready();
void on_ready(std::function<void(void)> fn);
bool flush();
bool get_immediate_commit(uint64_t inode);
static void copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers);
void continue_ops(bool up_retry = false);
inode_list_t *list_inode_start(inode_t inode,
std::function<void(inode_list_t* lst, std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)> callback);
@@ -143,14 +138,12 @@ public:
protected:
bool affects_osd(uint64_t inode, uint64_t offset, uint64_t len, osd_num_t osd);
void flush_buffer(const object_id & oid, cluster_buffer_t *wr);
void on_load_config_hook(json11::Json::object & config);
void on_load_pgs_hook(bool success);
void on_change_hook(std::map<std::string, etcd_kv_t> & changes);
void on_change_osd_state_hook(uint64_t peer_osd);
void execute_internal(cluster_op_t *op);
void unshift_op(cluster_op_t *op);
int continue_rw(cluster_op_t *op);
bool check_rw(cluster_op_t *op);
void slice_rw(cluster_op_t *op);
bool try_send(cluster_op_t *op, int i);
int continue_sync(cluster_op_t *op);
@@ -164,6 +157,4 @@ protected:
void continue_listing(inode_list_t *lst);
void send_list(inode_list_osd_t *cur_list);
void continue_raw_ops(osd_num_t peer_osd);
friend class writeback_cache_t;
};

View File

@@ -1,57 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#pragma once
#include "cluster_client.h"
#define SCRAP_BUFFER_SIZE 4*1024*1024
#define PART_SENT 1
#define PART_DONE 2
#define PART_ERROR 4
#define PART_RETRY 8
#define CACHE_DIRTY 1
#define CACHE_WRITTEN 2
#define CACHE_FLUSHING 3
#define CACHE_REPEATING 4
#define OP_FLUSH_BUFFER 0x02
#define OP_IMMEDIATE_COMMIT 0x04
// A chunk of written data kept in writeback_cache_t::dirty_buffers
struct cluster_buffer_t
{
// Start of the data of this chunk. Points into an allocation shared through <refcnt>,
// not necessarily to its beginning (chunks are split on partial overwrites)
uint8_t *buf;
// Size of the chunk in bytes
uint64_t len;
// One of CACHE_DIRTY, CACHE_WRITTEN, CACHE_FLUSHING, CACHE_REPEATING
int state;
// NOTE(review): presumably identifies the flush that covers this chunk
// (see writeback_cache_t::flushed_buffers) - confirm against the implementation
uint64_t flush_id;
// Reference counter shared by all chunks cut from the same allocation. It is placed
// at the very beginning of that allocation, so free(refcnt) releases the data as well
uint64_t *refcnt;
};
typedef std::map<object_id, cluster_buffer_t>::iterator dirty_buf_it_t;
// Client-side cache of written data. It is used both to repeat writes
// and to buffer them for writeback when write buffering is enabled
class writeback_cache_t
{
public:
// Number of bytes held in CACHE_DIRTY buffers; cluster_client_t compares it
// with client_max_buffered_bytes
uint64_t writeback_bytes = 0;
// Counter of consecutive (merged) runs of dirty buffers; cluster_client_t
// compares it with client_max_buffered_ops
int writeback_queue_size = 0;
// cluster_client_t compares this with client_max_writeback_iodepth
// NOTE(review): presumably the number of writeback operations in flight - confirm
int writebacks_active = 0;
// NOTE(review): presumably the last value assigned to cluster_buffer_t::flush_id - confirm
uint64_t last_flush_id = 0;
// Cached chunks, keyed by inode and starting offset (object_id::stripe)
std::map<object_id, cluster_buffer_t> dirty_buffers;
// Operations postponed by cluster_client_t because the writeback queue was full;
// they are executed again when flush operations complete
std::vector<cluster_op_t*> writeback_overflow;
// Positions of dirty buffers waiting for writeback. May contain more entries
// than there are actual consecutive runs (unlike <writeback_queue_size>)
std::vector<object_id> writeback_queue;
std::multimap<uint64_t, uint64_t*> flushed_buffers; // flush_id => refcnt
// Drops the references held by <dirty_buffers> and frees allocations that become unused
~writeback_cache_t();
// Returns the first buffer that may overlap the range starting at <offset> of <inode>
dirty_buf_it_t find_dirty(uint64_t inode, uint64_t offset);
// True if the buffer right before <dirty_it> is CACHE_DIRTY, belongs to the same inode
// and ends exactly where <dirty_it> begins
bool is_left_merged(dirty_buf_it_t dirty_it);
// True if the buffer right after <dirty_it> is CACHE_DIRTY, belongs to the same inode
// and begins exactly where <dirty_it> ends
bool is_right_merged(dirty_buf_it_t dirty_it);
// is_left_merged() || is_right_merged()
bool is_merged(const dirty_buf_it_t & dirty_it);
// Saves the data of write <op> in <dirty_buffers> with the given <state>,
// replacing the overlapping parts of older buffers
void copy_write(cluster_op_t *op, int state);
// Called when <peer_osd> drops the connection; a result greater than zero
// makes the caller run continue_ops()
// NOTE(review): presumably returns the number of repeated writes - confirm
int repeat_ops_for(cluster_client_t *cli, osd_num_t peer_osd);
// Initiates writeback; cluster_client_t passes count=0 to flush the whole queue
void start_writebacks(cluster_client_t *cli, int count);
// The result is used by slice_rw() as "some data was copied from the cache"
bool read_from_cache(cluster_op_t *op, uint32_t bitmap_granularity);
void flush_buffers(cluster_client_t *cli, dirty_buf_it_t from_it, dirty_buf_it_t to_it);
// Called by continue_sync(): before SYNCs are posted to OSDs, ...
void fsync_start();
// ...when the SYNC has failed, ...
void fsync_error();
// ...and when it has succeeded
void fsync_ok();
};

View File

@@ -1,498 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#include <cassert>
#include "cluster_client_impl.h"
// Release every cached chunk. Several chunks may share one allocation,
// so it is freed only when its last reference is dropped.
writeback_cache_t::~writeback_cache_t()
{
    for (auto buf_it = dirty_buffers.begin(); buf_it != dirty_buffers.end(); buf_it++)
    {
        uint64_t *shared_refs = buf_it->second.refcnt;
        (*shared_refs)--;
        if (*shared_refs == 0)
        {
            // refcnt is allocated with the buffer
            free(shared_refs);
        }
    }
    dirty_buffers.clear();
}
// Locate the first cached buffer that may overlap a range of <inode> starting
// at <offset>: either the first buffer starting at or after <offset>, or an
// earlier buffer of the same inode that still extends past <offset>.
// Returns dirty_buffers.end() when there is no such buffer.
dirty_buf_it_t writeback_cache_t::find_dirty(uint64_t inode, uint64_t offset)
{
    auto found_it = dirty_buffers.lower_bound((object_id){
        .inode = inode,
        .stripe = offset,
    });
    // Walk backwards while the preceding buffer still covers <offset>
    while (found_it != dirty_buffers.begin())
    {
        auto before_it = found_it;
        before_it--;
        const bool covers_offset = before_it->first.inode == inode &&
            (before_it->first.stripe + before_it->second.len) > offset;
        if (!covers_offset)
        {
            break;
        }
        found_it = before_it;
    }
    return found_it;
}
// Tell whether <dirty_it> directly continues the previous buffer: that one must
// belong to the same inode, end exactly where <dirty_it> starts and be CACHE_DIRTY.
bool writeback_cache_t::is_left_merged(dirty_buf_it_t dirty_it)
{
    if (dirty_it == dirty_buffers.begin())
    {
        // Nothing on the left at all
        return false;
    }
    auto left_it = dirty_it;
    left_it--;
    return left_it->first.inode == dirty_it->first.inode &&
        left_it->first.stripe+left_it->second.len == dirty_it->first.stripe &&
        left_it->second.state == CACHE_DIRTY;
}
// Tell whether <dirty_it> is directly continued by the next buffer: that one must
// belong to the same inode, start exactly where <dirty_it> ends and be CACHE_DIRTY.
bool writeback_cache_t::is_right_merged(dirty_buf_it_t dirty_it)
{
    auto right_it = dirty_it;
    right_it++;
    if (right_it == dirty_buffers.end())
    {
        // Nothing on the right at all
        return false;
    }
    return right_it->first.inode == dirty_it->first.inode &&
        right_it->first.stripe == dirty_it->first.stripe+dirty_it->second.len &&
        right_it->second.state == CACHE_DIRTY;
}
// Tell whether <dirty_it> has an adjacent CACHE_DIRTY buffer on either side
bool writeback_cache_t::is_merged(const dirty_buf_it_t & dirty_it)
{
    if (is_left_merged(dirty_it))
    {
        return true;
    }
    return is_right_merged(dirty_it);
}
// Store a private copy of a write operation in <dirty_buffers>.
//
// The copy is saved for replay when one of PGs goes out of sync
// (primary OSD drops our connection in this case)
// ...or just for writeback if write buffering is enabled.
//
// op    - the write; op->inode, op->offset, op->len and op->iov are read
// state - initial state of the new buffer. CACHE_DIRTY buffers are accounted
//         in <writeback_bytes>, <writeback_queue_size> and <writeback_queue>
//
// Parts of existing buffers which overlap [op->offset, op->offset+op->len) are
// cut off or removed first, so buffers in the map never overlap each other.
void writeback_cache_t::copy_write(cluster_op_t *op, int state)
{
    if (op->len == 0)
    {
        return;
    }
    auto dirty_it = find_dirty(op->inode, op->offset);
    auto new_end = op->offset + op->len;
    while (dirty_it != dirty_buffers.end() &&
        dirty_it->first.inode == op->inode &&
        dirty_it->first.stripe < op->offset+op->len)
    {
        assert(dirty_it->first.stripe + dirty_it->second.len > op->offset);
        // Remove overlapping part(s) of buffers
        auto old_end = dirty_it->first.stripe + dirty_it->second.len;
        if (dirty_it->first.stripe < op->offset)
        {
            if (old_end > new_end)
            {
                // Split into end and start
                dirty_it->second.len = op->offset - dirty_it->first.stripe;
                dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
                    .inode = op->inode,
                    .stripe = new_end,
                }, (cluster_buffer_t){
                    .buf = dirty_it->second.buf + new_end - dirty_it->first.stripe,
                    .len = old_end - new_end,
                    .state = dirty_it->second.state,
                    .flush_id = dirty_it->second.flush_id,
                    .refcnt = dirty_it->second.refcnt,
                });
                // Both halves now point into the same allocation
                (*dirty_it->second.refcnt)++;
                if (dirty_it->second.state == CACHE_DIRTY)
                {
                    // The whole new write lies inside the old buffer
                    writeback_bytes -= op->len;
                    writeback_queue_size++;
                }
                break;
            }
            else
            {
                // Only leave the beginning
                if (dirty_it->second.state == CACHE_DIRTY)
                {
                    writeback_bytes -= old_end - op->offset;
                    if (is_left_merged(dirty_it) && !is_right_merged(dirty_it))
                    {
                        writeback_queue_size++;
                    }
                }
                dirty_it->second.len = op->offset - dirty_it->first.stripe;
                dirty_it++;
            }
        }
        else if (old_end > new_end)
        {
            // Only leave the end
            if (dirty_it->second.state == CACHE_DIRTY)
            {
                writeback_bytes -= new_end - dirty_it->first.stripe;
                if (!is_left_merged(dirty_it) && is_right_merged(dirty_it))
                {
                    writeback_queue_size++;
                }
            }
            // The key changes, so the entry has to be re-inserted. The reference
            // held by the old entry is handed over to the new one
            auto new_dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
                .inode = op->inode,
                .stripe = new_end,
            }, (cluster_buffer_t){
                .buf = dirty_it->second.buf + new_end - dirty_it->first.stripe,
                .len = old_end - new_end,
                .state = dirty_it->second.state,
                .flush_id = dirty_it->second.flush_id,
                .refcnt = dirty_it->second.refcnt,
            });
            dirty_buffers.erase(dirty_it);
            dirty_it = new_dirty_it;
            break;
        }
        else
        {
            // Remove the whole buffer
            if (dirty_it->second.state == CACHE_DIRTY)
            {
                // Dirty bytes disappear regardless of whether the buffer is merged
                // with its neighbours - every other branch accounts them the same way.
                // Only the number of consecutive runs depends on merging
                writeback_bytes -= dirty_it->second.len;
                if (!is_merged(dirty_it))
                {
                    assert(writeback_queue_size > 0);
                    writeback_queue_size--;
                }
            }
            if (!--(*dirty_it->second.refcnt))
            {
                free(dirty_it->second.refcnt);
            }
            dirty_buffers.erase(dirty_it++);
        }
    }
    // Overlapping buffers are removed, just insert the new one
    // The reference counter and the data live in a single allocation
    uint64_t *refcnt = (uint64_t*)malloc_or_die(sizeof(uint64_t) + op->len);
    uint8_t *buf = (uint8_t*)refcnt + sizeof(uint64_t);
    *refcnt = 1;
    dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
        .inode = op->inode,
        .stripe = op->offset,
    }, (cluster_buffer_t){
        .buf = buf,
        .len = op->len,
        .state = state,
        .refcnt = refcnt,
    });
    if (state == CACHE_DIRTY)
    {
        writeback_bytes += op->len;
        // Track consecutive write-back operations
        if (!is_merged(dirty_it))
        {
            // <writeback_queue> is OK to contain more than actual number of consecutive
            // requests as long as it doesn't miss anything. But <writeback_queue_size>
            // is always calculated correctly.
            writeback_queue_size++;
            writeback_queue.push_back((object_id){
                .inode = op->inode,
                .stripe = op->offset,
            });
        }
    }
    // Gather the payload from the iovecs. Never copy more than op->len bytes
    // in total because that's all the space the buffer has
    uint64_t pos = 0, len = op->len, iov_idx = 0;
    while (len > 0 && iov_idx < op->iov.count)
    {
        auto & iov = op->iov.buf[iov_idx];
        uint64_t part = iov.iov_len < len ? iov.iov_len : len;
        memcpy(buf + pos, iov.iov_base, part);
        pos += part;
        len -= part;
        iov_idx++;
    }
}
// Re-send cached writes which may have been lost because <peer_osd> dropped
// the connection. Buffers are walked in map order and every run of consecutive
// affected buffers of one inode is re-sent with a single flush_buffers() call.
// Returns the number of such runs.
int writeback_cache_t::repeat_ops_for(cluster_client_t *cli, osd_num_t peer_osd)
{
    int resent = 0;
    if (!dirty_buffers.size())
    {
        return resent;
    }
    // <run_begin> is the first buffer of the run being collected,
    // <prev_it> is the buffer visited just before <cur_it>
    auto cur_it = dirty_buffers.begin();
    auto run_begin = cur_it;
    auto prev_it = cur_it;
    while (true)
    {
        const bool at_end = (cur_it == dirty_buffers.end());
        // Buffers which are already being repeated are not touched again
        bool affected = false;
        if (!at_end && cur_it->second.state != CACHE_REPEATING)
        {
            affected = cli->affects_osd(cur_it->first.inode, cur_it->first.stripe, cur_it->second.len, peer_osd);
        }
        if (run_begin != cur_it)
        {
            // The collected run stops here when the current buffer can't continue it
            const bool run_ends = at_end || !affected ||
                cur_it->first.inode != run_begin->first.inode ||
                cur_it->first.stripe != prev_it->first.stripe+prev_it->second.len;
            if (run_ends)
            {
                resent++;
                flush_buffers(cli, run_begin, cur_it);
                run_begin = cur_it;
            }
        }
        if (at_end)
        {
            break;
        }
        prev_it = cur_it;
        cur_it++;
        if (!affected)
        {
            // Skipped buffer - a new run may only start after it
            run_begin = cur_it;
        }
    }
    return resent;
}
// Send the buffers in [from_it, to_it) to the cluster as one OSD_OP_WRITE.
//
// The range must not be empty (to_it is decremented to find the last buffer) and
// the buffers must be adjacent to each other: the sum of their lengths is asserted
// to be equal to the distance between the start of the first and the end of the last one.
// All buffers of the range are switched to CACHE_REPEATING and tagged with a new flush ID.
void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from_it, dirty_buf_it_t to_it)
{
auto prev_it = to_it;
prev_it--;
// Remember the original state before it's overwritten with CACHE_REPEATING below
bool is_writeback = from_it->second.state == CACHE_DIRTY;
cluster_op_t *op = new cluster_op_t;
op->flags = OSD_OP_IGNORE_READONLY|OP_FLUSH_BUFFER;
op->opcode = OSD_OP_WRITE;
op->cur_inode = op->inode = from_it->first.inode;
op->offset = from_it->first.stripe;
op->len = prev_it->first.stripe + prev_it->second.len - from_it->first.stripe;
uint32_t calc_len = 0;
uint64_t flush_id = ++last_flush_id;
for (auto it = from_it; it != to_it; it++)
{
it->second.state = CACHE_REPEATING;
it->second.flush_id = flush_id;
// Take an extra reference for the time of the flush; it's tracked in <flushed_buffers>
// and dropped in the callback, so the data the iovec points to isn't freed
// even if the map entry is removed in the meantime
(*it->second.refcnt)++;
flushed_buffers.emplace(flush_id, it->second.refcnt);
op->iov.push_back(it->second.buf, it->second.len);
calc_len += it->second.len;
}
assert(calc_len == op->len);
writebacks_active++;
op->callback = [this, flush_id](cluster_op_t* op)
{
// Buffer flushes should be always retried, regardless of the error,
// so they should never result in an error here
assert(op->retval == op->len);
// Drop the references taken for this flush
for (auto fl_it = flushed_buffers.find(flush_id);
fl_it != flushed_buffers.end() && fl_it->first == flush_id; )
{
if (!--(*fl_it->second)) // refcnt
{
free(fl_it->second);
}
flushed_buffers.erase(fl_it++);
}
// Mark buffers which still belong to this flush as written. Buffers which were
// replaced or re-flushed in the meantime have another flush_id or state and are skipped
for (auto dirty_it = find_dirty(op->inode, op->offset);
dirty_it != dirty_buffers.end() && dirty_it->first.inode == op->inode &&
dirty_it->first.stripe < op->offset+op->len; dirty_it++)
{
if (dirty_it->second.flush_id == flush_id && dirty_it->second.state == CACHE_REPEATING)
{
dirty_it->second.flush_id = 0;
dirty_it->second.state = CACHE_WRITTEN;
}
}
delete op;
writebacks_active--;
// We can't call execute_internal because it affects an invalid copy of the list here
// (erase_op remembers `next` after writeback callback)
};
if (is_writeback)
{
// Regular writeback of a dirty buffer
cli->execute_internal(op);
}
else
{
// Insert repeated flushes into the beginning
cli->unshift_op(op);
cli->continue_rw(op);
}
}
// Start up to <count> writebacks (all queued ones when <count> is 0).
// Every queue entry is expanded to the whole run of adjacent CACHE_DIRTY buffers
// around it and the run is sent with one flush_buffers() call.
void writeback_cache_t::start_writebacks(cluster_client_t *cli, int count)
{
    if (!writeback_queue.size())
    {
        return;
    }
    // Work on a private copy so entries queued during the loop are kept separately
    std::vector<object_id> pending;
    pending.swap(writeback_queue);
    int started = 0;
    size_t idx = 0;
    for (; idx < pending.size() && (!count || started < count); idx++)
    {
        const object_id & req = pending[idx];
        auto dirty_it = find_dirty(req.inode, req.stripe);
        const bool usable = dirty_it != dirty_buffers.end() &&
            dirty_it->first.inode == req.inode &&
            dirty_it->second.state == CACHE_DIRTY;
        if (!usable)
        {
            // Stale queue entry
            continue;
        }
        // Extend the run to the left while buffers are dirty and adjacent
        auto from_it = dirty_it;
        uint64_t edge = dirty_it->first.stripe;
        while (from_it != dirty_buffers.begin())
        {
            auto candidate = from_it;
            candidate--;
            if (candidate->second.state != CACHE_DIRTY ||
                candidate->first.inode != req.inode ||
                candidate->first.stripe+candidate->second.len != edge)
            {
                break;
            }
            from_it = candidate;
            edge = candidate->first.stripe;
        }
        // Extend the run to the right in the same way
        edge = dirty_it->first.stripe + dirty_it->second.len;
        auto to_it = dirty_it;
        to_it++;
        while (to_it != dirty_buffers.end() &&
            to_it->second.state == CACHE_DIRTY &&
            to_it->first.inode == req.inode &&
            to_it->first.stripe == edge)
        {
            edge = to_it->first.stripe + to_it->second.len;
            to_it++;
        }
        started++;
        assert(writeback_queue_size > 0);
        writeback_queue_size--;
        writeback_bytes -= edge - from_it->first.stripe;
        flush_buffers(cli, from_it, to_it);
    }
    // Put unprocessed entries back, followed by whatever was queued in the meantime
    pending.erase(pending.begin(), pending.begin()+idx);
    if (writeback_queue.size())
    {
        pending.insert(pending.end(), writeback_queue.begin(), writeback_queue.end());
    }
    pending.swap(writeback_queue);
}
// Copy <len> bytes of cached data located at absolute offset <offset> into the
// buffers of a read operation and mark the copied range in the operation's bitmap
// (one bit per <bitmap_granularity> bytes, relative to op->offset).
// Data is only copied for OSD_OP_READ, other opcodes only get the bitmap bits.
static void copy_to_op(cluster_op_t *op, uint64_t offset, uint8_t *buf, uint64_t len, uint32_t bitmap_granularity)
{
    const uint64_t copy_end = offset + len;
    if (op->opcode == OSD_OP_READ)
    {
        // Not OSD_OP_READ_BITMAP or OSD_OP_READ_CHAIN_BITMAP
        int idx = 0;
        // Absolute offset of the beginning of the current iovec
        uint64_t vec_start = op->offset;
        // Skip iovecs which end before the copied range begins
        for (; idx < op->iov.count && vec_start+op->iov.buf[idx].iov_len <= offset; idx++)
        {
            vec_start += op->iov.buf[idx].iov_len;
        }
        // Fill the intersection of the copied range with every following iovec
        for (; idx < op->iov.count && vec_start < copy_end; idx++)
        {
            auto & vec = op->iov.buf[idx];
            const uint64_t vec_end = vec_start + vec.iov_len;
            const uint64_t from = (vec_start < offset ? offset : vec_start);
            const uint64_t to = (vec_end > copy_end ? copy_end : vec_end);
            memcpy((uint8_t*)vec.iov_base + (from - vec_start), buf + (from - offset), to - from);
            vec_start = vec_end;
        }
    }
    // Set bitmap bits
    const int first_bit = (offset-op->offset)/bitmap_granularity;
    const int last_bit = (offset-op->offset+len)/bitmap_granularity;
    uint8_t *bitmap = (uint8_t*)op->bitmap_buf;
    int bit = first_bit;
    while (bit < last_bit)
    {
        if (bit%8 == 0 && bit <= last_bit-8)
        {
            // A whole byte at once
            bitmap[bit/8] = 0xFF;
            bit += 8;
        }
        else
        {
            bitmap[bit/8] |= (1 << (bit%8));
            bit++;
        }
    }
}
// Serve as much of a read operation as possible from cached buffers.
//
// Only ranges whose bit in op->bitmap_buf is still clear are filled; copy_to_op()
// copies the data (for OSD_OP_READ) and sets the bits for copied ranges.
// Returns true if anything was copied.
bool writeback_cache_t::read_from_cache(cluster_op_t *op, uint32_t bitmap_granularity)
{
    bool dirty_copied = false;
    if (dirty_buffers.size() && (op->opcode == OSD_OP_READ ||
        op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP))
    {
        // We also have to return reads from CACHE_REPEATING buffers - they are not
        // guaranteed to be present on target OSDs at the moment of repeating
        // And we're also free to return data from other cached buffers just
        // because it's faster
        auto dirty_it = find_dirty(op->cur_inode, op->offset);
        while (dirty_it != dirty_buffers.end() && dirty_it->first.inode == op->cur_inode &&
            dirty_it->first.stripe < op->offset+op->len)
        {
            // Intersection of the buffer with the requested range
            uint64_t begin = dirty_it->first.stripe, end = dirty_it->first.stripe + dirty_it->second.len;
            if (begin < op->offset)
                begin = op->offset;
            if (end > op->offset+op->len)
                end = op->offset+op->len;
            // [prev, cur) is the current region of equal bitmap bits and
            // <skip_prev> tells whether that region is already filled
            bool skip_prev = true;
            uint64_t cur = begin, prev = begin;
            while (cur < end)
            {
                unsigned bmp_loc = (cur - op->offset)/bitmap_granularity;
                bool skip = (((*((uint8_t*)op->bitmap_buf + bmp_loc/8)) >> (bmp_loc%8)) & 0x1);
                if (skip_prev != skip)
                {
                    // The region which ends here is copied when IT is not filled yet,
                    // the state of the region which begins here doesn't matter
                    if (cur > prev && !skip_prev)
                    {
                        // Copy data
                        dirty_copied = true;
                        copy_to_op(op, prev, dirty_it->second.buf + prev - dirty_it->first.stripe, cur-prev, bitmap_granularity);
                    }
                    skip_prev = skip;
                    prev = cur;
                }
                cur += bitmap_granularity;
            }
            assert(cur > prev);
            if (!skip_prev)
            {
                // Copy data
                dirty_copied = true;
                copy_to_op(op, prev, dirty_it->second.buf + prev - dirty_it->first.stripe, cur-prev, bitmap_granularity);
            }
            dirty_it++;
        }
    }
    return dirty_copied;
}
// A sync begins: every buffer which is written but not synced yet becomes CACHE_FLUSHING
void writeback_cache_t::fsync_start()
{
    for (auto it = dirty_buffers.begin(); it != dirty_buffers.end(); ++it)
    {
        auto & buffer = it->second;
        if (buffer.state != CACHE_WRITTEN)
        {
            continue;
        }
        buffer.state = CACHE_FLUSHING;
    }
}
// The sync failed: return CACHE_FLUSHING buffers back to CACHE_WRITTEN
void writeback_cache_t::fsync_error()
{
    for (auto it = dirty_buffers.begin(); it != dirty_buffers.end(); ++it)
    {
        auto & buffer = it->second;
        if (buffer.state != CACHE_FLUSHING)
        {
            continue;
        }
        buffer.state = CACHE_WRITTEN;
    }
}
// The sync succeeded: CACHE_FLUSHING buffers are not needed anymore, forget them
// and free their memory when the last reference goes away
void writeback_cache_t::fsync_ok()
{
    auto it = dirty_buffers.begin();
    while (it != dirty_buffers.end())
    {
        if (it->second.state != CACHE_FLUSHING)
        {
            it++;
            continue;
        }
        // Step forward first because erase() invalidates the erased iterator
        auto synced = it++;
        if (--(*synced->second.refcnt) == 0)
        {
            free(synced->second.refcnt);
        }
        dirty_buffers.erase(synced);
    }
}

View File

@@ -5,7 +5,7 @@
#include "str_util.h"
static const char *help_text =
"Vitastor disk management tool " VERSION "\n"
"Vitastor disk management tool\n"
"(c) Vitaliy Filippov, 2022+ (VNPL-1.1)\n"
"\n"
"COMMANDS:\n"
@@ -74,7 +74,7 @@ static const char *help_text =
" If it doesn't succeed it issues a warning in the system log.\n"
" \n"
" You can also pass other OSD options here as arguments and they'll be persisted\n"
" in the superblock: data_io, meta_io, journal_io,\n"
" in the superblock: cached_io_data, cached_io_meta, cached_io_journal,\n"
" inmemory_metadata, inmemory_journal, max_write_iodepth,\n"
" min_flusher_count, max_flusher_count, journal_sector_buffer_count,\n"
" journal_no_same_sector_overwrites, throttle_small_writes, throttle_target_iops,\n"
@@ -229,7 +229,7 @@ int main(int argc, char *argv[])
{
self.options["allow_data_loss"] = "1";
}
else if (argv[i][0] == '-' && argv[i][1] == '-' && i < argc-1)
else if (argv[i][0] == '-' && argv[i][1] == '-')
{
char *key = argv[i]+2;
self.options[key] = argv[++i];

View File

@@ -320,7 +320,7 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
if (journal_calc_data_pos != sw.data_offset)
{
printf(json ? ",\"bad_loc\":true,\"calc_loc\":\"0x%lx\""
: " (mismatched, calculated = %08lx)", journal_pos);
: " (mismatched, calculated = %lu)", journal_pos);
}
uint32_t data_csum_size = (!je_start.csum_block_size
? 0

View File

@@ -8,9 +8,9 @@
int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_hdd)
{
static const char *allow_additional_params[] = {
"data_io",
"meta_io",
"journal_io",
"cached_io_data",
"cached_io_meta",
"cached_io_journal",
"max_write_iodepth",
"max_write_iodepth",
"min_flusher_count",
@@ -119,7 +119,7 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
try
{
dsk.parse_config(options);
dsk.data_io = dsk.meta_io = dsk.journal_io = "direct";
dsk.cached_io_data = dsk.cached_io_meta = dsk.cached_io_journal = false;
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
@@ -483,7 +483,7 @@ int disk_tool_t::get_meta_partition(std::vector<vitastor_dev_info_t> & ssds, std
{
blockstore_disk_t dsk;
dsk.parse_config(options);
dsk.data_io = dsk.meta_io = dsk.journal_io = "direct";
dsk.cached_io_data = dsk.cached_io_meta = dsk.cached_io_journal = false;
dsk.open_data();
dsk.open_meta();
dsk.open_journal();

View File

@@ -91,7 +91,7 @@ int disk_tool_t::resize_parse_params()
try
{
dsk.parse_config(options);
dsk.data_io = dsk.meta_io = dsk.journal_io = "direct";
dsk.cached_io_data = dsk.cached_io_meta = dsk.cached_io_journal = false;
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
@@ -245,7 +245,7 @@ int disk_tool_t::resize_copy_data()
{
iodepth = 32;
}
ringloop = new ring_loop_t(iodepth < RINGLOOP_DEFAULT_SIZE ? RINGLOOP_DEFAULT_SIZE : iodepth);
ringloop = new ring_loop_t(iodepth < 512 ? 512 : iodepth);
dsk.data_fd = open(dsk.data_device.c_str(), O_DIRECT|O_RDWR);
if (dsk.data_fd < 0)
{

View File

@@ -23,24 +23,19 @@ epoll_manager_t::epoll_manager_t(ring_loop_t *ringloop)
tfd = new timerfd_manager_t([this](int fd, bool wr, std::function<void(int, int)> handler) { set_fd_handler(fd, wr, handler); });
if (ringloop)
consumer.loop = [this]()
{
consumer.loop = [this]()
{
if (pending)
handle_uring_event();
};
ringloop->register_consumer(&consumer);
handle_uring_event();
}
if (pending)
handle_epoll_events();
};
ringloop->register_consumer(&consumer);
handle_epoll_events();
}
epoll_manager_t::~epoll_manager_t()
{
if (ringloop)
{
ringloop->unregister_consumer(&consumer);
}
ringloop->unregister_consumer(&consumer);
if (tfd)
{
delete tfd;
@@ -49,11 +44,6 @@ epoll_manager_t::~epoll_manager_t()
close(epoll_fd);
}
int epoll_manager_t::get_fd()
{
return epoll_fd;
}
void epoll_manager_t::set_fd_handler(int fd, bool wr, std::function<void(int, int)> handler)
{
if (handler != NULL)
@@ -85,7 +75,7 @@ void epoll_manager_t::set_fd_handler(int fd, bool wr, std::function<void(int, in
}
}
void epoll_manager_t::handle_uring_event()
void epoll_manager_t::handle_epoll_events()
{
io_uring_sqe *sqe = ringloop->get_sqe();
if (!sqe)
@@ -105,20 +95,14 @@ void epoll_manager_t::handle_uring_event()
{
throw std::runtime_error(std::string("epoll failed: ") + strerror(-data->res));
}
handle_uring_event();
handle_epoll_events();
};
ringloop->submit();
handle_events(0);
}
void epoll_manager_t::handle_events(int timeout)
{
int nfds;
epoll_event events[MAX_EPOLL_EVENTS];
do
{
nfds = epoll_wait(epoll_fd, events, MAX_EPOLL_EVENTS, timeout);
timeout = 0;
nfds = epoll_wait(epoll_fd, events, MAX_EPOLL_EVENTS, 0);
for (int i = 0; i < nfds; i++)
{
auto cb_it = epoll_handlers.find(events[i].data.fd);

View File

@@ -15,14 +15,11 @@ class epoll_manager_t
ring_consumer_t consumer;
ring_loop_t *ringloop;
std::map<int, std::function<void(int, int)>> epoll_handlers;
void handle_uring_event();
public:
epoll_manager_t(ring_loop_t *ringloop);
~epoll_manager_t();
int get_fd();
void set_fd_handler(int fd, bool wr, std::function<void(int, int)> handler);
void handle_events(int timeout);
void handle_epoll_events();
timerfd_manager_t *tfd;
};

View File

@@ -24,7 +24,6 @@
#include <netinet/tcp.h>
#include <vector>
#include <string>
#include "vitastor_c.h"
#include "fio_headers.h"
@@ -32,7 +31,6 @@
struct sec_data
{
vitastor_c *cli = NULL;
bool epoll_based = false;
void *watch = NULL;
bool last_sync = false;
/* The list of completed io_u structs. */
@@ -59,7 +57,6 @@ struct sec_options
int rdma_port_num = 0;
int rdma_gid_index = 0;
int rdma_mtu = 0;
int no_io_uring = 0;
};
static struct fio_option options[] = {
@@ -195,16 +192,6 @@ static struct fio_option options[] = {
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{
.name = "no_io_uring",
.lname = "Disable io_uring",
.type = FIO_OPT_BOOL,
.off1 = offsetof(struct sec_options, no_io_uring),
.help = "Use epoll and plain sendmsg/recvmsg instead of io_uring (slower)",
.def = "0",
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{
.name = NULL,
},
@@ -216,15 +203,6 @@ static void watch_callback(void *opaque, long watch)
bsd->watch = (void*)watch;
}
static void opt_push(std::vector<char *> & options, const char *opt, const char *value)
{
if (value)
{
options.push_back(strdup(opt));
options.push_back(strdup(value));
}
}
static int sec_setup(struct thread_data *td)
{
sec_options *o = (sec_options*)td->eo;
@@ -276,59 +254,18 @@ static int sec_setup(struct thread_data *td)
{
o->inode = 0;
}
std::vector<char *> options;
opt_push(options, "config_path", o->config_path);
opt_push(options, "etcd_address", o->etcd_host);
opt_push(options, "etcd_prefix", o->etcd_prefix);
if (o->use_rdma != -1)
opt_push(options, "use_rdma", std::to_string(o->use_rdma).c_str());
opt_push(options, "rdma_device", o->rdma_device);
if (o->rdma_port_num)
opt_push(options, "rdma_port_num", std::to_string(o->rdma_port_num).c_str());
if (o->rdma_gid_index)
opt_push(options, "rdma_gid_index", std::to_string(o->rdma_gid_index).c_str());
if (o->rdma_mtu)
opt_push(options, "rdma_mtu", std::to_string(o->rdma_mtu).c_str());
if (o->cluster_log)
opt_push(options, "log_level", std::to_string(o->cluster_log).c_str());
// allow writeback caching if -direct is not set
opt_push(options, "client_writeback_allowed", td->o.odirect ? "0" : "1");
bsd->cli = o->no_io_uring ? NULL : vitastor_c_create_uring_json((const char**)options.data(), options.size());
bsd->epoll_based = false;
if (!bsd->cli)
{
if (o->no_io_uring)
fprintf(stderr, "vitastor: io_uring disabled - I/O will be slower\n");
else
fprintf(stderr, "vitastor: failed to create io_uring: %s - I/O will be slower\n", strerror(errno));
bsd->cli = vitastor_c_create_epoll_json((const char**)options.data(), options.size());
bsd->epoll_based = true;
}
for (auto opt: options)
free(opt);
options.clear();
bsd->cli = vitastor_c_create_uring(o->config_path, o->etcd_host, o->etcd_prefix,
o->use_rdma, o->rdma_device, o->rdma_port_num, o->rdma_gid_index, o->rdma_mtu, o->cluster_log);
if (o->image)
{
bsd->watch = NULL;
vitastor_c_watch_inode(bsd->cli, o->image, watch_callback, bsd);
if (!bsd->epoll_based)
while (true)
{
while (true)
{
vitastor_c_uring_handle_events(bsd->cli);
if (bsd->watch)
break;
vitastor_c_uring_wait_events(bsd->cli);
}
}
else
{
while (true)
{
if (bsd->watch)
break;
vitastor_c_epoll_handle_events(bsd->cli, 1000);
}
vitastor_c_uring_handle_events(bsd->cli);
if (bsd->watch)
break;
vitastor_c_uring_wait_events(bsd->cli);
}
td->files[0]->real_file_size = vitastor_c_inode_get_size(bsd->watch);
if (!vitastor_c_inode_get_num(bsd->watch) ||
@@ -471,24 +408,12 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int max, const struct timespec *t)
{
sec_data *bsd = (sec_data*)td->io_ops_data;
if (!bsd->epoll_based)
while (true)
{
while (true)
{
vitastor_c_uring_handle_events(bsd->cli);
if (bsd->completed.size() >= min)
break;
vitastor_c_uring_wait_events(bsd->cli);
}
}
else
{
while (true)
{
if (bsd->completed.size() >= min)
break;
vitastor_c_epoll_handle_events(bsd->cli, 1000);
}
vitastor_c_uring_handle_events(bsd->cli);
if (bsd->completed.size() >= min)
break;
vitastor_c_uring_wait_events(bsd->cli);
}
return bsd->completed.size();
}

View File

@@ -130,7 +130,7 @@ static int bs_init(struct thread_data *td)
config[p.first] = p.second.dump();
}
}
bsd->ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
bsd->ringloop = new ring_loop_t(512);
bsd->epmgr = new epoll_manager_t(bsd->ringloop);
bsd->bs = new blockstore_t(config, bsd->ringloop, bsd->epmgr->tfd);
while (1)

View File

@@ -1,401 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
//
// Vitastor shared key/value database test CLI
#define _XOPEN_SOURCE
#include <limits.h>
#include <netinet/tcp.h>
#include <sys/epoll.h>
#include <unistd.h>
#include <fcntl.h>
//#include <signal.h>
#include "epoll_manager.h"
#include "str_util.h"
#include "kv_db.h"
const char *exe_name = NULL;
// Command-line client for the key/value database: reads commands from standard input
// line by line and executes them one at a time using a kv_dbw_t instance
class kv_cli_t
{
public:
// Database wrapper and the client objects it runs on; all of them are created in run()
kv_dbw_t *db = NULL;
ring_loop_t *ringloop = NULL;
epoll_manager_t *epmgr = NULL;
cluster_client_t *cli = NULL;
// Set in run() when the STDIN handler is registered successfully; enables the "> " prompt
bool interactive = false;
// Number of commands being executed; new commands aren't started while it's non-zero
int in_progress = 0;
// Buffer of input which is read but not parsed yet, its used size and its allocated size
char *cur_cmd = NULL;
int cur_cmd_size = 0, cur_cmd_alloc = 0;
// finished stops the main loop of run(); eof is set when read() returns 0
bool finished = false, eof = false;
// Options passed to kv_dbw_t::open() and kv_dbw_t::set_config() by handle_cmd()
json11::Json::object cfg;
~kv_cli_t();
// Convert "--option value" command line arguments to a JSON object
static json11::Json::object parse_args(int narg, const char *args[]);
// Create the client, process commands until the input ends and destroy the client
void run(const json11::Json::object & cfg);
// Read available input into cur_cmd and try to start the next command
void read_cmd();
// Extract the next complete line from cur_cmd and execute it
void next_cmd();
// Execute a single command line; cb is called when the command finishes
void handle_cmd(const std::string & cmd, std::function<void()> cb);
};
// Release the command buffer and whatever run() didn't destroy itself.
// The order matters: the database first, then the client, then the event loops.
kv_cli_t::~kv_cli_t()
{
    // free() and delete accept null pointers, so only the client,
    // which has to be flushed first, needs an explicit check
    free(cur_cmd);
    cur_cmd = NULL;
    cur_cmd_alloc = 0;
    delete db;
    if (cli != NULL)
    {
        cli->flush();
        delete cli;
    }
    delete epmgr;
    delete ringloop;
}
// Convert command line arguments to a configuration object.
// "--name value" becomes {"name": "value"}; "--json" and an option given as the
// very last argument get the value "1". "-h"/"--help" prints usage and exits.
// Arguments which don't start with "--" are ignored.
json11::Json::object kv_cli_t::parse_args(int narg, const char *args[])
{
    json11::Json::object cfg;
    int i = 1;
    while (i < narg)
    {
        const char *arg = args[i];
        if (strcmp(arg, "-h") == 0 || strcmp(arg, "--help") == 0)
        {
            printf(
                "Vitastor Key/Value CLI\n"
                "(c) Vitaliy Filippov, 2023+ (VNPL-1.1)\n"
                "\n"
                "USAGE: %s [--etcd_address ADDR] [OTHER OPTIONS]\n",
                exe_name
            );
            exit(0);
        }
        if (arg[0] == '-' && arg[1] == '-')
        {
            const char *opt = arg+2;
            if (!strcmp(opt, "json") || i == narg-1)
            {
                // Flag without a value
                cfg[opt] = "1";
            }
            else
            {
                // The next argument is the value, consume it too
                cfg[opt] = args[++i];
            }
        }
        i++;
    }
    return cfg;
}
// Create the cluster client, execute commands from STDIN until the input ends
// or "quit" is entered, then tear everything down.
// Note: the <cfg> parameter hides the class member of the same name here.
void kv_cli_t::run(const json11::Json::object & cfg)
{
    // Create client
    ringloop = new ring_loop_t(512);
    epmgr = new epoll_manager_t(ringloop);
    cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
    db = new kv_dbw_t(cli);
    // Load image metadata
    while (!cli->is_ready())
    {
        ringloop->loop();
        if (cli->is_ready())
        {
            break;
        }
        ringloop->wait();
    }
    // Run
    const int stdin_flags = fcntl(0, F_GETFL, 0);
    fcntl(0, F_SETFL, stdin_flags | O_NONBLOCK);
    try
    {
        epmgr->tfd->set_fd_handler(0, false, [this](int fd, int events)
        {
            if (events & EPOLLIN)
            {
                read_cmd();
            }
            if (events & EPOLLRDHUP)
            {
                // The other side is gone: stop watching STDIN and leave the main loop
                epmgr->tfd->set_fd_handler(0, false, NULL);
                finished = true;
            }
        });
        interactive = true;
        printf("> ");
    }
    catch (std::exception &)
    {
        // Can't add to epoll, STDIN is probably a file
        read_cmd();
    }
    for (;;)
    {
        if (finished)
        {
            break;
        }
        ringloop->loop();
        if (finished)
        {
            break;
        }
        ringloop->wait();
    }
    // Destroy the client
    delete db;
    db = NULL;
    cli->flush();
    delete cli;
    delete epmgr;
    delete ringloop;
    cli = NULL;
    epmgr = NULL;
    ringloop = NULL;
}
// Read everything which is currently available on STDIN into <cur_cmd>
// and then try to start the next command.
//
// The buffer starts with 64 KiB and is doubled only when it's completely filled
// with a single unfinished line, so a very long command can still be read while
// input which already contains complete commands isn't buffered without limit.
// End of input and unrecoverable read errors both set <eof>, otherwise the
// program would wait forever for input which will never come.
void kv_cli_t::read_cmd()
{
    if (!cur_cmd_alloc)
    {
        cur_cmd_alloc = 65536;
        cur_cmd = (char*)malloc_or_die(cur_cmd_alloc);
    }
    while (true)
    {
        if (cur_cmd_size >= cur_cmd_alloc)
        {
            if (memchr(cur_cmd, '\n', cur_cmd_size) || memchr(cur_cmd, '\r', cur_cmd_size))
            {
                // There is at least one complete command, space will be freed when it's consumed
                break;
            }
            if (cur_cmd_alloc > INT_MAX/2)
            {
                fprintf(stderr, "Error reading from stdin: command is too long\n");
                eof = true;
                break;
            }
            char *bigger = (char*)malloc_or_die(cur_cmd_alloc*2);
            memcpy(bigger, cur_cmd, cur_cmd_size);
            free(cur_cmd);
            cur_cmd = bigger;
            cur_cmd_alloc *= 2;
        }
        int r = read(0, cur_cmd+cur_cmd_size, cur_cmd_alloc-cur_cmd_size);
        if (r < 0 && errno == EINTR)
        {
            // Interrupted by a signal, just retry
            continue;
        }
        if (r < 0 && errno != EAGAIN)
        {
            fprintf(stderr, "Error reading from stdin: %s\n", strerror(errno));
            eof = true;
        }
        if (r > 0)
            cur_cmd_size += r;
        if (r == 0)
            eof = true;
        if (r <= 0)
            break;
    }
    next_cmd();
}
void kv_cli_t::next_cmd()
{
if (in_progress > 0)
{
return;
}
int pos = 0;
for (; pos < cur_cmd_size; pos++)
{
if (cur_cmd[pos] == '\n' || cur_cmd[pos] == '\r')
{
auto cmd = trim(std::string(cur_cmd, pos));
pos++;
memmove(cur_cmd, cur_cmd+pos, cur_cmd_size-pos);
cur_cmd_size -= pos;
in_progress++;
handle_cmd(cmd, [this]()
{
in_progress--;
if (interactive)
printf("> ");
next_cmd();
if (!in_progress)
read_cmd();
});
break;
}
}
if (eof && !in_progress)
{
finished = true;
}
}
// Execute one command line.
//
// cmd - the command, already trimmed. The first word is the case-insensitive
//       operation name: open, config, get, set, del, list, close, quit (or q)
// cb  - called exactly once when the command is finished, either synchronously or
//       from a database callback. "quit" is the only operation which doesn't call
//       it - it closes STDIN and stops the main loop instead.
//
// Errors are printed to STDERR, results to STDOUT.
void kv_cli_t::handle_cmd(const std::string & cmd, std::function<void()> cb)
{
    if (cmd == "")
    {
        cb();
        return;
    }
    // <name_end> is where the operation name ends, <pos> is the last whitespace
    // character of the separator which follows it, so arguments begin at pos+1.
    // The name must be cut at <name_end> - cutting it at <pos> would leave
    // whitespace in it when the separator is longer than one character
    const auto name_end = cmd.find_first_of(" \t");
    auto pos = name_end;
    if (pos != std::string::npos)
    {
        while (pos < cmd.size()-1 && (cmd[pos+1] == ' ' || cmd[pos+1] == '\t'))
            pos++;
    }
    const bool has_args = (pos != std::string::npos);
    auto opname = strtolower(has_args ? cmd.substr(0, name_end) : cmd);
    if (opname == "open")
    {
        uint64_t pool_id = 0;
        inode_t inode_id = 0;
        uint32_t kv_block_size = 0;
        int scanned = !has_args ? 0 : sscanf(cmd.c_str() + pos+1, "%lu %lu %u", &pool_id, &inode_id, &kv_block_size);
        if (scanned == 2)
        {
            kv_block_size = 4096;
        }
        if (scanned < 2 || !pool_id || !inode_id || !kv_block_size || (kv_block_size & (kv_block_size-1)) != 0)
        {
            fprintf(stderr, "Usage: open <pool_id> <inode_id> [block_size]. Block size must be a power of 2. Default is 4096.\n");
            cb();
            return;
        }
        cfg["kv_block_size"] = (uint64_t)kv_block_size;
        db->open(INODE_WITH_POOL(pool_id, inode_id), cfg, [=](int res)
        {
            if (res < 0)
                fprintf(stderr, "Error opening index: %s (code %d)\n", strerror(-res), res);
            else
                printf("Index opened. Current size: %lu bytes\n", db->get_size());
            cb();
        });
    }
    else if (opname == "config")
    {
        auto pos2 = !has_args ? std::string::npos : cmd.find_first_of(" \t", pos+1);
        if (pos2 == std::string::npos)
        {
            fprintf(stderr, "Usage: config <property> <value>\n");
            cb();
            return;
        }
        auto key = trim(cmd.substr(pos+1, pos2-pos-1));
        auto value = parse_size(trim(cmd.substr(pos2+1)));
        if (key != "kv_memory_limit" &&
            key != "kv_allocate_blocks" &&
            key != "kv_evict_max_misses" &&
            key != "kv_evict_attempts_per_level" &&
            key != "kv_evict_unused_age" &&
            key != "kv_log_level")
        {
            fprintf(
                stderr, "Allowed properties: kv_memory_limit, kv_allocate_blocks,"
                " kv_evict_max_misses, kv_evict_attempts_per_level, kv_evict_unused_age, kv_log_level\n"
            );
        }
        else
        {
            cfg[key] = value;
            db->set_config(cfg);
        }
        cb();
    }
    else if (opname == "get" || opname == "set" || opname == "del")
    {
        if (opname == "get" || opname == "del")
        {
            if (!has_args)
            {
                fprintf(stderr, "Usage: %s <key>\n", opname.c_str());
                cb();
                return;
            }
            auto key = trim(cmd.substr(pos+1));
            if (opname == "get")
            {
                db->get(key, [this, cb](int res, const std::string & value)
                {
                    if (res < 0)
                        fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
                    else
                    {
                        write(1, value.c_str(), value.size());
                        write(1, "\n", 1);
                    }
                    cb();
                });
            }
            else
            {
                db->del(key, [this, cb](int res)
                {
                    if (res < 0)
                        fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
                    else
                        printf("OK\n");
                    cb();
                });
            }
        }
        else
        {
            auto pos2 = !has_args ? std::string::npos : cmd.find_first_of(" \t", pos+1);
            if (pos2 == std::string::npos)
            {
                fprintf(stderr, "Usage: set <key> <value>\n");
                cb();
                return;
            }
            auto key = trim(cmd.substr(pos+1, pos2-pos-1));
            auto value = trim(cmd.substr(pos2+1));
            db->set(key, value, [this, cb](int res)
            {
                if (res < 0)
                    fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
                else
                    printf("OK\n");
                cb();
            });
        }
    }
    else if (opname == "list")
    {
        // Both arguments are optional
        std::string start, end;
        if (has_args)
        {
            auto pos2 = cmd.find_first_of(" \t", pos+1);
            if (pos2 != std::string::npos)
            {
                start = trim(cmd.substr(pos+1, pos2-pos-1));
                end = trim(cmd.substr(pos2+1));
            }
            else
            {
                start = trim(cmd.substr(pos+1));
            }
        }
        void *handle = db->list_start(start);
        db->list_next(handle, [=](int res, const std::string & key, const std::string & value)
        {
            if (res < 0)
            {
                // -ENOENT is not reported as an error, it just ends the listing
                if (res != -ENOENT)
                {
                    fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
                }
                db->list_close(handle);
                cb();
            }
            else
            {
                printf("%s = %s\n", key.c_str(), value.c_str());
                // Request the next key
                db->list_next(handle, NULL);
            }
        });
    }
    else if (opname == "close")
    {
        db->close([=]()
        {
            printf("Index closed\n");
            cb();
        });
    }
    else if (opname == "quit" || opname == "q")
    {
        // cb() is intentionally not called: no more commands should be started
        ::close(0);
        finished = true;
    }
    else
    {
        fprintf(
            stderr, "Unknown operation: %s. Supported operations:\n"
            "open <pool_id> <inode_id> [block_size]\n"
            "config <property> <value>\n"
            "get <key>\nset <key> <value>\ndel <key>\nlist [<start> [end]]\n"
            "close\nquit\n", opname.c_str()
        );
        cb();
    }
}
int main(int narg, const char *args[])
{
setvbuf(stdout, NULL, _IONBF, 0);
setvbuf(stderr, NULL, _IONBF, 0);
exe_name = args[0];
kv_cli_t *p = new kv_cli_t();
p->run(kv_cli_t::parse_args(narg, args));
delete p;
return 0;
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,36 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
//
// Vitastor shared key/value database
// Parallel optimistic B-Tree O:-)
#pragma once
#include "cluster_client.h"
struct kv_db_t;
// Public wrapper of the key/value database. The implementation type kv_db_t is only
// forward-declared in this header and is reached through the <db> pointer.
// All operations are asynchronous and report completion through callbacks.
// NOTE(review): callers in this code base treat a negative <res> as a negated errno
// value (they print strerror(-res)) - confirm against the implementation.
struct kv_dbw_t
{
kv_dbw_t(cluster_client_t *cli);
~kv_dbw_t();
// Open the database stored in inode <inode_id> using options from <cfg>; cb receives the result code
void open(inode_t inode_id, json11::Json cfg, std::function<void(int)> cb);
// Apply new options to the database
void set_config(json11::Json cfg);
// Close the database; cb is called when it's done
void close(std::function<void()> cb);
// Size of the database; the CLI prints it as a number of bytes
uint64_t get_size();
// Read the value of <key>; cb receives the result code and the value
// allow_old_cached: presumably permits answering from possibly stale cached data - TODO confirm
void get(const std::string & key, std::function<void(int res, const std::string & value)> cb,
bool allow_old_cached = false);
// Write <value> for <key>; cb receives the result code
// cas_compare: optional predicate, presumably called with the current value to decide
// whether the modification should proceed (compare-and-set) - TODO confirm
void set(const std::string & key, const std::string & value, std::function<void(int res)> cb,
std::function<bool(int res, const std::string & value)> cas_compare = NULL);
// Delete <key>; cb and cas_compare are the same as in set()
void del(const std::string & key, std::function<void(int res)> cb,
std::function<bool(int res, const std::string & value)> cas_compare = NULL);
// Begin a listing at key <start>; returns a handle for list_next() and list_close()
void* list_start(const std::string & start);
// Request the next key of a listing; cb receives the result code, the key and the value.
// NOTE(review): the CLI passes NULL as cb on repeated calls and treats -ENOENT as the
// normal end of the listing - confirm both against the implementation
void list_next(void *handle, std::function<void(int res, const std::string & key, const std::string & value)> cb);
// Finish a listing and release its handle
void list_close(void *handle);
// Implementation object
kv_db_t *db;
};

View File

@@ -1,697 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
//
// Vitastor shared key/value database stress tester / benchmark
#define _XOPEN_SOURCE
#include <limits.h>
#include <netinet/tcp.h>
#include <sys/epoll.h>
#include <unistd.h>
#include <fcntl.h>
//#include <signal.h>
#include "epoll_manager.h"
#include "str_util.h"
#include "kv_db.h"
const char *exe_name = NULL;
// State of one listing operation of the stress test
// NOTE(review): field meanings below are inferred from names and types only,
// the code which uses them is not visible here - confirm
struct kv_test_listing_t
{
// presumably the requested number of keys and the number of keys received so far
uint64_t count = 0, done = 0;
// presumably the listing handle returned by kv_dbw_t::list_start()
void *handle = NULL;
std::string next_after;
std::set<std::string> inflights;
// presumably the start time of the listing, used for latency statistics; not initialized here
timespec tv_begin;
bool error = false;
};
// Latency counter of one operation type
// NOTE(review): presumably <usec> is the total time in microseconds and <count>
// is the number of operations it was accumulated over - confirm
struct kv_test_lat_t
{
const char *name = NULL;
uint64_t usec = 0, count = 0;
};
// A set of statistics of the stress test: one latency counter per operation type
struct kv_test_stat_t
{
kv_test_lat_t get, add, update, del, list;
// presumably the total number of keys returned by listings - TODO confirm
uint64_t list_keys = 0;
};
// Stress tester / benchmark of the key/value database.
// The configuration fields below are filled by parse_config() and their initializers
// are the defaults which are used when an option isn't specified.
class kv_test_t
{
public:
// Config
// Options forwarded to the database (kv_memory_limit, kv_allocate_blocks, ...)
json11::Json::object kv_cfg;
// Prefix and suffix for all keys read or written
std::string key_prefix, key_suffix;
// Inode number combined with the pool ID
uint64_t inode_id = 0;
// Total operations to run during test
uint64_t op_count = 1000000;
// Run for this number of seconds. 0 means unlimited
uint64_t runtime_sec = 0;
// Run this number of operations in parallel
uint64_t parallelism = 4;
// Relative fractions of operation types: reopen, get, add, update, delete and list
uint64_t reopen_prob = 1;
uint64_t get_prob = 30000;
uint64_t add_prob = 20000;
uint64_t update_prob = 20000;
uint64_t del_prob = 5000;
uint64_t list_prob = 300;
// Limits of key and value sizes in bytes
uint64_t min_key_len = 10;
uint64_t max_key_len = 70;
uint64_t min_value_len = 50;
uint64_t max_value_len = 300;
// Limits of the number of keys read in one listing
uint64_t min_list_count = 10;
uint64_t max_list_count = 1000;
// Print operation statistics every this number of seconds
uint64_t print_stats_interval = 1;
bool json_output = false;
uint64_t log_level = 1;
bool trace = false;
// Stop on first execution error, mismatch, lost key or extra key during listing
bool stop_on_error = false;
// FIXME: Multiple clients
// Current statistics, statistics at the moment of the previous report and report times
kv_test_stat_t stat, prev_stat;
timespec prev_stat_time, start_stat_time;
// State
kv_dbw_t *db = NULL;
ring_loop_t *ringloop = NULL;
epoll_manager_t *epmgr = NULL;
cluster_client_t *cli = NULL;
ring_consumer_t consumer;
bool finished = false;
// presumably the sum of all *_prob values - TODO confirm
uint64_t total_prob = 0;
uint64_t ops_sent = 0, ops_done = 0;
int stat_timer_id = -1;
int in_progress = 0;
bool reopening = false;
// Active listings
std::set<kv_test_listing_t*> listings;
// presumably keys with modifications in flight (see start_change() / stop_change()) - TODO confirm
std::set<std::string> changing_keys;
// presumably the expected content of the database - TODO confirm
std::map<std::string, std::string> values;
~kv_test_t();
// Convert "--option value" command line arguments to a JSON object
static json11::Json::object parse_args(int narg, const char *args[]);
// Fill the configuration fields from <cfg>
void parse_config(json11::Json cfg);
void run(json11::Json cfg);
void loop();
void print_stats(kv_test_stat_t & prev_stat, timespec & prev_stat_time);
void print_total_stats();
void start_change(const std::string & key);
void stop_change(const std::string & key);
void add_stat(kv_test_lat_t & stat, timespec tv_begin);
};
// Destroy the database, the client and the event loops, in this order
kv_test_t::~kv_test_t()
{
    // delete accepts null pointers; only the client needs a check
    // because it has to be flushed before it's destroyed
    delete db;
    if (cli != NULL)
    {
        cli->flush();
        delete cli;
    }
    delete epmgr;
    delete ringloop;
}
// Convert command line arguments to a configuration object.
// "--name value" becomes {"name": "value"}; "--json" and an option given as the
// very last argument get the value "1". "-h"/"--help" prints usage and exits.
// Arguments which don't start with "--" are ignored.
// The defaults printed in the usage text must match the initializers of kv_test_t.
json11::Json::object kv_test_t::parse_args(int narg, const char *args[])
{
    json11::Json::object cfg;
    for (int i = 1; i < narg; i++)
    {
        if (!strcmp(args[i], "-h") || !strcmp(args[i], "--help"))
        {
            printf(
                "Vitastor Key/Value DB stress tester / benchmark\n"
                "(c) Vitaliy Filippov, 2023+ (VNPL-1.1)\n"
                "\n"
                "USAGE: %s --pool_id POOL_ID --inode_id INODE_ID [OPTIONS]\n"
                "  --op_count 1000000\n"
                "    Total operations to run during test. 0 means unlimited\n"
                "  --key_prefix \"\"\n"
                "    Prefix for all keys read or written (to avoid collisions)\n"
                "  --key_suffix \"\"\n"
                "    Suffix for all keys read or written (to avoid collisions, but scan all DB)\n"
                "  --runtime 0\n"
                "    Run for this number of seconds. 0 means unlimited\n"
                "  --parallelism 4\n"
                "    Run this number of operations in parallel\n"
                "  --get_prob 30000\n"
                "    Fraction of key retrieve operations\n"
                "  --add_prob 20000\n"
                "    Fraction of key addition operations\n"
                "  --update_prob 20000\n"
                "    Fraction of key update operations\n"
                "  --del_prob 5000\n"
                "    Fraction of key delete operations\n"
                "  --list_prob 300\n"
                "    Fraction of listing operations\n"
                "  --min_key_len 10\n"
                "    Minimum key size in bytes\n"
                "  --max_key_len 70\n"
                "    Maximum key size in bytes\n"
                "  --min_value_len 50\n"
                "    Minimum value size in bytes\n"
                "  --max_value_len 300\n"
                "    Maximum value size in bytes\n"
                "  --min_list_count 10\n"
                "    Minimum number of keys read in listing (0 = all keys)\n"
                "  --max_list_count 1000\n"
                "    Maximum number of keys read in listing\n"
                "  --print_stats 1\n"
                "    Print operation statistics every this number of seconds\n"
                "  --json\n"
                "    JSON output\n"
                "  --stop_on_error 0\n"
                "    Stop on first execution error, mismatch, lost key or extra key during listing\n"
                "  --kv_memory_limit 128M\n"
                "    Maximum memory to use for vitastor-kv index cache\n"
                "  --kv_allocate_blocks 4\n"
                "    Number of PG blocks used for new tree block allocation in parallel\n"
                "  --kv_evict_max_misses 10\n"
                "    Eviction algorithm parameter: retry eviction from another random spot\n"
                "    if this number of keys is used currently or was used recently\n"
                "  --kv_evict_attempts_per_level 3\n"
                "    Retry eviction at most this number of times per tree level, starting\n"
                "    with bottom-most levels\n"
                "  --kv_evict_unused_age 1000\n"
                "    Evict only keys unused during this number of last operations\n"
                "  --kv_log_level 1\n"
                "    Log level. 0 = errors, 1 = warnings, 10 = trace operations\n",
                exe_name
            );
            exit(0);
        }
        else if (args[i][0] == '-' && args[i][1] == '-')
        {
            const char *opt = args[i]+2;
            // "--json" is a flag; an option without a following argument is also treated as a flag
            cfg[opt] = !strcmp(opt, "json") || i == narg-1 ? "1" : args[++i];
        }
    }
    return cfg;
}
// Copies test settings from `cfg` into the kv_test_t fields and into `kv_cfg`
// (the option set later passed to db->open()).
//
// Options fall into three groups:
//  - "positive only" options (op_count, runtime, parallelism, max_key_len,
//    max_value_len): the field is overwritten only when the parsed value is > 0;
//  - "present" options (the *_prob, min_*, *_list_count, print_stats,
//    stop_on_error ones): the field is overwritten whenever the key is not null;
//  - kv_* options: forwarded as-is into kv_cfg.
//
// Exits the process with status 1 when the operation probabilities sum to zero.
void kv_test_t::parse_config(json11::Json cfg)
{
    inode_id = INODE_WITH_POOL(cfg["pool_id"].uint64_value(), cfg["inode_id"].uint64_value());
    if (cfg["op_count"].uint64_value() > 0)
        op_count = cfg["op_count"].uint64_value();
    key_prefix = cfg["key_prefix"].string_value();
    key_suffix = cfg["key_suffix"].string_value();
    // NOTE(review): runtime_sec is stored here, but no code visible in this part
    // of the file reads it - confirm that the --runtime limit is enforced somewhere
    if (cfg["runtime"].uint64_value() > 0)
        runtime_sec = cfg["runtime"].uint64_value();
    if (cfg["parallelism"].uint64_value() > 0)
        parallelism = cfg["parallelism"].uint64_value();
    // Relative weights of each operation type (see total_prob below)
    if (!cfg["reopen_prob"].is_null())
        reopen_prob = cfg["reopen_prob"].uint64_value();
    if (!cfg["get_prob"].is_null())
        get_prob = cfg["get_prob"].uint64_value();
    if (!cfg["add_prob"].is_null())
        add_prob = cfg["add_prob"].uint64_value();
    if (!cfg["update_prob"].is_null())
        update_prob = cfg["update_prob"].uint64_value();
    if (!cfg["del_prob"].is_null())
        del_prob = cfg["del_prob"].uint64_value();
    if (!cfg["list_prob"].is_null())
        list_prob = cfg["list_prob"].uint64_value();
    // Key/value size limits
    if (!cfg["min_key_len"].is_null())
        min_key_len = cfg["min_key_len"].uint64_value();
    if (cfg["max_key_len"].uint64_value() > 0)
        max_key_len = cfg["max_key_len"].uint64_value();
    if (!cfg["min_value_len"].is_null())
        min_value_len = cfg["min_value_len"].uint64_value();
    if (cfg["max_value_len"].uint64_value() > 0)
        max_value_len = cfg["max_value_len"].uint64_value();
    // Listing size limits
    if (!cfg["min_list_count"].is_null())
        min_list_count = cfg["min_list_count"].uint64_value();
    if (!cfg["max_list_count"].is_null())
        max_list_count = cfg["max_list_count"].uint64_value();
    // Output and error handling
    if (!cfg["print_stats"].is_null())
        print_stats_interval = cfg["print_stats"].uint64_value();
    // Mere presence of the "json" key enables JSON output
    if (!cfg["json"].is_null())
        json_output = true;
    if (!cfg["stop_on_error"].is_null())
        stop_on_error = cfg["stop_on_error"].bool_value();
    // Options forwarded to the key-value DB unchanged
    if (!cfg["kv_memory_limit"].is_null())
        kv_cfg["kv_memory_limit"] = cfg["kv_memory_limit"];
    if (!cfg["kv_allocate_blocks"].is_null())
        kv_cfg["kv_allocate_blocks"] = cfg["kv_allocate_blocks"];
    if (!cfg["kv_evict_max_misses"].is_null())
        kv_cfg["kv_evict_max_misses"] = cfg["kv_evict_max_misses"];
    if (!cfg["kv_evict_attempts_per_level"].is_null())
        kv_cfg["kv_evict_attempts_per_level"] = cfg["kv_evict_attempts_per_level"];
    if (!cfg["kv_evict_unused_age"].is_null())
        kv_cfg["kv_evict_unused_age"] = cfg["kv_evict_unused_age"];
    if (!cfg["kv_log_level"].is_null())
    {
        // The log level is used both by the test itself and by the DB
        log_level = cfg["kv_log_level"].uint64_value();
        trace = log_level >= 10;
        kv_cfg["kv_log_level"] = cfg["kv_log_level"];
    }
    total_prob = reopen_prob+get_prob+add_prob+update_prob+del_prob+list_prob;
    if (!total_prob)
    {
        // loop() picks an operation with lrand48() % total_prob,
        // so a zero sum would be an integer division by zero
        fprintf(stderr, "ERROR: at least one of reopen_prob, get_prob, add_prob, update_prob, del_prob, list_prob must be non-zero\n");
        exit(1);
    }
    stat.get.name = "get";
    stat.add.name = "add";
    stat.update.name = "update";
    stat.del.name = "del";
    stat.list.name = "list";
}
// Runs the whole test: seeds the PRNG, applies `cfg`, creates the ring loop,
// epoll manager, cluster client and DB wrapper, opens the index, then drives
// the ring loop until `finished` is set. Finally prints total statistics and
// destroys everything it created.
// Exits the process with status 1 if the index can't be opened.
void kv_test_t::run(json11::Json cfg)
{
    srand48(time(NULL));
    parse_config(cfg);
    // Create client
    ringloop = new ring_loop_t(512);
    epmgr = new epoll_manager_t(ringloop);
    cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
    db = new kv_dbw_t(cli);
    // Pump the ring loop until the client reports that it's ready
    for (;;)
    {
        if (cli->is_ready())
            break;
        ringloop->loop();
        if (cli->is_ready())
            break;
        ringloop->wait();
    }
    // Open the index; loop() does nothing while `reopening` is set
    reopening = true;
    db->open(inode_id, kv_cfg, [this](int res)
    {
        reopening = false;
        if (res < 0)
        {
            fprintf(stderr, "ERROR: Open index: %d (%s)\n", res, strerror(-res));
            exit(1);
        }
        if (trace)
        {
            printf("Index opened\n");
        }
        ringloop->wakeup();
    });
    // Let the ring loop call loop() to submit operations
    consumer.loop = [this]() { loop(); };
    ringloop->register_consumer(&consumer);
    // Periodic statistics
    if (print_stats_interval)
    {
        stat_timer_id = epmgr->tfd->set_timer(print_stats_interval*1000, true, [this](int) { print_stats(prev_stat, prev_stat_time); });
    }
    clock_gettime(CLOCK_REALTIME, &start_stat_time);
    prev_stat_time = start_stat_time;
    // Main loop: do not block in wait() once the test is finished
    while (!finished)
    {
        ringloop->loop();
        if (finished)
            break;
        ringloop->wait();
    }
    if (stat_timer_id >= 0)
    {
        epmgr->tfd->clear_timer(stat_timer_id);
    }
    ringloop->unregister_consumer(&consumer);
    // Print total stats
    print_total_stats();
    // Destroy the client
    delete db;
    db = NULL;
    cli->flush();
    delete cli;
    delete epmgr;
    delete ringloop;
    cli = NULL;
    epmgr = NULL;
    ringloop = NULL;
}
// Alphabet for random strings. Only the first 64 characters are ever selected
// because random_str() takes the index modulo 64.
static const char *base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789@+/";

// Returns a string of `len` characters, each picked from base64_chars
// using one lrand48() call per character.
std::string random_str(int len)
{
    std::string result(len, '\0');
    for (auto & ch: result)
    {
        ch = base64_chars[lrand48() % 64];
    }
    return result;
}
// Ring loop consumer callback (installed as consumer.loop in run()).
//
// Does nothing while the index is being (re)opened. Otherwise marks the test
// as finished once op_count operations have completed, and submits new random
// operations until either op_count operations have been sent or `parallelism`
// operations are in flight.
//
// The operation type is chosen by a random number in [0, total_prob) compared
// against the cumulative sums of reopen/get/add/update/del/list weights.
// `values` holds the expected DB content and is used to verify results;
// keys listed in `changing_keys` have a modification in flight and are not
// selected for another operation.
void kv_test_t::loop()
{
    if (reopening)
    {
        return;
    }
    if (ops_done >= op_count)
    {
        finished = true;
    }
    while (!finished && ops_sent < op_count && in_progress < parallelism)
    {
        uint64_t dice = (lrand48() % total_prob);
        if (dice < reopen_prob)
        {
            // close and reopen the index; nothing else is submitted until it's reopened
            reopening = true;
            db->close([this]()
            {
                if (trace)
                    printf("Index closed\n");
                db->open(inode_id, kv_cfg, [this](int res)
                {
                    reopening = false;
                    if (res < 0)
                    {
                        fprintf(stderr, "ERROR: Reopen index: %d (%s)\n", res, strerror(-res));
                        finished = true;
                        return;
                    }
                    if (trace)
                        printf("Index reopened\n");
                    ringloop->wakeup();
                });
            });
            return;
        }
        else if (dice < reopen_prob+get_prob)
        {
            // get existing
            // pick the first known key which is >= a random string
            auto key = random_str(max_key_len);
            auto k_it = values.lower_bound(key);
            if (k_it == values.end())
                continue;
            key = k_it->first;
            if (changing_keys.find(key) != changing_keys.end())
                continue;
            in_progress++;
            ops_sent++;
            if (trace)
                printf("get %s\n", key.c_str());
            timespec tv_begin;
            clock_gettime(CLOCK_REALTIME, &tv_begin);
            db->get(key, [this, key, tv_begin](int res, const std::string & value)
            {
                add_stat(stat.get, tv_begin);
                ops_done++;
                in_progress--;
                // expect -ENOENT if the key isn't known anymore and 0 otherwise
                auto it = values.find(key);
                if (res != (it == values.end() ? -ENOENT : 0))
                {
                    fprintf(stderr, "ERROR: get %s: %d (%s)\n", key.c_str(), res, strerror(-res));
                    if (stop_on_error)
                        exit(1);
                }
                else if (it != values.end() && value != it->second)
                {
                    fprintf(stderr, "ERROR: get %s: mismatch: %s vs %s\n", key.c_str(), value.c_str(), it->second.c_str());
                    if (stop_on_error)
                        exit(1);
                }
                ringloop->wakeup();
            });
        }
        else if (dice < reopen_prob+get_prob+add_prob+update_prob)
        {
            bool is_add = false;
            std::string key;
            if (dice < reopen_prob+get_prob+add_prob)
            {
                // add
                is_add = true;
                uint64_t key_len = min_key_len + (max_key_len > min_key_len ? lrand48() % (max_key_len-min_key_len) : 0);
                key = key_prefix + random_str(key_len) + key_suffix;
            }
            else
            {
                // update
                key = random_str(max_key_len);
                auto k_it = values.lower_bound(key);
                if (k_it == values.end())
                    continue;
                key = k_it->first;
            }
            if (changing_keys.find(key) != changing_keys.end())
                continue;
            uint64_t value_len = min_value_len + (max_value_len > min_value_len ? lrand48() % (max_value_len-min_value_len) : 0);
            auto value = random_str(value_len);
            start_change(key);
            ops_sent++;
            in_progress++;
            if (trace)
                printf("set %s = %s\n", key.c_str(), value.c_str());
            timespec tv_begin;
            clock_gettime(CLOCK_REALTIME, &tv_begin);
            db->set(key, value, [this, key, value, tv_begin, is_add](int res)
            {
                add_stat(is_add ? stat.add : stat.update, tv_begin);
                stop_change(key);
                ops_done++;
                in_progress--;
                if (res != 0)
                {
                    fprintf(stderr, "ERROR: set %s = %s: %d (%s)\n", key.c_str(), value.c_str(), res, strerror(-res));
                    if (stop_on_error)
                        exit(1);
                }
                else
                {
                    // remember the new expected value
                    values[key] = value;
                }
                ringloop->wakeup();
            }, NULL);
        }
        else if (dice < reopen_prob+get_prob+add_prob+update_prob+del_prob)
        {
            // delete
            auto key = random_str(max_key_len);
            auto k_it = values.lower_bound(key);
            if (k_it == values.end())
                continue;
            key = k_it->first;
            if (changing_keys.find(key) != changing_keys.end())
                continue;
            start_change(key);
            ops_sent++;
            in_progress++;
            if (trace)
                printf("del %s\n", key.c_str());
            timespec tv_begin;
            clock_gettime(CLOCK_REALTIME, &tv_begin);
            db->del(key, [this, key, tv_begin](int res)
            {
                add_stat(stat.del, tv_begin);
                stop_change(key);
                ops_done++;
                in_progress--;
                if (res != 0)
                {
                    fprintf(stderr, "ERROR: del %s: %d (%s)\n", key.c_str(), res, strerror(-res));
                    if (stop_on_error)
                        exit(1);
                }
                else
                {
                    values.erase(key);
                }
                ringloop->wakeup();
            }, NULL);
        }
        else if (dice < reopen_prob+get_prob+add_prob+update_prob+del_prob+list_prob)
        {
            // list
            ops_sent++;
            in_progress++;
            auto key = random_str(max_key_len);
            auto lst = new kv_test_listing_t;
            auto k_it = values.lower_bound(key);
            lst->count = min_list_count + (max_list_count > min_list_count ? lrand48() % (max_list_count-min_list_count) : 0);
            lst->handle = db->list_start(k_it == values.begin() ? key_prefix : key);
            lst->next_after = k_it == values.begin() ? key_prefix : key;
            // keys being modified right now are excluded from verification;
            // start_change() adds keys modified later during the listing
            lst->inflights = changing_keys;
            listings.insert(lst);
            if (trace)
                printf("list from %s\n", key.c_str());
            clock_gettime(CLOCK_REALTIME, &lst->tv_begin);
            db->list_next(lst->handle, [this, lst](int res, const std::string & key, const std::string & value)
            {
                if (log_level >= 11)
                    printf("list: %s = %s\n", key.c_str(), value.c_str());
                if (res >= 0 && key_prefix.size() && (key.size() < key_prefix.size() ||
                    key.substr(0, key_prefix.size()) != key_prefix))
                {
                    // stop at this key
                    res = -ENOENT;
                }
                if (res < 0 || (lst->count > 0 && lst->done >= lst->count))
                {
                    // the listing is over
                    add_stat(stat.list, lst->tv_begin);
                    if (res == 0)
                    {
                        // ok (done >= count)
                    }
                    else if (res != -ENOENT)
                    {
                        fprintf(stderr, "ERROR: list: %d (%s)\n", res, strerror(-res));
                        lst->error = true;
                    }
                    else
                    {
                        // end of data: every remaining known key which wasn't in flight is missing
                        auto k_it = lst->next_after == "" ? values.begin() : values.upper_bound(lst->next_after);
                        while (k_it != values.end())
                        {
                            while (k_it != values.end() && lst->inflights.find(k_it->first) != lst->inflights.end())
                                k_it++;
                            if (k_it != values.end())
                            {
                                fprintf(stderr, "ERROR: list: missing key %s\n", (k_it++)->first.c_str());
                                lst->error = true;
                            }
                        }
                    }
                    if (lst->error && stop_on_error)
                        exit(1);
                    ops_done++;
                    in_progress--;
                    db->list_close(lst->handle);
                    // unregister the listing BEFORE freeing it so that the set
                    // is never searched using an already invalid pointer value
                    listings.erase(lst);
                    delete lst;
                    ringloop->wakeup();
                }
                else
                {
                    stat.list_keys++;
                    // Do not check modified keys in listing
                    // Listing may return their old or new state
                    if ((!key_suffix.size() || (key.size() >= key_suffix.size() &&
                        key.substr(key.size()-key_suffix.size()) == key_suffix)) &&
                        lst->inflights.find(key) == lst->inflights.end())
                    {
                        lst->done++;
                        // walk the expected keys after the previously verified one until we reach <key>
                        auto k_it = lst->next_after == "" ? values.begin() : values.upper_bound(lst->next_after);
                        while (true)
                        {
                            while (k_it != values.end() && lst->inflights.find(k_it->first) != lst->inflights.end())
                            {
                                k_it++;
                            }
                            if (k_it == values.end() || k_it->first > key)
                            {
                                fprintf(stderr, "ERROR: list: extra key %s\n", key.c_str());
                                lst->error = true;
                                break;
                            }
                            else if (k_it->first < key)
                            {
                                fprintf(stderr, "ERROR: list: missing key %s\n", k_it->first.c_str());
                                lst->error = true;
                                lst->next_after = k_it->first;
                                k_it++;
                            }
                            else
                            {
                                if (k_it->second != value)
                                {
                                    fprintf(stderr, "ERROR: list: mismatch: %s = %s but should be %s\n",
                                        key.c_str(), value.c_str(), k_it->second.c_str());
                                    lst->error = true;
                                }
                                lst->next_after = k_it->first;
                                break;
                            }
                        }
                    }
                    // request the next key; the callback passed above stays in effect
                    // NOTE(review): assumes list_next(handle, NULL) reuses the first callback - confirm
                    db->list_next(lst->handle, NULL);
                }
            });
        }
    }
}
// Accounts one finished operation in `stat`: adds the time elapsed since
// `tv_begin` (in microseconds) and increments the operation counter.
// Samples with a non-positive elapsed time are not counted at all.
void kv_test_t::add_stat(kv_test_lat_t & stat, timespec tv_begin)
{
    timespec now;
    clock_gettime(CLOCK_REALTIME, &now);
    int64_t elapsed_us = (now.tv_sec - tv_begin.tv_sec)*1000000 +
        (now.tv_nsec - tv_begin.tv_nsec)/1000;
    if (elapsed_us <= 0)
    {
        return;
    }
    stat.usec += elapsed_us;
    stat.count++;
}
// Prints per-operation statistics accumulated since the snapshot
// (`prev_stat`, `prev_stat_time`) and then updates the snapshot to the
// current counters and time.
//
// Text mode prints one line with "<rate> <name>/s (<avg latency> us)" columns;
// JSON mode prints one object with the total runtime and an entry for each
// operation type which completed at least once since the snapshot.
// Nothing is printed if no time has passed since the snapshot.
void kv_test_t::print_stats(kv_test_stat_t & prev_stat, timespec & prev_stat_time)
{
    timespec now;
    clock_gettime(CLOCK_REALTIME, &now);
    int64_t usec = (now.tv_sec - prev_stat_time.tv_sec)*1000000 +
        (now.tv_nsec - prev_stat_time.tv_nsec)/1000;
    if (usec > 0)
    {
        kv_test_lat_t *lats[] = { &stat.get, &stat.add, &stat.update, &stat.del, &stat.list };
        kv_test_lat_t *prev[] = { &prev_stat.get, &prev_stat.add, &prev_stat.update, &prev_stat.del, &prev_stat.list };
        const size_t lat_count = sizeof(lats)/sizeof(lats[0]);
        if (json_output)
        {
            // Runtime is always measured from the start of the test
            int64_t runtime = (now.tv_sec - start_stat_time.tv_sec)*1000000 +
                (now.tv_nsec - start_stat_time.tv_nsec)/1000;
            printf("{\"runtime\":%.1f", (double)runtime/1000000.0);
            for (size_t n = 0; n < lat_count; n++)
            {
                // Skip operation types without new completions
                if (lats[n]->count <= prev[n]->count)
                {
                    continue;
                }
                printf(
                    ",\"%s\":{\"avg\":{\"iops\":%.1f,\"usec\":%lu},\"total\":{\"count\":%lu,\"usec\":%lu}}",
                    lats[n]->name, (lats[n]->count-prev[n]->count)*1000000.0/usec,
                    (lats[n]->usec-prev[n]->usec)/(lats[n]->count-prev[n]->count),
                    lats[n]->count, lats[n]->usec
                );
            }
            printf("}\n");
        }
        else
        {
            char buf[128] = { 0 };
            for (size_t n = 0; n < lat_count; n++)
            {
                // Divide by 1 instead of 0 when there were no new completions
                snprintf(buf, sizeof(buf)-1, "%.1f %s/s (%lu us)", (lats[n]->count-prev[n]->count)*1000000.0/usec,
                    lats[n]->name, (lats[n]->usec-prev[n]->usec)/(lats[n]->count-prev[n]->count > 0 ? lats[n]->count-prev[n]->count : 1));
                // Pad the column with spaces up to (name length + 21) characters
                size_t pad_to = strlen(lats[n]->name)+21;
                size_t pos = strlen(buf);
                while (pos < pad_to)
                {
                    buf[pos++] = ' ';
                }
                buf[pos] = 0;
                printf("%s", buf);
            }
            printf("\n");
        }
    }
    prev_stat = stat;
    prev_stat_time = now;
}
void kv_test_t::print_total_stats()
{
if (!json_output)
printf("Total:\n");
kv_test_stat_t start_stats;
timespec start_stat_time = this->start_stat_time;
print_stats(start_stats, start_stat_time);
}
// Marks `key` as being modified: adds it to changing_keys and to the
// `inflights` set of every listing which is currently registered in `listings`.
void kv_test_t::start_change(const std::string & key)
{
    changing_keys.insert(key);
    for (auto it = listings.begin(); it != listings.end(); ++it)
    {
        (*it)->inflights.insert(key);
    }
}
// Marks the modification of `key` as finished by removing it from changing_keys.
// The key is NOT removed from the `inflights` sets of listings (start_change()
// adds it there), so a listing which was running during the modification keeps
// skipping verification of this key.
void kv_test_t::stop_change(const std::string & key)
{
changing_keys.erase(key);
}
int main(int narg, const char *args[])
{
setvbuf(stdout, NULL, _IONBF, 0);
setvbuf(stderr, NULL, _IONBF, 0);
exe_name = args[0];
kv_test_t *p = new kv_test_t();
p->run(kv_test_t::parse_args(narg, args));
delete p;
return 0;
}

View File

@@ -11,9 +11,6 @@
#include "addr_util.h"
#include "messenger.h"
#ifdef WITH_RDMA
#include "msgr_rdma.h"
#endif
void osd_messenger_t::init()
{
@@ -22,7 +19,7 @@ void osd_messenger_t::init()
{
rdma_context = msgr_rdma_context_t::create(
rdma_device != "" ? rdma_device.c_str() : NULL,
rdma_port_num, rdma_gid_index, rdma_mtu, rdma_odp, log_level
rdma_port_num, rdma_gid_index, rdma_mtu, log_level
);
if (!rdma_context)
{
@@ -167,7 +164,6 @@ void osd_messenger_t::parse_config(const json11::Json & config)
this->rdma_max_msg = config["rdma_max_msg"].uint64_value();
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
this->rdma_max_msg = 129*1024;
this->rdma_odp = config["rdma_odp"].bool_value();
#endif
this->receive_buffer_size = (uint32_t)config["tcp_header_buffer_size"].uint64_value();
if (!this->receive_buffer_size || this->receive_buffer_size > 1024*1024*1024)
@@ -491,14 +487,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
fprintf(stderr, "Connected to OSD %lu using RDMA\n", cl->osd_num);
}
cl->peer_state = PEER_RDMA;
tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
{
// Do not miss the disconnection!
if (epoll_events & EPOLLRDHUP)
{
handle_peer_epoll(peer_fd, epoll_events);
}
});
tfd->set_fd_handler(cl->peer_fd, false, NULL);
// Add the initial receive request
try_recv_rdma(cl);
}

View File

@@ -18,6 +18,10 @@
#include "timerfd_manager.h"
#include <ringloop.h>
#ifdef WITH_RDMA
#include "msgr_rdma.h"
#endif
#define CL_READ_HDR 1
#define CL_READ_DATA 2
#define CL_READ_REPLY_DATA 3
@@ -40,11 +44,6 @@ struct msgr_sendp_t
int flags;
};
#ifdef WITH_RDMA
struct msgr_rdma_connection_t;
struct msgr_rdma_context_t;
#endif
struct osd_client_t
{
int refs = 0;
@@ -131,7 +130,6 @@ protected:
msgr_rdma_context_t *rdma_context = NULL;
uint64_t rdma_max_sge = 0, rdma_max_send = 0, rdma_max_recv = 0;
uint64_t rdma_max_msg = 0;
bool rdma_odp = false;
#endif
std::vector<int> read_ready_clients;
@@ -198,9 +196,7 @@ protected:
void handle_reply_ready(osd_op_t *op);
#ifdef WITH_RDMA
void try_send_rdma(osd_client_t *cl);
void try_send_rdma_odp(osd_client_t *cl);
void try_send_rdma_nodp(osd_client_t *cl);
bool try_send_rdma(osd_client_t *cl);
bool try_recv_rdma(osd_client_t *cl);
void handle_rdma_events();
#endif

View File

@@ -55,10 +55,3 @@ json11::Json::object osd_messenger_t::merge_configs(const json11::Json::object &
{
return cli_config;
}
bool json_is_true(const json11::Json & val)
{
if (val.is_string())
return val == "true" || val == "yes" || val == "1";
return val.bool_value();
}

View File

@@ -22,10 +22,4 @@ public:
void submit()
{
}
void wait()
{
}
void loop()
{
}
};

View File

@@ -47,29 +47,11 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
if (qp)
ibv_destroy_qp(qp);
if (recv_buffers.size())
{
for (auto b: recv_buffers)
{
if (b.mr)
ibv_dereg_mr(b.mr);
free(b.buf);
}
recv_buffers.clear();
}
if (send_out.mr)
{
ibv_dereg_mr(send_out.mr);
send_out.mr = NULL;
}
if (send_out.buf)
{
free(send_out.buf);
send_out.buf = NULL;
}
send_out_size = 0;
free(b);
}
msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level)
msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, int log_level)
{
int res;
ibv_device **dev_list = NULL;
@@ -154,27 +136,21 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
fprintf(stderr, "Couldn't query RDMA device for its features\n");
goto cleanup;
}
ctx->odp = odp;
if (ctx->odp &&
(!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
if (!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT_IMPLICIT) ||
!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_SEND) ||
!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV)))
!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV))
{
ctx->odp = false;
if (log_level > 0)
fprintf(stderr, "The RDMA device isn't implicit ODP (On-Demand Paging) capable, disabling it\n");
fprintf(stderr, "The RDMA device isn't implicit ODP (On-Demand Paging) capable or does not support RC send and receive with ODP\n");
goto cleanup;
}
}
if (ctx->odp)
ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
if (!ctx->mr)
{
ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
if (!ctx->mr)
{
fprintf(stderr, "Couldn't register RDMA memory region\n");
goto cleanup;
}
fprintf(stderr, "Couldn't register RDMA memory region\n");
goto cleanup;
}
ctx->channel = ibv_create_comp_channel(ctx->context);
@@ -389,34 +365,12 @@ static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
cl->rdma_conn->cur_send++;
}
static int try_send_rdma_copy(osd_client_t *cl, uint8_t *dst, int dst_len)
{
auto rc = cl->rdma_conn;
int total_dst_len = dst_len;
while (dst_len > 0 && rc->send_pos < cl->send_list.size())
{
iovec & iov = cl->send_list[rc->send_pos];
uint32_t len = (uint32_t)(iov.iov_len-rc->send_buf_pos < dst_len
? iov.iov_len-rc->send_buf_pos : dst_len);
memcpy(dst, iov.iov_base+rc->send_buf_pos, len);
dst += len;
dst_len -= len;
rc->send_buf_pos += len;
if (rc->send_buf_pos >= iov.iov_len)
{
rc->send_pos++;
rc->send_buf_pos = 0;
}
}
return total_dst_len-dst_len;
}
void osd_messenger_t::try_send_rdma_odp(osd_client_t *cl)
bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
{
auto rc = cl->rdma_conn;
if (!cl->send_list.size() || rc->cur_send >= rc->max_send)
{
return;
return true;
}
uint64_t op_size = 0, op_sge = 0;
ibv_sge sge[rc->max_sge];
@@ -454,70 +408,15 @@ void osd_messenger_t::try_send_rdma_odp(osd_client_t *cl)
rc->send_sizes.push_back(op_size);
try_send_rdma_wr(cl, sge, op_sge);
}
return true;
}
void osd_messenger_t::try_send_rdma_nodp(osd_client_t *cl)
{
auto rc = cl->rdma_conn;
if (!rc->send_out_size)
{
// Allocate send ring buffer, if not yet
rc->send_out_size = rc->max_msg*rdma_max_send;
rc->send_out.buf = malloc_or_die(rc->send_out_size);
if (!rdma_context->odp)
{
rc->send_out.mr = ibv_reg_mr(rdma_context->pd, rc->send_out.buf, rc->send_out_size, 0);
if (!rc->send_out.mr)
{
fprintf(stderr, "Failed to register RDMA memory region: %s\n", strerror(errno));
exit(1);
}
}
}
// Copy data into the buffer and send it
uint8_t *dst = NULL;
int dst_len = 0;
int copied = 1;
while (!rc->send_out_full && copied > 0 && rc->cur_send < rc->max_send)
{
dst = (uint8_t*)rc->send_out.buf + rc->send_out_pos;
dst_len = (rc->send_out_pos < rc->send_out_size ? rc->send_out_size-rc->send_out_pos : rc->send_done_pos-rc->send_out_pos);
if (dst_len > rc->max_msg)
dst_len = rc->max_msg;
copied = try_send_rdma_copy(cl, dst, dst_len);
if (copied > 0)
{
rc->send_out_pos += copied;
if (rc->send_out_pos == rc->send_out_size)
rc->send_out_pos = 0;
assert(rc->send_out_pos < rc->send_out_size);
if (rc->send_out_pos >= rc->send_done_pos)
rc->send_out_full = true;
ibv_sge sge = {
.addr = (uintptr_t)dst,
.length = (uint32_t)copied,
.lkey = rdma_context->odp ? rdma_context->mr->lkey : rc->send_out.mr->lkey,
};
try_send_rdma_wr(cl, &sge, 1);
rc->send_sizes.push_back(copied);
}
}
}
void osd_messenger_t::try_send_rdma(osd_client_t *cl)
{
if (rdma_context->odp)
try_send_rdma_odp(cl);
else
try_send_rdma_nodp(cl);
}
static void try_recv_rdma_wr(osd_client_t *cl, msgr_rdma_buf_t b)
static void try_recv_rdma_wr(osd_client_t *cl, void *buf)
{
ibv_sge sge = {
.addr = (uintptr_t)b.buf,
.addr = (uintptr_t)buf,
.length = (uint32_t)cl->rdma_conn->max_msg,
.lkey = cl->rdma_conn->ctx->odp ? cl->rdma_conn->ctx->mr->lkey : b.mr->lkey,
.lkey = cl->rdma_conn->ctx->mr->lkey,
};
ibv_recv_wr *bad_wr = NULL;
ibv_recv_wr wr = {
@@ -539,19 +438,9 @@ bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
auto rc = cl->rdma_conn;
while (rc->cur_recv < rc->max_recv)
{
msgr_rdma_buf_t b;
b.buf = malloc_or_die(rc->max_msg);
if (!rdma_context->odp)
{
b.mr = ibv_reg_mr(rdma_context->pd, b.buf, rc->max_msg, IBV_ACCESS_LOCAL_WRITE);
if (!b.mr)
{
fprintf(stderr, "Failed to register RDMA memory region: %s\n", strerror(errno));
exit(1);
}
}
rc->recv_buffers.push_back(b);
try_recv_rdma_wr(cl, b);
void *buf = malloc_or_die(rc->max_msg);
rc->recv_buffers.push_back(buf);
try_recv_rdma_wr(cl, buf);
}
return true;
}
@@ -603,7 +492,7 @@ void osd_messenger_t::handle_rdma_events()
if (!is_send)
{
rc->cur_recv--;
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf], wc[i].byte_len))
{
// handle_read_buffer may stop the client
continue;
@@ -616,14 +505,6 @@ void osd_messenger_t::handle_rdma_events()
rc->cur_send--;
uint64_t sent_size = rc->send_sizes.at(0);
rc->send_sizes.erase(rc->send_sizes.begin(), rc->send_sizes.begin()+1);
if (!rdma_context->odp)
{
rc->send_done_pos += sent_size;
rc->send_out_full = false;
if (rc->send_done_pos == rc->send_out_size)
rc->send_done_pos = 0;
assert(rc->send_done_pos < rc->send_out_size);
}
int send_pos = 0, send_buf_pos = 0;
while (sent_size > 0)
{

View File

@@ -23,7 +23,6 @@ struct msgr_rdma_context_t
ibv_device *dev = NULL;
ibv_device_attr_ex attrx;
ibv_pd *pd = NULL;
bool odp = false;
ibv_mr *mr = NULL;
ibv_comp_channel *channel = NULL;
ibv_cq *cq = NULL;
@@ -36,16 +35,10 @@ struct msgr_rdma_context_t
int max_cqe = 0;
int used_max_cqe = 0;
static msgr_rdma_context_t *create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level);
static msgr_rdma_context_t *create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, int log_level);
~msgr_rdma_context_t();
};
struct msgr_rdma_buf_t
{
void *buf = NULL;
ibv_mr *mr = NULL;
};
struct msgr_rdma_connection_t
{
msgr_rdma_context_t *ctx = NULL;
@@ -57,11 +50,8 @@ struct msgr_rdma_connection_t
int send_pos = 0, send_buf_pos = 0;
int next_recv_buf = 0;
std::vector<msgr_rdma_buf_t> recv_buffers;
std::vector<void*> recv_buffers;
std::vector<uint64_t> send_sizes;
msgr_rdma_buf_t send_out;
int send_out_pos = 0, send_done_pos = 0, send_out_size = 0;
bool send_out_full = false;
~msgr_rdma_connection_t();
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);

View File

@@ -3,7 +3,6 @@
#define _XOPEN_SOURCE
#include <limits.h>
#include <sys/epoll.h>
#include "messenger.h"
@@ -120,9 +119,9 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
try_send(cl);
}
}
else
else if (cl->write_msg.msg_iovlen > 0 || !try_send(cl))
{
if ((cl->write_msg.msg_iovlen > 0 || !try_send(cl)) && (cl->write_state == 0))
if (cl->write_state == 0)
{
cl->write_state = CL_WRITE_READY;
write_ready_clients.push_back(cur_op->peer_fd);
@@ -284,14 +283,7 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
fprintf(stderr, "Successfully connected with client %d using RDMA\n", cl->peer_fd);
}
cl->peer_state = PEER_RDMA;
tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
{
// Do not miss the disconnection!
if (epoll_events & EPOLLRDHUP)
{
handle_peer_epoll(peer_fd, epoll_events);
}
});
tfd->set_fd_handler(cl->peer_fd, false, NULL);
// Add the initial receive request
try_recv_rdma(cl);
}

View File

@@ -5,9 +5,6 @@
#include <assert.h>
#include "messenger.h"
#ifdef WITH_RDMA
#include "msgr_rdma.h"
#endif
void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
{

Some files were not shown because too many files have changed in this diff Show More