forked from vitalif/vitastor
Compare commits
1 Commits
rdma-flow-
...
rdma-test
Author | SHA1 | Date | |
---|---|---|---|
0df51e8b21 |
@@ -1,7 +1,7 @@
|
|||||||
cmake_minimum_required(VERSION 2.8.12)
|
cmake_minimum_required(VERSION 2.8)
|
||||||
|
|
||||||
project(vitastor)
|
project(vitastor)
|
||||||
|
|
||||||
set(VERSION "0.8.6")
|
set(VERSION "0.8.5")
|
||||||
|
|
||||||
add_subdirectory(src)
|
add_subdirectory(src)
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
VERSION ?= v0.8.6
|
VERSION ?= v0.8.5
|
||||||
|
|
||||||
all: build push
|
all: build push
|
||||||
|
|
||||||
|
@@ -49,7 +49,7 @@ spec:
|
|||||||
capabilities:
|
capabilities:
|
||||||
add: ["SYS_ADMIN"]
|
add: ["SYS_ADMIN"]
|
||||||
allowPrivilegeEscalation: true
|
allowPrivilegeEscalation: true
|
||||||
image: vitalif/vitastor-csi:v0.8.6
|
image: vitalif/vitastor-csi:v0.8.5
|
||||||
args:
|
args:
|
||||||
- "--node=$(NODE_ID)"
|
- "--node=$(NODE_ID)"
|
||||||
- "--endpoint=$(CSI_ENDPOINT)"
|
- "--endpoint=$(CSI_ENDPOINT)"
|
||||||
|
@@ -116,7 +116,7 @@ spec:
|
|||||||
privileged: true
|
privileged: true
|
||||||
capabilities:
|
capabilities:
|
||||||
add: ["SYS_ADMIN"]
|
add: ["SYS_ADMIN"]
|
||||||
image: vitalif/vitastor-csi:v0.8.6
|
image: vitalif/vitastor-csi:v0.8.5
|
||||||
args:
|
args:
|
||||||
- "--node=$(NODE_ID)"
|
- "--node=$(NODE_ID)"
|
||||||
- "--endpoint=$(CSI_ENDPOINT)"
|
- "--endpoint=$(CSI_ENDPOINT)"
|
||||||
|
@@ -5,7 +5,7 @@ package vitastor
|
|||||||
|
|
||||||
const (
|
const (
|
||||||
vitastorCSIDriverName = "csi.vitastor.io"
|
vitastorCSIDriverName = "csi.vitastor.io"
|
||||||
vitastorCSIDriverVersion = "0.8.6"
|
vitastorCSIDriverVersion = "0.8.5"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Config struct fills the parameters of request or user input
|
// Config struct fills the parameters of request or user input
|
||||||
|
@@ -6,11 +6,11 @@ package vitastor
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
|
||||||
"strings"
|
"strings"
|
||||||
"bytes"
|
"bytes"
|
||||||
"strconv"
|
"strconv"
|
||||||
"time"
|
"time"
|
||||||
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
@@ -21,6 +21,8 @@ import (
|
|||||||
"google.golang.org/grpc/codes"
|
"google.golang.org/grpc/codes"
|
||||||
"google.golang.org/grpc/status"
|
"google.golang.org/grpc/status"
|
||||||
|
|
||||||
|
"go.etcd.io/etcd/clientv3"
|
||||||
|
|
||||||
"github.com/container-storage-interface/spec/lib/go/csi"
|
"github.com/container-storage-interface/spec/lib/go/csi"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -112,34 +114,6 @@ func GetConnectionParams(params map[string]string) (map[string]string, []string,
|
|||||||
return ctxVars, etcdUrl, etcdPrefix
|
return ctxVars, etcdUrl, etcdPrefix
|
||||||
}
|
}
|
||||||
|
|
||||||
// invokeCLI runs the /usr/bin/vitastor-cli binary with the given arguments
// and returns whatever the command wrote to its standard output.
//
// Connection options are appended to args from ctxVars before the call:
// "--etcd_address" from ctxVars["etcdUrl"], "--etcd_prefix" from
// ctxVars["etcdPrefix"] and "--config_path" from ctxVars["configPath"].
// Each option is added only when the corresponding value is non-empty.
//
// If the command fails, the failure is logged through klog and a gRPC status
// error with code Internal is returned; its message is the captured stderr
// text followed by the error reported by Run. In that case the returned byte
// slice is nil.
func invokeCLI(ctxVars map[string]string, args []string) ([]byte, error)
|
|
||||||
{
|
|
||||||
if (ctxVars["etcdUrl"] != "")
|
|
||||||
{
|
|
||||||
args = append(args, "--etcd_address", ctxVars["etcdUrl"])
|
|
||||||
}
|
|
||||||
if (ctxVars["etcdPrefix"] != "")
|
|
||||||
{
|
|
||||||
args = append(args, "--etcd_prefix", ctxVars["etcdPrefix"])
|
|
||||||
}
|
|
||||||
if (ctxVars["configPath"] != "")
|
|
||||||
{
|
|
||||||
args = append(args, "--config_path", ctxVars["configPath"])
|
|
||||||
}
|
|
||||||
// The binary path is fixed; stdout and stderr are captured into separate buffers
c := exec.Command("/usr/bin/vitastor-cli", args...)
|
|
||||||
var stdout, stderr bytes.Buffer
|
|
||||||
c.Stdout = &stdout
|
|
||||||
c.Stderr = &stderr
|
|
||||||
err := c.Run()
|
|
||||||
stderrStr := string(stderr.Bytes())
|
|
||||||
if (err != nil)
|
|
||||||
{
|
|
||||||
// Log the full command line (including the appended connection options) together with stderr
klog.Errorf("vitastor-cli %s failed: %s, status %s\n", strings.Join(args, " "), stderrStr, err)
|
|
||||||
return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
|
|
||||||
}
|
|
||||||
return stdout.Bytes(), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create the volume
|
// Create the volume
|
||||||
func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) (*csi.CreateVolumeResponse, error)
|
func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) (*csi.CreateVolumeResponse, error)
|
||||||
{
|
{
|
||||||
@@ -172,41 +146,128 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
|
|||||||
volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
|
volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
|
||||||
}
|
}
|
||||||
|
|
||||||
ctxVars, etcdUrl, _ := GetConnectionParams(req.Parameters)
|
// FIXME: The following should PROBABLY be implemented externally in a management tool
|
||||||
|
|
||||||
|
ctxVars, etcdUrl, etcdPrefix := GetConnectionParams(req.Parameters)
|
||||||
if (len(etcdUrl) == 0)
|
if (len(etcdUrl) == 0)
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
|
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create image using vitastor-cli
|
// Connect to etcd
|
||||||
_, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) })
|
cli, err := clientv3.New(clientv3.Config{
|
||||||
|
DialTimeout: ETCD_TIMEOUT,
|
||||||
|
Endpoints: etcdUrl,
|
||||||
|
})
|
||||||
if (err != nil)
|
if (err != nil)
|
||||||
{
|
{
|
||||||
if (strings.Index(err.Error(), "already exists") > 0)
|
return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error())
|
||||||
|
}
|
||||||
|
defer cli.Close()
|
||||||
|
|
||||||
|
var imageId uint64 = 0
|
||||||
|
for
|
||||||
{
|
{
|
||||||
stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", volName })
|
// Check if the image exists
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||||
|
resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
|
||||||
|
cancel()
|
||||||
if (err != nil)
|
if (err != nil)
|
||||||
{
|
{
|
||||||
return nil, err
|
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||||
}
|
}
|
||||||
var inodeCfg []InodeConfig
|
if (len(resp.Kvs) > 0)
|
||||||
err = json.Unmarshal(stat, &inodeCfg)
|
{
|
||||||
|
kv := resp.Kvs[0]
|
||||||
|
var v InodeIndex
|
||||||
|
err := json.Unmarshal(kv.Value, &v)
|
||||||
if (err != nil)
|
if (err != nil)
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
|
return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
|
||||||
}
|
}
|
||||||
if (len(inodeCfg) == 0)
|
poolId = v.PoolId
|
||||||
|
imageId = v.Id
|
||||||
|
inodeCfgKey := fmt.Sprintf("/config/inode/%d/%d", poolId, imageId)
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||||
|
resp, err := cli.Get(ctx, etcdPrefix+inodeCfgKey)
|
||||||
|
cancel()
|
||||||
|
if (err != nil)
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.Internal, "vitastor-cli create said that image already exists, but ls can't find it")
|
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||||
}
|
}
|
||||||
if (inodeCfg[0].Size < uint64(volSize))
|
if (len(resp.Kvs) == 0)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "missing "+inodeCfgKey+" key in etcd")
|
||||||
|
}
|
||||||
|
var inodeCfg InodeConfig
|
||||||
|
err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
if (inodeCfg.Size < uint64(volSize))
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
|
return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
return nil, err
|
// Find a free ID
|
||||||
|
// Create image metadata in a transaction verifying that the image doesn't exist yet AND ID is still free
|
||||||
|
maxIdKey := fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||||
|
resp, err := cli.Get(ctx, maxIdKey)
|
||||||
|
cancel()
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
var modRev int64
|
||||||
|
var nextId uint64
|
||||||
|
if (len(resp.Kvs) > 0)
|
||||||
|
{
|
||||||
|
var err error
|
||||||
|
nextId, err = strconv.ParseUint(string(resp.Kvs[0].Value), 10, 64)
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, maxIdKey+" contains invalid ID")
|
||||||
|
}
|
||||||
|
modRev = resp.Kvs[0].ModRevision
|
||||||
|
nextId++
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
nextId = 1
|
||||||
|
}
|
||||||
|
inodeIdxJson, _ := json.Marshal(InodeIndex{
|
||||||
|
Id: nextId,
|
||||||
|
PoolId: poolId,
|
||||||
|
})
|
||||||
|
inodeCfgJson, _ := json.Marshal(InodeConfig{
|
||||||
|
Name: volName,
|
||||||
|
Size: uint64(volSize),
|
||||||
|
})
|
||||||
|
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||||
|
txnResp, err := cli.Txn(ctx).If(
|
||||||
|
clientv3.Compare(clientv3.ModRevision(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)), "=", modRev),
|
||||||
|
clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)), "=", 0),
|
||||||
|
clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId)), "=", 0),
|
||||||
|
).Then(
|
||||||
|
clientv3.OpPut(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId), fmt.Sprintf("%d", nextId)),
|
||||||
|
clientv3.OpPut(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName), string(inodeIdxJson)),
|
||||||
|
clientv3.OpPut(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId), string(inodeCfgJson)),
|
||||||
|
).Commit()
|
||||||
|
cancel()
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "failed to commit transaction in etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
if (txnResp.Succeeded)
|
||||||
|
{
|
||||||
|
imageId = nextId
|
||||||
|
break
|
||||||
|
}
|
||||||
|
// Start over if the transaction fails
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -238,12 +299,97 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
|
|||||||
}
|
}
|
||||||
volName := ctxVars["name"]
|
volName := ctxVars["name"]
|
||||||
|
|
||||||
ctxVars, _, _ = GetConnectionParams(ctxVars)
|
_, etcdUrl, etcdPrefix := GetConnectionParams(ctxVars)
|
||||||
|
if (len(etcdUrl) == 0)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
|
||||||
|
}
|
||||||
|
|
||||||
_, err = invokeCLI(ctxVars, []string{ "rm", volName })
|
cli, err := clientv3.New(clientv3.Config{
|
||||||
|
DialTimeout: ETCD_TIMEOUT,
|
||||||
|
Endpoints: etcdUrl,
|
||||||
|
})
|
||||||
if (err != nil)
|
if (err != nil)
|
||||||
{
|
{
|
||||||
return nil, err
|
return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error())
|
||||||
|
}
|
||||||
|
defer cli.Close()
|
||||||
|
|
||||||
|
// Find inode by name
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||||
|
resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
|
||||||
|
cancel()
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
if (len(resp.Kvs) == 0)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
|
||||||
|
}
|
||||||
|
var idx InodeIndex
|
||||||
|
err = json.Unmarshal(resp.Kvs[0].Value, &idx)
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get inode config
|
||||||
|
inodeCfgKey := fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)
|
||||||
|
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||||
|
resp, err = cli.Get(ctx, inodeCfgKey)
|
||||||
|
cancel()
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
if (len(resp.Kvs) == 0)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
|
||||||
|
}
|
||||||
|
var inodeCfg InodeConfig
|
||||||
|
err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Delete inode data by invoking vitastor-cli
|
||||||
|
args := []string{
|
||||||
|
"rm-data", "--etcd_address", strings.Join(etcdUrl, ","),
|
||||||
|
"--pool", fmt.Sprintf("%d", idx.PoolId),
|
||||||
|
"--inode", fmt.Sprintf("%d", idx.Id),
|
||||||
|
}
|
||||||
|
if (ctxVars["configPath"] != "")
|
||||||
|
{
|
||||||
|
args = append(args, "--config_path", ctxVars["configPath"])
|
||||||
|
}
|
||||||
|
c := exec.Command("/usr/bin/vitastor-cli", args...)
|
||||||
|
var stderr bytes.Buffer
|
||||||
|
c.Stdout = nil
|
||||||
|
c.Stderr = &stderr
|
||||||
|
err = c.Run()
|
||||||
|
stderrStr := string(stderr.Bytes())
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
klog.Errorf("vitastor-cli rm-data failed: %s, status %s\n", stderrStr, err)
|
||||||
|
return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Delete inode config in etcd
|
||||||
|
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||||
|
txnResp, err := cli.Txn(ctx).Then(
|
||||||
|
clientv3.OpDelete(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)),
|
||||||
|
clientv3.OpDelete(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)),
|
||||||
|
).Commit()
|
||||||
|
cancel()
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "failed to delete keys in etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
if (!txnResp.Succeeded)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "failed to delete keys in etcd: transaction failed")
|
||||||
}
|
}
|
||||||
|
|
||||||
return &csi.DeleteVolumeResponse{}, nil
|
return &csi.DeleteVolumeResponse{}, nil
|
||||||
|
4
debian/changelog
vendored
4
debian/changelog
vendored
@@ -1,10 +1,10 @@
|
|||||||
vitastor (0.8.6-1) unstable; urgency=medium
|
vitastor (0.8.5-1) unstable; urgency=medium
|
||||||
|
|
||||||
* Bugfixes
|
* Bugfixes
|
||||||
|
|
||||||
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
|
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
|
||||||
|
|
||||||
vitastor (0.8.6-1) unstable; urgency=medium
|
vitastor (0.8.5-1) unstable; urgency=medium
|
||||||
|
|
||||||
* Implement NFS proxy
|
* Implement NFS proxy
|
||||||
* Add documentation
|
* Add documentation
|
||||||
|
8
debian/vitastor.Dockerfile
vendored
8
debian/vitastor.Dockerfile
vendored
@@ -34,8 +34,8 @@ RUN set -e -x; \
|
|||||||
mkdir -p /root/packages/vitastor-$REL; \
|
mkdir -p /root/packages/vitastor-$REL; \
|
||||||
rm -rf /root/packages/vitastor-$REL/*; \
|
rm -rf /root/packages/vitastor-$REL/*; \
|
||||||
cd /root/packages/vitastor-$REL; \
|
cd /root/packages/vitastor-$REL; \
|
||||||
cp -r /root/vitastor vitastor-0.8.6; \
|
cp -r /root/vitastor vitastor-0.8.5; \
|
||||||
cd vitastor-0.8.6; \
|
cd vitastor-0.8.5; \
|
||||||
ln -s /root/fio-build/fio-*/ ./fio; \
|
ln -s /root/fio-build/fio-*/ ./fio; \
|
||||||
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||||
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
||||||
@@ -48,8 +48,8 @@ RUN set -e -x; \
|
|||||||
rm -rf a b; \
|
rm -rf a b; \
|
||||||
echo "dep:fio=$FIO" > debian/fio_version; \
|
echo "dep:fio=$FIO" > debian/fio_version; \
|
||||||
cd /root/packages/vitastor-$REL; \
|
cd /root/packages/vitastor-$REL; \
|
||||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.6.orig.tar.xz vitastor-0.8.6; \
|
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.5.orig.tar.xz vitastor-0.8.5; \
|
||||||
cd vitastor-0.8.6; \
|
cd vitastor-0.8.5; \
|
||||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
||||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||||
|
@@ -19,7 +19,6 @@ between clients, OSDs and etcd.
|
|||||||
- [rdma_max_sge](#rdma_max_sge)
|
- [rdma_max_sge](#rdma_max_sge)
|
||||||
- [rdma_max_msg](#rdma_max_msg)
|
- [rdma_max_msg](#rdma_max_msg)
|
||||||
- [rdma_max_recv](#rdma_max_recv)
|
- [rdma_max_recv](#rdma_max_recv)
|
||||||
- [rdma_max_send](#rdma_max_send)
|
|
||||||
- [peer_connect_interval](#peer_connect_interval)
|
- [peer_connect_interval](#peer_connect_interval)
|
||||||
- [peer_connect_timeout](#peer_connect_timeout)
|
- [peer_connect_timeout](#peer_connect_timeout)
|
||||||
- [osd_idle_timeout](#osd_idle_timeout)
|
- [osd_idle_timeout](#osd_idle_timeout)
|
||||||
@@ -75,12 +74,6 @@ to work. For example, Mellanox ConnectX-3 and older adapters don't have
|
|||||||
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
|
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
|
||||||
root to list available RDMA devices and their features.
|
root to list available RDMA devices and their features.
|
||||||
|
|
||||||
Remember that you also have to configure your network switches if you use
|
|
||||||
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
|
|
||||||
the manual of your network vendor for details about setting up the switch
|
|
||||||
for RoCEv2 correctly. Usually it means setting up Lossless Ethernet with
|
|
||||||
PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
|
|
||||||
|
|
||||||
## rdma_port_num
|
## rdma_port_num
|
||||||
|
|
||||||
- Type: integer
|
- Type: integer
|
||||||
@@ -123,30 +116,20 @@ required to change this parameter.
|
|||||||
## rdma_max_msg
|
## rdma_max_msg
|
||||||
|
|
||||||
- Type: integer
|
- Type: integer
|
||||||
- Default: 132096
|
- Default: 1048576
|
||||||
|
|
||||||
Maximum size of a single RDMA send or receive operation in bytes.
|
Maximum size of a single RDMA send or receive operation in bytes.
|
||||||
|
|
||||||
## rdma_max_recv
|
## rdma_max_recv
|
||||||
|
|
||||||
- Type: integer
|
|
||||||
- Default: 16
|
|
||||||
|
|
||||||
Maximum number of RDMA receive buffers per connection (RDMA requires
|
|
||||||
preallocated buffers to receive data). Each buffer is `rdma_max_msg` bytes
|
|
||||||
in size. So this setting directly affects memory usage: a single Vitastor
|
|
||||||
RDMA client uses `rdma_max_recv * rdma_max_msg * OSD_COUNT` bytes of memory.
|
|
||||||
Default is roughly 2 MB * number of OSDs.
|
|
||||||
|
|
||||||
## rdma_max_send
|
|
||||||
|
|
||||||
- Type: integer
|
- Type: integer
|
||||||
- Default: 8
|
- Default: 8
|
||||||
|
|
||||||
Maximum number of outstanding RDMA send operations per connection. Should be
|
Maximum number of parallel RDMA receive operations. Note that this number
|
||||||
less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
|
of receive buffers `rdma_max_msg` in size are allocated for each client,
|
||||||
Doesn't affect memory usage - additional memory isn't allocated for send
|
so this setting actually affects memory usage. This is because RDMA receive
|
||||||
operations.
|
operations are (sadly) still not zero-copy in Vitastor. It may be fixed in
|
||||||
|
later versions.
|
||||||
|
|
||||||
## peer_connect_interval
|
## peer_connect_interval
|
||||||
|
|
||||||
|
@@ -19,7 +19,6 @@
|
|||||||
- [rdma_max_sge](#rdma_max_sge)
|
- [rdma_max_sge](#rdma_max_sge)
|
||||||
- [rdma_max_msg](#rdma_max_msg)
|
- [rdma_max_msg](#rdma_max_msg)
|
||||||
- [rdma_max_recv](#rdma_max_recv)
|
- [rdma_max_recv](#rdma_max_recv)
|
||||||
- [rdma_max_send](#rdma_max_send)
|
|
||||||
- [peer_connect_interval](#peer_connect_interval)
|
- [peer_connect_interval](#peer_connect_interval)
|
||||||
- [peer_connect_timeout](#peer_connect_timeout)
|
- [peer_connect_timeout](#peer_connect_timeout)
|
||||||
- [osd_idle_timeout](#osd_idle_timeout)
|
- [osd_idle_timeout](#osd_idle_timeout)
|
||||||
@@ -79,13 +78,6 @@ Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Наприме
|
|||||||
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
|
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
|
||||||
параметры и возможности.
|
параметры и возможности.
|
||||||
|
|
||||||
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
|
|
||||||
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
|
|
||||||
нестабильной производительностью. Подробную информацию о настройке
|
|
||||||
коммутатора для RoCEv2 ищите в документации производителя. Обычно это
|
|
||||||
подразумевает настройку сети без потерь на основе PFC (Priority Flow
|
|
||||||
Control) и ECN (Explicit Congestion Notification).
|
|
||||||
|
|
||||||
## rdma_port_num
|
## rdma_port_num
|
||||||
|
|
||||||
- Тип: целое число
|
- Тип: целое число
|
||||||
@@ -129,32 +121,22 @@ OSD в любом случае согласовывают реальное зн
|
|||||||
## rdma_max_msg
|
## rdma_max_msg
|
||||||
|
|
||||||
- Тип: целое число
|
- Тип: целое число
|
||||||
- Значение по умолчанию: 132096
|
- Значение по умолчанию: 1048576
|
||||||
|
|
||||||
Максимальный размер одной RDMA-операции отправки или приёма.
|
Максимальный размер одной RDMA-операции отправки или приёма.
|
||||||
|
|
||||||
## rdma_max_recv
|
## rdma_max_recv
|
||||||
|
|
||||||
- Тип: целое число
|
|
||||||
- Значение по умолчанию: 16
|
|
||||||
|
|
||||||
Максимальное число буферов для RDMA-приёма данных на одно соединение
|
|
||||||
(RDMA требует заранее выделенных буферов для приёма данных). Каждый буфер
|
|
||||||
имеет размер `rdma_max_msg` байт. Таким образом, настройка прямо влияет на
|
|
||||||
потребление памяти - один Vitastor-клиент с RDMA использует
|
|
||||||
`rdma_max_recv * rdma_max_msg * ЧИСЛО_OSD` байт памяти, по умолчанию -
|
|
||||||
примерно 2 МБ * число OSD.
|
|
||||||
|
|
||||||
## rdma_max_send
|
|
||||||
|
|
||||||
- Тип: целое число
|
- Тип: целое число
|
||||||
- Значение по умолчанию: 8
|
- Значение по умолчанию: 8
|
||||||
|
|
||||||
Максимальное число RDMA-операций отправки, отправляемых в очередь одного
|
Максимальное число параллельных RDMA-операций получения данных. Следует
|
||||||
соединения. Желательно, чтобы оно было меньше `rdma_max_recv`, чтобы
|
иметь в виду, что данное число буферов размером `rdma_max_msg` выделяется
|
||||||
у принимающей стороны в процессе работы не заканчивались буферы на приём.
|
для каждого подключённого клиентского соединения, так что данная настройка
|
||||||
Не влияет на потребление памяти - дополнительная память на операции отправки
|
влияет на потребление памяти. Это так потому, что RDMA-приём данных в
|
||||||
не выделяется.
|
Vitastor, увы, всё равно не является zero-copy, т.е. всё равно 1 раз
|
||||||
|
копирует данные в памяти. Данная особенность, возможно, будет исправлена в
|
||||||
|
более новых версиях Vitastor.
|
||||||
|
|
||||||
## peer_connect_interval
|
## peer_connect_interval
|
||||||
|
|
||||||
|
@@ -53,12 +53,6 @@
|
|||||||
to work. For example, Mellanox ConnectX-3 and older adapters don't have
|
to work. For example, Mellanox ConnectX-3 and older adapters don't have
|
||||||
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
|
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
|
||||||
root to list available RDMA devices and their features.
|
root to list available RDMA devices and their features.
|
||||||
|
|
||||||
Remember that you also have to configure your network switches if you use
|
|
||||||
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
|
|
||||||
the manual of your network vendor for details about setting up the switch
|
|
||||||
for RoCEv2 correctly. Usually it means setting up Lossless Ethernet with
|
|
||||||
PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
|
|
||||||
info_ru: |
|
info_ru: |
|
||||||
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
|
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
|
||||||
Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
|
Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
|
||||||
@@ -67,13 +61,6 @@
|
|||||||
потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
|
потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
|
||||||
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
|
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
|
||||||
параметры и возможности.
|
параметры и возможности.
|
||||||
|
|
||||||
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
|
|
||||||
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
|
|
||||||
нестабильной производительностью. Подробную информацию о настройке
|
|
||||||
коммутатора для RoCEv2 ищите в документации производителя. Обычно это
|
|
||||||
подразумевает настройку сети без потерь на основе PFC (Priority Flow
|
|
||||||
Control) и ECN (Explicit Congestion Notification).
|
|
||||||
- name: rdma_port_num
|
- name: rdma_port_num
|
||||||
type: int
|
type: int
|
||||||
default: 1
|
default: 1
|
||||||
@@ -127,39 +114,26 @@
|
|||||||
так что менять этот параметр обычно не нужно.
|
так что менять этот параметр обычно не нужно.
|
||||||
- name: rdma_max_msg
|
- name: rdma_max_msg
|
||||||
type: int
|
type: int
|
||||||
default: 132096
|
default: 1048576
|
||||||
info: Maximum size of a single RDMA send or receive operation in bytes.
|
info: Maximum size of a single RDMA send or receive operation in bytes.
|
||||||
info_ru: Максимальный размер одной RDMA-операции отправки или приёма.
|
info_ru: Максимальный размер одной RDMA-операции отправки или приёма.
|
||||||
- name: rdma_max_recv
|
- name: rdma_max_recv
|
||||||
type: int
|
|
||||||
default: 16
|
|
||||||
info: |
|
|
||||||
Maximum number of RDMA receive buffers per connection (RDMA requires
|
|
||||||
preallocated buffers to receive data). Each buffer is `rdma_max_msg` bytes
|
|
||||||
in size. So this setting directly affects memory usage: a single Vitastor
|
|
||||||
RDMA client uses `rdma_max_recv * rdma_max_msg * OSD_COUNT` bytes of memory.
|
|
||||||
Default is roughly 2 MB * number of OSDs.
|
|
||||||
info_ru: |
|
|
||||||
Максимальное число буферов для RDMA-приёма данных на одно соединение
|
|
||||||
(RDMA требует заранее выделенных буферов для приёма данных). Каждый буфер
|
|
||||||
имеет размер `rdma_max_msg` байт. Таким образом, настройка прямо влияет на
|
|
||||||
потребление памяти - один Vitastor-клиент с RDMA использует
|
|
||||||
`rdma_max_recv * rdma_max_msg * ЧИСЛО_OSD` байт памяти, по умолчанию -
|
|
||||||
примерно 2 МБ * число OSD.
|
|
||||||
- name: rdma_max_send
|
|
||||||
type: int
|
type: int
|
||||||
default: 8
|
default: 8
|
||||||
info: |
|
info: |
|
||||||
Maximum number of outstanding RDMA send operations per connection. Should be
|
Maximum number of parallel RDMA receive operations. Note that this number
|
||||||
less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
|
of receive buffers `rdma_max_msg` in size are allocated for each client,
|
||||||
Doesn't affect memory usage - additional memory isn't allocated for send
|
so this setting actually affects memory usage. This is because RDMA receive
|
||||||
operations.
|
operations are (sadly) still not zero-copy in Vitastor. It may be fixed in
|
||||||
|
later versions.
|
||||||
info_ru: |
|
info_ru: |
|
||||||
Максимальное число RDMA-операций отправки, отправляемых в очередь одного
|
Максимальное число параллельных RDMA-операций получения данных. Следует
|
||||||
соединения. Желательно, чтобы оно было меньше `rdma_max_recv`, чтобы
|
иметь в виду, что данное число буферов размером `rdma_max_msg` выделяется
|
||||||
у принимающей стороны в процессе работы не заканчивались буферы на приём.
|
для каждого подключённого клиентского соединения, так что данная настройка
|
||||||
Не влияет на потребление памяти - дополнительная память на операции отправки
|
влияет на потребление памяти. Это так потому, что RDMA-приём данных в
|
||||||
не выделяется.
|
Vitastor, увы, всё равно не является zero-copy, т.е. всё равно 1 раз
|
||||||
|
копирует данные в памяти. Данная особенность, возможно, будет исправлена в
|
||||||
|
более новых версиях Vitastor.
|
||||||
- name: peer_connect_interval
|
- name: peer_connect_interval
|
||||||
type: sec
|
type: sec
|
||||||
min: 1
|
min: 1
|
||||||
|
@@ -35,24 +35,15 @@ Write amplification for 4 KB blocks is usually 3-5 in Vitastor:
|
|||||||
If you manage to get an SSD which handles 512 byte blocks well (Optane?) you may
|
If you manage to get an SSD which handles 512 byte blocks well (Optane?) you may
|
||||||
lower 1, 3 and 4 to 512 bytes (1/8 of data size) and get WA as low as 2.375.
|
lower 1, 3 and 4 to 512 bytes (1/8 of data size) and get WA as low as 2.375.
|
||||||
|
|
||||||
Implementing NVDIMM support could basically eliminate WA altogether - all extra writes would
|
|
||||||
go to DRAM memory. But this requires a test cluster with NVDIMM - please contact me
|
|
||||||
if you want to provide me with such a cluster for tests.
|
|
||||||
|
|
||||||
Lazy fsync also reduces WA for parallel workloads because journal blocks are only
|
Lazy fsync also reduces WA for parallel workloads because journal blocks are only
|
||||||
written when they fill up or fsync is requested.
|
written when they fill up or fsync is requested.
|
||||||
|
|
||||||
## In Practice
|
## In Practice
|
||||||
|
|
||||||
In practice, using tests from [Understanding Performance](understanding.en.md), decent TCP network,
|
In practice, using tests from [Understanding Performance](understanding.en.md)
|
||||||
good server-grade SSD/NVMe drives and disabled CPU power saving, you should head for:
|
and good server-grade SSD/NVMe drives, you should head for:
|
||||||
- At least 5000 T1Q1 replicated read and write iops (maximum 0.2ms latency)
|
- At least 5000 T1Q1 replicated read and write iops (maximum 0.2ms latency)
|
||||||
- At least 5000 T1Q1 EC read IOPS and at least 2200 EC write IOPS (maximum 0.45ms latency)
|
|
||||||
- At least ~80k parallel read iops or ~30k write iops per 1 core (1 OSD)
|
- At least ~80k parallel read iops or ~30k write iops per 1 core (1 OSD)
|
||||||
- Disk-speed or wire-speed linear reads and writes, whichever is the bottleneck in your case
|
- Disk-speed or wire-speed linear reads and writes, whichever is the bottleneck in your case
|
||||||
|
|
||||||
Lower results may mean that you have bad drives, bad network or some kind of misconfiguration.
|
Lower results may mean that you have bad drives, bad network or some kind of misconfiguration.
|
||||||
|
|
||||||
Current latency records:
|
|
||||||
- 9668 T1Q1 replicated write iops (0.103 ms latency) with TCP and NVMe
|
|
||||||
- 9143 T1Q1 replicated read iops (0.109 ms latency) with TCP and NVMe
|
|
||||||
|
@@ -36,25 +36,6 @@ WA (мультипликатор записи) для 4 КБ блоков в Vit
|
|||||||
Если вы найдёте SSD, хорошо работающий с 512-байтными блоками данных (Optane?),
|
Если вы найдёте SSD, хорошо работающий с 512-байтными блоками данных (Optane?),
|
||||||
то 1, 3 и 4 можно снизить до 512 байт (1/8 от размера данных) и получить WA всего 2.375.
|
то 1, 3 и 4 можно снизить до 512 байт (1/8 от размера данных) и получить WA всего 2.375.
|
||||||
|
|
||||||
Если реализовать поддержку NVDIMM, то WA можно, условно говоря, ликвидировать вообще - все
|
|
||||||
дополнительные операции записи смогут обслуживаться DRAM памятью. Но для этого необходим
|
|
||||||
тестовый кластер с NVDIMM - пишите, если готовы предоставить такой для тестов.
|
|
||||||
|
|
||||||
Кроме того, WA снижается при использовании отложенного/ленивого сброса при параллельной
|
Кроме того, WA снижается при использовании отложенного/ленивого сброса при параллельной
|
||||||
нагрузке, т.к. блоки журнала записываются на диск только когда они заполняются или явным
|
нагрузке, т.к. блоки журнала записываются на диск только когда они заполняются или явным
|
||||||
образом запрашивается fsync.
|
образом запрашивается fsync.
|
||||||
|
|
||||||
## На практике
|
|
||||||
|
|
||||||
На практике, используя тесты fio со страницы [Понимание сути производительности систем хранения](understanding.ru.md),
|
|
||||||
нормальную TCP-сеть, хорошие серверные SSD/NVMe, при отключённом энергосбережении процессоров вы можете рассчитывать на:
|
|
||||||
- От 5000 IOPS в 1 поток (T1Q1) и на чтение, и на запись при использовании репликации (задержка до 0.2мс)
|
|
||||||
- От 5000 IOPS в 1 поток (T1Q1) на чтение и 2200 IOPS в 1 поток на запись при использовании EC (задержка до 0.45мс)
|
|
||||||
- От 80000 IOPS на чтение в параллельном режиме на 1 ядро, от 30000 IOPS на запись на 1 ядро (на 1 OSD)
|
|
||||||
- Скорость параллельного линейного чтения и записи, равная меньшему значению из скорости дисков или сети
|
|
||||||
|
|
||||||
Худшие результаты означают, что у вас либо медленные диски, либо медленная сеть, либо что-то неправильно настроено.
|
|
||||||
|
|
||||||
Зафиксированный на данный момент рекорд задержки:
|
|
||||||
- 9668 IOPS (0.103 мс задержка) в 1 поток (T1Q1) на запись с TCP и NVMe при использовании репликации
|
|
||||||
- 9143 IOPS (0.109 мс задержка) в 1 поток (T1Q1) на чтение с TCP и NVMe при использовании репликации
|
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
[Documentation](../../README.md#documentation) → Usage → Disk management tool
|
[Documentation](../../README.md#documentation) → Usage → Disk Tool
|
||||||
|
|
||||||
-----
|
-----
|
||||||
|
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
[Документация](../../README-ru.md#документация) → Использование → Инструмент управления дисками
|
[Документация](../../README-ru.md#документация) → Использование → Управление дисками
|
||||||
|
|
||||||
-----
|
-----
|
||||||
|
|
||||||
|
@@ -70,8 +70,8 @@ const etcd_tree = {
|
|||||||
rdma_gid_index: 0,
|
rdma_gid_index: 0,
|
||||||
rdma_mtu: 4096,
|
rdma_mtu: 4096,
|
||||||
rdma_max_sge: 128,
|
rdma_max_sge: 128,
|
||||||
rdma_max_send: 8,
|
rdma_max_send: 64,
|
||||||
rdma_max_recv: 16,
|
rdma_max_recv: 128,
|
||||||
rdma_max_msg: 132096,
|
rdma_max_msg: 132096,
|
||||||
log_level: 0,
|
log_level: 0,
|
||||||
block_size: 131072,
|
block_size: 131072,
|
||||||
|
@@ -50,7 +50,7 @@ from cinder.volume import configuration
|
|||||||
from cinder.volume import driver
|
from cinder.volume import driver
|
||||||
from cinder.volume import volume_utils
|
from cinder.volume import volume_utils
|
||||||
|
|
||||||
VERSION = '0.8.6'
|
VERSION = '0.8.5'
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@@ -25,4 +25,4 @@ rm fio
|
|||||||
mv fio-copy fio
|
mv fio-copy fio
|
||||||
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||||
tar --transform 's#^#vitastor-0.8.6/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.6$(rpm --eval '%dist').tar.gz *
|
tar --transform 's#^#vitastor-0.8.5/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.5$(rpm --eval '%dist').tar.gz *
|
||||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
|||||||
RUN set -e; \
|
RUN set -e; \
|
||||||
cd /root/vitastor/rpm; \
|
cd /root/vitastor/rpm; \
|
||||||
sh build-tarball.sh; \
|
sh build-tarball.sh; \
|
||||||
cp /root/vitastor-0.8.6.el7.tar.gz ~/rpmbuild/SOURCES; \
|
cp /root/vitastor-0.8.5.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||||
cd ~/rpmbuild/SPECS/; \
|
cd ~/rpmbuild/SPECS/; \
|
||||||
rpmbuild -ba vitastor.spec; \
|
rpmbuild -ba vitastor.spec; \
|
||||||
|
@@ -1,11 +1,11 @@
|
|||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 0.8.6
|
Version: 0.8.5
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-0.8.6.el7.tar.gz
|
Source0: vitastor-0.8.5.el7.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
|||||||
RUN set -e; \
|
RUN set -e; \
|
||||||
cd /root/vitastor/rpm; \
|
cd /root/vitastor/rpm; \
|
||||||
sh build-tarball.sh; \
|
sh build-tarball.sh; \
|
||||||
cp /root/vitastor-0.8.6.el8.tar.gz ~/rpmbuild/SOURCES; \
|
cp /root/vitastor-0.8.5.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||||
cd ~/rpmbuild/SPECS/; \
|
cd ~/rpmbuild/SPECS/; \
|
||||||
rpmbuild -ba vitastor.spec; \
|
rpmbuild -ba vitastor.spec; \
|
||||||
|
@@ -1,11 +1,11 @@
|
|||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 0.8.6
|
Version: 0.8.5
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-0.8.6.el8.tar.gz
|
Source0: vitastor-0.8.5.el8.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
cmake_minimum_required(VERSION 2.8.12)
|
cmake_minimum_required(VERSION 2.8)
|
||||||
|
|
||||||
project(vitastor)
|
project(vitastor)
|
||||||
|
|
||||||
@@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
|||||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
add_definitions(-DVERSION="0.8.6")
|
add_definitions(-DVERSION="0.8.5")
|
||||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
|
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
|
||||||
if (${WITH_ASAN})
|
if (${WITH_ASAN})
|
||||||
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
||||||
|
@@ -281,7 +281,7 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
|
|||||||
if (je->big_write.size > sizeof(journal_entry_big_write))
|
if (je->big_write.size > sizeof(journal_entry_big_write))
|
||||||
{
|
{
|
||||||
printf(json ? ",\"bitmap\":\"" : " (bitmap: ");
|
printf(json ? ",\"bitmap\":\"" : " (bitmap: ");
|
||||||
for (int i = sizeof(journal_entry_big_write); i < je->big_write.size; i++)
|
for (int i = sizeof(journal_entry_big_write); i < je->small_write.size; i++)
|
||||||
{
|
{
|
||||||
printf("%02x", ((uint8_t*)je)[i]);
|
printf("%02x", ((uint8_t*)je)[i]);
|
||||||
}
|
}
|
||||||
|
@@ -26,7 +26,7 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v1_t *)>
|
|||||||
buf_size = dsk.meta_len;
|
buf_size = dsk.meta_len;
|
||||||
void *data = memalign_or_die(MEM_ALIGNMENT, buf_size);
|
void *data = memalign_or_die(MEM_ALIGNMENT, buf_size);
|
||||||
lseek64(dsk.meta_fd, dsk.meta_offset, 0);
|
lseek64(dsk.meta_fd, dsk.meta_offset, 0);
|
||||||
read_blocking(dsk.meta_fd, data, dsk.meta_block_size);
|
read_blocking(dsk.meta_fd, data, buf_size);
|
||||||
// Check superblock
|
// Check superblock
|
||||||
blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)data;
|
blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)data;
|
||||||
if (hdr->zero == 0 &&
|
if (hdr->zero == 0 &&
|
||||||
@@ -41,11 +41,8 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v1_t *)>
|
|||||||
if (buf_size % dsk.meta_block_size)
|
if (buf_size % dsk.meta_block_size)
|
||||||
{
|
{
|
||||||
buf_size = 8*dsk.meta_block_size;
|
buf_size = 8*dsk.meta_block_size;
|
||||||
void *new_data = memalign_or_die(MEM_ALIGNMENT, buf_size);
|
|
||||||
memcpy(new_data, data, dsk.meta_block_size);
|
|
||||||
free(data);
|
free(data);
|
||||||
data = new_data;
|
data = memalign_or_die(MEM_ALIGNMENT, buf_size);
|
||||||
hdr = (blockstore_meta_header_v1_t *)data;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
dsk.bitmap_granularity = hdr->bitmap_granularity;
|
dsk.bitmap_granularity = hdr->bitmap_granularity;
|
||||||
|
@@ -157,10 +157,10 @@ void osd_messenger_t::parse_config(const json11::Json & config)
|
|||||||
this->rdma_max_sge = 128;
|
this->rdma_max_sge = 128;
|
||||||
this->rdma_max_send = config["rdma_max_send"].uint64_value();
|
this->rdma_max_send = config["rdma_max_send"].uint64_value();
|
||||||
if (!this->rdma_max_send)
|
if (!this->rdma_max_send)
|
||||||
this->rdma_max_send = 8;
|
this->rdma_max_send = 64;
|
||||||
this->rdma_max_recv = config["rdma_max_recv"].uint64_value();
|
this->rdma_max_recv = config["rdma_max_recv"].uint64_value();
|
||||||
if (!this->rdma_max_recv)
|
if (!this->rdma_max_recv)
|
||||||
this->rdma_max_recv = 16;
|
this->rdma_max_recv = 128;
|
||||||
this->rdma_max_msg = config["rdma_max_msg"].uint64_value();
|
this->rdma_max_msg = config["rdma_max_msg"].uint64_value();
|
||||||
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
|
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
|
||||||
this->rdma_max_msg = 129*1024;
|
this->rdma_max_msg = 129*1024;
|
||||||
|
@@ -134,6 +134,7 @@ protected:
|
|||||||
msgr_rdma_context_t *rdma_context = NULL;
|
msgr_rdma_context_t *rdma_context = NULL;
|
||||||
uint64_t rdma_max_sge = 0, rdma_max_send = 0, rdma_max_recv = 0;
|
uint64_t rdma_max_sge = 0, rdma_max_send = 0, rdma_max_recv = 0;
|
||||||
uint64_t rdma_max_msg = 0;
|
uint64_t rdma_max_msg = 0;
|
||||||
|
std::vector<rdma_hb_t> rdma_handle_buffers;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
std::vector<int> read_ready_clients;
|
std::vector<int> read_ready_clients;
|
||||||
|
@@ -353,10 +353,8 @@ static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
|
|||||||
.wr_id = (uint64_t)(cl->peer_fd*2+1),
|
.wr_id = (uint64_t)(cl->peer_fd*2+1),
|
||||||
.sg_list = sge,
|
.sg_list = sge,
|
||||||
.num_sge = op_sge,
|
.num_sge = op_sge,
|
||||||
.opcode = cl->rdma_conn->avail_recv > 0 ? IBV_WR_SEND_WITH_IMM : IBV_WR_SEND,
|
.opcode = IBV_WR_SEND,
|
||||||
.send_flags = IBV_SEND_SIGNALED,
|
.send_flags = IBV_SEND_SIGNALED,
|
||||||
// Notify peer about our available incoming buffers
|
|
||||||
.imm_data = (uint32_t)cl->rdma_conn->avail_recv,
|
|
||||||
};
|
};
|
||||||
int err = ibv_post_send(cl->rdma_conn->qp, &wr, &bad_wr);
|
int err = ibv_post_send(cl->rdma_conn->qp, &wr, &bad_wr);
|
||||||
if (err || bad_wr)
|
if (err || bad_wr)
|
||||||
@@ -365,27 +363,15 @@ static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
cl->rdma_conn->cur_send++;
|
cl->rdma_conn->cur_send++;
|
||||||
cl->rdma_conn->avail_send--;
|
|
||||||
cl->rdma_conn->avail_recv = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
|
bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
|
||||||
{
|
{
|
||||||
auto rc = cl->rdma_conn;
|
auto rc = cl->rdma_conn;
|
||||||
if (rc->cur_send >= rc->max_send)
|
if (!cl->send_list.size() || rc->cur_send >= rc->max_send)
|
||||||
{
|
{
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (!cl->send_list.size() || rc->use_flow_control && rc->avail_send <= 0)
|
|
||||||
{
|
|
||||||
if (rc->avail_recv)
|
|
||||||
{
|
|
||||||
// Only notify about available buffers so 2 peers don't lock each other
|
|
||||||
rc->send_sizes.push_back(0);
|
|
||||||
try_send_rdma_wr(cl, NULL, 0);
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
uint64_t op_size = 0, op_sge = 0;
|
uint64_t op_size = 0, op_sge = 0;
|
||||||
ibv_sge sge[rc->max_sge];
|
ibv_sge sge[rc->max_sge];
|
||||||
while (rc->send_pos < cl->send_list.size())
|
while (rc->send_pos < cl->send_list.size())
|
||||||
@@ -445,7 +431,6 @@ static void try_recv_rdma_wr(osd_client_t *cl, void *buf)
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
cl->rdma_conn->cur_recv++;
|
cl->rdma_conn->cur_recv++;
|
||||||
cl->rdma_conn->avail_recv++;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
|
bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
|
||||||
@@ -507,19 +492,9 @@ void osd_messenger_t::handle_rdma_events()
|
|||||||
if (!is_send)
|
if (!is_send)
|
||||||
{
|
{
|
||||||
rc->cur_recv--;
|
rc->cur_recv--;
|
||||||
if ((wc[i].wc_flags & IBV_WC_WITH_IMM) && wc[i].imm_data > 0)
|
rdma_handle_buffers.push_back((rdma_hb_t){ .peer_fd = client_id, .buf = rc->recv_buffers[0], .len = wc[i].byte_len });
|
||||||
{
|
rc->recv_buffers.erase(rc->recv_buffers.begin(), rc->recv_buffers.begin()+1);
|
||||||
rc->avail_send += wc[i].imm_data;
|
try_recv_rdma(cl);
|
||||||
rc->use_flow_control = true;
|
|
||||||
}
|
|
||||||
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf], wc[i].byte_len))
|
|
||||||
{
|
|
||||||
// handle_read_buffer may stop the client
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
try_recv_rdma_wr(cl, rc->recv_buffers[rc->next_recv_buf]);
|
|
||||||
rc->next_recv_buf = (rc->next_recv_buf+1) % rc->recv_buffers.size();
|
|
||||||
try_send_rdma(cl);
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@@ -568,6 +543,16 @@ void osd_messenger_t::handle_rdma_events()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} while (event_count > 0);
|
} while (event_count > 0);
|
||||||
|
for (auto & hb: rdma_handle_buffers)
|
||||||
|
{
|
||||||
|
auto cl_it = clients.find(hb.peer_fd);
|
||||||
|
if (cl_it != clients.end())
|
||||||
|
{
|
||||||
|
handle_read_buffer(cl_it->second, hb.buf, hb.len);
|
||||||
|
}
|
||||||
|
free(hb.buf);
|
||||||
|
}
|
||||||
|
rdma_handle_buffers.clear();
|
||||||
for (auto cb: set_immediate)
|
for (auto cb: set_immediate)
|
||||||
{
|
{
|
||||||
cb();
|
cb();
|
||||||
|
@@ -45,8 +45,7 @@ struct msgr_rdma_connection_t
|
|||||||
ibv_qp *qp = NULL;
|
ibv_qp *qp = NULL;
|
||||||
msgr_rdma_address_t addr;
|
msgr_rdma_address_t addr;
|
||||||
int max_send = 0, max_recv = 0, max_sge = 0;
|
int max_send = 0, max_recv = 0, max_sge = 0;
|
||||||
int cur_send = 0, cur_recv = 0, avail_recv = 0, avail_send = 0;
|
int cur_send = 0, cur_recv = 0;
|
||||||
bool use_flow_control = false;
|
|
||||||
uint64_t max_msg = 0;
|
uint64_t max_msg = 0;
|
||||||
|
|
||||||
int send_pos = 0, send_buf_pos = 0;
|
int send_pos = 0, send_buf_pos = 0;
|
||||||
@@ -58,3 +57,10 @@ struct msgr_rdma_connection_t
|
|||||||
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);
|
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);
|
||||||
int connect(msgr_rdma_address_t *dest);
|
int connect(msgr_rdma_address_t *dest);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct rdma_hb_t
|
||||||
|
{
|
||||||
|
int peer_fd;
|
||||||
|
void *buf;
|
||||||
|
uint64_t len;
|
||||||
|
};
|
||||||
|
@@ -44,10 +44,9 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
|
|||||||
// FIXME: Use timerfd_interval based directly on io_uring
|
// FIXME: Use timerfd_interval based directly on io_uring
|
||||||
this->tfd = epmgr->tfd;
|
this->tfd = epmgr->tfd;
|
||||||
|
|
||||||
if (!json_is_true(this->config["disable_blockstore"]))
|
|
||||||
{
|
|
||||||
auto bs_cfg = json_to_bs(this->config);
|
auto bs_cfg = json_to_bs(this->config);
|
||||||
this->bs = new blockstore_t(bs_cfg, ringloop, tfd);
|
this->bs = new blockstore_t(bs_cfg, ringloop, tfd);
|
||||||
|
{
|
||||||
// Autosync based on the number of unstable writes to prevent stalls due to insufficient journal space
|
// Autosync based on the number of unstable writes to prevent stalls due to insufficient journal space
|
||||||
uint64_t max_autosync = bs->get_journal_size() / bs->get_block_size() / 2;
|
uint64_t max_autosync = bs->get_journal_size() / bs->get_block_size() / 2;
|
||||||
if (autosync_writes > max_autosync)
|
if (autosync_writes > max_autosync)
|
||||||
@@ -94,7 +93,6 @@ osd_t::~osd_t()
|
|||||||
{
|
{
|
||||||
ringloop->unregister_consumer(&consumer);
|
ringloop->unregister_consumer(&consumer);
|
||||||
delete epmgr;
|
delete epmgr;
|
||||||
if (bs)
|
|
||||||
delete bs;
|
delete bs;
|
||||||
close(listen_fd);
|
close(listen_fd);
|
||||||
free(zero_buffer);
|
free(zero_buffer);
|
||||||
@@ -477,7 +475,7 @@ void osd_t::print_slow()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (has_slow && bs)
|
if (has_slow)
|
||||||
{
|
{
|
||||||
bs->dump_diagnostics();
|
bs->dump_diagnostics();
|
||||||
}
|
}
|
||||||
|
@@ -152,7 +152,7 @@ class osd_t
|
|||||||
|
|
||||||
bool stopping = false;
|
bool stopping = false;
|
||||||
int inflight_ops = 0;
|
int inflight_ops = 0;
|
||||||
blockstore_t *bs = NULL;
|
blockstore_t *bs;
|
||||||
void *zero_buffer = NULL;
|
void *zero_buffer = NULL;
|
||||||
uint64_t zero_buffer_size = 0;
|
uint64_t zero_buffer_size = 0;
|
||||||
uint32_t bs_block_size, bs_bitmap_granularity, clean_entry_bitmap_size;
|
uint32_t bs_block_size, bs_bitmap_granularity, clean_entry_bitmap_size;
|
||||||
|
@@ -182,10 +182,10 @@ json11::Json osd_t::get_statistics()
|
|||||||
char time_str[50] = { 0 };
|
char time_str[50] = { 0 };
|
||||||
sprintf(time_str, "%ld.%03ld", ts.tv_sec, ts.tv_nsec/1000000);
|
sprintf(time_str, "%ld.%03ld", ts.tv_sec, ts.tv_nsec/1000000);
|
||||||
st["time"] = time_str;
|
st["time"] = time_str;
|
||||||
if (bs)
|
|
||||||
{
|
|
||||||
st["blockstore_ready"] = bs->is_started();
|
st["blockstore_ready"] = bs->is_started();
|
||||||
st["data_block_size"] = (uint64_t)bs->get_block_size();
|
st["data_block_size"] = (uint64_t)bs->get_block_size();
|
||||||
|
if (bs)
|
||||||
|
{
|
||||||
st["size"] = bs->get_block_count() * bs->get_block_size();
|
st["size"] = bs->get_block_count() * bs->get_block_size();
|
||||||
st["free"] = bs->get_free_block_count() * bs->get_block_size();
|
st["free"] = bs->get_free_block_count() * bs->get_block_size();
|
||||||
}
|
}
|
||||||
@@ -233,8 +233,7 @@ void osd_t::report_statistics()
|
|||||||
json11::Json::object inode_space;
|
json11::Json::object inode_space;
|
||||||
json11::Json::object last_stat;
|
json11::Json::object last_stat;
|
||||||
pool_id_t last_pool = 0;
|
pool_id_t last_pool = 0;
|
||||||
std::map<uint64_t, uint64_t> bs_empty_space;
|
auto & bs_inode_space = bs->get_inode_space_stats();
|
||||||
auto & bs_inode_space = bs ? bs->get_inode_space_stats() : bs_empty_space;
|
|
||||||
for (auto kv: bs_inode_space)
|
for (auto kv: bs_inode_space)
|
||||||
{
|
{
|
||||||
pool_id_t pool_id = INODE_POOL(kv.first);
|
pool_id_t pool_id = INODE_POOL(kv.first);
|
||||||
|
@@ -53,10 +53,7 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
|
|||||||
inode_stats[cur_op->req.rw.inode].op_count[inode_st_op]++;
|
inode_stats[cur_op->req.rw.inode].op_count[inode_st_op]++;
|
||||||
inode_stats[cur_op->req.rw.inode].op_sum[inode_st_op] += usec;
|
inode_stats[cur_op->req.rw.inode].op_sum[inode_st_op] += usec;
|
||||||
if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
|
if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
|
||||||
{
|
|
||||||
if (cur_op->op_data)
|
|
||||||
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->op_data->pg_data_size * bs_block_size;
|
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->op_data->pg_data_size * bs_block_size;
|
||||||
}
|
|
||||||
else
|
else
|
||||||
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->req.rw.len;
|
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->req.rw.len;
|
||||||
}
|
}
|
||||||
|
@@ -759,18 +759,7 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
|
|||||||
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_granularity,
|
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_granularity,
|
||||||
uint32_t &start, uint32_t &end)
|
uint32_t &start, uint32_t &end)
|
||||||
{
|
{
|
||||||
bool required = false;
|
if (write_osd_set[pg_minsize] != 0 || write_osd_set != read_osd_set)
|
||||||
for (int role = pg_minsize; role < pg_size; role++)
|
|
||||||
{
|
|
||||||
if (write_osd_set[role] != 0)
|
|
||||||
{
|
|
||||||
// Whole parity chunk is needed when we move the object
|
|
||||||
if (write_osd_set[role] != read_osd_set[role])
|
|
||||||
end = chunk_size;
|
|
||||||
required = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (required && end != chunk_size)
|
|
||||||
{
|
{
|
||||||
// start & end are required for calc_rmw_parity
|
// start & end are required for calc_rmw_parity
|
||||||
for (int role = 0; role < pg_minsize; role++)
|
for (int role = 0; role < pg_minsize; role++)
|
||||||
@@ -781,6 +770,14 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
|
|||||||
end = std::max(stripes[role].req_end, end);
|
end = std::max(stripes[role].req_end, end);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
for (int role = pg_minsize; role < pg_size; role++)
|
||||||
|
{
|
||||||
|
if (write_osd_set[role] != 0 && write_osd_set[role] != read_osd_set[role])
|
||||||
|
{
|
||||||
|
start = 0;
|
||||||
|
end = chunk_size;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// Set bitmap bits accordingly
|
// Set bitmap bits accordingly
|
||||||
if (bitmap_granularity > 0)
|
if (bitmap_granularity > 0)
|
||||||
|
@@ -24,7 +24,7 @@ void test11();
|
|||||||
void test12();
|
void test12();
|
||||||
void test13();
|
void test13();
|
||||||
void test14();
|
void test14();
|
||||||
void test15(bool second);
|
void test15();
|
||||||
void test16();
|
void test16();
|
||||||
|
|
||||||
int main(int narg, char *args[])
|
int main(int narg, char *args[])
|
||||||
@@ -54,8 +54,7 @@ int main(int narg, char *args[])
|
|||||||
// Test 14
|
// Test 14
|
||||||
test14();
|
test14();
|
||||||
// Test 15
|
// Test 15
|
||||||
test15(false);
|
test15();
|
||||||
test15(true);
|
|
||||||
// Test 16
|
// Test 16
|
||||||
test16();
|
test16();
|
||||||
// End
|
// End
|
||||||
@@ -827,11 +826,12 @@ void test14()
|
|||||||
|
|
||||||
***/
|
***/
|
||||||
|
|
||||||
void test15(bool second)
|
void test15()
|
||||||
{
|
{
|
||||||
const int bmp = 64*1024 / 4096 / 8;
|
const int bmp = 64*1024 / 4096 / 8;
|
||||||
use_ec(4, 2, true);
|
use_ec(4, 2, true);
|
||||||
osd_num_t osd_set[4] = { 1, 2, (osd_num_t)(second ? 0 : 3), (osd_num_t)(second ? 4 : 0) };
|
osd_num_t osd_set[4] = { 1, 2, 3, 0 };
|
||||||
|
osd_num_t write_osd_set[4] = { 1, 2, 3, 0 };
|
||||||
osd_rmw_stripe_t stripes[4] = {};
|
osd_rmw_stripe_t stripes[4] = {};
|
||||||
unsigned bitmaps[4] = { 0 };
|
unsigned bitmaps[4] = { 0 };
|
||||||
// Test 15.0
|
// Test 15.0
|
||||||
@@ -842,7 +842,7 @@ void test15(bool second)
|
|||||||
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
|
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
|
||||||
assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
|
assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
|
||||||
// Test 15.1
|
// Test 15.1
|
||||||
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, osd_set, 64*1024, bmp);
|
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, write_osd_set, 64*1024, bmp);
|
||||||
for (int i = 0; i < 4; i++)
|
for (int i = 0; i < 4; i++)
|
||||||
stripes[i].bmp_buf = bitmaps+i;
|
stripes[i].bmp_buf = bitmaps+i;
|
||||||
assert(rmw_buf);
|
assert(rmw_buf);
|
||||||
@@ -852,34 +852,32 @@ void test15(bool second)
|
|||||||
assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
|
assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
|
||||||
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
|
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
|
||||||
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
|
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
|
||||||
assert(stripes[2+second].write_start == 28*1024 && stripes[2+second].write_end == 32*1024);
|
assert(stripes[2].write_start == 28*1024 && stripes[2].write_end == 32*1024);
|
||||||
assert(stripes[3-second].write_start == 0 && stripes[3-second].write_end == 0);
|
assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
|
||||||
assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
|
assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
|
||||||
assert(stripes[1].read_buf == NULL);
|
assert(stripes[1].read_buf == NULL);
|
||||||
assert(stripes[2].read_buf == NULL);
|
assert(stripes[2].read_buf == NULL);
|
||||||
assert(stripes[3].read_buf == NULL);
|
assert(stripes[3].read_buf == NULL);
|
||||||
assert(stripes[0].write_buf == NULL);
|
assert(stripes[0].write_buf == NULL);
|
||||||
assert(stripes[1].write_buf == (uint8_t*)write_buf);
|
assert(stripes[1].write_buf == (uint8_t*)write_buf);
|
||||||
assert(stripes[2+second].write_buf == rmw_buf);
|
assert(stripes[2].write_buf == rmw_buf);
|
||||||
assert(stripes[3-second].write_buf == NULL);
|
assert(stripes[3].write_buf == NULL);
|
||||||
// Test 15.2 - encode
|
// Test 15.2 - encode
|
||||||
set_pattern(write_buf, 4*1024, PATTERN1);
|
set_pattern(write_buf, 4*1024, PATTERN1);
|
||||||
set_pattern(stripes[0].read_buf, 4*1024, PATTERN2);
|
set_pattern(stripes[0].read_buf, 4*1024, PATTERN2);
|
||||||
memset(stripes[0].bmp_buf, 0, bmp);
|
memset(stripes[0].bmp_buf, 0, bmp);
|
||||||
memset(stripes[1].bmp_buf, 0, bmp);
|
memset(stripes[1].bmp_buf, 0, bmp);
|
||||||
memset(stripes[2+second].write_buf, 0, 4096);
|
calc_rmw_parity_ec(stripes, 4, 2, osd_set, write_osd_set, 64*1024, bmp);
|
||||||
calc_rmw_parity_ec(stripes, 4, 2, osd_set, osd_set, 64*1024, bmp);
|
assert(*(uint32_t*)stripes[2].bmp_buf == 0x80);
|
||||||
assert(second || *(uint32_t*)stripes[2].bmp_buf == 0x80);
|
|
||||||
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
|
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
|
||||||
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
|
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
|
||||||
assert(stripes[2+second].write_start == 28*1024 && stripes[2+second].write_end == 32*1024);
|
assert(stripes[2].write_start == 28*1024 && stripes[2].write_end == 32*1024);
|
||||||
assert(stripes[3-second].write_start == 0 && stripes[3-second].write_end == 0);
|
assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
|
||||||
assert(stripes[0].write_buf == NULL);
|
assert(stripes[0].write_buf == NULL);
|
||||||
assert(stripes[1].write_buf == (uint8_t*)write_buf);
|
assert(stripes[1].write_buf == (uint8_t*)write_buf);
|
||||||
assert(stripes[2+second].write_buf == rmw_buf);
|
assert(stripes[2].write_buf == rmw_buf);
|
||||||
assert(stripes[3-second].write_buf == NULL);
|
assert(stripes[3].write_buf == NULL);
|
||||||
// first parity is always xor :), second isn't...
|
check_pattern(stripes[2].write_buf, 4*1024, PATTERN1^PATTERN2); // first parity is always xor :)
|
||||||
check_pattern(stripes[2+second].write_buf, 4*1024, second ? 0xb79a59a0ce8b9b81 : PATTERN1^PATTERN2);
|
|
||||||
// Done
|
// Done
|
||||||
free(rmw_buf);
|
free(rmw_buf);
|
||||||
free(write_buf);
|
free(write_buf);
|
||||||
|
@@ -150,7 +150,6 @@ int connect_osd(const char *osd_address, int osd_port)
|
|||||||
if (connect(connect_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
|
if (connect(connect_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
|
||||||
{
|
{
|
||||||
perror("connect");
|
perror("connect");
|
||||||
close(connect_fd);
|
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
int one = 1;
|
int one = 1;
|
||||||
|
@@ -15,7 +15,7 @@ int read_blocking(int fd, void *read_buf, size_t remaining)
|
|||||||
size_t done = 0;
|
size_t done = 0;
|
||||||
while (done < remaining)
|
while (done < remaining)
|
||||||
{
|
{
|
||||||
ssize_t r = read(fd, read_buf, remaining-done);
|
size_t r = read(fd, read_buf, remaining-done);
|
||||||
if (r <= 0)
|
if (r <= 0)
|
||||||
{
|
{
|
||||||
if (!errno)
|
if (!errno)
|
||||||
@@ -41,7 +41,7 @@ int write_blocking(int fd, void *write_buf, size_t remaining)
|
|||||||
size_t done = 0;
|
size_t done = 0;
|
||||||
while (done < remaining)
|
while (done < remaining)
|
||||||
{
|
{
|
||||||
ssize_t r = write(fd, write_buf, remaining-done);
|
size_t r = write(fd, write_buf, remaining-done);
|
||||||
if (r < 0)
|
if (r < 0)
|
||||||
{
|
{
|
||||||
if (errno != EINTR && errno != EAGAIN && errno != EPIPE)
|
if (errno != EINTR && errno != EAGAIN && errno != EPIPE)
|
||||||
|
@@ -83,7 +83,6 @@ int connect_stub(const char *server_address, int server_port)
|
|||||||
if (connect(connect_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
|
if (connect(connect_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
|
||||||
{
|
{
|
||||||
perror("connect");
|
perror("connect");
|
||||||
close(connect_fd);
|
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
int one = 1;
|
int one = 1;
|
||||||
|
@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
|
|||||||
|
|
||||||
Name: Vitastor
|
Name: Vitastor
|
||||||
Description: Vitastor client library
|
Description: Vitastor client library
|
||||||
Version: 0.8.6
|
Version: 0.8.5
|
||||||
Libs: -L${libdir} -lvitastor_client
|
Libs: -L${libdir} -lvitastor_client
|
||||||
Cflags: -I${includedir}
|
Cflags: -I${includedir}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user