forked from vitalif/vitastor
Compare commits
26 Commits
Author | SHA1 | Date | |
---|---|---|---|
7e958afeda | |||
2f5e769a29 | |||
28d5e53c6c | |||
d9f55f11d8 | |||
3237014608 | |||
baaf8f6f44 | |||
1d83fdcd17 | |||
0ddd787c38 | |||
6eff3a60a5 | |||
888a6975ab | |||
cd1e890bd4 | |||
0fbf4c6a08 | |||
d06ed2b0e7 | |||
3bbc46543d | |||
2fb0c85618 | |||
d81a6c04fc | |||
7b35801647 | |||
f3228d5c07 | |||
18366f5055 | |||
851507c147 | |||
9aaad28488 | |||
dd57d086fe | |||
8810eae8fb | |||
c1365f46c9 | |||
14d6acbcba | |||
1e307069bc |
@@ -1,7 +1,7 @@
|
|||||||
cmake_minimum_required(VERSION 2.8)
|
cmake_minimum_required(VERSION 2.8.12)
|
||||||
|
|
||||||
project(vitastor)
|
project(vitastor)
|
||||||
|
|
||||||
set(VERSION "0.8.5")
|
set(VERSION "0.8.7")
|
||||||
|
|
||||||
add_subdirectory(src)
|
add_subdirectory(src)
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
VERSION ?= v0.8.5
|
VERSION ?= v0.8.7
|
||||||
|
|
||||||
all: build push
|
all: build push
|
||||||
|
|
||||||
|
@@ -49,7 +49,7 @@ spec:
|
|||||||
capabilities:
|
capabilities:
|
||||||
add: ["SYS_ADMIN"]
|
add: ["SYS_ADMIN"]
|
||||||
allowPrivilegeEscalation: true
|
allowPrivilegeEscalation: true
|
||||||
image: vitalif/vitastor-csi:v0.8.5
|
image: vitalif/vitastor-csi:v0.8.7
|
||||||
args:
|
args:
|
||||||
- "--node=$(NODE_ID)"
|
- "--node=$(NODE_ID)"
|
||||||
- "--endpoint=$(CSI_ENDPOINT)"
|
- "--endpoint=$(CSI_ENDPOINT)"
|
||||||
|
@@ -116,7 +116,7 @@ spec:
|
|||||||
privileged: true
|
privileged: true
|
||||||
capabilities:
|
capabilities:
|
||||||
add: ["SYS_ADMIN"]
|
add: ["SYS_ADMIN"]
|
||||||
image: vitalif/vitastor-csi:v0.8.5
|
image: vitalif/vitastor-csi:v0.8.7
|
||||||
args:
|
args:
|
||||||
- "--node=$(NODE_ID)"
|
- "--node=$(NODE_ID)"
|
||||||
- "--endpoint=$(CSI_ENDPOINT)"
|
- "--endpoint=$(CSI_ENDPOINT)"
|
||||||
|
@@ -5,7 +5,7 @@ package vitastor
|
|||||||
|
|
||||||
const (
|
const (
|
||||||
vitastorCSIDriverName = "csi.vitastor.io"
|
vitastorCSIDriverName = "csi.vitastor.io"
|
||||||
vitastorCSIDriverVersion = "0.8.5"
|
vitastorCSIDriverVersion = "0.8.7"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Config struct fills the parameters of request or user input
|
// Config struct fills the parameters of request or user input
|
||||||
|
@@ -6,11 +6,11 @@ package vitastor
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
"strings"
|
"strings"
|
||||||
"bytes"
|
"bytes"
|
||||||
"strconv"
|
"strconv"
|
||||||
"time"
|
"time"
|
||||||
"fmt"
|
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
@@ -21,8 +21,6 @@ import (
|
|||||||
"google.golang.org/grpc/codes"
|
"google.golang.org/grpc/codes"
|
||||||
"google.golang.org/grpc/status"
|
"google.golang.org/grpc/status"
|
||||||
|
|
||||||
"go.etcd.io/etcd/clientv3"
|
|
||||||
|
|
||||||
"github.com/container-storage-interface/spec/lib/go/csi"
|
"github.com/container-storage-interface/spec/lib/go/csi"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -114,6 +112,34 @@ func GetConnectionParams(params map[string]string) (map[string]string, []string,
|
|||||||
return ctxVars, etcdUrl, etcdPrefix
|
return ctxVars, etcdUrl, etcdPrefix
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// invokeCLI runs /usr/bin/vitastor-cli with the given arguments and returns
// whatever the command wrote to its standard output.
//
// Connection options are appended after the caller-supplied args:
// --etcd_address, --etcd_prefix and --config_path are added when the
// "etcdUrl", "etcdPrefix" and "configPath" entries of ctxVars are non-empty.
//
// If running the command returns an error, the failure is logged through klog
// and a gRPC status error with code Internal is returned; its message contains
// the captured stderr text followed by the error string. In that case the
// returned byte slice is nil.
func invokeCLI(ctxVars map[string]string, args []string) ([]byte, error)
|
||||||
|
{
|
||||||
|
if (ctxVars["etcdUrl"] != "")
|
||||||
|
{
|
||||||
|
args = append(args, "--etcd_address", ctxVars["etcdUrl"])
|
||||||
|
}
|
||||||
|
if (ctxVars["etcdPrefix"] != "")
|
||||||
|
{
|
||||||
|
args = append(args, "--etcd_prefix", ctxVars["etcdPrefix"])
|
||||||
|
}
|
||||||
|
if (ctxVars["configPath"] != "")
|
||||||
|
{
|
||||||
|
args = append(args, "--config_path", ctxVars["configPath"])
|
||||||
|
}
|
||||||
|
// Run the CLI, collecting stdout and stderr into separate in-memory buffers
c := exec.Command("/usr/bin/vitastor-cli", args...)
|
||||||
|
var stdout, stderr bytes.Buffer
|
||||||
|
c.Stdout = &stdout
|
||||||
|
c.Stderr = &stderr
|
||||||
|
err := c.Run()
|
||||||
|
stderrStr := string(stderr.Bytes())
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
// Log the full command line, then hand stderr back to the caller inside the gRPC error
klog.Errorf("vitastor-cli %s failed: %s, status %s\n", strings.Join(args, " "), stderrStr, err)
|
||||||
|
return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
|
||||||
|
}
|
||||||
|
return stdout.Bytes(), nil
|
||||||
|
}
|
||||||
|
|
||||||
// Create the volume
|
// Create the volume
|
||||||
func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) (*csi.CreateVolumeResponse, error)
|
func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) (*csi.CreateVolumeResponse, error)
|
||||||
{
|
{
|
||||||
@@ -146,128 +172,41 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
|
|||||||
volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
|
volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
|
||||||
}
|
}
|
||||||
|
|
||||||
// FIXME: The following should PROBABLY be implemented externally in a management tool
|
ctxVars, etcdUrl, _ := GetConnectionParams(req.Parameters)
|
||||||
|
|
||||||
ctxVars, etcdUrl, etcdPrefix := GetConnectionParams(req.Parameters)
|
|
||||||
if (len(etcdUrl) == 0)
|
if (len(etcdUrl) == 0)
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
|
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Connect to etcd
|
// Create image using vitastor-cli
|
||||||
cli, err := clientv3.New(clientv3.Config{
|
_, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) })
|
||||||
DialTimeout: ETCD_TIMEOUT,
|
|
||||||
Endpoints: etcdUrl,
|
|
||||||
})
|
|
||||||
if (err != nil)
|
if (err != nil)
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error())
|
if (strings.Index(err.Error(), "already exists") > 0)
|
||||||
}
|
|
||||||
defer cli.Close()
|
|
||||||
|
|
||||||
var imageId uint64 = 0
|
|
||||||
for
|
|
||||||
{
|
|
||||||
// Check if the image exists
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
|
||||||
resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
|
|
||||||
cancel()
|
|
||||||
if (err != nil)
|
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", volName })
|
||||||
}
|
|
||||||
if (len(resp.Kvs) > 0)
|
|
||||||
{
|
|
||||||
kv := resp.Kvs[0]
|
|
||||||
var v InodeIndex
|
|
||||||
err := json.Unmarshal(kv.Value, &v)
|
|
||||||
if (err != nil)
|
if (err != nil)
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
|
return nil, err
|
||||||
}
|
}
|
||||||
poolId = v.PoolId
|
var inodeCfg []InodeConfig
|
||||||
imageId = v.Id
|
err = json.Unmarshal(stat, &inodeCfg)
|
||||||
inodeCfgKey := fmt.Sprintf("/config/inode/%d/%d", poolId, imageId)
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
|
||||||
resp, err := cli.Get(ctx, etcdPrefix+inodeCfgKey)
|
|
||||||
cancel()
|
|
||||||
if (err != nil)
|
if (err != nil)
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
|
||||||
}
|
}
|
||||||
if (len(resp.Kvs) == 0)
|
if (len(inodeCfg) == 0)
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.Internal, "missing "+inodeCfgKey+" key in etcd")
|
return nil, status.Error(codes.Internal, "vitastor-cli create said that image already exists, but ls can't find it")
|
||||||
}
|
}
|
||||||
var inodeCfg InodeConfig
|
if (inodeCfg[0].Size < uint64(volSize))
|
||||||
err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
|
|
||||||
if (err != nil)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
|
|
||||||
}
|
|
||||||
if (inodeCfg.Size < uint64(volSize))
|
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
|
return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Find a free ID
|
return nil, err
|
||||||
// Create image metadata in a transaction verifying that the image doesn't exist yet AND ID is still free
|
|
||||||
maxIdKey := fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
|
||||||
resp, err := cli.Get(ctx, maxIdKey)
|
|
||||||
cancel()
|
|
||||||
if (err != nil)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
|
||||||
}
|
|
||||||
var modRev int64
|
|
||||||
var nextId uint64
|
|
||||||
if (len(resp.Kvs) > 0)
|
|
||||||
{
|
|
||||||
var err error
|
|
||||||
nextId, err = strconv.ParseUint(string(resp.Kvs[0].Value), 10, 64)
|
|
||||||
if (err != nil)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.Internal, maxIdKey+" contains invalid ID")
|
|
||||||
}
|
|
||||||
modRev = resp.Kvs[0].ModRevision
|
|
||||||
nextId++
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
nextId = 1
|
|
||||||
}
|
|
||||||
inodeIdxJson, _ := json.Marshal(InodeIndex{
|
|
||||||
Id: nextId,
|
|
||||||
PoolId: poolId,
|
|
||||||
})
|
|
||||||
inodeCfgJson, _ := json.Marshal(InodeConfig{
|
|
||||||
Name: volName,
|
|
||||||
Size: uint64(volSize),
|
|
||||||
})
|
|
||||||
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
|
||||||
txnResp, err := cli.Txn(ctx).If(
|
|
||||||
clientv3.Compare(clientv3.ModRevision(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)), "=", modRev),
|
|
||||||
clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)), "=", 0),
|
|
||||||
clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId)), "=", 0),
|
|
||||||
).Then(
|
|
||||||
clientv3.OpPut(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId), fmt.Sprintf("%d", nextId)),
|
|
||||||
clientv3.OpPut(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName), string(inodeIdxJson)),
|
|
||||||
clientv3.OpPut(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId), string(inodeCfgJson)),
|
|
||||||
).Commit()
|
|
||||||
cancel()
|
|
||||||
if (err != nil)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.Internal, "failed to commit transaction in etcd: "+err.Error())
|
|
||||||
}
|
|
||||||
if (txnResp.Succeeded)
|
|
||||||
{
|
|
||||||
imageId = nextId
|
|
||||||
break
|
|
||||||
}
|
|
||||||
// Start over if the transaction fails
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -299,97 +238,12 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
|
|||||||
}
|
}
|
||||||
volName := ctxVars["name"]
|
volName := ctxVars["name"]
|
||||||
|
|
||||||
_, etcdUrl, etcdPrefix := GetConnectionParams(ctxVars)
|
ctxVars, _, _ = GetConnectionParams(ctxVars)
|
||||||
if (len(etcdUrl) == 0)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
|
|
||||||
}
|
|
||||||
|
|
||||||
cli, err := clientv3.New(clientv3.Config{
|
_, err = invokeCLI(ctxVars, []string{ "rm", volName })
|
||||||
DialTimeout: ETCD_TIMEOUT,
|
|
||||||
Endpoints: etcdUrl,
|
|
||||||
})
|
|
||||||
if (err != nil)
|
if (err != nil)
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error())
|
return nil, err
|
||||||
}
|
|
||||||
defer cli.Close()
|
|
||||||
|
|
||||||
// Find inode by name
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
|
||||||
resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
|
|
||||||
cancel()
|
|
||||||
if (err != nil)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
|
||||||
}
|
|
||||||
if (len(resp.Kvs) == 0)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
|
|
||||||
}
|
|
||||||
var idx InodeIndex
|
|
||||||
err = json.Unmarshal(resp.Kvs[0].Value, &idx)
|
|
||||||
if (err != nil)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get inode config
|
|
||||||
inodeCfgKey := fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)
|
|
||||||
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
|
||||||
resp, err = cli.Get(ctx, inodeCfgKey)
|
|
||||||
cancel()
|
|
||||||
if (err != nil)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
|
||||||
}
|
|
||||||
if (len(resp.Kvs) == 0)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
|
|
||||||
}
|
|
||||||
var inodeCfg InodeConfig
|
|
||||||
err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
|
|
||||||
if (err != nil)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
|
|
||||||
}
|
|
||||||
|
|
||||||
// Delete inode data by invoking vitastor-cli
|
|
||||||
args := []string{
|
|
||||||
"rm-data", "--etcd_address", strings.Join(etcdUrl, ","),
|
|
||||||
"--pool", fmt.Sprintf("%d", idx.PoolId),
|
|
||||||
"--inode", fmt.Sprintf("%d", idx.Id),
|
|
||||||
}
|
|
||||||
if (ctxVars["configPath"] != "")
|
|
||||||
{
|
|
||||||
args = append(args, "--config_path", ctxVars["configPath"])
|
|
||||||
}
|
|
||||||
c := exec.Command("/usr/bin/vitastor-cli", args...)
|
|
||||||
var stderr bytes.Buffer
|
|
||||||
c.Stdout = nil
|
|
||||||
c.Stderr = &stderr
|
|
||||||
err = c.Run()
|
|
||||||
stderrStr := string(stderr.Bytes())
|
|
||||||
if (err != nil)
|
|
||||||
{
|
|
||||||
klog.Errorf("vitastor-cli rm-data failed: %s, status %s\n", stderrStr, err)
|
|
||||||
return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
|
|
||||||
}
|
|
||||||
|
|
||||||
// Delete inode config in etcd
|
|
||||||
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
|
||||||
txnResp, err := cli.Txn(ctx).Then(
|
|
||||||
clientv3.OpDelete(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)),
|
|
||||||
clientv3.OpDelete(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)),
|
|
||||||
).Commit()
|
|
||||||
cancel()
|
|
||||||
if (err != nil)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.Internal, "failed to delete keys in etcd: "+err.Error())
|
|
||||||
}
|
|
||||||
if (!txnResp.Succeeded)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.Internal, "failed to delete keys in etcd: transaction failed")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return &csi.DeleteVolumeResponse{}, nil
|
return &csi.DeleteVolumeResponse{}, nil
|
||||||
|
4
debian/changelog
vendored
4
debian/changelog
vendored
@@ -1,10 +1,10 @@
|
|||||||
vitastor (0.8.5-1) unstable; urgency=medium
|
vitastor (0.8.7-1) unstable; urgency=medium
|
||||||
|
|
||||||
* Bugfixes
|
* Bugfixes
|
||||||
|
|
||||||
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
|
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
|
||||||
|
|
||||||
vitastor (0.8.5-1) unstable; urgency=medium
|
vitastor (0.8.7-1) unstable; urgency=medium
|
||||||
|
|
||||||
* Implement NFS proxy
|
* Implement NFS proxy
|
||||||
* Add documentation
|
* Add documentation
|
||||||
|
8
debian/vitastor.Dockerfile
vendored
8
debian/vitastor.Dockerfile
vendored
@@ -34,8 +34,8 @@ RUN set -e -x; \
|
|||||||
mkdir -p /root/packages/vitastor-$REL; \
|
mkdir -p /root/packages/vitastor-$REL; \
|
||||||
rm -rf /root/packages/vitastor-$REL/*; \
|
rm -rf /root/packages/vitastor-$REL/*; \
|
||||||
cd /root/packages/vitastor-$REL; \
|
cd /root/packages/vitastor-$REL; \
|
||||||
cp -r /root/vitastor vitastor-0.8.5; \
|
cp -r /root/vitastor vitastor-0.8.7; \
|
||||||
cd vitastor-0.8.5; \
|
cd vitastor-0.8.7; \
|
||||||
ln -s /root/fio-build/fio-*/ ./fio; \
|
ln -s /root/fio-build/fio-*/ ./fio; \
|
||||||
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||||
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
||||||
@@ -48,8 +48,8 @@ RUN set -e -x; \
|
|||||||
rm -rf a b; \
|
rm -rf a b; \
|
||||||
echo "dep:fio=$FIO" > debian/fio_version; \
|
echo "dep:fio=$FIO" > debian/fio_version; \
|
||||||
cd /root/packages/vitastor-$REL; \
|
cd /root/packages/vitastor-$REL; \
|
||||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.5.orig.tar.xz vitastor-0.8.5; \
|
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.7.orig.tar.xz vitastor-0.8.7; \
|
||||||
cd vitastor-0.8.5; \
|
cd vitastor-0.8.7; \
|
||||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
||||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||||
|
@@ -19,6 +19,7 @@ between clients, OSDs and etcd.
|
|||||||
- [rdma_max_sge](#rdma_max_sge)
|
- [rdma_max_sge](#rdma_max_sge)
|
||||||
- [rdma_max_msg](#rdma_max_msg)
|
- [rdma_max_msg](#rdma_max_msg)
|
||||||
- [rdma_max_recv](#rdma_max_recv)
|
- [rdma_max_recv](#rdma_max_recv)
|
||||||
|
- [rdma_max_send](#rdma_max_send)
|
||||||
- [peer_connect_interval](#peer_connect_interval)
|
- [peer_connect_interval](#peer_connect_interval)
|
||||||
- [peer_connect_timeout](#peer_connect_timeout)
|
- [peer_connect_timeout](#peer_connect_timeout)
|
||||||
- [osd_idle_timeout](#osd_idle_timeout)
|
- [osd_idle_timeout](#osd_idle_timeout)
|
||||||
@@ -74,6 +75,12 @@ to work. For example, Mellanox ConnectX-3 and older adapters don't have
|
|||||||
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
|
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
|
||||||
root to list available RDMA devices and their features.
|
root to list available RDMA devices and their features.
|
||||||
|
|
||||||
|
Remember that you also have to configure your network switches if you use
|
||||||
|
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
|
||||||
|
the manual of your network vendor for details about setting up the switch
|
||||||
|
for RoCEv2 correctly. Usually it means setting up Lossless Ethernet with
|
||||||
|
PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
|
||||||
|
|
||||||
## rdma_port_num
|
## rdma_port_num
|
||||||
|
|
||||||
- Type: integer
|
- Type: integer
|
||||||
@@ -116,20 +123,30 @@ required to change this parameter.
|
|||||||
## rdma_max_msg
|
## rdma_max_msg
|
||||||
|
|
||||||
- Type: integer
|
- Type: integer
|
||||||
- Default: 1048576
|
- Default: 132096
|
||||||
|
|
||||||
Maximum size of a single RDMA send or receive operation in bytes.
|
Maximum size of a single RDMA send or receive operation in bytes.
|
||||||
|
|
||||||
## rdma_max_recv
|
## rdma_max_recv
|
||||||
|
|
||||||
|
- Type: integer
|
||||||
|
- Default: 16
|
||||||
|
|
||||||
|
Maximum number of RDMA receive buffers per connection (RDMA requires
|
||||||
|
preallocated buffers to receive data). Each buffer is `rdma_max_msg` bytes
|
||||||
|
in size. So this setting directly affects memory usage: a single Vitastor
|
||||||
|
RDMA client uses `rdma_max_recv * rdma_max_msg * OSD_COUNT` bytes of memory.
|
||||||
|
Default is roughly 2 MB * number of OSDs.
|
||||||
|
|
||||||
|
## rdma_max_send
|
||||||
|
|
||||||
- Type: integer
|
- Type: integer
|
||||||
- Default: 8
|
- Default: 8
|
||||||
|
|
||||||
Maximum number of parallel RDMA receive operations. Note that this number
|
Maximum number of outstanding RDMA send operations per connection. Should be
|
||||||
of receive buffers `rdma_max_msg` in size are allocated for each client,
|
less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
|
||||||
so this setting actually affects memory usage. This is because RDMA receive
|
Doesn't affect memory usage - additional memory isn't allocated for send
|
||||||
operations are (sadly) still not zero-copy in Vitastor. It may be fixed in
|
operations.
|
||||||
later versions.
|
|
||||||
|
|
||||||
## peer_connect_interval
|
## peer_connect_interval
|
||||||
|
|
||||||
|
@@ -19,6 +19,7 @@
|
|||||||
- [rdma_max_sge](#rdma_max_sge)
|
- [rdma_max_sge](#rdma_max_sge)
|
||||||
- [rdma_max_msg](#rdma_max_msg)
|
- [rdma_max_msg](#rdma_max_msg)
|
||||||
- [rdma_max_recv](#rdma_max_recv)
|
- [rdma_max_recv](#rdma_max_recv)
|
||||||
|
- [rdma_max_send](#rdma_max_send)
|
||||||
- [peer_connect_interval](#peer_connect_interval)
|
- [peer_connect_interval](#peer_connect_interval)
|
||||||
- [peer_connect_timeout](#peer_connect_timeout)
|
- [peer_connect_timeout](#peer_connect_timeout)
|
||||||
- [osd_idle_timeout](#osd_idle_timeout)
|
- [osd_idle_timeout](#osd_idle_timeout)
|
||||||
@@ -78,6 +79,13 @@ Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Наприме
|
|||||||
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
|
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
|
||||||
параметры и возможности.
|
параметры и возможности.
|
||||||
|
|
||||||
|
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
|
||||||
|
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
|
||||||
|
нестабильной производительностью. Подробную информацию о настройке
|
||||||
|
коммутатора для RoCEv2 ищите в документации производителя. Обычно это
|
||||||
|
подразумевает настройку сети без потерь на основе PFC (Priority Flow
|
||||||
|
Control) и ECN (Explicit Congestion Notification).
|
||||||
|
|
||||||
## rdma_port_num
|
## rdma_port_num
|
||||||
|
|
||||||
- Тип: целое число
|
- Тип: целое число
|
||||||
@@ -121,22 +129,32 @@ OSD в любом случае согласовывают реальное зн
|
|||||||
## rdma_max_msg
|
## rdma_max_msg
|
||||||
|
|
||||||
- Тип: целое число
|
- Тип: целое число
|
||||||
- Значение по умолчанию: 1048576
|
- Значение по умолчанию: 132096
|
||||||
|
|
||||||
Максимальный размер одной RDMA-операции отправки или приёма.
|
Максимальный размер одной RDMA-операции отправки или приёма.
|
||||||
|
|
||||||
## rdma_max_recv
|
## rdma_max_recv
|
||||||
|
|
||||||
|
- Тип: целое число
|
||||||
|
- Значение по умолчанию: 16
|
||||||
|
|
||||||
|
Максимальное число буферов для RDMA-приёма данных на одно соединение
|
||||||
|
(RDMA требует заранее выделенных буферов для приёма данных). Каждый буфер
|
||||||
|
имеет размер `rdma_max_msg` байт. Таким образом, настройка прямо влияет на
|
||||||
|
потребление памяти - один Vitastor-клиент с RDMA использует
|
||||||
|
`rdma_max_recv * rdma_max_msg * ЧИСЛО_OSD` байт памяти, по умолчанию -
|
||||||
|
примерно 2 МБ * число OSD.
|
||||||
|
|
||||||
|
## rdma_max_send
|
||||||
|
|
||||||
- Тип: целое число
|
- Тип: целое число
|
||||||
- Значение по умолчанию: 8
|
- Значение по умолчанию: 8
|
||||||
|
|
||||||
Максимальное число параллельных RDMA-операций получения данных. Следует
|
Максимальное число RDMA-операций отправки, отправляемых в очередь одного
|
||||||
иметь в виду, что данное число буферов размером `rdma_max_msg` выделяется
|
соединения. Желательно, чтобы оно было меньше `rdma_max_recv`, чтобы
|
||||||
для каждого подключённого клиентского соединения, так что данная настройка
|
у принимающей стороны в процессе работы не заканчивались буферы на приём.
|
||||||
влияет на потребление памяти. Это так потому, что RDMA-приём данных в
|
Не влияет на потребление памяти - дополнительная память на операции отправки
|
||||||
Vitastor, увы, всё равно не является zero-copy, т.е. всё равно 1 раз
|
не выделяется.
|
||||||
копирует данные в памяти. Данная особенность, возможно, будет исправлена в
|
|
||||||
более новых версиях Vitastor.
|
|
||||||
|
|
||||||
## peer_connect_interval
|
## peer_connect_interval
|
||||||
|
|
||||||
|
@@ -53,6 +53,12 @@
|
|||||||
to work. For example, Mellanox ConnectX-3 and older adapters don't have
|
to work. For example, Mellanox ConnectX-3 and older adapters don't have
|
||||||
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
|
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
|
||||||
root to list available RDMA devices and their features.
|
root to list available RDMA devices and their features.
|
||||||
|
|
||||||
|
Remember that you also have to configure your network switches if you use
|
||||||
|
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
|
||||||
|
the manual of your network vendor for details about setting up the switch
|
||||||
|
for RoCEv2 correctly. Usually it means setting up Lossless Ethernet with
|
||||||
|
PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
|
||||||
info_ru: |
|
info_ru: |
|
||||||
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
|
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
|
||||||
Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
|
Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
|
||||||
@@ -61,6 +67,13 @@
|
|||||||
потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
|
потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
|
||||||
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
|
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
|
||||||
параметры и возможности.
|
параметры и возможности.
|
||||||
|
|
||||||
|
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
|
||||||
|
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
|
||||||
|
нестабильной производительностью. Подробную информацию о настройке
|
||||||
|
коммутатора для RoCEv2 ищите в документации производителя. Обычно это
|
||||||
|
подразумевает настройку сети без потерь на основе PFC (Priority Flow
|
||||||
|
Control) и ECN (Explicit Congestion Notification).
|
||||||
- name: rdma_port_num
|
- name: rdma_port_num
|
||||||
type: int
|
type: int
|
||||||
default: 1
|
default: 1
|
||||||
@@ -114,26 +127,39 @@
|
|||||||
так что менять этот параметр обычно не нужно.
|
так что менять этот параметр обычно не нужно.
|
||||||
- name: rdma_max_msg
|
- name: rdma_max_msg
|
||||||
type: int
|
type: int
|
||||||
default: 1048576
|
default: 132096
|
||||||
info: Maximum size of a single RDMA send or receive operation in bytes.
|
info: Maximum size of a single RDMA send or receive operation in bytes.
|
||||||
info_ru: Максимальный размер одной RDMA-операции отправки или приёма.
|
info_ru: Максимальный размер одной RDMA-операции отправки или приёма.
|
||||||
- name: rdma_max_recv
|
- name: rdma_max_recv
|
||||||
|
type: int
|
||||||
|
default: 16
|
||||||
|
info: |
|
||||||
|
Maximum number of RDMA receive buffers per connection (RDMA requires
|
||||||
|
preallocated buffers to receive data). Each buffer is `rdma_max_msg` bytes
|
||||||
|
in size. So this setting directly affects memory usage: a single Vitastor
|
||||||
|
RDMA client uses `rdma_max_recv * rdma_max_msg * OSD_COUNT` bytes of memory.
|
||||||
|
Default is roughly 2 MB * number of OSDs.
|
||||||
|
info_ru: |
|
||||||
|
Максимальное число буферов для RDMA-приёма данных на одно соединение
|
||||||
|
(RDMA требует заранее выделенных буферов для приёма данных). Каждый буфер
|
||||||
|
имеет размер `rdma_max_msg` байт. Таким образом, настройка прямо влияет на
|
||||||
|
потребление памяти - один Vitastor-клиент с RDMA использует
|
||||||
|
`rdma_max_recv * rdma_max_msg * ЧИСЛО_OSD` байт памяти, по умолчанию -
|
||||||
|
примерно 2 МБ * число OSD.
|
||||||
|
- name: rdma_max_send
|
||||||
type: int
|
type: int
|
||||||
default: 8
|
default: 8
|
||||||
info: |
|
info: |
|
||||||
Maximum number of parallel RDMA receive operations. Note that this number
|
Maximum number of outstanding RDMA send operations per connection. Should be
|
||||||
of receive buffers `rdma_max_msg` in size are allocated for each client,
|
less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
|
||||||
so this setting actually affects memory usage. This is because RDMA receive
|
Doesn't affect memory usage - additional memory isn't allocated for send
|
||||||
operations are (sadly) still not zero-copy in Vitastor. It may be fixed in
|
operations.
|
||||||
later versions.
|
|
||||||
info_ru: |
|
info_ru: |
|
||||||
Максимальное число параллельных RDMA-операций получения данных. Следует
|
Максимальное число RDMA-операций отправки, отправляемых в очередь одного
|
||||||
иметь в виду, что данное число буферов размером `rdma_max_msg` выделяется
|
соединения. Желательно, чтобы оно было меньше `rdma_max_recv`, чтобы
|
||||||
для каждого подключённого клиентского соединения, так что данная настройка
|
у принимающей стороны в процессе работы не заканчивались буферы на приём.
|
||||||
влияет на потребление памяти. Это так потому, что RDMA-приём данных в
|
Не влияет на потребление памяти - дополнительная память на операции отправки
|
||||||
Vitastor, увы, всё равно не является zero-copy, т.е. всё равно 1 раз
|
не выделяется.
|
||||||
копирует данные в памяти. Данная особенность, возможно, будет исправлена в
|
|
||||||
более новых версиях Vitastor.
|
|
||||||
- name: peer_connect_interval
|
- name: peer_connect_interval
|
||||||
type: sec
|
type: sec
|
||||||
min: 1
|
min: 1
|
||||||
|
@@ -35,15 +35,24 @@ Write amplification for 4 KB blocks is usually 3-5 in Vitastor:
|
|||||||
If you manage to get an SSD which handles 512 byte blocks well (Optane?) you may
|
If you manage to get an SSD which handles 512 byte blocks well (Optane?) you may
|
||||||
lower 1, 3 and 4 to 512 bytes (1/8 of data size) and get WA as low as 2.375.
|
lower 1, 3 and 4 to 512 bytes (1/8 of data size) and get WA as low as 2.375.
|
||||||
|
|
||||||
|
Implementing NVDIMM support could basically eliminate WA altogether - all extra writes would
|
||||||
|
go to DRAM memory. But this requires a test cluster with NVDIMM - please contact me
|
||||||
|
if you want to provide me with such a cluster for tests.
|
||||||
|
|
||||||
Lazy fsync also reduces WA for parallel workloads because journal blocks are only
|
Lazy fsync also reduces WA for parallel workloads because journal blocks are only
|
||||||
written when they fill up or fsync is requested.
|
written when they fill up or fsync is requested.
|
||||||
|
|
||||||
## In Practice
|
## In Practice
|
||||||
|
|
||||||
In practice, using tests from [Understanding Performance](understanding.en.md)
|
In practice, using tests from [Understanding Performance](understanding.en.md), a decent TCP network,
|
||||||
and good server-grade SSD/NVMe drives, you should head for:
|
good server-grade SSD/NVMe drives and disabled CPU power saving, you should aim for:
|
||||||
- At least 5000 T1Q1 replicated read and write iops (maximum 0.2ms latency)
|
- At least 5000 T1Q1 replicated read and write iops (maximum 0.2ms latency)
|
||||||
|
- At least 5000 T1Q1 EC read IOPS and at least 2200 EC write IOPS (maximum 0.45ms latency)
|
||||||
- At least ~80k parallel read iops or ~30k write iops per 1 core (1 OSD)
|
- At least ~80k parallel read iops or ~30k write iops per 1 core (1 OSD)
|
||||||
- Disk-speed or wire-speed linear reads and writes, whichever is the bottleneck in your case
|
- Disk-speed or wire-speed linear reads and writes, whichever is the bottleneck in your case
|
||||||
|
|
||||||
Lower results may mean that you have bad drives, bad network or some kind of misconfiguration.
|
Lower results may mean that you have bad drives, bad network or some kind of misconfiguration.
|
||||||
|
|
||||||
|
Current latency records:
|
||||||
|
- 9668 T1Q1 replicated write iops (0.103 ms latency) with TCP and NVMe
|
||||||
|
- 9143 T1Q1 replicated read iops (0.109 ms latency) with TCP and NVMe
|
||||||
|
@@ -36,6 +36,25 @@ WA (мультипликатор записи) для 4 КБ блоков в Vit
|
|||||||
Если вы найдёте SSD, хорошо работающий с 512-байтными блоками данных (Optane?),
|
Если вы найдёте SSD, хорошо работающий с 512-байтными блоками данных (Optane?),
|
||||||
то 1, 3 и 4 можно снизить до 512 байт (1/8 от размера данных) и получить WA всего 2.375.
|
то 1, 3 и 4 можно снизить до 512 байт (1/8 от размера данных) и получить WA всего 2.375.
|
||||||
|
|
||||||
|
Если реализовать поддержку NVDIMM, то WA можно, условно говоря, ликвидировать вообще - все
|
||||||
|
дополнительные операции записи смогут обслуживаться DRAM памятью. Но для этого необходим
|
||||||
|
тестовый кластер с NVDIMM - пишите, если готовы предоставить такой для тестов.
|
||||||
|
|
||||||
Кроме того, WA снижается при использовании отложенного/ленивого сброса при параллельной
|
Кроме того, WA снижается при использовании отложенного/ленивого сброса при параллельной
|
||||||
нагрузке, т.к. блоки журнала записываются на диск только когда они заполняются или явным
|
нагрузке, т.к. блоки журнала записываются на диск только когда они заполняются или явным
|
||||||
образом запрашивается fsync.
|
образом запрашивается fsync.
|
||||||
|
|
||||||
|
## На практике
|
||||||
|
|
||||||
|
На практике, используя тесты fio со страницы [Понимание сути производительности систем хранения](understanding.ru.md),
|
||||||
|
нормальную TCP-сеть, хорошие серверные SSD/NVMe, при отключённом энергосбережении процессоров вы можете рассчитывать на:
|
||||||
|
- От 5000 IOPS в 1 поток (T1Q1) и на чтение, и на запись при использовании репликации (задержка до 0.2мс)
|
||||||
|
- От 5000 IOPS в 1 поток (T1Q1) на чтение и 2200 IOPS в 1 поток на запись при использовании EC (задержка до 0.45мс)
|
||||||
|
- От 80000 IOPS на чтение в параллельном режиме на 1 ядро, от 30000 IOPS на запись на 1 ядро (на 1 OSD)
|
||||||
|
- Скорость параллельного линейного чтения и записи, равная меньшему значению из скорости дисков или сети
|
||||||
|
|
||||||
|
Худшие результаты означают, что у вас либо медленные диски, либо медленная сеть, либо что-то неправильно настроено.
|
||||||
|
|
||||||
|
Зафиксированные на данный момент рекорды задержки:
|
||||||
|
- 9668 IOPS (0.103 мс задержка) в 1 поток (T1Q1) на запись с TCP и NVMe при использовании репликации
|
||||||
|
- 9143 IOPS (0.109 мс задержка) в 1 поток (T1Q1) на чтение с TCP и NVMe при использовании репликации
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
[Documentation](../../README.md#documentation) → Usage → Disk Tool
|
[Documentation](../../README.md#documentation) → Usage → Disk management tool
|
||||||
|
|
||||||
-----
|
-----
|
||||||
|
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
[Документация](../../README-ru.md#документация) → Использование → Управление дисками
|
[Документация](../../README-ru.md#документация) → Использование → Инструмент управления дисками
|
||||||
|
|
||||||
-----
|
-----
|
||||||
|
|
||||||
|
24
mon/mon.js
24
mon/mon.js
@@ -51,8 +51,9 @@ const etcd_tree = {
|
|||||||
// THIS IS JUST A POOR MAN'S CONFIG DOCUMENTATION
|
// THIS IS JUST A POOR MAN'S CONFIG DOCUMENTATION
|
||||||
// etcd connection
|
// etcd connection
|
||||||
config_path: "/etc/vitastor/vitastor.conf",
|
config_path: "/etc/vitastor/vitastor.conf",
|
||||||
etcd_address: "10.0.115.10:2379/v3",
|
|
||||||
etcd_prefix: "/vitastor",
|
etcd_prefix: "/vitastor",
|
||||||
|
// etcd connection - configurable online
|
||||||
|
etcd_address: "10.0.115.10:2379/v3",
|
||||||
// mon
|
// mon
|
||||||
etcd_mon_ttl: 30, // min: 10
|
etcd_mon_ttl: 30, // min: 10
|
||||||
etcd_mon_timeout: 1000, // ms. min: 0
|
etcd_mon_timeout: 1000, // ms. min: 0
|
||||||
@@ -70,14 +71,15 @@ const etcd_tree = {
|
|||||||
rdma_gid_index: 0,
|
rdma_gid_index: 0,
|
||||||
rdma_mtu: 4096,
|
rdma_mtu: 4096,
|
||||||
rdma_max_sge: 128,
|
rdma_max_sge: 128,
|
||||||
rdma_max_send: 64,
|
rdma_max_send: 8,
|
||||||
rdma_max_recv: 128,
|
rdma_max_recv: 16,
|
||||||
rdma_max_msg: 132096,
|
rdma_max_msg: 132096,
|
||||||
log_level: 0,
|
|
||||||
block_size: 131072,
|
block_size: 131072,
|
||||||
disk_alignment: 4096,
|
disk_alignment: 4096,
|
||||||
bitmap_granularity: 4096,
|
bitmap_granularity: 4096,
|
||||||
immediate_commit: false, // 'all' or 'small'
|
immediate_commit: false, // 'all' or 'small'
|
||||||
|
// client and osd - configurable online
|
||||||
|
log_level: 0,
|
||||||
client_dirty_limit: 33554432,
|
client_dirty_limit: 33554432,
|
||||||
peer_connect_interval: 5, // seconds. min: 1
|
peer_connect_interval: 5, // seconds. min: 1
|
||||||
peer_connect_timeout: 5, // seconds. min: 1
|
peer_connect_timeout: 5, // seconds. min: 1
|
||||||
@@ -95,18 +97,19 @@ const etcd_tree = {
|
|||||||
osd_network: null, // "192.168.7.0/24" or an array of masks
|
osd_network: null, // "192.168.7.0/24" or an array of masks
|
||||||
bind_address: "0.0.0.0",
|
bind_address: "0.0.0.0",
|
||||||
bind_port: 0,
|
bind_port: 0,
|
||||||
|
readonly: false,
|
||||||
|
osd_memlock: false,
|
||||||
|
// osd - configurable online
|
||||||
autosync_interval: 5,
|
autosync_interval: 5,
|
||||||
autosync_writes: 128,
|
autosync_writes: 128,
|
||||||
client_queue_depth: 128, // unused
|
client_queue_depth: 128, // unused
|
||||||
recovery_queue_depth: 4,
|
recovery_queue_depth: 4,
|
||||||
recovery_sync_batch: 16,
|
recovery_sync_batch: 16,
|
||||||
readonly: false,
|
|
||||||
no_recovery: false,
|
no_recovery: false,
|
||||||
no_rebalance: false,
|
no_rebalance: false,
|
||||||
print_stats_interval: 3,
|
print_stats_interval: 3,
|
||||||
slow_log_interval: 10,
|
slow_log_interval: 10,
|
||||||
inode_vanish_time: 60,
|
inode_vanish_time: 60,
|
||||||
osd_memlock: false,
|
|
||||||
// blockstore - fixed in superblock
|
// blockstore - fixed in superblock
|
||||||
block_size,
|
block_size,
|
||||||
disk_alignment,
|
disk_alignment,
|
||||||
@@ -125,14 +128,15 @@ const etcd_tree = {
|
|||||||
meta_offset,
|
meta_offset,
|
||||||
disable_meta_fsync,
|
disable_meta_fsync,
|
||||||
disable_device_lock,
|
disable_device_lock,
|
||||||
// blockstore - configurable
|
// blockstore - configurable offline
|
||||||
max_write_iodepth,
|
|
||||||
min_flusher_count: 1,
|
|
||||||
max_flusher_count: 256,
|
|
||||||
inmemory_metadata,
|
inmemory_metadata,
|
||||||
inmemory_journal,
|
inmemory_journal,
|
||||||
journal_sector_buffer_count,
|
journal_sector_buffer_count,
|
||||||
journal_no_same_sector_overwrites,
|
journal_no_same_sector_overwrites,
|
||||||
|
// blockstore - configurable online
|
||||||
|
max_write_iodepth,
|
||||||
|
min_flusher_count: 1,
|
||||||
|
max_flusher_count: 256,
|
||||||
throttle_small_writes: false,
|
throttle_small_writes: false,
|
||||||
throttle_target_iops: 100,
|
throttle_target_iops: 100,
|
||||||
throttle_target_mbs: 100,
|
throttle_target_mbs: 100,
|
||||||
|
@@ -50,7 +50,7 @@ from cinder.volume import configuration
|
|||||||
from cinder.volume import driver
|
from cinder.volume import driver
|
||||||
from cinder.volume import volume_utils
|
from cinder.volume import volume_utils
|
||||||
|
|
||||||
VERSION = '0.8.5'
|
VERSION = '0.8.7'
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@@ -25,4 +25,4 @@ rm fio
|
|||||||
mv fio-copy fio
|
mv fio-copy fio
|
||||||
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||||
tar --transform 's#^#vitastor-0.8.5/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.5$(rpm --eval '%dist').tar.gz *
|
tar --transform 's#^#vitastor-0.8.7/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.7$(rpm --eval '%dist').tar.gz *
|
||||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
|||||||
RUN set -e; \
|
RUN set -e; \
|
||||||
cd /root/vitastor/rpm; \
|
cd /root/vitastor/rpm; \
|
||||||
sh build-tarball.sh; \
|
sh build-tarball.sh; \
|
||||||
cp /root/vitastor-0.8.5.el7.tar.gz ~/rpmbuild/SOURCES; \
|
cp /root/vitastor-0.8.7.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||||
cd ~/rpmbuild/SPECS/; \
|
cd ~/rpmbuild/SPECS/; \
|
||||||
rpmbuild -ba vitastor.spec; \
|
rpmbuild -ba vitastor.spec; \
|
||||||
|
@@ -1,11 +1,11 @@
|
|||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 0.8.5
|
Version: 0.8.7
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-0.8.5.el7.tar.gz
|
Source0: vitastor-0.8.7.el7.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
|||||||
RUN set -e; \
|
RUN set -e; \
|
||||||
cd /root/vitastor/rpm; \
|
cd /root/vitastor/rpm; \
|
||||||
sh build-tarball.sh; \
|
sh build-tarball.sh; \
|
||||||
cp /root/vitastor-0.8.5.el8.tar.gz ~/rpmbuild/SOURCES; \
|
cp /root/vitastor-0.8.7.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||||
cd ~/rpmbuild/SPECS/; \
|
cd ~/rpmbuild/SPECS/; \
|
||||||
rpmbuild -ba vitastor.spec; \
|
rpmbuild -ba vitastor.spec; \
|
||||||
|
@@ -1,11 +1,11 @@
|
|||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 0.8.5
|
Version: 0.8.7
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-0.8.5.el8.tar.gz
|
Source0: vitastor-0.8.7.el8.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
cmake_minimum_required(VERSION 2.8)
|
cmake_minimum_required(VERSION 2.8.12)
|
||||||
|
|
||||||
project(vitastor)
|
project(vitastor)
|
||||||
|
|
||||||
@@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
|||||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
add_definitions(-DVERSION="0.8.5")
|
add_definitions(-DVERSION="0.8.7")
|
||||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
|
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
|
||||||
if (${WITH_ASAN})
|
if (${WITH_ASAN})
|
||||||
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
||||||
|
@@ -13,6 +13,11 @@ blockstore_t::~blockstore_t()
|
|||||||
delete impl;
|
delete impl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void blockstore_t::parse_config(blockstore_config_t & config)
|
||||||
|
{
|
||||||
|
impl->parse_config(config, false);
|
||||||
|
}
|
||||||
|
|
||||||
void blockstore_t::loop()
|
void blockstore_t::loop()
|
||||||
{
|
{
|
||||||
impl->loop();
|
impl->loop();
|
||||||
|
@@ -107,7 +107,7 @@ Input:
|
|||||||
- buf = pre-allocated obj_ver_id array <len> units long
|
- buf = pre-allocated obj_ver_id array <len> units long
|
||||||
|
|
||||||
Output:
|
Output:
|
||||||
- retval = 0 or negative error number (-EINVAL, -ENOENT if no such version or -EBUSY if not synced)
|
- retval = 0 or negative error number (-ENOENT if no such version for stabilize)
|
||||||
|
|
||||||
## BS_OP_SYNC_STAB_ALL
|
## BS_OP_SYNC_STAB_ALL
|
||||||
|
|
||||||
@@ -165,6 +165,9 @@ public:
|
|||||||
blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
|
blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
|
||||||
~blockstore_t();
|
~blockstore_t();
|
||||||
|
|
||||||
|
// Update configuration
|
||||||
|
void parse_config(blockstore_config_t & config);
|
||||||
|
|
||||||
// Event loop
|
// Event loop
|
||||||
void loop();
|
void loop();
|
||||||
|
|
||||||
|
@@ -11,7 +11,7 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
|
|||||||
ring_consumer.loop = [this]() { loop(); };
|
ring_consumer.loop = [this]() { loop(); };
|
||||||
ringloop->register_consumer(&ring_consumer);
|
ringloop->register_consumer(&ring_consumer);
|
||||||
initialized = 0;
|
initialized = 0;
|
||||||
parse_config(config);
|
parse_config(config, true);
|
||||||
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.data_block_size);
|
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.data_block_size);
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
@@ -171,7 +171,7 @@ void blockstore_impl_t::loop()
|
|||||||
// Can't submit SYNC before previous writes
|
// Can't submit SYNC before previous writes
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
wr_st = continue_sync(op, false);
|
wr_st = continue_sync(op);
|
||||||
if (wr_st != 2)
|
if (wr_st != 2)
|
||||||
{
|
{
|
||||||
has_writes = wr_st > 0 ? 1 : 2;
|
has_writes = wr_st > 0 ? 1 : 2;
|
||||||
@@ -371,13 +371,18 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
|
|||||||
ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
|
ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
init_op(op);
|
||||||
|
submit_queue.push_back(op);
|
||||||
|
ringloop->wakeup();
|
||||||
|
}
|
||||||
|
|
||||||
|
void blockstore_impl_t::init_op(blockstore_op_t *op)
|
||||||
|
{
|
||||||
// Call constructor without allocating memory. We'll call destructor before returning op back
|
// Call constructor without allocating memory. We'll call destructor before returning op back
|
||||||
new ((void*)op->private_data) blockstore_op_private_t;
|
new ((void*)op->private_data) blockstore_op_private_t;
|
||||||
PRIV(op)->wait_for = 0;
|
PRIV(op)->wait_for = 0;
|
||||||
PRIV(op)->op_state = 0;
|
PRIV(op)->op_state = 0;
|
||||||
PRIV(op)->pending_ops = 0;
|
PRIV(op)->pending_ops = 0;
|
||||||
submit_queue.push_back(op);
|
|
||||||
ringloop->wakeup();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool replace_stable(object_id oid, uint64_t version, int search_start, int search_end, obj_ver_id* list)
|
static bool replace_stable(object_id oid, uint64_t version, int search_start, int search_end, obj_ver_id* list)
|
||||||
|
@@ -216,6 +216,11 @@ struct pool_shard_settings_t
|
|||||||
uint32_t pg_stripe_size;
|
uint32_t pg_stripe_size;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#define STAB_SPLIT_DONE 1
|
||||||
|
#define STAB_SPLIT_WAIT 2
|
||||||
|
#define STAB_SPLIT_SYNC 3
|
||||||
|
#define STAB_SPLIT_TODO 4
|
||||||
|
|
||||||
class blockstore_impl_t
|
class blockstore_impl_t
|
||||||
{
|
{
|
||||||
blockstore_disk_t dsk;
|
blockstore_disk_t dsk;
|
||||||
@@ -277,7 +282,6 @@ class blockstore_impl_t
|
|||||||
friend class journal_flusher_t;
|
friend class journal_flusher_t;
|
||||||
friend class journal_flusher_co;
|
friend class journal_flusher_co;
|
||||||
|
|
||||||
void parse_config(blockstore_config_t & config);
|
|
||||||
void calc_lengths();
|
void calc_lengths();
|
||||||
void open_data();
|
void open_data();
|
||||||
void open_meta();
|
void open_meta();
|
||||||
@@ -299,6 +303,7 @@ class blockstore_impl_t
|
|||||||
blockstore_init_journal* journal_init_reader;
|
blockstore_init_journal* journal_init_reader;
|
||||||
|
|
||||||
void check_wait(blockstore_op_t *op);
|
void check_wait(blockstore_op_t *op);
|
||||||
|
void init_op(blockstore_op_t *op);
|
||||||
|
|
||||||
// Read
|
// Read
|
||||||
int dequeue_read(blockstore_op_t *read_op);
|
int dequeue_read(blockstore_op_t *read_op);
|
||||||
@@ -318,7 +323,7 @@ class blockstore_impl_t
|
|||||||
void handle_write_event(ring_data_t *data, blockstore_op_t *op);
|
void handle_write_event(ring_data_t *data, blockstore_op_t *op);
|
||||||
|
|
||||||
// Sync
|
// Sync
|
||||||
int continue_sync(blockstore_op_t *op, bool queue_has_in_progress_sync);
|
int continue_sync(blockstore_op_t *op);
|
||||||
void ack_sync(blockstore_op_t *op);
|
void ack_sync(blockstore_op_t *op);
|
||||||
|
|
||||||
// Stabilize
|
// Stabilize
|
||||||
@@ -326,6 +331,8 @@ class blockstore_impl_t
|
|||||||
int continue_stable(blockstore_op_t *op);
|
int continue_stable(blockstore_op_t *op);
|
||||||
void mark_stable(const obj_ver_id & ov, bool forget_dirty = false);
|
void mark_stable(const obj_ver_id & ov, bool forget_dirty = false);
|
||||||
void stabilize_object(object_id oid, uint64_t max_ver);
|
void stabilize_object(object_id oid, uint64_t max_ver);
|
||||||
|
blockstore_op_t* selective_sync(blockstore_op_t *op);
|
||||||
|
int split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider);
|
||||||
|
|
||||||
// Rollback
|
// Rollback
|
||||||
int dequeue_rollback(blockstore_op_t *op);
|
int dequeue_rollback(blockstore_op_t *op);
|
||||||
@@ -341,6 +348,8 @@ public:
|
|||||||
blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
|
blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
|
||||||
~blockstore_impl_t();
|
~blockstore_impl_t();
|
||||||
|
|
||||||
|
void parse_config(blockstore_config_t & config, bool init);
|
||||||
|
|
||||||
// Event loop
|
// Event loop
|
||||||
void loop();
|
void loop();
|
||||||
|
|
||||||
|
@@ -4,8 +4,54 @@
|
|||||||
#include <sys/file.h>
|
#include <sys/file.h>
|
||||||
#include "blockstore_impl.h"
|
#include "blockstore_impl.h"
|
||||||
|
|
||||||
void blockstore_impl_t::parse_config(blockstore_config_t & config)
|
void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
||||||
{
|
{
|
||||||
|
// Online-configurable options:
|
||||||
|
max_flusher_count = strtoull(config["max_flusher_count"].c_str(), NULL, 10);
|
||||||
|
if (!max_flusher_count)
|
||||||
|
{
|
||||||
|
max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
|
||||||
|
}
|
||||||
|
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
|
||||||
|
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
|
||||||
|
throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
|
||||||
|
throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
|
||||||
|
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
|
||||||
|
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
|
||||||
|
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
|
||||||
|
if (!max_flusher_count)
|
||||||
|
{
|
||||||
|
max_flusher_count = 256;
|
||||||
|
}
|
||||||
|
if (!min_flusher_count || journal.flush_journal)
|
||||||
|
{
|
||||||
|
min_flusher_count = 1;
|
||||||
|
}
|
||||||
|
if (!max_write_iodepth)
|
||||||
|
{
|
||||||
|
max_write_iodepth = 128;
|
||||||
|
}
|
||||||
|
if (!throttle_target_iops)
|
||||||
|
{
|
||||||
|
throttle_target_iops = 100;
|
||||||
|
}
|
||||||
|
if (!throttle_target_mbs)
|
||||||
|
{
|
||||||
|
throttle_target_mbs = 100;
|
||||||
|
}
|
||||||
|
if (!throttle_target_parallelism)
|
||||||
|
{
|
||||||
|
throttle_target_parallelism = 1;
|
||||||
|
}
|
||||||
|
if (!throttle_threshold_us)
|
||||||
|
{
|
||||||
|
throttle_threshold_us = 50;
|
||||||
|
}
|
||||||
|
if (!init)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// Offline-configurable options:
|
||||||
// Common disk options
|
// Common disk options
|
||||||
dsk.parse_config(config);
|
dsk.parse_config(config);
|
||||||
// Parse
|
// Parse
|
||||||
@@ -44,29 +90,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
|
|||||||
journal.no_same_sector_overwrites = config["journal_no_same_sector_overwrites"] == "true" ||
|
journal.no_same_sector_overwrites = config["journal_no_same_sector_overwrites"] == "true" ||
|
||||||
config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
|
config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
|
||||||
journal.inmemory = config["inmemory_journal"] != "false";
|
journal.inmemory = config["inmemory_journal"] != "false";
|
||||||
max_flusher_count = strtoull(config["max_flusher_count"].c_str(), NULL, 10);
|
|
||||||
if (!max_flusher_count)
|
|
||||||
max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
|
|
||||||
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
|
|
||||||
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
|
|
||||||
throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
|
|
||||||
throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
|
|
||||||
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
|
|
||||||
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
|
|
||||||
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
|
|
||||||
// Validate
|
// Validate
|
||||||
if (!max_flusher_count)
|
|
||||||
{
|
|
||||||
max_flusher_count = 256;
|
|
||||||
}
|
|
||||||
if (!min_flusher_count || journal.flush_journal)
|
|
||||||
{
|
|
||||||
min_flusher_count = 1;
|
|
||||||
}
|
|
||||||
if (!max_write_iodepth)
|
|
||||||
{
|
|
||||||
max_write_iodepth = 128;
|
|
||||||
}
|
|
||||||
if (journal.sector_count < 2)
|
if (journal.sector_count < 2)
|
||||||
{
|
{
|
||||||
journal.sector_count = 32;
|
journal.sector_count = 32;
|
||||||
@@ -91,22 +115,6 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
|
|||||||
{
|
{
|
||||||
throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync");
|
throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync");
|
||||||
}
|
}
|
||||||
if (!throttle_target_iops)
|
|
||||||
{
|
|
||||||
throttle_target_iops = 100;
|
|
||||||
}
|
|
||||||
if (!throttle_target_mbs)
|
|
||||||
{
|
|
||||||
throttle_target_mbs = 100;
|
|
||||||
}
|
|
||||||
if (!throttle_target_parallelism)
|
|
||||||
{
|
|
||||||
throttle_target_parallelism = 1;
|
|
||||||
}
|
|
||||||
if (!throttle_threshold_us)
|
|
||||||
{
|
|
||||||
throttle_threshold_us = 50;
|
|
||||||
}
|
|
||||||
// init some fields
|
// init some fields
|
||||||
journal.block_size = dsk.journal_block_size;
|
journal.block_size = dsk.journal_block_size;
|
||||||
journal.next_free = dsk.journal_block_size;
|
journal.next_free = dsk.journal_block_size;
|
||||||
|
@@ -9,48 +9,39 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
|
|||||||
{
|
{
|
||||||
return continue_rollback(op);
|
return continue_rollback(op);
|
||||||
}
|
}
|
||||||
obj_ver_id *v, *nv;
|
int r = split_stab_op(op, [this](obj_ver_id ov)
|
||||||
int i, todo = op->len;
|
|
||||||
for (i = 0, v = (obj_ver_id*)op->buf, nv = (obj_ver_id*)op->buf; i < op->len; i++, v++, nv++)
|
|
||||||
{
|
{
|
||||||
if (nv != v)
|
|
||||||
{
|
|
||||||
*nv = *v;
|
|
||||||
}
|
|
||||||
// Check that there are some versions greater than v->version (which may be zero),
|
// Check that there are some versions greater than v->version (which may be zero),
|
||||||
// check that they're unstable, synced, and not currently written to
|
// check that they're unstable, synced, and not currently written to
|
||||||
auto dirty_it = dirty_db.lower_bound((obj_ver_id){
|
auto dirty_it = dirty_db.lower_bound((obj_ver_id){
|
||||||
.oid = v->oid,
|
.oid = ov.oid,
|
||||||
.version = UINT64_MAX,
|
.version = UINT64_MAX,
|
||||||
});
|
});
|
||||||
if (dirty_it == dirty_db.begin())
|
if (dirty_it == dirty_db.begin())
|
||||||
{
|
{
|
||||||
skip_ov:
|
|
||||||
// Already rolled back, skip this object version
|
// Already rolled back, skip this object version
|
||||||
todo--;
|
return STAB_SPLIT_DONE;
|
||||||
nv--;
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
dirty_it--;
|
dirty_it--;
|
||||||
if (dirty_it->first.oid != v->oid || dirty_it->first.version < v->version)
|
if (dirty_it->first.oid != ov.oid || dirty_it->first.version < ov.version)
|
||||||
{
|
{
|
||||||
goto skip_ov;
|
// Already rolled back, skip this object version
|
||||||
|
return STAB_SPLIT_DONE;
|
||||||
}
|
}
|
||||||
while (dirty_it->first.oid == v->oid && dirty_it->first.version > v->version)
|
while (dirty_it->first.oid == ov.oid && dirty_it->first.version > ov.version)
|
||||||
{
|
{
|
||||||
if (IS_IN_FLIGHT(dirty_it->second.state))
|
if (IS_IN_FLIGHT(dirty_it->second.state))
|
||||||
{
|
{
|
||||||
// Object write is still in progress. Wait until the write request completes
|
// Object write is still in progress. Wait until the write request completes
|
||||||
return 0;
|
return STAB_SPLIT_WAIT;
|
||||||
}
|
}
|
||||||
else if (!IS_SYNCED(dirty_it->second.state) ||
|
else if (!IS_SYNCED(dirty_it->second.state) ||
|
||||||
IS_STABLE(dirty_it->second.state))
|
IS_STABLE(dirty_it->second.state))
|
||||||
{
|
{
|
||||||
op->retval = -EBUSY;
|
// Sync the object
|
||||||
FINISH_OP(op);
|
return STAB_SPLIT_SYNC;
|
||||||
return 2;
|
|
||||||
}
|
}
|
||||||
if (dirty_it == dirty_db.begin())
|
if (dirty_it == dirty_db.begin())
|
||||||
{
|
{
|
||||||
@@ -58,19 +49,16 @@ skip_ov:
|
|||||||
}
|
}
|
||||||
dirty_it--;
|
dirty_it--;
|
||||||
}
|
}
|
||||||
|
return STAB_SPLIT_TODO;
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
op->len = todo;
|
if (r != 1)
|
||||||
if (!todo)
|
|
||||||
{
|
{
|
||||||
// Already rolled back
|
return r;
|
||||||
op->retval = 0;
|
|
||||||
FINISH_OP(op);
|
|
||||||
return 2;
|
|
||||||
}
|
}
|
||||||
// Check journal space
|
// Check journal space
|
||||||
blockstore_journal_check_t space_check(this);
|
blockstore_journal_check_t space_check(this);
|
||||||
if (!space_check.check_available(op, todo, sizeof(journal_entry_rollback), 0))
|
if (!space_check.check_available(op, op->len, sizeof(journal_entry_rollback), 0))
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@@ -78,7 +66,8 @@ skip_ov:
|
|||||||
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
|
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
|
||||||
// Prepare and submit journal entries
|
// Prepare and submit journal entries
|
||||||
int s = 0;
|
int s = 0;
|
||||||
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
auto v = (obj_ver_id*)op->buf;
|
||||||
|
for (int i = 0; i < op->len; i++, v++)
|
||||||
{
|
{
|
||||||
if (!journal.entry_fits(sizeof(journal_entry_rollback)) &&
|
if (!journal.entry_fits(sizeof(journal_entry_rollback)) &&
|
||||||
journal.sector_info[journal.cur_sector].dirty)
|
journal.sector_info[journal.cur_sector].dirty)
|
||||||
|
@@ -41,60 +41,309 @@
|
|||||||
// 4) after a while it takes his synced object list and sends stabilize requests
|
// 4) after a while it takes his synced object list and sends stabilize requests
|
||||||
// to peers and to its own blockstore, thus freeing the old version
|
// to peers and to its own blockstore, thus freeing the old version
|
||||||
|
|
||||||
int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
struct ver_vector_t
|
||||||
{
|
{
|
||||||
if (PRIV(op)->op_state)
|
obj_ver_id *items = NULL;
|
||||||
|
uint64_t alloc = 0, size = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
static void init_versions(ver_vector_t & vec, obj_ver_id *start, obj_ver_id *end, uint64_t len)
|
||||||
|
{
|
||||||
|
if (!vec.items)
|
||||||
{
|
{
|
||||||
return continue_stable(op);
|
vec.alloc = len;
|
||||||
|
vec.items = (obj_ver_id*)malloc_or_die(sizeof(obj_ver_id) * vec.alloc);
|
||||||
|
for (auto sv = start; sv < end; sv++)
|
||||||
|
{
|
||||||
|
vec.items[vec.size++] = *sv;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void append_version(ver_vector_t & vec, obj_ver_id ov)
|
||||||
|
{
|
||||||
|
if (vec.size >= vec.alloc)
|
||||||
|
{
|
||||||
|
vec.alloc = !vec.alloc ? 4 : vec.alloc*2;
|
||||||
|
vec.items = (obj_ver_id*)realloc_or_die(vec.items, sizeof(obj_ver_id) * vec.alloc);
|
||||||
|
}
|
||||||
|
vec.items[vec.size++] = ov;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool check_unsynced(std::vector<obj_ver_id> & check, obj_ver_id ov, std::vector<obj_ver_id> & to, int *count)
|
||||||
|
{
|
||||||
|
bool found = false;
|
||||||
|
int j = 0, k = 0;
|
||||||
|
while (j < check.size())
|
||||||
|
{
|
||||||
|
if (check[j] == ov)
|
||||||
|
found = true;
|
||||||
|
if (check[j].oid == ov.oid && check[j].version <= ov.version)
|
||||||
|
{
|
||||||
|
to.push_back(check[j++]);
|
||||||
|
if (count)
|
||||||
|
(*count)--;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
check[k++] = check[j++];
|
||||||
|
}
|
||||||
|
check.resize(k);
|
||||||
|
return found;
|
||||||
|
}
|
||||||
|
|
||||||
|
blockstore_op_t* blockstore_impl_t::selective_sync(blockstore_op_t *op)
|
||||||
|
{
|
||||||
|
unsynced_big_write_count -= unsynced_big_writes.size();
|
||||||
|
unsynced_big_writes.swap(PRIV(op)->sync_big_writes);
|
||||||
|
unsynced_big_write_count += unsynced_big_writes.size();
|
||||||
|
unsynced_small_writes.swap(PRIV(op)->sync_small_writes);
|
||||||
|
// Create a sync operation, insert into the end of the queue
|
||||||
|
// And move ourselves into the end too!
|
||||||
|
// Rather hacky but that's what we need...
|
||||||
|
blockstore_op_t *sync_op = new blockstore_op_t;
|
||||||
|
sync_op->opcode = BS_OP_SYNC;
|
||||||
|
sync_op->buf = NULL;
|
||||||
|
sync_op->callback = [this](blockstore_op_t *sync_op)
|
||||||
|
{
|
||||||
|
delete sync_op;
|
||||||
|
};
|
||||||
|
init_op(sync_op);
|
||||||
|
int sync_res = continue_sync(sync_op);
|
||||||
|
if (sync_res != 2)
|
||||||
|
{
|
||||||
|
// Put SYNC into the queue if it's not finished yet
|
||||||
|
submit_queue.push_back(sync_op);
|
||||||
|
}
|
||||||
|
// Restore unsynced_writes
|
||||||
|
unsynced_small_writes.swap(PRIV(op)->sync_small_writes);
|
||||||
|
unsynced_big_write_count -= unsynced_big_writes.size();
|
||||||
|
unsynced_big_writes.swap(PRIV(op)->sync_big_writes);
|
||||||
|
unsynced_big_write_count += unsynced_big_writes.size();
|
||||||
|
if (sync_res == 2)
|
||||||
|
{
|
||||||
|
// Sync is immediately completed
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
return sync_op;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns: 2 = stop processing and dequeue, 0 = stop processing and do not dequeue, 1 = proceed with op itself
|
||||||
|
int blockstore_impl_t::split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider)
|
||||||
|
{
|
||||||
|
bool add_sync = false;
|
||||||
|
ver_vector_t good_vers, bad_vers;
|
||||||
obj_ver_id* v;
|
obj_ver_id* v;
|
||||||
int i, todo = 0;
|
int i, todo = 0;
|
||||||
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
||||||
{
|
{
|
||||||
auto dirty_it = dirty_db.find(*v);
|
int action = decider(*v);
|
||||||
if (dirty_it == dirty_db.end())
|
if (action < 0)
|
||||||
{
|
{
|
||||||
auto & clean_db = clean_db_shard(v->oid);
|
// Rollback changes
|
||||||
auto clean_it = clean_db.find(v->oid);
|
for (auto & ov: PRIV(op)->sync_big_writes)
|
||||||
if (clean_it == clean_db.end() || clean_it->second.version < v->version)
|
|
||||||
{
|
{
|
||||||
// No such object version
|
unsynced_big_writes.push_back(ov);
|
||||||
op->retval = -ENOENT;
|
unsynced_big_write_count++;
|
||||||
FINISH_OP(op);
|
|
||||||
return 2;
|
|
||||||
}
|
}
|
||||||
else
|
for (auto & ov: PRIV(op)->sync_small_writes)
|
||||||
{
|
{
|
||||||
// Already stable
|
unsynced_small_writes.push_back(ov);
|
||||||
}
|
}
|
||||||
}
|
free(good_vers.items);
|
||||||
else if (IS_IN_FLIGHT(dirty_it->second.state))
|
good_vers.items = NULL;
|
||||||
{
|
free(bad_vers.items);
|
||||||
// Object write is still in progress. Wait until the write request completes
|
bad_vers.items = NULL;
|
||||||
return 0;
|
// Error
|
||||||
}
|
op->retval = action;
|
||||||
else if (!IS_SYNCED(dirty_it->second.state))
|
|
||||||
{
|
|
||||||
// Object not synced yet. Caller must sync it first
|
|
||||||
op->retval = -EBUSY;
|
|
||||||
FINISH_OP(op);
|
FINISH_OP(op);
|
||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
else if (!IS_STABLE(dirty_it->second.state))
|
else if (action == STAB_SPLIT_DONE)
|
||||||
{
|
{
|
||||||
|
// Already done
|
||||||
|
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
|
||||||
|
}
|
||||||
|
else if (action == STAB_SPLIT_WAIT)
|
||||||
|
{
|
||||||
|
// Already in progress, we just have to wait until it finishes
|
||||||
|
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
|
||||||
|
append_version(bad_vers, *v);
|
||||||
|
}
|
||||||
|
else if (action == STAB_SPLIT_SYNC)
|
||||||
|
{
|
||||||
|
// Needs a SYNC, we have to send a SYNC if not already in progress
|
||||||
|
//
|
||||||
|
// If the object is not present in unsynced_(big|small)_writes then
|
||||||
|
// it's currently being synced. If it's present then we can initiate
|
||||||
|
// its sync ourselves.
|
||||||
|
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
|
||||||
|
append_version(bad_vers, *v);
|
||||||
|
if (!add_sync)
|
||||||
|
{
|
||||||
|
PRIV(op)->sync_big_writes.clear();
|
||||||
|
PRIV(op)->sync_small_writes.clear();
|
||||||
|
add_sync = true;
|
||||||
|
}
|
||||||
|
check_unsynced(unsynced_small_writes, *v, PRIV(op)->sync_small_writes, NULL);
|
||||||
|
check_unsynced(unsynced_big_writes, *v, PRIV(op)->sync_big_writes, &unsynced_big_write_count);
|
||||||
|
}
|
||||||
|
else /* if (action == STAB_SPLIT_TODO) */
|
||||||
|
{
|
||||||
|
if (good_vers.items)
|
||||||
|
{
|
||||||
|
// If we're selecting versions then append it
|
||||||
|
// Main idea is that 99% of the time all versions passed to BS_OP_STABLE are synced
|
||||||
|
// And we don't want to select/allocate anything in that optimistic case
|
||||||
|
append_version(good_vers, *v);
|
||||||
|
}
|
||||||
todo++;
|
todo++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!todo)
|
// In a pessimistic scenario, an operation may be split into 3:
|
||||||
|
// - Stabilize synced entries
|
||||||
|
// - Sync unsynced entries
|
||||||
|
// - Continue for unsynced entries after sync
|
||||||
|
add_sync = add_sync && (PRIV(op)->sync_big_writes.size() || PRIV(op)->sync_small_writes.size());
|
||||||
|
if (!todo && !bad_vers.size)
|
||||||
{
|
{
|
||||||
// Already stable
|
// Already stable
|
||||||
op->retval = 0;
|
op->retval = 0;
|
||||||
FINISH_OP(op);
|
FINISH_OP(op);
|
||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
|
op->retval = 0;
|
||||||
|
if (!todo && !add_sync)
|
||||||
|
{
|
||||||
|
// Only wait for inflight writes or current in-progress syncs
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
blockstore_op_t *sync_op = NULL, *split_stab_op = NULL;
|
||||||
|
if (add_sync)
|
||||||
|
{
|
||||||
|
// Initiate a selective sync for PRIV(op)->sync_(big|small)_writes
|
||||||
|
sync_op = selective_sync(op);
|
||||||
|
}
|
||||||
|
if (bad_vers.size)
|
||||||
|
{
|
||||||
|
// Split part of the request into a separate operation
|
||||||
|
split_stab_op = new blockstore_op_t;
|
||||||
|
split_stab_op->opcode = op->opcode;
|
||||||
|
split_stab_op->buf = bad_vers.items;
|
||||||
|
split_stab_op->len = bad_vers.size;
|
||||||
|
init_op(split_stab_op);
|
||||||
|
submit_queue.push_back(split_stab_op);
|
||||||
|
}
|
||||||
|
if (sync_op || split_stab_op || good_vers.items)
|
||||||
|
{
|
||||||
|
void *orig_buf = op->buf;
|
||||||
|
if (good_vers.items)
|
||||||
|
{
|
||||||
|
op->buf = good_vers.items;
|
||||||
|
op->len = good_vers.size;
|
||||||
|
}
|
||||||
|
// Make a wrapped callback
|
||||||
|
int *split_op_counter = (int*)malloc_or_die(sizeof(int));
|
||||||
|
*split_op_counter = (sync_op ? 1 : 0) + (split_stab_op ? 1 : 0) + (todo ? 1 : 0);
|
||||||
|
auto cb = [this, op, good_items = good_vers.items,
|
||||||
|
bad_items = bad_vers.items, split_op_counter,
|
||||||
|
orig_buf, real_cb = op->callback](blockstore_op_t *split_op)
|
||||||
|
{
|
||||||
|
if (split_op->retval != 0)
|
||||||
|
op->retval = split_op->retval;
|
||||||
|
(*split_op_counter)--;
|
||||||
|
assert((*split_op_counter) >= 0);
|
||||||
|
if (op != split_op)
|
||||||
|
delete split_op;
|
||||||
|
if (!*split_op_counter)
|
||||||
|
{
|
||||||
|
free(good_items);
|
||||||
|
free(bad_items);
|
||||||
|
free(split_op_counter);
|
||||||
|
op->buf = orig_buf;
|
||||||
|
real_cb(op);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
if (sync_op)
|
||||||
|
{
|
||||||
|
sync_op->callback = cb;
|
||||||
|
}
|
||||||
|
if (split_stab_op)
|
||||||
|
{
|
||||||
|
split_stab_op->callback = cb;
|
||||||
|
}
|
||||||
|
op->callback = cb;
|
||||||
|
}
|
||||||
|
if (!todo)
|
||||||
|
{
|
||||||
|
// All work is postponed
|
||||||
|
op->callback = NULL;
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
||||||
|
{
|
||||||
|
if (PRIV(op)->op_state)
|
||||||
|
{
|
||||||
|
return continue_stable(op);
|
||||||
|
}
|
||||||
|
int r = split_stab_op(op, [this](obj_ver_id ov)
|
||||||
|
{
|
||||||
|
auto dirty_it = dirty_db.find(ov);
|
||||||
|
if (dirty_it == dirty_db.end())
|
||||||
|
{
|
||||||
|
auto & clean_db = clean_db_shard(ov.oid);
|
||||||
|
auto clean_it = clean_db.find(ov.oid);
|
||||||
|
if (clean_it == clean_db.end() || clean_it->second.version < ov.version)
|
||||||
|
{
|
||||||
|
// No such object version
|
||||||
|
printf("Error: %lx:%lx v%lu not found while stabilizing\n", ov.oid.inode, ov.oid.stripe, ov.version);
|
||||||
|
return -ENOENT;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Already stable
|
||||||
|
return STAB_SPLIT_DONE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (IS_IN_FLIGHT(dirty_it->second.state))
|
||||||
|
{
|
||||||
|
// Object write is still in progress. Wait until the write request completes
|
||||||
|
return STAB_SPLIT_WAIT;
|
||||||
|
}
|
||||||
|
else if (!IS_SYNCED(dirty_it->second.state))
|
||||||
|
{
|
||||||
|
// Object not synced yet - sync it
|
||||||
|
// In previous versions we returned EBUSY here and required
|
||||||
|
// the caller (OSD) to issue a global sync first. But a global sync
|
||||||
|
// waits for all writes in the queue including inflight writes. And
|
||||||
|
// inflight writes may themselves be blocked by unstable writes being
|
||||||
|
// still present in the journal and not flushed away from it.
|
||||||
|
// So we must sync specific objects here.
|
||||||
|
//
|
||||||
|
// Even more, we have to process "stabilize" request in parts. That is,
|
||||||
|
// we must stabilize all objects which are already synced. Otherwise
|
||||||
|
// they may block objects which are NOT synced yet.
|
||||||
|
return STAB_SPLIT_SYNC;
|
||||||
|
}
|
||||||
|
else if (IS_STABLE(dirty_it->second.state))
|
||||||
|
{
|
||||||
|
// Already stable
|
||||||
|
return STAB_SPLIT_DONE;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return STAB_SPLIT_TODO;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
if (r != 1)
|
||||||
|
{
|
||||||
|
return r;
|
||||||
|
}
|
||||||
// Check journal space
|
// Check journal space
|
||||||
blockstore_journal_check_t space_check(this);
|
blockstore_journal_check_t space_check(this);
|
||||||
if (!space_check.check_available(op, todo, sizeof(journal_entry_stable), 0))
|
if (!space_check.check_available(op, op->len, sizeof(journal_entry_stable), 0))
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@@ -102,9 +351,9 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
|||||||
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
|
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
|
||||||
// Prepare and submit journal entries
|
// Prepare and submit journal entries
|
||||||
int s = 0;
|
int s = 0;
|
||||||
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
auto v = (obj_ver_id*)op->buf;
|
||||||
|
for (int i = 0; i < op->len; i++, v++)
|
||||||
{
|
{
|
||||||
// FIXME: Only stabilize versions that aren't stable yet
|
|
||||||
if (!journal.entry_fits(sizeof(journal_entry_stable)) &&
|
if (!journal.entry_fits(sizeof(journal_entry_stable)) &&
|
||||||
journal.sector_info[journal.cur_sector].dirty)
|
journal.sector_info[journal.cur_sector].dirty)
|
||||||
{
|
{
|
||||||
|
@@ -12,7 +12,7 @@
|
|||||||
#define SYNC_JOURNAL_SYNC_SENT 7
|
#define SYNC_JOURNAL_SYNC_SENT 7
|
||||||
#define SYNC_DONE 8
|
#define SYNC_DONE 8
|
||||||
|
|
||||||
int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_progress_sync)
|
int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
||||||
{
|
{
|
||||||
if (immediate_commit == IMMEDIATE_ALL)
|
if (immediate_commit == IMMEDIATE_ALL)
|
||||||
{
|
{
|
||||||
@@ -145,7 +145,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
|
|||||||
PRIV(op)->op_state = SYNC_DONE;
|
PRIV(op)->op_state = SYNC_DONE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (PRIV(op)->op_state == SYNC_DONE && !queue_has_in_progress_sync)
|
if (PRIV(op)->op_state == SYNC_DONE)
|
||||||
{
|
{
|
||||||
ack_sync(op);
|
ack_sync(op);
|
||||||
return 2;
|
return 2;
|
||||||
|
@@ -278,7 +278,7 @@ struct rm_osd_t
|
|||||||
if (rsp["response_delete_range"]["deleted"].uint64_value() > 0)
|
if (rsp["response_delete_range"]["deleted"].uint64_value() > 0)
|
||||||
{
|
{
|
||||||
// Wait for mon_change_timeout before updating PG history, or the monitor's change will likely interfere with ours
|
// Wait for mon_change_timeout before updating PG history, or the monitor's change will likely interfere with ours
|
||||||
retry_wait = parent->cli->merged_config["mon_change_timeout"].uint64_value();
|
retry_wait = parent->cli->config["mon_change_timeout"].uint64_value();
|
||||||
if (!retry_wait)
|
if (!retry_wait)
|
||||||
retry_wait = 1000;
|
retry_wait = 1000;
|
||||||
retry_wait += etcd_tx_retry_ms;
|
retry_wait += etcd_tx_retry_ms;
|
||||||
|
@@ -198,9 +198,9 @@ resume_2:
|
|||||||
}
|
}
|
||||||
pgs_by_state_str += std::to_string(kv.second)+" "+kv.first;
|
pgs_by_state_str += std::to_string(kv.second)+" "+kv.first;
|
||||||
}
|
}
|
||||||
bool readonly = json_is_true(parent->cli->merged_config["readonly"]);
|
bool readonly = json_is_true(parent->cli->config["readonly"]);
|
||||||
bool no_recovery = json_is_true(parent->cli->merged_config["no_recovery"]);
|
bool no_recovery = json_is_true(parent->cli->config["no_recovery"]);
|
||||||
bool no_rebalance = json_is_true(parent->cli->merged_config["no_rebalance"]);
|
bool no_rebalance = json_is_true(parent->cli->config["no_rebalance"]);
|
||||||
if (parent->json_output)
|
if (parent->json_output)
|
||||||
{
|
{
|
||||||
// JSON output
|
// JSON output
|
||||||
|
@@ -18,11 +18,12 @@
|
|||||||
|
|
||||||
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
|
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
|
||||||
{
|
{
|
||||||
config = osd_messenger_t::read_config(config);
|
cli_config = config.object_items();
|
||||||
|
file_config = osd_messenger_t::read_config(config);
|
||||||
|
config = osd_messenger_t::merge_configs(cli_config, file_config, etcd_global_config, {});
|
||||||
|
|
||||||
this->ringloop = ringloop;
|
this->ringloop = ringloop;
|
||||||
this->tfd = tfd;
|
this->tfd = tfd;
|
||||||
this->config = config;
|
|
||||||
|
|
||||||
msgr.osd_num = 0;
|
msgr.osd_num = 0;
|
||||||
msgr.tfd = tfd;
|
msgr.tfd = tfd;
|
||||||
@@ -58,7 +59,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
|||||||
msgr.stop_client(op->peer_fd);
|
msgr.stop_client(op->peer_fd);
|
||||||
delete op;
|
delete op;
|
||||||
};
|
};
|
||||||
msgr.parse_config(this->config);
|
msgr.parse_config(config);
|
||||||
|
|
||||||
st_cli.tfd = tfd;
|
st_cli.tfd = tfd;
|
||||||
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
|
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
|
||||||
@@ -276,13 +277,10 @@ restart:
|
|||||||
continuing_ops = 0;
|
continuing_ops = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void cluster_client_t::on_load_config_hook(json11::Json::object & config)
|
void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_config)
|
||||||
{
|
{
|
||||||
this->merged_config = config;
|
this->etcd_global_config = etcd_global_config;
|
||||||
for (auto & kv: this->config.object_items())
|
config = osd_messenger_t::merge_configs(cli_config, file_config, etcd_global_config, {});
|
||||||
{
|
|
||||||
this->merged_config[kv.first] = kv.second;
|
|
||||||
}
|
|
||||||
if (config.find("client_max_dirty_bytes") != config.end())
|
if (config.find("client_max_dirty_bytes") != config.end())
|
||||||
{
|
{
|
||||||
client_max_dirty_bytes = config["client_max_dirty_bytes"].uint64_value();
|
client_max_dirty_bytes = config["client_max_dirty_bytes"].uint64_value();
|
||||||
@@ -292,14 +290,13 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & config)
|
|||||||
// Old name
|
// Old name
|
||||||
client_max_dirty_bytes = config["client_dirty_limit"].uint64_value();
|
client_max_dirty_bytes = config["client_dirty_limit"].uint64_value();
|
||||||
}
|
}
|
||||||
if (config.find("client_max_dirty_ops") != config.end())
|
else
|
||||||
{
|
client_max_dirty_bytes = 0;
|
||||||
client_max_dirty_ops = config["client_max_dirty_ops"].uint64_value();
|
|
||||||
}
|
|
||||||
if (!client_max_dirty_bytes)
|
if (!client_max_dirty_bytes)
|
||||||
{
|
{
|
||||||
client_max_dirty_bytes = DEFAULT_CLIENT_MAX_DIRTY_BYTES;
|
client_max_dirty_bytes = DEFAULT_CLIENT_MAX_DIRTY_BYTES;
|
||||||
}
|
}
|
||||||
|
client_max_dirty_ops = config["client_max_dirty_ops"].uint64_value();
|
||||||
if (!client_max_dirty_ops)
|
if (!client_max_dirty_ops)
|
||||||
{
|
{
|
||||||
client_max_dirty_ops = DEFAULT_CLIENT_MAX_DIRTY_OPS;
|
client_max_dirty_ops = DEFAULT_CLIENT_MAX_DIRTY_OPS;
|
||||||
@@ -314,7 +311,7 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & config)
|
|||||||
up_wait_retry_interval = 50;
|
up_wait_retry_interval = 50;
|
||||||
}
|
}
|
||||||
msgr.parse_config(config);
|
msgr.parse_config(config);
|
||||||
msgr.parse_config(this->config);
|
st_cli.parse_config(config);
|
||||||
st_cli.load_pgs();
|
st_cli.load_pgs();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1121,6 +1118,24 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
|||||||
if (part->op.reply.hdr.retval != expected)
|
if (part->op.reply.hdr.retval != expected)
|
||||||
{
|
{
|
||||||
// Operation failed, retry
|
// Operation failed, retry
|
||||||
|
part->flags |= PART_ERROR;
|
||||||
|
if (!op->retval || op->retval == -EPIPE)
|
||||||
|
{
|
||||||
|
// Don't overwrite other errors with -EPIPE
|
||||||
|
op->retval = part->op.reply.hdr.retval;
|
||||||
|
}
|
||||||
|
int stop_fd = -1;
|
||||||
|
if (op->retval != -EINTR && op->retval != -EIO)
|
||||||
|
{
|
||||||
|
stop_fd = part->op.peer_fd;
|
||||||
|
fprintf(
|
||||||
|
stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
|
||||||
|
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
|
||||||
|
);
|
||||||
|
}
|
||||||
|
// All next things like timer, continue_sync/rw and stop_client may affect the operation again
|
||||||
|
// So do all these things after modifying operation state, otherwise we may hit reenterability bugs
|
||||||
|
// FIXME postpone such things to set_immediate here to avoid bugs
|
||||||
if (part->op.reply.hdr.retval == -EPIPE)
|
if (part->op.reply.hdr.retval == -EPIPE)
|
||||||
{
|
{
|
||||||
// Mark op->up_wait = true before stopping the client
|
// Mark op->up_wait = true before stopping the client
|
||||||
@@ -1134,20 +1149,17 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!op->retval || op->retval == -EPIPE)
|
if (op->inflight_count == 0)
|
||||||
{
|
{
|
||||||
// Don't overwrite other errors with -EPIPE
|
if (op->opcode == OSD_OP_SYNC)
|
||||||
op->retval = part->op.reply.hdr.retval;
|
continue_sync(op);
|
||||||
|
else
|
||||||
|
continue_rw(op);
|
||||||
}
|
}
|
||||||
if (op->retval != -EINTR && op->retval != -EIO)
|
if (stop_fd >= 0)
|
||||||
{
|
{
|
||||||
fprintf(
|
msgr.stop_client(stop_fd);
|
||||||
stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
|
|
||||||
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
|
|
||||||
);
|
|
||||||
msgr.stop_client(part->op.peer_fd);
|
|
||||||
}
|
}
|
||||||
part->flags |= PART_ERROR;
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@@ -1161,13 +1173,13 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
|||||||
copy_part_bitmap(op, part);
|
copy_part_bitmap(op, part);
|
||||||
op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
|
op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
|
||||||
}
|
}
|
||||||
}
|
if (op->inflight_count == 0)
|
||||||
if (op->inflight_count == 0)
|
{
|
||||||
{
|
if (op->opcode == OSD_OP_SYNC)
|
||||||
if (op->opcode == OSD_OP_SYNC)
|
continue_sync(op);
|
||||||
continue_sync(op);
|
else
|
||||||
else
|
continue_rw(op);
|
||||||
continue_rw(op);
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -112,8 +112,8 @@ public:
|
|||||||
osd_messenger_t msgr;
|
osd_messenger_t msgr;
|
||||||
void init_msgr();
|
void init_msgr();
|
||||||
|
|
||||||
json11::Json config;
|
json11::Json::object cli_config, file_config, etcd_global_config;
|
||||||
json11::Json::object merged_config;
|
json11::Json::object config;
|
||||||
|
|
||||||
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
|
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
|
||||||
~cluster_client_t();
|
~cluster_client_t();
|
||||||
|
@@ -281,7 +281,7 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
|
|||||||
if (je->big_write.size > sizeof(journal_entry_big_write))
|
if (je->big_write.size > sizeof(journal_entry_big_write))
|
||||||
{
|
{
|
||||||
printf(json ? ",\"bitmap\":\"" : " (bitmap: ");
|
printf(json ? ",\"bitmap\":\"" : " (bitmap: ");
|
||||||
for (int i = sizeof(journal_entry_big_write); i < je->small_write.size; i++)
|
for (int i = sizeof(journal_entry_big_write); i < je->big_write.size; i++)
|
||||||
{
|
{
|
||||||
printf("%02x", ((uint8_t*)je)[i]);
|
printf("%02x", ((uint8_t*)je)[i]);
|
||||||
}
|
}
|
||||||
|
@@ -26,7 +26,7 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v1_t *)>
|
|||||||
buf_size = dsk.meta_len;
|
buf_size = dsk.meta_len;
|
||||||
void *data = memalign_or_die(MEM_ALIGNMENT, buf_size);
|
void *data = memalign_or_die(MEM_ALIGNMENT, buf_size);
|
||||||
lseek64(dsk.meta_fd, dsk.meta_offset, 0);
|
lseek64(dsk.meta_fd, dsk.meta_offset, 0);
|
||||||
read_blocking(dsk.meta_fd, data, buf_size);
|
read_blocking(dsk.meta_fd, data, dsk.meta_block_size);
|
||||||
// Check superblock
|
// Check superblock
|
||||||
blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)data;
|
blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)data;
|
||||||
if (hdr->zero == 0 &&
|
if (hdr->zero == 0 &&
|
||||||
@@ -41,8 +41,11 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v1_t *)>
|
|||||||
if (buf_size % dsk.meta_block_size)
|
if (buf_size % dsk.meta_block_size)
|
||||||
{
|
{
|
||||||
buf_size = 8*dsk.meta_block_size;
|
buf_size = 8*dsk.meta_block_size;
|
||||||
|
void *new_data = memalign_or_die(MEM_ALIGNMENT, buf_size);
|
||||||
|
memcpy(new_data, data, dsk.meta_block_size);
|
||||||
free(data);
|
free(data);
|
||||||
data = memalign_or_die(MEM_ALIGNMENT, buf_size);
|
data = new_data;
|
||||||
|
hdr = (blockstore_meta_header_v1_t *)data;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
dsk.bitmap_granularity = hdr->bitmap_granularity;
|
dsk.bitmap_granularity = hdr->bitmap_granularity;
|
||||||
|
@@ -18,12 +18,8 @@ etcd_state_client_t::~etcd_state_client_t()
|
|||||||
}
|
}
|
||||||
watches.clear();
|
watches.clear();
|
||||||
etcd_watches_initialised = -1;
|
etcd_watches_initialised = -1;
|
||||||
if (ws_keepalive_timer >= 0)
|
|
||||||
{
|
|
||||||
tfd->clear_timer(ws_keepalive_timer);
|
|
||||||
ws_keepalive_timer = -1;
|
|
||||||
}
|
|
||||||
#ifndef __MOCK__
|
#ifndef __MOCK__
|
||||||
|
stop_ws_keepalive();
|
||||||
if (etcd_watch_ws)
|
if (etcd_watch_ws)
|
||||||
{
|
{
|
||||||
http_close(etcd_watch_ws);
|
http_close(etcd_watch_ws);
|
||||||
@@ -245,6 +241,7 @@ void etcd_state_client_t::parse_config(const json11::Json & config)
|
|||||||
if (this->etcd_keepalive_timeout < 30)
|
if (this->etcd_keepalive_timeout < 30)
|
||||||
this->etcd_keepalive_timeout = 30;
|
this->etcd_keepalive_timeout = 30;
|
||||||
}
|
}
|
||||||
|
auto old_etcd_ws_keepalive_interval = this->etcd_ws_keepalive_interval;
|
||||||
this->etcd_ws_keepalive_interval = config["etcd_ws_keepalive_interval"].uint64_value();
|
this->etcd_ws_keepalive_interval = config["etcd_ws_keepalive_interval"].uint64_value();
|
||||||
if (this->etcd_ws_keepalive_interval <= 0)
|
if (this->etcd_ws_keepalive_interval <= 0)
|
||||||
{
|
{
|
||||||
@@ -265,6 +262,13 @@ void etcd_state_client_t::parse_config(const json11::Json & config)
|
|||||||
{
|
{
|
||||||
this->etcd_quick_timeout = 1000;
|
this->etcd_quick_timeout = 1000;
|
||||||
}
|
}
|
||||||
|
if (this->etcd_ws_keepalive_interval != old_etcd_ws_keepalive_interval && ws_keepalive_timer >= 0)
|
||||||
|
{
|
||||||
|
#ifndef __MOCK__
|
||||||
|
stop_ws_keepalive();
|
||||||
|
start_ws_keepalive();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void etcd_state_client_t::pick_next_etcd()
|
void etcd_state_client_t::pick_next_etcd()
|
||||||
@@ -478,6 +482,20 @@ void etcd_state_client_t::start_etcd_watcher()
|
|||||||
{
|
{
|
||||||
on_start_watcher_hook(etcd_watch_ws);
|
on_start_watcher_hook(etcd_watch_ws);
|
||||||
}
|
}
|
||||||
|
start_ws_keepalive();
|
||||||
|
}
|
||||||
|
|
||||||
|
void etcd_state_client_t::stop_ws_keepalive()
|
||||||
|
{
|
||||||
|
if (ws_keepalive_timer >= 0)
|
||||||
|
{
|
||||||
|
tfd->clear_timer(ws_keepalive_timer);
|
||||||
|
ws_keepalive_timer = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void etcd_state_client_t::start_ws_keepalive()
|
||||||
|
{
|
||||||
if (ws_keepalive_timer < 0)
|
if (ws_keepalive_timer < 0)
|
||||||
{
|
{
|
||||||
ws_keepalive_timer = tfd->set_timer(etcd_ws_keepalive_interval*1000, true, [this](int)
|
ws_keepalive_timer = tfd->set_timer(etcd_ws_keepalive_interval*1000, true, [this](int)
|
||||||
|
@@ -132,6 +132,8 @@ public:
|
|||||||
void etcd_txn(json11::Json txn, int timeout, int retries, int interval, std::function<void(std::string, json11::Json)> callback);
|
void etcd_txn(json11::Json txn, int timeout, int retries, int interval, std::function<void(std::string, json11::Json)> callback);
|
||||||
void etcd_txn_slow(json11::Json txn, std::function<void(std::string, json11::Json)> callback);
|
void etcd_txn_slow(json11::Json txn, std::function<void(std::string, json11::Json)> callback);
|
||||||
void start_etcd_watcher();
|
void start_etcd_watcher();
|
||||||
|
void stop_ws_keepalive();
|
||||||
|
void start_ws_keepalive();
|
||||||
void load_global_config();
|
void load_global_config();
|
||||||
void load_pgs();
|
void load_pgs();
|
||||||
void parse_state(const etcd_kv_t & kv);
|
void parse_state(const etcd_kv_t & kv);
|
||||||
|
@@ -157,10 +157,10 @@ void osd_messenger_t::parse_config(const json11::Json & config)
|
|||||||
this->rdma_max_sge = 128;
|
this->rdma_max_sge = 128;
|
||||||
this->rdma_max_send = config["rdma_max_send"].uint64_value();
|
this->rdma_max_send = config["rdma_max_send"].uint64_value();
|
||||||
if (!this->rdma_max_send)
|
if (!this->rdma_max_send)
|
||||||
this->rdma_max_send = 64;
|
this->rdma_max_send = 8;
|
||||||
this->rdma_max_recv = config["rdma_max_recv"].uint64_value();
|
this->rdma_max_recv = config["rdma_max_recv"].uint64_value();
|
||||||
if (!this->rdma_max_recv)
|
if (!this->rdma_max_recv)
|
||||||
this->rdma_max_recv = 128;
|
this->rdma_max_recv = 16;
|
||||||
this->rdma_max_msg = config["rdma_max_msg"].uint64_value();
|
this->rdma_max_msg = config["rdma_max_msg"].uint64_value();
|
||||||
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
|
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
|
||||||
this->rdma_max_msg = 129*1024;
|
this->rdma_max_msg = 129*1024;
|
||||||
@@ -534,8 +534,9 @@ bool osd_messenger_t::is_rdma_enabled()
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
json11::Json osd_messenger_t::read_config(const json11::Json & config)
|
json11::Json::object osd_messenger_t::read_config(const json11::Json & config)
|
||||||
{
|
{
|
||||||
|
json11::Json::object file_config;
|
||||||
const char *config_path = config["config_path"].string_value() != ""
|
const char *config_path = config["config_path"].string_value() != ""
|
||||||
? config["config_path"].string_value().c_str() : VITASTOR_CONFIG_PATH;
|
? config["config_path"].string_value().c_str() : VITASTOR_CONFIG_PATH;
|
||||||
int fd = open(config_path, O_RDONLY);
|
int fd = open(config_path, O_RDONLY);
|
||||||
@@ -543,14 +544,14 @@ json11::Json osd_messenger_t::read_config(const json11::Json & config)
|
|||||||
{
|
{
|
||||||
if (errno != ENOENT)
|
if (errno != ENOENT)
|
||||||
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
|
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
|
||||||
return config;
|
return file_config;
|
||||||
}
|
}
|
||||||
struct stat st;
|
struct stat st;
|
||||||
if (fstat(fd, &st) != 0)
|
if (fstat(fd, &st) != 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
|
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
|
||||||
close(fd);
|
close(fd);
|
||||||
return config;
|
return file_config;
|
||||||
}
|
}
|
||||||
std::string buf;
|
std::string buf;
|
||||||
buf.resize(st.st_size);
|
buf.resize(st.st_size);
|
||||||
@@ -562,23 +563,125 @@ json11::Json osd_messenger_t::read_config(const json11::Json & config)
|
|||||||
{
|
{
|
||||||
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
|
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
|
||||||
close(fd);
|
close(fd);
|
||||||
return config;
|
return file_config;
|
||||||
}
|
}
|
||||||
done += r;
|
done += r;
|
||||||
}
|
}
|
||||||
close(fd);
|
close(fd);
|
||||||
std::string json_err;
|
std::string json_err;
|
||||||
json11::Json::object file_config = json11::Json::parse(buf, json_err).object_items();
|
file_config = json11::Json::parse(buf, json_err).object_items();
|
||||||
if (json_err != "")
|
if (json_err != "")
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Invalid JSON in %s: %s\n", config_path, json_err.c_str());
|
fprintf(stderr, "Invalid JSON in %s: %s\n", config_path, json_err.c_str());
|
||||||
return config;
|
|
||||||
}
|
|
||||||
file_config.erase("config_path");
|
|
||||||
file_config.erase("osd_num");
|
|
||||||
for (auto kv: config.object_items())
|
|
||||||
{
|
|
||||||
file_config[kv.first] = kv.second;
|
|
||||||
}
|
}
|
||||||
return file_config;
|
return file_config;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static const char* cli_only_params[] = {
|
||||||
|
// The list has to be sorted
|
||||||
|
"bitmap_granularity",
|
||||||
|
"block_size",
|
||||||
|
"data_device",
|
||||||
|
"data_offset",
|
||||||
|
"data_size",
|
||||||
|
"disable_data_fsync",
|
||||||
|
"disable_device_lock",
|
||||||
|
"disable_journal_fsync",
|
||||||
|
"disable_meta_fsync",
|
||||||
|
"disk_alignment",
|
||||||
|
"flush_journal",
|
||||||
|
"immediate_commit",
|
||||||
|
"inmemory_journal",
|
||||||
|
"inmemory_metadata",
|
||||||
|
"journal_block_size",
|
||||||
|
"journal_device",
|
||||||
|
"journal_no_same_sector_overwrites",
|
||||||
|
"journal_offset",
|
||||||
|
"journal_sector_buffer_count",
|
||||||
|
"journal_size",
|
||||||
|
"meta_block_size",
|
||||||
|
"meta_buf_size",
|
||||||
|
"meta_device",
|
||||||
|
"meta_offset",
|
||||||
|
"osd_num",
|
||||||
|
"readonly",
|
||||||
|
};
|
||||||
|
|
||||||
|
static const char **cli_only_end = cli_only_params + (sizeof(cli_only_params)/sizeof(cli_only_params[0]));
|
||||||
|
|
||||||
|
static const char* local_only_params[] = {
|
||||||
|
// The list has to be sorted
|
||||||
|
"config_path",
|
||||||
|
"rdma_device",
|
||||||
|
"rdma_gid_index",
|
||||||
|
"rdma_max_msg",
|
||||||
|
"rdma_max_recv",
|
||||||
|
"rdma_max_send",
|
||||||
|
"rdma_max_sge",
|
||||||
|
"rdma_mtu",
|
||||||
|
"rdma_port_num",
|
||||||
|
"tcp_header_buffer_size",
|
||||||
|
"use_rdma",
|
||||||
|
"use_sync_send_recv",
|
||||||
|
};
|
||||||
|
|
||||||
|
static const char **local_only_end = local_only_params + (sizeof(local_only_params)/sizeof(local_only_params[0]));
|
||||||
|
|
||||||
|
// Basically could be replaced by std::lower_bound()...
|
||||||
|
// Binary search for string <s> in the array of C strings [start, end).
// The array must be sorted in strcmp() order.
// Returns the index of the matching element or -1 if there is no match.
static int find_str_array(const char **start, const char **end, const std::string & s)
{
    const int count = end-start;
    const char *needle = s.c_str();
    int lo = 0, hi = count;
    // Narrow the window until at most one candidate (index lo) remains
    while (hi-lo > 1)
    {
        const int mid = (lo+hi)/2;
        const int cmp = strcmp(needle, start[mid]);
        if (cmp == 0)
            return mid;
        if (cmp < 0)
            hi = mid;
        else
            lo = mid;
    }
    // lo < count also protects against an empty array
    if (lo < count && strcmp(needle, start[lo]) == 0)
        return lo;
    return -1;
}
|
||||||
|
|
||||||
|
// Merge configuration coming from 4 sources into a single object.
//
// Priority, from the most important to the least important:
//   etcd_osd_config -> cli_config -> etcd_global_config -> file_config
//
// Filtering rules:
// - keys listed in cli_only_params are not taken from the config file;
// - keys listed in local_only_params are not taken from etcd
//   (neither from the global nor from the per-OSD configuration);
// - command-line options are taken as is.
json11::Json::object osd_messenger_t::merge_configs(const json11::Json::object & cli_config,
    const json11::Json::object & file_config,
    const json11::Json::object & etcd_global_config,
    const json11::Json::object & etcd_osd_config)
{
    // Start from an empty object. Starting from a full copy of file_config would
    // make the CLI-only filter below a no-op and would let parameters like osd_num
    // leak in from the config file.
    json11::Json::object res;
    for (auto & kv: file_config)
    {
        if (find_str_array(cli_only_params, cli_only_end, kv.first) < 0)
        {
            res[kv.first] = kv.second;
        }
    }
    for (auto & kv: etcd_global_config)
    {
        if (find_str_array(local_only_params, local_only_end, kv.first) < 0)
        {
            res[kv.first] = kv.second;
        }
    }
    // Command-line options are never filtered
    for (auto & kv: cli_config)
    {
        res[kv.first] = kv.second;
    }
    for (auto & kv: etcd_osd_config)
    {
        if (find_str_array(local_only_params, local_only_end, kv.first) < 0)
        {
            res[kv.first] = kv.second;
        }
    }
    return res;
}
|
||||||
|
@@ -166,7 +166,11 @@ public:
|
|||||||
void accept_connections(int listen_fd);
|
void accept_connections(int listen_fd);
|
||||||
~osd_messenger_t();
|
~osd_messenger_t();
|
||||||
|
|
||||||
static json11::Json read_config(const json11::Json & config);
|
static json11::Json::object read_config(const json11::Json & config);
|
||||||
|
static json11::Json::object merge_configs(const json11::Json::object & cli_config,
|
||||||
|
const json11::Json::object & file_config,
|
||||||
|
const json11::Json::object & etcd_global_config,
|
||||||
|
const json11::Json::object & etcd_osd_config);
|
||||||
|
|
||||||
#ifdef WITH_RDMA
|
#ifdef WITH_RDMA
|
||||||
bool is_rdma_enabled();
|
bool is_rdma_enabled();
|
||||||
|
@@ -43,7 +43,15 @@ void osd_messenger_t::send_replies()
|
|||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
json11::Json osd_messenger_t::read_config(const json11::Json & config)
|
json11::Json::object osd_messenger_t::read_config(const json11::Json & config)
|
||||||
{
|
{
|
||||||
return config;
|
return json11::Json::object();
|
||||||
|
}
|
||||||
|
|
||||||
|
json11::Json::object osd_messenger_t::merge_configs(const json11::Json::object & cli_config,
|
||||||
|
const json11::Json::object & file_config,
|
||||||
|
const json11::Json::object & etcd_global_config,
|
||||||
|
const json11::Json::object & etcd_osd_config)
|
||||||
|
{
|
||||||
|
return cli_config;
|
||||||
}
|
}
|
||||||
|
@@ -39,6 +39,11 @@ struct __attribute__((__packed__)) obj_ver_id
|
|||||||
uint64_t version;
|
uint64_t version;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
inline bool operator == (const obj_ver_id & a, const obj_ver_id & b)
|
||||||
|
{
|
||||||
|
return a.oid == b.oid && a.version == b.version;
|
||||||
|
}
|
||||||
|
|
||||||
inline bool operator < (const obj_ver_id & a, const obj_ver_id & b)
|
inline bool operator < (const obj_ver_id & a, const obj_ver_id & b)
|
||||||
{
|
{
|
||||||
return a.oid < b.oid || a.oid == b.oid && a.version < b.version;
|
return a.oid < b.oid || a.oid == b.oid && a.version < b.version;
|
||||||
|
110
src/osd.cpp
110
src/osd.cpp
@@ -35,18 +35,18 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
|
|||||||
|
|
||||||
this->ringloop = ringloop;
|
this->ringloop = ringloop;
|
||||||
|
|
||||||
this->config = msgr.read_config(config).object_items();
|
this->cli_config = config.object_items();
|
||||||
if (this->config.find("log_level") == this->config.end())
|
this->file_config = msgr.read_config(this->cli_config);
|
||||||
this->config["log_level"] = 1;
|
parse_config(true);
|
||||||
parse_config(this->config, true);
|
|
||||||
|
|
||||||
epmgr = new epoll_manager_t(ringloop);
|
epmgr = new epoll_manager_t(ringloop);
|
||||||
// FIXME: Use timerfd_interval based directly on io_uring
|
// FIXME: Use timerfd_interval based directly on io_uring
|
||||||
this->tfd = epmgr->tfd;
|
this->tfd = epmgr->tfd;
|
||||||
|
|
||||||
auto bs_cfg = json_to_bs(this->config);
|
if (!json_is_true(this->config["disable_blockstore"]))
|
||||||
this->bs = new blockstore_t(bs_cfg, ringloop, tfd);
|
|
||||||
{
|
{
|
||||||
|
auto bs_cfg = json_to_bs(this->config);
|
||||||
|
this->bs = new blockstore_t(bs_cfg, ringloop, tfd);
|
||||||
// Autosync based on the number of unstable writes to prevent stalls due to insufficient journal space
|
// Autosync based on the number of unstable writes to prevent stalls due to insufficient journal space
|
||||||
uint64_t max_autosync = bs->get_journal_size() / bs->get_block_size() / 2;
|
uint64_t max_autosync = bs->get_journal_size() / bs->get_block_size() / 2;
|
||||||
if (autosync_writes > max_autosync)
|
if (autosync_writes > max_autosync)
|
||||||
@@ -67,11 +67,11 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
|
print_stats_timer_id = this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
|
||||||
{
|
{
|
||||||
print_stats();
|
print_stats();
|
||||||
});
|
});
|
||||||
this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
|
slow_log_timer_id = this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
|
||||||
{
|
{
|
||||||
print_slow();
|
print_slow();
|
||||||
});
|
});
|
||||||
@@ -91,18 +91,42 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
|
|||||||
|
|
||||||
osd_t::~osd_t()
|
osd_t::~osd_t()
|
||||||
{
|
{
|
||||||
|
if (slow_log_timer_id >= 0)
|
||||||
|
{
|
||||||
|
tfd->clear_timer(slow_log_timer_id);
|
||||||
|
slow_log_timer_id = -1;
|
||||||
|
}
|
||||||
|
if (print_stats_timer_id >= 0)
|
||||||
|
{
|
||||||
|
tfd->clear_timer(print_stats_timer_id);
|
||||||
|
print_stats_timer_id = -1;
|
||||||
|
}
|
||||||
|
if (autosync_timer_id >= 0)
|
||||||
|
{
|
||||||
|
tfd->clear_timer(autosync_timer_id);
|
||||||
|
autosync_timer_id = -1;
|
||||||
|
}
|
||||||
ringloop->unregister_consumer(&consumer);
|
ringloop->unregister_consumer(&consumer);
|
||||||
delete epmgr;
|
delete epmgr;
|
||||||
delete bs;
|
if (bs)
|
||||||
|
delete bs;
|
||||||
close(listen_fd);
|
close(listen_fd);
|
||||||
free(zero_buffer);
|
free(zero_buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::parse_config(const json11::Json & config, bool allow_disk_params)
|
void osd_t::parse_config(bool init)
|
||||||
{
|
{
|
||||||
|
config = msgr.merge_configs(cli_config, file_config, etcd_global_config, etcd_osd_config);
|
||||||
|
if (config.find("log_level") == this->config.end())
|
||||||
|
config["log_level"] = 1;
|
||||||
|
if (bs)
|
||||||
|
{
|
||||||
|
auto bs_cfg = json_to_bs(config);
|
||||||
|
bs->parse_config(bs_cfg);
|
||||||
|
}
|
||||||
st_cli.parse_config(config);
|
st_cli.parse_config(config);
|
||||||
msgr.parse_config(config);
|
msgr.parse_config(config);
|
||||||
if (allow_disk_params)
|
if (init)
|
||||||
{
|
{
|
||||||
// OSD number
|
// OSD number
|
||||||
osd_num = config["osd_num"].uint64_value();
|
osd_num = config["osd_num"].uint64_value();
|
||||||
@@ -124,24 +148,27 @@ void osd_t::parse_config(const json11::Json & config, bool allow_disk_params)
|
|||||||
immediate_commit = IMMEDIATE_SMALL;
|
immediate_commit = IMMEDIATE_SMALL;
|
||||||
else
|
else
|
||||||
immediate_commit = IMMEDIATE_NONE;
|
immediate_commit = IMMEDIATE_NONE;
|
||||||
|
// Bind address
|
||||||
|
bind_address = config["bind_address"].string_value();
|
||||||
|
if (bind_address == "")
|
||||||
|
bind_address = "0.0.0.0";
|
||||||
|
bind_port = config["bind_port"].uint64_value();
|
||||||
|
if (bind_port <= 0 || bind_port > 65535)
|
||||||
|
bind_port = 0;
|
||||||
|
// OSD configuration
|
||||||
|
etcd_report_interval = config["etcd_report_interval"].uint64_value();
|
||||||
|
if (etcd_report_interval <= 0)
|
||||||
|
etcd_report_interval = 5;
|
||||||
|
readonly = json_is_true(config["readonly"]);
|
||||||
|
run_primary = !json_is_false(config["run_primary"]);
|
||||||
|
allow_test_ops = json_is_true(config["allow_test_ops"]);
|
||||||
}
|
}
|
||||||
// Bind address
|
|
||||||
bind_address = config["bind_address"].string_value();
|
|
||||||
if (bind_address == "")
|
|
||||||
bind_address = "0.0.0.0";
|
|
||||||
bind_port = config["bind_port"].uint64_value();
|
|
||||||
if (bind_port <= 0 || bind_port > 65535)
|
|
||||||
bind_port = 0;
|
|
||||||
// OSD configuration
|
|
||||||
log_level = config["log_level"].uint64_value();
|
log_level = config["log_level"].uint64_value();
|
||||||
etcd_report_interval = config["etcd_report_interval"].uint64_value();
|
auto old_no_rebalance = no_rebalance;
|
||||||
if (etcd_report_interval <= 0)
|
|
||||||
etcd_report_interval = 5;
|
|
||||||
readonly = json_is_true(config["readonly"]);
|
|
||||||
run_primary = !json_is_false(config["run_primary"]);
|
|
||||||
no_rebalance = json_is_true(config["no_rebalance"]);
|
no_rebalance = json_is_true(config["no_rebalance"]);
|
||||||
|
auto old_no_recovery = no_recovery;
|
||||||
no_recovery = json_is_true(config["no_recovery"]);
|
no_recovery = json_is_true(config["no_recovery"]);
|
||||||
allow_test_ops = json_is_true(config["allow_test_ops"]);
|
auto old_autosync_interval = autosync_interval;
|
||||||
if (!config["autosync_interval"].is_null())
|
if (!config["autosync_interval"].is_null())
|
||||||
{
|
{
|
||||||
// Allow to set it to 0
|
// Allow to set it to 0
|
||||||
@@ -169,15 +196,46 @@ void osd_t::parse_config(const json11::Json & config, bool allow_disk_params)
|
|||||||
recovery_sync_batch = config["recovery_sync_batch"].uint64_value();
|
recovery_sync_batch = config["recovery_sync_batch"].uint64_value();
|
||||||
if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
|
if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
|
||||||
recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
||||||
|
auto old_print_stats_interval = print_stats_interval;
|
||||||
print_stats_interval = config["print_stats_interval"].uint64_value();
|
print_stats_interval = config["print_stats_interval"].uint64_value();
|
||||||
if (!print_stats_interval)
|
if (!print_stats_interval)
|
||||||
print_stats_interval = 3;
|
print_stats_interval = 3;
|
||||||
|
auto old_slow_log_interval = slow_log_interval;
|
||||||
slow_log_interval = config["slow_log_interval"].uint64_value();
|
slow_log_interval = config["slow_log_interval"].uint64_value();
|
||||||
if (!slow_log_interval)
|
if (!slow_log_interval)
|
||||||
slow_log_interval = 10;
|
slow_log_interval = 10;
|
||||||
inode_vanish_time = config["inode_vanish_time"].uint64_value();
|
inode_vanish_time = config["inode_vanish_time"].uint64_value();
|
||||||
if (!inode_vanish_time)
|
if (!inode_vanish_time)
|
||||||
inode_vanish_time = 60;
|
inode_vanish_time = 60;
|
||||||
|
if ((old_no_rebalance && !no_rebalance || old_no_recovery && !no_recovery) &&
|
||||||
|
!(peering_state & (OSD_RECOVERING | OSD_FLUSHING_PGS)))
|
||||||
|
{
|
||||||
|
peering_state = peering_state | OSD_RECOVERING;
|
||||||
|
}
|
||||||
|
if (old_autosync_interval != autosync_interval && autosync_timer_id >= 0)
|
||||||
|
{
|
||||||
|
this->tfd->clear_timer(autosync_timer_id);
|
||||||
|
autosync_timer_id = this->tfd->set_timer(autosync_interval*1000, true, [this](int timer_id)
|
||||||
|
{
|
||||||
|
autosync();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if (old_print_stats_interval != print_stats_interval && print_stats_timer_id >= 0)
|
||||||
|
{
|
||||||
|
tfd->clear_timer(print_stats_timer_id);
|
||||||
|
print_stats_timer_id = this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
|
||||||
|
{
|
||||||
|
print_stats();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if (old_slow_log_interval != slow_log_interval && slow_log_timer_id >= 0)
|
||||||
|
{
|
||||||
|
tfd->clear_timer(slow_log_timer_id);
|
||||||
|
slow_log_timer_id = this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
|
||||||
|
{
|
||||||
|
print_slow();
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::bind_socket()
|
void osd_t::bind_socket()
|
||||||
@@ -475,7 +533,7 @@ void osd_t::print_slow()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (has_slow)
|
if (has_slow && bs)
|
||||||
{
|
{
|
||||||
bs->dump_diagnostics();
|
bs->dump_diagnostics();
|
||||||
}
|
}
|
||||||
|
@@ -90,7 +90,7 @@ class osd_t
|
|||||||
{
|
{
|
||||||
// config
|
// config
|
||||||
|
|
||||||
json11::Json::object config;
|
json11::Json::object cli_config, file_config, etcd_global_config, etcd_osd_config, config;
|
||||||
int etcd_report_interval = 5;
|
int etcd_report_interval = 5;
|
||||||
|
|
||||||
bool readonly = false;
|
bool readonly = false;
|
||||||
@@ -126,6 +126,7 @@ class osd_t
|
|||||||
bool pg_config_applied = false;
|
bool pg_config_applied = false;
|
||||||
bool etcd_reporting_pg_state = false;
|
bool etcd_reporting_pg_state = false;
|
||||||
bool etcd_reporting_stats = false;
|
bool etcd_reporting_stats = false;
|
||||||
|
int autosync_timer_id = -1, print_stats_timer_id = -1, slow_log_timer_id = -1;
|
||||||
|
|
||||||
// peers and PGs
|
// peers and PGs
|
||||||
|
|
||||||
@@ -152,7 +153,7 @@ class osd_t
|
|||||||
|
|
||||||
bool stopping = false;
|
bool stopping = false;
|
||||||
int inflight_ops = 0;
|
int inflight_ops = 0;
|
||||||
blockstore_t *bs;
|
blockstore_t *bs = NULL;
|
||||||
void *zero_buffer = NULL;
|
void *zero_buffer = NULL;
|
||||||
uint64_t zero_buffer_size = 0;
|
uint64_t zero_buffer_size = 0;
|
||||||
uint32_t bs_block_size, bs_bitmap_granularity, clean_entry_bitmap_size;
|
uint32_t bs_block_size, bs_bitmap_granularity, clean_entry_bitmap_size;
|
||||||
@@ -173,7 +174,7 @@ class osd_t
|
|||||||
uint64_t recovery_stat_bytes[2][2] = {};
|
uint64_t recovery_stat_bytes[2][2] = {};
|
||||||
|
|
||||||
// cluster connection
|
// cluster connection
|
||||||
void parse_config(const json11::Json & config, bool allow_disk_params);
|
void parse_config(bool init);
|
||||||
void init_cluster();
|
void init_cluster();
|
||||||
void on_change_osd_state_hook(osd_num_t peer_osd);
|
void on_change_osd_state_hook(osd_num_t peer_osd);
|
||||||
void on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num);
|
void on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num);
|
||||||
|
@@ -75,7 +75,7 @@ void osd_t::init_cluster()
|
|||||||
}
|
}
|
||||||
if (run_primary && autosync_interval > 0)
|
if (run_primary && autosync_interval > 0)
|
||||||
{
|
{
|
||||||
this->tfd->set_timer(autosync_interval*1000, true, [this](int timer_id)
|
autosync_timer_id = this->tfd->set_timer(autosync_interval*1000, true, [this](int timer_id)
|
||||||
{
|
{
|
||||||
autosync();
|
autosync();
|
||||||
});
|
});
|
||||||
@@ -182,10 +182,10 @@ json11::Json osd_t::get_statistics()
|
|||||||
char time_str[50] = { 0 };
|
char time_str[50] = { 0 };
|
||||||
sprintf(time_str, "%ld.%03ld", ts.tv_sec, ts.tv_nsec/1000000);
|
sprintf(time_str, "%ld.%03ld", ts.tv_sec, ts.tv_nsec/1000000);
|
||||||
st["time"] = time_str;
|
st["time"] = time_str;
|
||||||
st["blockstore_ready"] = bs->is_started();
|
|
||||||
st["data_block_size"] = (uint64_t)bs->get_block_size();
|
|
||||||
if (bs)
|
if (bs)
|
||||||
{
|
{
|
||||||
|
st["blockstore_ready"] = bs->is_started();
|
||||||
|
st["data_block_size"] = (uint64_t)bs->get_block_size();
|
||||||
st["size"] = bs->get_block_count() * bs->get_block_size();
|
st["size"] = bs->get_block_count() * bs->get_block_size();
|
||||||
st["free"] = bs->get_free_block_count() * bs->get_block_size();
|
st["free"] = bs->get_free_block_count() * bs->get_block_size();
|
||||||
}
|
}
|
||||||
@@ -233,7 +233,8 @@ void osd_t::report_statistics()
|
|||||||
json11::Json::object inode_space;
|
json11::Json::object inode_space;
|
||||||
json11::Json::object last_stat;
|
json11::Json::object last_stat;
|
||||||
pool_id_t last_pool = 0;
|
pool_id_t last_pool = 0;
|
||||||
auto & bs_inode_space = bs->get_inode_space_stats();
|
std::map<uint64_t, uint64_t> bs_empty_space;
|
||||||
|
auto & bs_inode_space = bs ? bs->get_inode_space_stats() : bs_empty_space;
|
||||||
for (auto kv: bs_inode_space)
|
for (auto kv: bs_inode_space)
|
||||||
{
|
{
|
||||||
pool_id_t pool_id = INODE_POOL(kv.first);
|
pool_id_t pool_id = INODE_POOL(kv.first);
|
||||||
@@ -374,7 +375,11 @@ void osd_t::on_change_osd_state_hook(osd_num_t peer_osd)
|
|||||||
|
|
||||||
void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes)
|
void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes)
|
||||||
{
|
{
|
||||||
// FIXME apply config changes in runtime (maybe, some)
|
if (changes.find(st_cli.etcd_prefix+"/config/global") != changes.end())
|
||||||
|
{
|
||||||
|
etcd_global_config = changes[st_cli.etcd_prefix+"/config/global"].value.object_items();
|
||||||
|
parse_config(false);
|
||||||
|
}
|
||||||
if (run_primary)
|
if (run_primary)
|
||||||
{
|
{
|
||||||
apply_pg_count();
|
apply_pg_count();
|
||||||
@@ -384,11 +389,8 @@ void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes
|
|||||||
|
|
||||||
void osd_t::on_load_config_hook(json11::Json::object & global_config)
|
void osd_t::on_load_config_hook(json11::Json::object & global_config)
|
||||||
{
|
{
|
||||||
json11::Json::object osd_config = this->config;
|
etcd_global_config = global_config;
|
||||||
for (auto & kv: global_config)
|
parse_config(true);
|
||||||
if (osd_config.find(kv.first) == osd_config.end())
|
|
||||||
osd_config[kv.first] = kv.second;
|
|
||||||
parse_config(osd_config, false);
|
|
||||||
bind_socket();
|
bind_socket();
|
||||||
acquire_lease();
|
acquire_lease();
|
||||||
}
|
}
|
||||||
|
@@ -64,6 +64,11 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
|
|||||||
|
|
||||||
void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval)
|
void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval)
|
||||||
{
|
{
|
||||||
|
if (log_level > 2)
|
||||||
|
{
|
||||||
|
printf("[PG %u/%u] flush batch %lx completed on OSD %lu with result %d\n",
|
||||||
|
pool_id, pg_num, (uint64_t)fb, peer_osd, retval);
|
||||||
|
}
|
||||||
pool_pg_num_t pg_id = { .pool_id = pool_id, .pg_num = pg_num };
|
pool_pg_num_t pg_id = { .pool_id = pool_id, .pg_num = pg_num };
|
||||||
if (pgs.find(pg_id) == pgs.end() || pgs[pg_id].flush_batch != fb)
|
if (pgs.find(pg_id) == pgs.end() || pgs[pg_id].flush_batch != fb)
|
||||||
{
|
{
|
||||||
@@ -99,10 +104,9 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
|
|||||||
std::vector<osd_op_t*> continue_ops;
|
std::vector<osd_op_t*> continue_ops;
|
||||||
auto & pg = pgs.at(pg_id);
|
auto & pg = pgs.at(pg_id);
|
||||||
auto it = pg.flush_actions.begin(), prev_it = it;
|
auto it = pg.flush_actions.begin(), prev_it = it;
|
||||||
auto erase_start = it;
|
|
||||||
while (1)
|
while (1)
|
||||||
{
|
{
|
||||||
if (it == pg.flush_actions.end() ||
|
if (it == pg.flush_actions.end() || !it->second.submitted ||
|
||||||
it->first.oid.inode != prev_it->first.oid.inode ||
|
it->first.oid.inode != prev_it->first.oid.inode ||
|
||||||
(it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK))
|
(it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK))
|
||||||
{
|
{
|
||||||
@@ -116,29 +120,23 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
|
|||||||
});
|
});
|
||||||
if (wr_it != pg.write_queue.end())
|
if (wr_it != pg.write_queue.end())
|
||||||
{
|
{
|
||||||
|
if (log_level > 2)
|
||||||
|
{
|
||||||
|
printf("[PG %u/%u] continuing write %lx to object %lx:%lx after flush\n",
|
||||||
|
pool_id, pg_num, (uint64_t)wr_it->second, wr_it->first.inode, wr_it->first.stripe);
|
||||||
|
}
|
||||||
continue_ops.push_back(wr_it->second);
|
continue_ops.push_back(wr_it->second);
|
||||||
pg.write_queue.erase(wr_it);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if ((it == pg.flush_actions.end() || !it->second.submitted) &&
|
if (it == pg.flush_actions.end() || !it->second.submitted)
|
||||||
erase_start != it)
|
|
||||||
{
|
|
||||||
pg.flush_actions.erase(erase_start, it);
|
|
||||||
}
|
|
||||||
if (it == pg.flush_actions.end())
|
|
||||||
{
|
{
|
||||||
|
if (it != pg.flush_actions.begin())
|
||||||
|
{
|
||||||
|
pg.flush_actions.erase(pg.flush_actions.begin(), it);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
prev_it = it;
|
prev_it = it++;
|
||||||
if (!it->second.submitted)
|
|
||||||
{
|
|
||||||
it++;
|
|
||||||
erase_start = it;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
it++;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
delete fb;
|
delete fb;
|
||||||
pg.flush_batch = NULL;
|
pg.flush_batch = NULL;
|
||||||
@@ -168,6 +166,18 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
|||||||
// Copy buffer so it gets freed along with the operation
|
// Copy buffer so it gets freed along with the operation
|
||||||
op->buf = malloc_or_die(sizeof(obj_ver_id) * count);
|
op->buf = malloc_or_die(sizeof(obj_ver_id) * count);
|
||||||
memcpy(op->buf, data, sizeof(obj_ver_id) * count);
|
memcpy(op->buf, data, sizeof(obj_ver_id) * count);
|
||||||
|
if (log_level > 2)
|
||||||
|
{
|
||||||
|
printf(
|
||||||
|
"[PG %u/%u] flush batch %lx on OSD %lu: %s objects: ",
|
||||||
|
pool_id, pg_num, (uint64_t)fb, peer_osd, rollback ? "rollback" : "stabilize"
|
||||||
|
);
|
||||||
|
for (int i = 0; i < count; i++)
|
||||||
|
{
|
||||||
|
printf(i > 0 ? ", %lx:%lx v%lu" : "%lx:%lx v%lu", data[i].oid.inode, data[i].oid.stripe, data[i].version);
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
if (peer_osd == this->osd_num)
|
if (peer_osd == this->osd_num)
|
||||||
{
|
{
|
||||||
// local
|
// local
|
||||||
@@ -304,9 +314,10 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
|
|||||||
{
|
{
|
||||||
// PG is stopped or one of the OSDs is gone, error is harmless
|
// PG is stopped or one of the OSDs is gone, error is harmless
|
||||||
printf(
|
printf(
|
||||||
"Recovery operation failed with object %lx:%lx (PG %u/%u)\n",
|
"[PG %u/%u] Recovery operation failed with object %lx:%lx\n",
|
||||||
op->oid.inode, op->oid.stripe, INODE_POOL(op->oid.inode),
|
INODE_POOL(op->oid.inode),
|
||||||
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size)
|
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size),
|
||||||
|
op->oid.inode, op->oid.stripe
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@@ -76,7 +76,7 @@ void osd_t::handle_peers()
|
|||||||
peering_state = peering_state & ~OSD_FLUSHING_PGS | OSD_RECOVERING;
|
peering_state = peering_state & ~OSD_FLUSHING_PGS | OSD_RECOVERING;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if ((peering_state & OSD_RECOVERING) && !readonly)
|
if (!(peering_state & OSD_FLUSHING_PGS) && (peering_state & OSD_RECOVERING) && !readonly)
|
||||||
{
|
{
|
||||||
if (!continue_recovery())
|
if (!continue_recovery())
|
||||||
{
|
{
|
||||||
|
@@ -91,7 +91,7 @@ void pg_obj_state_check_t::walk()
|
|||||||
pg->state |= PG_DEGRADED;
|
pg->state |= PG_DEGRADED;
|
||||||
}
|
}
|
||||||
pg->state |= PG_ACTIVE;
|
pg->state |= PG_ACTIVE;
|
||||||
if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size())
|
if (pg->cur_peers.size() < pg->all_peers.size())
|
||||||
{
|
{
|
||||||
pg->state |= PG_LEFT_ON_DEAD;
|
pg->state |= PG_LEFT_ON_DEAD;
|
||||||
}
|
}
|
||||||
|
@@ -53,7 +53,10 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
|
|||||||
inode_stats[cur_op->req.rw.inode].op_count[inode_st_op]++;
|
inode_stats[cur_op->req.rw.inode].op_count[inode_st_op]++;
|
||||||
inode_stats[cur_op->req.rw.inode].op_sum[inode_st_op] += usec;
|
inode_stats[cur_op->req.rw.inode].op_sum[inode_st_op] += usec;
|
||||||
if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
|
if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
|
||||||
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->op_data->pg_data_size * bs_block_size;
|
{
|
||||||
|
if (cur_op->op_data)
|
||||||
|
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->op_data->pg_data_size * bs_block_size;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->req.rw.len;
|
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->req.rw.len;
|
||||||
}
|
}
|
||||||
|
@@ -166,7 +166,7 @@ resume_6:
|
|||||||
for (int i = 0; i < unstable_osd.len; i++)
|
for (int i = 0; i < unstable_osd.len; i++)
|
||||||
{
|
{
|
||||||
// Except those from peered PGs
|
// Except those from peered PGs
|
||||||
auto & w = op_data->unstable_writes[i];
|
auto & w = op_data->unstable_writes[unstable_osd.start + i];
|
||||||
pool_pg_num_t wpg = {
|
pool_pg_num_t wpg = {
|
||||||
.pool_id = INODE_POOL(w.oid.inode),
|
.pool_id = INODE_POOL(w.oid.inode),
|
||||||
.pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
|
.pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
|
||||||
|
@@ -12,6 +12,7 @@ bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
|
|||||||
.oid = op_data->oid,
|
.oid = op_data->oid,
|
||||||
.osd_num = 0,
|
.osd_num = 0,
|
||||||
});
|
});
|
||||||
|
op_data->st = 1;
|
||||||
if (act_it != pg.flush_actions.end() &&
|
if (act_it != pg.flush_actions.end() &&
|
||||||
act_it->first.oid.inode == op_data->oid.inode &&
|
act_it->first.oid.inode == op_data->oid.inode &&
|
||||||
(act_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe)
|
(act_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe)
|
||||||
@@ -23,7 +24,6 @@ bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
|
|||||||
auto vo_it = pg.write_queue.find(op_data->oid);
|
auto vo_it = pg.write_queue.find(op_data->oid);
|
||||||
if (vo_it != pg.write_queue.end())
|
if (vo_it != pg.write_queue.end())
|
||||||
{
|
{
|
||||||
op_data->st = 1;
|
|
||||||
pg.write_queue.emplace(op_data->oid, cur_op);
|
pg.write_queue.emplace(op_data->oid, cur_op);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@@ -142,11 +142,11 @@ inline bool operator < (const reed_sol_erased_t &a, const reed_sol_erased_t &b)
|
|||||||
for (int i = 0; i < a.size && i < b.size; i++)
|
for (int i = 0; i < a.size && i < b.size; i++)
|
||||||
{
|
{
|
||||||
if (a.data[i] < b.data[i])
|
if (a.data[i] < b.data[i])
|
||||||
return -1;
|
return true;
|
||||||
else if (a.data[i] > b.data[i])
|
else if (a.data[i] > b.data[i])
|
||||||
return 1;
|
return false;
|
||||||
}
|
}
|
||||||
return 0;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct reed_sol_matrix_t
|
struct reed_sol_matrix_t
|
||||||
@@ -677,11 +677,11 @@ void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_
|
|||||||
static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, uint32_t wr_end, buf_len_t *bufs, int & nbufs)
|
static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, uint32_t wr_end, buf_len_t *bufs, int & nbufs)
|
||||||
{
|
{
|
||||||
uint32_t ns = 0, ne = 0, os = 0, oe = 0;
|
uint32_t ns = 0, ne = 0, os = 0, oe = 0;
|
||||||
if (stripe.req_end > wr_start &&
|
if (stripe.write_end > wr_start &&
|
||||||
stripe.req_start < wr_end)
|
stripe.write_start < wr_end)
|
||||||
{
|
{
|
||||||
ns = std::max(stripe.req_start, wr_start);
|
ns = std::max(stripe.write_start, wr_start);
|
||||||
ne = std::min(stripe.req_end, wr_end);
|
ne = std::min(stripe.write_end, wr_end);
|
||||||
}
|
}
|
||||||
if (stripe.read_end > wr_start &&
|
if (stripe.read_end > wr_start &&
|
||||||
stripe.read_start < wr_end)
|
stripe.read_start < wr_end)
|
||||||
@@ -692,7 +692,7 @@ static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, ui
|
|||||||
if (ne && (!oe || ns <= os))
|
if (ne && (!oe || ns <= os))
|
||||||
{
|
{
|
||||||
// NEW or NEW->OLD
|
// NEW or NEW->OLD
|
||||||
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.req_start, .len = ne-ns };
|
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.write_start, .len = ne-ns };
|
||||||
if (os < ne)
|
if (os < ne)
|
||||||
os = ne;
|
os = ne;
|
||||||
if (oe > os)
|
if (oe > os)
|
||||||
@@ -708,7 +708,7 @@ static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, ui
|
|||||||
{
|
{
|
||||||
// OLD->NEW or OLD->NEW->OLD
|
// OLD->NEW or OLD->NEW->OLD
|
||||||
bufs[nbufs++] = { .buf = (uint8_t*)stripe.read_buf + os - stripe.read_start, .len = ns-os };
|
bufs[nbufs++] = { .buf = (uint8_t*)stripe.read_buf + os - stripe.read_start, .len = ns-os };
|
||||||
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.req_start, .len = ne-ns };
|
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.write_start, .len = ne-ns };
|
||||||
if (oe > ne)
|
if (oe > ne)
|
||||||
{
|
{
|
||||||
// OLD->NEW->OLD
|
// OLD->NEW->OLD
|
||||||
@@ -759,7 +759,18 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
|
|||||||
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_granularity,
|
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_granularity,
|
||||||
uint32_t &start, uint32_t &end)
|
uint32_t &start, uint32_t &end)
|
||||||
{
|
{
|
||||||
if (write_osd_set[pg_minsize] != 0 || write_osd_set != read_osd_set)
|
bool required = false;
|
||||||
|
for (int role = pg_minsize; role < pg_size; role++)
|
||||||
|
{
|
||||||
|
if (write_osd_set[role] != 0)
|
||||||
|
{
|
||||||
|
// Whole parity chunk is needed when we move the object
|
||||||
|
if (write_osd_set[role] != read_osd_set[role])
|
||||||
|
end = chunk_size;
|
||||||
|
required = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (required && end != chunk_size)
|
||||||
{
|
{
|
||||||
// start & end are required for calc_rmw_parity
|
// start & end are required for calc_rmw_parity
|
||||||
for (int role = 0; role < pg_minsize; role++)
|
for (int role = 0; role < pg_minsize; role++)
|
||||||
@@ -770,14 +781,6 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
|
|||||||
end = std::max(stripes[role].req_end, end);
|
end = std::max(stripes[role].req_end, end);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (int role = pg_minsize; role < pg_size; role++)
|
|
||||||
{
|
|
||||||
if (write_osd_set[role] != 0 && write_osd_set[role] != read_osd_set[role])
|
|
||||||
{
|
|
||||||
start = 0;
|
|
||||||
end = chunk_size;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
// Set bitmap bits accordingly
|
// Set bitmap bits accordingly
|
||||||
if (bitmap_granularity > 0)
|
if (bitmap_granularity > 0)
|
||||||
|
@@ -17,6 +17,7 @@ void test4();
|
|||||||
void test5();
|
void test5();
|
||||||
void test6();
|
void test6();
|
||||||
void test7();
|
void test7();
|
||||||
|
void test_rmw_4k_degraded_into_lost_to_normal(bool ec);
|
||||||
void test8();
|
void test8();
|
||||||
void test9();
|
void test9();
|
||||||
void test10();
|
void test10();
|
||||||
@@ -24,7 +25,7 @@ void test11();
|
|||||||
void test12();
|
void test12();
|
||||||
void test13();
|
void test13();
|
||||||
void test14();
|
void test14();
|
||||||
void test15();
|
void test15(bool second);
|
||||||
void test16();
|
void test16();
|
||||||
|
|
||||||
int main(int narg, char *args[])
|
int main(int narg, char *args[])
|
||||||
@@ -39,6 +40,8 @@ int main(int narg, char *args[])
|
|||||||
test6();
|
test6();
|
||||||
// Test 7
|
// Test 7
|
||||||
test7();
|
test7();
|
||||||
|
test_rmw_4k_degraded_into_lost_to_normal(false);
|
||||||
|
test_rmw_4k_degraded_into_lost_to_normal(true);
|
||||||
// Test 8
|
// Test 8
|
||||||
test8();
|
test8();
|
||||||
// Test 9
|
// Test 9
|
||||||
@@ -54,7 +57,8 @@ int main(int narg, char *args[])
|
|||||||
// Test 14
|
// Test 14
|
||||||
test14();
|
test14();
|
||||||
// Test 15
|
// Test 15
|
||||||
test15();
|
test15(false);
|
||||||
|
test15(true);
|
||||||
// Test 16
|
// Test 16
|
||||||
test16();
|
test16();
|
||||||
// End
|
// End
|
||||||
@@ -315,6 +319,69 @@ void test7()
|
|||||||
|
|
||||||
/***
|
/***
|
||||||
|
|
||||||
|
7/2. calc_rmw(offset=48K, len=4K, osd_set=[0,2,3], write_set=[1,2,3])
|
||||||
|
= {
|
||||||
|
read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
|
||||||
|
write: [ [ 48K, 52K ], [ 0, 0 ], [ 48K, 52K ] ],
|
||||||
|
input buffer: [ write0 ],
|
||||||
|
rmw buffer: [ write2, read0, read1, read2 ],
|
||||||
|
}
|
||||||
|
then, after calc_rmw_parity_xor/ec(): {
|
||||||
|
write: [ [ 0, 128K ], [ 0, 0 ], [ 48K, 52K ] ],
|
||||||
|
write0==read0,
|
||||||
|
}
|
||||||
|
+ check write0, write2 buffers
|
||||||
|
|
||||||
|
***/
|
||||||
|
|
||||||
|
void test_rmw_4k_degraded_into_lost_to_normal(bool ec)
|
||||||
|
{
|
||||||
|
osd_num_t osd_set[3] = { 0, 2, 3 };
|
||||||
|
osd_num_t write_osd_set[3] = { 1, 2, 3 };
|
||||||
|
osd_rmw_stripe_t stripes[3] = {};
|
||||||
|
// Subtest 1
|
||||||
|
split_stripes(2, 128*1024, 48*1024, 4096, stripes);
|
||||||
|
void *write_buf = malloc(4096);
|
||||||
|
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024, 0);
|
||||||
|
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
|
||||||
|
assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
|
||||||
|
assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
|
||||||
|
assert(stripes[0].write_start == 48*1024 && stripes[0].write_end == 52*1024);
|
||||||
|
assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
|
||||||
|
assert(stripes[2].write_start == 48*1024 && stripes[2].write_end == 52*1024);
|
||||||
|
assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
|
||||||
|
assert(stripes[1].read_buf == (uint8_t*)rmw_buf+4*1024+128*1024);
|
||||||
|
assert(stripes[2].read_buf == (uint8_t*)rmw_buf+4*1024+2*128*1024);
|
||||||
|
assert(stripes[0].write_buf == write_buf);
|
||||||
|
assert(stripes[1].write_buf == NULL);
|
||||||
|
assert(stripes[2].write_buf == rmw_buf);
|
||||||
|
// Subtest 2
|
||||||
|
set_pattern(write_buf, 4096, PATTERN2);
|
||||||
|
set_pattern(stripes[1].read_buf, 128*1024, PATTERN1);
|
||||||
|
set_pattern(stripes[2].read_buf, 128*1024, PATTERN0^PATTERN1);
|
||||||
|
if (!ec)
|
||||||
|
calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024, 0);
|
||||||
|
else
|
||||||
|
{
|
||||||
|
use_ec(3, 2, true);
|
||||||
|
calc_rmw_parity_ec(stripes, 3, 2, osd_set, write_osd_set, 128*1024, 0);
|
||||||
|
use_ec(3, 2, false);
|
||||||
|
}
|
||||||
|
assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
|
||||||
|
assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
|
||||||
|
assert(stripes[2].write_start == 48*1024 && stripes[2].write_end == 52*1024);
|
||||||
|
assert(stripes[0].write_buf == stripes[0].read_buf);
|
||||||
|
assert(stripes[1].write_buf == NULL);
|
||||||
|
assert(stripes[2].write_buf == rmw_buf);
|
||||||
|
check_pattern(stripes[0].write_buf, 4096, PATTERN0);
|
||||||
|
check_pattern(stripes[0].write_buf+48*1024, 4096, PATTERN2);
|
||||||
|
check_pattern(stripes[2].write_buf, 4096, PATTERN2^PATTERN1); // new parity
|
||||||
|
free(rmw_buf);
|
||||||
|
free(write_buf);
|
||||||
|
}
|
||||||
|
|
||||||
|
/***
|
||||||
|
|
||||||
8. calc_rmw(offset=0, len=128K+4K, osd_set=[0,2,3], write_set=[1,2,3])
|
8. calc_rmw(offset=0, len=128K+4K, osd_set=[0,2,3], write_set=[1,2,3])
|
||||||
= {
|
= {
|
||||||
read: [ [ 0, 0 ], [ 4K, 128K ], [ 0, 0 ] ],
|
read: [ [ 0, 0 ], [ 4K, 128K ], [ 0, 0 ] ],
|
||||||
@@ -826,12 +893,11 @@ void test14()
|
|||||||
|
|
||||||
***/
|
***/
|
||||||
|
|
||||||
void test15()
|
void test15(bool second)
|
||||||
{
|
{
|
||||||
const int bmp = 64*1024 / 4096 / 8;
|
const int bmp = 64*1024 / 4096 / 8;
|
||||||
use_ec(4, 2, true);
|
use_ec(4, 2, true);
|
||||||
osd_num_t osd_set[4] = { 1, 2, 3, 0 };
|
osd_num_t osd_set[4] = { 1, 2, (osd_num_t)(second ? 0 : 3), (osd_num_t)(second ? 4 : 0) };
|
||||||
osd_num_t write_osd_set[4] = { 1, 2, 3, 0 };
|
|
||||||
osd_rmw_stripe_t stripes[4] = {};
|
osd_rmw_stripe_t stripes[4] = {};
|
||||||
unsigned bitmaps[4] = { 0 };
|
unsigned bitmaps[4] = { 0 };
|
||||||
// Test 15.0
|
// Test 15.0
|
||||||
@@ -842,7 +908,7 @@ void test15()
|
|||||||
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
|
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
|
||||||
assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
|
assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
|
||||||
// Test 15.1
|
// Test 15.1
|
||||||
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, write_osd_set, 64*1024, bmp);
|
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, osd_set, 64*1024, bmp);
|
||||||
for (int i = 0; i < 4; i++)
|
for (int i = 0; i < 4; i++)
|
||||||
stripes[i].bmp_buf = bitmaps+i;
|
stripes[i].bmp_buf = bitmaps+i;
|
||||||
assert(rmw_buf);
|
assert(rmw_buf);
|
||||||
@@ -852,32 +918,34 @@ void test15()
|
|||||||
assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
|
assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
|
||||||
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
|
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
|
||||||
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
|
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
|
||||||
assert(stripes[2].write_start == 28*1024 && stripes[2].write_end == 32*1024);
|
assert(stripes[2+second].write_start == 28*1024 && stripes[2+second].write_end == 32*1024);
|
||||||
assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
|
assert(stripes[3-second].write_start == 0 && stripes[3-second].write_end == 0);
|
||||||
assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
|
assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
|
||||||
assert(stripes[1].read_buf == NULL);
|
assert(stripes[1].read_buf == NULL);
|
||||||
assert(stripes[2].read_buf == NULL);
|
assert(stripes[2].read_buf == NULL);
|
||||||
assert(stripes[3].read_buf == NULL);
|
assert(stripes[3].read_buf == NULL);
|
||||||
assert(stripes[0].write_buf == NULL);
|
assert(stripes[0].write_buf == NULL);
|
||||||
assert(stripes[1].write_buf == (uint8_t*)write_buf);
|
assert(stripes[1].write_buf == (uint8_t*)write_buf);
|
||||||
assert(stripes[2].write_buf == rmw_buf);
|
assert(stripes[2+second].write_buf == rmw_buf);
|
||||||
assert(stripes[3].write_buf == NULL);
|
assert(stripes[3-second].write_buf == NULL);
|
||||||
// Test 15.2 - encode
|
// Test 15.2 - encode
|
||||||
set_pattern(write_buf, 4*1024, PATTERN1);
|
set_pattern(write_buf, 4*1024, PATTERN1);
|
||||||
set_pattern(stripes[0].read_buf, 4*1024, PATTERN2);
|
set_pattern(stripes[0].read_buf, 4*1024, PATTERN2);
|
||||||
memset(stripes[0].bmp_buf, 0, bmp);
|
memset(stripes[0].bmp_buf, 0, bmp);
|
||||||
memset(stripes[1].bmp_buf, 0, bmp);
|
memset(stripes[1].bmp_buf, 0, bmp);
|
||||||
calc_rmw_parity_ec(stripes, 4, 2, osd_set, write_osd_set, 64*1024, bmp);
|
memset(stripes[2+second].write_buf, 0, 4096);
|
||||||
assert(*(uint32_t*)stripes[2].bmp_buf == 0x80);
|
calc_rmw_parity_ec(stripes, 4, 2, osd_set, osd_set, 64*1024, bmp);
|
||||||
|
assert(second || *(uint32_t*)stripes[2].bmp_buf == 0x80);
|
||||||
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
|
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
|
||||||
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
|
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
|
||||||
assert(stripes[2].write_start == 28*1024 && stripes[2].write_end == 32*1024);
|
assert(stripes[2+second].write_start == 28*1024 && stripes[2+second].write_end == 32*1024);
|
||||||
assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
|
assert(stripes[3-second].write_start == 0 && stripes[3-second].write_end == 0);
|
||||||
assert(stripes[0].write_buf == NULL);
|
assert(stripes[0].write_buf == NULL);
|
||||||
assert(stripes[1].write_buf == (uint8_t*)write_buf);
|
assert(stripes[1].write_buf == (uint8_t*)write_buf);
|
||||||
assert(stripes[2].write_buf == rmw_buf);
|
assert(stripes[2+second].write_buf == rmw_buf);
|
||||||
assert(stripes[3].write_buf == NULL);
|
assert(stripes[3-second].write_buf == NULL);
|
||||||
check_pattern(stripes[2].write_buf, 4*1024, PATTERN1^PATTERN2); // first parity is always xor :)
|
// first parity is always xor :), second isn't...
|
||||||
|
check_pattern(stripes[2+second].write_buf, 4*1024, second ? 0xb79a59a0ce8b9b81 : PATTERN1^PATTERN2);
|
||||||
// Done
|
// Done
|
||||||
free(rmw_buf);
|
free(rmw_buf);
|
||||||
free(write_buf);
|
free(write_buf);
|
||||||
|
@@ -150,6 +150,7 @@ int connect_osd(const char *osd_address, int osd_port)
|
|||||||
if (connect(connect_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
|
if (connect(connect_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
|
||||||
{
|
{
|
||||||
perror("connect");
|
perror("connect");
|
||||||
|
close(connect_fd);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
int one = 1;
|
int one = 1;
|
||||||
|
@@ -15,7 +15,7 @@ int read_blocking(int fd, void *read_buf, size_t remaining)
|
|||||||
size_t done = 0;
|
size_t done = 0;
|
||||||
while (done < remaining)
|
while (done < remaining)
|
||||||
{
|
{
|
||||||
size_t r = read(fd, read_buf, remaining-done);
|
ssize_t r = read(fd, read_buf, remaining-done);
|
||||||
if (r <= 0)
|
if (r <= 0)
|
||||||
{
|
{
|
||||||
if (!errno)
|
if (!errno)
|
||||||
@@ -41,7 +41,7 @@ int write_blocking(int fd, void *write_buf, size_t remaining)
|
|||||||
size_t done = 0;
|
size_t done = 0;
|
||||||
while (done < remaining)
|
while (done < remaining)
|
||||||
{
|
{
|
||||||
size_t r = write(fd, write_buf, remaining-done);
|
ssize_t r = write(fd, write_buf, remaining-done);
|
||||||
if (r < 0)
|
if (r < 0)
|
||||||
{
|
{
|
||||||
if (errno != EINTR && errno != EAGAIN && errno != EPIPE)
|
if (errno != EINTR && errno != EAGAIN && errno != EPIPE)
|
||||||
|
@@ -83,6 +83,7 @@ int connect_stub(const char *server_address, int server_port)
|
|||||||
if (connect(connect_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
|
if (connect(connect_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
|
||||||
{
|
{
|
||||||
perror("connect");
|
perror("connect");
|
||||||
|
close(connect_fd);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
int one = 1;
|
int one = 1;
|
||||||
|
@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
|
|||||||
|
|
||||||
Name: Vitastor
|
Name: Vitastor
|
||||||
Description: Vitastor client library
|
Description: Vitastor client library
|
||||||
Version: 0.8.5
|
Version: 0.8.7
|
||||||
Libs: -L${libdir} -lvitastor_client
|
Libs: -L${libdir} -lvitastor_client
|
||||||
Cflags: -I${includedir}
|
Cflags: -I${includedir}
|
||||||
|
|
||||||
|
@@ -64,4 +64,4 @@ echo leak:librbd >> testdata/lsan-suppress.txt
|
|||||||
echo leak:_M_mutate >> testdata/lsan-suppress.txt
|
echo leak:_M_mutate >> testdata/lsan-suppress.txt
|
||||||
echo leak:_M_assign >> testdata/lsan-suppress.txt
|
echo leak:_M_assign >> testdata/lsan-suppress.txt
|
||||||
export LSAN_OPTIONS=report_objects=true:suppressions=`pwd`/testdata/lsan-suppress.txt
|
export LSAN_OPTIONS=report_objects=true:suppressions=`pwd`/testdata/lsan-suppress.txt
|
||||||
export ASAN_OPTIONS=verify_asan_link_order=false
|
export ASAN_OPTIONS=verify_asan_link_order=false:abort_on_error=1
|
||||||
|
@@ -17,17 +17,17 @@ else
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$IMMEDIATE_COMMIT" != "" ]; then
|
if [ "$IMMEDIATE_COMMIT" != "" ]; then
|
||||||
NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 1"
|
NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 10"
|
||||||
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1,"immediate_commit":"all"}'
|
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1,"immediate_commit":"all"}'
|
||||||
else
|
else
|
||||||
NO_SAME="--journal_sector_buffer_count 1024 --log_level 1"
|
NO_SAME="--journal_sector_buffer_count 1024 --log_level 10"
|
||||||
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1}'
|
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1}'
|
||||||
fi
|
fi
|
||||||
|
|
||||||
start_osd()
|
start_osd()
|
||||||
{
|
{
|
||||||
local i=$1
|
local i=$1
|
||||||
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
|
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) >>./testdata/osd$i.log 2>&1 &
|
||||||
eval OSD${i}_PID=$!
|
eval OSD${i}_PID=$!
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -43,3 +43,6 @@ SCHEME=ec ./test_snapshot.sh
|
|||||||
SCHEME=xor ./test_write.sh
|
SCHEME=xor ./test_write.sh
|
||||||
|
|
||||||
./test_write_no_same.sh
|
./test_write_no_same.sh
|
||||||
|
|
||||||
|
./test_heal.sh
|
||||||
|
SCHEME=ec PG_MINSIZE=2 ./test_heal.sh
|
||||||
|
@@ -43,7 +43,7 @@ kill_osds &
|
|||||||
|
|
||||||
LD_PRELOAD="build/src/libfio_vitastor.so" \
|
LD_PRELOAD="build/src/libfio_vitastor.so" \
|
||||||
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=16 -fsync=256 -rw=randwrite \
|
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=16 -fsync=256 -rw=randwrite \
|
||||||
-mirror_file=./testdata/mirror.bin -etcd=$ETCD_URL -image=testimg -loops=10 -runtime=120 2>/dev/null
|
-mirror_file=./testdata/mirror.bin -etcd=$ETCD_URL -image=testimg -loops=10 -runtime=120
|
||||||
|
|
||||||
qemu-img convert -S 4096 -p \
|
qemu-img convert -S 4096 -p \
|
||||||
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:image=testimg" \
|
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:image=testimg" \
|
||||||
|
@@ -7,7 +7,7 @@ OSD_COUNT=5
|
|||||||
OSD_ARGS=
|
OSD_ARGS=
|
||||||
for i in $(seq 1 $OSD_COUNT); do
|
for i in $(seq 1 $OSD_COUNT); do
|
||||||
dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
|
dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
|
||||||
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
|
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) >>./testdata/osd$i.log 2>&1 &
|
||||||
eval OSD${i}_PID=$!
|
eval OSD${i}_PID=$!
|
||||||
done
|
done
|
||||||
|
|
||||||
|
@@ -53,7 +53,7 @@ for i in $(seq 1 $OSD_COUNT); do
|
|||||||
--data_device ./testdata/test_osd$i.bin \
|
--data_device ./testdata/test_osd$i.bin \
|
||||||
--meta_offset 0 \
|
--meta_offset 0 \
|
||||||
--journal_offset $((1024*1024)) \
|
--journal_offset $((1024*1024)) \
|
||||||
--data_offset $((128*1024*1024)) &>./testdata/osd$i.log &
|
--data_offset $((128*1024*1024)) >>./testdata/osd$i.log 2>&1 &
|
||||||
eval OSD${i}_PID=$!
|
eval OSD${i}_PID=$!
|
||||||
done
|
done
|
||||||
|
|
||||||
|
@@ -21,7 +21,8 @@ LD_PRELOAD="build/src/libfio_vitastor.so" \
|
|||||||
# Kill OSD 2, start OSD 1
|
# Kill OSD 2, start OSD 1
|
||||||
|
|
||||||
kill $OSD2_PID
|
kill $OSD2_PID
|
||||||
build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options --device ./testdata/test_osd2.bin 2>/dev/null) >>./testdata/osd2.log 2>&1 &
|
build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL \
|
||||||
|
$(build/src/vitastor-disk simple-offsets --format options --device ./testdata/test_osd2.bin 2>/dev/null) >>./testdata/osd2.log 2>&1 &
|
||||||
sleep 2
|
sleep 2
|
||||||
|
|
||||||
# Check PG state - it should NOT become active
|
# Check PG state - it should NOT become active
|
||||||
|
@@ -10,7 +10,7 @@ etcdctl --endpoints=http://127.0.0.1:12379/v3 del --prefix /vitastor/osd/state
|
|||||||
OSD_COUNT=3
|
OSD_COUNT=3
|
||||||
OSD_ARGS=
|
OSD_ARGS=
|
||||||
for i in $(seq 1 $OSD_COUNT); do
|
for i in $(seq 1 $OSD_COUNT); do
|
||||||
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
|
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) >>./testdata/osd$i.log 2>&1 &
|
||||||
eval OSD${i}_PID=$!
|
eval OSD${i}_PID=$!
|
||||||
done
|
done
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user