forked from vitalif/vitastor
Compare commits
1 Commits
epoch-dele
...
rm-left-on
Author | SHA1 | Date | |
---|---|---|---|
0f964d62db |
@@ -1,7 +1,7 @@
|
|||||||
cmake_minimum_required(VERSION 2.8.12)
|
cmake_minimum_required(VERSION 2.8)
|
||||||
|
|
||||||
project(vitastor)
|
project(vitastor)
|
||||||
|
|
||||||
set(VERSION "0.8.8")
|
set(VERSION "0.8.3")
|
||||||
|
|
||||||
add_subdirectory(src)
|
add_subdirectory(src)
|
||||||
|
@@ -48,9 +48,9 @@ Vitastor, составлены для того, чтобы убедиться,
|
|||||||
интерфейс (прокси), опять же, без открытия в свободный публичный доступ как
|
интерфейс (прокси), опять же, без открытия в свободный публичный доступ как
|
||||||
самой программы, так и прокси.
|
самой программы, так и прокси.
|
||||||
|
|
||||||
Сетевая Публичная Лицензия Vitastor разработана специально, чтобы
|
Сетевая Публичная Лицензия Vitastor разработана специально, чтобы
|
||||||
гарантировать, что в таких случаях и модифицированная версия программы, и
|
гарантировать, что в таких случаях и модифицированная версия программы, и
|
||||||
прокси останутся доступными сообществу. Для этого лицензия требует от
|
прокси останутся доступными сообществу. Для этого лицензия требует от
|
||||||
операторов сетевых серверов предоставлять исходный код оригинальной программы,
|
операторов сетевых серверов предоставлять исходный код оригинальной программы,
|
||||||
а также всех других программ, взаимодействующих с ней на их серверах,
|
а также всех других программ, взаимодействующих с ней на их серверах,
|
||||||
пользователям этих серверов, на условиях свободных лицензий. Таким образом,
|
пользователям этих серверов, на условиях свободных лицензий. Таким образом,
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
VERSION ?= v0.8.8
|
VERSION ?= v0.8.3
|
||||||
|
|
||||||
all: build push
|
all: build push
|
||||||
|
|
||||||
|
@@ -49,7 +49,7 @@ spec:
|
|||||||
capabilities:
|
capabilities:
|
||||||
add: ["SYS_ADMIN"]
|
add: ["SYS_ADMIN"]
|
||||||
allowPrivilegeEscalation: true
|
allowPrivilegeEscalation: true
|
||||||
image: vitalif/vitastor-csi:v0.8.8
|
image: vitalif/vitastor-csi:v0.8.3
|
||||||
args:
|
args:
|
||||||
- "--node=$(NODE_ID)"
|
- "--node=$(NODE_ID)"
|
||||||
- "--endpoint=$(CSI_ENDPOINT)"
|
- "--endpoint=$(CSI_ENDPOINT)"
|
||||||
|
@@ -116,7 +116,7 @@ spec:
|
|||||||
privileged: true
|
privileged: true
|
||||||
capabilities:
|
capabilities:
|
||||||
add: ["SYS_ADMIN"]
|
add: ["SYS_ADMIN"]
|
||||||
image: vitalif/vitastor-csi:v0.8.8
|
image: vitalif/vitastor-csi:v0.8.3
|
||||||
args:
|
args:
|
||||||
- "--node=$(NODE_ID)"
|
- "--node=$(NODE_ID)"
|
||||||
- "--endpoint=$(CSI_ENDPOINT)"
|
- "--endpoint=$(CSI_ENDPOINT)"
|
||||||
|
@@ -5,7 +5,7 @@ package vitastor
|
|||||||
|
|
||||||
const (
|
const (
|
||||||
vitastorCSIDriverName = "csi.vitastor.io"
|
vitastorCSIDriverName = "csi.vitastor.io"
|
||||||
vitastorCSIDriverVersion = "0.8.8"
|
vitastorCSIDriverVersion = "0.8.3"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Config struct fills the parameters of request or user input
|
// Config struct fills the parameters of request or user input
|
||||||
|
@@ -6,11 +6,11 @@ package vitastor
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
|
||||||
"strings"
|
"strings"
|
||||||
"bytes"
|
"bytes"
|
||||||
"strconv"
|
"strconv"
|
||||||
"time"
|
"time"
|
||||||
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
@@ -21,6 +21,8 @@ import (
|
|||||||
"google.golang.org/grpc/codes"
|
"google.golang.org/grpc/codes"
|
||||||
"google.golang.org/grpc/status"
|
"google.golang.org/grpc/status"
|
||||||
|
|
||||||
|
"go.etcd.io/etcd/clientv3"
|
||||||
|
|
||||||
"github.com/container-storage-interface/spec/lib/go/csi"
|
"github.com/container-storage-interface/spec/lib/go/csi"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -112,34 +114,6 @@ func GetConnectionParams(params map[string]string) (map[string]string, []string,
|
|||||||
return ctxVars, etcdUrl, etcdPrefix
|
return ctxVars, etcdUrl, etcdPrefix
|
||||||
}
|
}
|
||||||
|
|
||||||
func invokeCLI(ctxVars map[string]string, args []string) ([]byte, error)
|
|
||||||
{
|
|
||||||
if (ctxVars["etcdUrl"] != "")
|
|
||||||
{
|
|
||||||
args = append(args, "--etcd_address", ctxVars["etcdUrl"])
|
|
||||||
}
|
|
||||||
if (ctxVars["etcdPrefix"] != "")
|
|
||||||
{
|
|
||||||
args = append(args, "--etcd_prefix", ctxVars["etcdPrefix"])
|
|
||||||
}
|
|
||||||
if (ctxVars["configPath"] != "")
|
|
||||||
{
|
|
||||||
args = append(args, "--config_path", ctxVars["configPath"])
|
|
||||||
}
|
|
||||||
c := exec.Command("/usr/bin/vitastor-cli", args...)
|
|
||||||
var stdout, stderr bytes.Buffer
|
|
||||||
c.Stdout = &stdout
|
|
||||||
c.Stderr = &stderr
|
|
||||||
err := c.Run()
|
|
||||||
stderrStr := string(stderr.Bytes())
|
|
||||||
if (err != nil)
|
|
||||||
{
|
|
||||||
klog.Errorf("vitastor-cli %s failed: %s, status %s\n", strings.Join(args, " "), stderrStr, err)
|
|
||||||
return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
|
|
||||||
}
|
|
||||||
return stdout.Bytes(), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create the volume
|
// Create the volume
|
||||||
func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) (*csi.CreateVolumeResponse, error)
|
func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) (*csi.CreateVolumeResponse, error)
|
||||||
{
|
{
|
||||||
@@ -172,41 +146,128 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
|
|||||||
volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
|
volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
|
||||||
}
|
}
|
||||||
|
|
||||||
ctxVars, etcdUrl, _ := GetConnectionParams(req.Parameters)
|
// FIXME: The following should PROBABLY be implemented externally in a management tool
|
||||||
|
|
||||||
|
ctxVars, etcdUrl, etcdPrefix := GetConnectionParams(req.Parameters)
|
||||||
if (len(etcdUrl) == 0)
|
if (len(etcdUrl) == 0)
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
|
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create image using vitastor-cli
|
// Connect to etcd
|
||||||
_, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) })
|
cli, err := clientv3.New(clientv3.Config{
|
||||||
|
DialTimeout: ETCD_TIMEOUT,
|
||||||
|
Endpoints: etcdUrl,
|
||||||
|
})
|
||||||
if (err != nil)
|
if (err != nil)
|
||||||
{
|
{
|
||||||
if (strings.Index(err.Error(), "already exists") > 0)
|
return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error())
|
||||||
|
}
|
||||||
|
defer cli.Close()
|
||||||
|
|
||||||
|
var imageId uint64 = 0
|
||||||
|
for
|
||||||
|
{
|
||||||
|
// Check if the image exists
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||||
|
resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
|
||||||
|
cancel()
|
||||||
|
if (err != nil)
|
||||||
{
|
{
|
||||||
stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", volName })
|
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
if (len(resp.Kvs) > 0)
|
||||||
|
{
|
||||||
|
kv := resp.Kvs[0]
|
||||||
|
var v InodeIndex
|
||||||
|
err := json.Unmarshal(kv.Value, &v)
|
||||||
if (err != nil)
|
if (err != nil)
|
||||||
{
|
{
|
||||||
return nil, err
|
return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
|
||||||
}
|
}
|
||||||
var inodeCfg []InodeConfig
|
poolId = v.PoolId
|
||||||
err = json.Unmarshal(stat, &inodeCfg)
|
imageId = v.Id
|
||||||
|
inodeCfgKey := fmt.Sprintf("/config/inode/%d/%d", poolId, imageId)
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||||
|
resp, err := cli.Get(ctx, etcdPrefix+inodeCfgKey)
|
||||||
|
cancel()
|
||||||
if (err != nil)
|
if (err != nil)
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
|
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||||
}
|
}
|
||||||
if (len(inodeCfg) == 0)
|
if (len(resp.Kvs) == 0)
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.Internal, "vitastor-cli create said that image already exists, but ls can't find it")
|
return nil, status.Error(codes.Internal, "missing "+inodeCfgKey+" key in etcd")
|
||||||
}
|
}
|
||||||
if (inodeCfg[0].Size < uint64(volSize))
|
var inodeCfg InodeConfig
|
||||||
|
err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
if (inodeCfg.Size < uint64(volSize))
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
|
return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
return nil, err
|
// Find a free ID
|
||||||
|
// Create image metadata in a transaction verifying that the image doesn't exist yet AND ID is still free
|
||||||
|
maxIdKey := fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||||
|
resp, err := cli.Get(ctx, maxIdKey)
|
||||||
|
cancel()
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
var modRev int64
|
||||||
|
var nextId uint64
|
||||||
|
if (len(resp.Kvs) > 0)
|
||||||
|
{
|
||||||
|
var err error
|
||||||
|
nextId, err = strconv.ParseUint(string(resp.Kvs[0].Value), 10, 64)
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, maxIdKey+" contains invalid ID")
|
||||||
|
}
|
||||||
|
modRev = resp.Kvs[0].ModRevision
|
||||||
|
nextId++
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
nextId = 1
|
||||||
|
}
|
||||||
|
inodeIdxJson, _ := json.Marshal(InodeIndex{
|
||||||
|
Id: nextId,
|
||||||
|
PoolId: poolId,
|
||||||
|
})
|
||||||
|
inodeCfgJson, _ := json.Marshal(InodeConfig{
|
||||||
|
Name: volName,
|
||||||
|
Size: uint64(volSize),
|
||||||
|
})
|
||||||
|
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||||
|
txnResp, err := cli.Txn(ctx).If(
|
||||||
|
clientv3.Compare(clientv3.ModRevision(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)), "=", modRev),
|
||||||
|
clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)), "=", 0),
|
||||||
|
clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId)), "=", 0),
|
||||||
|
).Then(
|
||||||
|
clientv3.OpPut(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId), fmt.Sprintf("%d", nextId)),
|
||||||
|
clientv3.OpPut(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName), string(inodeIdxJson)),
|
||||||
|
clientv3.OpPut(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId), string(inodeCfgJson)),
|
||||||
|
).Commit()
|
||||||
|
cancel()
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "failed to commit transaction in etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
if (txnResp.Succeeded)
|
||||||
|
{
|
||||||
|
imageId = nextId
|
||||||
|
break
|
||||||
|
}
|
||||||
|
// Start over if the transaction fails
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -238,12 +299,97 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
|
|||||||
}
|
}
|
||||||
volName := ctxVars["name"]
|
volName := ctxVars["name"]
|
||||||
|
|
||||||
ctxVars, _, _ = GetConnectionParams(ctxVars)
|
_, etcdUrl, etcdPrefix := GetConnectionParams(ctxVars)
|
||||||
|
if (len(etcdUrl) == 0)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
|
||||||
|
}
|
||||||
|
|
||||||
_, err = invokeCLI(ctxVars, []string{ "rm", volName })
|
cli, err := clientv3.New(clientv3.Config{
|
||||||
|
DialTimeout: ETCD_TIMEOUT,
|
||||||
|
Endpoints: etcdUrl,
|
||||||
|
})
|
||||||
if (err != nil)
|
if (err != nil)
|
||||||
{
|
{
|
||||||
return nil, err
|
return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error())
|
||||||
|
}
|
||||||
|
defer cli.Close()
|
||||||
|
|
||||||
|
// Find inode by name
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||||
|
resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
|
||||||
|
cancel()
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
if (len(resp.Kvs) == 0)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
|
||||||
|
}
|
||||||
|
var idx InodeIndex
|
||||||
|
err = json.Unmarshal(resp.Kvs[0].Value, &idx)
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get inode config
|
||||||
|
inodeCfgKey := fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)
|
||||||
|
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||||
|
resp, err = cli.Get(ctx, inodeCfgKey)
|
||||||
|
cancel()
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
if (len(resp.Kvs) == 0)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
|
||||||
|
}
|
||||||
|
var inodeCfg InodeConfig
|
||||||
|
err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Delete inode data by invoking vitastor-cli
|
||||||
|
args := []string{
|
||||||
|
"rm-data", "--etcd_address", strings.Join(etcdUrl, ","),
|
||||||
|
"--pool", fmt.Sprintf("%d", idx.PoolId),
|
||||||
|
"--inode", fmt.Sprintf("%d", idx.Id),
|
||||||
|
}
|
||||||
|
if (ctxVars["configPath"] != "")
|
||||||
|
{
|
||||||
|
args = append(args, "--config_path", ctxVars["configPath"])
|
||||||
|
}
|
||||||
|
c := exec.Command("/usr/bin/vitastor-cli", args...)
|
||||||
|
var stderr bytes.Buffer
|
||||||
|
c.Stdout = nil
|
||||||
|
c.Stderr = &stderr
|
||||||
|
err = c.Run()
|
||||||
|
stderrStr := string(stderr.Bytes())
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
klog.Errorf("vitastor-cli rm-data failed: %s, status %s\n", stderrStr, err)
|
||||||
|
return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Delete inode config in etcd
|
||||||
|
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||||
|
txnResp, err := cli.Txn(ctx).Then(
|
||||||
|
clientv3.OpDelete(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)),
|
||||||
|
clientv3.OpDelete(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)),
|
||||||
|
).Commit()
|
||||||
|
cancel()
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "failed to delete keys in etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
if (!txnResp.Succeeded)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "failed to delete keys in etcd: transaction failed")
|
||||||
}
|
}
|
||||||
|
|
||||||
return &csi.DeleteVolumeResponse{}, nil
|
return &csi.DeleteVolumeResponse{}, nil
|
||||||
|
4
debian/changelog
vendored
4
debian/changelog
vendored
@@ -1,10 +1,10 @@
|
|||||||
vitastor (0.8.8-1) unstable; urgency=medium
|
vitastor (0.8.3-1) unstable; urgency=medium
|
||||||
|
|
||||||
* Bugfixes
|
* Bugfixes
|
||||||
|
|
||||||
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
|
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
|
||||||
|
|
||||||
vitastor (0.8.8-1) unstable; urgency=medium
|
vitastor (0.8.3-1) unstable; urgency=medium
|
||||||
|
|
||||||
* Implement NFS proxy
|
* Implement NFS proxy
|
||||||
* Add documentation
|
* Add documentation
|
||||||
|
8
debian/vitastor.Dockerfile
vendored
8
debian/vitastor.Dockerfile
vendored
@@ -34,8 +34,8 @@ RUN set -e -x; \
|
|||||||
mkdir -p /root/packages/vitastor-$REL; \
|
mkdir -p /root/packages/vitastor-$REL; \
|
||||||
rm -rf /root/packages/vitastor-$REL/*; \
|
rm -rf /root/packages/vitastor-$REL/*; \
|
||||||
cd /root/packages/vitastor-$REL; \
|
cd /root/packages/vitastor-$REL; \
|
||||||
cp -r /root/vitastor vitastor-0.8.8; \
|
cp -r /root/vitastor vitastor-0.8.3; \
|
||||||
cd vitastor-0.8.8; \
|
cd vitastor-0.8.3; \
|
||||||
ln -s /root/fio-build/fio-*/ ./fio; \
|
ln -s /root/fio-build/fio-*/ ./fio; \
|
||||||
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||||
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
||||||
@@ -48,8 +48,8 @@ RUN set -e -x; \
|
|||||||
rm -rf a b; \
|
rm -rf a b; \
|
||||||
echo "dep:fio=$FIO" > debian/fio_version; \
|
echo "dep:fio=$FIO" > debian/fio_version; \
|
||||||
cd /root/packages/vitastor-$REL; \
|
cd /root/packages/vitastor-$REL; \
|
||||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.8.orig.tar.xz vitastor-0.8.8; \
|
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.3.orig.tar.xz vitastor-0.8.3; \
|
||||||
cd vitastor-0.8.8; \
|
cd vitastor-0.8.3; \
|
||||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
||||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||||
|
@@ -17,16 +17,14 @@ Configuration parameters can be set in 3 places:
|
|||||||
- Configuration file (`/etc/vitastor/vitastor.conf` or other path)
|
- Configuration file (`/etc/vitastor/vitastor.conf` or other path)
|
||||||
- etcd key `/vitastor/config/global`. Most variables can be set there, but etcd
|
- etcd key `/vitastor/config/global`. Most variables can be set there, but etcd
|
||||||
connection parameters should obviously be set in the configuration file.
|
connection parameters should obviously be set in the configuration file.
|
||||||
- Command line of Vitastor components: OSD (when you run it without vitastor-disk),
|
- Command line of Vitastor components: OSD, mon, fio and QEMU options,
|
||||||
mon, fio and QEMU options, OpenStack/Proxmox/etc configuration. The latter
|
OpenStack/Proxmox/etc configuration. The latter doesn't allow setting all
|
||||||
doesn't allow setting all variables directly, but it allows overriding the
|
variables directly, but it allows overriding the configuration file and
|
||||||
configuration file and setting everything you need inside it.
|
setting everything you need inside it.
|
||||||
- OSD superblocks created by [vitastor-disk](../usage/disk.en.md) contain
|
|
||||||
primarily disk layout parameters of specific OSDs. In fact, these parameters
|
|
||||||
are automatically passed into the command line of vitastor-osd process, so
|
|
||||||
they have the same "status" as command-line parameters.
|
|
||||||
|
|
||||||
In the future, additional configuration methods may be added:
|
In the future, additional configuration methods may be added:
|
||||||
|
- OSD superblock which will, by design, contain parameters related to the disk
|
||||||
|
layout and to one specific OSD.
|
||||||
- OSD-specific keys in etcd like `/vitastor/config/osd/<number>`.
|
- OSD-specific keys in etcd like `/vitastor/config/osd/<number>`.
|
||||||
|
|
||||||
## Parameter Reference
|
## Parameter Reference
|
||||||
|
@@ -19,17 +19,14 @@
|
|||||||
- Ключе в etcd `/vitastor/config/global`. Большая часть параметров может
|
- Ключе в etcd `/vitastor/config/global`. Большая часть параметров может
|
||||||
задаваться там, кроме, естественно, самих параметров соединения с etcd,
|
задаваться там, кроме, естественно, самих параметров соединения с etcd,
|
||||||
которые должны задаваться в файле конфигурации
|
которые должны задаваться в файле конфигурации
|
||||||
- В командной строке компонентов Vitastor: OSD (при ручном запуске без vitastor-disk),
|
- В командной строке компонентов Vitastor: OSD, монитора, опциях fio и QEMU,
|
||||||
монитора, опциях fio и QEMU, настроек OpenStack, Proxmox и т.п. Последние,
|
настроек OpenStack, Proxmox и т.п. Последние, как правило, не включают полный
|
||||||
как правило, не включают полный набор параметров напрямую, но позволяют
|
набор параметров напрямую, но разрешают определить путь к файлу конфигурации
|
||||||
определить путь к файлу конфигурации и задать любые параметры в нём.
|
и задать любые параметры в нём.
|
||||||
- В суперблоке OSD, записываемом [vitastor-disk](../usage/disk.ru.md) - параметры,
|
|
||||||
связанные с дисковым форматом и с этим конкретным OSD. На самом деле,
|
|
||||||
при запуске OSD эти параметры автоматически передаются в командную строку
|
|
||||||
процесса vitastor-osd, то есть по "статусу" они эквивалентны параметрам
|
|
||||||
командной строки OSD.
|
|
||||||
|
|
||||||
В будущем также могут быть добавлены другие способы конфигурации:
|
В будущем также могут быть добавлены другие способы конфигурации:
|
||||||
|
- Суперблок OSD, в котором будут храниться параметры OSD, связанные с дисковым
|
||||||
|
форматом и с этим конкретным OSD.
|
||||||
- OSD-специфичные ключи в etcd типа `/vitastor/config/osd/<номер>`.
|
- OSD-специфичные ключи в etcd типа `/vitastor/config/osd/<номер>`.
|
||||||
|
|
||||||
## Список параметров
|
## Список параметров
|
||||||
|
@@ -19,7 +19,6 @@ between clients, OSDs and etcd.
|
|||||||
- [rdma_max_sge](#rdma_max_sge)
|
- [rdma_max_sge](#rdma_max_sge)
|
||||||
- [rdma_max_msg](#rdma_max_msg)
|
- [rdma_max_msg](#rdma_max_msg)
|
||||||
- [rdma_max_recv](#rdma_max_recv)
|
- [rdma_max_recv](#rdma_max_recv)
|
||||||
- [rdma_max_send](#rdma_max_send)
|
|
||||||
- [peer_connect_interval](#peer_connect_interval)
|
- [peer_connect_interval](#peer_connect_interval)
|
||||||
- [peer_connect_timeout](#peer_connect_timeout)
|
- [peer_connect_timeout](#peer_connect_timeout)
|
||||||
- [osd_idle_timeout](#osd_idle_timeout)
|
- [osd_idle_timeout](#osd_idle_timeout)
|
||||||
@@ -75,12 +74,6 @@ to work. For example, Mellanox ConnectX-3 and older adapters don't have
|
|||||||
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
|
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
|
||||||
root to list available RDMA devices and their features.
|
root to list available RDMA devices and their features.
|
||||||
|
|
||||||
Remember that you also have to configure your network switches if you use
|
|
||||||
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
|
|
||||||
the manual of your network vendor for details about setting up the switch
|
|
||||||
for RoCEv2 correctly. Usually it means setting up Lossless Ethernet with
|
|
||||||
PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
|
|
||||||
|
|
||||||
## rdma_port_num
|
## rdma_port_num
|
||||||
|
|
||||||
- Type: integer
|
- Type: integer
|
||||||
@@ -123,30 +116,20 @@ required to change this parameter.
|
|||||||
## rdma_max_msg
|
## rdma_max_msg
|
||||||
|
|
||||||
- Type: integer
|
- Type: integer
|
||||||
- Default: 132096
|
- Default: 1048576
|
||||||
|
|
||||||
Maximum size of a single RDMA send or receive operation in bytes.
|
Maximum size of a single RDMA send or receive operation in bytes.
|
||||||
|
|
||||||
## rdma_max_recv
|
## rdma_max_recv
|
||||||
|
|
||||||
- Type: integer
|
|
||||||
- Default: 16
|
|
||||||
|
|
||||||
Maximum number of RDMA receive buffers per connection (RDMA requires
|
|
||||||
preallocated buffers to receive data). Each buffer is `rdma_max_msg` bytes
|
|
||||||
in size. So this setting directly affects memory usage: a single Vitastor
|
|
||||||
RDMA client uses `rdma_max_recv * rdma_max_msg * OSD_COUNT` bytes of memory.
|
|
||||||
Default is roughly 2 MB * number of OSDs.
|
|
||||||
|
|
||||||
## rdma_max_send
|
|
||||||
|
|
||||||
- Type: integer
|
- Type: integer
|
||||||
- Default: 8
|
- Default: 8
|
||||||
|
|
||||||
Maximum number of outstanding RDMA send operations per connection. Should be
|
Maximum number of parallel RDMA receive operations. Note that this number
|
||||||
less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
|
of receive buffers `rdma_max_msg` in size are allocated for each client,
|
||||||
Doesn't affect memory usage - additional memory isn't allocated for send
|
so this setting actually affects memory usage. This is because RDMA receive
|
||||||
operations.
|
operations are (sadly) still not zero-copy in Vitastor. It may be fixed in
|
||||||
|
later versions.
|
||||||
|
|
||||||
## peer_connect_interval
|
## peer_connect_interval
|
||||||
|
|
||||||
|
@@ -19,7 +19,6 @@
|
|||||||
- [rdma_max_sge](#rdma_max_sge)
|
- [rdma_max_sge](#rdma_max_sge)
|
||||||
- [rdma_max_msg](#rdma_max_msg)
|
- [rdma_max_msg](#rdma_max_msg)
|
||||||
- [rdma_max_recv](#rdma_max_recv)
|
- [rdma_max_recv](#rdma_max_recv)
|
||||||
- [rdma_max_send](#rdma_max_send)
|
|
||||||
- [peer_connect_interval](#peer_connect_interval)
|
- [peer_connect_interval](#peer_connect_interval)
|
||||||
- [peer_connect_timeout](#peer_connect_timeout)
|
- [peer_connect_timeout](#peer_connect_timeout)
|
||||||
- [osd_idle_timeout](#osd_idle_timeout)
|
- [osd_idle_timeout](#osd_idle_timeout)
|
||||||
@@ -79,13 +78,6 @@ Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Наприме
|
|||||||
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
|
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
|
||||||
параметры и возможности.
|
параметры и возможности.
|
||||||
|
|
||||||
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
|
|
||||||
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
|
|
||||||
нестабильной производительностью. Подробную информацию о настройке
|
|
||||||
коммутатора для RoCEv2 ищите в документации производителя. Обычно это
|
|
||||||
подразумевает настройку сети без потерь на основе PFC (Priority Flow
|
|
||||||
Control) и ECN (Explicit Congestion Notification).
|
|
||||||
|
|
||||||
## rdma_port_num
|
## rdma_port_num
|
||||||
|
|
||||||
- Тип: целое число
|
- Тип: целое число
|
||||||
@@ -129,32 +121,22 @@ OSD в любом случае согласовывают реальное зн
|
|||||||
## rdma_max_msg
|
## rdma_max_msg
|
||||||
|
|
||||||
- Тип: целое число
|
- Тип: целое число
|
||||||
- Значение по умолчанию: 132096
|
- Значение по умолчанию: 1048576
|
||||||
|
|
||||||
Максимальный размер одной RDMA-операции отправки или приёма.
|
Максимальный размер одной RDMA-операции отправки или приёма.
|
||||||
|
|
||||||
## rdma_max_recv
|
## rdma_max_recv
|
||||||
|
|
||||||
- Тип: целое число
|
|
||||||
- Значение по умолчанию: 16
|
|
||||||
|
|
||||||
Максимальное число буферов для RDMA-приёма данных на одно соединение
|
|
||||||
(RDMA требует заранее выделенных буферов для приёма данных). Каждый буфер
|
|
||||||
имеет размер `rdma_max_msg` байт. Таким образом, настройка прямо влияет на
|
|
||||||
потребление памяти - один Vitastor-клиент с RDMA использует
|
|
||||||
`rdma_max_recv * rdma_max_msg * ЧИСЛО_OSD` байт памяти, по умолчанию -
|
|
||||||
примерно 2 МБ * число OSD.
|
|
||||||
|
|
||||||
## rdma_max_send
|
|
||||||
|
|
||||||
- Тип: целое число
|
- Тип: целое число
|
||||||
- Значение по умолчанию: 8
|
- Значение по умолчанию: 8
|
||||||
|
|
||||||
Максимальное число RDMA-операций отправки, отправляемых в очередь одного
|
Максимальное число параллельных RDMA-операций получения данных. Следует
|
||||||
соединения. Желательно, чтобы оно было меньше `rdma_max_recv`, чтобы
|
иметь в виду, что данное число буферов размером `rdma_max_msg` выделяется
|
||||||
у принимающей стороны в процессе работы не заканчивались буферы на приём.
|
для каждого подключённого клиентского соединения, так что данная настройка
|
||||||
Не влияет на потребление памяти - дополнительная память на операции отправки
|
влияет на потребление памяти. Это так потому, что RDMA-приём данных в
|
||||||
не выделяется.
|
Vitastor, увы, всё равно не является zero-copy, т.е. всё равно 1 раз
|
||||||
|
копирует данные в памяти. Данная особенность, возможно, будет исправлена в
|
||||||
|
более новых версиях Vitastor.
|
||||||
|
|
||||||
## peer_connect_interval
|
## peer_connect_interval
|
||||||
|
|
||||||
|
@@ -53,12 +53,6 @@
|
|||||||
to work. For example, Mellanox ConnectX-3 and older adapters don't have
|
to work. For example, Mellanox ConnectX-3 and older adapters don't have
|
||||||
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
|
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
|
||||||
root to list available RDMA devices and their features.
|
root to list available RDMA devices and their features.
|
||||||
|
|
||||||
Remember that you also have to configure your network switches if you use
|
|
||||||
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
|
|
||||||
the manual of your network vendor for details about setting up the switch
|
|
||||||
for RoCEv2 correctly. Usually it means setting up Lossless Ethernet with
|
|
||||||
PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
|
|
||||||
info_ru: |
|
info_ru: |
|
||||||
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
|
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
|
||||||
Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
|
Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
|
||||||
@@ -67,13 +61,6 @@
|
|||||||
потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
|
потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
|
||||||
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
|
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
|
||||||
параметры и возможности.
|
параметры и возможности.
|
||||||
|
|
||||||
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
|
|
||||||
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
|
|
||||||
нестабильной производительностью. Подробную информацию о настройке
|
|
||||||
коммутатора для RoCEv2 ищите в документации производителя. Обычно это
|
|
||||||
подразумевает настройку сети без потерь на основе PFC (Priority Flow
|
|
||||||
Control) и ECN (Explicit Congestion Notification).
|
|
||||||
- name: rdma_port_num
|
- name: rdma_port_num
|
||||||
type: int
|
type: int
|
||||||
default: 1
|
default: 1
|
||||||
@@ -127,39 +114,26 @@
|
|||||||
так что менять этот параметр обычно не нужно.
|
так что менять этот параметр обычно не нужно.
|
||||||
- name: rdma_max_msg
|
- name: rdma_max_msg
|
||||||
type: int
|
type: int
|
||||||
default: 132096
|
default: 1048576
|
||||||
info: Maximum size of a single RDMA send or receive operation in bytes.
|
info: Maximum size of a single RDMA send or receive operation in bytes.
|
||||||
info_ru: Максимальный размер одной RDMA-операции отправки или приёма.
|
info_ru: Максимальный размер одной RDMA-операции отправки или приёма.
|
||||||
- name: rdma_max_recv
|
- name: rdma_max_recv
|
||||||
type: int
|
|
||||||
default: 16
|
|
||||||
info: |
|
|
||||||
Maximum number of RDMA receive buffers per connection (RDMA requires
|
|
||||||
preallocated buffers to receive data). Each buffer is `rdma_max_msg` bytes
|
|
||||||
in size. So this setting directly affects memory usage: a single Vitastor
|
|
||||||
RDMA client uses `rdma_max_recv * rdma_max_msg * OSD_COUNT` bytes of memory.
|
|
||||||
Default is roughly 2 MB * number of OSDs.
|
|
||||||
info_ru: |
|
|
||||||
Максимальное число буферов для RDMA-приёма данных на одно соединение
|
|
||||||
(RDMA требует заранее выделенных буферов для приёма данных). Каждый буфер
|
|
||||||
имеет размер `rdma_max_msg` байт. Таким образом, настройка прямо влияет на
|
|
||||||
потребление памяти - один Vitastor-клиент с RDMA использует
|
|
||||||
`rdma_max_recv * rdma_max_msg * ЧИСЛО_OSD` байт памяти, по умолчанию -
|
|
||||||
примерно 2 МБ * число OSD.
|
|
||||||
- name: rdma_max_send
|
|
||||||
type: int
|
type: int
|
||||||
default: 8
|
default: 8
|
||||||
info: |
|
info: |
|
||||||
Maximum number of outstanding RDMA send operations per connection. Should be
|
Maximum number of parallel RDMA receive operations. Note that this number
|
||||||
less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
|
of receive buffers `rdma_max_msg` in size are allocated for each client,
|
||||||
Doesn't affect memory usage - additional memory isn't allocated for send
|
so this setting actually affects memory usage. This is because RDMA receive
|
||||||
operations.
|
operations are (sadly) still not zero-copy in Vitastor. It may be fixed in
|
||||||
|
later versions.
|
||||||
info_ru: |
|
info_ru: |
|
||||||
Максимальное число RDMA-операций отправки, отправляемых в очередь одного
|
Максимальное число параллельных RDMA-операций получения данных. Следует
|
||||||
соединения. Желательно, чтобы оно было меньше `rdma_max_recv`, чтобы
|
иметь в виду, что данное число буферов размером `rdma_max_msg` выделяется
|
||||||
у принимающей стороны в процессе работы не заканчивались буферы на приём.
|
для каждого подключённого клиентского соединения, так что данная настройка
|
||||||
Не влияет на потребление памяти - дополнительная память на операции отправки
|
влияет на потребление памяти. Это так потому, что RDMA-приём данных в
|
||||||
не выделяется.
|
Vitastor, увы, всё равно не является zero-copy, т.е. всё равно 1 раз
|
||||||
|
копирует данные в памяти. Данная особенность, возможно, будет исправлена в
|
||||||
|
более новых версиях Vitastor.
|
||||||
- name: peer_connect_interval
|
- name: peer_connect_interval
|
||||||
type: sec
|
type: sec
|
||||||
min: 1
|
min: 1
|
||||||
|
@@ -9,7 +9,7 @@
|
|||||||
## Debian
|
## Debian
|
||||||
|
|
||||||
- Trust Vitastor package signing key:
|
- Trust Vitastor package signing key:
|
||||||
`wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
|
`wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
|
||||||
- Add Vitastor package repository to your /etc/apt/sources.list:
|
- Add Vitastor package repository to your /etc/apt/sources.list:
|
||||||
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
|
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
|
||||||
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
|
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
|
||||||
@@ -20,19 +20,16 @@
|
|||||||
## CentOS
|
## CentOS
|
||||||
|
|
||||||
- Add Vitastor package repository:
|
- Add Vitastor package repository:
|
||||||
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release.rpm`
|
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm`
|
||||||
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release.rpm`
|
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm`
|
||||||
- AlmaLinux 9 and other RHEL 9 clones (Rocky, Oracle...): `dnf install https://vitastor.io/rpms/centos/9/vitastor-release.rpm`
|
|
||||||
- Enable EPEL: `yum/dnf install epel-release`
|
- Enable EPEL: `yum/dnf install epel-release`
|
||||||
- Enable additional CentOS repositories:
|
- Enable additional CentOS repositories:
|
||||||
- CentOS 7: `yum install centos-release-scl`
|
- CentOS 7: `yum install centos-release-scl`
|
||||||
- CentOS 8: `dnf install centos-release-advanced-virtualization`
|
- CentOS 8: `dnf install centos-release-advanced-virtualization`
|
||||||
- RHEL 9 clones: not required
|
|
||||||
- Enable elrepo-kernel:
|
- Enable elrepo-kernel:
|
||||||
- CentOS 7: `yum install https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm`
|
- CentOS 7: `yum install https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm`
|
||||||
- CentOS 8: `dnf install https://www.elrepo.org/elrepo-release-8.el8.elrepo.noarch.rpm`
|
- CentOS 8: `dnf install https://www.elrepo.org/elrepo-release-8.el8.elrepo.noarch.rpm`
|
||||||
- RHEL 9 clones: optional, not required: `dnf install https://www.elrepo.org/elrepo-release-9.el9.elrepo.noarch.rpm`
|
- Install packages: `yum/dnf install vitastor lpsolve etcd kernel-ml qemu-kvm`
|
||||||
- Install packages: `yum/dnf install vitastor lpsolve etcd qemu-kvm` and optionally `kernel-ml` if you use elrepo-kernel
|
|
||||||
|
|
||||||
## Installation requirements
|
## Installation requirements
|
||||||
|
|
||||||
|
@@ -9,7 +9,7 @@
|
|||||||
## Debian
|
## Debian
|
||||||
|
|
||||||
- Добавьте ключ репозитория Vitastor:
|
- Добавьте ключ репозитория Vitastor:
|
||||||
`wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
|
`wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
|
||||||
- Добавьте репозиторий Vitastor в /etc/apt/sources.list:
|
- Добавьте репозиторий Vitastor в /etc/apt/sources.list:
|
||||||
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
|
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
|
||||||
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
|
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
|
||||||
@@ -20,8 +20,8 @@
|
|||||||
## CentOS
|
## CentOS
|
||||||
|
|
||||||
- Добавьте в систему репозиторий Vitastor:
|
- Добавьте в систему репозиторий Vitastor:
|
||||||
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release.rpm`
|
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm`
|
||||||
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release.rpm`
|
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm`
|
||||||
- Включите EPEL: `yum/dnf install epel-release`
|
- Включите EPEL: `yum/dnf install epel-release`
|
||||||
- Включите дополнительные репозитории CentOS:
|
- Включите дополнительные репозитории CentOS:
|
||||||
- CentOS 7: `yum install centos-release-scl`
|
- CentOS 7: `yum install centos-release-scl`
|
||||||
|
@@ -6,10 +6,10 @@
|
|||||||
|
|
||||||
# Proxmox VE
|
# Proxmox VE
|
||||||
|
|
||||||
To enable Vitastor support in Proxmox Virtual Environment (6.4-7.4 are supported):
|
To enable Vitastor support in Proxmox Virtual Environment (6.4-7.3 are supported):
|
||||||
|
|
||||||
- Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts:
|
- Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts:
|
||||||
buster for 6.4, bullseye for 7.4, pve7.1 for 7.1, pve7.2 for 7.2, pve7.3 for 7.3
|
buster for 6.4, bullseye for 7.3, pve7.1 for 7.1, pve7.2 for 7.2
|
||||||
- Install vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* or see note) packages from Vitastor repository
|
- Install vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* or see note) packages from Vitastor repository
|
||||||
- Define storage in `/etc/pve/storage.cfg` (see below)
|
- Define storage in `/etc/pve/storage.cfg` (see below)
|
||||||
- Block network access from VMs to Vitastor network (to OSDs and etcd),
|
- Block network access from VMs to Vitastor network (to OSDs and etcd),
|
||||||
|
@@ -6,10 +6,10 @@
|
|||||||
|
|
||||||
# Proxmox
|
# Proxmox
|
||||||
|
|
||||||
Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-7.4):
|
Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-7.3):
|
||||||
|
|
||||||
- Добавьте соответствующий Debian-репозиторий Vitastor в sources.list на хостах Proxmox:
|
- Добавьте соответствующий Debian-репозиторий Vitastor в sources.list на хостах Proxmox:
|
||||||
buster для 6.4, bullseye для 7.4, pve7.1 для 7.1, pve7.2 для 7.2, pve7.3 для 7.3
|
buster для 6.4, bullseye для 7.3, pve7.1 для 7.1, pve7.2 для 7.2
|
||||||
- Установите пакеты vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* или см. сноску) из репозитория Vitastor
|
- Установите пакеты vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* или см. сноску) из репозитория Vitastor
|
||||||
- Определите тип хранилища в `/etc/pve/storage.cfg` (см. ниже)
|
- Определите тип хранилища в `/etc/pve/storage.cfg` (см. ниже)
|
||||||
- Обязательно заблокируйте доступ от виртуальных машин к сети Vitastor (OSD и etcd), т.к. Vitastor (пока) не поддерживает аутентификацию
|
- Обязательно заблокируйте доступ от виртуальных машин к сети Vitastor (OSD и etcd), т.к. Vitastor (пока) не поддерживает аутентификацию
|
||||||
|
@@ -45,9 +45,7 @@ On the monitor hosts:
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
- Initialize OSDs:
|
- Initialize OSDs:
|
||||||
- SSD-only: `vitastor-disk prepare /dev/sdXXX [/dev/sdYYY ...]`. You can add
|
- SSD-only: `vitastor-disk prepare /dev/sdXXX [/dev/sdYYY ...]`
|
||||||
`--disable_data_fsync off` to leave disk cache enabled if you use desktop
|
|
||||||
SSDs without capacitors.
|
|
||||||
- Hybrid, SSD+HDD: `vitastor-disk prepare --hybrid /dev/sdXXX [/dev/sdYYY ...]`.
|
- Hybrid, SSD+HDD: `vitastor-disk prepare --hybrid /dev/sdXXX [/dev/sdYYY ...]`.
|
||||||
Pass all your devices (HDD and SSD) to this script — it will partition disks and initialize journals on its own.
|
Pass all your devices (HDD and SSD) to this script — it will partition disks and initialize journals on its own.
|
||||||
This script skips HDDs which are already partitioned so if you want to use non-empty disks for
|
This script skips HDDs which are already partitioned so if you want to use non-empty disks for
|
||||||
@@ -55,9 +53,7 @@ On the monitor hosts:
|
|||||||
but some free unpartitioned space must be available because the script creates new partitions for journals.
|
but some free unpartitioned space must be available because the script creates new partitions for journals.
|
||||||
- You can change OSD configuration in units or in `vitastor.conf`.
|
- You can change OSD configuration in units or in `vitastor.conf`.
|
||||||
Check [Configuration Reference](../config.en.md) for parameter descriptions.
|
Check [Configuration Reference](../config.en.md) for parameter descriptions.
|
||||||
- If all your drives have capacitors, and even if not, but if you ran `vitastor-disk`
|
- If all your drives have capacitors, create global configuration in etcd: \
|
||||||
without `--disable_data_fsync off` at the first step, then put the following
|
|
||||||
setting into etcd: \
|
|
||||||
`etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`
|
`etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`
|
||||||
- Start all OSDs: `systemctl start vitastor.target`
|
- Start all OSDs: `systemctl start vitastor.target`
|
||||||
|
|
||||||
@@ -74,15 +70,11 @@ For EC pools the configuration should look like the following:
|
|||||||
|
|
||||||
```
|
```
|
||||||
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
|
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
|
||||||
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}}'
|
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}'
|
||||||
```
|
```
|
||||||
|
|
||||||
After you do this, one of the monitors will configure PGs and OSDs will start them.
|
After you do this, one of the monitors will configure PGs and OSDs will start them.
|
||||||
|
|
||||||
If you use HDDs you should also add `"block_size": 1048576` to pool configuration.
|
|
||||||
The other option is to add it into /vitastor/config/global, in this case it will
|
|
||||||
apply to all pools by default.
|
|
||||||
|
|
||||||
## Check cluster status
|
## Check cluster status
|
||||||
|
|
||||||
`vitastor-cli status`
|
`vitastor-cli status`
|
||||||
|
@@ -45,9 +45,7 @@
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
- Инициализуйте OSD:
|
- Инициализуйте OSD:
|
||||||
- SSD: `vitastor-disk prepare /dev/sdXXX [/dev/sdYYY ...]`. Если вы используете
|
- SSD: `vitastor-disk prepare /dev/sdXXX [/dev/sdYYY ...]`
|
||||||
десктопные SSD без конденсаторов, можете оставить кэш включённым, добавив
|
|
||||||
опцию `--disable_data_fsync off`.
|
|
||||||
- Гибридные, SSD+HDD: `vitastor-disk prepare --hybrid /dev/sdXXX [/dev/sdYYY ...]`.
|
- Гибридные, SSD+HDD: `vitastor-disk prepare --hybrid /dev/sdXXX [/dev/sdYYY ...]`.
|
||||||
Передайте все ваши SSD и HDD скрипту в командной строке подряд, скрипт автоматически выделит
|
Передайте все ваши SSD и HDD скрипту в командной строке подряд, скрипт автоматически выделит
|
||||||
разделы под журналы на SSD и данные на HDD. Скрипт пропускает HDD, на которых уже есть разделы
|
разделы под журналы на SSD и данные на HDD. Скрипт пропускает HDD, на которых уже есть разделы
|
||||||
@@ -56,11 +54,8 @@
|
|||||||
для журналов, на SSD должно быть доступно свободное нераспределённое место.
|
для журналов, на SSD должно быть доступно свободное нераспределённое место.
|
||||||
- Вы можете менять параметры OSD в юнитах systemd или в `vitastor.conf`. Описания параметров
|
- Вы можете менять параметры OSD в юнитах systemd или в `vitastor.conf`. Описания параметров
|
||||||
смотрите в [справке по конфигурации](../config.ru.md).
|
смотрите в [справке по конфигурации](../config.ru.md).
|
||||||
- Если все ваши диски - серверные с конденсаторами, и даже если нет, но при этом
|
- Если все ваши диски - серверные с конденсаторами, пропишите это в глобальную конфигурацию в etcd: \
|
||||||
вы не добавляли опцию `--disable_data_fsync off` на первом шаге, а `vitastor-disk`
|
`etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`
|
||||||
не ругался на невозможность отключения кэша дисков, пропишите следующую настройку
|
|
||||||
в глобальную конфигурацию в etcd: \
|
|
||||||
`etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`.
|
|
||||||
- Запустите все OSD: `systemctl start vitastor.target`
|
- Запустите все OSD: `systemctl start vitastor.target`
|
||||||
|
|
||||||
## Создайте пул
|
## Создайте пул
|
||||||
@@ -76,15 +71,11 @@ etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool",
|
|||||||
|
|
||||||
```
|
```
|
||||||
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
|
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
|
||||||
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}}'
|
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}'
|
||||||
```
|
```
|
||||||
|
|
||||||
После этого один из мониторов должен сконфигурировать PG, а OSD должны запустить их.
|
После этого один из мониторов должен сконфигурировать PG, а OSD должны запустить их.
|
||||||
|
|
||||||
Если вы используете HDD-диски, то добавьте в конфигурацию пулов опцию `"block_size": 1048576`.
|
|
||||||
Также эту опцию можно добавить в /vitastor/config/global, в этом случае она будет
|
|
||||||
применяться ко всем пулам по умолчанию.
|
|
||||||
|
|
||||||
## Проверьте состояние кластера
|
## Проверьте состояние кластера
|
||||||
|
|
||||||
`vitastor-cli status`
|
`vitastor-cli status`
|
||||||
|
@@ -35,24 +35,15 @@ Write amplification for 4 KB blocks is usually 3-5 in Vitastor:
|
|||||||
If you manage to get an SSD which handles 512 byte blocks well (Optane?) you may
|
If you manage to get an SSD which handles 512 byte blocks well (Optane?) you may
|
||||||
lower 1, 3 and 4 to 512 bytes (1/8 of data size) and get WA as low as 2.375.
|
lower 1, 3 and 4 to 512 bytes (1/8 of data size) and get WA as low as 2.375.
|
||||||
|
|
||||||
Implementing NVDIMM support could basically eliminate WA altogether - all extra writes would
|
|
||||||
go to DRAM memory. But this requires a test cluster with NVDIMM - please contact me
|
|
||||||
if you want to provide me with such cluster for tests.
|
|
||||||
|
|
||||||
Lazy fsync also reduces WA for parallel workloads because journal blocks are only
|
Lazy fsync also reduces WA for parallel workloads because journal blocks are only
|
||||||
written when they fill up or fsync is requested.
|
written when they fill up or fsync is requested.
|
||||||
|
|
||||||
## In Practice
|
## In Practice
|
||||||
|
|
||||||
In practice, using tests from [Understanding Performance](understanding.en.md), decent TCP network,
|
In practice, using tests from [Understanding Performance](understanding.en.md)
|
||||||
good server-grade SSD/NVMe drives and disabled CPU power saving, you should aim for:
|
and good server-grade SSD/NVMe drives, you should aim for:
|
||||||
- At least 5000 T1Q1 replicated read and write iops (maximum 0.2ms latency)
|
- At least 5000 T1Q1 replicated read and write iops (maximum 0.2ms latency)
|
||||||
- At least 5000 T1Q1 EC read IOPS and at least 2200 EC write IOPS (maximum 0.45ms latency)
|
|
||||||
- At least ~80k parallel read iops or ~30k write iops per 1 core (1 OSD)
|
- At least ~80k parallel read iops or ~30k write iops per 1 core (1 OSD)
|
||||||
- Disk-speed or wire-speed linear reads and writes, whichever is the bottleneck in your case
|
- Disk-speed or wire-speed linear reads and writes, whichever is the bottleneck in your case
|
||||||
|
|
||||||
Lower results may mean that you have bad drives, bad network or some kind of misconfiguration.
|
Lower results may mean that you have bad drives, bad network or some kind of misconfiguration.
|
||||||
|
|
||||||
Current latency records:
|
|
||||||
- 9668 T1Q1 replicated write iops (0.103 ms latency) with TCP and NVMe
|
|
||||||
- 9143 T1Q1 replicated read iops (0.109 ms latency) with TCP and NVMe
|
|
||||||
|
@@ -36,25 +36,6 @@ WA (мультипликатор записи) для 4 КБ блоков в Vit
|
|||||||
Если вы найдёте SSD, хорошо работающий с 512-байтными блоками данных (Optane?),
|
Если вы найдёте SSD, хорошо работающий с 512-байтными блоками данных (Optane?),
|
||||||
то 1, 3 и 4 можно снизить до 512 байт (1/8 от размера данных) и получить WA всего 2.375.
|
то 1, 3 и 4 можно снизить до 512 байт (1/8 от размера данных) и получить WA всего 2.375.
|
||||||
|
|
||||||
Если реализовать поддержку NVDIMM, то WA можно, условно говоря, ликвидировать вообще - все
|
|
||||||
дополнительные операции записи смогут обслуживаться DRAM памятью. Но для этого необходим
|
|
||||||
тестовый кластер с NVDIMM - пишите, если готовы предоставить такой для тестов.
|
|
||||||
|
|
||||||
Кроме того, WA снижается при использовании отложенного/ленивого сброса при параллельной
|
Кроме того, WA снижается при использовании отложенного/ленивого сброса при параллельной
|
||||||
нагрузке, т.к. блоки журнала записываются на диск только когда они заполняются или явным
|
нагрузке, т.к. блоки журнала записываются на диск только когда они заполняются или явным
|
||||||
образом запрашивается fsync.
|
образом запрашивается fsync.
|
||||||
|
|
||||||
## На практике
|
|
||||||
|
|
||||||
На практике, используя тесты fio со страницы [Понимание сути производительности систем хранения](understanding.ru.md),
|
|
||||||
нормальную TCP-сеть, хорошие серверные SSD/NVMe, при отключённом энергосбережении процессоров вы можете рассчитывать на:
|
|
||||||
- От 5000 IOPS в 1 поток (T1Q1) и на чтение, и на запись при использовании репликации (задержка до 0.2мс)
|
|
||||||
- От 5000 IOPS в 1 поток (T1Q1) на чтение и 2200 IOPS в 1 поток на запись при использовании EC (задержка до 0.45мс)
|
|
||||||
- От 80000 IOPS на чтение в параллельном режиме на 1 ядро, от 30000 IOPS на запись на 1 ядро (на 1 OSD)
|
|
||||||
- Скорость параллельного линейного чтения и записи, равная меньшему значению из скорости дисков или сети
|
|
||||||
|
|
||||||
Худшие результаты означают, что у вас либо медленные диски, либо медленная сеть, либо что-то неправильно настроено.
|
|
||||||
|
|
||||||
Зафиксированный на данный момент рекорд задержки:
|
|
||||||
- 9668 IOPS (0.103 мс задержка) в 1 поток (T1Q1) на запись с TCP и NVMe при использовании репликации
|
|
||||||
- 9143 IOPS (0.109 мс задержка) в 1 поток (T1Q1) на чтение с TCP и NVMe при использовании репликации
|
|
||||||
|
@@ -14,7 +14,6 @@ It supports the following commands:
|
|||||||
- [df](#df)
|
- [df](#df)
|
||||||
- [ls](#ls)
|
- [ls](#ls)
|
||||||
- [create](#create)
|
- [create](#create)
|
||||||
- [snap-create](#create)
|
|
||||||
- [modify](#modify)
|
- [modify](#modify)
|
||||||
- [rm](#rm)
|
- [rm](#rm)
|
||||||
- [flatten](#flatten)
|
- [flatten](#flatten)
|
||||||
@@ -124,8 +123,6 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
|
|||||||
|
|
||||||
Create a snapshot of image `<name>` (either form can be used). May be used live if only a single writer is active.
|
Create a snapshot of image `<name>` (either form can be used). May be used live if only a single writer is active.
|
||||||
|
|
||||||
See also about [how to export snapshots](qemu.en.md#exporting-snapshots).
|
|
||||||
|
|
||||||
## modify
|
## modify
|
||||||
|
|
||||||
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
|
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
|
||||||
|
@@ -15,7 +15,6 @@ vitastor-cli - интерфейс командной строки для адм
|
|||||||
- [df](#df)
|
- [df](#df)
|
||||||
- [ls](#ls)
|
- [ls](#ls)
|
||||||
- [create](#create)
|
- [create](#create)
|
||||||
- [snap-create](#create)
|
|
||||||
- [modify](#modify)
|
- [modify](#modify)
|
||||||
- [rm](#rm)
|
- [rm](#rm)
|
||||||
- [flatten](#flatten)
|
- [flatten](#flatten)
|
||||||
@@ -127,8 +126,6 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
|
|||||||
Создать снимок образа `<name>` (можно использовать любую форму команды). Снимок можно создавать без остановки
|
Создать снимок образа `<name>` (можно использовать любую форму команды). Снимок можно создавать без остановки
|
||||||
клиентов, если пишущий клиент максимум 1.
|
клиентов, если пишущий клиент максимум 1.
|
||||||
|
|
||||||
Смотрите также информацию о том, [как экспортировать снимки](qemu.ru.md#экспорт-снимков).
|
|
||||||
|
|
||||||
## modify
|
## modify
|
||||||
|
|
||||||
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
|
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
[Documentation](../../README.md#documentation) → Usage → Disk management tool
|
[Documentation](../../README.md#documentation) → Usage → Disk Tool
|
||||||
|
|
||||||
-----
|
-----
|
||||||
|
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
[Документация](../../README-ru.md#документация) → Использование → Инструмент управления дисками
|
[Документация](../../README-ru.md#документация) → Использование → Управление дисками
|
||||||
|
|
||||||
-----
|
-----
|
||||||
|
|
||||||
|
@@ -46,40 +46,3 @@ qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=192.168.7
|
|||||||
|
|
||||||
You can also specify `:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>`
|
You can also specify `:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>`
|
||||||
if you don't want to use inode metadata.
|
if you don't want to use inode metadata.
|
||||||
|
|
||||||
### Exporting snapshots
|
|
||||||
|
|
||||||
Starting with 0.8.4, you can also export individual layers (snapshot diffs) using `qemu-img`.
|
|
||||||
|
|
||||||
Suppose you have an image `testimg` and a snapshot `testimg@0` created with `vitastor-cli snap-create testimg@0`.
|
|
||||||
|
|
||||||
Then you can export the `testimg@0` snapshot and the data written to `testimg` after creating
|
|
||||||
the snapshot separately using the following commands (key points are using `skip-parents=1` and
|
|
||||||
`-B backing_file` option):
|
|
||||||
|
|
||||||
```
|
|
||||||
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg@0' \
|
|
||||||
-O qcow2 testimg_0.qcow2
|
|
||||||
|
|
||||||
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg:skip-parents=1' \
|
|
||||||
-O qcow2 -o 'cluster_size=4k' -B testimg_0.qcow2 testimg.qcow2
|
|
||||||
```
|
|
||||||
|
|
||||||
In fact, with `cluster_size=4k` any QCOW2 file can be used instead of `-B testimg_0.qcow2`, even an empty one.
|
|
||||||
|
|
||||||
QCOW2 `cluster_size=4k` option is required if you want `testimg.qcow2` to contain only the data
|
|
||||||
overwritten **exactly** in the child layer. With the default 64 KB QCOW2 cluster size you'll
|
|
||||||
get a bit of extra data from parent layers, i.e. a 4 KB overwrite will result in `testimg.qcow2`
|
|
||||||
containing 64 KB of data. And this extra data will be taken by `qemu-img` from the file passed
|
|
||||||
in `-B` option, so you really need 4 KB cluster if you use an empty image in `-B`.
|
|
||||||
|
|
||||||
After this procedure you'll get two chained QCOW2 images. To detach `testimg.qcow2` from
|
|
||||||
its parent, run:
|
|
||||||
|
|
||||||
```
|
|
||||||
qemu-img rebase -u -b '' testimg.qcow2
|
|
||||||
```
|
|
||||||
|
|
||||||
This can be used for backups. Just note that exporting an image that is currently being written to
|
|
||||||
is of course unsafe and doesn't produce a consistent result, so only export snapshots if you do this
|
|
||||||
on a live VM.
|
|
||||||
|
@@ -50,40 +50,3 @@ qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.
|
|||||||
|
|
||||||
Если вы не хотите обращаться к образу по имени, вместо `:image=<IMAGE>` можно указать номер пула, номер инода и размер:
|
Если вы не хотите обращаться к образу по имени, вместо `:image=<IMAGE>` можно указать номер пула, номер инода и размер:
|
||||||
`:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
|
`:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
|
||||||
|
|
||||||
### Экспорт снимков
|
|
||||||
|
|
||||||
Начиная с 0.8.4 вы можете экспортировать отдельные слои (изменения в снимках) с помощью `qemu-img`.
|
|
||||||
|
|
||||||
Допустим, что у вас есть образ `testimg` и его снимок `testimg@0`, созданный с помощью `vitastor-cli snap-create testimg@0`.
|
|
||||||
|
|
||||||
Тогда вы можете выгрузить снимок `testimg@0` и данные, изменённые в `testimg` после создания снимка, отдельно,
|
|
||||||
с помощью следующих команд (ключевые моменты - использование `skip-parents=1` и опции `-B backing_file.qcow2`):
|
|
||||||
|
|
||||||
```
|
|
||||||
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg@0' \
|
|
||||||
-O qcow2 testimg_0.qcow2
|
|
||||||
|
|
||||||
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg:skip-parents=1' \
|
|
||||||
-O qcow2 -o 'cluster_size=4k' -B testimg_0.qcow2 testimg.qcow2
|
|
||||||
```
|
|
||||||
|
|
||||||
На самом деле, с `cluster_size=4k` вместо `-B testimg_0.qcow2` можно использовать любой qcow2-файл,
|
|
||||||
даже пустой.
|
|
||||||
|
|
||||||
Опция QCOW2 `cluster_size=4k` нужна, если вы хотите, чтобы `testimg.qcow2` содержал **в точности**
|
|
||||||
данные, перезаписанные в дочернем слое. С размером кластера QCOW2 по умолчанию, составляющим 64 КБ,
|
|
||||||
вы получите немного "лишних" данных из родительских слоёв - перезапись 4 КБ будет приводить к тому,
|
|
||||||
что в `testimg.qcow2` будет появляться 64 КБ данных. Причём "лишние" данные qemu-img будет брать
|
|
||||||
как раз из файла, указанного в опции `-B`, так что если там указан пустой образ, кластер обязан быть 4 КБ.
|
|
||||||
|
|
||||||
После данной процедуры вы получите два QCOW2-образа, связанных в цепочку. Чтобы "отцепить" образ
|
|
||||||
`testimg.qcow2` от базового, выполните:
|
|
||||||
|
|
||||||
```
|
|
||||||
qemu-img rebase -u -b '' testimg.qcow2
|
|
||||||
```
|
|
||||||
|
|
||||||
Это можно использовать для резервного копирования. Только помните, что экспортировать образ, в который
|
|
||||||
в то же время идёт запись, небезопасно - результат чтения не будет целостным. Так что если вы работаете
|
|
||||||
с активными виртуальными машинами, экспортируйте только их снимки, но не сам образ.
|
|
||||||
|
2
json11
2
json11
Submodule json11 updated: fd37016cf8...52a3af664f
@@ -10,25 +10,18 @@ function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_p
|
|||||||
if (!new_pg_history[new_pg])
|
if (!new_pg_history[new_pg])
|
||||||
{
|
{
|
||||||
new_pg_history[new_pg] = {
|
new_pg_history[new_pg] = {
|
||||||
osd_set_epochs: {},
|
osd_sets: {},
|
||||||
all_peers: {},
|
all_peers: {},
|
||||||
epoch: 0,
|
epoch: 0,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
const nh = new_pg_history[new_pg], oh = prev_pg_history[old_pg];
|
const nh = new_pg_history[new_pg], oh = prev_pg_history[old_pg];
|
||||||
nh.osd_set_epochs[prev_pgs[old_pg].join(' ')] = { osd_set: prev_pgs[old_pg] };
|
nh.osd_sets[prev_pgs[old_pg].join(' ')] = prev_pgs[old_pg];
|
||||||
if (oh && oh.osd_sets && oh.osd_sets.length)
|
if (oh && oh.osd_sets && oh.osd_sets.length)
|
||||||
{
|
{
|
||||||
for (const pg of oh.osd_sets)
|
for (const pg of oh.osd_sets)
|
||||||
{
|
{
|
||||||
nh.osd_set_epochs[pg.join(' ')] = { osd_set: pg.map(osd_num => Number(osd_num)) };
|
nh.osd_sets[pg.join(' ')] = pg.map(osd_num => Number(osd_num));
|
||||||
}
|
|
||||||
}
|
|
||||||
if (oh && oh.osd_set_epochs && oh.osd_set_epochs.length)
|
|
||||||
{
|
|
||||||
for (const pg of oh.osd_set_epochs)
|
|
||||||
{
|
|
||||||
nh.osd_set_epochs[pg.osd_set.join(' ')] = { osd_set: pg.osd_set.map(osd_num => Number(osd_num)) };
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (oh && oh.all_peers && oh.all_peers.length)
|
if (oh && oh.all_peers && oh.all_peers.length)
|
||||||
@@ -46,7 +39,7 @@ function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_p
|
|||||||
|
|
||||||
function finish_pg_history(merged_history)
|
function finish_pg_history(merged_history)
|
||||||
{
|
{
|
||||||
merged_history.osd_set_epochs = Object.values(merged_history.osd_set_epochs);
|
merged_history.osd_sets = Object.values(merged_history.osd_sets);
|
||||||
merged_history.all_peers = Object.values(merged_history.all_peers);
|
merged_history.all_peers = Object.values(merged_history.all_peers);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -550,8 +550,8 @@ function random_combinations(osd_tree, pg_size, count, ordered)
|
|||||||
seed ^= seed << 5;
|
seed ^= seed << 5;
|
||||||
return seed + 2147483648;
|
return seed + 2147483648;
|
||||||
};
|
};
|
||||||
|
const hosts = Object.keys(osd_tree).sort();
|
||||||
const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
|
const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
|
||||||
const hosts = Object.keys(osd_tree).sort().filter(h => osds[h].length > 0);
|
|
||||||
const r = {};
|
const r = {};
|
||||||
// Generate random combinations including each OSD at least once
|
// Generate random combinations including each OSD at least once
|
||||||
for (let h = 0; h < hosts.length; h++)
|
for (let h = 0; h < hosts.length; h++)
|
||||||
|
@@ -79,7 +79,7 @@ StartLimitInterval=0
|
|||||||
RestartSec=10
|
RestartSec=10
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=local.target
|
||||||
`);
|
`);
|
||||||
await system(`useradd etcd`);
|
await system(`useradd etcd`);
|
||||||
await system(`systemctl daemon-reload`);
|
await system(`systemctl daemon-reload`);
|
||||||
|
47
mon/mon.js
47
mon/mon.js
@@ -51,9 +51,8 @@ const etcd_tree = {
|
|||||||
// THIS IS JUST A POOR MAN'S CONFIG DOCUMENTATION
|
// THIS IS JUST A POOR MAN'S CONFIG DOCUMENTATION
|
||||||
// etcd connection
|
// etcd connection
|
||||||
config_path: "/etc/vitastor/vitastor.conf",
|
config_path: "/etc/vitastor/vitastor.conf",
|
||||||
etcd_prefix: "/vitastor",
|
|
||||||
// etcd connection - configurable online
|
|
||||||
etcd_address: "10.0.115.10:2379/v3",
|
etcd_address: "10.0.115.10:2379/v3",
|
||||||
|
etcd_prefix: "/vitastor",
|
||||||
// mon
|
// mon
|
||||||
etcd_mon_ttl: 30, // min: 10
|
etcd_mon_ttl: 30, // min: 10
|
||||||
etcd_mon_timeout: 1000, // ms. min: 0
|
etcd_mon_timeout: 1000, // ms. min: 0
|
||||||
@@ -71,15 +70,14 @@ const etcd_tree = {
|
|||||||
rdma_gid_index: 0,
|
rdma_gid_index: 0,
|
||||||
rdma_mtu: 4096,
|
rdma_mtu: 4096,
|
||||||
rdma_max_sge: 128,
|
rdma_max_sge: 128,
|
||||||
rdma_max_send: 8,
|
rdma_max_send: 32,
|
||||||
rdma_max_recv: 16,
|
rdma_max_recv: 8,
|
||||||
rdma_max_msg: 132096,
|
rdma_max_msg: 1048576,
|
||||||
|
log_level: 0,
|
||||||
block_size: 131072,
|
block_size: 131072,
|
||||||
disk_alignment: 4096,
|
disk_alignment: 4096,
|
||||||
bitmap_granularity: 4096,
|
bitmap_granularity: 4096,
|
||||||
immediate_commit: false, // 'all' or 'small'
|
immediate_commit: false, // 'all' or 'small'
|
||||||
// client and osd - configurable online
|
|
||||||
log_level: 0,
|
|
||||||
client_dirty_limit: 33554432,
|
client_dirty_limit: 33554432,
|
||||||
peer_connect_interval: 5, // seconds. min: 1
|
peer_connect_interval: 5, // seconds. min: 1
|
||||||
peer_connect_timeout: 5, // seconds. min: 1
|
peer_connect_timeout: 5, // seconds. min: 1
|
||||||
@@ -97,19 +95,18 @@ const etcd_tree = {
|
|||||||
osd_network: null, // "192.168.7.0/24" or an array of masks
|
osd_network: null, // "192.168.7.0/24" or an array of masks
|
||||||
bind_address: "0.0.0.0",
|
bind_address: "0.0.0.0",
|
||||||
bind_port: 0,
|
bind_port: 0,
|
||||||
readonly: false,
|
|
||||||
osd_memlock: false,
|
|
||||||
// osd - configurable online
|
|
||||||
autosync_interval: 5,
|
autosync_interval: 5,
|
||||||
autosync_writes: 128,
|
autosync_writes: 128,
|
||||||
client_queue_depth: 128, // unused
|
client_queue_depth: 128, // unused
|
||||||
recovery_queue_depth: 4,
|
recovery_queue_depth: 4,
|
||||||
recovery_sync_batch: 16,
|
recovery_sync_batch: 16,
|
||||||
|
readonly: false,
|
||||||
no_recovery: false,
|
no_recovery: false,
|
||||||
no_rebalance: false,
|
no_rebalance: false,
|
||||||
print_stats_interval: 3,
|
print_stats_interval: 3,
|
||||||
slow_log_interval: 10,
|
slow_log_interval: 10,
|
||||||
inode_vanish_time: 60,
|
inode_vanish_time: 60,
|
||||||
|
osd_memlock: false,
|
||||||
// blockstore - fixed in superblock
|
// blockstore - fixed in superblock
|
||||||
block_size,
|
block_size,
|
||||||
disk_alignment,
|
disk_alignment,
|
||||||
@@ -128,15 +125,14 @@ const etcd_tree = {
|
|||||||
meta_offset,
|
meta_offset,
|
||||||
disable_meta_fsync,
|
disable_meta_fsync,
|
||||||
disable_device_lock,
|
disable_device_lock,
|
||||||
// blockstore - configurable offline
|
// blockstore - configurable
|
||||||
|
max_write_iodepth,
|
||||||
|
min_flusher_count: 1,
|
||||||
|
max_flusher_count: 256,
|
||||||
inmemory_metadata,
|
inmemory_metadata,
|
||||||
inmemory_journal,
|
inmemory_journal,
|
||||||
journal_sector_buffer_count,
|
journal_sector_buffer_count,
|
||||||
journal_no_same_sector_overwrites,
|
journal_no_same_sector_overwrites,
|
||||||
// blockstore - configurable online
|
|
||||||
max_write_iodepth,
|
|
||||||
min_flusher_count: 1,
|
|
||||||
max_flusher_count: 256,
|
|
||||||
throttle_small_writes: false,
|
throttle_small_writes: false,
|
||||||
throttle_target_iops: 100,
|
throttle_target_iops: 100,
|
||||||
throttle_target_mbs: 100,
|
throttle_target_mbs: 100,
|
||||||
@@ -265,7 +261,7 @@ const etcd_tree = {
|
|||||||
/* <pool_id>: {
|
/* <pool_id>: {
|
||||||
<pg_id>: {
|
<pg_id>: {
|
||||||
primary: osd_num_t,
|
primary: osd_num_t,
|
||||||
state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
|
state: ("starting"|"peering"|"peered"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
|
||||||
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
|
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
|
||||||
"has_invalid"|"left_on_dead")[],
|
"has_invalid"|"left_on_dead")[],
|
||||||
}
|
}
|
||||||
@@ -286,12 +282,7 @@ const etcd_tree = {
|
|||||||
history: {
|
history: {
|
||||||
/* <pool_id>: {
|
/* <pool_id>: {
|
||||||
<pg_id>: {
|
<pg_id>: {
|
||||||
osd_set_epochs: {
|
osd_sets: osd_num_t[][],
|
||||||
osd_set: osd_num_t[],
|
|
||||||
min_epoch: uint64_t,
|
|
||||||
max_epoch: uint64_t,
|
|
||||||
}[],
|
|
||||||
osd_sets: osd_num_t[][], // outdated
|
|
||||||
all_peers: osd_num_t[],
|
all_peers: osd_num_t[],
|
||||||
epoch: uint64_t,
|
epoch: uint64_t,
|
||||||
},
|
},
|
||||||
@@ -973,6 +964,18 @@ class Mon
|
|||||||
osd_set,
|
osd_set,
|
||||||
primary: this.pick_primary(pool_id, osd_set, up_osds, aff_osds),
|
primary: this.pick_primary(pool_id, osd_set, up_osds, aff_osds),
|
||||||
};
|
};
|
||||||
|
if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' ') &&
|
||||||
|
prev_pgs[i].filter(osd_num => osd_num).length > 0)
|
||||||
|
{
|
||||||
|
pg_history[i] = pg_history[i] || {};
|
||||||
|
pg_history[i].osd_sets = pg_history[i].osd_sets || [];
|
||||||
|
pg_history[i].osd_sets.push(prev_pgs[i]);
|
||||||
|
}
|
||||||
|
if (pg_history[i] && pg_history[i].osd_sets)
|
||||||
|
{
|
||||||
|
pg_history[i].osd_sets = Object.values(pg_history[i].osd_sets
|
||||||
|
.reduce((a, c) => { a[c.join(' ')] = c; return a; }, {}));
|
||||||
|
}
|
||||||
});
|
});
|
||||||
for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++)
|
for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++)
|
||||||
{
|
{
|
||||||
|
@@ -15,4 +15,4 @@ StartLimitInterval=0
|
|||||||
RestartSec=10
|
RestartSec=10
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=vitastor.target
|
||||||
|
@@ -16,11 +16,6 @@ use PVE::Tools qw(run_command);
|
|||||||
|
|
||||||
use base qw(PVE::Storage::Plugin);
|
use base qw(PVE::Storage::Plugin);
|
||||||
|
|
||||||
if (@PVE::Storage::Plugin::SHARED_STORAGE)
|
|
||||||
{
|
|
||||||
push @PVE::Storage::Plugin::SHARED_STORAGE, 'vitastor';
|
|
||||||
}
|
|
||||||
|
|
||||||
sub api
|
sub api
|
||||||
{
|
{
|
||||||
# Trick it :)
|
# Trick it :)
|
||||||
@@ -138,11 +133,9 @@ sub properties
|
|||||||
sub options
|
sub options
|
||||||
{
|
{
|
||||||
return {
|
return {
|
||||||
shared => { optional => 1 },
|
|
||||||
content => { optional => 1 },
|
|
||||||
nodes => { optional => 1 },
|
nodes => { optional => 1 },
|
||||||
disable => { optional => 1 },
|
disable => { optional => 1 },
|
||||||
vitastor_etcd_address => { optional => 1 },
|
vitastor_etcd_address => { optional => 1},
|
||||||
vitastor_etcd_prefix => { optional => 1 },
|
vitastor_etcd_prefix => { optional => 1 },
|
||||||
vitastor_config_path => { optional => 1 },
|
vitastor_config_path => { optional => 1 },
|
||||||
vitastor_prefix => { optional => 1 },
|
vitastor_prefix => { optional => 1 },
|
||||||
|
@@ -50,7 +50,7 @@ from cinder.volume import configuration
|
|||||||
from cinder.volume import driver
|
from cinder.volume import driver
|
||||||
from cinder.volume import volume_utils
|
from cinder.volume import volume_utils
|
||||||
|
|
||||||
VERSION = '0.8.8'
|
VERSION = '0.8.3'
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@@ -1,169 +0,0 @@
|
|||||||
Index: pve-qemu-kvm-7.2.0/block/meson.build
|
|
||||||
===================================================================
|
|
||||||
--- pve-qemu-kvm-7.2.0.orig/block/meson.build
|
|
||||||
+++ pve-qemu-kvm-7.2.0/block/meson.build
|
|
||||||
@@ -113,6 +113,7 @@ foreach m : [
|
|
||||||
[libnfs, 'nfs', files('nfs.c')],
|
|
||||||
[libssh, 'ssh', files('ssh.c')],
|
|
||||||
[rbd, 'rbd', files('rbd.c')],
|
|
||||||
+ [vitastor, 'vitastor', files('vitastor.c')],
|
|
||||||
]
|
|
||||||
if m[0].found()
|
|
||||||
module_ss = ss.source_set()
|
|
||||||
Index: pve-qemu-kvm-7.2.0/meson.build
|
|
||||||
===================================================================
|
|
||||||
--- pve-qemu-kvm-7.2.0.orig/meson.build
|
|
||||||
+++ pve-qemu-kvm-7.2.0/meson.build
|
|
||||||
@@ -1026,6 +1026,26 @@ if not get_option('rbd').auto() or have_
|
|
||||||
endif
|
|
||||||
endif
|
|
||||||
|
|
||||||
+vitastor = not_found
|
|
||||||
+if not get_option('vitastor').auto() or have_block
|
|
||||||
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
|
||||||
+ required: get_option('vitastor'), kwargs: static_kwargs)
|
|
||||||
+ if libvitastor_client.found()
|
|
||||||
+ if cc.links('''
|
|
||||||
+ #include <vitastor_c.h>
|
|
||||||
+ int main(void) {
|
|
||||||
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
|
||||||
+ return 0;
|
|
||||||
+ }''', dependencies: libvitastor_client)
|
|
||||||
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
|
||||||
+ elif get_option('vitastor').enabled()
|
|
||||||
+ error('could not link libvitastor_client')
|
|
||||||
+ else
|
|
||||||
+ warning('could not link libvitastor_client, disabling')
|
|
||||||
+ endif
|
|
||||||
+ endif
|
|
||||||
+endif
|
|
||||||
+
|
|
||||||
glusterfs = not_found
|
|
||||||
glusterfs_ftruncate_has_stat = false
|
|
||||||
glusterfs_iocb_has_stat = false
|
|
||||||
@@ -1865,6 +1885,7 @@ config_host_data.set('CONFIG_NUMA', numa
|
|
||||||
config_host_data.set('CONFIG_OPENGL', opengl.found())
|
|
||||||
config_host_data.set('CONFIG_PROFILER', get_option('profiler'))
|
|
||||||
config_host_data.set('CONFIG_RBD', rbd.found())
|
|
||||||
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
|
||||||
config_host_data.set('CONFIG_RDMA', rdma.found())
|
|
||||||
config_host_data.set('CONFIG_SDL', sdl.found())
|
|
||||||
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
|
|
||||||
@@ -3957,6 +3978,7 @@ if spice_protocol.found()
|
|
||||||
summary_info += {' spice server support': spice}
|
|
||||||
endif
|
|
||||||
summary_info += {'rbd support': rbd}
|
|
||||||
+summary_info += {'vitastor support': vitastor}
|
|
||||||
summary_info += {'smartcard support': cacard}
|
|
||||||
summary_info += {'U2F support': u2f}
|
|
||||||
summary_info += {'libusb': libusb}
|
|
||||||
Index: pve-qemu-kvm-7.2.0/meson_options.txt
|
|
||||||
===================================================================
|
|
||||||
--- pve-qemu-kvm-7.2.0.orig/meson_options.txt
|
|
||||||
+++ pve-qemu-kvm-7.2.0/meson_options.txt
|
|
||||||
@@ -169,6 +169,8 @@ option('lzo', type : 'feature', value :
|
|
||||||
description: 'lzo compression support')
|
|
||||||
option('rbd', type : 'feature', value : 'auto',
|
|
||||||
description: 'Ceph block device driver')
|
|
||||||
+option('vitastor', type : 'feature', value : 'auto',
|
|
||||||
+ description: 'Vitastor block device driver')
|
|
||||||
option('opengl', type : 'feature', value : 'auto',
|
|
||||||
description: 'OpenGL support')
|
|
||||||
option('rdma', type : 'feature', value : 'auto',
|
|
||||||
Index: pve-qemu-kvm-7.2.0/qapi/block-core.json
|
|
||||||
===================================================================
|
|
||||||
--- pve-qemu-kvm-7.2.0.orig/qapi/block-core.json
|
|
||||||
+++ pve-qemu-kvm-7.2.0/qapi/block-core.json
|
|
||||||
@@ -3213,7 +3213,7 @@
|
|
||||||
'raw', 'rbd',
|
|
||||||
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
|
||||||
'pbs',
|
|
||||||
- 'ssh', 'throttle', 'vdi', 'vhdx',
|
|
||||||
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
|
|
||||||
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
|
|
||||||
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
|
|
||||||
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
|
|
||||||
@@ -4223,6 +4223,28 @@
|
|
||||||
'*server': ['InetSocketAddressBase'] } }
|
|
||||||
|
|
||||||
##
|
|
||||||
+# @BlockdevOptionsVitastor:
|
|
||||||
+#
|
|
||||||
+# Driver specific block device options for vitastor
|
|
||||||
+#
|
|
||||||
+# @image: Image name
|
|
||||||
+# @inode: Inode number
|
|
||||||
+# @pool: Pool ID
|
|
||||||
+# @size: Desired image size in bytes
|
|
||||||
+# @config-path: Path to Vitastor configuration
|
|
||||||
+# @etcd-host: etcd connection address(es)
|
|
||||||
+# @etcd-prefix: etcd key/value prefix
|
|
||||||
+##
|
|
||||||
+{ 'struct': 'BlockdevOptionsVitastor',
|
|
||||||
+ 'data': { '*inode': 'uint64',
|
|
||||||
+ '*pool': 'uint64',
|
|
||||||
+ '*size': 'uint64',
|
|
||||||
+ '*image': 'str',
|
|
||||||
+ '*config-path': 'str',
|
|
||||||
+ '*etcd-host': 'str',
|
|
||||||
+ '*etcd-prefix': 'str' } }
|
|
||||||
+
|
|
||||||
+##
|
|
||||||
# @ReplicationMode:
|
|
||||||
#
|
|
||||||
# An enumeration of replication modes.
|
|
||||||
@@ -4671,6 +4693,7 @@
|
|
||||||
'throttle': 'BlockdevOptionsThrottle',
|
|
||||||
'vdi': 'BlockdevOptionsGenericFormat',
|
|
||||||
'vhdx': 'BlockdevOptionsGenericFormat',
|
|
||||||
+ 'vitastor': 'BlockdevOptionsVitastor',
|
|
||||||
'virtio-blk-vfio-pci':
|
|
||||||
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
|
|
||||||
'if': 'CONFIG_BLKIO' },
|
|
||||||
@@ -5072,6 +5095,17 @@
|
|
||||||
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
|
||||||
|
|
||||||
##
|
|
||||||
+# @BlockdevCreateOptionsVitastor:
|
|
||||||
+#
|
|
||||||
+# Driver specific image creation options for Vitastor.
|
|
||||||
+#
|
|
||||||
+# @size: Size of the virtual disk in bytes
|
|
||||||
+##
|
|
||||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
|
||||||
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
|
||||||
+ 'size': 'size' } }
|
|
||||||
+
|
|
||||||
+##
|
|
||||||
# @BlockdevVmdkSubformat:
|
|
||||||
#
|
|
||||||
# Subformat options for VMDK images
|
|
||||||
@@ -5269,6 +5303,7 @@
|
|
||||||
'ssh': 'BlockdevCreateOptionsSsh',
|
|
||||||
'vdi': 'BlockdevCreateOptionsVdi',
|
|
||||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
|
||||||
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
|
||||||
'vmdk': 'BlockdevCreateOptionsVmdk',
|
|
||||||
'vpc': 'BlockdevCreateOptionsVpc'
|
|
||||||
} }
|
|
||||||
Index: pve-qemu-kvm-7.2.0/scripts/ci/org.centos/stream/8/x86_64/configure
|
|
||||||
===================================================================
|
|
||||||
--- pve-qemu-kvm-7.2.0.orig/scripts/ci/org.centos/stream/8/x86_64/configure
|
|
||||||
+++ pve-qemu-kvm-7.2.0/scripts/ci/org.centos/stream/8/x86_64/configure
|
|
||||||
@@ -31,7 +31,7 @@
|
|
||||||
--with-git=meson \
|
|
||||||
--with-git-submodules=update \
|
|
||||||
--target-list="x86_64-softmmu" \
|
|
||||||
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
|
||||||
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
|
||||||
--audio-drv-list="" \
|
|
||||||
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
|
|
||||||
--with-coroutine=ucontext \
|
|
||||||
@@ -179,6 +179,7 @@
|
|
||||||
--enable-opengl \
|
|
||||||
--enable-pie \
|
|
||||||
--enable-rbd \
|
|
||||||
+--enable-vitastor \
|
|
||||||
--enable-rdma \
|
|
||||||
--enable-seccomp \
|
|
||||||
--enable-snappy \
|
|
@@ -1,169 +0,0 @@
|
|||||||
diff --git a/block/meson.build b/block/meson.build
|
|
||||||
index deb73ca389..e269f599a1 100644
|
|
||||||
--- a/block/meson.build
|
|
||||||
+++ b/block/meson.build
|
|
||||||
@@ -78,6 +78,7 @@ foreach m : [
|
|
||||||
[libnfs, 'nfs', files('nfs.c')],
|
|
||||||
[libssh, 'ssh', files('ssh.c')],
|
|
||||||
[rbd, 'rbd', files('rbd.c')],
|
|
||||||
+ [vitastor, 'vitastor', files('vitastor.c')],
|
|
||||||
]
|
|
||||||
if m[0].found()
|
|
||||||
module_ss = ss.source_set()
|
|
||||||
diff --git a/meson.build b/meson.build
|
|
||||||
index 96de1a6ef9..2e3994777d 100644
|
|
||||||
--- a/meson.build
|
|
||||||
+++ b/meson.build
|
|
||||||
@@ -838,6 +838,26 @@ if not get_option('rbd').auto() or have_block
|
|
||||||
endif
|
|
||||||
endif
|
|
||||||
|
|
||||||
+vitastor = not_found
|
|
||||||
+if not get_option('vitastor').auto() or have_block
|
|
||||||
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
|
||||||
+ required: get_option('vitastor'), kwargs: static_kwargs)
|
|
||||||
+ if libvitastor_client.found()
|
|
||||||
+ if cc.links('''
|
|
||||||
+ #include <vitastor_c.h>
|
|
||||||
+ int main(void) {
|
|
||||||
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
|
||||||
+ return 0;
|
|
||||||
+ }''', dependencies: libvitastor_client)
|
|
||||||
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
|
||||||
+ elif get_option('vitastor').enabled()
|
|
||||||
+ error('could not link libvitastor_client')
|
|
||||||
+ else
|
|
||||||
+ warning('could not link libvitastor_client, disabling')
|
|
||||||
+ endif
|
|
||||||
+ endif
|
|
||||||
+endif
|
|
||||||
+
|
|
||||||
glusterfs = not_found
|
|
||||||
glusterfs_ftruncate_has_stat = false
|
|
||||||
glusterfs_iocb_has_stat = false
|
|
||||||
@@ -1455,6 +1475,7 @@ config_host_data.set('CONFIG_LINUX_AIO', libaio.found())
|
|
||||||
config_host_data.set('CONFIG_LINUX_IO_URING', linux_io_uring.found())
|
|
||||||
config_host_data.set('CONFIG_LIBPMEM', libpmem.found())
|
|
||||||
config_host_data.set('CONFIG_RBD', rbd.found())
|
|
||||||
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
|
||||||
config_host_data.set('CONFIG_SDL', sdl.found())
|
|
||||||
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
|
|
||||||
config_host_data.set('CONFIG_SECCOMP', seccomp.found())
|
|
||||||
@@ -3412,6 +3433,7 @@ if spice_protocol.found()
|
|
||||||
summary_info += {' spice server support': spice}
|
|
||||||
endif
|
|
||||||
summary_info += {'rbd support': rbd}
|
|
||||||
+summary_info += {'vitastor support': vitastor}
|
|
||||||
summary_info += {'xfsctl support': config_host.has_key('CONFIG_XFS')}
|
|
||||||
summary_info += {'smartcard support': cacard}
|
|
||||||
summary_info += {'U2F support': u2f}
|
|
||||||
diff --git a/meson_options.txt b/meson_options.txt
|
|
||||||
index e392323732..5b56007475 100644
|
|
||||||
--- a/meson_options.txt
|
|
||||||
+++ b/meson_options.txt
|
|
||||||
@@ -121,6 +121,8 @@ option('lzo', type : 'feature', value : 'auto',
|
|
||||||
description: 'lzo compression support')
|
|
||||||
option('rbd', type : 'feature', value : 'auto',
|
|
||||||
description: 'Ceph block device driver')
|
|
||||||
+option('vitastor', type : 'feature', value : 'auto',
|
|
||||||
+ description: 'Vitastor block device driver')
|
|
||||||
option('gtk', type : 'feature', value : 'auto',
|
|
||||||
description: 'GTK+ user interface')
|
|
||||||
option('sdl', type : 'feature', value : 'auto',
|
|
||||||
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
|
||||||
index 1d3dd9cb48..88453405e5 100644
|
|
||||||
--- a/qapi/block-core.json
|
|
||||||
+++ b/qapi/block-core.json
|
|
||||||
@@ -2930,7 +2930,7 @@
|
|
||||||
'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
|
|
||||||
'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
|
|
||||||
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
|
||||||
- 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
|
|
||||||
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor', 'vmdk', 'vpc', 'vvfat' ] }
|
|
||||||
|
|
||||||
##
|
|
||||||
# @BlockdevOptionsFile:
|
|
||||||
@@ -3864,6 +3864,28 @@
|
|
||||||
'*key-secret': 'str',
|
|
||||||
'*server': ['InetSocketAddressBase'] } }
|
|
||||||
|
|
||||||
+##
|
|
||||||
+# @BlockdevOptionsVitastor:
|
|
||||||
+#
|
|
||||||
+# Driver specific block device options for vitastor
|
|
||||||
+#
|
|
||||||
+# @image: Image name
|
|
||||||
+# @inode: Inode number
|
|
||||||
+# @pool: Pool ID
|
|
||||||
+# @size: Desired image size in bytes
|
|
||||||
+# @config-path: Path to Vitastor configuration
|
|
||||||
+# @etcd-host: etcd connection address(es)
|
|
||||||
+# @etcd-prefix: etcd key/value prefix
|
|
||||||
+##
|
|
||||||
+{ 'struct': 'BlockdevOptionsVitastor',
|
|
||||||
+ 'data': { '*inode': 'uint64',
|
|
||||||
+ '*pool': 'uint64',
|
|
||||||
+ '*size': 'uint64',
|
|
||||||
+ '*image': 'str',
|
|
||||||
+ '*config-path': 'str',
|
|
||||||
+ '*etcd-host': 'str',
|
|
||||||
+ '*etcd-prefix': 'str' } }
|
|
||||||
+
|
|
||||||
##
|
|
||||||
# @ReplicationMode:
|
|
||||||
#
|
|
||||||
@@ -4259,6 +4281,7 @@
|
|
||||||
'throttle': 'BlockdevOptionsThrottle',
|
|
||||||
'vdi': 'BlockdevOptionsGenericFormat',
|
|
||||||
'vhdx': 'BlockdevOptionsGenericFormat',
|
|
||||||
+ 'vitastor': 'BlockdevOptionsVitastor',
|
|
||||||
'vmdk': 'BlockdevOptionsGenericCOWFormat',
|
|
||||||
'vpc': 'BlockdevOptionsGenericFormat',
|
|
||||||
'vvfat': 'BlockdevOptionsVVFAT'
|
|
||||||
@@ -4647,6 +4670,17 @@
|
|
||||||
'*cluster-size' : 'size',
|
|
||||||
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
|
||||||
|
|
||||||
+##
|
|
||||||
+# @BlockdevCreateOptionsVitastor:
|
|
||||||
+#
|
|
||||||
+# Driver specific image creation options for Vitastor.
|
|
||||||
+#
|
|
||||||
+# @size: Size of the virtual disk in bytes
|
|
||||||
+##
|
|
||||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
|
||||||
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
|
||||||
+ 'size': 'size' } }
|
|
||||||
+
|
|
||||||
##
|
|
||||||
# @BlockdevVmdkSubformat:
|
|
||||||
#
|
|
||||||
@@ -4846,6 +4880,7 @@
|
|
||||||
'ssh': 'BlockdevCreateOptionsSsh',
|
|
||||||
'vdi': 'BlockdevCreateOptionsVdi',
|
|
||||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
|
||||||
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
|
||||||
'vmdk': 'BlockdevCreateOptionsVmdk',
|
|
||||||
'vpc': 'BlockdevCreateOptionsVpc'
|
|
||||||
} }
|
|
||||||
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
|
|
||||||
index 7a17ff4218..cdddbf32aa 100644
|
|
||||||
--- a/scripts/meson-buildoptions.sh
|
|
||||||
+++ b/scripts/meson-buildoptions.sh
|
|
||||||
@@ -69,6 +69,7 @@ meson_options_help() {
|
|
||||||
printf "%s\n" ' oss OSS sound support'
|
|
||||||
printf "%s\n" ' pa PulseAudio sound support'
|
|
||||||
printf "%s\n" ' rbd Ceph block device driver'
|
|
||||||
+ printf "%s\n" ' vitastor Vitastor block device driver'
|
|
||||||
printf "%s\n" ' sdl SDL user interface'
|
|
||||||
printf "%s\n" ' sdl-image SDL Image support for icons'
|
|
||||||
printf "%s\n" ' seccomp seccomp support'
|
|
||||||
@@ -210,6 +211,8 @@ _meson_option_parse() {
|
|
||||||
--disable-pa) printf "%s" -Dpa=disabled ;;
|
|
||||||
--enable-rbd) printf "%s" -Drbd=enabled ;;
|
|
||||||
--disable-rbd) printf "%s" -Drbd=disabled ;;
|
|
||||||
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
|
|
||||||
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
|
|
||||||
--enable-sdl) printf "%s" -Dsdl=enabled ;;
|
|
||||||
--disable-sdl) printf "%s" -Dsdl=disabled ;;
|
|
||||||
--enable-sdl-image) printf "%s" -Dsdl_image=enabled ;;
|
|
@@ -1,190 +0,0 @@
|
|||||||
diff --git a/block/meson.build b/block/meson.build
|
|
||||||
index 0b2a60c99b..d923713804 100644
|
|
||||||
--- a/block/meson.build
|
|
||||||
+++ b/block/meson.build
|
|
||||||
@@ -98,6 +98,7 @@ foreach m : [
|
|
||||||
[libnfs, 'nfs', files('nfs.c')],
|
|
||||||
[libssh, 'ssh', files('ssh.c')],
|
|
||||||
[rbd, 'rbd', files('rbd.c')],
|
|
||||||
+ [vitastor, 'vitastor', files('vitastor.c')],
|
|
||||||
]
|
|
||||||
if m[0].found()
|
|
||||||
module_ss = ss.source_set()
|
|
||||||
diff --git a/meson.build b/meson.build
|
|
||||||
index 861de93c4f..272f72af11 100644
|
|
||||||
--- a/meson.build
|
|
||||||
+++ b/meson.build
|
|
||||||
@@ -884,6 +884,26 @@ if not get_option('rbd').auto() or have_block
|
|
||||||
endif
|
|
||||||
endif
|
|
||||||
|
|
||||||
+vitastor = not_found
|
|
||||||
+if not get_option('vitastor').auto() or have_block
|
|
||||||
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
|
||||||
+ required: get_option('vitastor'), kwargs: static_kwargs)
|
|
||||||
+ if libvitastor_client.found()
|
|
||||||
+ if cc.links('''
|
|
||||||
+ #include <vitastor_c.h>
|
|
||||||
+ int main(void) {
|
|
||||||
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
|
||||||
+ return 0;
|
|
||||||
+ }''', dependencies: libvitastor_client)
|
|
||||||
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
|
||||||
+ elif get_option('vitastor').enabled()
|
|
||||||
+ error('could not link libvitastor_client')
|
|
||||||
+ else
|
|
||||||
+ warning('could not link libvitastor_client, disabling')
|
|
||||||
+ endif
|
|
||||||
+ endif
|
|
||||||
+endif
|
|
||||||
+
|
|
||||||
glusterfs = not_found
|
|
||||||
glusterfs_ftruncate_has_stat = false
|
|
||||||
glusterfs_iocb_has_stat = false
|
|
||||||
@@ -1546,6 +1566,7 @@ config_host_data.set('CONFIG_LIBPMEM', libpmem.found())
|
|
||||||
config_host_data.set('CONFIG_NUMA', numa.found())
|
|
||||||
config_host_data.set('CONFIG_PROFILER', get_option('profiler'))
|
|
||||||
config_host_data.set('CONFIG_RBD', rbd.found())
|
|
||||||
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
|
||||||
config_host_data.set('CONFIG_SDL', sdl.found())
|
|
||||||
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
|
|
||||||
config_host_data.set('CONFIG_SECCOMP', seccomp.found())
|
|
||||||
@@ -3709,6 +3730,7 @@ if spice_protocol.found()
|
|
||||||
summary_info += {' spice server support': spice}
|
|
||||||
endif
|
|
||||||
summary_info += {'rbd support': rbd}
|
|
||||||
+summary_info += {'vitastor support': vitastor}
|
|
||||||
summary_info += {'smartcard support': cacard}
|
|
||||||
summary_info += {'U2F support': u2f}
|
|
||||||
summary_info += {'libusb': libusb}
|
|
||||||
diff --git a/meson_options.txt b/meson_options.txt
|
|
||||||
index 52b11cead4..d8d0868174 100644
|
|
||||||
--- a/meson_options.txt
|
|
||||||
+++ b/meson_options.txt
|
|
||||||
@@ -149,6 +149,8 @@ option('lzo', type : 'feature', value : 'auto',
|
|
||||||
description: 'lzo compression support')
|
|
||||||
option('rbd', type : 'feature', value : 'auto',
|
|
||||||
description: 'Ceph block device driver')
|
|
||||||
+option('vitastor', type : 'feature', value : 'auto',
|
|
||||||
+ description: 'Vitastor block device driver')
|
|
||||||
option('gtk', type : 'feature', value : 'auto',
|
|
||||||
description: 'GTK+ user interface')
|
|
||||||
option('sdl', type : 'feature', value : 'auto',
|
|
||||||
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
|
||||||
index beeb91952a..1c98dc0e12 100644
|
|
||||||
--- a/qapi/block-core.json
|
|
||||||
+++ b/qapi/block-core.json
|
|
||||||
@@ -2929,7 +2929,7 @@
|
|
||||||
'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
|
|
||||||
'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
|
|
||||||
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
|
||||||
- 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
|
|
||||||
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor', 'vmdk', 'vpc', 'vvfat' ] }
|
|
||||||
|
|
||||||
##
|
|
||||||
# @BlockdevOptionsFile:
|
|
||||||
@@ -3863,6 +3863,28 @@
|
|
||||||
'*key-secret': 'str',
|
|
||||||
'*server': ['InetSocketAddressBase'] } }
|
|
||||||
|
|
||||||
+##
|
|
||||||
+# @BlockdevOptionsVitastor:
|
|
||||||
+#
|
|
||||||
+# Driver specific block device options for vitastor
|
|
||||||
+#
|
|
||||||
+# @image: Image name
|
|
||||||
+# @inode: Inode number
|
|
||||||
+# @pool: Pool ID
|
|
||||||
+# @size: Desired image size in bytes
|
|
||||||
+# @config-path: Path to Vitastor configuration
|
|
||||||
+# @etcd-host: etcd connection address(es)
|
|
||||||
+# @etcd-prefix: etcd key/value prefix
|
|
||||||
+##
|
|
||||||
+{ 'struct': 'BlockdevOptionsVitastor',
|
|
||||||
+ 'data': { '*inode': 'uint64',
|
|
||||||
+ '*pool': 'uint64',
|
|
||||||
+ '*size': 'uint64',
|
|
||||||
+ '*image': 'str',
|
|
||||||
+ '*config-path': 'str',
|
|
||||||
+ '*etcd-host': 'str',
|
|
||||||
+ '*etcd-prefix': 'str' } }
|
|
||||||
+
|
|
||||||
##
|
|
||||||
# @ReplicationMode:
|
|
||||||
#
|
|
||||||
@@ -4277,6 +4299,7 @@
|
|
||||||
'throttle': 'BlockdevOptionsThrottle',
|
|
||||||
'vdi': 'BlockdevOptionsGenericFormat',
|
|
||||||
'vhdx': 'BlockdevOptionsGenericFormat',
|
|
||||||
+ 'vitastor': 'BlockdevOptionsVitastor',
|
|
||||||
'vmdk': 'BlockdevOptionsGenericCOWFormat',
|
|
||||||
'vpc': 'BlockdevOptionsGenericFormat',
|
|
||||||
'vvfat': 'BlockdevOptionsVVFAT'
|
|
||||||
@@ -4665,6 +4688,17 @@
|
|
||||||
'*cluster-size' : 'size',
|
|
||||||
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
|
||||||
|
|
||||||
+##
|
|
||||||
+# @BlockdevCreateOptionsVitastor:
|
|
||||||
+#
|
|
||||||
+# Driver specific image creation options for Vitastor.
|
|
||||||
+#
|
|
||||||
+# @size: Size of the virtual disk in bytes
|
|
||||||
+##
|
|
||||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
|
||||||
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
|
||||||
+ 'size': 'size' } }
|
|
||||||
+
|
|
||||||
##
|
|
||||||
# @BlockdevVmdkSubformat:
|
|
||||||
#
|
|
||||||
@@ -4864,6 +4898,7 @@
|
|
||||||
'ssh': 'BlockdevCreateOptionsSsh',
|
|
||||||
'vdi': 'BlockdevCreateOptionsVdi',
|
|
||||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
|
||||||
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
|
||||||
'vmdk': 'BlockdevCreateOptionsVmdk',
|
|
||||||
'vpc': 'BlockdevCreateOptionsVpc'
|
|
||||||
} }
|
|
||||||
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
|
|
||||||
index 9850dd4444..72b1287520 100755
|
|
||||||
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
|
|
||||||
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
|
|
||||||
@@ -31,7 +31,7 @@
|
|
||||||
--with-git=meson \
|
|
||||||
--with-git-submodules=update \
|
|
||||||
--target-list="x86_64-softmmu" \
|
|
||||||
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
|
||||||
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
|
||||||
--audio-drv-list="" \
|
|
||||||
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
|
|
||||||
--with-coroutine=ucontext \
|
|
||||||
@@ -181,6 +181,7 @@
|
|
||||||
--enable-opengl \
|
|
||||||
--enable-pie \
|
|
||||||
--enable-rbd \
|
|
||||||
+--enable-vitastor \
|
|
||||||
--enable-rdma \
|
|
||||||
--enable-seccomp \
|
|
||||||
--enable-snappy \
|
|
||||||
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
|
|
||||||
index 1e26f4571e..370898d48c 100644
|
|
||||||
--- a/scripts/meson-buildoptions.sh
|
|
||||||
+++ b/scripts/meson-buildoptions.sh
|
|
||||||
@@ -98,6 +98,7 @@ meson_options_help() {
|
|
||||||
printf "%s\n" ' qed qed image format support'
|
|
||||||
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
|
|
||||||
printf "%s\n" ' rbd Ceph block device driver'
|
|
||||||
+ printf "%s\n" ' vitastor Vitastor block device driver'
|
|
||||||
printf "%s\n" ' replication replication support'
|
|
||||||
printf "%s\n" ' sdl SDL user interface'
|
|
||||||
printf "%s\n" ' sdl-image SDL Image support for icons'
|
|
||||||
@@ -289,6 +290,8 @@ _meson_option_parse() {
|
|
||||||
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
|
|
||||||
--enable-rbd) printf "%s" -Drbd=enabled ;;
|
|
||||||
--disable-rbd) printf "%s" -Drbd=disabled ;;
|
|
||||||
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
|
|
||||||
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
|
|
||||||
--enable-replication) printf "%s" -Dreplication=enabled ;;
|
|
||||||
--disable-replication) printf "%s" -Dreplication=disabled ;;
|
|
||||||
--enable-rng-none) printf "%s" -Drng_none=true ;;
|
|
@@ -1,190 +0,0 @@
|
|||||||
diff --git a/block/meson.build b/block/meson.build
|
|
||||||
index 60bc305597..89a042216f 100644
|
|
||||||
--- a/block/meson.build
|
|
||||||
+++ b/block/meson.build
|
|
||||||
@@ -98,6 +98,7 @@ foreach m : [
|
|
||||||
[libnfs, 'nfs', files('nfs.c')],
|
|
||||||
[libssh, 'ssh', files('ssh.c')],
|
|
||||||
[rbd, 'rbd', files('rbd.c')],
|
|
||||||
+ [vitastor, 'vitastor', files('vitastor.c')],
|
|
||||||
]
|
|
||||||
if m[0].found()
|
|
||||||
module_ss = ss.source_set()
|
|
||||||
diff --git a/meson.build b/meson.build
|
|
||||||
index 20fddbd707..600db4e2fb 100644
|
|
||||||
--- a/meson.build
|
|
||||||
+++ b/meson.build
|
|
||||||
@@ -967,6 +967,26 @@ if not get_option('rbd').auto() or have_block
|
|
||||||
endif
|
|
||||||
endif
|
|
||||||
|
|
||||||
+vitastor = not_found
|
|
||||||
+if not get_option('vitastor').auto() or have_block
|
|
||||||
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
|
||||||
+ required: get_option('vitastor'), kwargs: static_kwargs)
|
|
||||||
+ if libvitastor_client.found()
|
|
||||||
+ if cc.links('''
|
|
||||||
+ #include <vitastor_c.h>
|
|
||||||
+ int main(void) {
|
|
||||||
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
|
||||||
+ return 0;
|
|
||||||
+ }''', dependencies: libvitastor_client)
|
|
||||||
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
|
||||||
+ elif get_option('vitastor').enabled()
|
|
||||||
+ error('could not link libvitastor_client')
|
|
||||||
+ else
|
|
||||||
+ warning('could not link libvitastor_client, disabling')
|
|
||||||
+ endif
|
|
||||||
+ endif
|
|
||||||
+endif
|
|
||||||
+
|
|
||||||
glusterfs = not_found
|
|
||||||
glusterfs_ftruncate_has_stat = false
|
|
||||||
glusterfs_iocb_has_stat = false
|
|
||||||
@@ -1799,6 +1819,7 @@ config_host_data.set('CONFIG_NUMA', numa.found())
|
|
||||||
config_host_data.set('CONFIG_OPENGL', opengl.found())
|
|
||||||
config_host_data.set('CONFIG_PROFILER', get_option('profiler'))
|
|
||||||
config_host_data.set('CONFIG_RBD', rbd.found())
|
|
||||||
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
|
||||||
config_host_data.set('CONFIG_RDMA', rdma.found())
|
|
||||||
config_host_data.set('CONFIG_SDL', sdl.found())
|
|
||||||
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
|
|
||||||
@@ -3954,6 +3975,7 @@ if spice_protocol.found()
|
|
||||||
summary_info += {' spice server support': spice}
|
|
||||||
endif
|
|
||||||
summary_info += {'rbd support': rbd}
|
|
||||||
+summary_info += {'vitastor support': vitastor}
|
|
||||||
summary_info += {'smartcard support': cacard}
|
|
||||||
summary_info += {'U2F support': u2f}
|
|
||||||
summary_info += {'libusb': libusb}
|
|
||||||
diff --git a/meson_options.txt b/meson_options.txt
|
|
||||||
index e58e158396..9747b38fd0 100644
|
|
||||||
--- a/meson_options.txt
|
|
||||||
+++ b/meson_options.txt
|
|
||||||
@@ -167,6 +167,8 @@ option('lzo', type : 'feature', value : 'auto',
|
|
||||||
description: 'lzo compression support')
|
|
||||||
option('rbd', type : 'feature', value : 'auto',
|
|
||||||
description: 'Ceph block device driver')
|
|
||||||
+option('vitastor', type : 'feature', value : 'auto',
|
|
||||||
+ description: 'Vitastor block device driver')
|
|
||||||
option('opengl', type : 'feature', value : 'auto',
|
|
||||||
description: 'OpenGL support')
|
|
||||||
option('rdma', type : 'feature', value : 'auto',
|
|
||||||
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
|
||||||
index 2173e7734a..5a4900b322 100644
|
|
||||||
--- a/qapi/block-core.json
|
|
||||||
+++ b/qapi/block-core.json
|
|
||||||
@@ -2955,7 +2955,7 @@
|
|
||||||
'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
|
|
||||||
'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
|
|
||||||
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
|
||||||
- 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
|
|
||||||
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor', 'vmdk', 'vpc', 'vvfat' ] }
|
|
||||||
|
|
||||||
##
|
|
||||||
# @BlockdevOptionsFile:
|
|
||||||
@@ -3883,6 +3883,28 @@
|
|
||||||
'*key-secret': 'str',
|
|
||||||
'*server': ['InetSocketAddressBase'] } }
|
|
||||||
|
|
||||||
+##
|
|
||||||
+# @BlockdevOptionsVitastor:
|
|
||||||
+#
|
|
||||||
+# Driver specific block device options for vitastor
|
|
||||||
+#
|
|
||||||
+# @image: Image name
|
|
||||||
+# @inode: Inode number
|
|
||||||
+# @pool: Pool ID
|
|
||||||
+# @size: Desired image size in bytes
|
|
||||||
+# @config-path: Path to Vitastor configuration
|
|
||||||
+# @etcd-host: etcd connection address(es)
|
|
||||||
+# @etcd-prefix: etcd key/value prefix
|
|
||||||
+##
|
|
||||||
+{ 'struct': 'BlockdevOptionsVitastor',
|
|
||||||
+ 'data': { '*inode': 'uint64',
|
|
||||||
+ '*pool': 'uint64',
|
|
||||||
+ '*size': 'uint64',
|
|
||||||
+ '*image': 'str',
|
|
||||||
+ '*config-path': 'str',
|
|
||||||
+ '*etcd-host': 'str',
|
|
||||||
+ '*etcd-prefix': 'str' } }
|
|
||||||
+
|
|
||||||
##
|
|
||||||
# @ReplicationMode:
|
|
||||||
#
|
|
||||||
@@ -4327,6 +4349,7 @@
|
|
||||||
'throttle': 'BlockdevOptionsThrottle',
|
|
||||||
'vdi': 'BlockdevOptionsGenericFormat',
|
|
||||||
'vhdx': 'BlockdevOptionsGenericFormat',
|
|
||||||
+ 'vitastor': 'BlockdevOptionsVitastor',
|
|
||||||
'vmdk': 'BlockdevOptionsGenericCOWFormat',
|
|
||||||
'vpc': 'BlockdevOptionsGenericFormat',
|
|
||||||
'vvfat': 'BlockdevOptionsVVFAT'
|
|
||||||
@@ -4717,6 +4740,17 @@
|
|
||||||
'*cluster-size' : 'size',
|
|
||||||
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
|
||||||
|
|
||||||
+##
|
|
||||||
+# @BlockdevCreateOptionsVitastor:
|
|
||||||
+#
|
|
||||||
+# Driver specific image creation options for Vitastor.
|
|
||||||
+#
|
|
||||||
+# @size: Size of the virtual disk in bytes
|
|
||||||
+##
|
|
||||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
|
||||||
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
|
||||||
+ 'size': 'size' } }
|
|
||||||
+
|
|
||||||
##
|
|
||||||
# @BlockdevVmdkSubformat:
|
|
||||||
#
|
|
||||||
@@ -4915,6 +4949,7 @@
|
|
||||||
'ssh': 'BlockdevCreateOptionsSsh',
|
|
||||||
'vdi': 'BlockdevCreateOptionsVdi',
|
|
||||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
|
||||||
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
|
||||||
'vmdk': 'BlockdevCreateOptionsVmdk',
|
|
||||||
'vpc': 'BlockdevCreateOptionsVpc'
|
|
||||||
} }
|
|
||||||
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
|
|
||||||
index a7f92aff90..53dc55be2e 100755
|
|
||||||
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
|
|
||||||
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
|
|
||||||
@@ -31,7 +31,7 @@
|
|
||||||
--with-git=meson \
|
|
||||||
--with-git-submodules=update \
|
|
||||||
--target-list="x86_64-softmmu" \
|
|
||||||
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
|
||||||
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
|
||||||
--audio-drv-list="" \
|
|
||||||
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
|
|
||||||
--with-coroutine=ucontext \
|
|
||||||
@@ -179,6 +179,7 @@
|
|
||||||
--enable-opengl \
|
|
||||||
--enable-pie \
|
|
||||||
--enable-rbd \
|
|
||||||
+--enable-vitastor \
|
|
||||||
--enable-rdma \
|
|
||||||
--enable-seccomp \
|
|
||||||
--enable-snappy \
|
|
||||||
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
|
|
||||||
index 359b04e0e6..f5b85ba78c 100644
|
|
||||||
--- a/scripts/meson-buildoptions.sh
|
|
||||||
+++ b/scripts/meson-buildoptions.sh
|
|
||||||
@@ -135,6 +135,7 @@ meson_options_help() {
|
|
||||||
printf "%s\n" ' qed qed image format support'
|
|
||||||
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
|
|
||||||
printf "%s\n" ' rbd Ceph block device driver'
|
|
||||||
+ printf "%s\n" ' vitastor Vitastor block device driver'
|
|
||||||
printf "%s\n" ' rdma Enable RDMA-based migration'
|
|
||||||
printf "%s\n" ' replication replication support'
|
|
||||||
printf "%s\n" ' sdl SDL user interface'
|
|
||||||
@@ -370,6 +371,8 @@ _meson_option_parse() {
|
|
||||||
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
|
|
||||||
--enable-rbd) printf "%s" -Drbd=enabled ;;
|
|
||||||
--disable-rbd) printf "%s" -Drbd=disabled ;;
|
|
||||||
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
|
|
||||||
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
|
|
||||||
--enable-rdma) printf "%s" -Drdma=enabled ;;
|
|
||||||
--disable-rdma) printf "%s" -Drdma=disabled ;;
|
|
||||||
--enable-replication) printf "%s" -Dreplication=enabled ;;
|
|
@@ -1,190 +0,0 @@
|
|||||||
diff --git a/block/meson.build b/block/meson.build
|
|
||||||
index b7c68b83a3..95d8a6f15d 100644
|
|
||||||
--- a/block/meson.build
|
|
||||||
+++ b/block/meson.build
|
|
||||||
@@ -100,6 +100,7 @@ foreach m : [
|
|
||||||
[libnfs, 'nfs', files('nfs.c')],
|
|
||||||
[libssh, 'ssh', files('ssh.c')],
|
|
||||||
[rbd, 'rbd', files('rbd.c')],
|
|
||||||
+ [vitastor, 'vitastor', files('vitastor.c')],
|
|
||||||
]
|
|
||||||
if m[0].found()
|
|
||||||
module_ss = ss.source_set()
|
|
||||||
diff --git a/meson.build b/meson.build
|
|
||||||
index 5c6b5a1c75..f31f73612e 100644
|
|
||||||
--- a/meson.build
|
|
||||||
+++ b/meson.build
|
|
||||||
@@ -1026,6 +1026,26 @@ if not get_option('rbd').auto() or have_block
|
|
||||||
endif
|
|
||||||
endif
|
|
||||||
|
|
||||||
+vitastor = not_found
|
|
||||||
+if not get_option('vitastor').auto() or have_block
|
|
||||||
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
|
||||||
+ required: get_option('vitastor'), kwargs: static_kwargs)
|
|
||||||
+ if libvitastor_client.found()
|
|
||||||
+ if cc.links('''
|
|
||||||
+ #include <vitastor_c.h>
|
|
||||||
+ int main(void) {
|
|
||||||
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
|
||||||
+ return 0;
|
|
||||||
+ }''', dependencies: libvitastor_client)
|
|
||||||
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
|
||||||
+ elif get_option('vitastor').enabled()
|
|
||||||
+ error('could not link libvitastor_client')
|
|
||||||
+ else
|
|
||||||
+ warning('could not link libvitastor_client, disabling')
|
|
||||||
+ endif
|
|
||||||
+ endif
|
|
||||||
+endif
|
|
||||||
+
|
|
||||||
glusterfs = not_found
|
|
||||||
glusterfs_ftruncate_has_stat = false
|
|
||||||
glusterfs_iocb_has_stat = false
|
|
||||||
@@ -1861,6 +1881,7 @@ config_host_data.set('CONFIG_NUMA', numa.found())
|
|
||||||
config_host_data.set('CONFIG_OPENGL', opengl.found())
|
|
||||||
config_host_data.set('CONFIG_PROFILER', get_option('profiler'))
|
|
||||||
config_host_data.set('CONFIG_RBD', rbd.found())
|
|
||||||
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
|
||||||
config_host_data.set('CONFIG_RDMA', rdma.found())
|
|
||||||
config_host_data.set('CONFIG_SDL', sdl.found())
|
|
||||||
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
|
|
||||||
@@ -3945,6 +3966,7 @@ if spice_protocol.found()
|
|
||||||
summary_info += {' spice server support': spice}
|
|
||||||
endif
|
|
||||||
summary_info += {'rbd support': rbd}
|
|
||||||
+summary_info += {'vitastor support': vitastor}
|
|
||||||
summary_info += {'smartcard support': cacard}
|
|
||||||
summary_info += {'U2F support': u2f}
|
|
||||||
summary_info += {'libusb': libusb}
|
|
||||||
diff --git a/meson_options.txt b/meson_options.txt
|
|
||||||
index 4b749ca549..6b37bd6b77 100644
|
|
||||||
--- a/meson_options.txt
|
|
||||||
+++ b/meson_options.txt
|
|
||||||
@@ -169,6 +169,8 @@ option('lzo', type : 'feature', value : 'auto',
|
|
||||||
description: 'lzo compression support')
|
|
||||||
option('rbd', type : 'feature', value : 'auto',
|
|
||||||
description: 'Ceph block device driver')
|
|
||||||
+option('vitastor', type : 'feature', value : 'auto',
|
|
||||||
+ description: 'Vitastor block device driver')
|
|
||||||
option('opengl', type : 'feature', value : 'auto',
|
|
||||||
description: 'OpenGL support')
|
|
||||||
option('rdma', type : 'feature', value : 'auto',
|
|
||||||
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
|
||||||
index 95ac4fa634..7a240827e4 100644
|
|
||||||
--- a/qapi/block-core.json
|
|
||||||
+++ b/qapi/block-core.json
|
|
||||||
@@ -2959,7 +2959,7 @@
|
|
||||||
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
|
|
||||||
'raw', 'rbd',
|
|
||||||
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
|
||||||
- 'ssh', 'throttle', 'vdi', 'vhdx',
|
|
||||||
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
|
|
||||||
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
|
|
||||||
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
|
|
||||||
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
|
|
||||||
@@ -3957,6 +3957,28 @@
|
|
||||||
'*key-secret': 'str',
|
|
||||||
'*server': ['InetSocketAddressBase'] } }
|
|
||||||
|
|
||||||
+##
|
|
||||||
+# @BlockdevOptionsVitastor:
|
|
||||||
+#
|
|
||||||
+# Driver specific block device options for vitastor
|
|
||||||
+#
|
|
||||||
+# @image: Image name
|
|
||||||
+# @inode: Inode number
|
|
||||||
+# @pool: Pool ID
|
|
||||||
+# @size: Desired image size in bytes
|
|
||||||
+# @config-path: Path to Vitastor configuration
|
|
||||||
+# @etcd-host: etcd connection address(es)
|
|
||||||
+# @etcd-prefix: etcd key/value prefix
|
|
||||||
+##
|
|
||||||
+{ 'struct': 'BlockdevOptionsVitastor',
|
|
||||||
+ 'data': { '*inode': 'uint64',
|
|
||||||
+ '*pool': 'uint64',
|
|
||||||
+ '*size': 'uint64',
|
|
||||||
+ '*image': 'str',
|
|
||||||
+ '*config-path': 'str',
|
|
||||||
+ '*etcd-host': 'str',
|
|
||||||
+ '*etcd-prefix': 'str' } }
|
|
||||||
+
|
|
||||||
##
|
|
||||||
# @ReplicationMode:
|
|
||||||
#
|
|
||||||
@@ -4405,6 +4427,7 @@
|
|
||||||
'throttle': 'BlockdevOptionsThrottle',
|
|
||||||
'vdi': 'BlockdevOptionsGenericFormat',
|
|
||||||
'vhdx': 'BlockdevOptionsGenericFormat',
|
|
||||||
+ 'vitastor': 'BlockdevOptionsVitastor',
|
|
||||||
'virtio-blk-vfio-pci':
|
|
||||||
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
|
|
||||||
'if': 'CONFIG_BLKIO' },
|
|
||||||
@@ -4804,6 +4827,17 @@
|
|
||||||
'*cluster-size' : 'size',
|
|
||||||
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
|
||||||
|
|
||||||
+##
|
|
||||||
+# @BlockdevCreateOptionsVitastor:
|
|
||||||
+#
|
|
||||||
+# Driver specific image creation options for Vitastor.
|
|
||||||
+#
|
|
||||||
+# @size: Size of the virtual disk in bytes
|
|
||||||
+##
|
|
||||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
|
||||||
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
|
||||||
+ 'size': 'size' } }
|
|
||||||
+
|
|
||||||
##
|
|
||||||
# @BlockdevVmdkSubformat:
|
|
||||||
#
|
|
||||||
@@ -5002,6 +5036,7 @@
|
|
||||||
'ssh': 'BlockdevCreateOptionsSsh',
|
|
||||||
'vdi': 'BlockdevCreateOptionsVdi',
|
|
||||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
|
||||||
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
|
||||||
'vmdk': 'BlockdevCreateOptionsVmdk',
|
|
||||||
'vpc': 'BlockdevCreateOptionsVpc'
|
|
||||||
} }
|
|
||||||
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
|
|
||||||
index a7f92aff90..53dc55be2e 100755
|
|
||||||
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
|
|
||||||
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
|
|
||||||
@@ -31,7 +31,7 @@
|
|
||||||
--with-git=meson \
|
|
||||||
--with-git-submodules=update \
|
|
||||||
--target-list="x86_64-softmmu" \
|
|
||||||
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
|
||||||
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
|
||||||
--audio-drv-list="" \
|
|
||||||
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
|
|
||||||
--with-coroutine=ucontext \
|
|
||||||
@@ -179,6 +179,7 @@
|
|
||||||
--enable-opengl \
|
|
||||||
--enable-pie \
|
|
||||||
--enable-rbd \
|
|
||||||
+--enable-vitastor \
|
|
||||||
--enable-rdma \
|
|
||||||
--enable-seccomp \
|
|
||||||
--enable-snappy \
|
|
||||||
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
|
|
||||||
index aa6e30ea91..c45d21c40f 100644
|
|
||||||
--- a/scripts/meson-buildoptions.sh
|
|
||||||
+++ b/scripts/meson-buildoptions.sh
|
|
||||||
@@ -135,6 +135,7 @@ meson_options_help() {
|
|
||||||
printf "%s\n" ' qed qed image format support'
|
|
||||||
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
|
|
||||||
printf "%s\n" ' rbd Ceph block device driver'
|
|
||||||
+ printf "%s\n" ' vitastor Vitastor block device driver'
|
|
||||||
printf "%s\n" ' rdma Enable RDMA-based migration'
|
|
||||||
printf "%s\n" ' replication replication support'
|
|
||||||
printf "%s\n" ' sdl SDL user interface'
|
|
||||||
@@ -376,6 +377,8 @@ _meson_option_parse() {
|
|
||||||
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
|
|
||||||
--enable-rbd) printf "%s" -Drbd=enabled ;;
|
|
||||||
--disable-rbd) printf "%s" -Drbd=disabled ;;
|
|
||||||
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
|
|
||||||
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
|
|
||||||
--enable-rdma) printf "%s" -Drdma=enabled ;;
|
|
||||||
--disable-rdma) printf "%s" -Drdma=disabled ;;
|
|
||||||
--enable-replication) printf "%s" -Dreplication=enabled ;;
|
|
@@ -1,190 +0,0 @@
|
|||||||
diff --git a/block/meson.build b/block/meson.build
|
|
||||||
index 382bec0e7d..af6207dbce 100644
|
|
||||||
--- a/block/meson.build
|
|
||||||
+++ b/block/meson.build
|
|
||||||
@@ -101,6 +101,7 @@ foreach m : [
|
|
||||||
[libnfs, 'nfs', files('nfs.c')],
|
|
||||||
[libssh, 'ssh', files('ssh.c')],
|
|
||||||
[rbd, 'rbd', files('rbd.c')],
|
|
||||||
+ [vitastor, 'vitastor', files('vitastor.c')],
|
|
||||||
]
|
|
||||||
if m[0].found()
|
|
||||||
module_ss = ss.source_set()
|
|
||||||
diff --git a/meson.build b/meson.build
|
|
||||||
index c44d05a13f..ebedb42843 100644
|
|
||||||
--- a/meson.build
|
|
||||||
+++ b/meson.build
|
|
||||||
@@ -1028,6 +1028,26 @@ if not get_option('rbd').auto() or have_block
|
|
||||||
endif
|
|
||||||
endif
|
|
||||||
|
|
||||||
+vitastor = not_found
|
|
||||||
+if not get_option('vitastor').auto() or have_block
|
|
||||||
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
|
||||||
+ required: get_option('vitastor'), kwargs: static_kwargs)
|
|
||||||
+ if libvitastor_client.found()
|
|
||||||
+ if cc.links('''
|
|
||||||
+ #include <vitastor_c.h>
|
|
||||||
+ int main(void) {
|
|
||||||
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
|
||||||
+ return 0;
|
|
||||||
+ }''', dependencies: libvitastor_client)
|
|
||||||
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
|
||||||
+ elif get_option('vitastor').enabled()
|
|
||||||
+ error('could not link libvitastor_client')
|
|
||||||
+ else
|
|
||||||
+ warning('could not link libvitastor_client, disabling')
|
|
||||||
+ endif
|
|
||||||
+ endif
|
|
||||||
+endif
|
|
||||||
+
|
|
||||||
glusterfs = not_found
|
|
||||||
glusterfs_ftruncate_has_stat = false
|
|
||||||
glusterfs_iocb_has_stat = false
|
|
||||||
@@ -1878,6 +1898,7 @@ endif
|
|
||||||
config_host_data.set('CONFIG_OPENGL', opengl.found())
|
|
||||||
config_host_data.set('CONFIG_PROFILER', get_option('profiler'))
|
|
||||||
config_host_data.set('CONFIG_RBD', rbd.found())
|
|
||||||
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
|
||||||
config_host_data.set('CONFIG_RDMA', rdma.found())
|
|
||||||
config_host_data.set('CONFIG_SDL', sdl.found())
|
|
||||||
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
|
|
||||||
@@ -4002,6 +4023,7 @@ if spice_protocol.found()
|
|
||||||
summary_info += {' spice server support': spice}
|
|
||||||
endif
|
|
||||||
summary_info += {'rbd support': rbd}
|
|
||||||
+summary_info += {'vitastor support': vitastor}
|
|
||||||
summary_info += {'smartcard support': cacard}
|
|
||||||
summary_info += {'U2F support': u2f}
|
|
||||||
summary_info += {'libusb': libusb}
|
|
||||||
diff --git a/meson_options.txt b/meson_options.txt
|
|
||||||
index fc9447d267..c4ac55c283 100644
|
|
||||||
--- a/meson_options.txt
|
|
||||||
+++ b/meson_options.txt
|
|
||||||
@@ -173,6 +173,8 @@ option('lzo', type : 'feature', value : 'auto',
|
|
||||||
description: 'lzo compression support')
|
|
||||||
option('rbd', type : 'feature', value : 'auto',
|
|
||||||
description: 'Ceph block device driver')
|
|
||||||
+option('vitastor', type : 'feature', value : 'auto',
|
|
||||||
+ description: 'Vitastor block device driver')
|
|
||||||
option('opengl', type : 'feature', value : 'auto',
|
|
||||||
description: 'OpenGL support')
|
|
||||||
option('rdma', type : 'feature', value : 'auto',
|
|
||||||
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
|
||||||
index c05ad0c07e..f5eb701604 100644
|
|
||||||
--- a/qapi/block-core.json
|
|
||||||
+++ b/qapi/block-core.json
|
|
||||||
@@ -3054,7 +3054,7 @@
|
|
||||||
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
|
|
||||||
'raw', 'rbd',
|
|
||||||
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
|
||||||
- 'ssh', 'throttle', 'vdi', 'vhdx',
|
|
||||||
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
|
|
||||||
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
|
|
||||||
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
|
|
||||||
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
|
|
||||||
@@ -4073,6 +4073,28 @@
|
|
||||||
'*key-secret': 'str',
|
|
||||||
'*server': ['InetSocketAddressBase'] } }
|
|
||||||
|
|
||||||
+##
|
|
||||||
+# @BlockdevOptionsVitastor:
|
|
||||||
+#
|
|
||||||
+# Driver specific block device options for vitastor
|
|
||||||
+#
|
|
||||||
+# @image: Image name
|
|
||||||
+# @inode: Inode number
|
|
||||||
+# @pool: Pool ID
|
|
||||||
+# @size: Desired image size in bytes
|
|
||||||
+# @config-path: Path to Vitastor configuration
|
|
||||||
+# @etcd-host: etcd connection address(es)
|
|
||||||
+# @etcd-prefix: etcd key/value prefix
|
|
||||||
+##
|
|
||||||
+{ 'struct': 'BlockdevOptionsVitastor',
|
|
||||||
+ 'data': { '*inode': 'uint64',
|
|
||||||
+ '*pool': 'uint64',
|
|
||||||
+ '*size': 'uint64',
|
|
||||||
+ '*image': 'str',
|
|
||||||
+ '*config-path': 'str',
|
|
||||||
+ '*etcd-host': 'str',
|
|
||||||
+ '*etcd-prefix': 'str' } }
|
|
||||||
+
|
|
||||||
##
|
|
||||||
# @ReplicationMode:
|
|
||||||
#
|
|
||||||
@@ -4521,6 +4543,7 @@
|
|
||||||
'throttle': 'BlockdevOptionsThrottle',
|
|
||||||
'vdi': 'BlockdevOptionsGenericFormat',
|
|
||||||
'vhdx': 'BlockdevOptionsGenericFormat',
|
|
||||||
+ 'vitastor': 'BlockdevOptionsVitastor',
|
|
||||||
'virtio-blk-vfio-pci':
|
|
||||||
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
|
|
||||||
'if': 'CONFIG_BLKIO' },
|
|
||||||
@@ -4920,6 +4943,17 @@
|
|
||||||
'*cluster-size' : 'size',
|
|
||||||
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
|
||||||
|
|
||||||
+##
|
|
||||||
+# @BlockdevCreateOptionsVitastor:
|
|
||||||
+#
|
|
||||||
+# Driver specific image creation options for Vitastor.
|
|
||||||
+#
|
|
||||||
+# @size: Size of the virtual disk in bytes
|
|
||||||
+##
|
|
||||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
|
||||||
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
|
||||||
+ 'size': 'size' } }
|
|
||||||
+
|
|
||||||
##
|
|
||||||
# @BlockdevVmdkSubformat:
|
|
||||||
#
|
|
||||||
@@ -5118,6 +5152,7 @@
|
|
||||||
'ssh': 'BlockdevCreateOptionsSsh',
|
|
||||||
'vdi': 'BlockdevCreateOptionsVdi',
|
|
||||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
|
||||||
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
|
||||||
'vmdk': 'BlockdevCreateOptionsVmdk',
|
|
||||||
'vpc': 'BlockdevCreateOptionsVpc'
|
|
||||||
} }
|
|
||||||
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
|
|
||||||
index 6e8983f39c..1b0b9fcf3e 100755
|
|
||||||
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
|
|
||||||
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
|
|
||||||
@@ -32,7 +32,7 @@
|
|
||||||
--with-git=meson \
|
|
||||||
--with-git-submodules=update \
|
|
||||||
--target-list="x86_64-softmmu" \
|
|
||||||
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
|
||||||
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
|
||||||
--audio-drv-list="" \
|
|
||||||
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
|
|
||||||
--with-coroutine=ucontext \
|
|
||||||
@@ -179,6 +179,7 @@
|
|
||||||
--enable-opengl \
|
|
||||||
--enable-pie \
|
|
||||||
--enable-rbd \
|
|
||||||
+--enable-vitastor \
|
|
||||||
--enable-rdma \
|
|
||||||
--enable-seccomp \
|
|
||||||
--enable-snappy \
|
|
||||||
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
|
|
||||||
index 009fab1515..95914e6ebc 100644
|
|
||||||
--- a/scripts/meson-buildoptions.sh
|
|
||||||
+++ b/scripts/meson-buildoptions.sh
|
|
||||||
@@ -142,6 +142,7 @@ meson_options_help() {
|
|
||||||
printf "%s\n" ' qed qed image format support'
|
|
||||||
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
|
|
||||||
printf "%s\n" ' rbd Ceph block device driver'
|
|
||||||
+ printf "%s\n" ' vitastor Vitastor block device driver'
|
|
||||||
printf "%s\n" ' rdma Enable RDMA-based migration'
|
|
||||||
printf "%s\n" ' replication replication support'
|
|
||||||
printf "%s\n" ' sdl SDL user interface'
|
|
||||||
@@ -388,6 +389,8 @@ _meson_option_parse() {
|
|
||||||
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
|
|
||||||
--enable-rbd) printf "%s" -Drbd=enabled ;;
|
|
||||||
--disable-rbd) printf "%s" -Drbd=disabled ;;
|
|
||||||
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
|
|
||||||
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
|
|
||||||
--enable-rdma) printf "%s" -Drdma=enabled ;;
|
|
||||||
--disable-rdma) printf "%s" -Drdma=disabled ;;
|
|
||||||
--enable-replication) printf "%s" -Dreplication=enabled ;;
|
|
@@ -7,12 +7,13 @@ set -e
|
|||||||
VITASTOR=$(dirname $0)
|
VITASTOR=$(dirname $0)
|
||||||
VITASTOR=$(realpath "$VITASTOR/..")
|
VITASTOR=$(realpath "$VITASTOR/..")
|
||||||
|
|
||||||
EL=$(rpm --eval '%dist')
|
if [ -d /opt/rh/gcc-toolset-9 ]; then
|
||||||
if [ "$EL" = ".el8" ]; then
|
|
||||||
# CentOS 8
|
# CentOS 8
|
||||||
|
EL=8
|
||||||
. /opt/rh/gcc-toolset-9/enable
|
. /opt/rh/gcc-toolset-9/enable
|
||||||
elif [ "$EL" = ".el7" ]; then
|
else
|
||||||
# CentOS 7
|
# CentOS 7
|
||||||
|
EL=7
|
||||||
. /opt/rh/devtoolset-9/enable
|
. /opt/rh/devtoolset-9/enable
|
||||||
fi
|
fi
|
||||||
cd ~/rpmbuild/SPECS
|
cd ~/rpmbuild/SPECS
|
||||||
@@ -24,4 +25,4 @@ rm fio
|
|||||||
mv fio-copy fio
|
mv fio-copy fio
|
||||||
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||||
tar --transform 's#^#vitastor-0.8.8/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.8$(rpm --eval '%dist').tar.gz *
|
tar --transform 's#^#vitastor-0.8.3/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.3$(rpm --eval '%dist').tar.gz *
|
||||||
|
@@ -1,93 +0,0 @@
|
|||||||
--- qemu-kvm.spec.orig 2023-02-28 08:04:06.000000000 +0000
|
|
||||||
+++ qemu-kvm.spec 2023-04-27 22:29:18.094878829 +0000
|
|
||||||
@@ -100,8 +100,6 @@
|
|
||||||
%endif
|
|
||||||
|
|
||||||
%global target_list %{kvm_target}-softmmu
|
|
||||||
-%global block_drivers_rw_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,compress
|
|
||||||
-%global block_drivers_ro_list vdi,vmdk,vhdx,vpc,https
|
|
||||||
%define qemudocdir %{_docdir}/%{name}
|
|
||||||
%global firmwaredirs "%{_datadir}/qemu-firmware:%{_datadir}/ipxe/qemu:%{_datadir}/seavgabios:%{_datadir}/seabios"
|
|
||||||
|
|
||||||
@@ -129,6 +127,7 @@ Requires: %{name}-device-usb-host = %{ep
|
|
||||||
Requires: %{name}-device-usb-redirect = %{epoch}:%{version}-%{release} \
|
|
||||||
%endif \
|
|
||||||
Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release} \
|
|
||||||
+Requires: %{name}-block-vitastor = %{epoch}:%{version}-%{release}\
|
|
||||||
Requires: %{name}-audio-pa = %{epoch}:%{version}-%{release}
|
|
||||||
|
|
||||||
# Since SPICE is removed from RHEL-9, the following Obsoletes:
|
|
||||||
@@ -151,7 +150,7 @@ Obsoletes: %{name}-block-ssh <= %{epoch}
|
|
||||||
Summary: QEMU is a machine emulator and virtualizer
|
|
||||||
Name: qemu-kvm
|
|
||||||
Version: 7.0.0
|
|
||||||
-Release: 13%{?rcrel}%{?dist}%{?cc_suffix}.2
|
|
||||||
+Release: 13.vitastor%{?rcrel}%{?dist}%{?cc_suffix}
|
|
||||||
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
|
|
||||||
# Epoch 15 used for RHEL 8
|
|
||||||
# Epoch 17 used for RHEL 9 (due to release versioning offset in RHEL 8.5)
|
|
||||||
@@ -174,6 +173,7 @@ Source28: 95-kvm-memlock.conf
|
|
||||||
Source30: kvm-s390x.conf
|
|
||||||
Source31: kvm-x86.conf
|
|
||||||
Source36: README.tests
|
|
||||||
+Source37: qemu-vitastor.c
|
|
||||||
|
|
||||||
|
|
||||||
Patch0004: 0004-Initial-redhat-build.patch
|
|
||||||
@@ -498,6 +498,7 @@ Patch171: kvm-i386-do-kvm_put_msr_featur
|
|
||||||
Patch172: kvm-target-i386-kvm-fix-kvmclock_current_nsec-Assertion-.patch
|
|
||||||
# For bz#2168221 - while live-migrating many instances concurrently, libvirt sometimes return internal error: migration was active, but no RAM info was set [rhel-9.1.0.z]
|
|
||||||
Patch173: kvm-migration-Read-state-once.patch
|
|
||||||
+Patch174: qemu-7.0-vitastor.patch
|
|
||||||
|
|
||||||
# Source-git patches
|
|
||||||
|
|
||||||
@@ -531,6 +532,7 @@ BuildRequires: libcurl-devel
|
|
||||||
%if %{have_block_rbd}
|
|
||||||
BuildRequires: librbd-devel
|
|
||||||
%endif
|
|
||||||
+BuildRequires: vitastor-client-devel
|
|
||||||
# We need both because the 'stap' binary is probed for by configure
|
|
||||||
BuildRequires: systemtap
|
|
||||||
BuildRequires: systemtap-sdt-devel
|
|
||||||
@@ -718,6 +720,14 @@ using the rbd protocol.
|
|
||||||
%endif
|
|
||||||
|
|
||||||
|
|
||||||
+%package block-vitastor
|
|
||||||
+Summary: QEMU Vitastor block driver
|
|
||||||
+Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
|
|
||||||
+
|
|
||||||
+%description block-vitastor
|
|
||||||
+This package provides the additional Vitastor block driver for QEMU.
|
|
||||||
+
|
|
||||||
+
|
|
||||||
%package audio-pa
|
|
||||||
Summary: QEMU PulseAudio audio driver
|
|
||||||
Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
|
|
||||||
@@ -811,6 +821,7 @@ This package provides usbredir support.
|
|
||||||
%prep
|
|
||||||
%setup -q -n qemu-%{version}%{?rcstr}
|
|
||||||
%autopatch -p1
|
|
||||||
+cp %{SOURCE37} ./block/vitastor.c
|
|
||||||
|
|
||||||
%global qemu_kvm_build qemu_kvm_build
|
|
||||||
mkdir -p %{qemu_kvm_build}
|
|
||||||
@@ -1032,6 +1043,7 @@ run_configure \
|
|
||||||
%if %{have_block_rbd}
|
|
||||||
--enable-rbd \
|
|
||||||
%endif
|
|
||||||
+ --enable-vitastor \
|
|
||||||
%if %{have_librdma}
|
|
||||||
--enable-rdma \
|
|
||||||
%endif
|
|
||||||
@@ -1511,6 +1523,9 @@ useradd -r -u 107 -g qemu -G kvm -d / -s
|
|
||||||
%files block-rbd
|
|
||||||
%{_libdir}/%{name}/block-rbd.so
|
|
||||||
%endif
|
|
||||||
+%files block-vitastor
|
|
||||||
+%{_libdir}/%{name}/block-vitastor.so
|
|
||||||
+
|
|
||||||
%files audio-pa
|
|
||||||
%{_libdir}/%{name}/audio-pa.so
|
|
||||||
|
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
|||||||
RUN set -e; \
|
RUN set -e; \
|
||||||
cd /root/vitastor/rpm; \
|
cd /root/vitastor/rpm; \
|
||||||
sh build-tarball.sh; \
|
sh build-tarball.sh; \
|
||||||
cp /root/vitastor-0.8.8.el7.tar.gz ~/rpmbuild/SOURCES; \
|
cp /root/vitastor-0.8.3.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||||
cd ~/rpmbuild/SPECS/; \
|
cd ~/rpmbuild/SPECS/; \
|
||||||
rpmbuild -ba vitastor.spec; \
|
rpmbuild -ba vitastor.spec; \
|
||||||
|
@@ -1,11 +1,11 @@
|
|||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 0.8.8
|
Version: 0.8.3
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-0.8.8.el7.tar.gz
|
Source0: vitastor-0.8.3.el7.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
@@ -35,7 +35,6 @@ Summary: Vitastor - OSD
|
|||||||
Requires: libJerasure2
|
Requires: libJerasure2
|
||||||
Requires: libisa-l
|
Requires: libisa-l
|
||||||
Requires: liburing >= 0.6
|
Requires: liburing >= 0.6
|
||||||
Requires: liburing < 2
|
|
||||||
Requires: vitastor-client = %{version}-%{release}
|
Requires: vitastor-client = %{version}-%{release}
|
||||||
Requires: util-linux
|
Requires: util-linux
|
||||||
Requires: parted
|
Requires: parted
|
||||||
@@ -60,7 +59,6 @@ scheduling cluster-level operations.
|
|||||||
%package -n vitastor-client
|
%package -n vitastor-client
|
||||||
Summary: Vitastor - client
|
Summary: Vitastor - client
|
||||||
Requires: liburing >= 0.6
|
Requires: liburing >= 0.6
|
||||||
Requires: liburing < 2
|
|
||||||
|
|
||||||
|
|
||||||
%description -n vitastor-client
|
%description -n vitastor-client
|
||||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
|||||||
RUN set -e; \
|
RUN set -e; \
|
||||||
cd /root/vitastor/rpm; \
|
cd /root/vitastor/rpm; \
|
||||||
sh build-tarball.sh; \
|
sh build-tarball.sh; \
|
||||||
cp /root/vitastor-0.8.8.el8.tar.gz ~/rpmbuild/SOURCES; \
|
cp /root/vitastor-0.8.3.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||||
cd ~/rpmbuild/SPECS/; \
|
cd ~/rpmbuild/SPECS/; \
|
||||||
rpmbuild -ba vitastor.spec; \
|
rpmbuild -ba vitastor.spec; \
|
||||||
|
@@ -1,11 +1,11 @@
|
|||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 0.8.8
|
Version: 0.8.3
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-0.8.8.el8.tar.gz
|
Source0: vitastor-0.8.3.el8.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
@@ -34,7 +34,6 @@ Summary: Vitastor - OSD
|
|||||||
Requires: libJerasure2
|
Requires: libJerasure2
|
||||||
Requires: libisa-l
|
Requires: libisa-l
|
||||||
Requires: liburing >= 0.6
|
Requires: liburing >= 0.6
|
||||||
Requires: liburing < 2
|
|
||||||
Requires: vitastor-client = %{version}-%{release}
|
Requires: vitastor-client = %{version}-%{release}
|
||||||
Requires: util-linux
|
Requires: util-linux
|
||||||
Requires: parted
|
Requires: parted
|
||||||
@@ -58,7 +57,6 @@ scheduling cluster-level operations.
|
|||||||
%package -n vitastor-client
|
%package -n vitastor-client
|
||||||
Summary: Vitastor - client
|
Summary: Vitastor - client
|
||||||
Requires: liburing >= 0.6
|
Requires: liburing >= 0.6
|
||||||
Requires: liburing < 2
|
|
||||||
|
|
||||||
|
|
||||||
%description -n vitastor-client
|
%description -n vitastor-client
|
||||||
|
@@ -1,28 +0,0 @@
|
|||||||
# Build packages for AlmaLinux 9 inside a container
|
|
||||||
# cd ..; podman build -t vitastor-el9 -v `pwd`/packages:/root/packages -f rpm/vitastor-el9.Dockerfile .
|
|
||||||
|
|
||||||
FROM almalinux:9
|
|
||||||
|
|
||||||
WORKDIR /root
|
|
||||||
|
|
||||||
RUN sed -i 's/enabled=0/enabled=1/' /etc/yum.repos.d/*.repo
|
|
||||||
RUN dnf -y install epel-release dnf-plugins-core
|
|
||||||
RUN dnf -y install https://vitastor.io/rpms/centos/9/vitastor-release-1.0-1.el9.noarch.rpm
|
|
||||||
RUN dnf -y install gcc-c++ gperftools-devel fio nodejs rpm-build jerasure-devel libisa-l-devel gf-complete-devel rdma-core-devel libarchive liburing-devel cmake
|
|
||||||
RUN dnf download --source fio
|
|
||||||
RUN rpm --nomd5 -i fio*.src.rpm
|
|
||||||
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --spec fio.spec
|
|
||||||
|
|
||||||
ADD . /root/vitastor
|
|
||||||
|
|
||||||
RUN set -e; \
|
|
||||||
cd /root/vitastor/rpm; \
|
|
||||||
sh build-tarball.sh; \
|
|
||||||
cp /root/vitastor-0.8.8.el9.tar.gz ~/rpmbuild/SOURCES; \
|
|
||||||
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
|
||||||
cd ~/rpmbuild/SPECS/; \
|
|
||||||
rpmbuild -ba vitastor.spec; \
|
|
||||||
mkdir -p /root/packages/vitastor-el9; \
|
|
||||||
rm -rf /root/packages/vitastor-el9/*; \
|
|
||||||
cp ~/rpmbuild/RPMS/*/vitastor* /root/packages/vitastor-el9/; \
|
|
||||||
cp ~/rpmbuild/SRPMS/vitastor* /root/packages/vitastor-el9/
|
|
@@ -1,158 +0,0 @@
|
|||||||
Name: vitastor
|
|
||||||
Version: 0.8.8
|
|
||||||
Release: 1%{?dist}
|
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
|
||||||
URL: https://vitastor.io/
|
|
||||||
Source0: vitastor-0.8.8.el9.tar.gz
|
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
|
||||||
BuildRequires: gperftools-devel
|
|
||||||
BuildRequires: gcc-c++
|
|
||||||
BuildRequires: nodejs >= 10
|
|
||||||
BuildRequires: jerasure-devel
|
|
||||||
BuildRequires: libisa-l-devel
|
|
||||||
BuildRequires: gf-complete-devel
|
|
||||||
BuildRequires: rdma-core-devel
|
|
||||||
BuildRequires: cmake
|
|
||||||
Requires: vitastor-osd = %{version}-%{release}
|
|
||||||
Requires: vitastor-mon = %{version}-%{release}
|
|
||||||
Requires: vitastor-client = %{version}-%{release}
|
|
||||||
Requires: vitastor-client-devel = %{version}-%{release}
|
|
||||||
Requires: vitastor-fio = %{version}-%{release}
|
|
||||||
|
|
||||||
%description
|
|
||||||
Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
|
|
||||||
architecturally similar to Ceph which means strong consistency, primary-replication,
|
|
||||||
symmetric clustering and automatic data distribution over any number of drives of any
|
|
||||||
size with configurable redundancy (replication or erasure codes/XOR).
|
|
||||||
|
|
||||||
|
|
||||||
%package -n vitastor-osd
|
|
||||||
Summary: Vitastor - OSD
|
|
||||||
Requires: vitastor-client = %{version}-%{release}
|
|
||||||
Requires: util-linux
|
|
||||||
Requires: parted
|
|
||||||
|
|
||||||
|
|
||||||
%description -n vitastor-osd
|
|
||||||
Vitastor object storage daemon, i.e. server program that stores data.
|
|
||||||
|
|
||||||
|
|
||||||
%package -n vitastor-mon
|
|
||||||
Summary: Vitastor - monitor
|
|
||||||
Requires: nodejs >= 10
|
|
||||||
Requires: lpsolve
|
|
||||||
|
|
||||||
|
|
||||||
%description -n vitastor-mon
|
|
||||||
Vitastor monitor, i.e. server program responsible for watching cluster state and
|
|
||||||
scheduling cluster-level operations.
|
|
||||||
|
|
||||||
|
|
||||||
%package -n vitastor-client
|
|
||||||
Summary: Vitastor - client
|
|
||||||
|
|
||||||
|
|
||||||
%description -n vitastor-client
|
|
||||||
Vitastor client library and command-line interface.
|
|
||||||
|
|
||||||
|
|
||||||
%package -n vitastor-client-devel
|
|
||||||
Summary: Vitastor - development files
|
|
||||||
Group: Development/Libraries
|
|
||||||
Requires: vitastor-client = %{version}-%{release}
|
|
||||||
|
|
||||||
|
|
||||||
%description -n vitastor-client-devel
|
|
||||||
Vitastor library headers for development.
|
|
||||||
|
|
||||||
|
|
||||||
%package -n vitastor-fio
|
|
||||||
Summary: Vitastor - fio drivers
|
|
||||||
Group: Development/Libraries
|
|
||||||
Requires: vitastor-client = %{version}-%{release}
|
|
||||||
Requires: fio = 3.27-7.el9
|
|
||||||
|
|
||||||
|
|
||||||
%description -n vitastor-fio
|
|
||||||
Vitastor fio drivers for benchmarking.
|
|
||||||
|
|
||||||
|
|
||||||
%prep
|
|
||||||
%setup -q
|
|
||||||
|
|
||||||
|
|
||||||
%build
|
|
||||||
%cmake
|
|
||||||
%cmake_build
|
|
||||||
|
|
||||||
|
|
||||||
%install
|
|
||||||
rm -rf $RPM_BUILD_ROOT
|
|
||||||
%cmake_install
|
|
||||||
cd mon
|
|
||||||
npm install
|
|
||||||
cd ..
|
|
||||||
mkdir -p %buildroot/usr/lib/vitastor
|
|
||||||
cp -r mon %buildroot/usr/lib/vitastor
|
|
||||||
mkdir -p %buildroot/lib/systemd/system
|
|
||||||
cp mon/vitastor.target mon/vitastor-mon.service mon/vitastor-osd@.service %buildroot/lib/systemd/system
|
|
||||||
mkdir -p %buildroot/lib/udev/rules.d
|
|
||||||
cp mon/90-vitastor.rules %buildroot/lib/udev/rules.d
|
|
||||||
|
|
||||||
|
|
||||||
%files
|
|
||||||
%doc GPL-2.0.txt VNPL-1.1.txt README.md README-ru.md
|
|
||||||
|
|
||||||
|
|
||||||
%files -n vitastor-osd
|
|
||||||
%_bindir/vitastor-osd
|
|
||||||
%_bindir/vitastor-disk
|
|
||||||
%_bindir/vitastor-dump-journal
|
|
||||||
/lib/systemd/system/vitastor-osd@.service
|
|
||||||
/lib/systemd/system/vitastor.target
|
|
||||||
/lib/udev/rules.d/90-vitastor.rules
|
|
||||||
|
|
||||||
|
|
||||||
%pre -n vitastor-osd
|
|
||||||
groupadd -r -f vitastor 2>/dev/null ||:
|
|
||||||
useradd -r -g vitastor -s /sbin/nologin -c "Vitastor daemons" -M -d /nonexistent vitastor 2>/dev/null ||:
|
|
||||||
install -o vitastor -g vitastor -d /var/log/vitastor
|
|
||||||
mkdir -p /etc/vitastor
|
|
||||||
|
|
||||||
|
|
||||||
%files -n vitastor-mon
|
|
||||||
/usr/lib/vitastor/mon
|
|
||||||
/lib/systemd/system/vitastor-mon.service
|
|
||||||
|
|
||||||
|
|
||||||
%pre -n vitastor-mon
|
|
||||||
groupadd -r -f vitastor 2>/dev/null ||:
|
|
||||||
useradd -r -g vitastor -s /sbin/nologin -c "Vitastor daemons" -M -d /nonexistent vitastor 2>/dev/null ||:
|
|
||||||
mkdir -p /etc/vitastor
|
|
||||||
|
|
||||||
|
|
||||||
%files -n vitastor-client
|
|
||||||
%_bindir/vitastor-nbd
|
|
||||||
%_bindir/vitastor-nfs
|
|
||||||
%_bindir/vitastor-cli
|
|
||||||
%_bindir/vitastor-rm
|
|
||||||
%_bindir/vita
|
|
||||||
%_libdir/libvitastor_blk.so*
|
|
||||||
%_libdir/libvitastor_client.so*
|
|
||||||
|
|
||||||
|
|
||||||
%files -n vitastor-client-devel
|
|
||||||
%_includedir/vitastor_c.h
|
|
||||||
%_libdir/pkgconfig
|
|
||||||
|
|
||||||
|
|
||||||
%files -n vitastor-fio
|
|
||||||
%_libdir/libfio_vitastor.so
|
|
||||||
%_libdir/libfio_vitastor_blk.so
|
|
||||||
%_libdir/libfio_vitastor_sec.so
|
|
||||||
|
|
||||||
|
|
||||||
%changelog
|
|
@@ -1,9 +1,8 @@
|
|||||||
cmake_minimum_required(VERSION 2.8.12)
|
cmake_minimum_required(VERSION 2.8)
|
||||||
|
|
||||||
project(vitastor)
|
project(vitastor)
|
||||||
|
|
||||||
include(GNUInstallDirs)
|
include(GNUInstallDirs)
|
||||||
include(CTest)
|
|
||||||
|
|
||||||
set(WITH_QEMU false CACHE BOOL "Build QEMU driver inside Vitastor source tree")
|
set(WITH_QEMU false CACHE BOOL "Build QEMU driver inside Vitastor source tree")
|
||||||
set(WITH_FIO true CACHE BOOL "Build FIO driver")
|
set(WITH_FIO true CACHE BOOL "Build FIO driver")
|
||||||
@@ -16,7 +15,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
|||||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
add_definitions(-DVERSION="0.8.8")
|
add_definitions(-DVERSION="0.8.3")
|
||||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
|
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
|
||||||
if (${WITH_ASAN})
|
if (${WITH_ASAN})
|
||||||
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
||||||
@@ -56,14 +55,6 @@ if (ISAL_LIBRARIES)
|
|||||||
add_definitions(-DWITH_ISAL)
|
add_definitions(-DWITH_ISAL)
|
||||||
endif (ISAL_LIBRARIES)
|
endif (ISAL_LIBRARIES)
|
||||||
|
|
||||||
add_custom_target(build_tests)
|
|
||||||
add_custom_target(test
|
|
||||||
COMMAND
|
|
||||||
echo leak:tcmalloc > ${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt &&
|
|
||||||
env LSAN_OPTIONS=suppressions=${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt ${CMAKE_CTEST_COMMAND}
|
|
||||||
)
|
|
||||||
add_dependencies(test build_tests)
|
|
||||||
|
|
||||||
include_directories(
|
include_directories(
|
||||||
../
|
../
|
||||||
/usr/include/jerasure
|
/usr/include/jerasure
|
||||||
@@ -154,6 +145,7 @@ add_library(vitastor_client SHARED
|
|||||||
set_target_properties(vitastor_client PROPERTIES PUBLIC_HEADER "vitastor_c.h")
|
set_target_properties(vitastor_client PROPERTIES PUBLIC_HEADER "vitastor_c.h")
|
||||||
target_link_libraries(vitastor_client
|
target_link_libraries(vitastor_client
|
||||||
vitastor_common
|
vitastor_common
|
||||||
|
tcmalloc_minimal
|
||||||
${LIBURING_LIBRARIES}
|
${LIBURING_LIBRARIES}
|
||||||
${IBVERBS_LIBRARIES}
|
${IBVERBS_LIBRARIES}
|
||||||
)
|
)
|
||||||
@@ -243,17 +235,14 @@ add_executable(osd_test osd_test.cpp rw_blocking.cpp addr_util.cpp)
|
|||||||
target_link_libraries(osd_test tcmalloc_minimal)
|
target_link_libraries(osd_test tcmalloc_minimal)
|
||||||
|
|
||||||
# osd_rmw_test
|
# osd_rmw_test
|
||||||
add_executable(osd_rmw_test EXCLUDE_FROM_ALL osd_rmw_test.cpp allocator.cpp)
|
# FIXME: Move to tests
|
||||||
|
add_executable(osd_rmw_test osd_rmw_test.cpp allocator.cpp)
|
||||||
target_link_libraries(osd_rmw_test Jerasure ${ISAL_LIBRARIES} tcmalloc_minimal)
|
target_link_libraries(osd_rmw_test Jerasure ${ISAL_LIBRARIES} tcmalloc_minimal)
|
||||||
add_dependencies(build_tests osd_rmw_test)
|
|
||||||
add_test(NAME osd_rmw_test COMMAND osd_rmw_test)
|
|
||||||
|
|
||||||
if (ISAL_LIBRARIES)
|
if (ISAL_LIBRARIES)
|
||||||
add_executable(osd_rmw_test_je EXCLUDE_FROM_ALL osd_rmw_test.cpp allocator.cpp)
|
add_executable(osd_rmw_test_je osd_rmw_test.cpp allocator.cpp)
|
||||||
target_compile_definitions(osd_rmw_test_je PUBLIC -DNO_ISAL)
|
target_compile_definitions(osd_rmw_test_je PUBLIC -DNO_ISAL)
|
||||||
target_link_libraries(osd_rmw_test_je Jerasure tcmalloc_minimal)
|
target_link_libraries(osd_rmw_test_je Jerasure tcmalloc_minimal)
|
||||||
add_dependencies(build_tests osd_rmw_test_je)
|
|
||||||
add_test(NAME osd_rmw_test_jerasure COMMAND osd_rmw_test_je)
|
|
||||||
endif (ISAL_LIBRARIES)
|
endif (ISAL_LIBRARIES)
|
||||||
|
|
||||||
# stub_uring_osd
|
# stub_uring_osd
|
||||||
@@ -268,15 +257,11 @@ target_link_libraries(stub_uring_osd
|
|||||||
)
|
)
|
||||||
|
|
||||||
# osd_peering_pg_test
|
# osd_peering_pg_test
|
||||||
add_executable(osd_peering_pg_test EXCLUDE_FROM_ALL osd_peering_pg_test.cpp osd_peering_pg.cpp)
|
add_executable(osd_peering_pg_test osd_peering_pg_test.cpp osd_peering_pg.cpp)
|
||||||
target_link_libraries(osd_peering_pg_test tcmalloc_minimal)
|
target_link_libraries(osd_peering_pg_test tcmalloc_minimal)
|
||||||
add_dependencies(build_tests osd_peering_pg_test)
|
|
||||||
add_test(NAME osd_peering_pg_test COMMAND osd_peering_pg_test)
|
|
||||||
|
|
||||||
# test_allocator
|
# test_allocator
|
||||||
add_executable(test_allocator EXCLUDE_FROM_ALL test_allocator.cpp allocator.cpp)
|
add_executable(test_allocator test_allocator.cpp allocator.cpp)
|
||||||
add_dependencies(build_tests test_allocator)
|
|
||||||
add_test(NAME test_allocator COMMAND test_allocator)
|
|
||||||
|
|
||||||
# test_cas
|
# test_cas
|
||||||
add_executable(test_cas
|
add_executable(test_cas
|
||||||
@@ -296,15 +281,12 @@ target_link_libraries(test_crc32
|
|||||||
|
|
||||||
# test_cluster_client
|
# test_cluster_client
|
||||||
add_executable(test_cluster_client
|
add_executable(test_cluster_client
|
||||||
EXCLUDE_FROM_ALL
|
|
||||||
test_cluster_client.cpp
|
test_cluster_client.cpp
|
||||||
pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
|
pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
|
||||||
etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp
|
etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp
|
||||||
)
|
)
|
||||||
target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
|
target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
|
||||||
target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mock)
|
target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mock)
|
||||||
add_dependencies(build_tests test_cluster_client)
|
|
||||||
add_test(NAME test_cluster_client COMMAND test_cluster_client)
|
|
||||||
|
|
||||||
## test_blockstore, test_shit
|
## test_blockstore, test_shit
|
||||||
#add_executable(test_blockstore test_blockstore.cpp)
|
#add_executable(test_blockstore test_blockstore.cpp)
|
||||||
|
@@ -13,11 +13,6 @@ blockstore_t::~blockstore_t()
|
|||||||
delete impl;
|
delete impl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void blockstore_t::parse_config(blockstore_config_t & config)
|
|
||||||
{
|
|
||||||
impl->parse_config(config, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
void blockstore_t::loop()
|
void blockstore_t::loop()
|
||||||
{
|
{
|
||||||
impl->loop();
|
impl->loop();
|
||||||
|
@@ -107,7 +107,7 @@ Input:
|
|||||||
- buf = pre-allocated obj_ver_id array <len> units long
|
- buf = pre-allocated obj_ver_id array <len> units long
|
||||||
|
|
||||||
Output:
|
Output:
|
||||||
- retval = 0 or negative error number (-ENOENT if no such version for stabilize)
|
- retval = 0 or negative error number (-EINVAL, -ENOENT if no such version or -EBUSY if not synced)
|
||||||
|
|
||||||
## BS_OP_SYNC_STAB_ALL
|
## BS_OP_SYNC_STAB_ALL
|
||||||
|
|
||||||
@@ -165,9 +165,6 @@ public:
|
|||||||
blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
|
blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
|
||||||
~blockstore_t();
|
~blockstore_t();
|
||||||
|
|
||||||
// Update configuration
|
|
||||||
void parse_config(blockstore_config_t & config);
|
|
||||||
|
|
||||||
// Event loop
|
// Event loop
|
||||||
void loop();
|
void loop();
|
||||||
|
|
||||||
|
@@ -932,7 +932,7 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
|
|||||||
resume_1:
|
resume_1:
|
||||||
if (!cur_sync->state)
|
if (!cur_sync->state)
|
||||||
{
|
{
|
||||||
if (flusher->syncing_flushers >= flusher->active_flushers || !flusher->flush_queue.size())
|
if (flusher->syncing_flushers >= flusher->cur_flusher_count || !flusher->flush_queue.size())
|
||||||
{
|
{
|
||||||
// Sync batch is ready. Do it.
|
// Sync batch is ready. Do it.
|
||||||
await_sqe(0);
|
await_sqe(0);
|
||||||
|
@@ -11,7 +11,7 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
|
|||||||
ring_consumer.loop = [this]() { loop(); };
|
ring_consumer.loop = [this]() { loop(); };
|
||||||
ringloop->register_consumer(&ring_consumer);
|
ringloop->register_consumer(&ring_consumer);
|
||||||
initialized = 0;
|
initialized = 0;
|
||||||
parse_config(config, true);
|
parse_config(config);
|
||||||
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.data_block_size);
|
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.data_block_size);
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
@@ -171,7 +171,7 @@ void blockstore_impl_t::loop()
|
|||||||
// Can't submit SYNC before previous writes
|
// Can't submit SYNC before previous writes
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
wr_st = continue_sync(op);
|
wr_st = continue_sync(op, false);
|
||||||
if (wr_st != 2)
|
if (wr_st != 2)
|
||||||
{
|
{
|
||||||
has_writes = wr_st > 0 ? 1 : 2;
|
has_writes = wr_st > 0 ? 1 : 2;
|
||||||
@@ -325,7 +325,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
|
|||||||
{
|
{
|
||||||
// Basic verification not passed
|
// Basic verification not passed
|
||||||
op->retval = -EINVAL;
|
op->retval = -EINVAL;
|
||||||
ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
|
std::function<void (blockstore_op_t*)>(op->callback)(op);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (op->opcode == BS_OP_SYNC_STAB_ALL)
|
if (op->opcode == BS_OP_SYNC_STAB_ALL)
|
||||||
@@ -368,21 +368,16 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
|
|||||||
}
|
}
|
||||||
if ((op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE) && !enqueue_write(op))
|
if ((op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE) && !enqueue_write(op))
|
||||||
{
|
{
|
||||||
ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
|
std::function<void (blockstore_op_t*)>(op->callback)(op);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
init_op(op);
|
|
||||||
submit_queue.push_back(op);
|
|
||||||
ringloop->wakeup();
|
|
||||||
}
|
|
||||||
|
|
||||||
void blockstore_impl_t::init_op(blockstore_op_t *op)
|
|
||||||
{
|
|
||||||
// Call constructor without allocating memory. We'll call destructor before returning op back
|
// Call constructor without allocating memory. We'll call destructor before returning op back
|
||||||
new ((void*)op->private_data) blockstore_op_private_t;
|
new ((void*)op->private_data) blockstore_op_private_t;
|
||||||
PRIV(op)->wait_for = 0;
|
PRIV(op)->wait_for = 0;
|
||||||
PRIV(op)->op_state = 0;
|
PRIV(op)->op_state = 0;
|
||||||
PRIV(op)->pending_ops = 0;
|
PRIV(op)->pending_ops = 0;
|
||||||
|
submit_queue.push_back(op);
|
||||||
|
ringloop->wakeup();
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool replace_stable(object_id oid, uint64_t version, int search_start, int search_end, obj_ver_id* list)
|
static bool replace_stable(object_id oid, uint64_t version, int search_start, int search_end, obj_ver_id* list)
|
||||||
|
@@ -216,11 +216,6 @@ struct pool_shard_settings_t
|
|||||||
uint32_t pg_stripe_size;
|
uint32_t pg_stripe_size;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define STAB_SPLIT_DONE 1
|
|
||||||
#define STAB_SPLIT_WAIT 2
|
|
||||||
#define STAB_SPLIT_SYNC 3
|
|
||||||
#define STAB_SPLIT_TODO 4
|
|
||||||
|
|
||||||
class blockstore_impl_t
|
class blockstore_impl_t
|
||||||
{
|
{
|
||||||
blockstore_disk_t dsk;
|
blockstore_disk_t dsk;
|
||||||
@@ -282,6 +277,7 @@ class blockstore_impl_t
|
|||||||
friend class journal_flusher_t;
|
friend class journal_flusher_t;
|
||||||
friend class journal_flusher_co;
|
friend class journal_flusher_co;
|
||||||
|
|
||||||
|
void parse_config(blockstore_config_t & config);
|
||||||
void calc_lengths();
|
void calc_lengths();
|
||||||
void open_data();
|
void open_data();
|
||||||
void open_meta();
|
void open_meta();
|
||||||
@@ -303,7 +299,6 @@ class blockstore_impl_t
|
|||||||
blockstore_init_journal* journal_init_reader;
|
blockstore_init_journal* journal_init_reader;
|
||||||
|
|
||||||
void check_wait(blockstore_op_t *op);
|
void check_wait(blockstore_op_t *op);
|
||||||
void init_op(blockstore_op_t *op);
|
|
||||||
|
|
||||||
// Read
|
// Read
|
||||||
int dequeue_read(blockstore_op_t *read_op);
|
int dequeue_read(blockstore_op_t *read_op);
|
||||||
@@ -323,7 +318,7 @@ class blockstore_impl_t
|
|||||||
void handle_write_event(ring_data_t *data, blockstore_op_t *op);
|
void handle_write_event(ring_data_t *data, blockstore_op_t *op);
|
||||||
|
|
||||||
// Sync
|
// Sync
|
||||||
int continue_sync(blockstore_op_t *op);
|
int continue_sync(blockstore_op_t *op, bool queue_has_in_progress_sync);
|
||||||
void ack_sync(blockstore_op_t *op);
|
void ack_sync(blockstore_op_t *op);
|
||||||
|
|
||||||
// Stabilize
|
// Stabilize
|
||||||
@@ -331,8 +326,6 @@ class blockstore_impl_t
|
|||||||
int continue_stable(blockstore_op_t *op);
|
int continue_stable(blockstore_op_t *op);
|
||||||
void mark_stable(const obj_ver_id & ov, bool forget_dirty = false);
|
void mark_stable(const obj_ver_id & ov, bool forget_dirty = false);
|
||||||
void stabilize_object(object_id oid, uint64_t max_ver);
|
void stabilize_object(object_id oid, uint64_t max_ver);
|
||||||
blockstore_op_t* selective_sync(blockstore_op_t *op);
|
|
||||||
int split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider);
|
|
||||||
|
|
||||||
// Rollback
|
// Rollback
|
||||||
int dequeue_rollback(blockstore_op_t *op);
|
int dequeue_rollback(blockstore_op_t *op);
|
||||||
@@ -348,8 +341,6 @@ public:
|
|||||||
blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
|
blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
|
||||||
~blockstore_impl_t();
|
~blockstore_impl_t();
|
||||||
|
|
||||||
void parse_config(blockstore_config_t & config, bool init);
|
|
||||||
|
|
||||||
// Event loop
|
// Event loop
|
||||||
void loop();
|
void loop();
|
||||||
|
|
||||||
|
@@ -4,54 +4,8 @@
|
|||||||
#include <sys/file.h>
|
#include <sys/file.h>
|
||||||
#include "blockstore_impl.h"
|
#include "blockstore_impl.h"
|
||||||
|
|
||||||
void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
void blockstore_impl_t::parse_config(blockstore_config_t & config)
|
||||||
{
|
{
|
||||||
// Online-configurable options:
|
|
||||||
max_flusher_count = strtoull(config["max_flusher_count"].c_str(), NULL, 10);
|
|
||||||
if (!max_flusher_count)
|
|
||||||
{
|
|
||||||
max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
|
|
||||||
}
|
|
||||||
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
|
|
||||||
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
|
|
||||||
throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
|
|
||||||
throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
|
|
||||||
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
|
|
||||||
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
|
|
||||||
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
|
|
||||||
if (!max_flusher_count)
|
|
||||||
{
|
|
||||||
max_flusher_count = 256;
|
|
||||||
}
|
|
||||||
if (!min_flusher_count || journal.flush_journal)
|
|
||||||
{
|
|
||||||
min_flusher_count = 1;
|
|
||||||
}
|
|
||||||
if (!max_write_iodepth)
|
|
||||||
{
|
|
||||||
max_write_iodepth = 128;
|
|
||||||
}
|
|
||||||
if (!throttle_target_iops)
|
|
||||||
{
|
|
||||||
throttle_target_iops = 100;
|
|
||||||
}
|
|
||||||
if (!throttle_target_mbs)
|
|
||||||
{
|
|
||||||
throttle_target_mbs = 100;
|
|
||||||
}
|
|
||||||
if (!throttle_target_parallelism)
|
|
||||||
{
|
|
||||||
throttle_target_parallelism = 1;
|
|
||||||
}
|
|
||||||
if (!throttle_threshold_us)
|
|
||||||
{
|
|
||||||
throttle_threshold_us = 50;
|
|
||||||
}
|
|
||||||
if (!init)
|
|
||||||
{
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
// Offline-configurable options:
|
|
||||||
// Common disk options
|
// Common disk options
|
||||||
dsk.parse_config(config);
|
dsk.parse_config(config);
|
||||||
// Parse
|
// Parse
|
||||||
@@ -90,7 +44,29 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
|||||||
journal.no_same_sector_overwrites = config["journal_no_same_sector_overwrites"] == "true" ||
|
journal.no_same_sector_overwrites = config["journal_no_same_sector_overwrites"] == "true" ||
|
||||||
config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
|
config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
|
||||||
journal.inmemory = config["inmemory_journal"] != "false";
|
journal.inmemory = config["inmemory_journal"] != "false";
|
||||||
|
max_flusher_count = strtoull(config["max_flusher_count"].c_str(), NULL, 10);
|
||||||
|
if (!max_flusher_count)
|
||||||
|
max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
|
||||||
|
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
|
||||||
|
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
|
||||||
|
throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
|
||||||
|
throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
|
||||||
|
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
|
||||||
|
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
|
||||||
|
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
|
||||||
// Validate
|
// Validate
|
||||||
|
if (!max_flusher_count)
|
||||||
|
{
|
||||||
|
max_flusher_count = 256;
|
||||||
|
}
|
||||||
|
if (!min_flusher_count || journal.flush_journal)
|
||||||
|
{
|
||||||
|
min_flusher_count = 1;
|
||||||
|
}
|
||||||
|
if (!max_write_iodepth)
|
||||||
|
{
|
||||||
|
max_write_iodepth = 128;
|
||||||
|
}
|
||||||
if (journal.sector_count < 2)
|
if (journal.sector_count < 2)
|
||||||
{
|
{
|
||||||
journal.sector_count = 32;
|
journal.sector_count = 32;
|
||||||
@@ -115,6 +91,22 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
|||||||
{
|
{
|
||||||
throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync");
|
throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync");
|
||||||
}
|
}
|
||||||
|
if (!throttle_target_iops)
|
||||||
|
{
|
||||||
|
throttle_target_iops = 100;
|
||||||
|
}
|
||||||
|
if (!throttle_target_mbs)
|
||||||
|
{
|
||||||
|
throttle_target_mbs = 100;
|
||||||
|
}
|
||||||
|
if (!throttle_target_parallelism)
|
||||||
|
{
|
||||||
|
throttle_target_parallelism = 1;
|
||||||
|
}
|
||||||
|
if (!throttle_threshold_us)
|
||||||
|
{
|
||||||
|
throttle_threshold_us = 50;
|
||||||
|
}
|
||||||
// init some fields
|
// init some fields
|
||||||
journal.block_size = dsk.journal_block_size;
|
journal.block_size = dsk.journal_block_size;
|
||||||
journal.next_free = dsk.journal_block_size;
|
journal.next_free = dsk.journal_block_size;
|
||||||
|
@@ -9,39 +9,48 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
|
|||||||
{
|
{
|
||||||
return continue_rollback(op);
|
return continue_rollback(op);
|
||||||
}
|
}
|
||||||
int r = split_stab_op(op, [this](obj_ver_id ov)
|
obj_ver_id *v, *nv;
|
||||||
|
int i, todo = op->len;
|
||||||
|
for (i = 0, v = (obj_ver_id*)op->buf, nv = (obj_ver_id*)op->buf; i < op->len; i++, v++, nv++)
|
||||||
{
|
{
|
||||||
|
if (nv != v)
|
||||||
|
{
|
||||||
|
*nv = *v;
|
||||||
|
}
|
||||||
// Check that there are some versions greater than v->version (which may be zero),
|
// Check that there are some versions greater than v->version (which may be zero),
|
||||||
// check that they're unstable, synced, and not currently written to
|
// check that they're unstable, synced, and not currently written to
|
||||||
auto dirty_it = dirty_db.lower_bound((obj_ver_id){
|
auto dirty_it = dirty_db.lower_bound((obj_ver_id){
|
||||||
.oid = ov.oid,
|
.oid = v->oid,
|
||||||
.version = UINT64_MAX,
|
.version = UINT64_MAX,
|
||||||
});
|
});
|
||||||
if (dirty_it == dirty_db.begin())
|
if (dirty_it == dirty_db.begin())
|
||||||
{
|
{
|
||||||
|
skip_ov:
|
||||||
// Already rolled back, skip this object version
|
// Already rolled back, skip this object version
|
||||||
return STAB_SPLIT_DONE;
|
todo--;
|
||||||
|
nv--;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
dirty_it--;
|
dirty_it--;
|
||||||
if (dirty_it->first.oid != ov.oid || dirty_it->first.version < ov.version)
|
if (dirty_it->first.oid != v->oid || dirty_it->first.version < v->version)
|
||||||
{
|
{
|
||||||
// Already rolled back, skip this object version
|
goto skip_ov;
|
||||||
return STAB_SPLIT_DONE;
|
|
||||||
}
|
}
|
||||||
while (dirty_it->first.oid == ov.oid && dirty_it->first.version > ov.version)
|
while (dirty_it->first.oid == v->oid && dirty_it->first.version > v->version)
|
||||||
{
|
{
|
||||||
if (IS_IN_FLIGHT(dirty_it->second.state))
|
if (IS_IN_FLIGHT(dirty_it->second.state))
|
||||||
{
|
{
|
||||||
// Object write is still in progress. Wait until the write request completes
|
// Object write is still in progress. Wait until the write request completes
|
||||||
return STAB_SPLIT_WAIT;
|
return 0;
|
||||||
}
|
}
|
||||||
else if (!IS_SYNCED(dirty_it->second.state) ||
|
else if (!IS_SYNCED(dirty_it->second.state) ||
|
||||||
IS_STABLE(dirty_it->second.state))
|
IS_STABLE(dirty_it->second.state))
|
||||||
{
|
{
|
||||||
// Sync the object
|
op->retval = -EBUSY;
|
||||||
return STAB_SPLIT_SYNC;
|
FINISH_OP(op);
|
||||||
|
return 2;
|
||||||
}
|
}
|
||||||
if (dirty_it == dirty_db.begin())
|
if (dirty_it == dirty_db.begin())
|
||||||
{
|
{
|
||||||
@@ -49,16 +58,19 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
|
|||||||
}
|
}
|
||||||
dirty_it--;
|
dirty_it--;
|
||||||
}
|
}
|
||||||
return STAB_SPLIT_TODO;
|
|
||||||
}
|
}
|
||||||
});
|
}
|
||||||
if (r != 1)
|
op->len = todo;
|
||||||
|
if (!todo)
|
||||||
{
|
{
|
||||||
return r;
|
// Already rolled back
|
||||||
|
op->retval = 0;
|
||||||
|
FINISH_OP(op);
|
||||||
|
return 2;
|
||||||
}
|
}
|
||||||
// Check journal space
|
// Check journal space
|
||||||
blockstore_journal_check_t space_check(this);
|
blockstore_journal_check_t space_check(this);
|
||||||
if (!space_check.check_available(op, op->len, sizeof(journal_entry_rollback), 0))
|
if (!space_check.check_available(op, todo, sizeof(journal_entry_rollback), 0))
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@@ -66,8 +78,7 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
|
|||||||
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
|
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
|
||||||
// Prepare and submit journal entries
|
// Prepare and submit journal entries
|
||||||
int s = 0;
|
int s = 0;
|
||||||
auto v = (obj_ver_id*)op->buf;
|
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
||||||
for (int i = 0; i < op->len; i++, v++)
|
|
||||||
{
|
{
|
||||||
if (!journal.entry_fits(sizeof(journal_entry_rollback)) &&
|
if (!journal.entry_fits(sizeof(journal_entry_rollback)) &&
|
||||||
journal.sector_info[journal.cur_sector].dirty)
|
journal.sector_info[journal.cur_sector].dirty)
|
||||||
|
@@ -41,309 +41,60 @@
|
|||||||
// 4) after a while it takes his synced object list and sends stabilize requests
|
// 4) after a while it takes his synced object list and sends stabilize requests
|
||||||
// to peers and to its own blockstore, thus freeing the old version
|
// to peers and to its own blockstore, thus freeing the old version
|
||||||
|
|
||||||
struct ver_vector_t
|
|
||||||
{
|
|
||||||
obj_ver_id *items = NULL;
|
|
||||||
uint64_t alloc = 0, size = 0;
|
|
||||||
};
|
|
||||||
|
|
||||||
static void init_versions(ver_vector_t & vec, obj_ver_id *start, obj_ver_id *end, uint64_t len)
|
|
||||||
{
|
|
||||||
if (!vec.items)
|
|
||||||
{
|
|
||||||
vec.alloc = len;
|
|
||||||
vec.items = (obj_ver_id*)malloc_or_die(sizeof(obj_ver_id) * vec.alloc);
|
|
||||||
for (auto sv = start; sv < end; sv++)
|
|
||||||
{
|
|
||||||
vec.items[vec.size++] = *sv;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void append_version(ver_vector_t & vec, obj_ver_id ov)
|
|
||||||
{
|
|
||||||
if (vec.size >= vec.alloc)
|
|
||||||
{
|
|
||||||
vec.alloc = !vec.alloc ? 4 : vec.alloc*2;
|
|
||||||
vec.items = (obj_ver_id*)realloc_or_die(vec.items, sizeof(obj_ver_id) * vec.alloc);
|
|
||||||
}
|
|
||||||
vec.items[vec.size++] = ov;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool check_unsynced(std::vector<obj_ver_id> & check, obj_ver_id ov, std::vector<obj_ver_id> & to, int *count)
|
|
||||||
{
|
|
||||||
bool found = false;
|
|
||||||
int j = 0, k = 0;
|
|
||||||
while (j < check.size())
|
|
||||||
{
|
|
||||||
if (check[j] == ov)
|
|
||||||
found = true;
|
|
||||||
if (check[j].oid == ov.oid && check[j].version <= ov.version)
|
|
||||||
{
|
|
||||||
to.push_back(check[j++]);
|
|
||||||
if (count)
|
|
||||||
(*count)--;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
check[k++] = check[j++];
|
|
||||||
}
|
|
||||||
check.resize(k);
|
|
||||||
return found;
|
|
||||||
}
|
|
||||||
|
|
||||||
blockstore_op_t* blockstore_impl_t::selective_sync(blockstore_op_t *op)
|
|
||||||
{
|
|
||||||
unsynced_big_write_count -= unsynced_big_writes.size();
|
|
||||||
unsynced_big_writes.swap(PRIV(op)->sync_big_writes);
|
|
||||||
unsynced_big_write_count += unsynced_big_writes.size();
|
|
||||||
unsynced_small_writes.swap(PRIV(op)->sync_small_writes);
|
|
||||||
// Create a sync operation, insert into the end of the queue
|
|
||||||
// And move ourselves into the end too!
|
|
||||||
// Rather hacky but that's what we need...
|
|
||||||
blockstore_op_t *sync_op = new blockstore_op_t;
|
|
||||||
sync_op->opcode = BS_OP_SYNC;
|
|
||||||
sync_op->buf = NULL;
|
|
||||||
sync_op->callback = [this](blockstore_op_t *sync_op)
|
|
||||||
{
|
|
||||||
delete sync_op;
|
|
||||||
};
|
|
||||||
init_op(sync_op);
|
|
||||||
int sync_res = continue_sync(sync_op);
|
|
||||||
if (sync_res != 2)
|
|
||||||
{
|
|
||||||
// Put SYNC into the queue if it's not finished yet
|
|
||||||
submit_queue.push_back(sync_op);
|
|
||||||
}
|
|
||||||
// Restore unsynced_writes
|
|
||||||
unsynced_small_writes.swap(PRIV(op)->sync_small_writes);
|
|
||||||
unsynced_big_write_count -= unsynced_big_writes.size();
|
|
||||||
unsynced_big_writes.swap(PRIV(op)->sync_big_writes);
|
|
||||||
unsynced_big_write_count += unsynced_big_writes.size();
|
|
||||||
if (sync_res == 2)
|
|
||||||
{
|
|
||||||
// Sync is immediately completed
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
return sync_op;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns: 2 = stop processing and dequeue, 0 = stop processing and do not dequeue, 1 = proceed with op itself
|
|
||||||
int blockstore_impl_t::split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider)
|
|
||||||
{
|
|
||||||
bool add_sync = false;
|
|
||||||
ver_vector_t good_vers, bad_vers;
|
|
||||||
obj_ver_id* v;
|
|
||||||
int i, todo = 0;
|
|
||||||
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
|
||||||
{
|
|
||||||
int action = decider(*v);
|
|
||||||
if (action < 0)
|
|
||||||
{
|
|
||||||
// Rollback changes
|
|
||||||
for (auto & ov: PRIV(op)->sync_big_writes)
|
|
||||||
{
|
|
||||||
unsynced_big_writes.push_back(ov);
|
|
||||||
unsynced_big_write_count++;
|
|
||||||
}
|
|
||||||
for (auto & ov: PRIV(op)->sync_small_writes)
|
|
||||||
{
|
|
||||||
unsynced_small_writes.push_back(ov);
|
|
||||||
}
|
|
||||||
free(good_vers.items);
|
|
||||||
good_vers.items = NULL;
|
|
||||||
free(bad_vers.items);
|
|
||||||
bad_vers.items = NULL;
|
|
||||||
// Error
|
|
||||||
op->retval = action;
|
|
||||||
FINISH_OP(op);
|
|
||||||
return 2;
|
|
||||||
}
|
|
||||||
else if (action == STAB_SPLIT_DONE)
|
|
||||||
{
|
|
||||||
// Already done
|
|
||||||
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
|
|
||||||
}
|
|
||||||
else if (action == STAB_SPLIT_WAIT)
|
|
||||||
{
|
|
||||||
// Already in progress, we just have to wait until it finishes
|
|
||||||
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
|
|
||||||
append_version(bad_vers, *v);
|
|
||||||
}
|
|
||||||
else if (action == STAB_SPLIT_SYNC)
|
|
||||||
{
|
|
||||||
// Needs a SYNC, we have to send a SYNC if not already in progress
|
|
||||||
//
|
|
||||||
// If the object is not present in unsynced_(big|small)_writes then
|
|
||||||
// it's currently being synced. If it's present then we can initiate
|
|
||||||
// its sync ourselves.
|
|
||||||
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
|
|
||||||
append_version(bad_vers, *v);
|
|
||||||
if (!add_sync)
|
|
||||||
{
|
|
||||||
PRIV(op)->sync_big_writes.clear();
|
|
||||||
PRIV(op)->sync_small_writes.clear();
|
|
||||||
add_sync = true;
|
|
||||||
}
|
|
||||||
check_unsynced(unsynced_small_writes, *v, PRIV(op)->sync_small_writes, NULL);
|
|
||||||
check_unsynced(unsynced_big_writes, *v, PRIV(op)->sync_big_writes, &unsynced_big_write_count);
|
|
||||||
}
|
|
||||||
else /* if (action == STAB_SPLIT_TODO) */
|
|
||||||
{
|
|
||||||
if (good_vers.items)
|
|
||||||
{
|
|
||||||
// If we're selecting versions then append it
|
|
||||||
// Main idea is that 99% of the time all versions passed to BS_OP_STABLE are synced
|
|
||||||
// And we don't want to select/allocate anything in that optimistic case
|
|
||||||
append_version(good_vers, *v);
|
|
||||||
}
|
|
||||||
todo++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// In a pessimistic scenario, an operation may be split into 3:
|
|
||||||
// - Stabilize synced entries
|
|
||||||
// - Sync unsynced entries
|
|
||||||
// - Continue for unsynced entries after sync
|
|
||||||
add_sync = add_sync && (PRIV(op)->sync_big_writes.size() || PRIV(op)->sync_small_writes.size());
|
|
||||||
if (!todo && !bad_vers.size)
|
|
||||||
{
|
|
||||||
// Already stable
|
|
||||||
op->retval = 0;
|
|
||||||
FINISH_OP(op);
|
|
||||||
return 2;
|
|
||||||
}
|
|
||||||
op->retval = 0;
|
|
||||||
if (!todo && !add_sync)
|
|
||||||
{
|
|
||||||
// Only wait for inflight writes or current in-progress syncs
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
blockstore_op_t *sync_op = NULL, *split_stab_op = NULL;
|
|
||||||
if (add_sync)
|
|
||||||
{
|
|
||||||
// Initiate a selective sync for PRIV(op)->sync_(big|small)_writes
|
|
||||||
sync_op = selective_sync(op);
|
|
||||||
}
|
|
||||||
if (bad_vers.size)
|
|
||||||
{
|
|
||||||
// Split part of the request into a separate operation
|
|
||||||
split_stab_op = new blockstore_op_t;
|
|
||||||
split_stab_op->opcode = op->opcode;
|
|
||||||
split_stab_op->buf = bad_vers.items;
|
|
||||||
split_stab_op->len = bad_vers.size;
|
|
||||||
init_op(split_stab_op);
|
|
||||||
submit_queue.push_back(split_stab_op);
|
|
||||||
}
|
|
||||||
if (sync_op || split_stab_op || good_vers.items)
|
|
||||||
{
|
|
||||||
void *orig_buf = op->buf;
|
|
||||||
if (good_vers.items)
|
|
||||||
{
|
|
||||||
op->buf = good_vers.items;
|
|
||||||
op->len = good_vers.size;
|
|
||||||
}
|
|
||||||
// Make a wrapped callback
|
|
||||||
int *split_op_counter = (int*)malloc_or_die(sizeof(int));
|
|
||||||
*split_op_counter = (sync_op ? 1 : 0) + (split_stab_op ? 1 : 0) + (todo ? 1 : 0);
|
|
||||||
auto cb = [this, op, good_items = good_vers.items,
|
|
||||||
bad_items = bad_vers.items, split_op_counter,
|
|
||||||
orig_buf, real_cb = op->callback](blockstore_op_t *split_op)
|
|
||||||
{
|
|
||||||
if (split_op->retval != 0)
|
|
||||||
op->retval = split_op->retval;
|
|
||||||
(*split_op_counter)--;
|
|
||||||
assert((*split_op_counter) >= 0);
|
|
||||||
if (op != split_op)
|
|
||||||
delete split_op;
|
|
||||||
if (!*split_op_counter)
|
|
||||||
{
|
|
||||||
free(good_items);
|
|
||||||
free(bad_items);
|
|
||||||
free(split_op_counter);
|
|
||||||
op->buf = orig_buf;
|
|
||||||
real_cb(op);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
if (sync_op)
|
|
||||||
{
|
|
||||||
sync_op->callback = cb;
|
|
||||||
}
|
|
||||||
if (split_stab_op)
|
|
||||||
{
|
|
||||||
split_stab_op->callback = cb;
|
|
||||||
}
|
|
||||||
op->callback = cb;
|
|
||||||
}
|
|
||||||
if (!todo)
|
|
||||||
{
|
|
||||||
// All work is postponed
|
|
||||||
op->callback = NULL;
|
|
||||||
return 2;
|
|
||||||
}
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
||||||
{
|
{
|
||||||
if (PRIV(op)->op_state)
|
if (PRIV(op)->op_state)
|
||||||
{
|
{
|
||||||
return continue_stable(op);
|
return continue_stable(op);
|
||||||
}
|
}
|
||||||
int r = split_stab_op(op, [this](obj_ver_id ov)
|
obj_ver_id* v;
|
||||||
|
int i, todo = 0;
|
||||||
|
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
||||||
{
|
{
|
||||||
auto dirty_it = dirty_db.find(ov);
|
auto dirty_it = dirty_db.find(*v);
|
||||||
if (dirty_it == dirty_db.end())
|
if (dirty_it == dirty_db.end())
|
||||||
{
|
{
|
||||||
auto & clean_db = clean_db_shard(ov.oid);
|
auto & clean_db = clean_db_shard(v->oid);
|
||||||
auto clean_it = clean_db.find(ov.oid);
|
auto clean_it = clean_db.find(v->oid);
|
||||||
if (clean_it == clean_db.end() || clean_it->second.version < ov.version)
|
if (clean_it == clean_db.end() || clean_it->second.version < v->version)
|
||||||
{
|
{
|
||||||
// No such object version
|
// No such object version
|
||||||
printf("Error: %lx:%lx v%lu not found while stabilizing\n", ov.oid.inode, ov.oid.stripe, ov.version);
|
op->retval = -ENOENT;
|
||||||
return -ENOENT;
|
FINISH_OP(op);
|
||||||
|
return 2;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Already stable
|
// Already stable
|
||||||
return STAB_SPLIT_DONE;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (IS_IN_FLIGHT(dirty_it->second.state))
|
else if (IS_IN_FLIGHT(dirty_it->second.state))
|
||||||
{
|
{
|
||||||
// Object write is still in progress. Wait until the write request completes
|
// Object write is still in progress. Wait until the write request completes
|
||||||
return STAB_SPLIT_WAIT;
|
return 0;
|
||||||
}
|
}
|
||||||
else if (!IS_SYNCED(dirty_it->second.state))
|
else if (!IS_SYNCED(dirty_it->second.state))
|
||||||
{
|
{
|
||||||
// Object not synced yet - sync it
|
// Object not synced yet. Caller must sync it first
|
||||||
// In previous versions we returned EBUSY here and required
|
op->retval = -EBUSY;
|
||||||
// the caller (OSD) to issue a global sync first. But a global sync
|
FINISH_OP(op);
|
||||||
// waits for all writes in the queue including inflight writes. And
|
return 2;
|
||||||
// inflight writes may themselves be blocked by unstable writes being
|
|
||||||
// still present in the journal and not flushed away from it.
|
|
||||||
// So we must sync specific objects here.
|
|
||||||
//
|
|
||||||
// Even more, we have to process "stabilize" request in parts. That is,
|
|
||||||
// we must stabilize all objects which are already synced. Otherwise
|
|
||||||
// they may block objects which are NOT synced yet.
|
|
||||||
return STAB_SPLIT_SYNC;
|
|
||||||
}
|
}
|
||||||
else if (IS_STABLE(dirty_it->second.state))
|
else if (!IS_STABLE(dirty_it->second.state))
|
||||||
{
|
{
|
||||||
// Already stable
|
todo++;
|
||||||
return STAB_SPLIT_DONE;
|
|
||||||
}
|
}
|
||||||
else
|
}
|
||||||
{
|
if (!todo)
|
||||||
return STAB_SPLIT_TODO;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
if (r != 1)
|
|
||||||
{
|
{
|
||||||
return r;
|
// Already stable
|
||||||
|
op->retval = 0;
|
||||||
|
FINISH_OP(op);
|
||||||
|
return 2;
|
||||||
}
|
}
|
||||||
// Check journal space
|
// Check journal space
|
||||||
blockstore_journal_check_t space_check(this);
|
blockstore_journal_check_t space_check(this);
|
||||||
if (!space_check.check_available(op, op->len, sizeof(journal_entry_stable), 0))
|
if (!space_check.check_available(op, todo, sizeof(journal_entry_stable), 0))
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@@ -351,9 +102,9 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
|||||||
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
|
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
|
||||||
// Prepare and submit journal entries
|
// Prepare and submit journal entries
|
||||||
int s = 0;
|
int s = 0;
|
||||||
auto v = (obj_ver_id*)op->buf;
|
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
||||||
for (int i = 0; i < op->len; i++, v++)
|
|
||||||
{
|
{
|
||||||
|
// FIXME: Only stabilize versions that aren't stable yet
|
||||||
if (!journal.entry_fits(sizeof(journal_entry_stable)) &&
|
if (!journal.entry_fits(sizeof(journal_entry_stable)) &&
|
||||||
journal.sector_info[journal.cur_sector].dirty)
|
journal.sector_info[journal.cur_sector].dirty)
|
||||||
{
|
{
|
||||||
|
@@ -12,7 +12,7 @@
|
|||||||
#define SYNC_JOURNAL_SYNC_SENT 7
|
#define SYNC_JOURNAL_SYNC_SENT 7
|
||||||
#define SYNC_DONE 8
|
#define SYNC_DONE 8
|
||||||
|
|
||||||
int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_progress_sync)
|
||||||
{
|
{
|
||||||
if (immediate_commit == IMMEDIATE_ALL)
|
if (immediate_commit == IMMEDIATE_ALL)
|
||||||
{
|
{
|
||||||
@@ -145,7 +145,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
|||||||
PRIV(op)->op_state = SYNC_DONE;
|
PRIV(op)->op_state = SYNC_DONE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (PRIV(op)->op_state == SYNC_DONE)
|
if (PRIV(op)->op_state == SYNC_DONE && !queue_has_in_progress_sync)
|
||||||
{
|
{
|
||||||
ack_sync(op);
|
ack_sync(op);
|
||||||
return 2;
|
return 2;
|
||||||
|
@@ -121,7 +121,8 @@ resume_1:
|
|||||||
}
|
}
|
||||||
if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
|
if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
|
||||||
{
|
{
|
||||||
pool_avail *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
|
uint64_t pg_real_size = pool_stats[pool_cfg.id]["pg_real_size"].uint64_value();
|
||||||
|
pool_avail = pg_real_size > 0 ? pool_avail * (pool_cfg.pg_size - pool_cfg.parity_chunks) / pg_real_size : 0;
|
||||||
}
|
}
|
||||||
pool_stats[pool_cfg.id] = json11::Json::object {
|
pool_stats[pool_cfg.id] = json11::Json::object {
|
||||||
{ "name", pool_cfg.name },
|
{ "name", pool_cfg.name },
|
||||||
|
@@ -403,7 +403,7 @@ struct snap_merger_t
|
|||||||
op->opcode = OSD_OP_READ_BITMAP;
|
op->opcode = OSD_OP_READ_BITMAP;
|
||||||
op->inode = target;
|
op->inode = target;
|
||||||
op->offset = offset;
|
op->offset = offset;
|
||||||
op->len = target_block_size;
|
op->len = 0;
|
||||||
op->callback = [this](cluster_op_t *op)
|
op->callback = [this](cluster_op_t *op)
|
||||||
{
|
{
|
||||||
if (op->retval < 0)
|
if (op->retval < 0)
|
||||||
|
@@ -92,7 +92,6 @@ struct rm_inode_t
|
|||||||
|
|
||||||
void send_ops(rm_pg_t *cur_list)
|
void send_ops(rm_pg_t *cur_list)
|
||||||
{
|
{
|
||||||
parent->cli->init_msgr();
|
|
||||||
if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
|
if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
|
||||||
parent->cli->msgr.osd_peer_fds.end())
|
parent->cli->msgr.osd_peer_fds.end())
|
||||||
{
|
{
|
||||||
|
@@ -88,7 +88,7 @@ struct rm_osd_t
|
|||||||
for (auto & hist_item: pg_cfg.target_history)
|
for (auto & hist_item: pg_cfg.target_history)
|
||||||
{
|
{
|
||||||
int hist_size = 0, hist_rm = 0;
|
int hist_size = 0, hist_rm = 0;
|
||||||
for (auto & old_osd: hist_item.osd_set)
|
for (auto & old_osd: hist_item)
|
||||||
{
|
{
|
||||||
if (old_osd != 0)
|
if (old_osd != 0)
|
||||||
{
|
{
|
||||||
@@ -278,7 +278,7 @@ struct rm_osd_t
|
|||||||
if (rsp["response_delete_range"]["deleted"].uint64_value() > 0)
|
if (rsp["response_delete_range"]["deleted"].uint64_value() > 0)
|
||||||
{
|
{
|
||||||
// Wait for mon_change_timeout before updating PG history, or the monitor's change will likely interfere with ours
|
// Wait for mon_change_timeout before updating PG history, or the monitor's change will likely interfere with ours
|
||||||
retry_wait = parent->cli->config["mon_change_timeout"].uint64_value();
|
retry_wait = parent->cli->merged_config["mon_change_timeout"].uint64_value();
|
||||||
if (!retry_wait)
|
if (!retry_wait)
|
||||||
retry_wait = 1000;
|
retry_wait = 1000;
|
||||||
retry_wait += etcd_tx_retry_ms;
|
retry_wait += etcd_tx_retry_ms;
|
||||||
@@ -382,7 +382,7 @@ struct rm_osd_t
|
|||||||
for (int i = 0; i < pg_cfg.target_history.size(); i++)
|
for (int i = 0; i < pg_cfg.target_history.size(); i++)
|
||||||
{
|
{
|
||||||
int hist_size = 0, hist_rm = 0;
|
int hist_size = 0, hist_rm = 0;
|
||||||
for (auto & old_osd: pg_cfg.target_history[i].osd_set)
|
for (auto & old_osd: pg_cfg.target_history[i])
|
||||||
{
|
{
|
||||||
if (old_osd != 0)
|
if (old_osd != 0)
|
||||||
{
|
{
|
||||||
@@ -406,15 +406,6 @@ struct rm_osd_t
|
|||||||
}
|
}
|
||||||
if (update_pg_history)
|
if (update_pg_history)
|
||||||
{
|
{
|
||||||
json11::Json::array target_history;
|
|
||||||
for (auto & pgh: pg_cfg.target_history)
|
|
||||||
{
|
|
||||||
target_history.push_back(json11::Json::object {
|
|
||||||
{ "osd_set", pgh.osd_set },
|
|
||||||
{ "min_epoch", pgh.min_epoch },
|
|
||||||
{ "max_epoch", pgh.max_epoch },
|
|
||||||
});
|
|
||||||
}
|
|
||||||
std::string history_key = base64_encode(
|
std::string history_key = base64_encode(
|
||||||
parent->cli->st_cli.etcd_prefix+"/pg/history/"+
|
parent->cli->st_cli.etcd_prefix+"/pg/history/"+
|
||||||
std::to_string(pool_cfg.id)+"/"+std::to_string(pg_num)
|
std::to_string(pool_cfg.id)+"/"+std::to_string(pg_num)
|
||||||
@@ -425,7 +416,7 @@ struct rm_osd_t
|
|||||||
{ "value", base64_encode(json11::Json(json11::Json::object {
|
{ "value", base64_encode(json11::Json(json11::Json::object {
|
||||||
{ "epoch", pg_cfg.epoch },
|
{ "epoch", pg_cfg.epoch },
|
||||||
{ "all_peers", pg_cfg.all_peers },
|
{ "all_peers", pg_cfg.all_peers },
|
||||||
{ "osd_set_epochs", target_history },
|
{ "osd_sets", pg_cfg.target_history },
|
||||||
}).dump()) },
|
}).dump()) },
|
||||||
} },
|
} },
|
||||||
});
|
});
|
||||||
|
@@ -198,9 +198,9 @@ resume_2:
|
|||||||
}
|
}
|
||||||
pgs_by_state_str += std::to_string(kv.second)+" "+kv.first;
|
pgs_by_state_str += std::to_string(kv.second)+" "+kv.first;
|
||||||
}
|
}
|
||||||
bool readonly = json_is_true(parent->cli->config["readonly"]);
|
bool readonly = json_is_true(parent->cli->merged_config["readonly"]);
|
||||||
bool no_recovery = json_is_true(parent->cli->config["no_recovery"]);
|
bool no_recovery = json_is_true(parent->cli->merged_config["no_recovery"]);
|
||||||
bool no_rebalance = json_is_true(parent->cli->config["no_rebalance"]);
|
bool no_rebalance = json_is_true(parent->cli->merged_config["no_rebalance"]);
|
||||||
if (parent->json_output)
|
if (parent->json_output)
|
||||||
{
|
{
|
||||||
// JSON output
|
// JSON output
|
||||||
|
@@ -18,12 +18,11 @@
|
|||||||
|
|
||||||
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
|
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
|
||||||
{
|
{
|
||||||
cli_config = config.object_items();
|
config = osd_messenger_t::read_config(config);
|
||||||
file_config = osd_messenger_t::read_config(config);
|
|
||||||
config = osd_messenger_t::merge_configs(cli_config, file_config, etcd_global_config, {});
|
|
||||||
|
|
||||||
this->ringloop = ringloop;
|
this->ringloop = ringloop;
|
||||||
this->tfd = tfd;
|
this->tfd = tfd;
|
||||||
|
this->config = config;
|
||||||
|
|
||||||
msgr.osd_num = 0;
|
msgr.osd_num = 0;
|
||||||
msgr.tfd = tfd;
|
msgr.tfd = tfd;
|
||||||
@@ -59,7 +58,8 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
|||||||
msgr.stop_client(op->peer_fd);
|
msgr.stop_client(op->peer_fd);
|
||||||
delete op;
|
delete op;
|
||||||
};
|
};
|
||||||
msgr.parse_config(config);
|
msgr.parse_config(this->config);
|
||||||
|
msgr.init();
|
||||||
|
|
||||||
st_cli.tfd = tfd;
|
st_cli.tfd = tfd;
|
||||||
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
|
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
|
||||||
@@ -73,6 +73,17 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
|||||||
|
|
||||||
scrap_buffer_size = SCRAP_BUFFER_SIZE;
|
scrap_buffer_size = SCRAP_BUFFER_SIZE;
|
||||||
scrap_buffer = malloc_or_die(scrap_buffer_size);
|
scrap_buffer = malloc_or_die(scrap_buffer_size);
|
||||||
|
|
||||||
|
if (ringloop)
|
||||||
|
{
|
||||||
|
consumer.loop = [this]()
|
||||||
|
{
|
||||||
|
msgr.read_requests();
|
||||||
|
msgr.send_replies();
|
||||||
|
this->ringloop->submit();
|
||||||
|
};
|
||||||
|
ringloop->register_consumer(&consumer);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
cluster_client_t::~cluster_client_t()
|
cluster_client_t::~cluster_client_t()
|
||||||
@@ -104,24 +115,6 @@ cluster_op_t::~cluster_op_t()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void cluster_client_t::init_msgr()
|
|
||||||
{
|
|
||||||
if (msgr_initialized)
|
|
||||||
return;
|
|
||||||
msgr.init();
|
|
||||||
msgr_initialized = true;
|
|
||||||
if (ringloop)
|
|
||||||
{
|
|
||||||
consumer.loop = [this]()
|
|
||||||
{
|
|
||||||
msgr.read_requests();
|
|
||||||
msgr.send_replies();
|
|
||||||
this->ringloop->submit();
|
|
||||||
};
|
|
||||||
ringloop->register_consumer(&consumer);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void cluster_client_t::calc_wait(cluster_op_t *op)
|
void cluster_client_t::calc_wait(cluster_op_t *op)
|
||||||
{
|
{
|
||||||
op->prev_wait = 0;
|
op->prev_wait = 0;
|
||||||
@@ -150,7 +143,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
|
|||||||
if (!op->prev_wait)
|
if (!op->prev_wait)
|
||||||
continue_sync(op);
|
continue_sync(op);
|
||||||
}
|
}
|
||||||
else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) */
|
else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) */
|
||||||
{
|
{
|
||||||
for (auto prev = op_queue_head; prev && prev != op; prev = prev->next)
|
for (auto prev = op_queue_head; prev && prev != op; prev = prev->next)
|
||||||
{
|
{
|
||||||
@@ -158,8 +151,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
|
|||||||
{
|
{
|
||||||
op->prev_wait++;
|
op->prev_wait++;
|
||||||
}
|
}
|
||||||
else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ ||
|
else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ || prev->opcode == OSD_OP_READ_BITMAP)
|
||||||
prev->opcode == OSD_OP_READ_BITMAP || prev->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
|
||||||
{
|
{
|
||||||
// Flushes are always in the beginning (we're scanning from the beginning of the queue)
|
// Flushes are always in the beginning (we're scanning from the beginning of the queue)
|
||||||
break;
|
break;
|
||||||
@@ -179,8 +171,7 @@ void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *n
|
|||||||
auto n2 = next->next;
|
auto n2 = next->next;
|
||||||
if (next->opcode == OSD_OP_SYNC && !(flags & OP_IMMEDIATE_COMMIT) ||
|
if (next->opcode == OSD_OP_SYNC && !(flags & OP_IMMEDIATE_COMMIT) ||
|
||||||
next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER) ||
|
next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER) ||
|
||||||
(next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP ||
|
(next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP) && (flags & OP_FLUSH_BUFFER))
|
||||||
next->opcode == OSD_OP_READ_CHAIN_BITMAP) && (flags & OP_FLUSH_BUFFER))
|
|
||||||
{
|
{
|
||||||
next->prev_wait += inc;
|
next->prev_wait += inc;
|
||||||
assert(next->prev_wait >= 0);
|
assert(next->prev_wait >= 0);
|
||||||
@@ -230,14 +221,11 @@ void cluster_client_t::erase_op(cluster_op_t *op)
|
|||||||
if (op_queue_tail == op)
|
if (op_queue_tail == op)
|
||||||
op_queue_tail = op->prev;
|
op_queue_tail = op->prev;
|
||||||
op->next = op->prev = NULL;
|
op->next = op->prev = NULL;
|
||||||
if (flags & OP_FLUSH_BUFFER)
|
|
||||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
|
||||||
if (!(flags & OP_IMMEDIATE_COMMIT))
|
if (!(flags & OP_IMMEDIATE_COMMIT))
|
||||||
inc_wait(opcode, flags, next, -1);
|
inc_wait(opcode, flags, next, -1);
|
||||||
// Call callback at the end to avoid inconsistencies in prev_wait
|
// Call callback at the end to avoid inconsistencies in prev_wait
|
||||||
// if the callback adds more operations itself
|
// if the callback adds more operations itself
|
||||||
if (!(flags & OP_FLUSH_BUFFER))
|
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void cluster_client_t::continue_ops(bool up_retry)
|
void cluster_client_t::continue_ops(bool up_retry)
|
||||||
@@ -277,10 +265,13 @@ restart:
|
|||||||
continuing_ops = 0;
|
continuing_ops = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_config)
|
void cluster_client_t::on_load_config_hook(json11::Json::object & config)
|
||||||
{
|
{
|
||||||
this->etcd_global_config = etcd_global_config;
|
this->merged_config = config;
|
||||||
config = osd_messenger_t::merge_configs(cli_config, file_config, etcd_global_config, {});
|
for (auto & kv: this->config.object_items())
|
||||||
|
{
|
||||||
|
this->merged_config[kv.first] = kv.second;
|
||||||
|
}
|
||||||
if (config.find("client_max_dirty_bytes") != config.end())
|
if (config.find("client_max_dirty_bytes") != config.end())
|
||||||
{
|
{
|
||||||
client_max_dirty_bytes = config["client_max_dirty_bytes"].uint64_value();
|
client_max_dirty_bytes = config["client_max_dirty_bytes"].uint64_value();
|
||||||
@@ -290,13 +281,14 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co
|
|||||||
// Old name
|
// Old name
|
||||||
client_max_dirty_bytes = config["client_dirty_limit"].uint64_value();
|
client_max_dirty_bytes = config["client_dirty_limit"].uint64_value();
|
||||||
}
|
}
|
||||||
else
|
if (config.find("client_max_dirty_ops") != config.end())
|
||||||
client_max_dirty_bytes = 0;
|
{
|
||||||
|
client_max_dirty_ops = config["client_max_dirty_ops"].uint64_value();
|
||||||
|
}
|
||||||
if (!client_max_dirty_bytes)
|
if (!client_max_dirty_bytes)
|
||||||
{
|
{
|
||||||
client_max_dirty_bytes = DEFAULT_CLIENT_MAX_DIRTY_BYTES;
|
client_max_dirty_bytes = DEFAULT_CLIENT_MAX_DIRTY_BYTES;
|
||||||
}
|
}
|
||||||
client_max_dirty_ops = config["client_max_dirty_ops"].uint64_value();
|
|
||||||
if (!client_max_dirty_ops)
|
if (!client_max_dirty_ops)
|
||||||
{
|
{
|
||||||
client_max_dirty_ops = DEFAULT_CLIENT_MAX_DIRTY_OPS;
|
client_max_dirty_ops = DEFAULT_CLIENT_MAX_DIRTY_OPS;
|
||||||
@@ -311,7 +303,7 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co
|
|||||||
up_wait_retry_interval = 50;
|
up_wait_retry_interval = 50;
|
||||||
}
|
}
|
||||||
msgr.parse_config(config);
|
msgr.parse_config(config);
|
||||||
st_cli.parse_config(config);
|
msgr.parse_config(this->config);
|
||||||
st_cli.load_pgs();
|
st_cli.load_pgs();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -345,8 +337,7 @@ void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes
|
|||||||
// And now they have to be resliced!
|
// And now they have to be resliced!
|
||||||
for (auto op = op_queue_head; op; op = op->next)
|
for (auto op = op_queue_head; op; op = op->next)
|
||||||
{
|
{
|
||||||
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ ||
|
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) &&
|
||||||
op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) &&
|
|
||||||
INODE_POOL(op->cur_inode) == pool_item.first)
|
INODE_POOL(op->cur_inode) == pool_item.first)
|
||||||
{
|
{
|
||||||
op->needs_reslice = true;
|
op->needs_reslice = true;
|
||||||
@@ -418,7 +409,7 @@ void cluster_client_t::on_ready(std::function<void(void)> fn)
|
|||||||
void cluster_client_t::execute(cluster_op_t *op)
|
void cluster_client_t::execute(cluster_op_t *op)
|
||||||
{
|
{
|
||||||
if (op->opcode != OSD_OP_SYNC && op->opcode != OSD_OP_READ &&
|
if (op->opcode != OSD_OP_SYNC && op->opcode != OSD_OP_READ &&
|
||||||
op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_WRITE)
|
op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_WRITE)
|
||||||
{
|
{
|
||||||
op->retval = -EINVAL;
|
op->retval = -EINVAL;
|
||||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||||
@@ -450,7 +441,7 @@ void cluster_client_t::execute(cluster_op_t *op)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// Check alignment
|
// Check alignment
|
||||||
if (!op->len && (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP || op->opcode == OSD_OP_WRITE) ||
|
if ((op->opcode == OSD_OP_READ || op->opcode == OSD_OP_WRITE) && !op->len ||
|
||||||
op->offset % pool_it->second.bitmap_granularity || op->len % pool_it->second.bitmap_granularity)
|
op->offset % pool_it->second.bitmap_granularity || op->len % pool_it->second.bitmap_granularity)
|
||||||
{
|
{
|
||||||
op->retval = -EINVAL;
|
op->retval = -EINVAL;
|
||||||
@@ -711,7 +702,8 @@ resume_3:
|
|||||||
// Finished successfully
|
// Finished successfully
|
||||||
// Even if the PG count has changed in meanwhile we treat it as success
|
// Even if the PG count has changed in meanwhile we treat it as success
|
||||||
// because if some operations were invalid for the new PG count we'd get errors
|
// because if some operations were invalid for the new PG count we'd get errors
|
||||||
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
bool is_read = op->opcode == OSD_OP_READ;
|
||||||
|
if (is_read)
|
||||||
{
|
{
|
||||||
// Check parent inode
|
// Check parent inode
|
||||||
auto ino_it = st_cli.inode_config.find(op->cur_inode);
|
auto ino_it = st_cli.inode_config.find(op->cur_inode);
|
||||||
@@ -735,11 +727,6 @@ resume_3:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
op->retval = op->len;
|
op->retval = op->len;
|
||||||
if (op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
|
||||||
{
|
|
||||||
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->inode));
|
|
||||||
op->retval = op->len / pool_cfg.bitmap_granularity;
|
|
||||||
}
|
|
||||||
erase_op(op);
|
erase_op(op);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@@ -763,10 +750,7 @@ resume_3:
|
|||||||
{
|
{
|
||||||
for (int i = 0; i < op->parts.size(); i++)
|
for (int i = 0; i < op->parts.size(); i++)
|
||||||
{
|
{
|
||||||
if (!(op->parts[i].flags & PART_DONE))
|
op->parts[i].flags = PART_RETRY;
|
||||||
{
|
|
||||||
op->parts[i].flags = PART_RETRY;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
goto resume_2;
|
goto resume_2;
|
||||||
}
|
}
|
||||||
@@ -825,19 +809,23 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
|
|||||||
uint64_t last_stripe = op->len > 0 ? ((op->offset + op->len - 1) / pg_block_size) * pg_block_size : first_stripe;
|
uint64_t last_stripe = op->len > 0 ? ((op->offset + op->len - 1) / pg_block_size) * pg_block_size : first_stripe;
|
||||||
op->retval = 0;
|
op->retval = 0;
|
||||||
op->parts.resize((last_stripe - first_stripe) / pg_block_size + 1);
|
op->parts.resize((last_stripe - first_stripe) / pg_block_size + 1);
|
||||||
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP)
|
||||||
{
|
{
|
||||||
// Allocate memory for the bitmap
|
// Allocate memory for the bitmap
|
||||||
unsigned object_bitmap_size = ((op->len / pool_cfg.bitmap_granularity + 7) / 8);
|
unsigned object_bitmap_size = (((op->opcode == OSD_OP_READ_BITMAP ? pg_block_size : op->len) / pool_cfg.bitmap_granularity + 7) / 8);
|
||||||
object_bitmap_size = (object_bitmap_size < 8 ? 8 : object_bitmap_size);
|
object_bitmap_size = (object_bitmap_size < 8 ? 8 : object_bitmap_size);
|
||||||
unsigned bitmap_mem = object_bitmap_size + (pool_cfg.data_block_size / pool_cfg.bitmap_granularity / 8 * pg_data_size) * op->parts.size();
|
unsigned bitmap_mem = object_bitmap_size + (pool_cfg.data_block_size / pool_cfg.bitmap_granularity / 8 * pg_data_size) * op->parts.size();
|
||||||
if (!op->bitmap_buf || op->bitmap_buf_size < bitmap_mem)
|
if (op->bitmap_buf_size < bitmap_mem)
|
||||||
{
|
{
|
||||||
op->bitmap_buf = realloc_or_die(op->bitmap_buf, bitmap_mem);
|
op->bitmap_buf = realloc_or_die(op->bitmap_buf, bitmap_mem);
|
||||||
|
if (!op->bitmap_buf_size)
|
||||||
|
{
|
||||||
|
// First allocation
|
||||||
|
memset(op->bitmap_buf, 0, object_bitmap_size);
|
||||||
|
}
|
||||||
op->part_bitmaps = (uint8_t*)op->bitmap_buf + object_bitmap_size;
|
op->part_bitmaps = (uint8_t*)op->bitmap_buf + object_bitmap_size;
|
||||||
op->bitmap_buf_size = bitmap_mem;
|
op->bitmap_buf_size = bitmap_mem;
|
||||||
}
|
}
|
||||||
memset(op->bitmap_buf, 0, bitmap_mem);
|
|
||||||
}
|
}
|
||||||
int iov_idx = 0;
|
int iov_idx = 0;
|
||||||
size_t iov_pos = 0;
|
size_t iov_pos = 0;
|
||||||
@@ -888,14 +876,13 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
|
|||||||
if (end == begin)
|
if (end == begin)
|
||||||
op->done_count++;
|
op->done_count++;
|
||||||
}
|
}
|
||||||
else if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_DELETE)
|
else if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_DELETE)
|
||||||
{
|
{
|
||||||
add_iov(end-begin, false, op, iov_idx, iov_pos, op->parts[i].iov, NULL, 0);
|
add_iov(end-begin, false, op, iov_idx, iov_pos, op->parts[i].iov, NULL, 0);
|
||||||
}
|
}
|
||||||
op->parts[i].parent = op;
|
op->parts[i].parent = op;
|
||||||
op->parts[i].offset = begin;
|
op->parts[i].offset = begin;
|
||||||
op->parts[i].len = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP ||
|
op->parts[i].len = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_DELETE ? 0 : (uint32_t)(end - begin);
|
||||||
op->opcode == OSD_OP_DELETE ? 0 : (uint32_t)(end - begin);
|
|
||||||
op->parts[i].pg_num = pg_num;
|
op->parts[i].pg_num = pg_num;
|
||||||
op->parts[i].osd_num = 0;
|
op->parts[i].osd_num = 0;
|
||||||
op->parts[i].flags = 0;
|
op->parts[i].flags = 0;
|
||||||
@@ -924,10 +911,6 @@ bool cluster_client_t::affects_osd(uint64_t inode, uint64_t offset, uint64_t len
|
|||||||
|
|
||||||
bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||||
{
|
{
|
||||||
if (!msgr_initialized)
|
|
||||||
{
|
|
||||||
init_msgr();
|
|
||||||
}
|
|
||||||
auto part = &op->parts[i];
|
auto part = &op->parts[i];
|
||||||
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->cur_inode));
|
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->cur_inode));
|
||||||
auto pg_it = pool_cfg.pg_config.find(part->pg_num);
|
auto pg_it = pool_cfg.pg_config.find(part->pg_num);
|
||||||
@@ -946,7 +929,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
|||||||
pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
|
pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
|
||||||
);
|
);
|
||||||
uint64_t meta_rev = 0;
|
uint64_t meta_rev = 0;
|
||||||
if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_DELETE)
|
if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_DELETE)
|
||||||
{
|
{
|
||||||
auto ino_it = st_cli.inode_config.find(op->inode);
|
auto ino_it = st_cli.inode_config.find(op->inode);
|
||||||
if (ino_it != st_cli.inode_config.end())
|
if (ino_it != st_cli.inode_config.end())
|
||||||
@@ -959,7 +942,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
|||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = next_op_id(),
|
.id = next_op_id(),
|
||||||
.opcode = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP ? OSD_OP_READ : op->opcode,
|
.opcode = op->opcode == OSD_OP_READ_BITMAP ? OSD_OP_READ : op->opcode,
|
||||||
},
|
},
|
||||||
.inode = op->cur_inode,
|
.inode = op->cur_inode,
|
||||||
.offset = part->offset,
|
.offset = part->offset,
|
||||||
@@ -967,10 +950,8 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
|||||||
.meta_revision = meta_rev,
|
.meta_revision = meta_rev,
|
||||||
.version = op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE ? op->version : 0,
|
.version = op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE ? op->version : 0,
|
||||||
} },
|
} },
|
||||||
.bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP
|
.bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? (uint8_t*)op->part_bitmaps + pg_bitmap_size*i : NULL),
|
||||||
? (uint8_t*)op->part_bitmaps + pg_bitmap_size*i : NULL),
|
.bitmap_len = (unsigned)(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? pg_bitmap_size : 0),
|
||||||
.bitmap_len = (unsigned)(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP
|
|
||||||
? pg_bitmap_size : 0),
|
|
||||||
.callback = [this, part](osd_op_t *op_part)
|
.callback = [this, part](osd_op_t *op_part)
|
||||||
{
|
{
|
||||||
handle_op_part(part);
|
handle_op_part(part);
|
||||||
@@ -1118,24 +1099,6 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
|||||||
if (part->op.reply.hdr.retval != expected)
|
if (part->op.reply.hdr.retval != expected)
|
||||||
{
|
{
|
||||||
// Operation failed, retry
|
// Operation failed, retry
|
||||||
part->flags |= PART_ERROR;
|
|
||||||
if (!op->retval || op->retval == -EPIPE)
|
|
||||||
{
|
|
||||||
// Don't overwrite other errors with -EPIPE
|
|
||||||
op->retval = part->op.reply.hdr.retval;
|
|
||||||
}
|
|
||||||
int stop_fd = -1;
|
|
||||||
if (op->retval != -EINTR && op->retval != -EIO)
|
|
||||||
{
|
|
||||||
stop_fd = part->op.peer_fd;
|
|
||||||
fprintf(
|
|
||||||
stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
|
|
||||||
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
|
|
||||||
);
|
|
||||||
}
|
|
||||||
// All next things like timer, continue_sync/rw and stop_client may affect the operation again
|
|
||||||
// So do all these things after modifying operation state, otherwise we may hit reenterability bugs
|
|
||||||
// FIXME postpone such things to set_immediate here to avoid bugs
|
|
||||||
if (part->op.reply.hdr.retval == -EPIPE)
|
if (part->op.reply.hdr.retval == -EPIPE)
|
||||||
{
|
{
|
||||||
// Mark op->up_wait = true before stopping the client
|
// Mark op->up_wait = true before stopping the client
|
||||||
@@ -1149,37 +1112,40 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (op->inflight_count == 0)
|
if (!op->retval || op->retval == -EPIPE)
|
||||||
{
|
{
|
||||||
if (op->opcode == OSD_OP_SYNC)
|
// Don't overwrite other errors with -EPIPE
|
||||||
continue_sync(op);
|
op->retval = part->op.reply.hdr.retval;
|
||||||
else
|
|
||||||
continue_rw(op);
|
|
||||||
}
|
}
|
||||||
if (stop_fd >= 0)
|
if (op->retval != -EINTR && op->retval != -EIO)
|
||||||
{
|
{
|
||||||
msgr.stop_client(stop_fd);
|
fprintf(
|
||||||
|
stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
|
||||||
|
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
|
||||||
|
);
|
||||||
|
msgr.stop_client(part->op.peer_fd);
|
||||||
}
|
}
|
||||||
|
part->flags |= PART_ERROR;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// OK
|
// OK
|
||||||
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE) && !(op->flags & OP_IMMEDIATE_COMMIT))
|
if (!(op->flags & OP_IMMEDIATE_COMMIT))
|
||||||
dirty_osds.insert(part->osd_num);
|
dirty_osds.insert(part->osd_num);
|
||||||
part->flags |= PART_DONE;
|
part->flags |= PART_DONE;
|
||||||
op->done_count++;
|
op->done_count++;
|
||||||
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP)
|
||||||
{
|
{
|
||||||
copy_part_bitmap(op, part);
|
copy_part_bitmap(op, part);
|
||||||
op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
|
op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
|
||||||
}
|
}
|
||||||
if (op->inflight_count == 0)
|
}
|
||||||
{
|
if (op->inflight_count == 0)
|
||||||
if (op->opcode == OSD_OP_SYNC)
|
{
|
||||||
continue_sync(op);
|
if (op->opcode == OSD_OP_SYNC)
|
||||||
else
|
continue_sync(op);
|
||||||
continue_rw(op);
|
else
|
||||||
}
|
continue_rw(op);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1192,12 +1158,7 @@ void cluster_client_t::copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *par
|
|||||||
);
|
);
|
||||||
uint32_t object_offset = (part->op.req.rw.offset - op->offset) / pool_cfg.bitmap_granularity;
|
uint32_t object_offset = (part->op.req.rw.offset - op->offset) / pool_cfg.bitmap_granularity;
|
||||||
uint32_t part_offset = (part->op.req.rw.offset % pg_block_size) / pool_cfg.bitmap_granularity;
|
uint32_t part_offset = (part->op.req.rw.offset % pg_block_size) / pool_cfg.bitmap_granularity;
|
||||||
uint32_t op_len = op->len / pool_cfg.bitmap_granularity;
|
uint32_t part_len = (op->opcode == OSD_OP_READ_BITMAP ? pg_block_size : part->op.req.rw.len) / pool_cfg.bitmap_granularity;
|
||||||
uint32_t part_len = pg_block_size/pool_cfg.bitmap_granularity - part_offset;
|
|
||||||
if (part_len > op_len-object_offset)
|
|
||||||
{
|
|
||||||
part_len = op_len-object_offset;
|
|
||||||
}
|
|
||||||
if (!(object_offset & 0x7) && !(part_offset & 0x7) && (part_len >= 8))
|
if (!(object_offset & 0x7) && !(part_offset & 0x7) && (part_len >= 8))
|
||||||
{
|
{
|
||||||
// Copy bytes
|
// Copy bytes
|
||||||
|
@@ -11,7 +11,6 @@
|
|||||||
#define INODE_LIST_DONE 1
|
#define INODE_LIST_DONE 1
|
||||||
#define INODE_LIST_HAS_UNSTABLE 2
|
#define INODE_LIST_HAS_UNSTABLE 2
|
||||||
#define OSD_OP_READ_BITMAP OSD_OP_SEC_READ_BMP
|
#define OSD_OP_READ_BITMAP OSD_OP_SEC_READ_BMP
|
||||||
#define OSD_OP_READ_CHAIN_BITMAP 0x102
|
|
||||||
|
|
||||||
#define OSD_OP_IGNORE_READONLY 0x08
|
#define OSD_OP_IGNORE_READONLY 0x08
|
||||||
|
|
||||||
@@ -31,7 +30,7 @@ struct cluster_op_part_t
|
|||||||
|
|
||||||
struct cluster_op_t
|
struct cluster_op_t
|
||||||
{
|
{
|
||||||
uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE, OSD_OP_READ_BITMAP, OSD_OP_READ_CHAIN_BITMAP
|
uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE, OSD_OP_READ_BITMAP
|
||||||
uint64_t inode;
|
uint64_t inode;
|
||||||
uint64_t offset;
|
uint64_t offset;
|
||||||
uint64_t len;
|
uint64_t len;
|
||||||
@@ -40,13 +39,9 @@ struct cluster_op_t
|
|||||||
uint64_t version = 0;
|
uint64_t version = 0;
|
||||||
// now only OSD_OP_IGNORE_READONLY is supported
|
// now only OSD_OP_IGNORE_READONLY is supported
|
||||||
uint64_t flags = 0;
|
uint64_t flags = 0;
|
||||||
// negative retval is an error number
|
|
||||||
// write and read return len on success
|
|
||||||
// sync and delete return 0 on success
|
|
||||||
// read_bitmap and read_chain_bitmap return the length of bitmap in bits(!)
|
|
||||||
int retval;
|
int retval;
|
||||||
osd_op_buf_list_t iov;
|
osd_op_buf_list_t iov;
|
||||||
// READ, READ_BITMAP, READ_CHAIN_BITMAP return the bitmap here
|
// READ and READ_BITMAP return the bitmap here
|
||||||
void *bitmap_buf = NULL;
|
void *bitmap_buf = NULL;
|
||||||
std::function<void(cluster_op_t*)> callback;
|
std::function<void(cluster_op_t*)> callback;
|
||||||
~cluster_op_t();
|
~cluster_op_t();
|
||||||
@@ -104,16 +99,12 @@ class cluster_client_t
|
|||||||
std::vector<std::function<void(void)>> on_ready_hooks;
|
std::vector<std::function<void(void)>> on_ready_hooks;
|
||||||
std::vector<inode_list_t*> lists;
|
std::vector<inode_list_t*> lists;
|
||||||
int continuing_ops = 0;
|
int continuing_ops = 0;
|
||||||
bool msgr_initialized = false;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
etcd_state_client_t st_cli;
|
etcd_state_client_t st_cli;
|
||||||
|
|
||||||
osd_messenger_t msgr;
|
osd_messenger_t msgr;
|
||||||
void init_msgr();
|
json11::Json config;
|
||||||
|
json11::Json::object merged_config;
|
||||||
json11::Json::object cli_config, file_config, etcd_global_config;
|
|
||||||
json11::Json::object config;
|
|
||||||
|
|
||||||
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
|
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
|
||||||
~cluster_client_t();
|
~cluster_client_t();
|
||||||
|
@@ -43,7 +43,6 @@ struct inode_list_t
|
|||||||
inode_list_t* cluster_client_t::list_inode_start(inode_t inode,
|
inode_list_t* cluster_client_t::list_inode_start(inode_t inode,
|
||||||
std::function<void(inode_list_t* lst, std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)> callback)
|
std::function<void(inode_list_t* lst, std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)> callback)
|
||||||
{
|
{
|
||||||
init_msgr();
|
|
||||||
int skipped_pgs = 0;
|
int skipped_pgs = 0;
|
||||||
pool_id_t pool_id = INODE_POOL(inode);
|
pool_id_t pool_id = INODE_POOL(inode);
|
||||||
if (!pool_id || st_cli.pool_config.find(pool_id) == st_cli.pool_config.end())
|
if (!pool_id || st_cli.pool_config.find(pool_id) == st_cli.pool_config.end())
|
||||||
@@ -96,7 +95,7 @@ inode_list_t* cluster_client_t::list_inode_start(inode_t inode,
|
|||||||
}
|
}
|
||||||
for (auto & hist_item: pg.target_history)
|
for (auto & hist_item: pg.target_history)
|
||||||
{
|
{
|
||||||
for (auto pg_osd: hist_item.osd_set)
|
for (auto pg_osd: hist_item)
|
||||||
{
|
{
|
||||||
if (pg_osd != 0)
|
if (pg_osd != 0)
|
||||||
{
|
{
|
||||||
@@ -106,14 +105,11 @@ inode_list_t* cluster_client_t::list_inode_start(inode_t inode,
|
|||||||
}
|
}
|
||||||
for (osd_num_t peer_osd: all_peers)
|
for (osd_num_t peer_osd: all_peers)
|
||||||
{
|
{
|
||||||
if (st_cli.peer_states.find(peer_osd) != st_cli.peer_states.end())
|
r->list_osds.push_back((inode_list_osd_t){
|
||||||
{
|
.pg = r,
|
||||||
r->list_osds.push_back((inode_list_osd_t){
|
.osd_num = peer_osd,
|
||||||
.pg = r,
|
.sent = false,
|
||||||
.osd_num = peer_osd,
|
});
|
||||||
.sent = false,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@@ -281,7 +281,7 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
|
|||||||
if (je->big_write.size > sizeof(journal_entry_big_write))
|
if (je->big_write.size > sizeof(journal_entry_big_write))
|
||||||
{
|
{
|
||||||
printf(json ? ",\"bitmap\":\"" : " (bitmap: ");
|
printf(json ? ",\"bitmap\":\"" : " (bitmap: ");
|
||||||
for (int i = sizeof(journal_entry_big_write); i < je->big_write.size; i++)
|
for (int i = sizeof(journal_entry_big_write); i < je->small_write.size; i++)
|
||||||
{
|
{
|
||||||
printf("%02x", ((uint8_t*)je)[i]);
|
printf("%02x", ((uint8_t*)je)[i]);
|
||||||
}
|
}
|
||||||
|
@@ -26,7 +26,7 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v1_t *)>
|
|||||||
buf_size = dsk.meta_len;
|
buf_size = dsk.meta_len;
|
||||||
void *data = memalign_or_die(MEM_ALIGNMENT, buf_size);
|
void *data = memalign_or_die(MEM_ALIGNMENT, buf_size);
|
||||||
lseek64(dsk.meta_fd, dsk.meta_offset, 0);
|
lseek64(dsk.meta_fd, dsk.meta_offset, 0);
|
||||||
read_blocking(dsk.meta_fd, data, dsk.meta_block_size);
|
read_blocking(dsk.meta_fd, data, buf_size);
|
||||||
// Check superblock
|
// Check superblock
|
||||||
blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)data;
|
blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)data;
|
||||||
if (hdr->zero == 0 &&
|
if (hdr->zero == 0 &&
|
||||||
@@ -41,11 +41,8 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v1_t *)>
|
|||||||
if (buf_size % dsk.meta_block_size)
|
if (buf_size % dsk.meta_block_size)
|
||||||
{
|
{
|
||||||
buf_size = 8*dsk.meta_block_size;
|
buf_size = 8*dsk.meta_block_size;
|
||||||
void *new_data = memalign_or_die(MEM_ALIGNMENT, buf_size);
|
|
||||||
memcpy(new_data, data, dsk.meta_block_size);
|
|
||||||
free(data);
|
free(data);
|
||||||
data = new_data;
|
data = memalign_or_die(MEM_ALIGNMENT, buf_size);
|
||||||
hdr = (blockstore_meta_header_v1_t *)data;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
dsk.bitmap_granularity = hdr->bitmap_granularity;
|
dsk.bitmap_granularity = hdr->bitmap_granularity;
|
||||||
|
@@ -305,10 +305,10 @@ int write_zero(int fd, uint64_t offset, uint64_t size)
|
|||||||
json11::Json read_parttable(std::string dev)
|
json11::Json read_parttable(std::string dev)
|
||||||
{
|
{
|
||||||
std::string part_dump;
|
std::string part_dump;
|
||||||
int r = shell_exec({ "sfdisk", "--json", dev }, "", &part_dump, NULL);
|
int r = shell_exec({ "sfdisk", "--dump", dev, "--json" }, "", &part_dump, NULL);
|
||||||
if (r == 255)
|
if (r == 255)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Error running sfdisk --json %s\n", dev.c_str());
|
fprintf(stderr, "Error running sfdisk --dump %s --json\n", dev.c_str());
|
||||||
return json11::Json(false);
|
return json11::Json(false);
|
||||||
}
|
}
|
||||||
// Decode partition table
|
// Decode partition table
|
||||||
@@ -319,7 +319,7 @@ json11::Json read_parttable(std::string dev)
|
|||||||
pt = json11::Json::parse(part_dump, err);
|
pt = json11::Json::parse(part_dump, err);
|
||||||
if (err != "")
|
if (err != "")
|
||||||
{
|
{
|
||||||
fprintf(stderr, "sfdisk --json %s returned bad JSON: %s\n", dev.c_str(), part_dump.c_str());
|
fprintf(stderr, "sfdisk --dump %s --json returned bad JSON: %s\n", dev.c_str(), part_dump.c_str());
|
||||||
return json11::Json(false);
|
return json11::Json(false);
|
||||||
}
|
}
|
||||||
pt = pt["partitiontable"];
|
pt = pt["partitiontable"];
|
||||||
|
@@ -18,8 +18,12 @@ etcd_state_client_t::~etcd_state_client_t()
|
|||||||
}
|
}
|
||||||
watches.clear();
|
watches.clear();
|
||||||
etcd_watches_initialised = -1;
|
etcd_watches_initialised = -1;
|
||||||
|
if (ws_keepalive_timer >= 0)
|
||||||
|
{
|
||||||
|
tfd->clear_timer(ws_keepalive_timer);
|
||||||
|
ws_keepalive_timer = -1;
|
||||||
|
}
|
||||||
#ifndef __MOCK__
|
#ifndef __MOCK__
|
||||||
stop_ws_keepalive();
|
|
||||||
if (etcd_watch_ws)
|
if (etcd_watch_ws)
|
||||||
{
|
{
|
||||||
http_close(etcd_watch_ws);
|
http_close(etcd_watch_ws);
|
||||||
@@ -241,7 +245,6 @@ void etcd_state_client_t::parse_config(const json11::Json & config)
|
|||||||
if (this->etcd_keepalive_timeout < 30)
|
if (this->etcd_keepalive_timeout < 30)
|
||||||
this->etcd_keepalive_timeout = 30;
|
this->etcd_keepalive_timeout = 30;
|
||||||
}
|
}
|
||||||
auto old_etcd_ws_keepalive_interval = this->etcd_ws_keepalive_interval;
|
|
||||||
this->etcd_ws_keepalive_interval = config["etcd_ws_keepalive_interval"].uint64_value();
|
this->etcd_ws_keepalive_interval = config["etcd_ws_keepalive_interval"].uint64_value();
|
||||||
if (this->etcd_ws_keepalive_interval <= 0)
|
if (this->etcd_ws_keepalive_interval <= 0)
|
||||||
{
|
{
|
||||||
@@ -262,13 +265,6 @@ void etcd_state_client_t::parse_config(const json11::Json & config)
|
|||||||
{
|
{
|
||||||
this->etcd_quick_timeout = 1000;
|
this->etcd_quick_timeout = 1000;
|
||||||
}
|
}
|
||||||
if (this->etcd_ws_keepalive_interval != old_etcd_ws_keepalive_interval && ws_keepalive_timer >= 0)
|
|
||||||
{
|
|
||||||
#ifndef __MOCK__
|
|
||||||
stop_ws_keepalive();
|
|
||||||
start_ws_keepalive();
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void etcd_state_client_t::pick_next_etcd()
|
void etcd_state_client_t::pick_next_etcd()
|
||||||
@@ -482,20 +478,6 @@ void etcd_state_client_t::start_etcd_watcher()
|
|||||||
{
|
{
|
||||||
on_start_watcher_hook(etcd_watch_ws);
|
on_start_watcher_hook(etcd_watch_ws);
|
||||||
}
|
}
|
||||||
start_ws_keepalive();
|
|
||||||
}
|
|
||||||
|
|
||||||
void etcd_state_client_t::stop_ws_keepalive()
|
|
||||||
{
|
|
||||||
if (ws_keepalive_timer >= 0)
|
|
||||||
{
|
|
||||||
tfd->clear_timer(ws_keepalive_timer);
|
|
||||||
ws_keepalive_timer = -1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void etcd_state_client_t::start_ws_keepalive()
|
|
||||||
{
|
|
||||||
if (ws_keepalive_timer < 0)
|
if (ws_keepalive_timer < 0)
|
||||||
{
|
{
|
||||||
ws_keepalive_timer = tfd->set_timer(etcd_ws_keepalive_interval*1000, true, [this](int)
|
ws_keepalive_timer = tfd->set_timer(etcd_ws_keepalive_interval*1000, true, [this](int)
|
||||||
@@ -885,6 +867,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
fprintf(stderr, "RECEIVED PG %u/%u HISTORY: %s\n", pool_id, pg_num, value.dump().c_str());
|
||||||
auto & pg_cfg = this->pool_config[pool_id].pg_config[pg_num];
|
auto & pg_cfg = this->pool_config[pool_id].pg_config[pg_num];
|
||||||
pg_cfg.target_history.clear();
|
pg_cfg.target_history.clear();
|
||||||
pg_cfg.all_peers.clear();
|
pg_cfg.all_peers.clear();
|
||||||
@@ -902,32 +885,9 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
history_set.insert(it, pg_osd_num);
|
history_set.insert(it, pg_osd_num);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pg_history_set_t epoch_set = { .osd_set = history_set };
|
auto it = std::lower_bound(pg_cfg.target_history.begin(), pg_cfg.target_history.end(), history_set);
|
||||||
auto it = std::lower_bound(pg_cfg.target_history.begin(), pg_cfg.target_history.end(), epoch_set);
|
if (it == pg_cfg.target_history.end() || *it != history_set)
|
||||||
if (it == pg_cfg.target_history.end() || *it != epoch_set)
|
pg_cfg.target_history.insert(it, history_set);
|
||||||
pg_cfg.target_history.insert(it, epoch_set);
|
|
||||||
}
|
|
||||||
// Newer format with epochs
|
|
||||||
for (auto hist_item: value["osd_set_epochs"].array_items())
|
|
||||||
{
|
|
||||||
pg_history_set_t history_set;
|
|
||||||
history_set.min_epoch = hist_item["min_epoch"].uint64_value();
|
|
||||||
history_set.max_epoch = hist_item["max_epoch"].uint64_value();
|
|
||||||
if (history_set.max_epoch < history_set.min_epoch)
|
|
||||||
{
|
|
||||||
history_set.max_epoch = 0;
|
|
||||||
history_set.min_epoch = 0;
|
|
||||||
}
|
|
||||||
for (auto pg_osd: hist_item["osd_set"].array_items())
|
|
||||||
{
|
|
||||||
history_set.osd_set.push_back(pg_osd.uint64_value());
|
|
||||||
}
|
|
||||||
if (history_set.max_epoch || history_set.osd_set.size())
|
|
||||||
{
|
|
||||||
auto it = std::lower_bound(pg_cfg.target_history.begin(), pg_cfg.target_history.end(), history_set);
|
|
||||||
if (it == pg_cfg.target_history.end() || *it != history_set)
|
|
||||||
pg_cfg.target_history.insert(it, history_set);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
// Include these additional OSDs when peering the PG
|
// Include these additional OSDs when peering the PG
|
||||||
for (auto pg_osd: value["all_peers"].array_items())
|
for (auto pg_osd: value["all_peers"].array_items())
|
||||||
|
@@ -33,7 +33,7 @@ struct pg_config_t
|
|||||||
bool exists;
|
bool exists;
|
||||||
osd_num_t primary;
|
osd_num_t primary;
|
||||||
std::vector<osd_num_t> target_set;
|
std::vector<osd_num_t> target_set;
|
||||||
std::vector<pg_history_set_t> target_history;
|
std::vector<std::vector<osd_num_t>> target_history;
|
||||||
std::vector<osd_num_t> all_peers;
|
std::vector<osd_num_t> all_peers;
|
||||||
bool pause;
|
bool pause;
|
||||||
osd_num_t cur_primary;
|
osd_num_t cur_primary;
|
||||||
@@ -132,8 +132,6 @@ public:
|
|||||||
void etcd_txn(json11::Json txn, int timeout, int retries, int interval, std::function<void(std::string, json11::Json)> callback);
|
void etcd_txn(json11::Json txn, int timeout, int retries, int interval, std::function<void(std::string, json11::Json)> callback);
|
||||||
void etcd_txn_slow(json11::Json txn, std::function<void(std::string, json11::Json)> callback);
|
void etcd_txn_slow(json11::Json txn, std::function<void(std::string, json11::Json)> callback);
|
||||||
void start_etcd_watcher();
|
void start_etcd_watcher();
|
||||||
void stop_ws_keepalive();
|
|
||||||
void start_ws_keepalive();
|
|
||||||
void load_global_config();
|
void load_global_config();
|
||||||
void load_pgs();
|
void load_pgs();
|
||||||
void parse_state(const etcd_kv_t & kv);
|
void parse_state(const etcd_kv_t & kv);
|
||||||
|
@@ -157,10 +157,10 @@ void osd_messenger_t::parse_config(const json11::Json & config)
|
|||||||
this->rdma_max_sge = 128;
|
this->rdma_max_sge = 128;
|
||||||
this->rdma_max_send = config["rdma_max_send"].uint64_value();
|
this->rdma_max_send = config["rdma_max_send"].uint64_value();
|
||||||
if (!this->rdma_max_send)
|
if (!this->rdma_max_send)
|
||||||
this->rdma_max_send = 8;
|
this->rdma_max_send = 1;
|
||||||
this->rdma_max_recv = config["rdma_max_recv"].uint64_value();
|
this->rdma_max_recv = config["rdma_max_recv"].uint64_value();
|
||||||
if (!this->rdma_max_recv)
|
if (!this->rdma_max_recv)
|
||||||
this->rdma_max_recv = 16;
|
this->rdma_max_recv = 128;
|
||||||
this->rdma_max_msg = config["rdma_max_msg"].uint64_value();
|
this->rdma_max_msg = config["rdma_max_msg"].uint64_value();
|
||||||
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
|
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
|
||||||
this->rdma_max_msg = 129*1024;
|
this->rdma_max_msg = 129*1024;
|
||||||
@@ -534,9 +534,8 @@ bool osd_messenger_t::is_rdma_enabled()
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
json11::Json::object osd_messenger_t::read_config(const json11::Json & config)
|
json11::Json osd_messenger_t::read_config(const json11::Json & config)
|
||||||
{
|
{
|
||||||
json11::Json::object file_config;
|
|
||||||
const char *config_path = config["config_path"].string_value() != ""
|
const char *config_path = config["config_path"].string_value() != ""
|
||||||
? config["config_path"].string_value().c_str() : VITASTOR_CONFIG_PATH;
|
? config["config_path"].string_value().c_str() : VITASTOR_CONFIG_PATH;
|
||||||
int fd = open(config_path, O_RDONLY);
|
int fd = open(config_path, O_RDONLY);
|
||||||
@@ -544,14 +543,14 @@ json11::Json::object osd_messenger_t::read_config(const json11::Json & config)
|
|||||||
{
|
{
|
||||||
if (errno != ENOENT)
|
if (errno != ENOENT)
|
||||||
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
|
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
|
||||||
return file_config;
|
return config;
|
||||||
}
|
}
|
||||||
struct stat st;
|
struct stat st;
|
||||||
if (fstat(fd, &st) != 0)
|
if (fstat(fd, &st) != 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
|
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
|
||||||
close(fd);
|
close(fd);
|
||||||
return file_config;
|
return config;
|
||||||
}
|
}
|
||||||
std::string buf;
|
std::string buf;
|
||||||
buf.resize(st.st_size);
|
buf.resize(st.st_size);
|
||||||
@@ -563,125 +562,23 @@ json11::Json::object osd_messenger_t::read_config(const json11::Json & config)
|
|||||||
{
|
{
|
||||||
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
|
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
|
||||||
close(fd);
|
close(fd);
|
||||||
return file_config;
|
return config;
|
||||||
}
|
}
|
||||||
done += r;
|
done += r;
|
||||||
}
|
}
|
||||||
close(fd);
|
close(fd);
|
||||||
std::string json_err;
|
std::string json_err;
|
||||||
file_config = json11::Json::parse(buf, json_err).object_items();
|
json11::Json::object file_config = json11::Json::parse(buf, json_err).object_items();
|
||||||
if (json_err != "")
|
if (json_err != "")
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Invalid JSON in %s: %s\n", config_path, json_err.c_str());
|
fprintf(stderr, "Invalid JSON in %s: %s\n", config_path, json_err.c_str());
|
||||||
|
return config;
|
||||||
|
}
|
||||||
|
file_config.erase("config_path");
|
||||||
|
file_config.erase("osd_num");
|
||||||
|
for (auto kv: config.object_items())
|
||||||
|
{
|
||||||
|
file_config[kv.first] = kv.second;
|
||||||
}
|
}
|
||||||
return file_config;
|
return file_config;
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char* cli_only_params[] = {
|
|
||||||
// The list has to be sorted
|
|
||||||
"bitmap_granularity",
|
|
||||||
"block_size",
|
|
||||||
"data_device",
|
|
||||||
"data_offset",
|
|
||||||
"data_size",
|
|
||||||
"disable_data_fsync",
|
|
||||||
"disable_device_lock",
|
|
||||||
"disable_journal_fsync",
|
|
||||||
"disable_meta_fsync",
|
|
||||||
"disk_alignment",
|
|
||||||
"flush_journal",
|
|
||||||
"immediate_commit",
|
|
||||||
"inmemory_journal",
|
|
||||||
"inmemory_metadata",
|
|
||||||
"journal_block_size",
|
|
||||||
"journal_device",
|
|
||||||
"journal_no_same_sector_overwrites",
|
|
||||||
"journal_offset",
|
|
||||||
"journal_sector_buffer_count",
|
|
||||||
"journal_size",
|
|
||||||
"meta_block_size",
|
|
||||||
"meta_buf_size",
|
|
||||||
"meta_device",
|
|
||||||
"meta_offset",
|
|
||||||
"osd_num",
|
|
||||||
"readonly",
|
|
||||||
};
|
|
||||||
|
|
||||||
static const char **cli_only_end = cli_only_params + (sizeof(cli_only_params)/sizeof(cli_only_params[0]));
|
|
||||||
|
|
||||||
static const char* local_only_params[] = {
|
|
||||||
// The list has to be sorted
|
|
||||||
"config_path",
|
|
||||||
"rdma_device",
|
|
||||||
"rdma_gid_index",
|
|
||||||
"rdma_max_msg",
|
|
||||||
"rdma_max_recv",
|
|
||||||
"rdma_max_send",
|
|
||||||
"rdma_max_sge",
|
|
||||||
"rdma_mtu",
|
|
||||||
"rdma_port_num",
|
|
||||||
"tcp_header_buffer_size",
|
|
||||||
"use_rdma",
|
|
||||||
"use_sync_send_recv",
|
|
||||||
};
|
|
||||||
|
|
||||||
static const char **local_only_end = local_only_params + (sizeof(local_only_params)/sizeof(local_only_params[0]));
|
|
||||||
|
|
||||||
// Basically could be replaced by std::lower_bound()...
|
|
||||||
static int find_str_array(const char **start, const char **end, const std::string & s)
|
|
||||||
{
|
|
||||||
int min = 0, max = end-start;
|
|
||||||
while (max-min >= 2)
|
|
||||||
{
|
|
||||||
int mid = (min+max)/2;
|
|
||||||
int r = strcmp(s.c_str(), start[mid]);
|
|
||||||
if (r < 0)
|
|
||||||
max = mid;
|
|
||||||
else if (r > 0)
|
|
||||||
min = mid;
|
|
||||||
else
|
|
||||||
return mid;
|
|
||||||
}
|
|
||||||
if (min < end-start && !strcmp(s.c_str(), start[min]))
|
|
||||||
return min;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
json11::Json::object osd_messenger_t::merge_configs(const json11::Json::object & cli_config,
|
|
||||||
const json11::Json::object & file_config,
|
|
||||||
const json11::Json::object & etcd_global_config,
|
|
||||||
const json11::Json::object & etcd_osd_config)
|
|
||||||
{
|
|
||||||
// Priority: most important -> less important:
|
|
||||||
// etcd_osd_config -> cli_config -> etcd_global_config -> file_config
|
|
||||||
json11::Json::object res = file_config;
|
|
||||||
for (auto & kv: file_config)
|
|
||||||
{
|
|
||||||
int cli_only = find_str_array(cli_only_params, cli_only_end, kv.first);
|
|
||||||
if (cli_only < 0)
|
|
||||||
{
|
|
||||||
res[kv.first] = kv.second;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (auto & kv: etcd_global_config)
|
|
||||||
{
|
|
||||||
int local_only = find_str_array(local_only_params, local_only_end, kv.first);
|
|
||||||
if (local_only < 0)
|
|
||||||
{
|
|
||||||
res[kv.first] = kv.second;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (auto & kv: cli_config)
|
|
||||||
{
|
|
||||||
res[kv.first] = kv.second;
|
|
||||||
}
|
|
||||||
for (auto & kv: etcd_osd_config)
|
|
||||||
{
|
|
||||||
int local_only = find_str_array(local_only_params, local_only_end, kv.first);
|
|
||||||
if (local_only < 0)
|
|
||||||
{
|
|
||||||
res[kv.first] = kv.second;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
@@ -138,7 +138,6 @@ protected:
|
|||||||
|
|
||||||
std::vector<int> read_ready_clients;
|
std::vector<int> read_ready_clients;
|
||||||
std::vector<int> write_ready_clients;
|
std::vector<int> write_ready_clients;
|
||||||
// We don't use ringloop->set_immediate here because we may have no ringloop in client :)
|
|
||||||
std::vector<std::function<void()>> set_immediate;
|
std::vector<std::function<void()>> set_immediate;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
@@ -166,11 +165,7 @@ public:
|
|||||||
void accept_connections(int listen_fd);
|
void accept_connections(int listen_fd);
|
||||||
~osd_messenger_t();
|
~osd_messenger_t();
|
||||||
|
|
||||||
static json11::Json::object read_config(const json11::Json & config);
|
static json11::Json read_config(const json11::Json & config);
|
||||||
static json11::Json::object merge_configs(const json11::Json::object & cli_config,
|
|
||||||
const json11::Json::object & file_config,
|
|
||||||
const json11::Json::object & etcd_global_config,
|
|
||||||
const json11::Json::object & etcd_osd_config);
|
|
||||||
|
|
||||||
#ifdef WITH_RDMA
|
#ifdef WITH_RDMA
|
||||||
bool is_rdma_enabled();
|
bool is_rdma_enabled();
|
||||||
|
@@ -43,15 +43,7 @@ void osd_messenger_t::send_replies()
|
|||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
json11::Json::object osd_messenger_t::read_config(const json11::Json & config)
|
json11::Json osd_messenger_t::read_config(const json11::Json & config)
|
||||||
{
|
{
|
||||||
return json11::Json::object();
|
return config;
|
||||||
}
|
|
||||||
|
|
||||||
json11::Json::object osd_messenger_t::merge_configs(const json11::Json::object & cli_config,
|
|
||||||
const json11::Json::object & file_config,
|
|
||||||
const json11::Json::object & etcd_global_config,
|
|
||||||
const json11::Json::object & etcd_osd_config)
|
|
||||||
{
|
|
||||||
return cli_config;
|
|
||||||
}
|
}
|
||||||
|
@@ -368,8 +368,9 @@ static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
|
|||||||
bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
|
bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
|
||||||
{
|
{
|
||||||
auto rc = cl->rdma_conn;
|
auto rc = cl->rdma_conn;
|
||||||
if (!cl->send_list.size() || rc->cur_send >= rc->max_send)
|
if (!cl->send_list.size() || rc->cur_send > 0)
|
||||||
{
|
{
|
||||||
|
// Only send one batch at a time
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
uint64_t op_size = 0, op_sge = 0;
|
uint64_t op_size = 0, op_sge = 0;
|
||||||
@@ -379,7 +380,6 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
|
|||||||
iovec & iov = cl->send_list[rc->send_pos];
|
iovec & iov = cl->send_list[rc->send_pos];
|
||||||
if (op_size >= rc->max_msg || op_sge >= rc->max_sge)
|
if (op_size >= rc->max_msg || op_sge >= rc->max_sge)
|
||||||
{
|
{
|
||||||
rc->send_sizes.push_back(op_size);
|
|
||||||
try_send_rdma_wr(cl, sge, op_sge);
|
try_send_rdma_wr(cl, sge, op_sge);
|
||||||
op_sge = 0;
|
op_sge = 0;
|
||||||
op_size = 0;
|
op_size = 0;
|
||||||
@@ -405,24 +405,18 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
|
|||||||
}
|
}
|
||||||
if (op_sge > 0)
|
if (op_sge > 0)
|
||||||
{
|
{
|
||||||
rc->send_sizes.push_back(op_size);
|
|
||||||
try_send_rdma_wr(cl, sge, op_sge);
|
try_send_rdma_wr(cl, sge, op_sge);
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void try_recv_rdma_wr(osd_client_t *cl, void *buf)
|
static void try_recv_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
|
||||||
{
|
{
|
||||||
ibv_sge sge = {
|
|
||||||
.addr = (uintptr_t)buf,
|
|
||||||
.length = (uint32_t)cl->rdma_conn->max_msg,
|
|
||||||
.lkey = cl->rdma_conn->ctx->mr->lkey,
|
|
||||||
};
|
|
||||||
ibv_recv_wr *bad_wr = NULL;
|
ibv_recv_wr *bad_wr = NULL;
|
||||||
ibv_recv_wr wr = {
|
ibv_recv_wr wr = {
|
||||||
.wr_id = (uint64_t)(cl->peer_fd*2),
|
.wr_id = (uint64_t)(cl->peer_fd*2),
|
||||||
.sg_list = &sge,
|
.sg_list = sge,
|
||||||
.num_sge = 1,
|
.num_sge = op_sge,
|
||||||
};
|
};
|
||||||
int err = ibv_post_recv(cl->rdma_conn->qp, &wr, &bad_wr);
|
int err = ibv_post_recv(cl->rdma_conn->qp, &wr, &bad_wr);
|
||||||
if (err || bad_wr)
|
if (err || bad_wr)
|
||||||
@@ -440,7 +434,12 @@ bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
|
|||||||
{
|
{
|
||||||
void *buf = malloc_or_die(rc->max_msg);
|
void *buf = malloc_or_die(rc->max_msg);
|
||||||
rc->recv_buffers.push_back(buf);
|
rc->recv_buffers.push_back(buf);
|
||||||
try_recv_rdma_wr(cl, buf);
|
ibv_sge sge = {
|
||||||
|
.addr = (uintptr_t)buf,
|
||||||
|
.length = (uint32_t)rc->max_msg,
|
||||||
|
.lkey = rc->ctx->mr->lkey,
|
||||||
|
};
|
||||||
|
try_recv_rdma_wr(cl, &sge, 1);
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -477,7 +476,6 @@ void osd_messenger_t::handle_rdma_events()
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
osd_client_t *cl = cl_it->second;
|
osd_client_t *cl = cl_it->second;
|
||||||
auto rc = cl->rdma_conn;
|
|
||||||
if (wc[i].status != IBV_WC_SUCCESS)
|
if (wc[i].status != IBV_WC_SUCCESS)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "RDMA work request failed for client %d", client_id);
|
fprintf(stderr, "RDMA work request failed for client %d", client_id);
|
||||||
@@ -491,59 +489,44 @@ void osd_messenger_t::handle_rdma_events()
|
|||||||
}
|
}
|
||||||
if (!is_send)
|
if (!is_send)
|
||||||
{
|
{
|
||||||
rc->cur_recv--;
|
cl->rdma_conn->cur_recv--;
|
||||||
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf], wc[i].byte_len))
|
if (!handle_read_buffer(cl, cl->rdma_conn->recv_buffers[0], wc[i].byte_len))
|
||||||
{
|
{
|
||||||
// handle_read_buffer may stop the client
|
// handle_read_buffer may stop the client
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
try_recv_rdma_wr(cl, rc->recv_buffers[rc->next_recv_buf]);
|
free(cl->rdma_conn->recv_buffers[0]);
|
||||||
rc->next_recv_buf = (rc->next_recv_buf+1) % rc->recv_buffers.size();
|
cl->rdma_conn->recv_buffers.erase(cl->rdma_conn->recv_buffers.begin(), cl->rdma_conn->recv_buffers.begin()+1);
|
||||||
|
try_recv_rdma(cl);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
rc->cur_send--;
|
cl->rdma_conn->cur_send--;
|
||||||
uint64_t sent_size = rc->send_sizes.at(0);
|
if (!cl->rdma_conn->cur_send)
|
||||||
rc->send_sizes.erase(rc->send_sizes.begin(), rc->send_sizes.begin()+1);
|
|
||||||
int send_pos = 0, send_buf_pos = 0;
|
|
||||||
while (sent_size > 0)
|
|
||||||
{
|
{
|
||||||
if (sent_size >= cl->send_list.at(send_pos).iov_len)
|
// Wait for the whole batch
|
||||||
|
for (int i = 0; i < cl->rdma_conn->send_pos; i++)
|
||||||
{
|
{
|
||||||
sent_size -= cl->send_list[send_pos].iov_len;
|
if (cl->outbox[i].flags & MSGR_SENDP_FREE)
|
||||||
send_pos++;
|
{
|
||||||
|
// Reply fully sent
|
||||||
|
delete cl->outbox[i].op;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
if (cl->rdma_conn->send_pos > 0)
|
||||||
{
|
{
|
||||||
send_buf_pos = sent_size;
|
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+cl->rdma_conn->send_pos);
|
||||||
sent_size = 0;
|
cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+cl->rdma_conn->send_pos);
|
||||||
|
cl->rdma_conn->send_pos = 0;
|
||||||
}
|
}
|
||||||
}
|
if (cl->rdma_conn->send_buf_pos > 0)
|
||||||
assert(rc->send_pos >= send_pos);
|
|
||||||
if (rc->send_pos == send_pos)
|
|
||||||
{
|
|
||||||
rc->send_buf_pos -= send_buf_pos;
|
|
||||||
}
|
|
||||||
rc->send_pos -= send_pos;
|
|
||||||
for (int i = 0; i < send_pos; i++)
|
|
||||||
{
|
|
||||||
if (cl->outbox[i].flags & MSGR_SENDP_FREE)
|
|
||||||
{
|
{
|
||||||
// Reply fully sent
|
cl->send_list[0].iov_base = (uint8_t*)cl->send_list[0].iov_base + cl->rdma_conn->send_buf_pos;
|
||||||
delete cl->outbox[i].op;
|
cl->send_list[0].iov_len -= cl->rdma_conn->send_buf_pos;
|
||||||
|
cl->rdma_conn->send_buf_pos = 0;
|
||||||
}
|
}
|
||||||
|
try_send_rdma(cl);
|
||||||
}
|
}
|
||||||
if (send_pos > 0)
|
|
||||||
{
|
|
||||||
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+send_pos);
|
|
||||||
cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+send_pos);
|
|
||||||
}
|
|
||||||
if (send_buf_pos > 0)
|
|
||||||
{
|
|
||||||
cl->send_list[0].iov_base = (uint8_t*)cl->send_list[0].iov_base + send_buf_pos;
|
|
||||||
cl->send_list[0].iov_len -= send_buf_pos;
|
|
||||||
}
|
|
||||||
try_send_rdma(cl);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} while (event_count > 0);
|
} while (event_count > 0);
|
||||||
|
@@ -49,9 +49,8 @@ struct msgr_rdma_connection_t
|
|||||||
uint64_t max_msg = 0;
|
uint64_t max_msg = 0;
|
||||||
|
|
||||||
int send_pos = 0, send_buf_pos = 0;
|
int send_pos = 0, send_buf_pos = 0;
|
||||||
int next_recv_buf = 0;
|
int recv_pos = 0, recv_buf_pos = 0;
|
||||||
std::vector<void*> recv_buffers;
|
std::vector<void*> recv_buffers;
|
||||||
std::vector<uint64_t> send_sizes;
|
|
||||||
|
|
||||||
~msgr_rdma_connection_t();
|
~msgr_rdma_connection_t();
|
||||||
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);
|
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);
|
||||||
|
@@ -313,18 +313,17 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
|
|||||||
stop_client(cl->peer_fd);
|
stop_client(cl->peer_fd);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (bmp_len > 0)
|
if (op->reply.hdr.retval >= 0 && bmp_len > 0)
|
||||||
{
|
{
|
||||||
assert(op->bitmap);
|
assert(op->bitmap);
|
||||||
cl->recv_list.push_back(op->bitmap, bmp_len);
|
cl->recv_list.push_back(op->bitmap, bmp_len);
|
||||||
cl->read_remaining += bmp_len;
|
|
||||||
}
|
}
|
||||||
if (op->reply.hdr.retval > 0)
|
if (op->reply.hdr.retval > 0)
|
||||||
{
|
{
|
||||||
assert(op->iov.count > 0);
|
assert(op->iov.count > 0);
|
||||||
cl->recv_list.append(op->iov);
|
cl->recv_list.append(op->iov);
|
||||||
cl->read_remaining += op->reply.hdr.retval;
|
|
||||||
}
|
}
|
||||||
|
cl->read_remaining = op->reply.hdr.retval + bmp_len;
|
||||||
if (cl->read_remaining == 0)
|
if (cl->read_remaining == 0)
|
||||||
{
|
{
|
||||||
goto reuse;
|
goto reuse;
|
||||||
|
@@ -39,11 +39,6 @@ struct __attribute__((__packed__)) obj_ver_id
|
|||||||
uint64_t version;
|
uint64_t version;
|
||||||
};
|
};
|
||||||
|
|
||||||
inline bool operator == (const obj_ver_id & a, const obj_ver_id & b)
|
|
||||||
{
|
|
||||||
return a.oid == b.oid && a.version == b.version;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline bool operator < (const obj_ver_id & a, const obj_ver_id & b)
|
inline bool operator < (const obj_ver_id & a, const obj_ver_id & b)
|
||||||
{
|
{
|
||||||
return a.oid < b.oid || a.oid == b.oid && a.version < b.version;
|
return a.oid < b.oid || a.oid == b.oid && a.version < b.version;
|
||||||
|
112
src/osd.cpp
112
src/osd.cpp
@@ -35,18 +35,18 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
|
|||||||
|
|
||||||
this->ringloop = ringloop;
|
this->ringloop = ringloop;
|
||||||
|
|
||||||
this->cli_config = config.object_items();
|
this->config = msgr.read_config(config).object_items();
|
||||||
this->file_config = msgr.read_config(this->cli_config);
|
if (this->config.find("log_level") == this->config.end())
|
||||||
parse_config(true);
|
this->config["log_level"] = 1;
|
||||||
|
parse_config(this->config, true);
|
||||||
|
|
||||||
epmgr = new epoll_manager_t(ringloop);
|
epmgr = new epoll_manager_t(ringloop);
|
||||||
// FIXME: Use timerfd_interval based directly on io_uring
|
// FIXME: Use timerfd_interval based directly on io_uring
|
||||||
this->tfd = epmgr->tfd;
|
this->tfd = epmgr->tfd;
|
||||||
|
|
||||||
if (!json_is_true(this->config["disable_blockstore"]))
|
auto bs_cfg = json_to_bs(this->config);
|
||||||
|
this->bs = new blockstore_t(bs_cfg, ringloop, tfd);
|
||||||
{
|
{
|
||||||
auto bs_cfg = json_to_bs(this->config);
|
|
||||||
this->bs = new blockstore_t(bs_cfg, ringloop, tfd);
|
|
||||||
// Autosync based on the number of unstable writes to prevent stalls due to insufficient journal space
|
// Autosync based on the number of unstable writes to prevent stalls due to insufficient journal space
|
||||||
uint64_t max_autosync = bs->get_journal_size() / bs->get_block_size() / 2;
|
uint64_t max_autosync = bs->get_journal_size() / bs->get_block_size() / 2;
|
||||||
if (autosync_writes > max_autosync)
|
if (autosync_writes > max_autosync)
|
||||||
@@ -67,11 +67,11 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
print_stats_timer_id = this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
|
this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
|
||||||
{
|
{
|
||||||
print_stats();
|
print_stats();
|
||||||
});
|
});
|
||||||
slow_log_timer_id = this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
|
this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
|
||||||
{
|
{
|
||||||
print_slow();
|
print_slow();
|
||||||
});
|
});
|
||||||
@@ -91,42 +91,18 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
|
|||||||
|
|
||||||
osd_t::~osd_t()
|
osd_t::~osd_t()
|
||||||
{
|
{
|
||||||
if (slow_log_timer_id >= 0)
|
|
||||||
{
|
|
||||||
tfd->clear_timer(slow_log_timer_id);
|
|
||||||
slow_log_timer_id = -1;
|
|
||||||
}
|
|
||||||
if (print_stats_timer_id >= 0)
|
|
||||||
{
|
|
||||||
tfd->clear_timer(print_stats_timer_id);
|
|
||||||
print_stats_timer_id = -1;
|
|
||||||
}
|
|
||||||
if (autosync_timer_id >= 0)
|
|
||||||
{
|
|
||||||
tfd->clear_timer(autosync_timer_id);
|
|
||||||
autosync_timer_id = -1;
|
|
||||||
}
|
|
||||||
ringloop->unregister_consumer(&consumer);
|
ringloop->unregister_consumer(&consumer);
|
||||||
delete epmgr;
|
delete epmgr;
|
||||||
if (bs)
|
delete bs;
|
||||||
delete bs;
|
|
||||||
close(listen_fd);
|
close(listen_fd);
|
||||||
free(zero_buffer);
|
free(zero_buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::parse_config(bool init)
|
void osd_t::parse_config(const json11::Json & config, bool allow_disk_params)
|
||||||
{
|
{
|
||||||
config = msgr.merge_configs(cli_config, file_config, etcd_global_config, etcd_osd_config);
|
|
||||||
if (config.find("log_level") == this->config.end())
|
|
||||||
config["log_level"] = 1;
|
|
||||||
if (bs)
|
|
||||||
{
|
|
||||||
auto bs_cfg = json_to_bs(config);
|
|
||||||
bs->parse_config(bs_cfg);
|
|
||||||
}
|
|
||||||
st_cli.parse_config(config);
|
st_cli.parse_config(config);
|
||||||
msgr.parse_config(config);
|
msgr.parse_config(config);
|
||||||
if (init)
|
if (allow_disk_params)
|
||||||
{
|
{
|
||||||
// OSD number
|
// OSD number
|
||||||
osd_num = config["osd_num"].uint64_value();
|
osd_num = config["osd_num"].uint64_value();
|
||||||
@@ -148,27 +124,24 @@ void osd_t::parse_config(bool init)
|
|||||||
immediate_commit = IMMEDIATE_SMALL;
|
immediate_commit = IMMEDIATE_SMALL;
|
||||||
else
|
else
|
||||||
immediate_commit = IMMEDIATE_NONE;
|
immediate_commit = IMMEDIATE_NONE;
|
||||||
// Bind address
|
|
||||||
bind_address = config["bind_address"].string_value();
|
|
||||||
if (bind_address == "")
|
|
||||||
bind_address = "0.0.0.0";
|
|
||||||
bind_port = config["bind_port"].uint64_value();
|
|
||||||
if (bind_port <= 0 || bind_port > 65535)
|
|
||||||
bind_port = 0;
|
|
||||||
// OSD configuration
|
|
||||||
etcd_report_interval = config["etcd_report_interval"].uint64_value();
|
|
||||||
if (etcd_report_interval <= 0)
|
|
||||||
etcd_report_interval = 5;
|
|
||||||
readonly = json_is_true(config["readonly"]);
|
|
||||||
run_primary = !json_is_false(config["run_primary"]);
|
|
||||||
allow_test_ops = json_is_true(config["allow_test_ops"]);
|
|
||||||
}
|
}
|
||||||
|
// Bind address
|
||||||
|
bind_address = config["bind_address"].string_value();
|
||||||
|
if (bind_address == "")
|
||||||
|
bind_address = "0.0.0.0";
|
||||||
|
bind_port = config["bind_port"].uint64_value();
|
||||||
|
if (bind_port <= 0 || bind_port > 65535)
|
||||||
|
bind_port = 0;
|
||||||
|
// OSD configuration
|
||||||
log_level = config["log_level"].uint64_value();
|
log_level = config["log_level"].uint64_value();
|
||||||
auto old_no_rebalance = no_rebalance;
|
etcd_report_interval = config["etcd_report_interval"].uint64_value();
|
||||||
|
if (etcd_report_interval <= 0)
|
||||||
|
etcd_report_interval = 5;
|
||||||
|
readonly = json_is_true(config["readonly"]);
|
||||||
|
run_primary = !json_is_false(config["run_primary"]);
|
||||||
no_rebalance = json_is_true(config["no_rebalance"]);
|
no_rebalance = json_is_true(config["no_rebalance"]);
|
||||||
auto old_no_recovery = no_recovery;
|
|
||||||
no_recovery = json_is_true(config["no_recovery"]);
|
no_recovery = json_is_true(config["no_recovery"]);
|
||||||
auto old_autosync_interval = autosync_interval;
|
allow_test_ops = json_is_true(config["allow_test_ops"]);
|
||||||
if (!config["autosync_interval"].is_null())
|
if (!config["autosync_interval"].is_null())
|
||||||
{
|
{
|
||||||
// Allow to set it to 0
|
// Allow to set it to 0
|
||||||
@@ -196,46 +169,15 @@ void osd_t::parse_config(bool init)
|
|||||||
recovery_sync_batch = config["recovery_sync_batch"].uint64_value();
|
recovery_sync_batch = config["recovery_sync_batch"].uint64_value();
|
||||||
if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
|
if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
|
||||||
recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
||||||
auto old_print_stats_interval = print_stats_interval;
|
|
||||||
print_stats_interval = config["print_stats_interval"].uint64_value();
|
print_stats_interval = config["print_stats_interval"].uint64_value();
|
||||||
if (!print_stats_interval)
|
if (!print_stats_interval)
|
||||||
print_stats_interval = 3;
|
print_stats_interval = 3;
|
||||||
auto old_slow_log_interval = slow_log_interval;
|
|
||||||
slow_log_interval = config["slow_log_interval"].uint64_value();
|
slow_log_interval = config["slow_log_interval"].uint64_value();
|
||||||
if (!slow_log_interval)
|
if (!slow_log_interval)
|
||||||
slow_log_interval = 10;
|
slow_log_interval = 10;
|
||||||
inode_vanish_time = config["inode_vanish_time"].uint64_value();
|
inode_vanish_time = config["inode_vanish_time"].uint64_value();
|
||||||
if (!inode_vanish_time)
|
if (!inode_vanish_time)
|
||||||
inode_vanish_time = 60;
|
inode_vanish_time = 60;
|
||||||
if ((old_no_rebalance && !no_rebalance || old_no_recovery && !no_recovery) &&
|
|
||||||
!(peering_state & (OSD_RECOVERING | OSD_FLUSHING_PGS)))
|
|
||||||
{
|
|
||||||
peering_state = peering_state | OSD_RECOVERING;
|
|
||||||
}
|
|
||||||
if (old_autosync_interval != autosync_interval && autosync_timer_id >= 0)
|
|
||||||
{
|
|
||||||
this->tfd->clear_timer(autosync_timer_id);
|
|
||||||
autosync_timer_id = this->tfd->set_timer(autosync_interval*1000, true, [this](int timer_id)
|
|
||||||
{
|
|
||||||
autosync();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
if (old_print_stats_interval != print_stats_interval && print_stats_timer_id >= 0)
|
|
||||||
{
|
|
||||||
tfd->clear_timer(print_stats_timer_id);
|
|
||||||
print_stats_timer_id = this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
|
|
||||||
{
|
|
||||||
print_stats();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
if (old_slow_log_interval != slow_log_interval && slow_log_timer_id >= 0)
|
|
||||||
{
|
|
||||||
tfd->clear_timer(slow_log_timer_id);
|
|
||||||
slow_log_timer_id = this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
|
|
||||||
{
|
|
||||||
print_slow();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::bind_socket()
|
void osd_t::bind_socket()
|
||||||
@@ -460,7 +402,7 @@ void osd_t::print_slow()
|
|||||||
int l = sizeof(alloc), n;
|
int l = sizeof(alloc), n;
|
||||||
char *buf = alloc;
|
char *buf = alloc;
|
||||||
#define bufprintf(s, ...) { n = snprintf(buf, l, s, __VA_ARGS__); n = n < 0 ? 0 : n; buf += n; l -= n; }
|
#define bufprintf(s, ...) { n = snprintf(buf, l, s, __VA_ARGS__); n = n < 0 ? 0 : n; buf += n; l -= n; }
|
||||||
bufprintf("[OSD %lu] Slow op %lx", osd_num, (unsigned long)op);
|
bufprintf("[OSD %lu] Slow op", osd_num);
|
||||||
if (kv.second->osd_num)
|
if (kv.second->osd_num)
|
||||||
{
|
{
|
||||||
bufprintf(" from peer OSD %lu (client %d)", kv.second->osd_num, kv.second->peer_fd);
|
bufprintf(" from peer OSD %lu (client %d)", kv.second->osd_num, kv.second->peer_fd);
|
||||||
@@ -533,7 +475,7 @@ void osd_t::print_slow()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (has_slow && bs)
|
if (has_slow)
|
||||||
{
|
{
|
||||||
bs->dump_diagnostics();
|
bs->dump_diagnostics();
|
||||||
}
|
}
|
||||||
|
@@ -90,7 +90,7 @@ class osd_t
|
|||||||
{
|
{
|
||||||
// config
|
// config
|
||||||
|
|
||||||
json11::Json::object cli_config, file_config, etcd_global_config, etcd_osd_config, config;
|
json11::Json::object config;
|
||||||
int etcd_report_interval = 5;
|
int etcd_report_interval = 5;
|
||||||
|
|
||||||
bool readonly = false;
|
bool readonly = false;
|
||||||
@@ -126,7 +126,6 @@ class osd_t
|
|||||||
bool pg_config_applied = false;
|
bool pg_config_applied = false;
|
||||||
bool etcd_reporting_pg_state = false;
|
bool etcd_reporting_pg_state = false;
|
||||||
bool etcd_reporting_stats = false;
|
bool etcd_reporting_stats = false;
|
||||||
int autosync_timer_id = -1, print_stats_timer_id = -1, slow_log_timer_id = -1;
|
|
||||||
|
|
||||||
// peers and PGs
|
// peers and PGs
|
||||||
|
|
||||||
@@ -153,7 +152,7 @@ class osd_t
|
|||||||
|
|
||||||
bool stopping = false;
|
bool stopping = false;
|
||||||
int inflight_ops = 0;
|
int inflight_ops = 0;
|
||||||
blockstore_t *bs = NULL;
|
blockstore_t *bs;
|
||||||
void *zero_buffer = NULL;
|
void *zero_buffer = NULL;
|
||||||
uint64_t zero_buffer_size = 0;
|
uint64_t zero_buffer_size = 0;
|
||||||
uint32_t bs_block_size, bs_bitmap_granularity, clean_entry_bitmap_size;
|
uint32_t bs_block_size, bs_bitmap_granularity, clean_entry_bitmap_size;
|
||||||
@@ -174,7 +173,7 @@ class osd_t
|
|||||||
uint64_t recovery_stat_bytes[2][2] = {};
|
uint64_t recovery_stat_bytes[2][2] = {};
|
||||||
|
|
||||||
// cluster connection
|
// cluster connection
|
||||||
void parse_config(bool init);
|
void parse_config(const json11::Json & config, bool allow_disk_params);
|
||||||
void init_cluster();
|
void init_cluster();
|
||||||
void on_change_osd_state_hook(osd_num_t peer_osd);
|
void on_change_osd_state_hook(osd_num_t peer_osd);
|
||||||
void on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num);
|
void on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num);
|
||||||
@@ -192,7 +191,6 @@ class osd_t
|
|||||||
void reset_stats();
|
void reset_stats();
|
||||||
json11::Json get_statistics();
|
json11::Json get_statistics();
|
||||||
void report_statistics();
|
void report_statistics();
|
||||||
void add_pg_history(pg_t & pg);
|
|
||||||
void report_pg_state(pg_t & pg);
|
void report_pg_state(pg_t & pg);
|
||||||
void report_pg_states();
|
void report_pg_states();
|
||||||
void apply_pg_count();
|
void apply_pg_count();
|
||||||
|
@@ -75,7 +75,7 @@ void osd_t::init_cluster()
|
|||||||
}
|
}
|
||||||
if (run_primary && autosync_interval > 0)
|
if (run_primary && autosync_interval > 0)
|
||||||
{
|
{
|
||||||
autosync_timer_id = this->tfd->set_timer(autosync_interval*1000, true, [this](int timer_id)
|
this->tfd->set_timer(autosync_interval*1000, true, [this](int timer_id)
|
||||||
{
|
{
|
||||||
autosync();
|
autosync();
|
||||||
});
|
});
|
||||||
@@ -182,10 +182,10 @@ json11::Json osd_t::get_statistics()
|
|||||||
char time_str[50] = { 0 };
|
char time_str[50] = { 0 };
|
||||||
sprintf(time_str, "%ld.%03ld", ts.tv_sec, ts.tv_nsec/1000000);
|
sprintf(time_str, "%ld.%03ld", ts.tv_sec, ts.tv_nsec/1000000);
|
||||||
st["time"] = time_str;
|
st["time"] = time_str;
|
||||||
|
st["blockstore_ready"] = bs->is_started();
|
||||||
|
st["data_block_size"] = (uint64_t)bs->get_block_size();
|
||||||
if (bs)
|
if (bs)
|
||||||
{
|
{
|
||||||
st["blockstore_ready"] = bs->is_started();
|
|
||||||
st["data_block_size"] = (uint64_t)bs->get_block_size();
|
|
||||||
st["size"] = bs->get_block_count() * bs->get_block_size();
|
st["size"] = bs->get_block_count() * bs->get_block_size();
|
||||||
st["free"] = bs->get_free_block_count() * bs->get_block_size();
|
st["free"] = bs->get_free_block_count() * bs->get_block_size();
|
||||||
}
|
}
|
||||||
@@ -233,8 +233,7 @@ void osd_t::report_statistics()
|
|||||||
json11::Json::object inode_space;
|
json11::Json::object inode_space;
|
||||||
json11::Json::object last_stat;
|
json11::Json::object last_stat;
|
||||||
pool_id_t last_pool = 0;
|
pool_id_t last_pool = 0;
|
||||||
std::map<uint64_t, uint64_t> bs_empty_space;
|
auto & bs_inode_space = bs->get_inode_space_stats();
|
||||||
auto & bs_inode_space = bs ? bs->get_inode_space_stats() : bs_empty_space;
|
|
||||||
for (auto kv: bs_inode_space)
|
for (auto kv: bs_inode_space)
|
||||||
{
|
{
|
||||||
pool_id_t pool_id = INODE_POOL(kv.first);
|
pool_id_t pool_id = INODE_POOL(kv.first);
|
||||||
@@ -375,11 +374,7 @@ void osd_t::on_change_osd_state_hook(osd_num_t peer_osd)
|
|||||||
|
|
||||||
void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes)
|
void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes)
|
||||||
{
|
{
|
||||||
if (changes.find(st_cli.etcd_prefix+"/config/global") != changes.end())
|
// FIXME apply config changes in runtime (maybe, some)
|
||||||
{
|
|
||||||
etcd_global_config = changes[st_cli.etcd_prefix+"/config/global"].value.object_items();
|
|
||||||
parse_config(false);
|
|
||||||
}
|
|
||||||
if (run_primary)
|
if (run_primary)
|
||||||
{
|
{
|
||||||
apply_pg_count();
|
apply_pg_count();
|
||||||
@@ -389,8 +384,11 @@ void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes
|
|||||||
|
|
||||||
void osd_t::on_load_config_hook(json11::Json::object & global_config)
|
void osd_t::on_load_config_hook(json11::Json::object & global_config)
|
||||||
{
|
{
|
||||||
etcd_global_config = global_config;
|
json11::Json::object osd_config = this->config;
|
||||||
parse_config(true);
|
for (auto & kv: global_config)
|
||||||
|
if (osd_config.find(kv.first) == osd_config.end())
|
||||||
|
osd_config[kv.first] = kv.second;
|
||||||
|
parse_config(osd_config, false);
|
||||||
bind_socket();
|
bind_socket();
|
||||||
acquire_lease();
|
acquire_lease();
|
||||||
}
|
}
|
||||||
@@ -674,7 +672,7 @@ void osd_t::apply_pg_config()
|
|||||||
}
|
}
|
||||||
for (auto & hist_item: pg_cfg.target_history)
|
for (auto & hist_item: pg_cfg.target_history)
|
||||||
{
|
{
|
||||||
for (auto pg_osd: hist_item.osd_set)
|
for (auto pg_osd: hist_item)
|
||||||
{
|
{
|
||||||
if (pg_osd != 0)
|
if (pg_osd != 0)
|
||||||
{
|
{
|
||||||
@@ -685,7 +683,7 @@ void osd_t::apply_pg_config()
|
|||||||
auto vec_all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end());
|
auto vec_all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end());
|
||||||
if (currently_taken)
|
if (currently_taken)
|
||||||
{
|
{
|
||||||
if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING))
|
if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING | PG_PEERED))
|
||||||
{
|
{
|
||||||
if (pg_it->second.target_set == pg_cfg.target_set &&
|
if (pg_it->second.target_set == pg_cfg.target_set &&
|
||||||
pg_it->second.target_history == pg_cfg.target_history &&
|
pg_it->second.target_history == pg_cfg.target_history &&
|
||||||
@@ -696,6 +694,20 @@ void osd_t::apply_pg_config()
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
printf(
|
||||||
|
"Repeer %u/%u because of history: %s vs %s\n",
|
||||||
|
pool_id, pg_num,
|
||||||
|
json11::Json(json11::Json::object {
|
||||||
|
{ "target_set", pg_cfg.target_set },
|
||||||
|
{ "osd_sets", pg_cfg.target_history },
|
||||||
|
{ "all_peers", vec_all_peers },
|
||||||
|
}).dump().c_str(),
|
||||||
|
json11::Json(json11::Json::object {
|
||||||
|
{ "target_set", pg_it->second.target_set },
|
||||||
|
{ "osd_sets", pg_it->second.target_history },
|
||||||
|
{ "all_peers", pg_it->second.all_peers },
|
||||||
|
}).dump().c_str()
|
||||||
|
);
|
||||||
// Stop PG, reapply change after stopping
|
// Stop PG, reapply change after stopping
|
||||||
stop_pg(pg_it->second);
|
stop_pg(pg_it->second);
|
||||||
all_applied = false;
|
all_applied = false;
|
||||||
@@ -736,7 +748,7 @@ void osd_t::apply_pg_config()
|
|||||||
.pg_cursize = 0,
|
.pg_cursize = 0,
|
||||||
.pg_size = pool_item.second.pg_size,
|
.pg_size = pool_item.second.pg_size,
|
||||||
.pg_minsize = pool_item.second.pg_minsize,
|
.pg_minsize = pool_item.second.pg_minsize,
|
||||||
.pg_data_size = pool_item.second.scheme == POOL_SCHEME_REPLICATED
|
.pg_data_size = pg.scheme == POOL_SCHEME_REPLICATED
|
||||||
? 1 : pool_item.second.pg_size - pool_item.second.parity_chunks,
|
? 1 : pool_item.second.pg_size - pool_item.second.parity_chunks,
|
||||||
.pool_id = pool_id,
|
.pool_id = pool_id,
|
||||||
.pg_num = pg_num,
|
.pg_num = pg_num,
|
||||||
@@ -868,40 +880,12 @@ void osd_t::report_pg_states()
|
|||||||
// Prevent race conditions (for the case when the monitor is updating this key at the same time)
|
// Prevent race conditions (for the case when the monitor is updating this key at the same time)
|
||||||
pg.history_changed = false;
|
pg.history_changed = false;
|
||||||
std::string history_key = base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pool_id)+"/"+std::to_string(pg.pg_num));
|
std::string history_key = base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pool_id)+"/"+std::to_string(pg.pg_num));
|
||||||
json11::Json::array target_history;
|
|
||||||
for (auto & pgh: pg.target_history)
|
|
||||||
{
|
|
||||||
target_history.push_back(json11::Json::object {
|
|
||||||
{ "osd_set", pgh.osd_set },
|
|
||||||
{ "min_epoch", pgh.min_epoch },
|
|
||||||
{ "max_epoch", pgh.max_epoch },
|
|
||||||
});
|
|
||||||
}
|
|
||||||
std::vector<osd_num_t> all_peers;
|
|
||||||
for (auto peer_osd: pg.all_peers)
|
|
||||||
{
|
|
||||||
bool found = false;
|
|
||||||
for (auto target_peer: pg.target_set)
|
|
||||||
{
|
|
||||||
if (target_peer == peer_osd)
|
|
||||||
{
|
|
||||||
found = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!found)
|
|
||||||
{
|
|
||||||
all_peers.push_back(peer_osd);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
json11::Json::object history_value = {
|
json11::Json::object history_value = {
|
||||||
{ "epoch", pg.epoch },
|
{ "epoch", pg.epoch },
|
||||||
{ "osd_set_epochs", target_history },
|
{ "all_peers", pg.all_peers },
|
||||||
|
{ "osd_sets", pg.target_history },
|
||||||
};
|
};
|
||||||
if (all_peers.size())
|
printf("PG %u/%u HISTORY -> %s\n", pg.pool_id, pg.pg_num, json11::Json(history_value).dump().c_str());
|
||||||
{
|
|
||||||
history_value["all_peers"] = all_peers;
|
|
||||||
}
|
|
||||||
checks.push_back(json11::Json::object {
|
checks.push_back(json11::Json::object {
|
||||||
{ "target", "MOD" },
|
{ "target", "MOD" },
|
||||||
{ "key", history_key },
|
{ "key", history_key },
|
||||||
@@ -994,6 +978,13 @@ void osd_t::report_pg_states()
|
|||||||
}
|
}
|
||||||
this->pgs.erase(pg_it);
|
this->pgs.erase(pg_it);
|
||||||
}
|
}
|
||||||
|
else if (pg_it->second.state & PG_PEERED)
|
||||||
|
{
|
||||||
|
// Activate PG after PG PEERED state is reported along with history
|
||||||
|
// (if the state wasn't changed again)
|
||||||
|
pg_it->second.state = pg_it->second.state & ~PG_PEERED | PG_ACTIVE;
|
||||||
|
report_pg_state(pg_it->second);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Push other PG state updates, if any
|
// Push other PG state updates, if any
|
||||||
|
@@ -64,11 +64,6 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
|
|||||||
|
|
||||||
void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval)
|
void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval)
|
||||||
{
|
{
|
||||||
if (log_level > 2)
|
|
||||||
{
|
|
||||||
printf("[PG %u/%u] flush batch %lx completed on OSD %lu with result %d\n",
|
|
||||||
pool_id, pg_num, (uint64_t)fb, peer_osd, retval);
|
|
||||||
}
|
|
||||||
pool_pg_num_t pg_id = { .pool_id = pool_id, .pg_num = pg_num };
|
pool_pg_num_t pg_id = { .pool_id = pool_id, .pg_num = pg_num };
|
||||||
if (pgs.find(pg_id) == pgs.end() || pgs[pg_id].flush_batch != fb)
|
if (pgs.find(pg_id) == pgs.end() || pgs[pg_id].flush_batch != fb)
|
||||||
{
|
{
|
||||||
@@ -104,9 +99,10 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
|
|||||||
std::vector<osd_op_t*> continue_ops;
|
std::vector<osd_op_t*> continue_ops;
|
||||||
auto & pg = pgs.at(pg_id);
|
auto & pg = pgs.at(pg_id);
|
||||||
auto it = pg.flush_actions.begin(), prev_it = it;
|
auto it = pg.flush_actions.begin(), prev_it = it;
|
||||||
|
auto erase_start = it;
|
||||||
while (1)
|
while (1)
|
||||||
{
|
{
|
||||||
if (it == pg.flush_actions.end() || !it->second.submitted ||
|
if (it == pg.flush_actions.end() ||
|
||||||
it->first.oid.inode != prev_it->first.oid.inode ||
|
it->first.oid.inode != prev_it->first.oid.inode ||
|
||||||
(it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK))
|
(it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK))
|
||||||
{
|
{
|
||||||
@@ -120,23 +116,29 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
|
|||||||
});
|
});
|
||||||
if (wr_it != pg.write_queue.end())
|
if (wr_it != pg.write_queue.end())
|
||||||
{
|
{
|
||||||
if (log_level > 2)
|
|
||||||
{
|
|
||||||
printf("[PG %u/%u] continuing write %lx to object %lx:%lx after flush\n",
|
|
||||||
pool_id, pg_num, (uint64_t)wr_it->second, wr_it->first.inode, wr_it->first.stripe);
|
|
||||||
}
|
|
||||||
continue_ops.push_back(wr_it->second);
|
continue_ops.push_back(wr_it->second);
|
||||||
|
pg.write_queue.erase(wr_it);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (it == pg.flush_actions.end() || !it->second.submitted)
|
if ((it == pg.flush_actions.end() || !it->second.submitted) &&
|
||||||
|
erase_start != it)
|
||||||
|
{
|
||||||
|
pg.flush_actions.erase(erase_start, it);
|
||||||
|
}
|
||||||
|
if (it == pg.flush_actions.end())
|
||||||
{
|
{
|
||||||
if (it != pg.flush_actions.begin())
|
|
||||||
{
|
|
||||||
pg.flush_actions.erase(pg.flush_actions.begin(), it);
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
prev_it = it++;
|
prev_it = it;
|
||||||
|
if (!it->second.submitted)
|
||||||
|
{
|
||||||
|
it++;
|
||||||
|
erase_start = it;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
it++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
delete fb;
|
delete fb;
|
||||||
pg.flush_batch = NULL;
|
pg.flush_batch = NULL;
|
||||||
@@ -166,18 +168,6 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
|||||||
// Copy buffer so it gets freed along with the operation
|
// Copy buffer so it gets freed along with the operation
|
||||||
op->buf = malloc_or_die(sizeof(obj_ver_id) * count);
|
op->buf = malloc_or_die(sizeof(obj_ver_id) * count);
|
||||||
memcpy(op->buf, data, sizeof(obj_ver_id) * count);
|
memcpy(op->buf, data, sizeof(obj_ver_id) * count);
|
||||||
if (log_level > 2)
|
|
||||||
{
|
|
||||||
printf(
|
|
||||||
"[PG %u/%u] flush batch %lx on OSD %lu: %s objects: ",
|
|
||||||
pool_id, pg_num, (uint64_t)fb, peer_osd, rollback ? "rollback" : "stabilize"
|
|
||||||
);
|
|
||||||
for (int i = 0; i < count; i++)
|
|
||||||
{
|
|
||||||
printf(i > 0 ? ", %lx:%lx v%lu" : "%lx:%lx v%lu", data[i].oid.inode, data[i].oid.stripe, data[i].version);
|
|
||||||
}
|
|
||||||
printf("\n");
|
|
||||||
}
|
|
||||||
if (peer_osd == this->osd_num)
|
if (peer_osd == this->osd_num)
|
||||||
{
|
{
|
||||||
// local
|
// local
|
||||||
@@ -287,25 +277,6 @@ bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
|
|||||||
|
|
||||||
void osd_t::submit_recovery_op(osd_recovery_op_t *op)
|
void osd_t::submit_recovery_op(osd_recovery_op_t *op)
|
||||||
{
|
{
|
||||||
// Check if the object is deleted
|
|
||||||
bool is_deleted = false;
|
|
||||||
pool_id_t pool_id = INODE_POOL(op->oid.inode);
|
|
||||||
auto pool_cfg_it = st_cli.pool_config.find(pool_id);
|
|
||||||
if (pool_cfg_it != st_cli.pool_config.end())
|
|
||||||
{
|
|
||||||
pg_num_t pg_num = (op->oid.stripe/pool_cfg_it->second.pg_stripe_size) % pg_counts[pool_id] + 1; // like map_to_pg()
|
|
||||||
auto pg_it = pgs.find({ .pool_id = pool_id, .pg_num = pg_num });
|
|
||||||
if (pg_it != pgs.end())
|
|
||||||
{
|
|
||||||
pg_osd_set_state_t *object_state;
|
|
||||||
get_object_osd_set(pg_it->second, op->oid, pg_it->second.cur_set.data(), &object_state);
|
|
||||||
if (object_state && (object_state->state & OBJ_DELETED))
|
|
||||||
{
|
|
||||||
// Object is deleted, but not from all OSDs - delete remaining copies
|
|
||||||
is_deleted = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
op->osd_op = new osd_op_t();
|
op->osd_op = new osd_op_t();
|
||||||
op->osd_op->op_type = OSD_OP_OUT;
|
op->osd_op->op_type = OSD_OP_OUT;
|
||||||
op->osd_op->req = (osd_any_op_t){
|
op->osd_op->req = (osd_any_op_t){
|
||||||
@@ -313,7 +284,7 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
|
|||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = 1,
|
.id = 1,
|
||||||
.opcode = (uint64_t)(is_deleted ? OSD_OP_DELETE : OSD_OP_WRITE),
|
.opcode = OSD_OP_WRITE,
|
||||||
},
|
},
|
||||||
.inode = op->oid.inode,
|
.inode = op->oid.inode,
|
||||||
.offset = op->oid.stripe,
|
.offset = op->oid.stripe,
|
||||||
@@ -333,10 +304,9 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
|
|||||||
{
|
{
|
||||||
// PG is stopped or one of the OSDs is gone, error is harmless
|
// PG is stopped or one of the OSDs is gone, error is harmless
|
||||||
printf(
|
printf(
|
||||||
"[PG %u/%u] Recovery operation failed with object %lx:%lx\n",
|
"Recovery operation failed with object %lx:%lx (PG %u/%u)\n",
|
||||||
INODE_POOL(op->oid.inode),
|
op->oid.inode, op->oid.stripe, INODE_POOL(op->oid.inode),
|
||||||
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size),
|
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size)
|
||||||
op->oid.inode, op->oid.stripe
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
24
src/osd_id.h
24
src/osd_id.h
@@ -3,8 +3,6 @@
|
|||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#define POOL_SCHEME_REPLICATED 1
|
#define POOL_SCHEME_REPLICATED 1
|
||||||
#define POOL_SCHEME_XOR 2
|
#define POOL_SCHEME_XOR 2
|
||||||
#define POOL_SCHEME_EC 3
|
#define POOL_SCHEME_EC 3
|
||||||
@@ -40,25 +38,3 @@ inline bool operator != (const pool_pg_num_t & a, const pool_pg_num_t & b)
|
|||||||
{
|
{
|
||||||
return a.pool_id != b.pool_id || a.pg_num != b.pg_num;
|
return a.pool_id != b.pool_id || a.pg_num != b.pg_num;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct pg_history_set_t
|
|
||||||
{
|
|
||||||
std::vector<osd_num_t> osd_set;
|
|
||||||
uint64_t min_epoch, max_epoch;
|
|
||||||
};
|
|
||||||
|
|
||||||
inline bool operator == (const pg_history_set_t & a, const pg_history_set_t & b)
|
|
||||||
{
|
|
||||||
return a.min_epoch == b.min_epoch && a.max_epoch == b.max_epoch && a.osd_set == b.osd_set;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline bool operator != (const pg_history_set_t & a, const pg_history_set_t & b)
|
|
||||||
{
|
|
||||||
return a.min_epoch != b.min_epoch || a.max_epoch != b.max_epoch || a.osd_set != b.osd_set;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline bool operator < (const pg_history_set_t & a, const pg_history_set_t & b)
|
|
||||||
{
|
|
||||||
return a.min_epoch < b.min_epoch || a.min_epoch == b.min_epoch &&
|
|
||||||
(a.max_epoch < b.max_epoch || a.max_epoch == b.max_epoch && a.osd_set < b.osd_set);
|
|
||||||
}
|
|
||||||
|
@@ -50,6 +50,10 @@ void osd_t::handle_peers()
|
|||||||
still = true;
|
still = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else if (p.second.state & PG_PEERED)
|
||||||
|
{
|
||||||
|
still = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (!still)
|
if (!still)
|
||||||
{
|
{
|
||||||
@@ -70,13 +74,17 @@ void osd_t::handle_peers()
|
|||||||
}
|
}
|
||||||
still = true;
|
still = true;
|
||||||
}
|
}
|
||||||
|
else if (p.second.state & PG_PEERED)
|
||||||
|
{
|
||||||
|
still = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (!still)
|
if (!still)
|
||||||
{
|
{
|
||||||
peering_state = peering_state & ~OSD_FLUSHING_PGS | OSD_RECOVERING;
|
peering_state = peering_state & ~OSD_FLUSHING_PGS | OSD_RECOVERING;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!(peering_state & OSD_FLUSHING_PGS) && (peering_state & OSD_RECOVERING) && !readonly)
|
if ((peering_state & OSD_RECOVERING) && !readonly)
|
||||||
{
|
{
|
||||||
if (!continue_recovery())
|
if (!continue_recovery())
|
||||||
{
|
{
|
||||||
@@ -92,7 +100,7 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
|
|||||||
{
|
{
|
||||||
auto & pg = p.second;
|
auto & pg = p.second;
|
||||||
bool repeer = false;
|
bool repeer = false;
|
||||||
if (pg.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
|
if (pg.state & (PG_PEERING | PG_PEERED | PG_ACTIVE | PG_INCOMPLETE))
|
||||||
{
|
{
|
||||||
for (osd_num_t pg_osd: pg.all_peers)
|
for (osd_num_t pg_osd: pg.all_peers)
|
||||||
{
|
{
|
||||||
@@ -231,7 +239,7 @@ void osd_t::start_pg_peering(pg_t & pg)
|
|||||||
for (auto & history_set: pg.target_history)
|
for (auto & history_set: pg.target_history)
|
||||||
{
|
{
|
||||||
bool found = true;
|
bool found = true;
|
||||||
for (auto history_osd: history_set.osd_set)
|
for (auto history_osd: history_set)
|
||||||
{
|
{
|
||||||
if (history_osd != 0)
|
if (history_osd != 0)
|
||||||
{
|
{
|
||||||
@@ -471,74 +479,57 @@ void osd_t::finish_stop_pg(pg_t & pg)
|
|||||||
report_pg_state(pg);
|
report_pg_state(pg);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int count_nonzero_osds(const std::vector<osd_num_t> & v)
|
|
||||||
{
|
|
||||||
int n = 0;
|
|
||||||
for (auto & osd_num: v)
|
|
||||||
{
|
|
||||||
if (osd_num != 0)
|
|
||||||
{
|
|
||||||
n++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return n;
|
|
||||||
}
|
|
||||||
|
|
||||||
void osd_t::report_pg_state(pg_t & pg)
|
void osd_t::report_pg_state(pg_t & pg)
|
||||||
{
|
{
|
||||||
pg.print_state();
|
pg.print_state();
|
||||||
this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
|
this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
|
||||||
if ((pg.state == PG_ACTIVE || pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD)) &&
|
if (pg.state == PG_ACTIVE && (pg.target_history.size() > 0 || pg.all_peers.size() > pg.target_set.size()))
|
||||||
(pg.target_history.size() != 1 ||
|
|
||||||
pg.target_history[0].osd_set != pg.target_set ||
|
|
||||||
pg.target_history[0].min_epoch != 0 ||
|
|
||||||
pg.target_history[0].max_epoch != pg.epoch ||
|
|
||||||
pg.all_peers.size() > count_nonzero_osds(pg.target_set)))
|
|
||||||
{
|
{
|
||||||
// Clear history of active+clean PGs
|
// Clear history of active+clean PGs
|
||||||
pg.history_changed = true;
|
pg.history_changed = true;
|
||||||
pg.target_history.clear();
|
pg.target_history.clear();
|
||||||
pg.target_history.push_back((pg_history_set_t){
|
pg.all_peers = pg.target_set;
|
||||||
.osd_set = pg.cur_set,
|
|
||||||
.min_epoch = 0,
|
|
||||||
.max_epoch = pg.epoch,
|
|
||||||
});
|
|
||||||
if (pg.state == PG_ACTIVE)
|
|
||||||
{
|
|
||||||
pg.all_peers.clear();
|
|
||||||
for (auto pg_osd: pg.target_set)
|
|
||||||
{
|
|
||||||
if (pg_osd)
|
|
||||||
pg.all_peers.push_back(pg_osd);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers
|
|
||||||
std::set<osd_num_t> dead_peers(pg.all_peers.begin(), pg.all_peers.end());
|
|
||||||
for (auto pg_osd: pg.cur_peers)
|
|
||||||
{
|
|
||||||
dead_peers.erase(pg_osd);
|
|
||||||
}
|
|
||||||
for (auto pg_osd: pg.target_set)
|
|
||||||
{
|
|
||||||
if (pg_osd)
|
|
||||||
dead_peers.insert(pg_osd);
|
|
||||||
}
|
|
||||||
pg.all_peers.clear();
|
|
||||||
pg.all_peers.insert(pg.all_peers.begin(), dead_peers.begin(), dead_peers.end());
|
|
||||||
}
|
|
||||||
std::sort(pg.all_peers.begin(), pg.all_peers.end());
|
std::sort(pg.all_peers.begin(), pg.all_peers.end());
|
||||||
|
pg.cur_peers = pg.target_set;
|
||||||
|
}
|
||||||
|
else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD))
|
||||||
|
{
|
||||||
|
// Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers
|
||||||
|
if (pg.target_history.size())
|
||||||
|
{
|
||||||
|
pg.history_changed = true;
|
||||||
|
pg.target_history.clear();
|
||||||
|
}
|
||||||
|
std::set<osd_num_t> dead_peers;
|
||||||
|
for (auto pg_osd: pg.all_peers)
|
||||||
|
{
|
||||||
|
dead_peers.insert(pg_osd);
|
||||||
|
}
|
||||||
|
for (auto pg_osd: pg.cur_peers)
|
||||||
|
{
|
||||||
|
dead_peers.erase(pg_osd);
|
||||||
|
}
|
||||||
|
for (auto pg_osd: pg.target_set)
|
||||||
|
{
|
||||||
|
if (pg_osd)
|
||||||
|
{
|
||||||
|
dead_peers.insert(pg_osd);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
auto new_all_peers = std::vector<osd_num_t>(dead_peers.begin(), dead_peers.end());
|
||||||
|
if (pg.all_peers != new_all_peers)
|
||||||
|
{
|
||||||
|
pg.history_changed = true;
|
||||||
|
pg.all_peers = new_all_peers;
|
||||||
|
}
|
||||||
pg.cur_peers.clear();
|
pg.cur_peers.clear();
|
||||||
for (auto pg_osd: pg.target_set)
|
for (auto pg_osd: pg.target_set)
|
||||||
{
|
{
|
||||||
if (pg_osd)
|
if (pg_osd)
|
||||||
|
{
|
||||||
pg.cur_peers.push_back(pg_osd);
|
pg.cur_peers.push_back(pg_osd);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// Change pg_config at the same time, otherwise our PG reconciling loop may try to apply the old metadata
|
|
||||||
auto & pg_cfg = st_cli.pool_config[pg.pool_id].pg_config[pg.pg_num];
|
|
||||||
pg_cfg.target_history = pg.target_history;
|
|
||||||
pg_cfg.all_peers = pg.all_peers;
|
|
||||||
}
|
}
|
||||||
if (pg.state == PG_OFFLINE && !this->pg_config_applied)
|
if (pg.state == PG_OFFLINE && !this->pg_config_applied)
|
||||||
{
|
{
|
||||||
@@ -546,51 +537,3 @@ void osd_t::report_pg_state(pg_t & pg)
|
|||||||
}
|
}
|
||||||
report_pg_states();
|
report_pg_states();
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::add_pg_history(pg_t & pg)
|
|
||||||
{
|
|
||||||
bool epoch_already_reported = false;
|
|
||||||
int max_epoch_pos = -1;
|
|
||||||
for (int i = pg.target_history.size()-1; i >= 0; i--)
|
|
||||||
{
|
|
||||||
if (pg.target_history[i].min_epoch > pg.epoch)
|
|
||||||
{
|
|
||||||
printf("[PG %u/%u] Invalid PG history: there is an entry with min_epoch (%lu) > current epoch (%lu)\n",
|
|
||||||
pg.pool_id, pg.pg_num, pg.target_history[i].min_epoch, pg.epoch);
|
|
||||||
force_stop(1);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (max_epoch_pos < 0 || pg.target_history[i].max_epoch > pg.target_history[max_epoch_pos].max_epoch)
|
|
||||||
{
|
|
||||||
max_epoch_pos = i;
|
|
||||||
}
|
|
||||||
if (pg.target_history[i].min_epoch <= pg.epoch &&
|
|
||||||
pg.target_history[i].max_epoch >= pg.epoch)
|
|
||||||
{
|
|
||||||
if (pg.target_history[i].osd_set != pg.cur_set)
|
|
||||||
{
|
|
||||||
printf("[PG %u/%u] Invalid target_history: epoch %lu has another OSD set already registered\n", pg.pool_id, pg.pg_num, pg.epoch);
|
|
||||||
force_stop(1);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
// Already reported
|
|
||||||
epoch_already_reported = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!epoch_already_reported)
|
|
||||||
{
|
|
||||||
if (max_epoch_pos >= 0 && pg.target_history[max_epoch_pos].osd_set == pg.cur_set)
|
|
||||||
{
|
|
||||||
pg.target_history[max_epoch_pos].max_epoch = pg.epoch;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
pg.target_history.push_back((pg_history_set_t){
|
|
||||||
.osd_set = pg.cur_set,
|
|
||||||
.min_epoch = pg.epoch,
|
|
||||||
.max_epoch = pg.epoch,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
@@ -52,7 +52,6 @@ struct pg_obj_state_check_t
|
|||||||
|
|
||||||
void walk();
|
void walk();
|
||||||
void start_object();
|
void start_object();
|
||||||
void recheck_version_osd_set();
|
|
||||||
void handle_version();
|
void handle_version();
|
||||||
void finish_object();
|
void finish_object();
|
||||||
};
|
};
|
||||||
@@ -85,15 +84,18 @@ void pg_obj_state_check_t::walk()
|
|||||||
pg->state = PG_INCOMPLETE | PG_HAS_INVALID;
|
pg->state = PG_INCOMPLETE | PG_HAS_INVALID;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// Activate PG
|
|
||||||
if (pg->pg_cursize < pg->pg_size)
|
if (pg->pg_cursize < pg->pg_size)
|
||||||
{
|
{
|
||||||
// Activate as degraded
|
// Activate as degraded
|
||||||
// Current OSD set will be added into target_history on first write
|
// Current OSD set will be added into target_history on first write
|
||||||
pg->state |= PG_DEGRADED;
|
pg->state |= PG_DEGRADED | PG_PEERED;
|
||||||
}
|
}
|
||||||
pg->state |= PG_ACTIVE;
|
else
|
||||||
if (pg->cur_peers.size() < pg->all_peers.size())
|
{
|
||||||
|
// Just activate
|
||||||
|
pg->state |= PG_ACTIVE;
|
||||||
|
}
|
||||||
|
if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size())
|
||||||
{
|
{
|
||||||
pg->state |= PG_LEFT_ON_DEAD;
|
pg->state |= PG_LEFT_ON_DEAD;
|
||||||
}
|
}
|
||||||
@@ -110,85 +112,13 @@ void pg_obj_state_check_t::start_object()
|
|||||||
n_unstable = n_invalid = 0;
|
n_unstable = n_invalid = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// FIXME: Put this under a feature flag
|
|
||||||
// FIXME: Implement OSD 'cookies' to be fool-proof so that if an OSD is wiped and
|
|
||||||
// recreated it doesn't also wipe all other data
|
|
||||||
void pg_obj_state_check_t::recheck_version_osd_set()
|
|
||||||
{
|
|
||||||
uint64_t epoch = (last_ver >> (64-PG_EPOCH_BITS));
|
|
||||||
if (!pg->epoch_sizes_differ && n_copies >= pg->pg_size)
|
|
||||||
{
|
|
||||||
// Enough copies
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
auto epoch_it = pg->target_by_epoch.lower_bound(epoch);
|
|
||||||
if (epoch_it == pg->target_by_epoch.end() || epoch_it->second.min_epoch > epoch)
|
|
||||||
{
|
|
||||||
// Epoch info not found
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (pg->epoch_sizes_differ && n_copies >= epoch_it->second.osd_set.size())
|
|
||||||
{
|
|
||||||
// For the (unlikely) case of PG size change - enough copies
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
// Recheck version against the OSD set corresponding to epoch if it's known
|
|
||||||
if (epoch_it != pg->target_by_epoch.end() && epoch_it->second.min_epoch <= epoch)
|
|
||||||
{
|
|
||||||
for (int j = 0; j < epoch_it->second.osd_set.size(); j++)
|
|
||||||
{
|
|
||||||
osd_num_t cur_osd = epoch_it->second.osd_set[j];
|
|
||||||
bool found = false;
|
|
||||||
for (int i = ver_start; i < ver_end; i++)
|
|
||||||
{
|
|
||||||
if (cur_osd == list[i].osd_num)
|
|
||||||
{
|
|
||||||
found = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!found)
|
|
||||||
{
|
|
||||||
// Check if a newer version is present on the same OSD and masks the older one
|
|
||||||
// It happens for overwritten replicas in the following case:
|
|
||||||
// Version 1 is present on OSD 1,2,3
|
|
||||||
// Client tries to write Version 2
|
|
||||||
// OSD 3 succeeds to write Version 2, others don't. OSD 3 crashes, then starts again
|
|
||||||
// OSD 1 sees: version 1 on OSD 1,2 and version 2 on OSD 3
|
|
||||||
// (version 1 on OSD 3 is already masked/removed)
|
|
||||||
// Version 1 is not present on a full set, but it must not be removed
|
|
||||||
if (replicated)
|
|
||||||
{
|
|
||||||
for (int i = obj_start; i < ver_start; i++)
|
|
||||||
{
|
|
||||||
if (cur_osd == list[i].osd_num)
|
|
||||||
{
|
|
||||||
found = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!found)
|
|
||||||
{
|
|
||||||
// Object is missing from one of the OSDs of that set.
|
|
||||||
// This means it's deleted or moved and we can safely drop this version.
|
|
||||||
target_ver = 0;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void pg_obj_state_check_t::handle_version()
|
void pg_obj_state_check_t::handle_version()
|
||||||
{
|
{
|
||||||
if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_data_size))
|
if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_data_size))
|
||||||
{
|
{
|
||||||
// Version is either stable or recoverable
|
// Version is either stable or recoverable
|
||||||
ver_end = list_pos;
|
|
||||||
target_ver = last_ver;
|
target_ver = last_ver;
|
||||||
// Skip versions that are not present on any of OSDs for the corresponding PG epoch
|
ver_end = list_pos;
|
||||||
recheck_version_osd_set();
|
|
||||||
}
|
}
|
||||||
if (!target_ver)
|
if (!target_ver)
|
||||||
{
|
{
|
||||||
@@ -252,8 +182,6 @@ void pg_obj_state_check_t::finish_object()
|
|||||||
// Version is either stable or recoverable
|
// Version is either stable or recoverable
|
||||||
target_ver = last_ver;
|
target_ver = last_ver;
|
||||||
ver_end = list_pos;
|
ver_end = list_pos;
|
||||||
// Skip versions that are not present on any of OSDs for the corresponding PG epoch
|
|
||||||
recheck_version_osd_set();
|
|
||||||
}
|
}
|
||||||
obj_end = list_pos;
|
obj_end = list_pos;
|
||||||
// Remember the decision
|
// Remember the decision
|
||||||
@@ -307,23 +235,11 @@ void pg_obj_state_check_t::finish_object()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!target_ver && (n_unstable >= obj_end-obj_start))
|
if (!target_ver)
|
||||||
{
|
{
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (!target_ver)
|
if (!replicated && n_roles < pg->pg_data_size)
|
||||||
{
|
|
||||||
// Object is present, but should not be :) i.e. it's a deleted object that reappeared
|
|
||||||
if (log_level > 1)
|
|
||||||
{
|
|
||||||
printf("Object is deleted: %lx:%lx version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
|
|
||||||
}
|
|
||||||
state = OBJ_DELETED;
|
|
||||||
pg->state = pg->state | PG_HAS_MISPLACED;
|
|
||||||
// To record all versions as outdated:
|
|
||||||
ver_end = obj_start;
|
|
||||||
}
|
|
||||||
else if (!replicated && n_roles < pg->pg_data_size)
|
|
||||||
{
|
{
|
||||||
if (log_level > 1)
|
if (log_level > 1)
|
||||||
{
|
{
|
||||||
@@ -351,7 +267,7 @@ void pg_obj_state_check_t::finish_object()
|
|||||||
pg->state = pg->state | PG_HAS_MISPLACED;
|
pg->state = pg->state | PG_HAS_MISPLACED;
|
||||||
}
|
}
|
||||||
if (log_level > 1 && (state & (OBJ_INCOMPLETE | OBJ_DEGRADED)) ||
|
if (log_level > 1 && (state & (OBJ_INCOMPLETE | OBJ_DEGRADED)) ||
|
||||||
log_level > 2 && (state & (OBJ_MISPLACED | OBJ_DELETED)))
|
log_level > 2 && (state & OBJ_MISPLACED))
|
||||||
{
|
{
|
||||||
for (int i = obj_start; i < obj_end; i++)
|
for (int i = obj_start; i < obj_end; i++)
|
||||||
{
|
{
|
||||||
@@ -360,9 +276,9 @@ void pg_obj_state_check_t::finish_object()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
pg->total_count++;
|
pg->total_count++;
|
||||||
osd_set.clear();
|
if (state != 0 || ver_end < obj_end)
|
||||||
if (target_ver != 0 && (state != 0 || ver_end < obj_end))
|
|
||||||
{
|
{
|
||||||
|
osd_set.clear();
|
||||||
for (int i = ver_start; i < ver_end; i++)
|
for (int i = ver_start; i < ver_end; i++)
|
||||||
{
|
{
|
||||||
osd_set.push_back((pg_obj_loc_t){
|
osd_set.push_back((pg_obj_loc_t){
|
||||||
@@ -385,8 +301,7 @@ void pg_obj_state_check_t::finish_object()
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (j >= osd_set.size() && ((state & OBJ_DELETED) ||
|
if (j >= osd_set.size() && pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num)
|
||||||
pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num))
|
|
||||||
{
|
{
|
||||||
osd_set.push_back((pg_obj_loc_t){
|
osd_set.push_back((pg_obj_loc_t){
|
||||||
.role = (list[i].oid.stripe & STRIPE_MASK),
|
.role = (list[i].oid.stripe & STRIPE_MASK),
|
||||||
@@ -401,11 +316,7 @@ void pg_obj_state_check_t::finish_object()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (state & OBJ_DELETED)
|
if (target_ver < max_ver)
|
||||||
{
|
|
||||||
pg->ver_override[oid] = max_ver;
|
|
||||||
}
|
|
||||||
else if (target_ver < max_ver)
|
|
||||||
{
|
{
|
||||||
pg->ver_override[oid] = target_ver;
|
pg->ver_override[oid] = target_ver;
|
||||||
}
|
}
|
||||||
@@ -459,7 +370,6 @@ void pg_obj_state_check_t::finish_object()
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
assert(it->second.state == state);
|
|
||||||
it->second.object_count++;
|
it->second.object_count++;
|
||||||
}
|
}
|
||||||
if (state & OBJ_INCOMPLETE)
|
if (state & OBJ_INCOMPLETE)
|
||||||
@@ -480,34 +390,6 @@ void pg_obj_state_check_t::finish_object()
|
|||||||
// FIXME: Write at least some tests for this function
|
// FIXME: Write at least some tests for this function
|
||||||
void pg_t::calc_object_states(int log_level)
|
void pg_t::calc_object_states(int log_level)
|
||||||
{
|
{
|
||||||
// Calculate intersections of target_history with cur_peers
|
|
||||||
for (auto & history_item: target_history)
|
|
||||||
{
|
|
||||||
if (history_item.max_epoch)
|
|
||||||
{
|
|
||||||
pg_history_set_t & set_copy = target_by_epoch[history_item.max_epoch];
|
|
||||||
set_copy.min_epoch = history_item.min_epoch;
|
|
||||||
set_copy.max_epoch = history_item.max_epoch;
|
|
||||||
for (int i = 0; i < history_item.osd_set.size(); i++)
|
|
||||||
{
|
|
||||||
if (history_item.osd_set[i] != 0)
|
|
||||||
{
|
|
||||||
for (int j = 0; j < cur_set.size(); j++)
|
|
||||||
{
|
|
||||||
if (cur_set[j] == history_item.osd_set[i])
|
|
||||||
{
|
|
||||||
set_copy.osd_set.push_back(history_item.osd_set[i]);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (set_copy.osd_set.size() != pg_size)
|
|
||||||
{
|
|
||||||
epoch_sizes_differ = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Copy all object lists into one array
|
// Copy all object lists into one array
|
||||||
pg_obj_state_check_t st;
|
pg_obj_state_check_t st;
|
||||||
st.log_level = log_level;
|
st.log_level = log_level;
|
||||||
@@ -544,18 +426,10 @@ void pg_t::calc_object_states(int log_level)
|
|||||||
std::sort(st.list.begin(), st.list.end());
|
std::sort(st.list.begin(), st.list.end());
|
||||||
// Walk over it and check object states
|
// Walk over it and check object states
|
||||||
st.walk();
|
st.walk();
|
||||||
target_by_epoch.clear(); // needed only in this function
|
|
||||||
if (this->state != PG_ACTIVE)
|
if (this->state != PG_ACTIVE)
|
||||||
{
|
{
|
||||||
assert(epoch != (((uint64_t)1 << PG_EPOCH_BITS)-1));
|
assert(epoch != (((uint64_t)1 << PG_EPOCH_BITS)-1));
|
||||||
epoch++;
|
epoch++;
|
||||||
for (auto & pgh: target_history)
|
|
||||||
{
|
|
||||||
if (epoch <= pgh.max_epoch)
|
|
||||||
{
|
|
||||||
epoch = pgh.max_epoch+1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (log_level > 0)
|
if (log_level > 0)
|
||||||
{
|
{
|
||||||
@@ -586,10 +460,11 @@ void pg_t::calc_object_states(int log_level)
|
|||||||
void pg_t::print_state()
|
void pg_t::print_state()
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
|
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
|
||||||
(state & PG_STARTING) ? "starting" : "",
|
(state & PG_STARTING) ? "starting" : "",
|
||||||
(state & PG_OFFLINE) ? "offline" : "",
|
(state & PG_OFFLINE) ? "offline" : "",
|
||||||
(state & PG_PEERING) ? "peering" : "",
|
(state & PG_PEERING) ? "peering" : "",
|
||||||
|
(state & PG_PEERED) ? "peered" : "",
|
||||||
(state & PG_INCOMPLETE) ? "incomplete" : "",
|
(state & PG_INCOMPLETE) ? "incomplete" : "",
|
||||||
(state & PG_ACTIVE) ? "active" : "",
|
(state & PG_ACTIVE) ? "active" : "",
|
||||||
(state & PG_REPEERING) ? "repeering" : "",
|
(state & PG_REPEERING) ? "repeering" : "",
|
||||||
|
@@ -89,9 +89,7 @@ struct pg_t
|
|||||||
// epoch number - should increase with each non-clean activation of the PG
|
// epoch number - should increase with each non-clean activation of the PG
|
||||||
uint64_t epoch = 0, reported_epoch = 0;
|
uint64_t epoch = 0, reported_epoch = 0;
|
||||||
// target history and all potential peers
|
// target history and all potential peers
|
||||||
std::vector<pg_history_set_t> target_history;
|
std::vector<std::vector<osd_num_t>> target_history;
|
||||||
std::map<uint64_t, pg_history_set_t> target_by_epoch;
|
|
||||||
bool epoch_sizes_differ = false;
|
|
||||||
std::vector<osd_num_t> all_peers;
|
std::vector<osd_num_t> all_peers;
|
||||||
bool history_changed = false;
|
bool history_changed = false;
|
||||||
// peer list from the last peering event
|
// peer list from the last peering event
|
||||||
|
@@ -54,6 +54,5 @@ int main(int argc, char *argv[])
|
|||||||
{
|
{
|
||||||
printf("dev: state=%lx\n", it.second.state);
|
printf("dev: state=%lx\n", it.second.state);
|
||||||
}
|
}
|
||||||
delete pg.peering_state;
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@@ -199,21 +199,6 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
|
|||||||
{
|
{
|
||||||
// PG may be degraded or have misplaced objects
|
// PG may be degraded or have misplaced objects
|
||||||
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
|
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
|
||||||
if (op_data->object_state && (op_data->object_state->state & OBJ_DELETED))
|
|
||||||
{
|
|
||||||
// Object is deleted, just return zeroes
|
|
||||||
cur_op->reply.rw.version = 0;
|
|
||||||
cur_op->reply.rw.bitmap_len = op_data->pg_data_size * clean_entry_bitmap_size;
|
|
||||||
uint64_t zero_len = cur_op->reply.rw.bitmap_len + cur_op->req.rw.len;
|
|
||||||
while (zero_len >= 0)
|
|
||||||
{
|
|
||||||
uint64_t cur_zero_len = zero_buffer_size > zero_len ? zero_len : zero_buffer_size;
|
|
||||||
cur_op->iov.push_back(zero_buffer, cur_zero_len);
|
|
||||||
zero_len -= cur_zero_len;
|
|
||||||
}
|
|
||||||
finish_op(cur_op, cur_op->req.rw.len);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
|
if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||||
{
|
{
|
||||||
@@ -305,7 +290,7 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object
|
|||||||
report_pg_state(pg);
|
report_pg_state(pg);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (object_state->state & (OBJ_MISPLACED | OBJ_DELETED))
|
else if (object_state->state & OBJ_MISPLACED)
|
||||||
{
|
{
|
||||||
this->misplaced_objects--;
|
this->misplaced_objects--;
|
||||||
pg.misplaced_objects.erase(oid);
|
pg.misplaced_objects.erase(oid);
|
||||||
@@ -344,6 +329,12 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
|
|||||||
else if (op_data->st == 4) goto resume_4;
|
else if (op_data->st == 4) goto resume_4;
|
||||||
else if (op_data->st == 5) goto resume_5;
|
else if (op_data->st == 5) goto resume_5;
|
||||||
assert(op_data->st == 0);
|
assert(op_data->st == 0);
|
||||||
|
// Delete is forbidden even in active PGs if they're also degraded or have previous dead OSDs
|
||||||
|
if (pg.state & (PG_DEGRADED | PG_LEFT_ON_DEAD))
|
||||||
|
{
|
||||||
|
finish_op(cur_op, -EBUSY);
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (!check_write_queue(cur_op, pg))
|
if (!check_write_queue(cur_op, pg))
|
||||||
{
|
{
|
||||||
return;
|
return;
|
||||||
@@ -351,18 +342,11 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
|
|||||||
resume_1:
|
resume_1:
|
||||||
// Determine which OSDs contain this object and delete it
|
// Determine which OSDs contain this object and delete it
|
||||||
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
|
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
|
||||||
if (op_data->object_state && (op_data->object_state->state & OBJ_DELETED))
|
// Submit 1 read to determine the actual version number
|
||||||
{
|
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
|
||||||
op_data->fact_ver = pg.ver_override[op_data->oid];
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// Submit 1 read to determine the actual version number
|
|
||||||
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
|
|
||||||
resume_2:
|
resume_2:
|
||||||
op_data->st = 2;
|
op_data->st = 2;
|
||||||
return;
|
return;
|
||||||
}
|
|
||||||
resume_3:
|
resume_3:
|
||||||
if (op_data->errors > 0)
|
if (op_data->errors > 0)
|
||||||
{
|
{
|
||||||
|
@@ -133,12 +133,6 @@ int osd_t::collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitm
|
|||||||
uint64_t target_version = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
|
uint64_t target_version = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
|
||||||
pg_osd_set_state_t *object_state;
|
pg_osd_set_state_t *object_state;
|
||||||
uint64_t* cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
|
uint64_t* cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
|
||||||
if (object_state && (object_state->state & OBJ_DELETED))
|
|
||||||
{
|
|
||||||
// Object is deleted, zero out the bitmap
|
|
||||||
memset((uint8_t*)op_data->snapshot_bitmaps + chain_num*clean_entry_bitmap_size, 0, clean_entry_bitmap_size);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (pg.scheme == POOL_SCHEME_REPLICATED)
|
if (pg.scheme == POOL_SCHEME_REPLICATED)
|
||||||
{
|
{
|
||||||
osd_num_t read_target = 0;
|
osd_num_t read_target = 0;
|
||||||
@@ -225,7 +219,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
|
|||||||
op_data->n_subops++;
|
op_data->n_subops++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (op_data->n_subops > 0)
|
if (op_data->n_subops)
|
||||||
{
|
{
|
||||||
op_data->fact_ver = 0;
|
op_data->fact_ver = 0;
|
||||||
op_data->done = op_data->errors = 0;
|
op_data->done = op_data->errors = 0;
|
||||||
@@ -303,7 +297,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
|
|||||||
// Fail it immediately
|
// Fail it immediately
|
||||||
subop->peer_fd = -1;
|
subop->peer_fd = -1;
|
||||||
subop->reply.hdr.retval = -EPIPE;
|
subop->reply.hdr.retval = -EPIPE;
|
||||||
ringloop->set_immediate([subop]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
|
subop->callback(subop);
|
||||||
}
|
}
|
||||||
subop_idx++;
|
subop_idx++;
|
||||||
}
|
}
|
||||||
|
@@ -53,10 +53,7 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
|
|||||||
inode_stats[cur_op->req.rw.inode].op_count[inode_st_op]++;
|
inode_stats[cur_op->req.rw.inode].op_count[inode_st_op]++;
|
||||||
inode_stats[cur_op->req.rw.inode].op_sum[inode_st_op] += usec;
|
inode_stats[cur_op->req.rw.inode].op_sum[inode_st_op] += usec;
|
||||||
if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
|
if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
|
||||||
{
|
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->op_data->pg_data_size * bs_block_size;
|
||||||
if (cur_op->op_data)
|
|
||||||
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->op_data->pg_data_size * bs_block_size;
|
|
||||||
}
|
|
||||||
else
|
else
|
||||||
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->req.rw.len;
|
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->req.rw.len;
|
||||||
}
|
}
|
||||||
@@ -119,19 +116,17 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, const ui
|
|||||||
if (osd_set[role] != 0 && (wr || !rep && stripes[role].read_end != 0))
|
if (osd_set[role] != 0 && (wr || !rep && stripes[role].read_end != 0))
|
||||||
n_subops++;
|
n_subops++;
|
||||||
}
|
}
|
||||||
if (!n_subops && (submit_type == SUBMIT_RMW_READ || rep) && zero_read >= 0)
|
if (!n_subops && (submit_type == SUBMIT_RMW_READ || rep))
|
||||||
n_subops = 1;
|
n_subops = 1;
|
||||||
else
|
else
|
||||||
zero_read = -1;
|
zero_read = -1;
|
||||||
|
osd_op_t *subops = new osd_op_t[n_subops];
|
||||||
op_data->fact_ver = 0;
|
op_data->fact_ver = 0;
|
||||||
op_data->done = op_data->errors = op_data->errcode = 0;
|
op_data->done = op_data->errors = op_data->errcode = 0;
|
||||||
op_data->n_subops = n_subops;
|
op_data->n_subops = n_subops;
|
||||||
if (n_subops > 0)
|
op_data->subops = subops;
|
||||||
{
|
int sent = submit_primary_subop_batch(submit_type, op_data->oid.inode, op_version, op_data->stripes, osd_set, cur_op, 0, zero_read);
|
||||||
op_data->subops = new osd_op_t[n_subops];
|
assert(sent == n_subops);
|
||||||
int sent = submit_primary_subop_batch(submit_type, op_data->oid.inode, op_version, op_data->stripes, osd_set, cur_op, 0, zero_read);
|
|
||||||
assert(sent == n_subops);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t op_version,
|
int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t op_version,
|
||||||
@@ -240,7 +235,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
|||||||
// Fail it immediately
|
// Fail it immediately
|
||||||
subop->peer_fd = -1;
|
subop->peer_fd = -1;
|
||||||
subop->reply.hdr.retval = -EPIPE;
|
subop->reply.hdr.retval = -EPIPE;
|
||||||
ringloop->set_immediate([subop]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
|
subop->callback(subop);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
i++;
|
i++;
|
||||||
@@ -474,7 +469,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
|
|||||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||||
op_data->n_subops = chunks_to_delete_count;
|
op_data->n_subops = chunks_to_delete_count;
|
||||||
op_data->done = op_data->errors = op_data->errcode = 0;
|
op_data->done = op_data->errors = op_data->errcode = 0;
|
||||||
if (op_data->n_subops <= 0)
|
if (!op_data->n_subops)
|
||||||
{
|
{
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@@ -525,7 +520,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
|
|||||||
// Fail it immediately
|
// Fail it immediately
|
||||||
subops[i].peer_fd = -1;
|
subops[i].peer_fd = -1;
|
||||||
subops[i].reply.hdr.retval = -EPIPE;
|
subops[i].reply.hdr.retval = -EPIPE;
|
||||||
ringloop->set_immediate([subop = &subops[i]]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
|
subops[i].callback(&subops[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -640,7 +635,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
|||||||
// Fail it immediately
|
// Fail it immediately
|
||||||
subops[i].peer_fd = -1;
|
subops[i].peer_fd = -1;
|
||||||
subops[i].reply.hdr.retval = -EPIPE;
|
subops[i].reply.hdr.retval = -EPIPE;
|
||||||
ringloop->set_immediate([subop = &subops[i]]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
|
subops[i].callback(&subops[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -166,7 +166,7 @@ resume_6:
|
|||||||
for (int i = 0; i < unstable_osd.len; i++)
|
for (int i = 0; i < unstable_osd.len; i++)
|
||||||
{
|
{
|
||||||
// Except those from peered PGs
|
// Except those from peered PGs
|
||||||
auto & w = op_data->unstable_writes[unstable_osd.start + i];
|
auto & w = op_data->unstable_writes[i];
|
||||||
pool_pg_num_t wpg = {
|
pool_pg_num_t wpg = {
|
||||||
.pool_id = INODE_POOL(w.oid.inode),
|
.pool_id = INODE_POOL(w.oid.inode),
|
||||||
.pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
|
.pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
|
||||||
|
@@ -12,7 +12,6 @@ bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
|
|||||||
.oid = op_data->oid,
|
.oid = op_data->oid,
|
||||||
.osd_num = 0,
|
.osd_num = 0,
|
||||||
});
|
});
|
||||||
op_data->st = 1;
|
|
||||||
if (act_it != pg.flush_actions.end() &&
|
if (act_it != pg.flush_actions.end() &&
|
||||||
act_it->first.oid.inode == op_data->oid.inode &&
|
act_it->first.oid.inode == op_data->oid.inode &&
|
||||||
(act_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe)
|
(act_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe)
|
||||||
@@ -24,6 +23,7 @@ bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
|
|||||||
auto vo_it = pg.write_queue.find(op_data->oid);
|
auto vo_it = pg.write_queue.find(op_data->oid);
|
||||||
if (vo_it != pg.write_queue.end())
|
if (vo_it != pg.write_queue.end())
|
||||||
{
|
{
|
||||||
|
op_data->st = 1;
|
||||||
pg.write_queue.emplace(op_data->oid, cur_op);
|
pg.write_queue.emplace(op_data->oid, cur_op);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@@ -156,13 +156,21 @@ resume_3:
|
|||||||
{
|
{
|
||||||
// Report newer epoch before writing
|
// Report newer epoch before writing
|
||||||
// FIXME: We don't have to report all changed PG states here
|
// FIXME: We don't have to report all changed PG states here
|
||||||
|
this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
|
||||||
if (pg.state != PG_ACTIVE)
|
if (pg.state != PG_ACTIVE)
|
||||||
{
|
{
|
||||||
// Check that current OSD set is in history and/or add it there
|
// Check that current OSD set is in history and/or add it there
|
||||||
add_pg_history(pg);
|
std::vector<osd_num_t> history_set;
|
||||||
|
for (auto peer_osd: pg.cur_set)
|
||||||
|
if (peer_osd != 0)
|
||||||
|
history_set.push_back(peer_osd);
|
||||||
|
std::sort(history_set.begin(), history_set.end());
|
||||||
|
auto it = std::lower_bound(pg.target_history.begin(), pg.target_history.end(), history_set);
|
||||||
|
if (it == pg.target_history.end() || *it != history_set)
|
||||||
|
pg.target_history.insert(it, history_set);
|
||||||
|
pg.history_changed = true;
|
||||||
}
|
}
|
||||||
pg.history_changed = true;
|
report_pg_states();
|
||||||
report_pg_state(pg);
|
|
||||||
resume_10:
|
resume_10:
|
||||||
if (pg.epoch > pg.reported_epoch)
|
if (pg.epoch > pg.reported_epoch)
|
||||||
{
|
{
|
||||||
|
@@ -142,11 +142,11 @@ inline bool operator < (const reed_sol_erased_t &a, const reed_sol_erased_t &b)
|
|||||||
for (int i = 0; i < a.size && i < b.size; i++)
|
for (int i = 0; i < a.size && i < b.size; i++)
|
||||||
{
|
{
|
||||||
if (a.data[i] < b.data[i])
|
if (a.data[i] < b.data[i])
|
||||||
return true;
|
return -1;
|
||||||
else if (a.data[i] > b.data[i])
|
else if (a.data[i] > b.data[i])
|
||||||
return false;
|
return 1;
|
||||||
}
|
}
|
||||||
return false;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct reed_sol_matrix_t
|
struct reed_sol_matrix_t
|
||||||
@@ -677,11 +677,11 @@ void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_
|
|||||||
static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, uint32_t wr_end, buf_len_t *bufs, int & nbufs)
|
static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, uint32_t wr_end, buf_len_t *bufs, int & nbufs)
|
||||||
{
|
{
|
||||||
uint32_t ns = 0, ne = 0, os = 0, oe = 0;
|
uint32_t ns = 0, ne = 0, os = 0, oe = 0;
|
||||||
if (stripe.write_end > wr_start &&
|
if (stripe.req_end > wr_start &&
|
||||||
stripe.write_start < wr_end)
|
stripe.req_start < wr_end)
|
||||||
{
|
{
|
||||||
ns = std::max(stripe.write_start, wr_start);
|
ns = std::max(stripe.req_start, wr_start);
|
||||||
ne = std::min(stripe.write_end, wr_end);
|
ne = std::min(stripe.req_end, wr_end);
|
||||||
}
|
}
|
||||||
if (stripe.read_end > wr_start &&
|
if (stripe.read_end > wr_start &&
|
||||||
stripe.read_start < wr_end)
|
stripe.read_start < wr_end)
|
||||||
@@ -692,7 +692,7 @@ static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, ui
|
|||||||
if (ne && (!oe || ns <= os))
|
if (ne && (!oe || ns <= os))
|
||||||
{
|
{
|
||||||
// NEW or NEW->OLD
|
// NEW or NEW->OLD
|
||||||
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.write_start, .len = ne-ns };
|
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.req_start, .len = ne-ns };
|
||||||
if (os < ne)
|
if (os < ne)
|
||||||
os = ne;
|
os = ne;
|
||||||
if (oe > os)
|
if (oe > os)
|
||||||
@@ -708,7 +708,7 @@ static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, ui
|
|||||||
{
|
{
|
||||||
// OLD->NEW or OLD->NEW->OLD
|
// OLD->NEW or OLD->NEW->OLD
|
||||||
bufs[nbufs++] = { .buf = (uint8_t*)stripe.read_buf + os - stripe.read_start, .len = ns-os };
|
bufs[nbufs++] = { .buf = (uint8_t*)stripe.read_buf + os - stripe.read_start, .len = ns-os };
|
||||||
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.write_start, .len = ne-ns };
|
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.req_start, .len = ne-ns };
|
||||||
if (oe > ne)
|
if (oe > ne)
|
||||||
{
|
{
|
||||||
// OLD->NEW->OLD
|
// OLD->NEW->OLD
|
||||||
@@ -759,18 +759,7 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
|
|||||||
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_granularity,
|
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_granularity,
|
||||||
uint32_t &start, uint32_t &end)
|
uint32_t &start, uint32_t &end)
|
||||||
{
|
{
|
||||||
bool required = false;
|
if (write_osd_set[pg_minsize] != 0 || write_osd_set != read_osd_set)
|
||||||
for (int role = pg_minsize; role < pg_size; role++)
|
|
||||||
{
|
|
||||||
if (write_osd_set[role] != 0)
|
|
||||||
{
|
|
||||||
// Whole parity chunk is needed when we move the object
|
|
||||||
if (write_osd_set[role] != read_osd_set[role])
|
|
||||||
end = chunk_size;
|
|
||||||
required = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (required && end != chunk_size)
|
|
||||||
{
|
{
|
||||||
// start & end are required for calc_rmw_parity
|
// start & end are required for calc_rmw_parity
|
||||||
for (int role = 0; role < pg_minsize; role++)
|
for (int role = 0; role < pg_minsize; role++)
|
||||||
@@ -781,6 +770,14 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
|
|||||||
end = std::max(stripes[role].req_end, end);
|
end = std::max(stripes[role].req_end, end);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
for (int role = pg_minsize; role < pg_size; role++)
|
||||||
|
{
|
||||||
|
if (write_osd_set[role] != 0 && write_osd_set[role] != read_osd_set[role])
|
||||||
|
{
|
||||||
|
start = 0;
|
||||||
|
end = chunk_size;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// Set bitmap bits accordingly
|
// Set bitmap bits accordingly
|
||||||
if (bitmap_granularity > 0)
|
if (bitmap_granularity > 0)
|
||||||
@@ -948,7 +945,7 @@ void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
|
|||||||
{
|
{
|
||||||
if (write_osd_set[i])
|
if (write_osd_set[i])
|
||||||
{
|
{
|
||||||
memcpy((uint8_t*)subm + item_size*pg_minsize*j, (uint8_t*)matrix_data + item_size*pg_minsize*(i-pg_minsize), item_size*pg_minsize);
|
memcpy(subm + item_size*pg_minsize*j, matrix_data + item_size*pg_minsize*(i-pg_minsize), item_size*pg_minsize);
|
||||||
j++;
|
j++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -17,7 +17,6 @@ void test4();
|
|||||||
void test5();
|
void test5();
|
||||||
void test6();
|
void test6();
|
||||||
void test7();
|
void test7();
|
||||||
void test_rmw_4k_degraded_into_lost_to_normal(bool ec);
|
|
||||||
void test8();
|
void test8();
|
||||||
void test9();
|
void test9();
|
||||||
void test10();
|
void test10();
|
||||||
@@ -25,7 +24,7 @@ void test11();
|
|||||||
void test12();
|
void test12();
|
||||||
void test13();
|
void test13();
|
||||||
void test14();
|
void test14();
|
||||||
void test15(bool second);
|
void test15();
|
||||||
void test16();
|
void test16();
|
||||||
|
|
||||||
int main(int narg, char *args[])
|
int main(int narg, char *args[])
|
||||||
@@ -40,8 +39,6 @@ int main(int narg, char *args[])
|
|||||||
test6();
|
test6();
|
||||||
// Test 7
|
// Test 7
|
||||||
test7();
|
test7();
|
||||||
test_rmw_4k_degraded_into_lost_to_normal(false);
|
|
||||||
test_rmw_4k_degraded_into_lost_to_normal(true);
|
|
||||||
// Test 8
|
// Test 8
|
||||||
test8();
|
test8();
|
||||||
// Test 9
|
// Test 9
|
||||||
@@ -57,8 +54,7 @@ int main(int narg, char *args[])
|
|||||||
// Test 14
|
// Test 14
|
||||||
test14();
|
test14();
|
||||||
// Test 15
|
// Test 15
|
||||||
test15(false);
|
test15();
|
||||||
test15(true);
|
|
||||||
// Test 16
|
// Test 16
|
||||||
test16();
|
test16();
|
||||||
// End
|
// End
|
||||||
@@ -319,69 +315,6 @@ void test7()
|
|||||||
|
|
||||||
/***
|
/***
|
||||||
|
|
||||||
7/2. calc_rmw(offset=48K, len=4K, osd_set=[0,2,3], write_set=[1,2,3])
|
|
||||||
= {
|
|
||||||
read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
|
|
||||||
write: [ [ 48K, 52K ], [ 0, 0 ], [ 48K, 52K ] ],
|
|
||||||
input buffer: [ write0 ],
|
|
||||||
rmw buffer: [ write2, read0, read1, read2 ],
|
|
||||||
}
|
|
||||||
then, after calc_rmw_parity_xor/ec(): {
|
|
||||||
write: [ [ 0, 128K ], [ 0, 0 ], [ 48K, 52K ] ],
|
|
||||||
write0==read0,
|
|
||||||
}
|
|
||||||
+ check write0, write2 buffers
|
|
||||||
|
|
||||||
***/
|
|
||||||
|
|
||||||
void test_rmw_4k_degraded_into_lost_to_normal(bool ec)
|
|
||||||
{
|
|
||||||
osd_num_t osd_set[3] = { 0, 2, 3 };
|
|
||||||
osd_num_t write_osd_set[3] = { 1, 2, 3 };
|
|
||||||
osd_rmw_stripe_t stripes[3] = {};
|
|
||||||
// Subtest 1
|
|
||||||
split_stripes(2, 128*1024, 48*1024, 4096, stripes);
|
|
||||||
void *write_buf = malloc(4096);
|
|
||||||
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024, 0);
|
|
||||||
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
|
|
||||||
assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
|
|
||||||
assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
|
|
||||||
assert(stripes[0].write_start == 48*1024 && stripes[0].write_end == 52*1024);
|
|
||||||
assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
|
|
||||||
assert(stripes[2].write_start == 48*1024 && stripes[2].write_end == 52*1024);
|
|
||||||
assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
|
|
||||||
assert(stripes[1].read_buf == (uint8_t*)rmw_buf+4*1024+128*1024);
|
|
||||||
assert(stripes[2].read_buf == (uint8_t*)rmw_buf+4*1024+2*128*1024);
|
|
||||||
assert(stripes[0].write_buf == write_buf);
|
|
||||||
assert(stripes[1].write_buf == NULL);
|
|
||||||
assert(stripes[2].write_buf == rmw_buf);
|
|
||||||
// Subtest 2
|
|
||||||
set_pattern(write_buf, 4096, PATTERN2);
|
|
||||||
set_pattern(stripes[1].read_buf, 128*1024, PATTERN1);
|
|
||||||
set_pattern(stripes[2].read_buf, 128*1024, PATTERN0^PATTERN1);
|
|
||||||
if (!ec)
|
|
||||||
calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024, 0);
|
|
||||||
else
|
|
||||||
{
|
|
||||||
use_ec(3, 2, true);
|
|
||||||
calc_rmw_parity_ec(stripes, 3, 2, osd_set, write_osd_set, 128*1024, 0);
|
|
||||||
use_ec(3, 2, false);
|
|
||||||
}
|
|
||||||
assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
|
|
||||||
assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
|
|
||||||
assert(stripes[2].write_start == 48*1024 && stripes[2].write_end == 52*1024);
|
|
||||||
assert(stripes[0].write_buf == stripes[0].read_buf);
|
|
||||||
assert(stripes[1].write_buf == NULL);
|
|
||||||
assert(stripes[2].write_buf == rmw_buf);
|
|
||||||
check_pattern(stripes[0].write_buf, 4096, PATTERN0);
|
|
||||||
check_pattern(stripes[0].write_buf+48*1024, 4096, PATTERN2);
|
|
||||||
check_pattern(stripes[2].write_buf, 4096, PATTERN2^PATTERN1); // new parity
|
|
||||||
free(rmw_buf);
|
|
||||||
free(write_buf);
|
|
||||||
}
|
|
||||||
|
|
||||||
/***
|
|
||||||
|
|
||||||
8. calc_rmw(offset=0, len=128K+4K, osd_set=[0,2,3], write_set=[1,2,3])
|
8. calc_rmw(offset=0, len=128K+4K, osd_set=[0,2,3], write_set=[1,2,3])
|
||||||
= {
|
= {
|
||||||
read: [ [ 0, 0 ], [ 4K, 128K ], [ 0, 0 ] ],
|
read: [ [ 0, 0 ], [ 4K, 128K ], [ 0, 0 ] ],
|
||||||
@@ -893,11 +826,12 @@ void test14()
|
|||||||
|
|
||||||
***/
|
***/
|
||||||
|
|
||||||
void test15(bool second)
|
void test15()
|
||||||
{
|
{
|
||||||
const int bmp = 64*1024 / 4096 / 8;
|
const int bmp = 64*1024 / 4096 / 8;
|
||||||
use_ec(4, 2, true);
|
use_ec(4, 2, true);
|
||||||
osd_num_t osd_set[4] = { 1, 2, (osd_num_t)(second ? 0 : 3), (osd_num_t)(second ? 4 : 0) };
|
osd_num_t osd_set[4] = { 1, 2, 3, 0 };
|
||||||
|
osd_num_t write_osd_set[4] = { 1, 2, 3, 0 };
|
||||||
osd_rmw_stripe_t stripes[4] = {};
|
osd_rmw_stripe_t stripes[4] = {};
|
||||||
unsigned bitmaps[4] = { 0 };
|
unsigned bitmaps[4] = { 0 };
|
||||||
// Test 15.0
|
// Test 15.0
|
||||||
@@ -908,7 +842,7 @@ void test15(bool second)
|
|||||||
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
|
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
|
||||||
assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
|
assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
|
||||||
// Test 15.1
|
// Test 15.1
|
||||||
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, osd_set, 64*1024, bmp);
|
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, write_osd_set, 64*1024, bmp);
|
||||||
for (int i = 0; i < 4; i++)
|
for (int i = 0; i < 4; i++)
|
||||||
stripes[i].bmp_buf = bitmaps+i;
|
stripes[i].bmp_buf = bitmaps+i;
|
||||||
assert(rmw_buf);
|
assert(rmw_buf);
|
||||||
@@ -918,38 +852,36 @@ void test15(bool second)
|
|||||||
assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
|
assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
|
||||||
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
|
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
|
||||||
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
|
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
|
||||||
assert(stripes[2+second].write_start == 28*1024 && stripes[2+second].write_end == 32*1024);
|
assert(stripes[2].write_start == 28*1024 && stripes[2].write_end == 32*1024);
|
||||||
assert(stripes[3-second].write_start == 0 && stripes[3-second].write_end == 0);
|
assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
|
||||||
assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
|
assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
|
||||||
assert(stripes[1].read_buf == NULL);
|
assert(stripes[1].read_buf == NULL);
|
||||||
assert(stripes[2].read_buf == NULL);
|
assert(stripes[2].read_buf == NULL);
|
||||||
assert(stripes[3].read_buf == NULL);
|
assert(stripes[3].read_buf == NULL);
|
||||||
assert(stripes[0].write_buf == NULL);
|
assert(stripes[0].write_buf == NULL);
|
||||||
assert(stripes[1].write_buf == (uint8_t*)write_buf);
|
assert(stripes[1].write_buf == (uint8_t*)write_buf);
|
||||||
assert(stripes[2+second].write_buf == rmw_buf);
|
assert(stripes[2].write_buf == rmw_buf);
|
||||||
assert(stripes[3-second].write_buf == NULL);
|
assert(stripes[3].write_buf == NULL);
|
||||||
// Test 15.2 - encode
|
// Test 15.2 - encode
|
||||||
set_pattern(write_buf, 4*1024, PATTERN1);
|
set_pattern(write_buf, 4*1024, PATTERN1);
|
||||||
set_pattern(stripes[0].read_buf, 4*1024, PATTERN2);
|
set_pattern(stripes[0].read_buf, 4*1024, PATTERN2);
|
||||||
memset(stripes[0].bmp_buf, 0, bmp);
|
memset(stripes[0].bmp_buf, 0, bmp);
|
||||||
memset(stripes[1].bmp_buf, 0, bmp);
|
memset(stripes[1].bmp_buf, 0, bmp);
|
||||||
memset(stripes[2+second].write_buf, 0, 4096);
|
calc_rmw_parity_ec(stripes, 4, 2, osd_set, write_osd_set, 64*1024, bmp);
|
||||||
calc_rmw_parity_ec(stripes, 4, 2, osd_set, osd_set, 64*1024, bmp);
|
assert(*(uint32_t*)stripes[2].bmp_buf == 0x80);
|
||||||
assert(second || *(uint32_t*)stripes[2].bmp_buf == 0x80);
|
|
||||||
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
|
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
|
||||||
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
|
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
|
||||||
assert(stripes[2+second].write_start == 28*1024 && stripes[2+second].write_end == 32*1024);
|
assert(stripes[2].write_start == 28*1024 && stripes[2].write_end == 32*1024);
|
||||||
assert(stripes[3-second].write_start == 0 && stripes[3-second].write_end == 0);
|
assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
|
||||||
assert(stripes[0].write_buf == NULL);
|
assert(stripes[0].write_buf == NULL);
|
||||||
assert(stripes[1].write_buf == (uint8_t*)write_buf);
|
assert(stripes[1].write_buf == (uint8_t*)write_buf);
|
||||||
assert(stripes[2+second].write_buf == rmw_buf);
|
assert(stripes[2].write_buf == rmw_buf);
|
||||||
assert(stripes[3-second].write_buf == NULL);
|
assert(stripes[3].write_buf == NULL);
|
||||||
// first parity is always xor :), second isn't...
|
check_pattern(stripes[2].write_buf, 4*1024, PATTERN1^PATTERN2); // first parity is always xor :)
|
||||||
check_pattern(stripes[2+second].write_buf, 4*1024, second ? 0xb79a59a0ce8b9b81 : PATTERN1^PATTERN2);
|
|
||||||
// Done
|
// Done
|
||||||
free(rmw_buf);
|
free(rmw_buf);
|
||||||
free(write_buf);
|
free(write_buf);
|
||||||
use_ec(4, 2, false);
|
use_ec(3, 2, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
/***
|
/***
|
||||||
@@ -1052,5 +984,5 @@ void test16()
|
|||||||
// Done
|
// Done
|
||||||
free(rmw_buf);
|
free(rmw_buf);
|
||||||
free(write_buf);
|
free(write_buf);
|
||||||
use_ec(4, 2, false);
|
use_ec(3, 2, false);
|
||||||
}
|
}
|
||||||
|
@@ -150,7 +150,6 @@ int connect_osd(const char *osd_address, int osd_port)
|
|||||||
if (connect(connect_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
|
if (connect(connect_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
|
||||||
{
|
{
|
||||||
perror("connect");
|
perror("connect");
|
||||||
close(connect_fd);
|
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
int one = 1;
|
int one = 1;
|
||||||
|
@@ -3,11 +3,12 @@
|
|||||||
|
|
||||||
#include "pg_states.h"
|
#include "pg_states.h"
|
||||||
|
|
||||||
const int pg_state_bit_count = 14;
|
const int pg_state_bit_count = 16;
|
||||||
|
|
||||||
const int pg_state_bits[14] = {
|
const int pg_state_bits[16] = {
|
||||||
PG_STARTING,
|
PG_STARTING,
|
||||||
PG_PEERING,
|
PG_PEERING,
|
||||||
|
PG_PEERED,
|
||||||
PG_INCOMPLETE,
|
PG_INCOMPLETE,
|
||||||
PG_ACTIVE,
|
PG_ACTIVE,
|
||||||
PG_REPEERING,
|
PG_REPEERING,
|
||||||
@@ -22,9 +23,10 @@ const int pg_state_bits[14] = {
|
|||||||
PG_LEFT_ON_DEAD,
|
PG_LEFT_ON_DEAD,
|
||||||
};
|
};
|
||||||
|
|
||||||
const char *pg_state_names[14] = {
|
const char *pg_state_names[16] = {
|
||||||
"starting",
|
"starting",
|
||||||
"peering",
|
"peering",
|
||||||
|
"peered",
|
||||||
"incomplete",
|
"incomplete",
|
||||||
"active",
|
"active",
|
||||||
"repeering",
|
"repeering",
|
||||||
|
@@ -4,25 +4,27 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
// Placement group states
|
// Placement group states
|
||||||
// STARTING -> [acquire lock] -> PEERING -> INCOMPLETE|ACTIVE
|
// STARTING -> [acquire lock] -> PEERING -> PEERED
|
||||||
|
// PEERED -> [report history if required!] -> INCOMPLETE|ACTIVE
|
||||||
// ACTIVE -> REPEERING -> PEERING
|
// ACTIVE -> REPEERING -> PEERING
|
||||||
// ACTIVE -> STOPPING -> OFFLINE -> [release lock]
|
// ACTIVE -> STOPPING -> OFFLINE -> [release lock]
|
||||||
// Exactly one of these:
|
// Exactly one of these:
|
||||||
#define PG_STARTING (1<<0)
|
#define PG_STARTING (1<<0)
|
||||||
#define PG_PEERING (1<<1)
|
#define PG_PEERING (1<<1)
|
||||||
#define PG_INCOMPLETE (1<<2)
|
#define PG_PEERED (1<<2)
|
||||||
#define PG_ACTIVE (1<<3)
|
#define PG_INCOMPLETE (1<<3)
|
||||||
#define PG_REPEERING (1<<4)
|
#define PG_ACTIVE (1<<4)
|
||||||
#define PG_STOPPING (1<<5)
|
#define PG_REPEERING (1<<5)
|
||||||
#define PG_OFFLINE (1<<6)
|
#define PG_STOPPING (1<<6)
|
||||||
|
#define PG_OFFLINE (1<<7)
|
||||||
// Plus any of these:
|
// Plus any of these:
|
||||||
#define PG_DEGRADED (1<<7)
|
#define PG_DEGRADED (1<<8)
|
||||||
#define PG_HAS_INCOMPLETE (1<<8)
|
#define PG_HAS_INCOMPLETE (1<<9)
|
||||||
#define PG_HAS_DEGRADED (1<<9)
|
#define PG_HAS_DEGRADED (1<<10)
|
||||||
#define PG_HAS_MISPLACED (1<<10)
|
#define PG_HAS_MISPLACED (1<<11)
|
||||||
#define PG_HAS_UNCLEAN (1<<11)
|
#define PG_HAS_UNCLEAN (1<<12)
|
||||||
#define PG_HAS_INVALID (1<<12)
|
#define PG_HAS_INVALID (1<<13)
|
||||||
#define PG_LEFT_ON_DEAD (1<<13)
|
#define PG_LEFT_ON_DEAD (1<<14)
|
||||||
|
|
||||||
// Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
|
// Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
|
||||||
// 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size
|
// 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size
|
||||||
@@ -32,7 +34,6 @@
|
|||||||
#define OBJ_DEGRADED 0x02
|
#define OBJ_DEGRADED 0x02
|
||||||
#define OBJ_INCOMPLETE 0x04
|
#define OBJ_INCOMPLETE 0x04
|
||||||
#define OBJ_MISPLACED 0x08
|
#define OBJ_MISPLACED 0x08
|
||||||
#define OBJ_DELETED 0x10
|
|
||||||
#define OBJ_NEEDS_STABLE 0x10000
|
#define OBJ_NEEDS_STABLE 0x10000
|
||||||
#define OBJ_NEEDS_ROLLBACK 0x20000
|
#define OBJ_NEEDS_ROLLBACK 0x20000
|
||||||
|
|
||||||
|
@@ -9,9 +9,6 @@
|
|||||||
#endif
|
#endif
|
||||||
#include "qemu/osdep.h"
|
#include "qemu/osdep.h"
|
||||||
#include "qemu/main-loop.h"
|
#include "qemu/main-loop.h"
|
||||||
#if QEMU_VERSION_MAJOR >= 8
|
|
||||||
#include "block/block-io.h"
|
|
||||||
#endif
|
|
||||||
#include "block/block_int.h"
|
#include "block/block_int.h"
|
||||||
#include "qapi/error.h"
|
#include "qapi/error.h"
|
||||||
#include "qapi/qmp/qdict.h"
|
#include "qapi/qmp/qdict.h"
|
||||||
@@ -56,7 +53,6 @@ typedef struct VitastorClient
|
|||||||
char *etcd_host;
|
char *etcd_host;
|
||||||
char *etcd_prefix;
|
char *etcd_prefix;
|
||||||
char *image;
|
char *image;
|
||||||
int skip_parents;
|
|
||||||
uint64_t inode;
|
uint64_t inode;
|
||||||
uint64_t pool;
|
uint64_t pool;
|
||||||
uint64_t size;
|
uint64_t size;
|
||||||
@@ -67,10 +63,6 @@ typedef struct VitastorClient
|
|||||||
int rdma_gid_index;
|
int rdma_gid_index;
|
||||||
int rdma_mtu;
|
int rdma_mtu;
|
||||||
QemuMutex mutex;
|
QemuMutex mutex;
|
||||||
|
|
||||||
uint64_t last_bitmap_inode, last_bitmap_offset, last_bitmap_len;
|
|
||||||
uint32_t last_bitmap_granularity;
|
|
||||||
uint8_t *last_bitmap;
|
|
||||||
} VitastorClient;
|
} VitastorClient;
|
||||||
|
|
||||||
typedef struct VitastorRPC
|
typedef struct VitastorRPC
|
||||||
@@ -80,9 +72,6 @@ typedef struct VitastorRPC
|
|||||||
QEMUIOVector *iov;
|
QEMUIOVector *iov;
|
||||||
long ret;
|
long ret;
|
||||||
int complete;
|
int complete;
|
||||||
uint64_t inode, offset, len;
|
|
||||||
uint32_t bitmap_granularity;
|
|
||||||
uint8_t *bitmap;
|
|
||||||
} VitastorRPC;
|
} VitastorRPC;
|
||||||
|
|
||||||
static void vitastor_co_init_task(BlockDriverState *bs, VitastorRPC *task);
|
static void vitastor_co_init_task(BlockDriverState *bs, VitastorRPC *task);
|
||||||
@@ -158,7 +147,6 @@ static void vitastor_parse_filename(const char *filename, QDict *options, Error
|
|||||||
if (!strcmp(name, "inode") ||
|
if (!strcmp(name, "inode") ||
|
||||||
!strcmp(name, "pool") ||
|
!strcmp(name, "pool") ||
|
||||||
!strcmp(name, "size") ||
|
!strcmp(name, "size") ||
|
||||||
!strcmp(name, "skip-parents") ||
|
|
||||||
!strcmp(name, "use-rdma") ||
|
!strcmp(name, "use-rdma") ||
|
||||||
!strcmp(name, "rdma-port_num") ||
|
!strcmp(name, "rdma-port_num") ||
|
||||||
!strcmp(name, "rdma-gid-index") ||
|
!strcmp(name, "rdma-gid-index") ||
|
||||||
@@ -239,16 +227,13 @@ static void vitastor_aio_set_fd_handler(void *ctx, int fd, int unused1, IOHandle
|
|||||||
|
|
||||||
static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
|
static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
|
||||||
{
|
{
|
||||||
VitastorRPC task;
|
|
||||||
VitastorClient *client = bs->opaque;
|
VitastorClient *client = bs->opaque;
|
||||||
void *image = NULL;
|
|
||||||
int64_t ret = 0;
|
int64_t ret = 0;
|
||||||
qemu_mutex_init(&client->mutex);
|
qemu_mutex_init(&client->mutex);
|
||||||
client->config_path = g_strdup(qdict_get_try_str(options, "config-path"));
|
client->config_path = g_strdup(qdict_get_try_str(options, "config-path"));
|
||||||
// FIXME: Rename to etcd_address
|
// FIXME: Rename to etcd_address
|
||||||
client->etcd_host = g_strdup(qdict_get_try_str(options, "etcd-host"));
|
client->etcd_host = g_strdup(qdict_get_try_str(options, "etcd-host"));
|
||||||
client->etcd_prefix = g_strdup(qdict_get_try_str(options, "etcd-prefix"));
|
client->etcd_prefix = g_strdup(qdict_get_try_str(options, "etcd-prefix"));
|
||||||
client->skip_parents = qdict_get_try_int(options, "skip-parents", 0);
|
|
||||||
client->use_rdma = qdict_get_try_int(options, "use-rdma", -1);
|
client->use_rdma = qdict_get_try_int(options, "use-rdma", -1);
|
||||||
client->rdma_device = g_strdup(qdict_get_try_str(options, "rdma-device"));
|
client->rdma_device = g_strdup(qdict_get_try_str(options, "rdma-device"));
|
||||||
client->rdma_port_num = qdict_get_try_int(options, "rdma-port-num", 0);
|
client->rdma_port_num = qdict_get_try_int(options, "rdma-port-num", 0);
|
||||||
@@ -258,31 +243,23 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
|
|||||||
vitastor_aio_set_fd_handler, bdrv_get_aio_context(bs), client->config_path, client->etcd_host, client->etcd_prefix,
|
vitastor_aio_set_fd_handler, bdrv_get_aio_context(bs), client->config_path, client->etcd_host, client->etcd_prefix,
|
||||||
client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
|
client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
|
||||||
);
|
);
|
||||||
image = client->image = g_strdup(qdict_get_try_str(options, "image"));
|
client->image = g_strdup(qdict_get_try_str(options, "image"));
|
||||||
client->readonly = (flags & BDRV_O_RDWR) ? 1 : 0;
|
client->readonly = (flags & BDRV_O_RDWR) ? 1 : 0;
|
||||||
// Get image metadata (size and readonly flag) or just wait until the client is ready
|
|
||||||
if (!image)
|
|
||||||
client->image = (char*)"x";
|
|
||||||
task.complete = 0;
|
|
||||||
task.bs = bs;
|
|
||||||
if (qemu_in_coroutine())
|
|
||||||
{
|
|
||||||
vitastor_co_get_metadata(&task);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
#if QEMU_VERSION_MAJOR >= 8
|
|
||||||
aio_co_enter(bdrv_get_aio_context(bs), qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
|
|
||||||
#elif QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 9 || QEMU_VERSION_MAJOR >= 3
|
|
||||||
bdrv_coroutine_enter(bs, qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
|
|
||||||
#else
|
|
||||||
qemu_coroutine_enter(qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
|
|
||||||
#endif
|
|
||||||
BDRV_POLL_WHILE(bs, !task.complete);
|
|
||||||
}
|
|
||||||
client->image = image;
|
|
||||||
if (client->image)
|
if (client->image)
|
||||||
{
|
{
|
||||||
|
// Get image metadata (size and readonly flag)
|
||||||
|
VitastorRPC task;
|
||||||
|
task.complete = 0;
|
||||||
|
task.bs = bs;
|
||||||
|
if (qemu_in_coroutine())
|
||||||
|
{
|
||||||
|
vitastor_co_get_metadata(&task);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
bdrv_coroutine_enter(bs, qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
|
||||||
|
BDRV_POLL_WHILE(bs, !task.complete);
|
||||||
|
}
|
||||||
client->watch = (void*)task.ret;
|
client->watch = (void*)task.ret;
|
||||||
client->readonly = client->readonly || vitastor_c_inode_get_readonly(client->watch);
|
client->readonly = client->readonly || vitastor_c_inode_get_readonly(client->watch);
|
||||||
client->size = vitastor_c_inode_get_size(client->watch);
|
client->size = vitastor_c_inode_get_size(client->watch);
|
||||||
@@ -307,7 +284,6 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
|
|||||||
client->inode = (client->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (client->pool << (64-POOL_ID_BITS));
|
client->inode = (client->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (client->pool << (64-POOL_ID_BITS));
|
||||||
}
|
}
|
||||||
client->size = qdict_get_try_int(options, "size", 0);
|
client->size = qdict_get_try_int(options, "size", 0);
|
||||||
vitastor_c_close_watch(client->proxy, (void*)task.ret);
|
|
||||||
}
|
}
|
||||||
if (!client->size)
|
if (!client->size)
|
||||||
{
|
{
|
||||||
@@ -329,7 +305,6 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
|
|||||||
qdict_del(options, "inode");
|
qdict_del(options, "inode");
|
||||||
qdict_del(options, "pool");
|
qdict_del(options, "pool");
|
||||||
qdict_del(options, "size");
|
qdict_del(options, "size");
|
||||||
qdict_del(options, "skip-parents");
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -346,8 +321,6 @@ static void vitastor_close(BlockDriverState *bs)
|
|||||||
g_free(client->etcd_prefix);
|
g_free(client->etcd_prefix);
|
||||||
if (client->image)
|
if (client->image)
|
||||||
g_free(client->image);
|
g_free(client->image);
|
||||||
free(client->last_bitmap);
|
|
||||||
client->last_bitmap = NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 2
|
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 2
|
||||||
@@ -513,13 +486,6 @@ static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs,
|
|||||||
vitastor_co_init_task(bs, &task);
|
vitastor_co_init_task(bs, &task);
|
||||||
task.iov = iov;
|
task.iov = iov;
|
||||||
|
|
||||||
if (client->last_bitmap)
|
|
||||||
{
|
|
||||||
// Invalidate last bitmap on write
|
|
||||||
free(client->last_bitmap);
|
|
||||||
client->last_bitmap = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
|
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
|
||||||
qemu_mutex_lock(&client->mutex);
|
qemu_mutex_lock(&client->mutex);
|
||||||
vitastor_c_write(client->proxy, inode, offset, bytes, 0, iov->iov, iov->niov, vitastor_co_generic_bh_cb, &task);
|
vitastor_c_write(client->proxy, inode, offset, bytes, 0, iov->iov, iov->niov, vitastor_co_generic_bh_cb, &task);
|
||||||
@@ -533,140 +499,6 @@ static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs,
|
|||||||
return task.ret;
|
return task.ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 1
|
|
||||||
#if QEMU_VERSION_MAJOR >= 2 || QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7
|
|
||||||
static void vitastor_co_read_bitmap_cb(void *opaque, long retval, uint8_t *bitmap)
|
|
||||||
{
|
|
||||||
VitastorRPC *task = opaque;
|
|
||||||
VitastorClient *client = task->bs->opaque;
|
|
||||||
task->ret = retval;
|
|
||||||
task->complete = 1;
|
|
||||||
if (retval >= 0)
|
|
||||||
{
|
|
||||||
task->bitmap = bitmap;
|
|
||||||
if (client->last_bitmap_inode == task->inode &&
|
|
||||||
client->last_bitmap_offset == task->offset &&
|
|
||||||
client->last_bitmap_len == task->len)
|
|
||||||
{
|
|
||||||
free(client->last_bitmap);
|
|
||||||
client->last_bitmap = bitmap;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (qemu_coroutine_self() != task->co)
|
|
||||||
{
|
|
||||||
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 8
|
|
||||||
aio_co_wake(task->co);
|
|
||||||
#else
|
|
||||||
qemu_coroutine_enter(task->co, NULL);
|
|
||||||
qemu_aio_release(task);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static int coroutine_fn vitastor_co_block_status(
|
|
||||||
BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes,
|
|
||||||
int64_t *pnum, int64_t *map, BlockDriverState **file)
|
|
||||||
{
|
|
||||||
// Allocated => return BDRV_BLOCK_DATA|BDRV_BLOCK_OFFSET_VALID
|
|
||||||
// Not allocated => return 0
|
|
||||||
// Error => return -errno
|
|
||||||
// Set pnum to length of the extent, `*map` = `offset`, `*file` = `bs`
|
|
||||||
VitastorRPC task;
|
|
||||||
VitastorClient *client = bs->opaque;
|
|
||||||
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
|
|
||||||
uint8_t bit = 0;
|
|
||||||
if (client->last_bitmap && client->last_bitmap_inode == inode &&
|
|
||||||
client->last_bitmap_offset <= offset &&
|
|
||||||
client->last_bitmap_offset+client->last_bitmap_len >= (want_zero ? offset+1 : offset+bytes))
|
|
||||||
{
|
|
||||||
// Use the previously read bitmap
|
|
||||||
task.bitmap_granularity = client->last_bitmap_granularity;
|
|
||||||
task.offset = client->last_bitmap_offset;
|
|
||||||
task.len = client->last_bitmap_len;
|
|
||||||
task.bitmap = client->last_bitmap;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// Read bitmap from this position, rounding to full inode PG blocks
|
|
||||||
uint32_t block_size = vitastor_c_inode_get_block_size(client->proxy, inode);
|
|
||||||
if (!block_size)
|
|
||||||
return -EAGAIN;
|
|
||||||
// Init coroutine
|
|
||||||
vitastor_co_init_task(bs, &task);
|
|
||||||
free(client->last_bitmap);
|
|
||||||
task.inode = client->last_bitmap_inode = inode;
|
|
||||||
task.bitmap_granularity = client->last_bitmap_granularity = vitastor_c_inode_get_bitmap_granularity(client->proxy, inode);
|
|
||||||
task.offset = client->last_bitmap_offset = offset / block_size * block_size;
|
|
||||||
task.len = client->last_bitmap_len = (offset+bytes+block_size-1) / block_size * block_size - task.offset;
|
|
||||||
task.bitmap = client->last_bitmap = NULL;
|
|
||||||
qemu_mutex_lock(&client->mutex);
|
|
||||||
vitastor_c_read_bitmap(client->proxy, task.inode, task.offset, task.len, !client->skip_parents, vitastor_co_read_bitmap_cb, &task);
|
|
||||||
qemu_mutex_unlock(&client->mutex);
|
|
||||||
while (!task.complete)
|
|
||||||
{
|
|
||||||
qemu_coroutine_yield();
|
|
||||||
}
|
|
||||||
if (task.ret < 0)
|
|
||||||
{
|
|
||||||
// Error
|
|
||||||
return task.ret;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (want_zero)
|
|
||||||
{
|
|
||||||
// Get precise mapping with all holes
|
|
||||||
uint64_t bmp_pos = (offset-task.offset) / task.bitmap_granularity;
|
|
||||||
uint64_t bmp_len = task.len / task.bitmap_granularity;
|
|
||||||
uint64_t bmp_end = bmp_pos+1;
|
|
||||||
bit = (task.bitmap[bmp_pos >> 3] >> (bmp_pos & 0x7)) & 1;
|
|
||||||
while (bmp_end < bmp_len && ((task.bitmap[bmp_end >> 3] >> (bmp_end & 0x7)) & 1) == bit)
|
|
||||||
{
|
|
||||||
bmp_end++;
|
|
||||||
}
|
|
||||||
*pnum = (bmp_end-bmp_pos) * task.bitmap_granularity;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// Get larger allocated extents, possibly with false positives
|
|
||||||
uint64_t bmp_pos = (offset-task.offset) / task.bitmap_granularity;
|
|
||||||
uint64_t bmp_end = (offset+bytes-task.offset) / task.bitmap_granularity - bmp_pos;
|
|
||||||
while (bmp_pos < bmp_end)
|
|
||||||
{
|
|
||||||
if (!(bmp_pos & 7) && bmp_end >= bmp_pos+8)
|
|
||||||
{
|
|
||||||
bit = bit || task.bitmap[bmp_pos >> 3];
|
|
||||||
bmp_pos += 8;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
bit = bit || ((task.bitmap[bmp_pos >> 3] >> (bmp_pos & 0x7)) & 1);
|
|
||||||
bmp_pos++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*pnum = bytes;
|
|
||||||
}
|
|
||||||
if (bit)
|
|
||||||
{
|
|
||||||
*map = offset;
|
|
||||||
*file = bs;
|
|
||||||
}
|
|
||||||
return (bit ? (BDRV_BLOCK_DATA|BDRV_BLOCK_OFFSET_VALID) : 0);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
#if QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 12
|
|
||||||
// QEMU 1.7-2.11
|
|
||||||
static int64_t coroutine_fn vitastor_co_get_block_status(BlockDriverState *bs,
|
|
||||||
int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
|
|
||||||
{
|
|
||||||
int64_t map = 0;
|
|
||||||
int64_t pnumbytes = 0;
|
|
||||||
int r = vitastor_co_block_status(bs, 1, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, &pnumbytes, &map, &file);
|
|
||||||
*pnum = pnumbytes/BDRV_SECTOR_SIZE;
|
|
||||||
return r;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if !( QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 7 )
|
#if !( QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 7 )
|
||||||
static int coroutine_fn vitastor_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov)
|
static int coroutine_fn vitastor_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov)
|
||||||
{
|
{
|
||||||
@@ -741,13 +573,8 @@ static BlockDriver bdrv_vitastor = {
|
|||||||
.bdrv_parse_filename = vitastor_parse_filename,
|
.bdrv_parse_filename = vitastor_parse_filename,
|
||||||
|
|
||||||
.bdrv_has_zero_init = bdrv_has_zero_init_1,
|
.bdrv_has_zero_init = bdrv_has_zero_init_1,
|
||||||
#if QEMU_VERSION_MAJOR >= 8
|
|
||||||
.bdrv_co_get_info = vitastor_get_info,
|
|
||||||
.bdrv_co_getlength = vitastor_getlength,
|
|
||||||
#else
|
|
||||||
.bdrv_get_info = vitastor_get_info,
|
.bdrv_get_info = vitastor_get_info,
|
||||||
.bdrv_getlength = vitastor_getlength,
|
.bdrv_getlength = vitastor_getlength,
|
||||||
#endif
|
|
||||||
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 2
|
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 2
|
||||||
.bdrv_probe_blocksizes = vitastor_probe_blocksizes,
|
.bdrv_probe_blocksizes = vitastor_probe_blocksizes,
|
||||||
#endif
|
#endif
|
||||||
@@ -779,15 +606,6 @@ static BlockDriver bdrv_vitastor = {
|
|||||||
.bdrv_co_truncate = vitastor_co_truncate,
|
.bdrv_co_truncate = vitastor_co_truncate,
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 1
|
|
||||||
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 12
|
|
||||||
// For snapshot export
|
|
||||||
.bdrv_co_block_status = vitastor_co_block_status,
|
|
||||||
#elif QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 12
|
|
||||||
.bdrv_co_get_block_status = vitastor_co_get_block_status,
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 7
|
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 7
|
||||||
.bdrv_co_preadv = vitastor_co_preadv,
|
.bdrv_co_preadv = vitastor_co_preadv,
|
||||||
.bdrv_co_pwritev = vitastor_co_pwritev,
|
.bdrv_co_pwritev = vitastor_co_pwritev,
|
||||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user