Compare commits
104 Commits
Author | SHA1 | Date | |
---|---|---|---|
8810eae8fb | |||
c1365f46c9 | |||
14d6acbcba | |||
1e307069bc | |||
c3e80abad7 | |||
138ffe4032 | |||
8139a34e97 | |||
4ab630b44d | |||
2c8241b7db | |||
36a7dd3671 | |||
936122bbcf | |||
1a1ba0d1e7 | |||
3d09c9cec7 | |||
3d08a1ad6c | |||
499881d81c | |||
aba93b951b | |||
d125fb1f30 | |||
9d3fd72298 | |||
8b552a01f9 | |||
0385b2f9e8 | |||
749c837045 | |||
98001d845b | |||
c96bcae74b | |||
9f4e34a8cc | |||
81fc8bb94c | |||
bc465c16de | |||
8763e9211c | |||
9e1a80bd17 | |||
3e280f2f08 | |||
fe87b4076b | |||
a38957c1a7 | |||
137309cf29 | |||
373f9d0387 | |||
c4516ea971 | |||
91065c80fc | |||
0f6b946add | |||
465cbf0b2f | |||
41add50e4e | |||
02e7be7dc9 | |||
73940adf07 | |||
e950c024d3 | |||
71d6d9f868 | |||
a4dfa519af | |||
37a6aff2fa | |||
67019f5b02 | |||
0593e5c21c | |||
998e24adf8 | |||
d7bd36dc32 | |||
cf5c562800 | |||
629200b0cc | |||
3589ccec22 | |||
8d55a1e780 | |||
65f6b3a4eb | |||
fd216eac77 | |||
61fca7c426 | |||
1c29ed80b9 | |||
68f3fb795e | |||
fa90f287da | |||
795020674d | |||
8e12285629 | |||
b9b50ab4cc | |||
0d8625f92d | |||
2f3c2c5140 | |||
4ebdd02b0f | |||
bf6fdc4141 | |||
c2244331e6 | |||
3de57e87b1 | |||
2d4cc688b2 | |||
31bd1ec145 | |||
c08d1f2dfe | |||
1d80bcc8d0 | |||
5ef8bed75f | |||
8669998e5e | |||
b457327e77 | |||
f7fa9d5e34 | |||
49b88b01f9 | |||
71688bcb59 | |||
552e207d2b | |||
5464821fa5 | |||
6917a32ca8 | |||
f8722a8bd5 | |||
9c2f69c9fa | |||
1a93e3f33a | |||
3f35744052 | |||
66f14ac019 | |||
1364009931 | |||
d7e30b8353 | |||
cb437913d3 | |||
472bce58ab | |||
7a71e7ef01 | |||
c71e5e7bbd | |||
8fdf30b21f | |||
238037ae31 | |||
09a8864686 | |||
6e6f6ecbb0 | |||
9491f81419 | |||
44c2b30167 | |||
bf8a0581cd | |||
5953942042 | |||
a276a1f737 | |||
cc24e5796e | |||
6e26732e6a | |||
b4edc79449 | |||
5f26887d32 |
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8)
|
||||
|
||||
project(vitastor)
|
||||
|
||||
set(VERSION "0.8.0")
|
||||
set(VERSION "0.8.6")
|
||||
|
||||
add_subdirectory(src)
|
||||
|
@@ -48,9 +48,9 @@ Vitastor, составлены для того, чтобы убедиться,
|
||||
интерфейс (прокси), опять же, без открытия в свободный публичный доступ как
|
||||
самой программы, так и прокси.
|
||||
|
||||
Сетевая Публичная Лицензия Vitastor разработана специально чтобы
|
||||
Сетевая Публичная Лицензия Vitastor разработана специально, чтобы
|
||||
гарантировать, что в таких случаях и модифицированная версия программы, и
|
||||
прокси оставались доступными сообществу. Для этого лицензия требует от
|
||||
прокси останутся доступными сообществу. Для этого лицензия требует от
|
||||
операторов сетевых серверов предоставлять исходный код оригинальной программы,
|
||||
а также всех других программ, взаимодействующих с ней на их серверах,
|
||||
пользователям этих серверов, на условиях свободных лицензий. Таким образом,
|
||||
|
@@ -18,15 +18,19 @@ ENV CSI_ENDPOINT=""
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y wget && \
|
||||
wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg && \
|
||||
(echo deb http://vitastor.io/debian buster main > /etc/apt/sources.list.d/vitastor.list) && \
|
||||
(echo deb http://deb.debian.org/debian buster-backports main > /etc/apt/sources.list.d/backports.list) && \
|
||||
(echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
|
||||
apt-get update && \
|
||||
apt-get install -y e2fsprogs xfsprogs vitastor kmod && \
|
||||
apt-get install -y e2fsprogs xfsprogs kmod && \
|
||||
apt-get clean && \
|
||||
(echo options nbd nbds_max=128 > /etc/modprobe.d/nbd.conf)
|
||||
|
||||
COPY --from=build /app/vitastor-csi /bin/
|
||||
|
||||
RUN (echo deb http://vitastor.io/debian buster main > /etc/apt/sources.list.d/vitastor.list) && \
|
||||
wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg && \
|
||||
apt-get update && \
|
||||
apt-get install -y vitastor-client && \
|
||||
apt-get clean
|
||||
|
||||
ENTRYPOINT ["/bin/vitastor-csi"]
|
||||
|
@@ -1,4 +1,4 @@
|
||||
VERSION ?= v0.8.0
|
||||
VERSION ?= v0.8.6
|
||||
|
||||
all: build push
|
||||
|
||||
|
@@ -49,7 +49,7 @@ spec:
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
allowPrivilegeEscalation: true
|
||||
image: vitalif/vitastor-csi:v0.8.0
|
||||
image: vitalif/vitastor-csi:v0.8.6
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
@@ -102,7 +102,7 @@ spec:
|
||||
- "--health-port=9898"
|
||||
env:
|
||||
- name: CSI_ENDPOINT
|
||||
value: unix://csi/csi.sock
|
||||
value: unix:///csi/csi.sock
|
||||
volumeMounts:
|
||||
- mountPath: /csi
|
||||
name: socket-dir
|
||||
|
@@ -116,7 +116,7 @@ spec:
|
||||
privileged: true
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
image: vitalif/vitastor-csi:v0.8.0
|
||||
image: vitalif/vitastor-csi:v0.8.6
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -5,7 +5,7 @@ package vitastor
|
||||
|
||||
const (
|
||||
vitastorCSIDriverName = "csi.vitastor.io"
|
||||
vitastorCSIDriverVersion = "0.8.0"
|
||||
vitastorCSIDriverVersion = "0.8.6"
|
||||
)
|
||||
|
||||
// Config struct fills the parameters of request or user input
|
||||
|
@@ -10,7 +10,6 @@ import (
|
||||
"bytes"
|
||||
"strconv"
|
||||
"time"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"io/ioutil"
|
||||
@@ -21,8 +20,6 @@ import (
|
||||
"google.golang.org/grpc/codes"
|
||||
"google.golang.org/grpc/status"
|
||||
|
||||
"go.etcd.io/etcd/clientv3"
|
||||
|
||||
"github.com/container-storage-interface/spec/lib/go/csi"
|
||||
)
|
||||
|
||||
@@ -114,6 +111,34 @@ func GetConnectionParams(params map[string]string) (map[string]string, []string,
|
||||
return ctxVars, etcdUrl, etcdPrefix
|
||||
}
|
||||
|
||||
func invokeCLI(ctxVars map[string]string, args []string) ([]byte, error)
|
||||
{
|
||||
if (ctxVars["etcdUrl"] != "")
|
||||
{
|
||||
args = append(args, "--etcd_address", ctxVars["etcdUrl"])
|
||||
}
|
||||
if (ctxVars["etcdPrefix"] != "")
|
||||
{
|
||||
args = append(args, "--etcd_prefix", ctxVars["etcdPrefix"])
|
||||
}
|
||||
if (ctxVars["configPath"] != "")
|
||||
{
|
||||
args = append(args, "--config_path", ctxVars["configPath"])
|
||||
}
|
||||
c := exec.Command("/usr/bin/vitastor-cli", args...)
|
||||
var stdout, stderr bytes.Buffer
|
||||
c.Stdout = &stdout
|
||||
c.Stderr = &stderr
|
||||
err := c.Run()
|
||||
stderrStr := string(stderr.Bytes())
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Errorf("vitastor-cli %s failed: %s, status %s\n", strings.Join(args, " "), stderrStr, err)
|
||||
return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
|
||||
}
|
||||
return stdout.Bytes(), nil
|
||||
}
|
||||
|
||||
// Create the volume
|
||||
func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) (*csi.CreateVolumeResponse, error)
|
||||
{
|
||||
@@ -146,128 +171,41 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
|
||||
volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
|
||||
}
|
||||
|
||||
// FIXME: The following should PROBABLY be implemented externally in a management tool
|
||||
|
||||
ctxVars, etcdUrl, etcdPrefix := GetConnectionParams(req.Parameters)
|
||||
ctxVars, etcdUrl, _ := GetConnectionParams(req.Parameters)
|
||||
if (len(etcdUrl) == 0)
|
||||
{
|
||||
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
|
||||
}
|
||||
|
||||
// Connect to etcd
|
||||
cli, err := clientv3.New(clientv3.Config{
|
||||
DialTimeout: ETCD_TIMEOUT,
|
||||
Endpoints: etcdUrl,
|
||||
})
|
||||
// Create image using vitastor-cli
|
||||
_, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", string(volSize), "--pool", string(poolId) })
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error())
|
||||
}
|
||||
defer cli.Close()
|
||||
|
||||
var imageId uint64 = 0
|
||||
for
|
||||
{
|
||||
// Check if the image exists
|
||||
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||
resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
|
||||
cancel()
|
||||
if (err != nil)
|
||||
if (strings.Index(err.Error(), "already exists") > 0)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||
}
|
||||
if (len(resp.Kvs) > 0)
|
||||
{
|
||||
kv := resp.Kvs[0]
|
||||
var v InodeIndex
|
||||
err := json.Unmarshal(kv.Value, &v)
|
||||
stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", volName })
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
|
||||
return nil, err
|
||||
}
|
||||
poolId = v.PoolId
|
||||
imageId = v.Id
|
||||
inodeCfgKey := fmt.Sprintf("/config/inode/%d/%d", poolId, imageId)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||
resp, err := cli.Get(ctx, etcdPrefix+inodeCfgKey)
|
||||
cancel()
|
||||
var inodeCfg []InodeConfig
|
||||
err = json.Unmarshal(stat, &inodeCfg)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||
return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
|
||||
}
|
||||
if (len(resp.Kvs) == 0)
|
||||
if (len(inodeCfg) == 0)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "missing "+inodeCfgKey+" key in etcd")
|
||||
return nil, status.Error(codes.Internal, "vitastor-cli create said that image already exists, but ls can't find it")
|
||||
}
|
||||
var inodeCfg InodeConfig
|
||||
err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
|
||||
}
|
||||
if (inodeCfg.Size < uint64(volSize))
|
||||
if (inodeCfg[0].Size < uint64(volSize))
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Find a free ID
|
||||
// Create image metadata in a transaction verifying that the image doesn't exist yet AND ID is still free
|
||||
maxIdKey := fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||
resp, err := cli.Get(ctx, maxIdKey)
|
||||
cancel()
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||
}
|
||||
var modRev int64
|
||||
var nextId uint64
|
||||
if (len(resp.Kvs) > 0)
|
||||
{
|
||||
var err error
|
||||
nextId, err = strconv.ParseUint(string(resp.Kvs[0].Value), 10, 64)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, maxIdKey+" contains invalid ID")
|
||||
}
|
||||
modRev = resp.Kvs[0].ModRevision
|
||||
nextId++
|
||||
}
|
||||
else
|
||||
{
|
||||
nextId = 1
|
||||
}
|
||||
inodeIdxJson, _ := json.Marshal(InodeIndex{
|
||||
Id: nextId,
|
||||
PoolId: poolId,
|
||||
})
|
||||
inodeCfgJson, _ := json.Marshal(InodeConfig{
|
||||
Name: volName,
|
||||
Size: uint64(volSize),
|
||||
})
|
||||
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||
txnResp, err := cli.Txn(ctx).If(
|
||||
clientv3.Compare(clientv3.ModRevision(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)), "=", modRev),
|
||||
clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)), "=", 0),
|
||||
clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId)), "=", 0),
|
||||
).Then(
|
||||
clientv3.OpPut(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId), fmt.Sprintf("%d", nextId)),
|
||||
clientv3.OpPut(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName), string(inodeIdxJson)),
|
||||
clientv3.OpPut(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId), string(inodeCfgJson)),
|
||||
).Commit()
|
||||
cancel()
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "failed to commit transaction in etcd: "+err.Error())
|
||||
}
|
||||
if (txnResp.Succeeded)
|
||||
{
|
||||
imageId = nextId
|
||||
break
|
||||
}
|
||||
// Start over if the transaction fails
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
@@ -299,97 +237,12 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
|
||||
}
|
||||
volName := ctxVars["name"]
|
||||
|
||||
_, etcdUrl, etcdPrefix := GetConnectionParams(ctxVars)
|
||||
if (len(etcdUrl) == 0)
|
||||
{
|
||||
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
|
||||
}
|
||||
ctxVars, _, _ = GetConnectionParams(ctxVars)
|
||||
|
||||
cli, err := clientv3.New(clientv3.Config{
|
||||
DialTimeout: ETCD_TIMEOUT,
|
||||
Endpoints: etcdUrl,
|
||||
})
|
||||
_, err = invokeCLI(ctxVars, []string{ "rm", volName })
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error())
|
||||
}
|
||||
defer cli.Close()
|
||||
|
||||
// Find inode by name
|
||||
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||
resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
|
||||
cancel()
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||
}
|
||||
if (len(resp.Kvs) == 0)
|
||||
{
|
||||
return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
|
||||
}
|
||||
var idx InodeIndex
|
||||
err = json.Unmarshal(resp.Kvs[0].Value, &idx)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
|
||||
}
|
||||
|
||||
// Get inode config
|
||||
inodeCfgKey := fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)
|
||||
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||
resp, err = cli.Get(ctx, inodeCfgKey)
|
||||
cancel()
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||
}
|
||||
if (len(resp.Kvs) == 0)
|
||||
{
|
||||
return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
|
||||
}
|
||||
var inodeCfg InodeConfig
|
||||
err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
|
||||
}
|
||||
|
||||
// Delete inode data by invoking vitastor-cli
|
||||
args := []string{
|
||||
"rm-data", "--etcd_address", strings.Join(etcdUrl, ","),
|
||||
"--pool", fmt.Sprintf("%d", idx.PoolId),
|
||||
"--inode", fmt.Sprintf("%d", idx.Id),
|
||||
}
|
||||
if (ctxVars["configPath"] != "")
|
||||
{
|
||||
args = append(args, "--config_path", ctxVars["configPath"])
|
||||
}
|
||||
c := exec.Command("/usr/bin/vitastor-cli", args...)
|
||||
var stderr bytes.Buffer
|
||||
c.Stdout = nil
|
||||
c.Stderr = &stderr
|
||||
err = c.Run()
|
||||
stderrStr := string(stderr.Bytes())
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Errorf("vitastor-cli rm-data failed: %s, status %s\n", stderrStr, err)
|
||||
return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
|
||||
}
|
||||
|
||||
// Delete inode config in etcd
|
||||
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||
txnResp, err := cli.Txn(ctx).Then(
|
||||
clientv3.OpDelete(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)),
|
||||
clientv3.OpDelete(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)),
|
||||
).Commit()
|
||||
cancel()
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "failed to delete keys in etcd: "+err.Error())
|
||||
}
|
||||
if (!txnResp.Succeeded)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "failed to delete keys in etcd: transaction failed")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &csi.DeleteVolumeResponse{}, nil
|
||||
|
4
debian/changelog
vendored
4
debian/changelog
vendored
@@ -1,10 +1,10 @@
|
||||
vitastor (0.8.0-1) unstable; urgency=medium
|
||||
vitastor (0.8.6-1) unstable; urgency=medium
|
||||
|
||||
* Bugfixes
|
||||
|
||||
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
|
||||
|
||||
vitastor (0.8.0-1) unstable; urgency=medium
|
||||
vitastor (0.8.6-1) unstable; urgency=medium
|
||||
|
||||
* Implement NFS proxy
|
||||
* Add documentation
|
||||
|
2
debian/pve-storage-vitastor.install
vendored
2
debian/pve-storage-vitastor.install
vendored
@@ -1 +1 @@
|
||||
patches/PVE_VitastorPlugin.pm usr/share/perl5/PVE/Storage/Custom/VitastorPlugin.pm
|
||||
patches/VitastorPlugin.pm usr/share/perl5/PVE/Storage/Custom/
|
||||
|
8
debian/vitastor.Dockerfile
vendored
8
debian/vitastor.Dockerfile
vendored
@@ -34,8 +34,8 @@ RUN set -e -x; \
|
||||
mkdir -p /root/packages/vitastor-$REL; \
|
||||
rm -rf /root/packages/vitastor-$REL/*; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
cp -r /root/vitastor vitastor-0.8.0; \
|
||||
cd vitastor-0.8.0; \
|
||||
cp -r /root/vitastor vitastor-0.8.6; \
|
||||
cd vitastor-0.8.6; \
|
||||
ln -s /root/fio-build/fio-*/ ./fio; \
|
||||
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
||||
@@ -48,8 +48,8 @@ RUN set -e -x; \
|
||||
rm -rf a b; \
|
||||
echo "dep:fio=$FIO" > debian/fio_version; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.0.orig.tar.xz vitastor-0.8.0; \
|
||||
cd vitastor-0.8.0; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.6.orig.tar.xz vitastor-0.8.6; \
|
||||
cd vitastor-0.8.6; \
|
||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||
|
@@ -19,6 +19,7 @@ between clients, OSDs and etcd.
|
||||
- [rdma_max_sge](#rdma_max_sge)
|
||||
- [rdma_max_msg](#rdma_max_msg)
|
||||
- [rdma_max_recv](#rdma_max_recv)
|
||||
- [rdma_max_send](#rdma_max_send)
|
||||
- [peer_connect_interval](#peer_connect_interval)
|
||||
- [peer_connect_timeout](#peer_connect_timeout)
|
||||
- [osd_idle_timeout](#osd_idle_timeout)
|
||||
@@ -74,6 +75,12 @@ to work. For example, Mellanox ConnectX-3 and older adapters don't have
|
||||
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
|
||||
root to list available RDMA devices and their features.
|
||||
|
||||
Remember that you also have to configure your network switches if you use
|
||||
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
|
||||
the manual of your network vendor for details about setting up the switch
|
||||
for RoCEv2 correctly. Usually it means setting up Lossless Ethernet with
|
||||
PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
|
||||
|
||||
## rdma_port_num
|
||||
|
||||
- Type: integer
|
||||
@@ -116,20 +123,30 @@ required to change this parameter.
|
||||
## rdma_max_msg
|
||||
|
||||
- Type: integer
|
||||
- Default: 1048576
|
||||
- Default: 132096
|
||||
|
||||
Maximum size of a single RDMA send or receive operation in bytes.
|
||||
|
||||
## rdma_max_recv
|
||||
|
||||
- Type: integer
|
||||
- Default: 16
|
||||
|
||||
Maximum number of RDMA receive buffers per connection (RDMA requires
|
||||
preallocated buffers to receive data). Each buffer is `rdma_max_msg` bytes
|
||||
in size. So this setting directly affects memory usage: a single Vitastor
|
||||
RDMA client uses `rdma_max_recv * rdma_max_msg * OSD_COUNT` bytes of memory.
|
||||
Default is roughly 2 MB * number of OSDs.
|
||||
|
||||
## rdma_max_send
|
||||
|
||||
- Type: integer
|
||||
- Default: 8
|
||||
|
||||
Maximum number of parallel RDMA receive operations. Note that this number
|
||||
of receive buffers `rdma_max_msg` in size are allocated for each client,
|
||||
so this setting actually affects memory usage. This is because RDMA receive
|
||||
operations are (sadly) still not zero-copy in Vitastor. It may be fixed in
|
||||
later versions.
|
||||
Maximum number of outstanding RDMA send operations per connection. Should be
|
||||
less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
|
||||
Doesn't affect memory usage - additional memory isn't allocated for send
|
||||
operations.
|
||||
|
||||
## peer_connect_interval
|
||||
|
||||
|
@@ -19,6 +19,7 @@
|
||||
- [rdma_max_sge](#rdma_max_sge)
|
||||
- [rdma_max_msg](#rdma_max_msg)
|
||||
- [rdma_max_recv](#rdma_max_recv)
|
||||
- [rdma_max_send](#rdma_max_send)
|
||||
- [peer_connect_interval](#peer_connect_interval)
|
||||
- [peer_connect_timeout](#peer_connect_timeout)
|
||||
- [osd_idle_timeout](#osd_idle_timeout)
|
||||
@@ -78,6 +79,13 @@ Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Наприме
|
||||
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
|
||||
параметры и возможности.
|
||||
|
||||
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
|
||||
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
|
||||
нестабильной производительностью. Подробную информацию о настройке
|
||||
коммутатора для RoCEv2 ищите в документации производителя. Обычно это
|
||||
подразумевает настройку сети без потерь на основе PFC (Priority Flow
|
||||
Control) и ECN (Explicit Congestion Notification).
|
||||
|
||||
## rdma_port_num
|
||||
|
||||
- Тип: целое число
|
||||
@@ -121,22 +129,32 @@ OSD в любом случае согласовывают реальное зн
|
||||
## rdma_max_msg
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 1048576
|
||||
- Значение по умолчанию: 132096
|
||||
|
||||
Максимальный размер одной RDMA-операции отправки или приёма.
|
||||
|
||||
## rdma_max_recv
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 16
|
||||
|
||||
Максимальное число буферов для RDMA-приёма данных на одно соединение
|
||||
(RDMA требует заранее выделенных буферов для приёма данных). Каждый буфер
|
||||
имеет размер `rdma_max_msg` байт. Таким образом, настройка прямо влияет на
|
||||
потребление памяти - один Vitastor-клиент с RDMA использует
|
||||
`rdma_max_recv * rdma_max_msg * ЧИСЛО_OSD` байт памяти, по умолчанию -
|
||||
примерно 2 МБ * число OSD.
|
||||
|
||||
## rdma_max_send
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 8
|
||||
|
||||
Максимальное число параллельных RDMA-операций получения данных. Следует
|
||||
иметь в виду, что данное число буферов размером `rdma_max_msg` выделяется
|
||||
для каждого подключённого клиентского соединения, так что данная настройка
|
||||
влияет на потребление памяти. Это так потому, что RDMA-приём данных в
|
||||
Vitastor, увы, всё равно не является zero-copy, т.е. всё равно 1 раз
|
||||
копирует данные в памяти. Данная особенность, возможно, будет исправлена в
|
||||
более новых версиях Vitastor.
|
||||
Максимальное число RDMA-операций отправки, отправляемых в очередь одного
|
||||
соединения. Желательно, чтобы оно было меньше `rdma_max_recv`, чтобы
|
||||
у принимающей стороны в процессе работы не заканчивались буферы на приём.
|
||||
Не влияет на потребление памяти - дополнительная память на операции отправки
|
||||
не выделяется.
|
||||
|
||||
## peer_connect_interval
|
||||
|
||||
|
@@ -17,6 +17,7 @@ initialization and can be changed with an OSD restart.
|
||||
- [autosync_interval](#autosync_interval)
|
||||
- [autosync_writes](#autosync_writes)
|
||||
- [recovery_queue_depth](#recovery_queue_depth)
|
||||
- [recovery_pg_switch](#recovery_pg_switch)
|
||||
- [recovery_sync_batch](#recovery_sync_batch)
|
||||
- [readonly](#readonly)
|
||||
- [no_recovery](#no_recovery)
|
||||
@@ -115,6 +116,16 @@ Maximum recovery operations per one primary OSD at any given moment of time.
|
||||
Currently it's the only parameter available to tune the speed or recovery
|
||||
and rebalancing, but it's planned to implement more.
|
||||
|
||||
## recovery_pg_switch
|
||||
|
||||
- Type: integer
|
||||
- Default: 128
|
||||
|
||||
Number of recovery operations before switching to recovery of the next PG.
|
||||
The idea is to mix all PGs during recovery for more even space and load
|
||||
distribution but still benefit from recovery queue depth greater than 1.
|
||||
Degraded PGs are anyway scanned first.
|
||||
|
||||
## recovery_sync_batch
|
||||
|
||||
- Type: integer
|
||||
|
@@ -18,6 +18,7 @@
|
||||
- [autosync_interval](#autosync_interval)
|
||||
- [autosync_writes](#autosync_writes)
|
||||
- [recovery_queue_depth](#recovery_queue_depth)
|
||||
- [recovery_pg_switch](#recovery_pg_switch)
|
||||
- [recovery_sync_batch](#recovery_sync_batch)
|
||||
- [readonly](#readonly)
|
||||
- [no_recovery](#no_recovery)
|
||||
@@ -119,6 +120,17 @@ OSD, чтобы успевать очищать журнал - без них OSD
|
||||
для ускорения или замедления восстановления и перебалансировки данных, но
|
||||
в планах реализация других параметров.
|
||||
|
||||
## recovery_pg_switch
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 128
|
||||
|
||||
Число операций восстановления перед переключением на восстановление другой PG.
|
||||
Идея заключается в том, чтобы восстанавливать все PG одновременно для более
|
||||
равномерного распределения места и нагрузки, но при этом всё равно выигрывать
|
||||
от глубины очереди восстановления, большей, чем 1. Деградированные PG в любом
|
||||
случае сканируются первыми.
|
||||
|
||||
## recovery_sync_batch
|
||||
|
||||
- Тип: целое число
|
||||
|
@@ -82,7 +82,7 @@ Parent node reference is required for intermediate tree nodes.
|
||||
Separate OSD settings are set in etc keys `/vitastor/config/osd/<number>`
|
||||
in JSON format `{"<key>":<value>}`.
|
||||
|
||||
As of now, there is only one setting:
|
||||
As of now, two settings are supported:
|
||||
|
||||
## reweight
|
||||
|
||||
@@ -96,6 +96,15 @@ This means an OSD configured with reweight lower than 1 receives less PGs than
|
||||
it normally would. An OSD with reweight = 0 won't store any data. You can set
|
||||
reweight to 0 to trigger rebalance and remove all data from an OSD.
|
||||
|
||||
## tags
|
||||
|
||||
- Type: string or array of strings
|
||||
|
||||
Sets tag or multiple tags for this OSD. Tags can be used to group OSDs into
|
||||
subsets and then use a specific subset for pool instead of all OSDs.
|
||||
For example you can mark SSD OSDs with tag "ssd" and HDD OSDs with "hdd" and
|
||||
such tags will work as device classes.
|
||||
|
||||
# Pool parameters
|
||||
|
||||
## name
|
||||
|
@@ -81,7 +81,10 @@
|
||||
Настройки отдельных OSD задаются в ключах etcd `/vitastor/config/osd/<number>`
|
||||
в JSON-формате `{"<key>":<value>}`.
|
||||
|
||||
На данный момент поддерживается одна настройка:
|
||||
На данный момент поддерживаются две настройки:
|
||||
|
||||
- [reweight](#reweight)
|
||||
- [tags](#tags)
|
||||
|
||||
## reweight
|
||||
|
||||
@@ -96,6 +99,15 @@
|
||||
хранении данных вообще. Вы можете установить reweight в 0, чтобы убрать
|
||||
все данные с OSD.
|
||||
|
||||
## tags
|
||||
|
||||
- Тип: строка или массив строк
|
||||
|
||||
Задаёт тег или набор тегов для данного OSD. Теги можно использовать, чтобы
|
||||
делить OSD на множества и потом размещать пул только на части OSD, а не на
|
||||
всех. Можно, например, пометить SSD OSD тегом "ssd", а HDD тегом "hdd", в
|
||||
этом смысле теги работают аналогично классам устройств.
|
||||
|
||||
# Параметры
|
||||
|
||||
## name
|
||||
|
@@ -53,6 +53,12 @@
|
||||
to work. For example, Mellanox ConnectX-3 and older adapters don't have
|
||||
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
|
||||
root to list available RDMA devices and their features.
|
||||
|
||||
Remember that you also have to configure your network switches if you use
|
||||
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
|
||||
the manual of your network vendor for details about setting up the switch
|
||||
for RoCEv2 correctly. Usually it means setting up Lossless Ethernet with
|
||||
PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
|
||||
info_ru: |
|
||||
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
|
||||
Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
|
||||
@@ -61,6 +67,13 @@
|
||||
потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
|
||||
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
|
||||
параметры и возможности.
|
||||
|
||||
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
|
||||
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
|
||||
нестабильной производительностью. Подробную информацию о настройке
|
||||
коммутатора для RoCEv2 ищите в документации производителя. Обычно это
|
||||
подразумевает настройку сети без потерь на основе PFC (Priority Flow
|
||||
Control) и ECN (Explicit Congestion Notification).
|
||||
- name: rdma_port_num
|
||||
type: int
|
||||
default: 1
|
||||
@@ -114,26 +127,39 @@
|
||||
так что менять этот параметр обычно не нужно.
|
||||
- name: rdma_max_msg
|
||||
type: int
|
||||
default: 1048576
|
||||
default: 132096
|
||||
info: Maximum size of a single RDMA send or receive operation in bytes.
|
||||
info_ru: Максимальный размер одной RDMA-операции отправки или приёма.
|
||||
- name: rdma_max_recv
|
||||
type: int
|
||||
default: 16
|
||||
info: |
|
||||
Maximum number of RDMA receive buffers per connection (RDMA requires
|
||||
preallocated buffers to receive data). Each buffer is `rdma_max_msg` bytes
|
||||
in size. So this setting directly affects memory usage: a single Vitastor
|
||||
RDMA client uses `rdma_max_recv * rdma_max_msg * OSD_COUNT` bytes of memory.
|
||||
Default is roughly 2 MB * number of OSDs.
|
||||
info_ru: |
|
||||
Максимальное число буферов для RDMA-приёма данных на одно соединение
|
||||
(RDMA требует заранее выделенных буферов для приёма данных). Каждый буфер
|
||||
имеет размер `rdma_max_msg` байт. Таким образом, настройка прямо влияет на
|
||||
потребление памяти - один Vitastor-клиент с RDMA использует
|
||||
`rdma_max_recv * rdma_max_msg * ЧИСЛО_OSD` байт памяти, по умолчанию -
|
||||
примерно 2 МБ * число OSD.
|
||||
- name: rdma_max_send
|
||||
type: int
|
||||
default: 8
|
||||
info: |
|
||||
Maximum number of parallel RDMA receive operations. Note that this number
|
||||
of receive buffers `rdma_max_msg` in size are allocated for each client,
|
||||
so this setting actually affects memory usage. This is because RDMA receive
|
||||
operations are (sadly) still not zero-copy in Vitastor. It may be fixed in
|
||||
later versions.
|
||||
Maximum number of outstanding RDMA send operations per connection. Should be
|
||||
less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
|
||||
Doesn't affect memory usage - additional memory isn't allocated for send
|
||||
operations.
|
||||
info_ru: |
|
||||
Максимальное число параллельных RDMA-операций получения данных. Следует
|
||||
иметь в виду, что данное число буферов размером `rdma_max_msg` выделяется
|
||||
для каждого подключённого клиентского соединения, так что данная настройка
|
||||
влияет на потребление памяти. Это так потому, что RDMA-приём данных в
|
||||
Vitastor, увы, всё равно не является zero-copy, т.е. всё равно 1 раз
|
||||
копирует данные в памяти. Данная особенность, возможно, будет исправлена в
|
||||
более новых версиях Vitastor.
|
||||
Максимальное число RDMA-операций отправки, отправляемых в очередь одного
|
||||
соединения. Желательно, чтобы оно было меньше `rdma_max_recv`, чтобы
|
||||
у принимающей стороны в процессе работы не заканчивались буферы на приём.
|
||||
Не влияет на потребление памяти - дополнительная память на операции отправки
|
||||
не выделяется.
|
||||
- name: peer_connect_interval
|
||||
type: sec
|
||||
min: 1
|
||||
|
@@ -102,6 +102,20 @@
|
||||
момент времени. На данный момент единственный параметр, который можно менять
|
||||
для ускорения или замедления восстановления и перебалансировки данных, но
|
||||
в планах реализация других параметров.
|
||||
- name: recovery_pg_switch
|
||||
type: int
|
||||
default: 128
|
||||
info: |
|
||||
Number of recovery operations before switching to recovery of the next PG.
|
||||
The idea is to mix all PGs during recovery for more even space and load
|
||||
distribution but still benefit from recovery queue depth greater than 1.
|
||||
Degraded PGs are anyway scanned first.
|
||||
info_ru: |
|
||||
Число операций восстановления перед переключением на восстановление другой PG.
|
||||
Идея заключается в том, чтобы восстанавливать все PG одновременно для более
|
||||
равномерного распределения места и нагрузки, но при этом всё равно выигрывать
|
||||
от глубины очереди восстановления, большей, чем 1. Деградированные PG в любом
|
||||
случае сканируются первыми.
|
||||
- name: recovery_sync_batch
|
||||
type: int
|
||||
default: 16
|
||||
|
@@ -9,7 +9,7 @@
|
||||
## Debian
|
||||
|
||||
- Trust Vitastor package signing key:
|
||||
`wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
|
||||
`wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
|
||||
- Add Vitastor package repository to your /etc/apt/sources.list:
|
||||
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
|
||||
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
|
||||
@@ -20,8 +20,8 @@
|
||||
## CentOS
|
||||
|
||||
- Add Vitastor package repository:
|
||||
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm`
|
||||
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm`
|
||||
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release.rpm`
|
||||
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release.rpm`
|
||||
- Enable EPEL: `yum/dnf install epel-release`
|
||||
- Enable additional CentOS repositories:
|
||||
- CentOS 7: `yum install centos-release-scl`
|
||||
|
@@ -9,7 +9,7 @@
|
||||
## Debian
|
||||
|
||||
- Добавьте ключ репозитория Vitastor:
|
||||
`wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
|
||||
`wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
|
||||
- Добавьте репозиторий Vitastor в /etc/apt/sources.list:
|
||||
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
|
||||
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
|
||||
@@ -20,8 +20,8 @@
|
||||
## CentOS
|
||||
|
||||
- Добавьте в систему репозиторий Vitastor:
|
||||
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm`
|
||||
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm`
|
||||
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release.rpm`
|
||||
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release.rpm`
|
||||
- Включите EPEL: `yum/dnf install epel-release`
|
||||
- Включите дополнительные репозитории CentOS:
|
||||
- CentOS 7: `yum install centos-release-scl`
|
||||
|
@@ -6,10 +6,10 @@
|
||||
|
||||
# Proxmox VE
|
||||
|
||||
To enable Vitastor support in Proxmox Virtual Environment (6.4 and 7.1 are supported):
|
||||
To enable Vitastor support in Proxmox Virtual Environment (6.4-7.3 are supported):
|
||||
|
||||
- Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts
|
||||
(buster for 6.4, bullseye for 7.1)
|
||||
- Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts:
|
||||
buster for 6.4, bullseye for 7.3, pve7.1 for 7.1, pve7.2 for 7.2
|
||||
- Install vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* or see note) packages from Vitastor repository
|
||||
- Define storage in `/etc/pve/storage.cfg` (see below)
|
||||
- Block network access from VMs to Vitastor network (to OSDs and etcd),
|
||||
@@ -35,5 +35,5 @@ vitastor: vitastor
|
||||
vitastor_nbd 0
|
||||
```
|
||||
|
||||
\* Note: you can also manually copy [patches/PVE_VitastorPlugin.pm](patches/PVE_VitastorPlugin.pm) to Proxmox hosts
|
||||
\* Note: you can also manually copy [patches/VitastorPlugin.pm](patches/VitastorPlugin.pm) to Proxmox hosts
|
||||
as `/usr/share/perl5/PVE/Storage/Custom/VitastorPlugin.pm` instead of installing pve-storage-vitastor.
|
||||
|
@@ -6,10 +6,10 @@
|
||||
|
||||
# Proxmox
|
||||
|
||||
Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4 и 7.1):
|
||||
Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-7.3):
|
||||
|
||||
- Добавьте соответствующий Debian-репозиторий Vitastor в sources.list на хостах Proxmox
|
||||
(buster для 6.4, bullseye для 7.1)
|
||||
- Добавьте соответствующий Debian-репозиторий Vitastor в sources.list на хостах Proxmox:
|
||||
buster для 6.4, bullseye для 7.3, pve7.1 для 7.1, pve7.2 для 7.2
|
||||
- Установите пакеты vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* или см. сноску) из репозитория Vitastor
|
||||
- Определите тип хранилища в `/etc/pve/storage.cfg` (см. ниже)
|
||||
- Обязательно заблокируйте доступ от виртуальных машин к сети Vitastor (OSD и etcd), т.к. Vitastor (пока) не поддерживает аутентификацию
|
||||
@@ -35,5 +35,5 @@ vitastor: vitastor
|
||||
```
|
||||
|
||||
\* Примечание: вместо установки пакета pve-storage-vitastor вы можете вручную скопировать файл
|
||||
[patches/PVE_VitastorPlugin.pm](patches/PVE_VitastorPlugin.pm) на хосты Proxmox как
|
||||
[patches/VitastorPlugin.pm](patches/VitastorPlugin.pm) на хосты Proxmox как
|
||||
`/usr/share/perl5/PVE/Storage/Custom/VitastorPlugin.pm`.
|
||||
|
@@ -70,7 +70,7 @@ For EC pools the configuration should look like the following:
|
||||
|
||||
```
|
||||
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
|
||||
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}`
|
||||
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}'
|
||||
```
|
||||
|
||||
After you do this, one of the monitors will configure PGs and OSDs will start them.
|
||||
|
@@ -71,7 +71,7 @@ etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool",
|
||||
|
||||
```
|
||||
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
|
||||
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}`
|
||||
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}'
|
||||
```
|
||||
|
||||
После этого один из мониторов должен сконфигурировать PG, а OSD должны запустить их.
|
||||
|
@@ -14,12 +14,14 @@ It supports the following commands:
|
||||
- [df](#df)
|
||||
- [ls](#ls)
|
||||
- [create](#create)
|
||||
- [snap-create](#create)
|
||||
- [modify](#modify)
|
||||
- [rm](#rm)
|
||||
- [flatten](#flatten)
|
||||
- [rm-data](#rm-data)
|
||||
- [merge-data](#merge-data)
|
||||
- [alloc-osd](#alloc-osd)
|
||||
- [rm-osd](#rm-osd)
|
||||
|
||||
Global options:
|
||||
|
||||
@@ -122,6 +124,8 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
|
||||
|
||||
Create a snapshot of image `<name>` (either form can be used). May be used live if only a single writer is active.
|
||||
|
||||
See also about [how to export snapshots](qemu.en.md#exporting-snapshots).
|
||||
|
||||
## modify
|
||||
|
||||
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
|
||||
@@ -175,3 +179,14 @@ Merge layer data without changing metadata. Merge `<from>`..`<to>` to `<target>`
|
||||
`vitastor-cli alloc-osd`
|
||||
|
||||
Allocate a new OSD number and reserve it by creating empty `/osd/stats/<n>` key.
|
||||
|
||||
## rm-osd
|
||||
|
||||
`vitastor-cli rm-osd [--force] [--allow-data-loss] [--dry-run] <osd_id> [osd_id...]`
|
||||
|
||||
Remove metadata and configuration for specified OSD(s) from etcd.
|
||||
|
||||
Refuses to remove OSDs with data without `--force` and `--allow-data-loss`.
|
||||
|
||||
With `--dry-run` only checks if deletion is possible without data loss and
|
||||
redundancy degradation.
|
||||
|
@@ -15,12 +15,14 @@ vitastor-cli - интерфейс командной строки для адм
|
||||
- [df](#df)
|
||||
- [ls](#ls)
|
||||
- [create](#create)
|
||||
- [snap-create](#create)
|
||||
- [modify](#modify)
|
||||
- [rm](#rm)
|
||||
- [flatten](#flatten)
|
||||
- [rm-data](#rm-data)
|
||||
- [merge-data](#merge-data)
|
||||
- [alloc-osd](#alloc-osd)
|
||||
- [rm-osd](#rm-osd)
|
||||
|
||||
Глобальные опции:
|
||||
|
||||
@@ -125,6 +127,8 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
|
||||
Создать снимок образа `<name>` (можно использовать любую форму команды). Снимок можно создавать без остановки
|
||||
клиентов, если пишущий клиент максимум 1.
|
||||
|
||||
Смотрите также информацию о том, [как экспортировать снимки](qemu.ru.md#экспорт-снимков).
|
||||
|
||||
## modify
|
||||
|
||||
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
|
||||
@@ -186,3 +190,14 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
|
||||
|
||||
Атомарно выделить новый номер OSD и зарезервировать его, создав в etcd пустой
|
||||
ключ `/osd/stats/<n>`.
|
||||
|
||||
## rm-osd
|
||||
|
||||
`vitastor-cli rm-osd [--force] [--allow-data-loss] [--dry-run] <osd_id> [osd_id...]`
|
||||
|
||||
Удалить метаданные и конфигурацию для заданных OSD из etcd.
|
||||
|
||||
Отказывается удалять OSD с данными без опций `--force` и `--allow-data-loss`.
|
||||
|
||||
С опцией `--dry-run` только проверяет, возможно ли удаление без потери данных и деградации
|
||||
избыточности.
|
||||
|
@@ -1,4 +1,4 @@
|
||||
[Documentation](../../README.md#documentation) → Usage → Disk Tool
|
||||
[Documentation](../../README.md#documentation) → Usage → Disk management tool
|
||||
|
||||
-----
|
||||
|
||||
@@ -14,6 +14,7 @@ It supports the following commands:
|
||||
- [upgrade-simple](#upgrade-simple)
|
||||
- [resize](#resize)
|
||||
- [start/stop/restart/enable/disable](#start/stop/restart/enable/disable)
|
||||
- [purge](#purge)
|
||||
- [read-sb](#read-sb)
|
||||
- [write-sb](#write-sb)
|
||||
- [udev](#udev)
|
||||
@@ -155,11 +156,22 @@ Commands are passed to `systemctl` with `vitastor-osd@<num>` units as arguments.
|
||||
|
||||
When `--now` is added to enable/disable, OSDs are also immediately started/stopped.
|
||||
|
||||
## purge
|
||||
|
||||
`vitastor-disk purge [--force] [--allow-data-loss] <device> [device2 device3 ...]`
|
||||
|
||||
Purge Vitastor OSD(s) on specified device(s). Uses `vitastor-cli rm-osd` to check
|
||||
if deletion is possible without data loss and to actually remove metadata from etcd.
|
||||
`--force` and `--allow-data-loss` options may be used to ignore safety check results.
|
||||
|
||||
Requires `vitastor-cli`, `sfdisk` and `partprobe` (from parted) utilities.
|
||||
|
||||
## read-sb
|
||||
|
||||
`vitastor-disk read-sb <device>`
|
||||
`vitastor-disk read-sb [--force] <device>`
|
||||
|
||||
Try to read Vitastor OSD superblock from `<device>` and print it in JSON format.
|
||||
`--force` allows to ignore validation errors.
|
||||
|
||||
## write-sb
|
||||
|
||||
|
@@ -1,4 +1,4 @@
|
||||
[Документация](../../README-ru.md#документация) → Использование → Управление дисками
|
||||
[Документация](../../README-ru.md#документация) → Использование → Инструмент управления дисками
|
||||
|
||||
-----
|
||||
|
||||
@@ -14,6 +14,7 @@ vitastor-disk - инструмент командной строки для уп
|
||||
- [upgrade-simple](#upgrade-simple)
|
||||
- [resize](#resize)
|
||||
- [start/stop/restart/enable/disable](#start/stop/restart/enable/disable)
|
||||
- [purge](#purge)
|
||||
- [read-sb](#read-sb)
|
||||
- [write-sb](#write-sb)
|
||||
- [udev](#udev)
|
||||
@@ -158,12 +159,25 @@ throttle_target_mbs, throttle_target_parallelism, throttle_threshold_us.
|
||||
Когда к командам включения/выключения добавляется параметр `--now`, OSD также сразу
|
||||
запускаются/останавливаются.
|
||||
|
||||
## purge
|
||||
|
||||
`vitastor-disk purge [--force] [--allow-data-loss] <device> [device2 device3 ...]`
|
||||
|
||||
Удалить OSD на заданном диске/дисках. Использует `vitastor-cli rm-osd` для проверки
|
||||
возможности удаления без потери данных и для удаления OSD из etcd. Опции `--force`
|
||||
и `--allow-data-loss` служат для обхода данной защиты в случае необходимости.
|
||||
|
||||
Команде требуются утилиты `vitastor-cli`, `sfdisk` и `partprobe` (из состава parted).
|
||||
|
||||
## read-sb
|
||||
|
||||
`vitastor-disk read-sb <device>`
|
||||
`vitastor-disk read-sb [--force] <device>`
|
||||
|
||||
Прочитать суперблок OSD с диска `<device>` и вывести его в формате JSON.
|
||||
|
||||
Опция `--force` позволяет читать суперблок, даже если он считается некорректным
|
||||
из-за ошибок валидации.
|
||||
|
||||
## write-sb
|
||||
|
||||
`vitastor-disk write-sb <device>`
|
||||
|
@@ -46,3 +46,40 @@ qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=192.168.7
|
||||
|
||||
You can also specify `:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>`
|
||||
if you don't want to use inode metadata.
|
||||
|
||||
### Exporting snapshots
|
||||
|
||||
Starting with 0.8.4, you can also export individual layers (snapshot diffs) using `qemu-img`.
|
||||
|
||||
Suppose you have an image `testimg` and a snapshot `testimg@0` created with `vitastor-cli snap-create testimg@0`.
|
||||
|
||||
Then you can export the `testimg@0` snapshot and the data written to `testimg` after creating
|
||||
the snapshot separately using the following commands (key points are using `skip-parents=1` and
|
||||
`-B backing_file` option):
|
||||
|
||||
```
|
||||
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg@0' \
|
||||
-O qcow2 testimg_0.qcow2
|
||||
|
||||
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg:skip-parents=1' \
|
||||
-O qcow2 -o 'cluster_size=4k' -B testimg_0.qcow2 testimg.qcow2
|
||||
```
|
||||
|
||||
In fact, with `cluster_size=4k` any QCOW2 file can be used instead `-B testimg_0.qcow2`, even an empty one.
|
||||
|
||||
QCOW2 `cluster_size=4k` option is required if you want `testimg.qcow2` to contain only the data
|
||||
overwritten **exactly** in the child layer. With the default 64 KB QCOW2 cluster size you'll
|
||||
get a bit of extra data from parent layers, i.e. a 4 KB overwrite will result in `testimg.qcow2`
|
||||
containing 64 KB of data. And this extra data will be taken by `qemu-img` from the file passed
|
||||
in `-B` option, so you really need 4 KB cluster if you use an empty image in `-B`.
|
||||
|
||||
After this procedure you'll get two chained QCOW2 images. To detach `testimg.qcow2` from
|
||||
its parent, run:
|
||||
|
||||
```
|
||||
qemu-img rebase -u -b '' testimg.qcow2
|
||||
```
|
||||
|
||||
This can be used for backups. Just note that exporting an image that is currently being written to
|
||||
is of course unsafe and doesn't produce a consistent result, so only export snapshots if you do this
|
||||
on a live VM.
|
||||
|
@@ -50,3 +50,40 @@ qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.
|
||||
|
||||
Если вы не хотите обращаться к образу по имени, вместо `:image=<IMAGE>` можно указать номер пула, номер инода и размер:
|
||||
`:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
|
||||
|
||||
### Экспорт снимков
|
||||
|
||||
Начиная с 0.8.4 вы можете экспортировать отдельные слои (изменения в снимках) с помощью `qemu-img`.
|
||||
|
||||
Допустим, что у вас есть образ `testimg` и его снимок `testimg@0`, созданный с помощью `vitastor-cli snap-create testimg@0`.
|
||||
|
||||
Тогда вы можете выгрузить снимок `testimg@0` и данные, изменённые в `testimg` после создания снимка, отдельно,
|
||||
с помощью следующих команд (ключевые моменты - использование `skip-parents=1` и опции `-B backing_file.qcow2`):
|
||||
|
||||
```
|
||||
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg@0' \
|
||||
-O qcow2 testimg_0.qcow2
|
||||
|
||||
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg:skip-parents=1' \
|
||||
-O qcow2 -o 'cluster_size=4k' -B testimg_0.qcow2 testimg.qcow2
|
||||
```
|
||||
|
||||
На самом деле, с `cluster_size=4k` вместо `-B testimg_0.qcow2` можно использовать любой qcow2-файл,
|
||||
даже пустой.
|
||||
|
||||
Опция QCOW2 `cluster_size=4k` нужна, если вы хотите, чтобы `testimg.qcow2` содержал **в точности**
|
||||
данные, перезаписанные в дочернем слое. С размером кластера QCOW2 по умолчанию, составляющим 64 КБ,
|
||||
вы получите немного "лишних" данных из родительских слоёв - перезапись 4 КБ будет приводить к тому,
|
||||
что в `testimg.qcow2` будет появляться 64 КБ данных. Причём "лишние" данные qemu-img будет брать
|
||||
как раз из файла, указанного в опции `-B`, так что если там указан пустой образ, кластер обязан быть 4 КБ.
|
||||
|
||||
После данной процедуры вы получите два QCOW2-образа, связанных в цепочку. Чтобы "отцепить" образ
|
||||
`testimg.qcow2` от базового, выполните:
|
||||
|
||||
```
|
||||
qemu-img rebase -u -b '' testimg.qcow2
|
||||
```
|
||||
|
||||
Это можно использовать для резервного копирования. Только помните, что экспортировать образ, в который
|
||||
в то же время идёт запись, небезопасно - результат чтения не будет целостным. Так что если вы работаете
|
||||
с активными виртуальными машинами, экспортируйте только их снимки, но не сам образ.
|
||||
|
2
json11
2
json11
Submodule json11 updated: 52a3af664f...fd37016cf8
@@ -21,7 +21,7 @@ function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_p
|
||||
{
|
||||
for (const pg of oh.osd_sets)
|
||||
{
|
||||
nh.osd_sets[pg.join(' ')] = pg;
|
||||
nh.osd_sets[pg.join(' ')] = pg.map(osd_num => Number(osd_num));
|
||||
}
|
||||
}
|
||||
if (oh && oh.all_peers && oh.all_peers.length)
|
||||
|
@@ -550,8 +550,8 @@ function random_combinations(osd_tree, pg_size, count, ordered)
|
||||
seed ^= seed << 5;
|
||||
return seed + 2147483648;
|
||||
};
|
||||
const hosts = Object.keys(osd_tree).sort();
|
||||
const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
|
||||
const hosts = Object.keys(osd_tree).sort().filter(h => osds[h].length > 0);
|
||||
const r = {};
|
||||
// Generate random combinations including each OSD at least once
|
||||
for (let h = 0; h < hosts.length; h++)
|
||||
|
@@ -79,7 +79,7 @@ StartLimitInterval=0
|
||||
RestartSec=10
|
||||
|
||||
[Install]
|
||||
WantedBy=local.target
|
||||
WantedBy=multi-user.target
|
||||
`);
|
||||
await system(`useradd etcd`);
|
||||
await system(`systemctl daemon-reload`);
|
||||
|
42
mon/mon.js
42
mon/mon.js
@@ -70,9 +70,9 @@ const etcd_tree = {
|
||||
rdma_gid_index: 0,
|
||||
rdma_mtu: 4096,
|
||||
rdma_max_sge: 128,
|
||||
rdma_max_send: 32,
|
||||
rdma_max_recv: 8,
|
||||
rdma_max_msg: 1048576,
|
||||
rdma_max_send: 8,
|
||||
rdma_max_recv: 16,
|
||||
rdma_max_msg: 132096,
|
||||
log_level: 0,
|
||||
block_size: 131072,
|
||||
disk_alignment: 4096,
|
||||
@@ -261,7 +261,7 @@ const etcd_tree = {
|
||||
/* <pool_id>: {
|
||||
<pg_id>: {
|
||||
primary: osd_num_t,
|
||||
state: ("starting"|"peering"|"peered"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
|
||||
state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
|
||||
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
|
||||
"has_invalid"|"left_on_dead")[],
|
||||
}
|
||||
@@ -663,12 +663,15 @@ class Mon
|
||||
async save_last_clean()
|
||||
{
|
||||
// last_clean_pgs is used to avoid extra data move when observing a series of changes in the cluster
|
||||
const new_clean_pgs = { items: {} };
|
||||
next_pool:
|
||||
for (const pool_id in this.state.config.pools)
|
||||
{
|
||||
new_clean_pgs.items[pool_id] = (this.state.history.last_clean_pgs.items||{})[pool_id];
|
||||
const pool_cfg = this.state.config.pools[pool_id];
|
||||
if (!this.validate_pool_cfg(pool_id, pool_cfg, false))
|
||||
{
|
||||
continue;
|
||||
continue next_pool;
|
||||
}
|
||||
for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++)
|
||||
{
|
||||
@@ -677,17 +680,18 @@ class Mon
|
||||
!(this.state.pg.state[pool_id][pg_num].state instanceof Array))
|
||||
{
|
||||
// Unclean
|
||||
return;
|
||||
continue next_pool;
|
||||
}
|
||||
let st = this.state.pg.state[pool_id][pg_num].state.join(',');
|
||||
if (st != 'active' && st != 'active,left_on_dead' && st != 'left_on_dead,active')
|
||||
{
|
||||
// Unclean
|
||||
return;
|
||||
continue next_pool;
|
||||
}
|
||||
}
|
||||
new_clean_pgs.items[pool_id] = this.state.config.pgs.items[pool_id];
|
||||
}
|
||||
this.state.history.last_clean_pgs = JSON.parse(JSON.stringify(this.state.config.pgs));
|
||||
this.state.history.last_clean_pgs = new_clean_pgs;
|
||||
await this.etcd_call('/kv/txn', {
|
||||
success: [ { requestPut: {
|
||||
key: b64(this.etcd_prefix+'/history/last_clean_pgs'),
|
||||
@@ -1374,16 +1378,14 @@ class Mon
|
||||
// This is required for multiple change events to trigger at most 1 recheck in 1s
|
||||
schedule_recheck()
|
||||
{
|
||||
if (this.recheck_timer)
|
||||
if (!this.recheck_timer)
|
||||
{
|
||||
clearTimeout(this.recheck_timer);
|
||||
this.recheck_timer = null;
|
||||
this.recheck_timer = setTimeout(() =>
|
||||
{
|
||||
this.recheck_timer = null;
|
||||
this.recheck_pgs().catch(this.die);
|
||||
}, this.config.mon_change_timeout || 1000);
|
||||
}
|
||||
this.recheck_timer = setTimeout(() =>
|
||||
{
|
||||
this.recheck_timer = null;
|
||||
this.recheck_pgs().catch(this.die);
|
||||
}, this.config.mon_change_timeout || 1000);
|
||||
}
|
||||
|
||||
sum_op_stats(timestamp, prev_stats)
|
||||
@@ -1693,6 +1695,7 @@ class Mon
|
||||
// Do not clear these to null
|
||||
kv.value = kv.value || {};
|
||||
}
|
||||
const old = cur[key_parts[key_parts.length-1]];
|
||||
cur[key_parts[key_parts.length-1]] = kv.value;
|
||||
if (key === 'config/global')
|
||||
{
|
||||
@@ -1717,7 +1720,12 @@ class Mon
|
||||
}
|
||||
else if (key_parts[0] === 'osd' && key_parts[1] === 'stats')
|
||||
{
|
||||
// Recheck PGs <osd_out_time> later
|
||||
// Recheck OSD tree on OSD addition/deletion
|
||||
if ((!old) != (!kv.value) || old && kv.value && old.size != kv.value.size)
|
||||
{
|
||||
this.schedule_recheck();
|
||||
}
|
||||
// Recheck PGs <osd_out_time> after last OSD statistics report
|
||||
this.schedule_next_recheck_at(
|
||||
!this.state.osd.stats[key[2]] ? 0 : this.state.osd.stats[key[2]].time+this.config.osd_out_time
|
||||
);
|
||||
|
@@ -16,6 +16,11 @@ use PVE::Tools qw(run_command);
|
||||
|
||||
use base qw(PVE::Storage::Plugin);
|
||||
|
||||
if (@PVE::Storage::Plugin::SHARED_STORAGE)
|
||||
{
|
||||
push @PVE::Storage::Plugin::SHARED_STORAGE, 'vitastor';
|
||||
}
|
||||
|
||||
sub api
|
||||
{
|
||||
# Trick it :)
|
||||
@@ -133,9 +138,11 @@ sub properties
|
||||
sub options
|
||||
{
|
||||
return {
|
||||
shared => { optional => 1 },
|
||||
content => { optional => 1 },
|
||||
nodes => { optional => 1 },
|
||||
disable => { optional => 1 },
|
||||
vitastor_etcd_address => { optional => 1},
|
||||
vitastor_etcd_address => { optional => 1 },
|
||||
vitastor_etcd_prefix => { optional => 1 },
|
||||
vitastor_config_path => { optional => 1 },
|
||||
vitastor_prefix => { optional => 1 },
|
@@ -50,7 +50,7 @@ from cinder.volume import configuration
|
||||
from cinder.volume import driver
|
||||
from cinder.volume import volume_utils
|
||||
|
||||
VERSION = '0.8.0'
|
||||
VERSION = '0.8.6'
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
169
patches/pve-qemu-6.2-vitastor.patch
Normal file
169
patches/pve-qemu-6.2-vitastor.patch
Normal file
@@ -0,0 +1,169 @@
|
||||
Index: qemu/block/meson.build
|
||||
===================================================================
|
||||
--- qemu.orig/block/meson.build
|
||||
+++ qemu/block/meson.build
|
||||
@@ -91,6 +91,7 @@ foreach m : [
|
||||
[libnfs, 'nfs', files('nfs.c')],
|
||||
[libssh, 'ssh', files('ssh.c')],
|
||||
[rbd, 'rbd', files('rbd.c')],
|
||||
+ [vitastor, 'vitastor', files('vitastor.c')],
|
||||
]
|
||||
if m[0].found()
|
||||
module_ss = ss.source_set()
|
||||
Index: qemu/meson.build
|
||||
===================================================================
|
||||
--- qemu.orig/meson.build
|
||||
+++ qemu/meson.build
|
||||
@@ -838,6 +838,26 @@ if not get_option('rbd').auto() or have_
|
||||
endif
|
||||
endif
|
||||
|
||||
+vitastor = not_found
|
||||
+if not get_option('vitastor').auto() or have_block
|
||||
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
||||
+ required: get_option('vitastor'), kwargs: static_kwargs)
|
||||
+ if libvitastor_client.found()
|
||||
+ if cc.links('''
|
||||
+ #include <vitastor_c.h>
|
||||
+ int main(void) {
|
||||
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
+ return 0;
|
||||
+ }''', dependencies: libvitastor_client)
|
||||
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
||||
+ elif get_option('vitastor').enabled()
|
||||
+ error('could not link libvitastor_client')
|
||||
+ else
|
||||
+ warning('could not link libvitastor_client, disabling')
|
||||
+ endif
|
||||
+ endif
|
||||
+endif
|
||||
+
|
||||
glusterfs = not_found
|
||||
glusterfs_ftruncate_has_stat = false
|
||||
glusterfs_iocb_has_stat = false
|
||||
@@ -1459,6 +1479,7 @@ config_host_data.set('CONFIG_LINUX_AIO',
|
||||
config_host_data.set('CONFIG_LINUX_IO_URING', linux_io_uring.found())
|
||||
config_host_data.set('CONFIG_LIBPMEM', libpmem.found())
|
||||
config_host_data.set('CONFIG_RBD', rbd.found())
|
||||
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
||||
config_host_data.set('CONFIG_SDL', sdl.found())
|
||||
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
|
||||
config_host_data.set('CONFIG_SECCOMP', seccomp.found())
|
||||
@@ -3424,6 +3445,7 @@ if spice_protocol.found()
|
||||
summary_info += {' spice server support': spice}
|
||||
endif
|
||||
summary_info += {'rbd support': rbd}
|
||||
+summary_info += {'vitastor support': vitastor}
|
||||
summary_info += {'xfsctl support': config_host.has_key('CONFIG_XFS')}
|
||||
summary_info += {'smartcard support': cacard}
|
||||
summary_info += {'U2F support': u2f}
|
||||
Index: qemu/meson_options.txt
|
||||
===================================================================
|
||||
--- qemu.orig/meson_options.txt
|
||||
+++ qemu/meson_options.txt
|
||||
@@ -121,6 +121,8 @@ option('lzo', type : 'feature', value :
|
||||
description: 'lzo compression support')
|
||||
option('rbd', type : 'feature', value : 'auto',
|
||||
description: 'Ceph block device driver')
|
||||
+option('vitastor', type : 'feature', value : 'auto',
|
||||
+ description: 'Vitastor block device driver')
|
||||
option('gtk', type : 'feature', value : 'auto',
|
||||
description: 'GTK+ user interface')
|
||||
option('sdl', type : 'feature', value : 'auto',
|
||||
Index: qemu/qapi/block-core.json
|
||||
===================================================================
|
||||
--- qemu.orig/qapi/block-core.json
|
||||
+++ qemu/qapi/block-core.json
|
||||
@@ -3179,7 +3179,7 @@
|
||||
'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
|
||||
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
||||
'pbs',
|
||||
- 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
|
||||
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor', 'vmdk', 'vpc', 'vvfat' ] }
|
||||
|
||||
##
|
||||
# @BlockdevOptionsFile:
|
||||
@@ -4125,6 +4125,28 @@
|
||||
'*server': ['InetSocketAddressBase'] } }
|
||||
|
||||
##
|
||||
+# @BlockdevOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific block device options for vitastor
|
||||
+#
|
||||
+# @image: Image name
|
||||
+# @inode: Inode number
|
||||
+# @pool: Pool ID
|
||||
+# @size: Desired image size in bytes
|
||||
+# @config-path: Path to Vitastor configuration
|
||||
+# @etcd-host: etcd connection address(es)
|
||||
+# @etcd-prefix: etcd key/value prefix
|
||||
+##
|
||||
+{ 'struct': 'BlockdevOptionsVitastor',
|
||||
+ 'data': { '*inode': 'uint64',
|
||||
+ '*pool': 'uint64',
|
||||
+ '*size': 'uint64',
|
||||
+ '*image': 'str',
|
||||
+ '*config-path': 'str',
|
||||
+ '*etcd-host': 'str',
|
||||
+ '*etcd-prefix': 'str' } }
|
||||
+
|
||||
+##
|
||||
# @ReplicationMode:
|
||||
#
|
||||
# An enumeration of replication modes.
|
||||
@@ -4520,6 +4542,7 @@
|
||||
'throttle': 'BlockdevOptionsThrottle',
|
||||
'vdi': 'BlockdevOptionsGenericFormat',
|
||||
'vhdx': 'BlockdevOptionsGenericFormat',
|
||||
+ 'vitastor': 'BlockdevOptionsVitastor',
|
||||
'vmdk': 'BlockdevOptionsGenericCOWFormat',
|
||||
'vpc': 'BlockdevOptionsGenericFormat',
|
||||
'vvfat': 'BlockdevOptionsVVFAT'
|
||||
@@ -4910,6 +4933,17 @@
|
||||
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
||||
|
||||
##
|
||||
+# @BlockdevCreateOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific image creation options for Vitastor.
|
||||
+#
|
||||
+# @size: Size of the virtual disk in bytes
|
||||
+##
|
||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
||||
+ 'size': 'size' } }
|
||||
+
|
||||
+##
|
||||
# @BlockdevVmdkSubformat:
|
||||
#
|
||||
# Subformat options for VMDK images
|
||||
@@ -5108,6 +5142,7 @@
|
||||
'ssh': 'BlockdevCreateOptionsSsh',
|
||||
'vdi': 'BlockdevCreateOptionsVdi',
|
||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
||||
'vmdk': 'BlockdevCreateOptionsVmdk',
|
||||
'vpc': 'BlockdevCreateOptionsVpc'
|
||||
} }
|
||||
Index: qemu/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
===================================================================
|
||||
--- qemu.orig/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
+++ qemu/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
@@ -31,7 +31,7 @@
|
||||
--with-git=meson \
|
||||
--with-git-submodules=update \
|
||||
--target-list="x86_64-softmmu" \
|
||||
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||
--audio-drv-list="" \
|
||||
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
|
||||
--with-coroutine=ucontext \
|
||||
@@ -183,6 +183,7 @@
|
||||
--enable-opengl \
|
||||
--enable-pie \
|
||||
--enable-rbd \
|
||||
+--enable-vitastor \
|
||||
--enable-rdma \
|
||||
--enable-seccomp \
|
||||
--enable-snappy \
|
169
patches/pve-qemu-7.1-vitastor.patch
Normal file
169
patches/pve-qemu-7.1-vitastor.patch
Normal file
@@ -0,0 +1,169 @@
|
||||
Index: qemu/block/meson.build
|
||||
===================================================================
|
||||
--- qemu.orig/block/meson.build
|
||||
+++ qemu/block/meson.build
|
||||
@@ -111,6 +111,7 @@ foreach m : [
|
||||
[libnfs, 'nfs', files('nfs.c')],
|
||||
[libssh, 'ssh', files('ssh.c')],
|
||||
[rbd, 'rbd', files('rbd.c')],
|
||||
+ [vitastor, 'vitastor', files('vitastor.c')],
|
||||
]
|
||||
if m[0].found()
|
||||
module_ss = ss.source_set()
|
||||
Index: qemu/meson.build
|
||||
===================================================================
|
||||
--- qemu.orig/meson.build
|
||||
+++ qemu/meson.build
|
||||
@@ -967,6 +967,26 @@ if not get_option('rbd').auto() or have_
|
||||
endif
|
||||
endif
|
||||
|
||||
+vitastor = not_found
|
||||
+if not get_option('vitastor').auto() or have_block
|
||||
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
||||
+ required: get_option('vitastor'), kwargs: static_kwargs)
|
||||
+ if libvitastor_client.found()
|
||||
+ if cc.links('''
|
||||
+ #include <vitastor_c.h>
|
||||
+ int main(void) {
|
||||
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
+ return 0;
|
||||
+ }''', dependencies: libvitastor_client)
|
||||
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
||||
+ elif get_option('vitastor').enabled()
|
||||
+ error('could not link libvitastor_client')
|
||||
+ else
|
||||
+ warning('could not link libvitastor_client, disabling')
|
||||
+ endif
|
||||
+ endif
|
||||
+endif
|
||||
+
|
||||
glusterfs = not_found
|
||||
glusterfs_ftruncate_has_stat = false
|
||||
glusterfs_iocb_has_stat = false
|
||||
@@ -1802,6 +1822,7 @@ config_host_data.set('CONFIG_NUMA', numa
|
||||
config_host_data.set('CONFIG_OPENGL', opengl.found())
|
||||
config_host_data.set('CONFIG_PROFILER', get_option('profiler'))
|
||||
config_host_data.set('CONFIG_RBD', rbd.found())
|
||||
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
||||
config_host_data.set('CONFIG_RDMA', rdma.found())
|
||||
config_host_data.set('CONFIG_SDL', sdl.found())
|
||||
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
|
||||
@@ -3965,6 +3986,7 @@ if spice_protocol.found()
|
||||
summary_info += {' spice server support': spice}
|
||||
endif
|
||||
summary_info += {'rbd support': rbd}
|
||||
+summary_info += {'vitastor support': vitastor}
|
||||
summary_info += {'smartcard support': cacard}
|
||||
summary_info += {'U2F support': u2f}
|
||||
summary_info += {'libusb': libusb}
|
||||
Index: qemu/meson_options.txt
|
||||
===================================================================
|
||||
--- qemu.orig/meson_options.txt
|
||||
+++ qemu/meson_options.txt
|
||||
@@ -167,6 +167,8 @@ option('lzo', type : 'feature', value :
|
||||
description: 'lzo compression support')
|
||||
option('rbd', type : 'feature', value : 'auto',
|
||||
description: 'Ceph block device driver')
|
||||
+option('vitastor', type : 'feature', value : 'auto',
|
||||
+ description: 'Vitastor block device driver')
|
||||
option('opengl', type : 'feature', value : 'auto',
|
||||
description: 'OpenGL support')
|
||||
option('rdma', type : 'feature', value : 'auto',
|
||||
Index: qemu/qapi/block-core.json
|
||||
===================================================================
|
||||
--- qemu.orig/qapi/block-core.json
|
||||
+++ qemu/qapi/block-core.json
|
||||
@@ -3209,7 +3209,7 @@
|
||||
'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
|
||||
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
||||
'pbs',
|
||||
- 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
|
||||
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor', 'vmdk', 'vpc', 'vvfat' ] }
|
||||
|
||||
##
|
||||
# @BlockdevOptionsFile:
|
||||
@@ -4149,6 +4149,28 @@
|
||||
'*server': ['InetSocketAddressBase'] } }
|
||||
|
||||
##
|
||||
+# @BlockdevOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific block device options for vitastor
|
||||
+#
|
||||
+# @image: Image name
|
||||
+# @inode: Inode number
|
||||
+# @pool: Pool ID
|
||||
+# @size: Desired image size in bytes
|
||||
+# @config-path: Path to Vitastor configuration
|
||||
+# @etcd-host: etcd connection address(es)
|
||||
+# @etcd-prefix: etcd key/value prefix
|
||||
+##
|
||||
+{ 'struct': 'BlockdevOptionsVitastor',
|
||||
+ 'data': { '*inode': 'uint64',
|
||||
+ '*pool': 'uint64',
|
||||
+ '*size': 'uint64',
|
||||
+ '*image': 'str',
|
||||
+ '*config-path': 'str',
|
||||
+ '*etcd-host': 'str',
|
||||
+ '*etcd-prefix': 'str' } }
|
||||
+
|
||||
+##
|
||||
# @ReplicationMode:
|
||||
#
|
||||
# An enumeration of replication modes.
|
||||
@@ -4593,6 +4615,7 @@
|
||||
'throttle': 'BlockdevOptionsThrottle',
|
||||
'vdi': 'BlockdevOptionsGenericFormat',
|
||||
'vhdx': 'BlockdevOptionsGenericFormat',
|
||||
+ 'vitastor': 'BlockdevOptionsVitastor',
|
||||
'vmdk': 'BlockdevOptionsGenericCOWFormat',
|
||||
'vpc': 'BlockdevOptionsGenericFormat',
|
||||
'vvfat': 'BlockdevOptionsVVFAT'
|
||||
@@ -4985,6 +5008,17 @@
|
||||
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
||||
|
||||
##
|
||||
+# @BlockdevCreateOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific image creation options for Vitastor.
|
||||
+#
|
||||
+# @size: Size of the virtual disk in bytes
|
||||
+##
|
||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
||||
+ 'size': 'size' } }
|
||||
+
|
||||
+##
|
||||
# @BlockdevVmdkSubformat:
|
||||
#
|
||||
# Subformat options for VMDK images
|
||||
@@ -5182,6 +5216,7 @@
|
||||
'ssh': 'BlockdevCreateOptionsSsh',
|
||||
'vdi': 'BlockdevCreateOptionsVdi',
|
||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
||||
'vmdk': 'BlockdevCreateOptionsVmdk',
|
||||
'vpc': 'BlockdevCreateOptionsVpc'
|
||||
} }
|
||||
Index: qemu/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
===================================================================
|
||||
--- qemu.orig/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
+++ qemu/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
@@ -31,7 +31,7 @@
|
||||
--with-git=meson \
|
||||
--with-git-submodules=update \
|
||||
--target-list="x86_64-softmmu" \
|
||||
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||
--audio-drv-list="" \
|
||||
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
|
||||
--with-coroutine=ucontext \
|
||||
@@ -179,6 +179,7 @@
|
||||
--enable-opengl \
|
||||
--enable-pie \
|
||||
--enable-rbd \
|
||||
+--enable-vitastor \
|
||||
--enable-rdma \
|
||||
--enable-seccomp \
|
||||
--enable-snappy \
|
@@ -9,7 +9,7 @@ for i in "$DIR"/qemu-*-vitastor.patch "$DIR"/pve-qemu-*-vitastor.patch; do
|
||||
echo '===================================================================' >> $i
|
||||
echo '--- /dev/null' >> $i
|
||||
echo '+++ a/block/vitastor.c' >> $i
|
||||
echo '@@ -0,0 +1,'$(wc -l "$DIR"/../src/qemu_driver.c)' @@' >> $i
|
||||
echo '@@ -0,0 +1,'$(wc -l "$DIR"/../src/qemu_driver.c | cut -d ' ' -f 1)' @@' >> $i
|
||||
cat "$DIR"/../src/qemu_driver.c | sed 's/^/+/' >> $i
|
||||
fi
|
||||
done
|
||||
|
@@ -25,4 +25,4 @@ rm fio
|
||||
mv fio-copy fio
|
||||
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||
tar --transform 's#^#vitastor-0.8.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.0$(rpm --eval '%dist').tar.gz *
|
||||
tar --transform 's#^#vitastor-0.8.6/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.6$(rpm --eval '%dist').tar.gz *
|
||||
|
@@ -58,7 +58,7 @@
|
||||
+BuildRequires: gperftools-devel
|
||||
+BuildRequires: libusbx-devel >= 1.0.21
|
||||
%if %{have_usbredir}
|
||||
BuildRequires: usbredir-devel >= 0.8.0
|
||||
BuildRequires: usbredir-devel >= 0.8.2
|
||||
%endif
|
||||
@@ -856,12 +861,13 @@ BuildRequires: virglrenderer-devel
|
||||
# For smartcard NSS support
|
||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.8.0.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.8.6.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.8.0
|
||||
Version: 0.8.6
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.8.0.el7.tar.gz
|
||||
Source0: vitastor-0.8.6.el7.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
@@ -35,6 +35,7 @@ Summary: Vitastor - OSD
|
||||
Requires: libJerasure2
|
||||
Requires: libisa-l
|
||||
Requires: liburing >= 0.6
|
||||
Requires: liburing < 2
|
||||
Requires: vitastor-client = %{version}-%{release}
|
||||
Requires: util-linux
|
||||
Requires: parted
|
||||
@@ -59,6 +60,7 @@ scheduling cluster-level operations.
|
||||
%package -n vitastor-client
|
||||
Summary: Vitastor - client
|
||||
Requires: liburing >= 0.6
|
||||
Requires: liburing < 2
|
||||
|
||||
|
||||
%description -n vitastor-client
|
||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.8.0.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.8.6.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.8.0
|
||||
Version: 0.8.6
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.8.0.el8.tar.gz
|
||||
Source0: vitastor-0.8.6.el8.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
@@ -34,6 +34,7 @@ Summary: Vitastor - OSD
|
||||
Requires: libJerasure2
|
||||
Requires: libisa-l
|
||||
Requires: liburing >= 0.6
|
||||
Requires: liburing < 2
|
||||
Requires: vitastor-client = %{version}-%{release}
|
||||
Requires: util-linux
|
||||
Requires: parted
|
||||
@@ -57,6 +58,7 @@ scheduling cluster-level operations.
|
||||
%package -n vitastor-client
|
||||
Summary: Vitastor - client
|
||||
Requires: liburing >= 0.6
|
||||
Requires: liburing < 2
|
||||
|
||||
|
||||
%description -n vitastor-client
|
||||
|
@@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 2.8)
|
||||
project(vitastor)
|
||||
|
||||
include(GNUInstallDirs)
|
||||
include(CTest)
|
||||
|
||||
set(WITH_QEMU false CACHE BOOL "Build QEMU driver inside Vitastor source tree")
|
||||
set(WITH_FIO true CACHE BOOL "Build FIO driver")
|
||||
@@ -15,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||
endif()
|
||||
|
||||
add_definitions(-DVERSION="0.8.0")
|
||||
add_definitions(-DVERSION="0.8.6")
|
||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
|
||||
if (${WITH_ASAN})
|
||||
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
||||
@@ -55,6 +56,14 @@ if (ISAL_LIBRARIES)
|
||||
add_definitions(-DWITH_ISAL)
|
||||
endif (ISAL_LIBRARIES)
|
||||
|
||||
add_custom_target(build_tests)
|
||||
add_custom_target(test
|
||||
COMMAND
|
||||
echo leak:tcmalloc > ${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt &&
|
||||
env LSAN_OPTIONS=suppressions=${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt ${CMAKE_CTEST_COMMAND}
|
||||
)
|
||||
add_dependencies(test build_tests)
|
||||
|
||||
include_directories(
|
||||
../
|
||||
/usr/include/jerasure
|
||||
@@ -140,11 +149,11 @@ add_library(vitastor_client SHARED
|
||||
cli_merge.cpp
|
||||
cli_rm_data.cpp
|
||||
cli_rm.cpp
|
||||
cli_rm_osd.cpp
|
||||
)
|
||||
set_target_properties(vitastor_client PROPERTIES PUBLIC_HEADER "vitastor_c.h")
|
||||
target_link_libraries(vitastor_client
|
||||
vitastor_common
|
||||
tcmalloc_minimal
|
||||
${LIBURING_LIBRARIES}
|
||||
${IBVERBS_LIBRARIES}
|
||||
)
|
||||
@@ -234,8 +243,18 @@ add_executable(osd_test osd_test.cpp rw_blocking.cpp addr_util.cpp)
|
||||
target_link_libraries(osd_test tcmalloc_minimal)
|
||||
|
||||
# osd_rmw_test
|
||||
add_executable(osd_rmw_test osd_rmw_test.cpp allocator.cpp)
|
||||
add_executable(osd_rmw_test EXCLUDE_FROM_ALL osd_rmw_test.cpp allocator.cpp)
|
||||
target_link_libraries(osd_rmw_test Jerasure ${ISAL_LIBRARIES} tcmalloc_minimal)
|
||||
add_dependencies(build_tests osd_rmw_test)
|
||||
add_test(NAME osd_rmw_test COMMAND osd_rmw_test)
|
||||
|
||||
if (ISAL_LIBRARIES)
|
||||
add_executable(osd_rmw_test_je EXCLUDE_FROM_ALL osd_rmw_test.cpp allocator.cpp)
|
||||
target_compile_definitions(osd_rmw_test_je PUBLIC -DNO_ISAL)
|
||||
target_link_libraries(osd_rmw_test_je Jerasure tcmalloc_minimal)
|
||||
add_dependencies(build_tests osd_rmw_test_je)
|
||||
add_test(NAME osd_rmw_test_jerasure COMMAND osd_rmw_test_je)
|
||||
endif (ISAL_LIBRARIES)
|
||||
|
||||
# stub_uring_osd
|
||||
add_executable(stub_uring_osd
|
||||
@@ -249,11 +268,15 @@ target_link_libraries(stub_uring_osd
|
||||
)
|
||||
|
||||
# osd_peering_pg_test
|
||||
add_executable(osd_peering_pg_test osd_peering_pg_test.cpp osd_peering_pg.cpp)
|
||||
add_executable(osd_peering_pg_test EXCLUDE_FROM_ALL osd_peering_pg_test.cpp osd_peering_pg.cpp)
|
||||
target_link_libraries(osd_peering_pg_test tcmalloc_minimal)
|
||||
add_dependencies(build_tests osd_peering_pg_test)
|
||||
add_test(NAME osd_peering_pg_test COMMAND osd_peering_pg_test)
|
||||
|
||||
# test_allocator
|
||||
add_executable(test_allocator test_allocator.cpp allocator.cpp)
|
||||
add_executable(test_allocator EXCLUDE_FROM_ALL test_allocator.cpp allocator.cpp)
|
||||
add_dependencies(build_tests test_allocator)
|
||||
add_test(NAME test_allocator COMMAND test_allocator)
|
||||
|
||||
# test_cas
|
||||
add_executable(test_cas
|
||||
@@ -263,14 +286,25 @@ target_link_libraries(test_cas
|
||||
vitastor_client
|
||||
)
|
||||
|
||||
# test_crc32
|
||||
add_executable(test_crc32
|
||||
test_crc32.cpp
|
||||
)
|
||||
target_link_libraries(test_crc32
|
||||
vitastor_blk
|
||||
)
|
||||
|
||||
# test_cluster_client
|
||||
add_executable(test_cluster_client
|
||||
EXCLUDE_FROM_ALL
|
||||
test_cluster_client.cpp
|
||||
pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
|
||||
etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp
|
||||
)
|
||||
target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
|
||||
target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mock)
|
||||
add_dependencies(build_tests test_cluster_client)
|
||||
add_test(NAME test_cluster_client COMMAND test_cluster_client)
|
||||
|
||||
## test_blockstore, test_shit
|
||||
#add_executable(test_blockstore test_blockstore.cpp)
|
||||
|
@@ -35,24 +35,14 @@ journal_flusher_co::journal_flusher_co()
|
||||
{
|
||||
bs->live = true;
|
||||
if (data->res != data->iov.iov_len)
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"data read operation failed during flush ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
|
||||
"). can't continue, sorry :-("
|
||||
);
|
||||
}
|
||||
bs->disk_error_abort("read operation during flush", data->res, data->iov.iov_len);
|
||||
wait_count--;
|
||||
};
|
||||
simple_callback_w = [this](ring_data_t* data)
|
||||
{
|
||||
bs->live = true;
|
||||
if (data->res != data->iov.iov_len)
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
|
||||
"). state "+std::to_string(wait_state)+". in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
|
||||
);
|
||||
}
|
||||
bs->disk_error_abort("write operation during flush", data->res, data->iov.iov_len);
|
||||
wait_count--;
|
||||
};
|
||||
}
|
||||
@@ -87,7 +77,7 @@ void journal_flusher_t::loop()
|
||||
cur_flusher_count--;
|
||||
}
|
||||
}
|
||||
for (int i = 0; (active_flushers > 0 || dequeuing) && i < cur_flusher_count; i++)
|
||||
for (int i = 0; (active_flushers > 0 || dequeuing || trim_wanted > 0) && i < cur_flusher_count; i++)
|
||||
co[i].loop();
|
||||
}
|
||||
|
||||
@@ -172,7 +162,8 @@ void journal_flusher_t::mark_trim_possible()
|
||||
if (trim_wanted > 0)
|
||||
{
|
||||
dequeuing = true;
|
||||
journal_trim_counter++;
|
||||
if (!journal_trim_counter)
|
||||
journal_trim_counter = journal_trim_interval;
|
||||
bs->ringloop->wakeup();
|
||||
}
|
||||
}
|
||||
@@ -306,6 +297,8 @@ bool journal_flusher_co::loop()
|
||||
goto resume_20;
|
||||
else if (wait_state == 21)
|
||||
goto resume_21;
|
||||
else if (wait_state == 22)
|
||||
goto resume_22;
|
||||
resume_0:
|
||||
if (flusher->flush_queue.size() < flusher->min_flusher_count && !flusher->trim_wanted ||
|
||||
!flusher->flush_queue.size() || !flusher->dequeuing)
|
||||
@@ -511,6 +504,13 @@ resume_1:
|
||||
);
|
||||
wait_count++;
|
||||
}
|
||||
// Wait for data writes before fsyncing it
|
||||
resume_22:
|
||||
if (wait_count > 0)
|
||||
{
|
||||
wait_state = 22;
|
||||
return false;
|
||||
}
|
||||
// Sync data before writing metadata
|
||||
resume_16:
|
||||
resume_17:
|
||||
@@ -521,7 +521,7 @@ resume_1:
|
||||
return false;
|
||||
}
|
||||
resume_5:
|
||||
// And metadata writes, but only after data writes complete
|
||||
// Submit metadata writes, but only when data is written and fsynced
|
||||
if (!bs->inmemory_meta && meta_new.it->second.state == 0 || wait_count > 0)
|
||||
{
|
||||
// metadata sector is still being read or data is still being written, wait for it
|
||||
@@ -615,7 +615,12 @@ resume_1:
|
||||
}
|
||||
for (it = v.begin(); it != v.end(); it++)
|
||||
{
|
||||
free(it->buf);
|
||||
// Free it if it's not taken from the journal
|
||||
if (it->buf && (!bs->journal.inmemory || it->buf < bs->journal.buffer ||
|
||||
it->buf >= (uint8_t*)bs->journal.buffer + bs->journal.len))
|
||||
{
|
||||
free(it->buf);
|
||||
}
|
||||
}
|
||||
v.clear();
|
||||
// And sync metadata (in batches - not per each operation!)
|
||||
@@ -760,16 +765,17 @@ bool journal_flusher_co::scan_dirty(int wait_base)
|
||||
{
|
||||
submit_offset = dirty_it->second.location + offset - dirty_it->second.offset;
|
||||
submit_len = it == v.end() || it->offset >= end_offset ? end_offset-offset : it->offset-offset;
|
||||
it = v.insert(it, (copy_buffer_t){ .offset = offset, .len = submit_len, .buf = memalign_or_die(MEM_ALIGNMENT, submit_len) });
|
||||
it = v.insert(it, (copy_buffer_t){ .offset = offset, .len = submit_len });
|
||||
copy_count++;
|
||||
if (bs->journal.inmemory)
|
||||
{
|
||||
// Take it from memory
|
||||
memcpy(it->buf, (uint8_t*)bs->journal.buffer + submit_offset, submit_len);
|
||||
// Take it from memory, don't copy it
|
||||
it->buf = (uint8_t*)bs->journal.buffer + submit_offset;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Read it from disk
|
||||
it->buf = memalign_or_die(MEM_ALIGNMENT, submit_len);
|
||||
await_sqe(0);
|
||||
data->iov = (struct iovec){ it->buf, (size_t)submit_len };
|
||||
data->callback = simple_callback_r;
|
||||
|
@@ -107,7 +107,7 @@ void blockstore_impl_t::loop()
|
||||
// has_writes == 0 - no writes before the current queue item
|
||||
// has_writes == 1 - some writes in progress
|
||||
// has_writes == 2 - tried to submit some writes, but failed
|
||||
int has_writes = 0, op_idx = 0, new_idx = 0, done_lists = 0;
|
||||
int has_writes = 0, op_idx = 0, new_idx = 0;
|
||||
for (; op_idx < submit_queue.size(); op_idx++, new_idx++)
|
||||
{
|
||||
auto op = submit_queue[op_idx];
|
||||
@@ -188,16 +188,12 @@ void blockstore_impl_t::loop()
|
||||
else if (op->opcode == BS_OP_LIST)
|
||||
{
|
||||
// LIST doesn't have to be blocked by previous modifications
|
||||
// But don't do a lot of LISTs at once, because they're blocking and potentially slow
|
||||
if (single_tick_list_limit <= 0 || done_lists < single_tick_list_limit)
|
||||
{
|
||||
process_list(op);
|
||||
done_lists++;
|
||||
wr_st = 2;
|
||||
}
|
||||
process_list(op);
|
||||
wr_st = 2;
|
||||
}
|
||||
if (wr_st == 2)
|
||||
{
|
||||
submit_queue[op_idx] = NULL;
|
||||
new_idx--;
|
||||
}
|
||||
if (wr_st == 0)
|
||||
@@ -306,17 +302,6 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
|
||||
// do not submit
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Still waiting for a journal buffer\n");
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
PRIV(op)->wait_for = 0;
|
||||
}
|
||||
else if (PRIV(op)->wait_for == WAIT_FREE)
|
||||
{
|
||||
if (!data_alloc->get_free_count() && flusher->is_active())
|
||||
{
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Still waiting for free space on the data device\n");
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
@@ -340,7 +325,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
|
||||
{
|
||||
// Basic verification not passed
|
||||
op->retval = -EINVAL;
|
||||
std::function<void (blockstore_op_t*)>(op->callback)(op);
|
||||
ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
|
||||
return;
|
||||
}
|
||||
if (op->opcode == BS_OP_SYNC_STAB_ALL)
|
||||
@@ -383,7 +368,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
|
||||
}
|
||||
if ((op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE) && !enqueue_write(op))
|
||||
{
|
||||
std::function<void (blockstore_op_t*)>(op->callback)(op);
|
||||
ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
|
||||
return;
|
||||
}
|
||||
// Call constructor without allocating memory. We'll call destructor before returning op back
|
||||
@@ -598,7 +583,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
replace_stable(dirty_it->first.oid, 0, clean_stable_count, stable_count, stable);
|
||||
}
|
||||
}
|
||||
else if (IS_STABLE(dirty_it->second.state))
|
||||
else if (IS_STABLE(dirty_it->second.state) || (dirty_it->second.state & BS_ST_INSTANT))
|
||||
{
|
||||
// First try to replace a clean stable version in the first part of the list
|
||||
if (!replace_stable(dirty_it->first.oid, dirty_it->first.version, 0, clean_stable_count, stable))
|
||||
@@ -687,3 +672,16 @@ void blockstore_impl_t::dump_diagnostics()
|
||||
journal.dump_diagnostics();
|
||||
flusher->dump_diagnostics();
|
||||
}
|
||||
|
||||
void blockstore_impl_t::disk_error_abort(const char *op, int retval, int expected)
|
||||
{
|
||||
if (retval == -EAGAIN)
|
||||
{
|
||||
fprintf(stderr, "EAGAIN error received from a disk %s during flush."
|
||||
" It must never happen with io_uring and indicates a kernel bug."
|
||||
" Please upgrade your kernel. Aborting.\n", op);
|
||||
exit(1);
|
||||
}
|
||||
fprintf(stderr, "Disk %s failed: result is %d, expected %d. Can't continue, sorry :-(\n", op, retval, expected);
|
||||
exit(1);
|
||||
}
|
||||
|
@@ -160,12 +160,11 @@ struct __attribute__((__packed__)) dirty_entry
|
||||
#define WAIT_JOURNAL 3
|
||||
// Suspend operation until the next journal sector buffer is free
|
||||
#define WAIT_JOURNAL_BUFFER 4
|
||||
// Suspend operation until there is some free space on the data device
|
||||
#define WAIT_FREE 5
|
||||
|
||||
struct fulfill_read_t
|
||||
{
|
||||
uint64_t offset, len;
|
||||
uint64_t journal_sector; // sector+1 if used and !journal.inmemory, otherwise 0
|
||||
};
|
||||
|
||||
#define PRIV(op) ((blockstore_op_private_t*)(op)->private_data)
|
||||
@@ -241,8 +240,6 @@ class blockstore_impl_t
|
||||
int throttle_target_parallelism = 1;
|
||||
// Minimum difference in microseconds between target and real execution times to throttle the response
|
||||
int throttle_threshold_us = 50;
|
||||
// Maximum number of LIST operations to be processed between
|
||||
int single_tick_list_limit = 1;
|
||||
/******* END OF OPTIONS *******/
|
||||
|
||||
struct ring_consumer_t ring_consumer;
|
||||
@@ -293,6 +290,7 @@ class blockstore_impl_t
|
||||
// Journaling
|
||||
void prepare_journal_sector_write(int sector, blockstore_op_t *op);
|
||||
void handle_journal_write(ring_data_t *data, uint64_t flush_id);
|
||||
void disk_error_abort(const char *op, int retval, int expected);
|
||||
|
||||
// Asynchronous init
|
||||
int initialized;
|
||||
@@ -305,7 +303,7 @@ class blockstore_impl_t
|
||||
// Read
|
||||
int dequeue_read(blockstore_op_t *read_op);
|
||||
int fulfill_read(blockstore_op_t *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
|
||||
uint32_t item_state, uint64_t item_version, uint64_t item_location);
|
||||
uint32_t item_state, uint64_t item_version, uint64_t item_location, uint64_t journal_sector);
|
||||
int fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len,
|
||||
uint32_t item_state, uint64_t item_version);
|
||||
void handle_read_event(ring_data_t *data, blockstore_op_t *op);
|
||||
|
@@ -48,14 +48,12 @@ void blockstore_init_meta::handle_event(ring_data_t *data, int buf_num)
|
||||
|
||||
int blockstore_init_meta::loop()
|
||||
{
|
||||
if (wait_state == 1)
|
||||
goto resume_1;
|
||||
else if (wait_state == 2)
|
||||
goto resume_2;
|
||||
else if (wait_state == 3)
|
||||
goto resume_3;
|
||||
else if (wait_state == 4)
|
||||
goto resume_4;
|
||||
if (wait_state == 1) goto resume_1;
|
||||
else if (wait_state == 2) goto resume_2;
|
||||
else if (wait_state == 3) goto resume_3;
|
||||
else if (wait_state == 4) goto resume_4;
|
||||
else if (wait_state == 5) goto resume_5;
|
||||
else if (wait_state == 6) goto resume_6;
|
||||
printf("Reading blockstore metadata\n");
|
||||
if (bs->inmemory_meta)
|
||||
metadata_buffer = bs->metadata_buffer;
|
||||
@@ -140,6 +138,7 @@ resume_1:
|
||||
// Skip superblock
|
||||
md_offset = bs->dsk.meta_block_size;
|
||||
next_offset = md_offset;
|
||||
entries_per_block = bs->dsk.meta_block_size / bs->dsk.clean_entry_size;
|
||||
// Read the rest of the metadata
|
||||
resume_2:
|
||||
if (next_offset < bs->dsk.meta_len && submitted == 0)
|
||||
@@ -179,17 +178,15 @@ resume_2:
|
||||
if (bufs[i].state == INIT_META_READ_DONE)
|
||||
{
|
||||
// Handle result
|
||||
unsigned entries_per_block = bs->dsk.meta_block_size / bs->dsk.clean_entry_size;
|
||||
bool changed = false;
|
||||
for (uint64_t sector = 0; sector < bufs[i].size; sector += bs->dsk.meta_block_size)
|
||||
{
|
||||
// handle <count> entries
|
||||
changed = changed || handle_entries(
|
||||
bufs[i].buf + sector, entries_per_block,
|
||||
((bufs[i].offset + sector - md_offset) / bs->dsk.meta_block_size) * entries_per_block
|
||||
);
|
||||
if (handle_meta_block(bufs[i].buf + sector, entries_per_block,
|
||||
((bufs[i].offset + sector - md_offset) / bs->dsk.meta_block_size) * entries_per_block))
|
||||
changed = true;
|
||||
}
|
||||
if (changed && !bs->inmemory_meta)
|
||||
if (changed && !bs->inmemory_meta && !bs->readonly)
|
||||
{
|
||||
// write the modified buffer back
|
||||
GET_SQE();
|
||||
@@ -211,6 +208,43 @@ resume_2:
|
||||
wait_state = 2;
|
||||
return 1;
|
||||
}
|
||||
if (entries_to_zero.size() && !bs->inmemory_meta && !bs->readonly)
|
||||
{
|
||||
// we have to zero out additional entries
|
||||
for (i = 0; i < entries_to_zero.size(); )
|
||||
{
|
||||
next_offset = entries_to_zero[i]/entries_per_block;
|
||||
for (j = i; j < entries_to_zero.size() && entries_to_zero[j]/entries_per_block == next_offset; j++) {}
|
||||
GET_SQE();
|
||||
data->iov = { metadata_buffer, bs->dsk.meta_block_size };
|
||||
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
|
||||
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
|
||||
submitted++;
|
||||
resume_5:
|
||||
if (submitted > 0)
|
||||
{
|
||||
wait_state = 5;
|
||||
return 1;
|
||||
}
|
||||
for (; i < j; i++)
|
||||
{
|
||||
uint64_t pos = (entries_to_zero[i] % entries_per_block);
|
||||
memset((uint8_t*)metadata_buffer + pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
|
||||
}
|
||||
GET_SQE();
|
||||
data->iov = { metadata_buffer, bs->dsk.meta_block_size };
|
||||
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
|
||||
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
|
||||
submitted++;
|
||||
resume_6:
|
||||
if (submitted > 0)
|
||||
{
|
||||
wait_state = 6;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
entries_to_zero.clear();
|
||||
}
|
||||
// metadata read finished
|
||||
printf("Metadata entries loaded: %lu, free blocks: %lu / %lu\n", entries_loaded, bs->data_alloc->get_free_count(), bs->dsk.block_count);
|
||||
if (!bs->inmemory_meta)
|
||||
@@ -236,10 +270,13 @@ resume_2:
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool blockstore_init_meta::handle_entries(uint8_t *buf, uint64_t count, uint64_t done_cnt)
|
||||
bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_block, uint64_t done_cnt)
|
||||
{
|
||||
bool updated = false;
|
||||
for (uint64_t i = 0; i < count; i++)
|
||||
uint64_t max_i = entries_per_block;
|
||||
if (max_i > bs->dsk.block_count-done_cnt)
|
||||
max_i = bs->dsk.block_count-done_cnt;
|
||||
for (uint64_t i = 0; i < max_i; i++)
|
||||
{
|
||||
clean_disk_entry *entry = (clean_disk_entry*)(buf + i*bs->dsk.clean_entry_size);
|
||||
if (!bs->inmemory_meta && bs->dsk.clean_entry_bitmap_size)
|
||||
@@ -255,17 +292,35 @@ bool blockstore_init_meta::handle_entries(uint8_t *buf, uint64_t count, uint64_t
|
||||
if (clean_it != clean_db.end())
|
||||
{
|
||||
// free the previous block
|
||||
// here we have to zero out the entry because otherwise we'll hit
|
||||
// here we have to zero out the previous entry because otherwise we'll hit
|
||||
// "tried to overwrite non-zero metadata entry" later
|
||||
updated = true;
|
||||
memset(entry, 0, bs->dsk.clean_entry_size);
|
||||
uint64_t old_clean_loc = clean_it->second.location >> bs->dsk.block_order;
|
||||
if (bs->inmemory_meta)
|
||||
{
|
||||
uint64_t sector = (old_clean_loc / entries_per_block) * bs->dsk.meta_block_size;
|
||||
uint64_t pos = (old_clean_loc % entries_per_block);
|
||||
clean_disk_entry *old_entry = (clean_disk_entry*)((uint8_t*)bs->metadata_buffer + sector + pos*bs->dsk.clean_entry_size);
|
||||
memset(old_entry, 0, bs->dsk.clean_entry_size);
|
||||
}
|
||||
else if (old_clean_loc >= done_cnt)
|
||||
{
|
||||
updated = true;
|
||||
uint64_t sector = ((old_clean_loc - done_cnt) / entries_per_block) * bs->dsk.meta_block_size;
|
||||
uint64_t pos = (old_clean_loc % entries_per_block);
|
||||
clean_disk_entry *old_entry = (clean_disk_entry*)(buf + sector + pos*bs->dsk.clean_entry_size);
|
||||
memset(old_entry, 0, bs->dsk.clean_entry_size);
|
||||
}
|
||||
else
|
||||
{
|
||||
entries_to_zero.push_back(clean_it->second.location >> bs->dsk.block_order);
|
||||
}
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Free block %lu from %lx:%lx v%lu (new location is %lu)\n",
|
||||
clean_it->second.location >> bs->dsk.block_order,
|
||||
old_clean_loc,
|
||||
clean_it->first.inode, clean_it->first.stripe, clean_it->second.version,
|
||||
done_cnt+i);
|
||||
#endif
|
||||
bs->data_alloc->set(clean_it->second.location >> bs->dsk.block_order, false);
|
||||
bs->data_alloc->set(old_clean_loc, false);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@@ -24,7 +24,10 @@ class blockstore_init_meta
|
||||
uint64_t md_offset = 0;
|
||||
uint64_t next_offset = 0;
|
||||
uint64_t entries_loaded = 0;
|
||||
bool handle_entries(uint8_t *buf, uint64_t count, uint64_t done_cnt);
|
||||
unsigned entries_per_block = 0;
|
||||
int i = 0, j = 0;
|
||||
std::vector<uint64_t> entries_to_zero;
|
||||
bool handle_meta_block(uint8_t *buf, uint64_t count, uint64_t done_cnt);
|
||||
void handle_event(ring_data_t *data, int buf_num);
|
||||
public:
|
||||
blockstore_init_meta(blockstore_impl_t *bs);
|
||||
|
@@ -198,10 +198,7 @@ void blockstore_impl_t::handle_journal_write(ring_data_t *data, uint64_t flush_i
|
||||
if (data->res != data->iov.iov_len)
|
||||
{
|
||||
// FIXME: our state becomes corrupted after a write error. maybe do something better than just die
|
||||
throw std::runtime_error(
|
||||
"journal write failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
|
||||
"). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
|
||||
);
|
||||
disk_error_abort("journal write", data->res, data->iov.iov_len);
|
||||
}
|
||||
auto fl_it = journal.flushing_ops.upper_bound((pending_journaling_t){ .flush_id = flush_id });
|
||||
if (fl_it != journal.flushing_ops.end() && fl_it->flush_id == flush_id)
|
||||
|
@@ -16,6 +16,7 @@
|
||||
// FIXME: This value should be dynamic i.e. Blockstore ideally shouldn't allow
|
||||
// writing more than can be stabilized afterwards
|
||||
#define JOURNAL_STABILIZE_RESERVATION 65536
|
||||
#define JOURNAL_INSTANT_RESERVATION 131072
|
||||
|
||||
// Journal entries
|
||||
// Journal entries are linked to each other by their crc32 value
|
||||
|
@@ -42,7 +42,7 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
|
||||
|
||||
// FIXME I've seen a bug here so I want some tests
|
||||
int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
|
||||
uint32_t item_state, uint64_t item_version, uint64_t item_location)
|
||||
uint32_t item_state, uint64_t item_version, uint64_t item_location, uint64_t journal_sector)
|
||||
{
|
||||
uint32_t cur_start = item_start;
|
||||
if (cur_start < read_op->offset + read_op->len && item_end > read_op->offset)
|
||||
@@ -72,6 +72,7 @@ int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op, uint64_t &fulfille
|
||||
fulfill_read_t el = {
|
||||
.offset = cur_start,
|
||||
.len = it == PRIV(read_op)->read_vec.end() || it->offset >= item_end ? item_end-cur_start : it->offset-cur_start,
|
||||
.journal_sector = journal_sector,
|
||||
};
|
||||
it = PRIV(read_op)->read_vec.insert(it, el);
|
||||
if (!fulfill_read_push(read_op,
|
||||
@@ -138,7 +139,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||
while (dirty_it->first.oid == read_op->oid)
|
||||
{
|
||||
dirty_entry& dirty = dirty_it->second;
|
||||
bool version_ok = read_op->version >= dirty_it->first.version;
|
||||
bool version_ok = !IS_IN_FLIGHT(dirty.state) && read_op->version >= dirty_it->first.version;
|
||||
if (IS_SYNCED(dirty.state))
|
||||
{
|
||||
if (!version_ok && read_op->version != 0)
|
||||
@@ -156,8 +157,10 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||
memcpy(read_op->bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
|
||||
}
|
||||
}
|
||||
// If inmemory_journal is false, journal trim will have to wait until the read is completed
|
||||
if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len,
|
||||
dirty.state, dirty_it->first.version, dirty.location + (IS_JOURNAL(dirty.state) ? 0 : dirty.offset)))
|
||||
dirty.state, dirty_it->first.version, dirty.location + (IS_JOURNAL(dirty.state) ? 0 : dirty.offset),
|
||||
(IS_JOURNAL(dirty.state) ? dirty.journal_sector+1 : 0)))
|
||||
{
|
||||
// need to wait. undo added requests, don't dequeue op
|
||||
PRIV(read_op)->read_vec.clear();
|
||||
@@ -171,7 +174,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||
dirty_it--;
|
||||
}
|
||||
}
|
||||
if (clean_it != clean_db.end())
|
||||
if (clean_found)
|
||||
{
|
||||
if (!result_version)
|
||||
{
|
||||
@@ -186,7 +189,8 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||
{
|
||||
if (!dsk.clean_entry_bitmap_size)
|
||||
{
|
||||
if (!fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0, clean_it->second.location))
|
||||
if (!fulfill_read(read_op, fulfilled, 0, dsk.data_block_size,
|
||||
(BS_ST_BIG_WRITE | BS_ST_STABLE), 0, clean_it->second.location, 0))
|
||||
{
|
||||
// need to wait. undo added requests, don't dequeue op
|
||||
PRIV(read_op)->read_vec.clear();
|
||||
@@ -207,7 +211,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||
{
|
||||
// fill with zeroes
|
||||
assert(fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
|
||||
bmp_end * dsk.bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0));
|
||||
bmp_end * dsk.bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0));
|
||||
}
|
||||
bmp_start = bmp_end;
|
||||
while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
|
||||
@@ -218,7 +222,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||
{
|
||||
if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
|
||||
bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
|
||||
clean_it->second.location + bmp_start * dsk.bitmap_granularity))
|
||||
clean_it->second.location + bmp_start * dsk.bitmap_granularity, 0))
|
||||
{
|
||||
// need to wait. undo added requests, don't dequeue op
|
||||
PRIV(read_op)->read_vec.clear();
|
||||
@@ -233,7 +237,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||
else if (fulfilled < read_op->len)
|
||||
{
|
||||
// fill remaining parts with zeroes
|
||||
assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0));
|
||||
assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0));
|
||||
}
|
||||
assert(fulfilled == read_op->len);
|
||||
read_op->version = result_version;
|
||||
@@ -249,6 +253,15 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
||||
FINISH_OP(read_op);
|
||||
return 2;
|
||||
}
|
||||
if (!journal.inmemory)
|
||||
{
|
||||
// Journal trim has to wait until the read is completed - record journal sector usage
|
||||
for (auto & rv: PRIV(read_op)->read_vec)
|
||||
{
|
||||
if (rv.journal_sector)
|
||||
journal.used_sectors[rv.journal_sector-1]++;
|
||||
}
|
||||
}
|
||||
read_op->retval = 0;
|
||||
return 2;
|
||||
}
|
||||
@@ -264,6 +277,22 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
|
||||
}
|
||||
if (PRIV(op)->pending_ops == 0)
|
||||
{
|
||||
if (!journal.inmemory)
|
||||
{
|
||||
// Release journal sector usage
|
||||
for (auto & rv: PRIV(op)->read_vec)
|
||||
{
|
||||
if (rv.journal_sector)
|
||||
{
|
||||
auto used = --journal.used_sectors[rv.journal_sector-1];
|
||||
if (used == 0)
|
||||
{
|
||||
journal.used_sectors.erase(rv.journal_sector-1);
|
||||
flusher->mark_trim_possible();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (op->retval == 0)
|
||||
op->retval = op->len;
|
||||
FINISH_OP(op);
|
||||
|
@@ -127,7 +127,6 @@ resume_4:
|
||||
{
|
||||
mark_rolled_back(*v);
|
||||
}
|
||||
flusher->mark_trim_possible();
|
||||
// Acknowledge op
|
||||
op->retval = 0;
|
||||
FINISH_OP(op);
|
||||
@@ -222,7 +221,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
|
||||
#endif
|
||||
data_alloc->set(dirty_it->second.location >> dsk.block_order, false);
|
||||
}
|
||||
int used = --journal.used_sectors[dirty_it->second.journal_sector];
|
||||
auto used = --journal.used_sectors[dirty_it->second.journal_sector];
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"remove usage of journal offset %08lx by %lx:%lx v%lu (%d refs)\n", dirty_it->second.journal_sector,
|
||||
@@ -232,6 +231,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
|
||||
if (used == 0)
|
||||
{
|
||||
journal.used_sectors.erase(dirty_it->second.journal_sector);
|
||||
flusher->mark_trim_possible();
|
||||
}
|
||||
if (dsk.clean_entry_bitmap_size > sizeof(void*))
|
||||
{
|
||||
|
@@ -89,6 +89,9 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||
else
|
||||
{
|
||||
// Invalid version requested
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Write %lx:%lx v%lu requested, but we already have v%lu\n", op->oid.inode, op->oid.stripe, op->version, version);
|
||||
#endif
|
||||
op->retval = -EEXIST;
|
||||
if (!is_del && dsk.clean_entry_bitmap_size > sizeof(void*))
|
||||
{
|
||||
@@ -115,8 +118,8 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||
else if (!wait_del)
|
||||
printf("Write %lx:%lx v%lu offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
|
||||
#endif
|
||||
// FIXME No strict need to add it into dirty_db here, it's just left
|
||||
// from the previous implementation where reads waited for writes
|
||||
// No strict need to add it into dirty_db here except maybe for listings to return
|
||||
// correct data when there are inflight operations in the queue
|
||||
uint32_t state;
|
||||
if (is_del)
|
||||
state = BS_ST_DELETE | BS_ST_IN_FLIGHT;
|
||||
@@ -139,7 +142,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
||||
uint8_t *bmp_ptr = (uint8_t*)(dsk.clean_entry_bitmap_size > sizeof(void*) ? bmp : &bmp);
|
||||
uint32_t bit = op->offset/dsk.bitmap_granularity;
|
||||
uint32_t bits_left = op->len/dsk.bitmap_granularity;
|
||||
while (!(bit % 8) && bits_left > 8)
|
||||
while (!(bit % 8) && bits_left >= 8)
|
||||
{
|
||||
// Copy bytes
|
||||
bmp_ptr[bit/8] = ((uint8_t*)op->bitmap)[bit/8];
|
||||
@@ -182,8 +185,15 @@ void blockstore_impl_t::cancel_all_writes(blockstore_op_t *op, blockstore_dirty_
|
||||
bool found = false;
|
||||
for (auto other_op: submit_queue)
|
||||
{
|
||||
if (!found && other_op == op)
|
||||
if (!other_op)
|
||||
{
|
||||
// freed operations during submitting are zeroed
|
||||
}
|
||||
else if (other_op == op)
|
||||
{
|
||||
// <op> may be present in queue multiple times due to moving operations in submit_queue
|
||||
found = true;
|
||||
}
|
||||
else if (found && other_op->oid == op->oid &&
|
||||
(other_op->opcode == BS_OP_WRITE || other_op->opcode == BS_OP_WRITE_STABLE))
|
||||
{
|
||||
@@ -251,7 +261,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
{
|
||||
blockstore_journal_check_t space_check(this);
|
||||
if (!space_check.check_available(op, unsynced_big_write_count + 1,
|
||||
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
|
||||
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
|
||||
(dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
@@ -260,12 +271,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
if (loc == UINT64_MAX)
|
||||
{
|
||||
// no space
|
||||
if (flusher->is_active())
|
||||
{
|
||||
// hope that some space will be available after flush
|
||||
PRIV(op)->wait_for = WAIT_FREE;
|
||||
return 0;
|
||||
}
|
||||
cancel_all_writes(op, dirty_it, -ENOSPC);
|
||||
return 2;
|
||||
}
|
||||
@@ -337,7 +342,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
!space_check.check_available(op, unsynced_big_write_count,
|
||||
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, 0)
|
||||
|| !space_check.check_available(op, 1,
|
||||
sizeof(journal_entry_small_write) + dsk.clean_entry_bitmap_size, op->len + JOURNAL_STABILIZE_RESERVATION))
|
||||
sizeof(journal_entry_small_write) + dsk.clean_entry_bitmap_size,
|
||||
op->len + ((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
@@ -448,12 +454,19 @@ int blockstore_impl_t::continue_write(blockstore_op_t *op)
|
||||
resume_2:
|
||||
// Only for the immediate_commit mode: prepare and submit big_write journal entry
|
||||
{
|
||||
BS_SUBMIT_CHECK_SQES(1);
|
||||
auto dirty_it = dirty_db.find((obj_ver_id){
|
||||
.oid = op->oid,
|
||||
.version = op->version,
|
||||
});
|
||||
assert(dirty_it != dirty_db.end());
|
||||
blockstore_journal_check_t space_check(this);
|
||||
if (!space_check.check_available(op, 1,
|
||||
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
|
||||
((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
BS_SUBMIT_CHECK_SQES(1);
|
||||
journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
|
||||
journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
|
||||
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size
|
||||
@@ -585,10 +598,7 @@ void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *o
|
||||
if (data->res != data->iov.iov_len)
|
||||
{
|
||||
// FIXME: our state becomes corrupted after a write error. maybe do something better than just die
|
||||
throw std::runtime_error(
|
||||
"write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
|
||||
"). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
|
||||
);
|
||||
disk_error_abort("data write", data->res, data->iov.iov_len);
|
||||
}
|
||||
PRIV(op)->pending_ops--;
|
||||
assert(PRIV(op)->pending_ops >= 0);
|
||||
@@ -643,7 +653,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
|
||||
});
|
||||
assert(dirty_it != dirty_db.end());
|
||||
blockstore_journal_check_t space_check(this);
|
||||
if (!space_check.check_available(op, 1, sizeof(journal_entry_del), JOURNAL_STABILIZE_RESERVATION))
|
||||
if (!space_check.check_available(op, 1, sizeof(journal_entry_del), JOURNAL_INSTANT_RESERVATION))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
48
src/cli.cpp
48
src/cli.cpp
@@ -76,6 +76,12 @@ static const char* help_text =
|
||||
"vitastor-cli alloc-osd\n"
|
||||
" Allocate a new OSD number and reserve it by creating empty /osd/stats/<n> key.\n"
|
||||
"\n"
|
||||
"vitastor-cli rm-osd [--force] [--allow-data-loss] [--dry-run] <osd_id> [osd_id...]\n"
|
||||
" Remove metadata and configuration for specified OSD(s) from etcd.\n"
|
||||
" Refuses to remove OSDs with data without --force and --allow-data-loss.\n"
|
||||
" With --dry-run only checks if deletion is possible without data loss and\n"
|
||||
" redundancy degradation.\n"
|
||||
"\n"
|
||||
"Use vitastor-cli --help <command> for command details or vitastor-cli --help --all for all details.\n"
|
||||
"\n"
|
||||
"GLOBAL OPTIONS:\n"
|
||||
@@ -95,43 +101,47 @@ static json11::Json::object parse_args(int narg, const char *args[])
|
||||
cfg["progress"] = "1";
|
||||
for (int i = 1; i < narg; i++)
|
||||
{
|
||||
if (args[i][0] == '-' && args[i][1] == 'h')
|
||||
if (args[i][0] == '-' && args[i][1] == 'h' && args[i][2] == 0)
|
||||
{
|
||||
cfg["help"] = "1";
|
||||
}
|
||||
else if (args[i][0] == '-' && args[i][1] == 'l')
|
||||
else if (args[i][0] == '-' && args[i][1] == 'l' && args[i][2] == 0)
|
||||
{
|
||||
cfg["long"] = "1";
|
||||
}
|
||||
else if (args[i][0] == '-' && args[i][1] == 'n')
|
||||
else if (args[i][0] == '-' && args[i][1] == 'n' && args[i][2] == 0)
|
||||
{
|
||||
cfg["count"] = args[++i];
|
||||
}
|
||||
else if (args[i][0] == '-' && args[i][1] == 'p')
|
||||
else if (args[i][0] == '-' && args[i][1] == 'p' && args[i][2] == 0)
|
||||
{
|
||||
cfg["pool"] = args[++i];
|
||||
}
|
||||
else if (args[i][0] == '-' && args[i][1] == 's')
|
||||
else if (args[i][0] == '-' && args[i][1] == 's' && args[i][2] == 0)
|
||||
{
|
||||
cfg["size"] = args[++i];
|
||||
}
|
||||
else if (args[i][0] == '-' && args[i][1] == 'r')
|
||||
else if (args[i][0] == '-' && args[i][1] == 'r' && args[i][2] == 0)
|
||||
{
|
||||
cfg["reverse"] = "1";
|
||||
}
|
||||
else if (args[i][0] == '-' && args[i][1] == 'f')
|
||||
else if (args[i][0] == '-' && args[i][1] == 'f' && args[i][2] == 0)
|
||||
{
|
||||
cfg["force"] = "1";
|
||||
}
|
||||
else if (args[i][0] == '-' && args[i][1] == '-')
|
||||
{
|
||||
const char *opt = args[i]+2;
|
||||
cfg[opt] = i == narg-1 || !strcmp(opt, "json") || !strcmp(opt, "wait-list") ||
|
||||
!strcmp(opt, "long") || !strcmp(opt, "del") || !strcmp(opt, "no-color") ||
|
||||
cfg[opt] = i == narg-1 || !strcmp(opt, "json") ||
|
||||
!strcmp(opt, "wait-list") || !strcmp(opt, "wait_list") ||
|
||||
!strcmp(opt, "long") || !strcmp(opt, "del") ||
|
||||
!strcmp(opt, "no-color") || !strcmp(opt, "no_color") ||
|
||||
!strcmp(opt, "readonly") || !strcmp(opt, "readwrite") ||
|
||||
!strcmp(opt, "force") || !strcmp(opt, "reverse") ||
|
||||
!strcmp(opt, "allow-data-loss") || !strcmp(opt, "allow_data_loss") ||
|
||||
!strcmp(opt, "dry-run") || !strcmp(opt, "dry_run") ||
|
||||
!strcmp(opt, "help") || !strcmp(opt, "all") ||
|
||||
!strcmp(opt, "writers-stopped") && strcmp("1", args[i+1]) != 0
|
||||
(!strcmp(opt, "writers-stopped") || !strcmp(opt, "writers_stopped")) && strcmp("1", args[i+1]) != 0
|
||||
? "1" : args[++i];
|
||||
}
|
||||
else
|
||||
@@ -139,10 +149,6 @@ static json11::Json::object parse_args(int narg, const char *args[])
|
||||
cmd.push_back(std::string(args[i]));
|
||||
}
|
||||
}
|
||||
if (cfg["help"].bool_value())
|
||||
{
|
||||
print_help(help_text, "vitastor-cli", cmd.size() ? cmd[0].string_value() : "", cfg["all"].bool_value());
|
||||
}
|
||||
if (!cmd.size())
|
||||
{
|
||||
std::string exe(exe_name);
|
||||
@@ -151,6 +157,10 @@ static json11::Json::object parse_args(int narg, const char *args[])
|
||||
cmd.push_back("rm-data");
|
||||
}
|
||||
}
|
||||
if (!cmd.size() || cfg["help"].bool_value())
|
||||
{
|
||||
print_help(help_text, "vitastor-cli", cmd.size() ? cmd[0].string_value() : "", cfg["all"].bool_value());
|
||||
}
|
||||
cfg["command"] = cmd;
|
||||
return cfg;
|
||||
}
|
||||
@@ -225,6 +235,16 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
|
||||
// Delete inode data
|
||||
action_cb = p->start_rm_data(cfg);
|
||||
}
|
||||
else if (cmd[0] == "rm-osd")
|
||||
{
|
||||
// Delete OSD metadata from etcd
|
||||
if (cmd.size() > 1)
|
||||
{
|
||||
cmd.erase(cmd.begin(), cmd.begin()+1);
|
||||
cfg["osd_id"] = cmd;
|
||||
}
|
||||
action_cb = p->start_rm_osd(cfg);
|
||||
}
|
||||
else if (cmd[0] == "merge-data")
|
||||
{
|
||||
// Merge layer data without affecting metadata
|
||||
|
@@ -45,7 +45,7 @@ public:
|
||||
cli_result_t etcd_err;
|
||||
json11::Json etcd_result;
|
||||
|
||||
void parse_config(json11::Json cfg);
|
||||
void parse_config(json11::Json::object & cfg);
|
||||
|
||||
void change_parent(inode_t cur, inode_t new_parent, cli_result_t *result);
|
||||
inode_config_t* get_inode_cfg(const std::string & name);
|
||||
@@ -64,6 +64,7 @@ public:
|
||||
std::function<bool(cli_result_t &)> start_merge(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_flatten(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_rm(json11::Json);
|
||||
std::function<bool(cli_result_t &)> start_rm_osd(json11::Json cfg);
|
||||
std::function<bool(cli_result_t &)> start_alloc_osd(json11::Json cfg);
|
||||
|
||||
// Should be called like loop_and_wait(start_status(), <completion callback>)
|
||||
|
@@ -100,9 +100,20 @@ inode_config_t* cli_tool_t::get_inode_cfg(const std::string & name)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void cli_tool_t::parse_config(json11::Json cfg)
|
||||
void cli_tool_t::parse_config(json11::Json::object & cfg)
|
||||
{
|
||||
color = !cfg["no-color"].bool_value();
|
||||
for (auto kv_it = cfg.begin(); kv_it != cfg.end();)
|
||||
{
|
||||
// Translate all options with - to _
|
||||
if (kv_it->first.find("-") != std::string::npos)
|
||||
{
|
||||
cfg[str_replace(kv_it->first, "-", "_")] = kv_it->second;
|
||||
cfg.erase(kv_it++);
|
||||
}
|
||||
else
|
||||
kv_it++;
|
||||
}
|
||||
color = !cfg["no_color"].bool_value();
|
||||
json_output = cfg["json"].bool_value();
|
||||
iodepth = cfg["iodepth"].uint64_value();
|
||||
if (!iodepth)
|
||||
@@ -112,7 +123,7 @@ void cli_tool_t::parse_config(json11::Json cfg)
|
||||
parallel_osds = 4;
|
||||
log_level = cfg["log_level"].int64_value();
|
||||
progress = cfg["progress"].uint64_value() ? true : false;
|
||||
list_first = cfg["wait-list"].uint64_value() ? true : false;
|
||||
list_first = cfg["wait_list"].uint64_value() ? true : false;
|
||||
}
|
||||
|
||||
struct cli_result_looper_t
|
||||
|
@@ -517,7 +517,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_create(json11::Json cfg)
|
||||
image_creator->force_size = cfg["force_size"].bool_value();
|
||||
if (cfg["image_meta"].is_object())
|
||||
{
|
||||
image_creator->new_meta = cfg["image-meta"];
|
||||
image_creator->new_meta = cfg["image_meta"];
|
||||
}
|
||||
if (cfg["snapshot"].string_value() != "")
|
||||
{
|
||||
|
@@ -121,8 +121,7 @@ resume_1:
|
||||
}
|
||||
if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
uint64_t pg_real_size = pool_stats[pool_cfg.id]["pg_real_size"].uint64_value();
|
||||
pool_avail = pg_real_size > 0 ? pool_avail * (pool_cfg.pg_size - pool_cfg.parity_chunks) / pg_real_size : 0;
|
||||
pool_avail *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
|
||||
}
|
||||
pool_stats[pool_cfg.id] = json11::Json::object {
|
||||
{ "name", pool_cfg.name },
|
||||
|
@@ -133,7 +133,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_flatten(json11::Json cfg)
|
||||
auto flattener = new snap_flattener_t();
|
||||
flattener->parent = this;
|
||||
flattener->target_name = cfg["image"].string_value();
|
||||
flattener->fsync_interval = cfg["fsync-interval"].uint64_value();
|
||||
flattener->fsync_interval = cfg["fsync_interval"].uint64_value();
|
||||
if (!flattener->fsync_interval)
|
||||
flattener->fsync_interval = 128;
|
||||
if (!cfg["cas"].is_null())
|
||||
|
@@ -403,7 +403,7 @@ struct snap_merger_t
|
||||
op->opcode = OSD_OP_READ_BITMAP;
|
||||
op->inode = target;
|
||||
op->offset = offset;
|
||||
op->len = 0;
|
||||
op->len = target_block_size;
|
||||
op->callback = [this](cluster_op_t *op)
|
||||
{
|
||||
if (op->retval < 0)
|
||||
@@ -631,8 +631,8 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_merge(json11::Json cfg)
|
||||
merger->from_name = cfg["from"].string_value();
|
||||
merger->to_name = cfg["to"].string_value();
|
||||
merger->target_name = cfg["target"].string_value();
|
||||
merger->delete_source = cfg["delete-source"].string_value() != "";
|
||||
merger->fsync_interval = cfg["fsync-interval"].uint64_value();
|
||||
merger->delete_source = cfg["delete_source"].string_value() != "";
|
||||
merger->fsync_interval = cfg["fsync_interval"].uint64_value();
|
||||
if (!merger->fsync_interval)
|
||||
merger->fsync_interval = 128;
|
||||
if (!cfg["cas"].is_null())
|
||||
|
@@ -236,7 +236,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_modify(json11::Json cfg)
|
||||
changer->force = cfg["force"].bool_value();
|
||||
changer->set_readonly = cfg["readonly"].bool_value();
|
||||
changer->set_readwrite = cfg["readwrite"].bool_value();
|
||||
changer->fsync_interval = cfg["fsync-interval"].uint64_value();
|
||||
changer->fsync_interval = cfg["fsync_interval"].uint64_value();
|
||||
if (!changer->fsync_interval)
|
||||
changer->fsync_interval = 128;
|
||||
// FIXME Check that the image doesn't have children when shrinking
|
||||
|
@@ -639,7 +639,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_rm(json11::Json cfg)
|
||||
snap_remover->parent = this;
|
||||
snap_remover->from_name = cfg["from"].string_value();
|
||||
snap_remover->to_name = cfg["to"].string_value();
|
||||
snap_remover->fsync_interval = cfg["fsync-interval"].uint64_value();
|
||||
snap_remover->fsync_interval = cfg["fsync_interval"].uint64_value();
|
||||
if (!snap_remover->fsync_interval)
|
||||
snap_remover->fsync_interval = 128;
|
||||
if (!cfg["cas"].is_null())
|
||||
|
@@ -92,6 +92,7 @@ struct rm_inode_t
|
||||
|
||||
void send_ops(rm_pg_t *cur_list)
|
||||
{
|
||||
parent->cli->init_msgr();
|
||||
if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
|
||||
parent->cli->msgr.osd_peer_fds.end())
|
||||
{
|
||||
@@ -218,7 +219,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_rm_data(json11::Json cfg)
|
||||
remover->inode = (remover->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (((uint64_t)remover->pool_id) << (64-POOL_ID_BITS));
|
||||
}
|
||||
remover->pool_id = INODE_POOL(remover->inode);
|
||||
remover->min_offset = cfg["min-offset"].uint64_value();
|
||||
remover->min_offset = cfg["min_offset"].uint64_value();
|
||||
return [remover](cli_result_t & result)
|
||||
{
|
||||
remover->loop();
|
||||
|
491
src/cli_rm_osd.cpp
Normal file
491
src/cli_rm_osd.cpp
Normal file
@@ -0,0 +1,491 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include <ctype.h>
|
||||
#include "cli.h"
|
||||
#include "cluster_client.h"
|
||||
#include "str_util.h"
|
||||
#include "epoll_manager.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
// Delete OSD metadata from etcd
|
||||
struct rm_osd_t
|
||||
{
|
||||
cli_tool_t *parent;
|
||||
|
||||
bool dry_run, force_warning, force_dataloss;
|
||||
uint64_t etcd_tx_retry_ms = 500;
|
||||
uint64_t etcd_tx_retries = 10000;
|
||||
std::vector<uint64_t> osd_ids;
|
||||
|
||||
int state = 0;
|
||||
cli_result_t result;
|
||||
|
||||
std::set<uint64_t> to_remove;
|
||||
std::set<uint64_t> to_restart;
|
||||
json11::Json::array pool_effects;
|
||||
json11::Json::array history_updates, history_checks;
|
||||
json11::Json new_pgs, new_clean_pgs;
|
||||
uint64_t new_pgs_mod_rev, new_clean_pgs_mod_rev;
|
||||
uint64_t cur_retry = 0;
|
||||
uint64_t retry_wait = 0;
|
||||
bool is_warning, is_dataloss;
|
||||
|
||||
bool is_done()
|
||||
{
|
||||
return state == 100;
|
||||
}
|
||||
|
||||
void loop()
|
||||
{
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
else if (state == 2)
|
||||
goto resume_2;
|
||||
else if (state == 3)
|
||||
goto resume_3;
|
||||
else if (state == 4)
|
||||
goto resume_4;
|
||||
if (!osd_ids.size())
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "OSD numbers are not specified" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
for (auto osd_id: osd_ids)
|
||||
{
|
||||
if (!osd_id)
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "OSD number can't be zero" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
to_remove.insert(osd_id);
|
||||
}
|
||||
// Check if OSDs are still used in data distribution
|
||||
is_warning = is_dataloss = false;
|
||||
for (auto & pp: parent->cli->st_cli.pool_config)
|
||||
{
|
||||
// Will OSD deletion make pool incomplete / down / degraded?
|
||||
bool pool_incomplete = false, pool_down = false, pool_degraded = false;
|
||||
bool hist_incomplete = false, hist_degraded = false;
|
||||
auto & pool_cfg = pp.second;
|
||||
uint64_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
|
||||
for (auto & pgp: pool_cfg.pg_config)
|
||||
{
|
||||
auto & pg_cfg = pgp.second;
|
||||
int pg_cursize = 0, pg_rm = 0;
|
||||
for (auto pg_osd: pg_cfg.target_set)
|
||||
{
|
||||
if (pg_osd != 0)
|
||||
{
|
||||
pg_cursize++;
|
||||
if (to_remove.find(pg_osd) != to_remove.end())
|
||||
pg_rm++;
|
||||
}
|
||||
}
|
||||
for (auto & hist_item: pg_cfg.target_history)
|
||||
{
|
||||
int hist_size = 0, hist_rm = 0;
|
||||
for (auto & old_osd: hist_item)
|
||||
{
|
||||
if (old_osd != 0)
|
||||
{
|
||||
hist_size++;
|
||||
if (to_remove.find(old_osd) != to_remove.end())
|
||||
hist_rm++;
|
||||
}
|
||||
}
|
||||
if (hist_rm > 0)
|
||||
{
|
||||
hist_degraded = true;
|
||||
if (hist_size-hist_rm == 0)
|
||||
pool_incomplete = true;
|
||||
else if (hist_size-hist_rm < pg_data_size)
|
||||
hist_incomplete = true;
|
||||
}
|
||||
}
|
||||
if (pg_rm > 0)
|
||||
{
|
||||
pool_degraded = true;
|
||||
if (pg_cursize-pg_rm < pg_data_size)
|
||||
pool_incomplete = true;
|
||||
else if (pg_cursize-pg_rm < pool_cfg.pg_minsize)
|
||||
pool_down = true;
|
||||
}
|
||||
}
|
||||
if (pool_incomplete || pool_down || pool_degraded || hist_incomplete || hist_degraded)
|
||||
{
|
||||
pool_effects.push_back(json11::Json::object {
|
||||
{ "pool_id", (uint64_t)pool_cfg.id },
|
||||
{ "pool_name", pool_cfg.name },
|
||||
{ "effect", (pool_incomplete
|
||||
? "incomplete"
|
||||
: (hist_incomplete
|
||||
? "has_incomplete"
|
||||
: (pool_down
|
||||
? "offline"
|
||||
: (pool_degraded
|
||||
? "degraded"
|
||||
: (hist_degraded ? "has_degraded" : "?")
|
||||
)
|
||||
)
|
||||
)
|
||||
) },
|
||||
});
|
||||
is_warning = true;
|
||||
if (pool_incomplete || hist_incomplete)
|
||||
is_dataloss = true;
|
||||
}
|
||||
}
|
||||
result.data = json11::Json::object {
|
||||
{ "osd_ids", osd_ids },
|
||||
{ "pool_errors", pool_effects },
|
||||
};
|
||||
if (is_dataloss || is_warning || dry_run)
|
||||
{
|
||||
std::string error;
|
||||
for (auto & e: pool_effects)
|
||||
{
|
||||
error += "Pool "+e["pool_name"].string_value()+" (ID "+e["pool_id"].as_string()+") will have "+(
|
||||
e["effect"] == "has_incomplete"
|
||||
? std::string("INCOMPLETE objects (DATA LOSS)")
|
||||
: (e["effect"] == "incomplete"
|
||||
? std::string("INCOMPLETE PGs (DATA LOSS)")
|
||||
: (e["effect"] == "has_degraded"
|
||||
? std::string("DEGRADED objects")
|
||||
: strtoupper(e["effect"].string_value())+" PGs"))
|
||||
)+" after deleting OSD(s).\n";
|
||||
}
|
||||
if (is_dataloss && !force_dataloss && !dry_run)
|
||||
error += "OSDs not deleted. Please move data to other OSDs or bypass this check with --allow-data-loss if you know what you are doing.\n";
|
||||
else if (is_warning && !force_warning && !dry_run)
|
||||
error += "OSDs not deleted. Please move data to other OSDs or bypass this check with --force if you know what you are doing.\n";
|
||||
else if (!is_dataloss && !is_warning && dry_run)
|
||||
error += "OSDs can be deleted without data loss.\n";
|
||||
result.text = error;
|
||||
if (dry_run || is_dataloss && !force_dataloss || is_warning && !force_warning)
|
||||
{
|
||||
result.err = is_dataloss && !force_dataloss || is_warning && !force_warning ? EBUSY : 0;
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
}
|
||||
parent->etcd_txn(json11::Json::object { { "success", json11::Json::array {
|
||||
json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+"/config/pgs"
|
||||
) },
|
||||
} },
|
||||
},
|
||||
json11::Json::object {
|
||||
{ "request_range", json11::Json::object {
|
||||
{ "key", base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+"/history/last_clean_pgs"
|
||||
) },
|
||||
} },
|
||||
},
|
||||
} } });
|
||||
resume_4:
|
||||
state = 4;
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
if (parent->etcd_err.err)
|
||||
{
|
||||
result = parent->etcd_err;
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
{
|
||||
auto kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][0]["response_range"]["kvs"][0]);
|
||||
new_pgs = remove_osds_from_pgs(kv);
|
||||
new_pgs_mod_rev = kv.mod_revision;
|
||||
kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][1]["response_range"]["kvs"][0]);
|
||||
new_clean_pgs = remove_osds_from_pgs(kv);
|
||||
new_clean_pgs_mod_rev = kv.mod_revision;
|
||||
}
|
||||
// Remove keys from etcd
|
||||
{
|
||||
json11::Json::array rm_items, rm_checks;
|
||||
for (auto osd_id: osd_ids)
|
||||
{
|
||||
rm_items.push_back("/config/osd/"+std::to_string(osd_id));
|
||||
rm_items.push_back("/osd/stats/"+std::to_string(osd_id));
|
||||
rm_items.push_back("/osd/state/"+std::to_string(osd_id));
|
||||
rm_items.push_back("/osd/inodestats/"+std::to_string(osd_id));
|
||||
rm_items.push_back("/osd/space/"+std::to_string(osd_id));
|
||||
}
|
||||
for (int i = 0; i < rm_items.size(); i++)
|
||||
{
|
||||
rm_items[i] = json11::Json::object {
|
||||
{ "request_delete_range", json11::Json::object {
|
||||
{ "key", base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+rm_items[i].string_value()
|
||||
) },
|
||||
} },
|
||||
};
|
||||
}
|
||||
if (!new_pgs.is_null())
|
||||
{
|
||||
auto pgs_key = base64_encode(parent->cli->st_cli.etcd_prefix+"/config/pgs");
|
||||
rm_items.push_back(json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", pgs_key },
|
||||
{ "value", base64_encode(new_pgs.dump()) },
|
||||
} },
|
||||
});
|
||||
rm_checks.push_back(json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", pgs_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", new_pgs_mod_rev+1 },
|
||||
});
|
||||
}
|
||||
if (!new_clean_pgs.is_null())
|
||||
{
|
||||
auto pgs_key = base64_encode(parent->cli->st_cli.etcd_prefix+"/history/last_clean_pgs");
|
||||
rm_items.push_back(json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", pgs_key },
|
||||
{ "value", base64_encode(new_clean_pgs.dump()) },
|
||||
} },
|
||||
});
|
||||
rm_checks.push_back(json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", pgs_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", new_clean_pgs_mod_rev+1 },
|
||||
});
|
||||
}
|
||||
parent->etcd_txn(json11::Json::object { { "success", rm_items }, { "checks", rm_checks } });
|
||||
}
|
||||
resume_1:
|
||||
state = 1;
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
if (parent->etcd_err.err)
|
||||
{
|
||||
result = parent->etcd_err;
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
// Remove old OSD from PG all_peers to prevent left_on_dead and from
|
||||
// target_history to prevent INCOMPLETE if --allow-data-loss is specified
|
||||
for (auto & rsp: parent->etcd_result["responses"].array_items())
|
||||
{
|
||||
if (rsp["response_delete_range"]["deleted"].uint64_value() > 0)
|
||||
{
|
||||
// Wait for mon_change_timeout before updating PG history, or the monitor's change will likely interfere with ours
|
||||
retry_wait = parent->cli->merged_config["mon_change_timeout"].uint64_value();
|
||||
if (!retry_wait)
|
||||
retry_wait = 1000;
|
||||
retry_wait += etcd_tx_retry_ms;
|
||||
}
|
||||
}
|
||||
while (1)
|
||||
{
|
||||
resume_2:
|
||||
if (!remove_osds_from_history(2))
|
||||
return;
|
||||
resume_3:
|
||||
state = 3;
|
||||
if (parent->waiting > 0)
|
||||
return;
|
||||
if (parent->etcd_err.err)
|
||||
{
|
||||
result = parent->etcd_err;
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
if (parent->etcd_result["succeeded"].bool_value())
|
||||
break;
|
||||
if ((++cur_retry) >= etcd_tx_retries)
|
||||
{
|
||||
result.err = EAGAIN;
|
||||
result.text += "Failed to remove OSDs from PG history due to update conflicts."
|
||||
" Some PGs may remain left_on_dead or incomplete. Please retry later\n";
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
retry_wait = etcd_tx_retry_ms;
|
||||
}
|
||||
std::string ids = "";
|
||||
for (auto osd_id: osd_ids)
|
||||
{
|
||||
ids += (ids.size() ? ", " : "")+std::to_string(osd_id);
|
||||
}
|
||||
ids = (osd_ids.size() > 1 ? "OSDs " : "OSD ")+ids+(osd_ids.size() > 1 ? " are" : " is")+" removed from etcd";
|
||||
state = 100;
|
||||
result.text = (result.text != "" ? ids+"\n"+result.text : ids);
|
||||
result.err = 0;
|
||||
}
|
||||
|
||||
json11::Json remove_osds_from_pgs(const etcd_kv_t & kv)
|
||||
{
|
||||
if (kv.value.is_null())
|
||||
{
|
||||
return kv.value;
|
||||
}
|
||||
json11::Json::object new_pgs;
|
||||
for (auto & pp: kv.value["items"].object_items())
|
||||
{
|
||||
if (pp.second.is_object())
|
||||
{
|
||||
json11::Json::object new_pool;
|
||||
for (auto & pgp: pp.second.object_items())
|
||||
{
|
||||
json11::Json::array osd_set;
|
||||
for (auto & osd_json: pgp.second["osd_set"].array_items())
|
||||
{
|
||||
uint64_t osd_num = osd_json.uint64_value();
|
||||
osd_set.push_back(osd_num == 0 || to_remove.find(osd_num) != to_remove.end() ? 0 : osd_num);
|
||||
}
|
||||
json11::Json::object new_pg = pgp.second.object_items();
|
||||
new_pg["osd_set"] = osd_set;
|
||||
new_pool[pgp.first] = new_pg;
|
||||
}
|
||||
new_pgs[pp.first] = new_pool;
|
||||
}
|
||||
else
|
||||
new_pgs[pp.first] = pp.second;
|
||||
}
|
||||
auto res = kv.value.object_items();
|
||||
res["items"] = new_pgs;
|
||||
return res;
|
||||
}
|
||||
|
||||
bool remove_osds_from_history(int base_state)
|
||||
{
|
||||
if (state == base_state+0)
|
||||
goto resume_0;
|
||||
history_updates.clear();
|
||||
history_checks.clear();
|
||||
for (auto & pp: parent->cli->st_cli.pool_config)
|
||||
{
|
||||
bool update_pg_history = false;
|
||||
auto & pool_cfg = pp.second;
|
||||
for (auto & pgp: pool_cfg.pg_config)
|
||||
{
|
||||
auto pg_num = pgp.first;
|
||||
auto & pg_cfg = pgp.second;
|
||||
for (int i = 0; i < pg_cfg.all_peers.size(); i++)
|
||||
{
|
||||
if (to_remove.find(pg_cfg.all_peers[i]) != to_remove.end())
|
||||
{
|
||||
update_pg_history = true;
|
||||
pg_cfg.all_peers.erase(pg_cfg.all_peers.begin()+i, pg_cfg.all_peers.begin()+i+1);
|
||||
i--;
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < pg_cfg.target_history.size(); i++)
|
||||
{
|
||||
int hist_size = 0, hist_rm = 0;
|
||||
for (auto & old_osd: pg_cfg.target_history[i])
|
||||
{
|
||||
if (old_osd != 0)
|
||||
{
|
||||
hist_size++;
|
||||
if (to_remove.find(old_osd) != to_remove.end())
|
||||
{
|
||||
hist_rm++;
|
||||
old_osd = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (hist_rm > 0)
|
||||
{
|
||||
if (hist_size-hist_rm == 0)
|
||||
{
|
||||
pg_cfg.target_history.erase(pg_cfg.target_history.begin()+i, pg_cfg.target_history.begin()+i+1);
|
||||
i--;
|
||||
}
|
||||
update_pg_history = true;
|
||||
}
|
||||
}
|
||||
if (update_pg_history)
|
||||
{
|
||||
std::string history_key = base64_encode(
|
||||
parent->cli->st_cli.etcd_prefix+"/pg/history/"+
|
||||
std::to_string(pool_cfg.id)+"/"+std::to_string(pg_num)
|
||||
);
|
||||
history_updates.push_back(json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", history_key },
|
||||
{ "value", base64_encode(json11::Json(json11::Json::object {
|
||||
{ "epoch", pg_cfg.epoch },
|
||||
{ "all_peers", pg_cfg.all_peers },
|
||||
{ "osd_sets", pg_cfg.target_history },
|
||||
}).dump()) },
|
||||
} },
|
||||
});
|
||||
history_checks.push_back(json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", history_key },
|
||||
{ "result", "LESS" },
|
||||
{ "mod_revision", parent->cli->st_cli.etcd_watch_revision+1 },
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
if (history_updates.size())
|
||||
{
|
||||
if (retry_wait)
|
||||
{
|
||||
parent->waiting++;
|
||||
parent->epmgr->tfd->set_timer(retry_wait, false, [this](int timer_id)
|
||||
{
|
||||
parent->waiting--;
|
||||
parent->ringloop->wakeup();
|
||||
});
|
||||
resume_0:
|
||||
state = base_state+0;
|
||||
if (parent->waiting > 0)
|
||||
return false;
|
||||
}
|
||||
parent->etcd_txn(json11::Json::object {
|
||||
{ "success", history_updates },
|
||||
{ "compare", history_checks },
|
||||
});
|
||||
}
|
||||
else
|
||||
parent->etcd_result = json11::Json::object{ { "succeeded", true } };
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
std::function<bool(cli_result_t &)> cli_tool_t::start_rm_osd(json11::Json cfg)
|
||||
{
|
||||
auto rm_osd = new rm_osd_t();
|
||||
rm_osd->parent = this;
|
||||
rm_osd->dry_run = cfg["dry_run"].bool_value();
|
||||
rm_osd->force_dataloss = cfg["allow_data_loss"].bool_value();
|
||||
rm_osd->force_warning = rm_osd->force_dataloss || cfg["force"].bool_value();
|
||||
if (!cfg["etcd_tx_retries"].is_null())
|
||||
rm_osd->etcd_tx_retries = cfg["etcd_tx_retries"].uint64_value();
|
||||
if (!cfg["etcd_tx_retry_ms"].is_null())
|
||||
{
|
||||
rm_osd->etcd_tx_retry_ms = cfg["etcd_tx_retry_ms"].uint64_value();
|
||||
if (rm_osd->etcd_tx_retry_ms < 100)
|
||||
rm_osd->etcd_tx_retry_ms = 100;
|
||||
}
|
||||
if (cfg["osd_id"].is_number() || cfg["osd_id"].is_string())
|
||||
rm_osd->osd_ids.push_back(cfg["osd_id"].uint64_value());
|
||||
else
|
||||
{
|
||||
for (auto & id: cfg["osd_id"].array_items())
|
||||
rm_osd->osd_ids.push_back(id.uint64_value());
|
||||
}
|
||||
return [rm_osd](cli_result_t & result)
|
||||
{
|
||||
rm_osd->loop();
|
||||
if (rm_osd->is_done())
|
||||
{
|
||||
result = rm_osd->result;
|
||||
delete rm_osd;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
}
|
@@ -59,7 +59,6 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
||||
delete op;
|
||||
};
|
||||
msgr.parse_config(this->config);
|
||||
msgr.init();
|
||||
|
||||
st_cli.tfd = tfd;
|
||||
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
|
||||
@@ -73,17 +72,6 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
||||
|
||||
scrap_buffer_size = SCRAP_BUFFER_SIZE;
|
||||
scrap_buffer = malloc_or_die(scrap_buffer_size);
|
||||
|
||||
if (ringloop)
|
||||
{
|
||||
consumer.loop = [this]()
|
||||
{
|
||||
msgr.read_requests();
|
||||
msgr.send_replies();
|
||||
this->ringloop->submit();
|
||||
};
|
||||
ringloop->register_consumer(&consumer);
|
||||
}
|
||||
}
|
||||
|
||||
cluster_client_t::~cluster_client_t()
|
||||
@@ -115,6 +103,24 @@ cluster_op_t::~cluster_op_t()
|
||||
}
|
||||
}
|
||||
|
||||
void cluster_client_t::init_msgr()
|
||||
{
|
||||
if (msgr_initialized)
|
||||
return;
|
||||
msgr.init();
|
||||
msgr_initialized = true;
|
||||
if (ringloop)
|
||||
{
|
||||
consumer.loop = [this]()
|
||||
{
|
||||
msgr.read_requests();
|
||||
msgr.send_replies();
|
||||
this->ringloop->submit();
|
||||
};
|
||||
ringloop->register_consumer(&consumer);
|
||||
}
|
||||
}
|
||||
|
||||
void cluster_client_t::calc_wait(cluster_op_t *op)
|
||||
{
|
||||
op->prev_wait = 0;
|
||||
@@ -143,7 +149,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
|
||||
if (!op->prev_wait)
|
||||
continue_sync(op);
|
||||
}
|
||||
else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) */
|
||||
else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) */
|
||||
{
|
||||
for (auto prev = op_queue_head; prev && prev != op; prev = prev->next)
|
||||
{
|
||||
@@ -151,7 +157,8 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
|
||||
{
|
||||
op->prev_wait++;
|
||||
}
|
||||
else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ || prev->opcode == OSD_OP_READ_BITMAP)
|
||||
else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ ||
|
||||
prev->opcode == OSD_OP_READ_BITMAP || prev->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
||||
{
|
||||
// Flushes are always in the beginning (we're scanning from the beginning of the queue)
|
||||
break;
|
||||
@@ -171,7 +178,8 @@ void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *n
|
||||
auto n2 = next->next;
|
||||
if (next->opcode == OSD_OP_SYNC && !(flags & OP_IMMEDIATE_COMMIT) ||
|
||||
next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER) ||
|
||||
(next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP) && (flags & OP_FLUSH_BUFFER))
|
||||
(next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP ||
|
||||
next->opcode == OSD_OP_READ_CHAIN_BITMAP) && (flags & OP_FLUSH_BUFFER))
|
||||
{
|
||||
next->prev_wait += inc;
|
||||
assert(next->prev_wait >= 0);
|
||||
@@ -221,9 +229,14 @@ void cluster_client_t::erase_op(cluster_op_t *op)
|
||||
if (op_queue_tail == op)
|
||||
op_queue_tail = op->prev;
|
||||
op->next = op->prev = NULL;
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
if (flags & OP_FLUSH_BUFFER)
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
if (!(flags & OP_IMMEDIATE_COMMIT))
|
||||
inc_wait(opcode, flags, next, -1);
|
||||
// Call callback at the end to avoid inconsistencies in prev_wait
|
||||
// if the callback adds more operations itself
|
||||
if (!(flags & OP_FLUSH_BUFFER))
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
}
|
||||
|
||||
void cluster_client_t::continue_ops(bool up_retry)
|
||||
@@ -335,7 +348,8 @@ void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes
|
||||
// And now they have to be resliced!
|
||||
for (auto op = op_queue_head; op; op = op->next)
|
||||
{
|
||||
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) &&
|
||||
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ ||
|
||||
op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) &&
|
||||
INODE_POOL(op->cur_inode) == pool_item.first)
|
||||
{
|
||||
op->needs_reslice = true;
|
||||
@@ -407,7 +421,7 @@ void cluster_client_t::on_ready(std::function<void(void)> fn)
|
||||
void cluster_client_t::execute(cluster_op_t *op)
|
||||
{
|
||||
if (op->opcode != OSD_OP_SYNC && op->opcode != OSD_OP_READ &&
|
||||
op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_WRITE)
|
||||
op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_WRITE)
|
||||
{
|
||||
op->retval = -EINVAL;
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
@@ -439,7 +453,7 @@ void cluster_client_t::execute(cluster_op_t *op)
|
||||
return;
|
||||
}
|
||||
// Check alignment
|
||||
if ((op->opcode == OSD_OP_READ || op->opcode == OSD_OP_WRITE) && !op->len ||
|
||||
if (!op->len && (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP || op->opcode == OSD_OP_WRITE) ||
|
||||
op->offset % pool_it->second.bitmap_granularity || op->len % pool_it->second.bitmap_granularity)
|
||||
{
|
||||
op->retval = -EINVAL;
|
||||
@@ -700,8 +714,7 @@ resume_3:
|
||||
// Finished successfully
|
||||
// Even if the PG count has changed in meanwhile we treat it as success
|
||||
// because if some operations were invalid for the new PG count we'd get errors
|
||||
bool is_read = op->opcode == OSD_OP_READ;
|
||||
if (is_read)
|
||||
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
||||
{
|
||||
// Check parent inode
|
||||
auto ino_it = st_cli.inode_config.find(op->cur_inode);
|
||||
@@ -725,6 +738,11 @@ resume_3:
|
||||
}
|
||||
}
|
||||
op->retval = op->len;
|
||||
if (op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
||||
{
|
||||
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->inode));
|
||||
op->retval = op->len / pool_cfg.bitmap_granularity;
|
||||
}
|
||||
erase_op(op);
|
||||
return 1;
|
||||
}
|
||||
@@ -748,7 +766,10 @@ resume_3:
|
||||
{
|
||||
for (int i = 0; i < op->parts.size(); i++)
|
||||
{
|
||||
op->parts[i].flags = PART_RETRY;
|
||||
if (!(op->parts[i].flags & PART_DONE))
|
||||
{
|
||||
op->parts[i].flags = PART_RETRY;
|
||||
}
|
||||
}
|
||||
goto resume_2;
|
||||
}
|
||||
@@ -807,23 +828,19 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
|
||||
uint64_t last_stripe = op->len > 0 ? ((op->offset + op->len - 1) / pg_block_size) * pg_block_size : first_stripe;
|
||||
op->retval = 0;
|
||||
op->parts.resize((last_stripe - first_stripe) / pg_block_size + 1);
|
||||
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP)
|
||||
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
||||
{
|
||||
// Allocate memory for the bitmap
|
||||
unsigned object_bitmap_size = (((op->opcode == OSD_OP_READ_BITMAP ? pg_block_size : op->len) / pool_cfg.bitmap_granularity + 7) / 8);
|
||||
unsigned object_bitmap_size = ((op->len / pool_cfg.bitmap_granularity + 7) / 8);
|
||||
object_bitmap_size = (object_bitmap_size < 8 ? 8 : object_bitmap_size);
|
||||
unsigned bitmap_mem = object_bitmap_size + (pool_cfg.data_block_size / pool_cfg.bitmap_granularity / 8 * pg_data_size) * op->parts.size();
|
||||
if (op->bitmap_buf_size < bitmap_mem)
|
||||
if (!op->bitmap_buf || op->bitmap_buf_size < bitmap_mem)
|
||||
{
|
||||
op->bitmap_buf = realloc_or_die(op->bitmap_buf, bitmap_mem);
|
||||
if (!op->bitmap_buf_size)
|
||||
{
|
||||
// First allocation
|
||||
memset(op->bitmap_buf, 0, object_bitmap_size);
|
||||
}
|
||||
op->part_bitmaps = (uint8_t*)op->bitmap_buf + object_bitmap_size;
|
||||
op->bitmap_buf_size = bitmap_mem;
|
||||
}
|
||||
memset(op->bitmap_buf, 0, bitmap_mem);
|
||||
}
|
||||
int iov_idx = 0;
|
||||
size_t iov_pos = 0;
|
||||
@@ -874,13 +891,14 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
|
||||
if (end == begin)
|
||||
op->done_count++;
|
||||
}
|
||||
else if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_DELETE)
|
||||
else if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_DELETE)
|
||||
{
|
||||
add_iov(end-begin, false, op, iov_idx, iov_pos, op->parts[i].iov, NULL, 0);
|
||||
}
|
||||
op->parts[i].parent = op;
|
||||
op->parts[i].offset = begin;
|
||||
op->parts[i].len = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_DELETE ? 0 : (uint32_t)(end - begin);
|
||||
op->parts[i].len = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP ||
|
||||
op->opcode == OSD_OP_DELETE ? 0 : (uint32_t)(end - begin);
|
||||
op->parts[i].pg_num = pg_num;
|
||||
op->parts[i].osd_num = 0;
|
||||
op->parts[i].flags = 0;
|
||||
@@ -909,6 +927,10 @@ bool cluster_client_t::affects_osd(uint64_t inode, uint64_t offset, uint64_t len
|
||||
|
||||
bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||
{
|
||||
if (!msgr_initialized)
|
||||
{
|
||||
init_msgr();
|
||||
}
|
||||
auto part = &op->parts[i];
|
||||
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->cur_inode));
|
||||
auto pg_it = pool_cfg.pg_config.find(part->pg_num);
|
||||
@@ -927,7 +949,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||
pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
|
||||
);
|
||||
uint64_t meta_rev = 0;
|
||||
if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_DELETE)
|
||||
if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_DELETE)
|
||||
{
|
||||
auto ino_it = st_cli.inode_config.find(op->inode);
|
||||
if (ino_it != st_cli.inode_config.end())
|
||||
@@ -939,8 +961,8 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||
.req = { .rw = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = op_id++,
|
||||
.opcode = op->opcode == OSD_OP_READ_BITMAP ? OSD_OP_READ : op->opcode,
|
||||
.id = next_op_id(),
|
||||
.opcode = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP ? OSD_OP_READ : op->opcode,
|
||||
},
|
||||
.inode = op->cur_inode,
|
||||
.offset = part->offset,
|
||||
@@ -948,8 +970,10 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||
.meta_revision = meta_rev,
|
||||
.version = op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE ? op->version : 0,
|
||||
} },
|
||||
.bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? (uint8_t*)op->part_bitmaps + pg_bitmap_size*i : NULL),
|
||||
.bitmap_len = (unsigned)(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? pg_bitmap_size : 0),
|
||||
.bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP
|
||||
? (uint8_t*)op->part_bitmaps + pg_bitmap_size*i : NULL),
|
||||
.bitmap_len = (unsigned)(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP
|
||||
? pg_bitmap_size : 0),
|
||||
.callback = [this, part](osd_op_t *op_part)
|
||||
{
|
||||
handle_op_part(part);
|
||||
@@ -1067,7 +1091,7 @@ void cluster_client_t::send_sync(cluster_op_t *op, cluster_op_part_t *part)
|
||||
.req = {
|
||||
.hdr = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = op_id++,
|
||||
.id = next_op_id(),
|
||||
.opcode = OSD_OP_SYNC,
|
||||
},
|
||||
},
|
||||
@@ -1128,11 +1152,11 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
||||
else
|
||||
{
|
||||
// OK
|
||||
if (!(op->flags & OP_IMMEDIATE_COMMIT))
|
||||
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE) && !(op->flags & OP_IMMEDIATE_COMMIT))
|
||||
dirty_osds.insert(part->osd_num);
|
||||
part->flags |= PART_DONE;
|
||||
op->done_count++;
|
||||
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP)
|
||||
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
||||
{
|
||||
copy_part_bitmap(op, part);
|
||||
op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
|
||||
@@ -1156,7 +1180,12 @@ void cluster_client_t::copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *par
|
||||
);
|
||||
uint32_t object_offset = (part->op.req.rw.offset - op->offset) / pool_cfg.bitmap_granularity;
|
||||
uint32_t part_offset = (part->op.req.rw.offset % pg_block_size) / pool_cfg.bitmap_granularity;
|
||||
uint32_t part_len = (op->opcode == OSD_OP_READ_BITMAP ? pg_block_size : part->op.req.rw.len) / pool_cfg.bitmap_granularity;
|
||||
uint32_t op_len = op->len / pool_cfg.bitmap_granularity;
|
||||
uint32_t part_len = pg_block_size/pool_cfg.bitmap_granularity - part_offset;
|
||||
if (part_len > op_len-object_offset)
|
||||
{
|
||||
part_len = op_len-object_offset;
|
||||
}
|
||||
if (!(object_offset & 0x7) && !(part_offset & 0x7) && (part_len >= 8))
|
||||
{
|
||||
// Copy bytes
|
||||
@@ -1179,5 +1208,5 @@ void cluster_client_t::copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *par
|
||||
|
||||
uint64_t cluster_client_t::next_op_id()
|
||||
{
|
||||
return op_id++;
|
||||
return msgr.next_subop_id++;
|
||||
}
|
||||
|
@@ -11,6 +11,7 @@
|
||||
#define INODE_LIST_DONE 1
|
||||
#define INODE_LIST_HAS_UNSTABLE 2
|
||||
#define OSD_OP_READ_BITMAP OSD_OP_SEC_READ_BMP
|
||||
#define OSD_OP_READ_CHAIN_BITMAP 0x102
|
||||
|
||||
#define OSD_OP_IGNORE_READONLY 0x08
|
||||
|
||||
@@ -30,7 +31,7 @@ struct cluster_op_part_t
|
||||
|
||||
struct cluster_op_t
|
||||
{
|
||||
uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE, OSD_OP_READ_BITMAP
|
||||
uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE, OSD_OP_READ_BITMAP, OSD_OP_READ_CHAIN_BITMAP
|
||||
uint64_t inode;
|
||||
uint64_t offset;
|
||||
uint64_t len;
|
||||
@@ -39,9 +40,13 @@ struct cluster_op_t
|
||||
uint64_t version = 0;
|
||||
// now only OSD_OP_IGNORE_READONLY is supported
|
||||
uint64_t flags = 0;
|
||||
// negative retval is an error number
|
||||
// write and read return len on success
|
||||
// sync and delete return 0 on success
|
||||
// read_bitmap and read_chain_bitmap return the length of bitmap in bits(!)
|
||||
int retval;
|
||||
osd_op_buf_list_t iov;
|
||||
// READ and READ_BITMAP return the bitmap here
|
||||
// READ, READ_BITMAP, READ_CHAIN_BITMAP return the bitmap here
|
||||
void *bitmap_buf = NULL;
|
||||
std::function<void(cluster_op_t*)> callback;
|
||||
~cluster_op_t();
|
||||
@@ -85,7 +90,6 @@ class cluster_client_t
|
||||
int up_wait_retry_interval = 500; // ms
|
||||
|
||||
int retry_timeout_id = 0;
|
||||
uint64_t op_id = 1;
|
||||
std::vector<cluster_op_t*> offline_ops;
|
||||
cluster_op_t *op_queue_head = NULL, *op_queue_tail = NULL;
|
||||
std::map<object_id, cluster_buffer_t> dirty_buffers;
|
||||
@@ -100,10 +104,14 @@ class cluster_client_t
|
||||
std::vector<std::function<void(void)>> on_ready_hooks;
|
||||
std::vector<inode_list_t*> lists;
|
||||
int continuing_ops = 0;
|
||||
bool msgr_initialized = false;
|
||||
|
||||
public:
|
||||
etcd_state_client_t st_cli;
|
||||
|
||||
osd_messenger_t msgr;
|
||||
void init_msgr();
|
||||
|
||||
json11::Json config;
|
||||
json11::Json::object merged_config;
|
||||
|
||||
|
@@ -196,7 +196,7 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
|
||||
.sec_list = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = op_id++,
|
||||
.id = next_op_id(),
|
||||
.opcode = OSD_OP_SEC_LIST,
|
||||
},
|
||||
.list_pg = cur_list->pg->pg_num,
|
||||
|
@@ -52,11 +52,12 @@ static const char *help_text =
|
||||
" --disable_data_fsync 0 Disable data device cache and fsync (default off)\n"
|
||||
" --disable_meta_fsync 0 Disable metadata device cache and fsync (default off)\n"
|
||||
" --disable_journal_fsync 0 Disable journal device cache and fsync (default off)\n"
|
||||
" --hdd Enable HDD defaults (1M block, 1G journal, throttling)\n"
|
||||
" --force Bypass partition safety checks (for emptiness and so on)\n"
|
||||
" \n"
|
||||
" Options (both modes):\n"
|
||||
" --journal_size 1G/32M Set journal size (area or partition size)\n"
|
||||
" --block_size 1M/128k Set blockstore object size\n"
|
||||
" --journal_size 32M/1G Set journal size (area or partition size)\n"
|
||||
" --block_size 128k/1M Set blockstore object size\n"
|
||||
" --bitmap_granularity 4k Set bitmap granularity\n"
|
||||
" --data_device_block 4k Override data device block size\n"
|
||||
" --meta_device_block 4k Override metadata device block size\n"
|
||||
@@ -109,8 +110,16 @@ static const char *help_text =
|
||||
" Commands are passed to systemctl with vitastor-osd@<num> units as arguments.\n"
|
||||
" When --now is added to enable/disable, OSDs are also immediately started/stopped.\n"
|
||||
"\n"
|
||||
"vitastor-disk read-sb <device>\n"
|
||||
"vitastor-disk purge [--force] [--allow-data-loss] <device> [device2 device3 ...]\n"
|
||||
" Purge Vitastor OSD(s) on specified device(s). Uses vitastor-cli rm-osd to check\n"
|
||||
" if deletion is possible without data loss and to actually remove metadata from etcd.\n"
|
||||
" --force and --allow-data-loss options may be used to ignore safety check results.\n"
|
||||
" \n"
|
||||
" Requires `vitastor-cli`, `sfdisk` and `partprobe` (from parted) utilities.\n"
|
||||
"\n"
|
||||
"vitastor-disk read-sb [--force] <device>\n"
|
||||
" Try to read Vitastor OSD superblock from <device> and print it in JSON format.\n"
|
||||
" --force allows to ignore validation errors.\n"
|
||||
"\n"
|
||||
"vitastor-disk write-sb <device>\n"
|
||||
" Read JSON from STDIN and write it into Vitastor OSD superblock on <device>.\n"
|
||||
@@ -195,6 +204,10 @@ int main(int argc, char *argv[])
|
||||
{
|
||||
self.options["hybrid"] = "1";
|
||||
}
|
||||
else if (!strcmp(argv[i], "--hdd"))
|
||||
{
|
||||
self.options["hdd"] = "1";
|
||||
}
|
||||
else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
|
||||
{
|
||||
cmd.insert(cmd.begin(), (char*)"help");
|
||||
@@ -207,6 +220,10 @@ int main(int argc, char *argv[])
|
||||
{
|
||||
self.options["force"] = "1";
|
||||
}
|
||||
else if (!strcmp(argv[i], "--allow-data-loss"))
|
||||
{
|
||||
self.options["allow_data_loss"] = "1";
|
||||
}
|
||||
else if (argv[i][0] == '-' && argv[i][1] == '-')
|
||||
{
|
||||
char *key = argv[i]+2;
|
||||
@@ -339,6 +356,10 @@ int main(int argc, char *argv[])
|
||||
}
|
||||
return self.systemd_start_stop_osds(systemd_cmd, std::vector<std::string>(cmd.begin()+1, cmd.end()));
|
||||
}
|
||||
else if (!strcmp(cmd[0], "purge"))
|
||||
{
|
||||
return self.purge_devices(std::vector<std::string>(cmd.begin()+1, cmd.end()));
|
||||
}
|
||||
else if (!strcmp(cmd[0], "exec-osd"))
|
||||
{
|
||||
if (cmd.size() != 2)
|
||||
|
@@ -56,7 +56,7 @@ struct disk_tool_t
|
||||
uint64_t meta_pos;
|
||||
uint64_t journal_pos, journal_calc_data_pos;
|
||||
|
||||
bool first, first2;
|
||||
bool first_block, first_entry;
|
||||
|
||||
allocator *data_alloc;
|
||||
std::map<uint64_t, uint64_t> data_remap;
|
||||
@@ -108,10 +108,11 @@ struct disk_tool_t
|
||||
int read_sb(std::string device);
|
||||
int write_sb(std::string device);
|
||||
int exec_osd(std::string device);
|
||||
int systemd_start_stop_osds(std::vector<std::string> cmd, std::vector<std::string> devices);
|
||||
int systemd_start_stop_osds(const std::vector<std::string> & cmd, const std::vector<std::string> & devices);
|
||||
int pre_exec_osd(std::string device);
|
||||
int purge_devices(const std::vector<std::string> & devices);
|
||||
|
||||
json11::Json read_osd_superblock(std::string device, bool expect_exist = true);
|
||||
json11::Json read_osd_superblock(std::string device, bool expect_exist = true, bool ignore_nonref = false);
|
||||
uint32_t write_osd_superblock(std::string device, json11::Json params);
|
||||
|
||||
int prepare_one(std::map<std::string, std::string> options, int is_hdd = -1);
|
||||
@@ -139,3 +140,4 @@ int write_zero(int fd, uint64_t offset, uint64_t size);
|
||||
json11::Json read_parttable(std::string dev);
|
||||
uint64_t dev_size_from_parttable(json11::Json pt);
|
||||
uint64_t free_from_parttable(json11::Json pt);
|
||||
int fix_partition_type(std::string dev_by_uuid);
|
||||
|
@@ -13,7 +13,7 @@ int disk_tool_t::dump_journal()
|
||||
fprintf(stderr, "Invalid journal block size\n");
|
||||
return 1;
|
||||
}
|
||||
first = true;
|
||||
first_block = true;
|
||||
if (json)
|
||||
printf("[\n");
|
||||
if (all)
|
||||
@@ -38,8 +38,8 @@ int disk_tool_t::dump_journal()
|
||||
}
|
||||
if (json)
|
||||
{
|
||||
printf("%s{\"offset\":\"0x%lx\"", first ? "" : ",\n", journal_pos);
|
||||
first = false;
|
||||
printf("%s{\"offset\":\"0x%lx\"", first_block ? "" : ",\n", journal_pos);
|
||||
first_block = false;
|
||||
}
|
||||
if (s == dsk.journal_block_size)
|
||||
{
|
||||
@@ -55,10 +55,10 @@ int disk_tool_t::dump_journal()
|
||||
printf("offset %08lx:\n", journal_pos);
|
||||
else
|
||||
printf(",\"entries\":[\n");
|
||||
first2 = true;
|
||||
first_entry = true;
|
||||
process_journal_block(journal_buf, [this](int num, journal_entry *je) { dump_journal_entry(num, je, json); });
|
||||
if (json)
|
||||
printf(first2 ? "]}" : "\n]}");
|
||||
printf(first_entry ? "]}" : "\n]}");
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -75,34 +75,30 @@ int disk_tool_t::dump_journal()
|
||||
}
|
||||
else
|
||||
{
|
||||
first_entry = true;
|
||||
process_journal([this](void *data)
|
||||
{
|
||||
first2 = true;
|
||||
if (json && dump_with_blocks)
|
||||
first_entry = true;
|
||||
if (!json)
|
||||
printf("offset %08lx:\n", journal_pos);
|
||||
auto pos = journal_pos;
|
||||
int r = process_journal_block(data, [this, pos](int num, journal_entry *je)
|
||||
{
|
||||
if (json && first2)
|
||||
{
|
||||
if (dump_with_blocks)
|
||||
printf("%s{\"offset\":\"0x%lx\",\"entries\":[\n", first ? "" : ",\n", pos);
|
||||
first = false;
|
||||
}
|
||||
if (json && dump_with_blocks && first_entry)
|
||||
printf("%s{\"offset\":\"0x%lx\",\"entries\":[\n", first_block ? "" : ",\n", pos);
|
||||
dump_journal_entry(num, je, json);
|
||||
first_block = false;
|
||||
});
|
||||
if (json)
|
||||
{
|
||||
if (dump_with_blocks && !first2)
|
||||
printf("\n]}");
|
||||
}
|
||||
else if (r <= 0)
|
||||
if (json && dump_with_blocks && !first_entry)
|
||||
printf("\n]}");
|
||||
else if (!json && r <= 0)
|
||||
printf("end of the journal\n");
|
||||
return r;
|
||||
});
|
||||
}
|
||||
if (json)
|
||||
printf(first ? "]\n" : "\n]\n");
|
||||
printf(first_block ? "]\n" : "\n]\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -209,9 +205,9 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
|
||||
{
|
||||
if (json)
|
||||
{
|
||||
if (!first2)
|
||||
if (!first_entry)
|
||||
printf(",\n");
|
||||
first2 = false;
|
||||
first_entry = false;
|
||||
printf(
|
||||
"{\"crc32\":\"%08x\",\"valid\":%s,\"crc32_prev\":\"%08x\"",
|
||||
je->crc32, (je_crc32(je) == je->crc32 ? "true" : "false"), je->crc32_prev
|
||||
@@ -275,10 +271,12 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
|
||||
else if (je->type == JE_BIG_WRITE || je->type == JE_BIG_WRITE_INSTANT)
|
||||
{
|
||||
printf(
|
||||
json ? ",\"type\":\"big_write%s\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\",\"loc\":\"0x%lx\""
|
||||
: "je_big_write%s oid=%lx:%lx ver=%lu loc=%08lx",
|
||||
json ? ",\"type\":\"big_write%s\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\",\"offset\":%u,\"len\":%u,\"loc\":\"0x%lx\""
|
||||
: "je_big_write%s oid=%lx:%lx ver=%lu offset=%u len=%u loc=%08lx",
|
||||
je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "",
|
||||
je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location
|
||||
je->big_write.oid.inode, je->big_write.oid.stripe,
|
||||
je->big_write.version, je->big_write.offset, je->big_write.len,
|
||||
je->big_write.location
|
||||
);
|
||||
if (je->big_write.size > sizeof(journal_entry_big_write))
|
||||
{
|
||||
@@ -424,6 +422,8 @@ int disk_tool_t::write_json_journal(json11::Json entries)
|
||||
.stripe = sscanf_json(NULL, rec["stripe"]),
|
||||
},
|
||||
.version = rec["ver"].uint64_value(),
|
||||
.offset = (uint32_t)rec["offset"].uint64_value(),
|
||||
.len = (uint32_t)rec["len"].uint64_value(),
|
||||
.location = sscanf_json(NULL, rec["loc"]),
|
||||
};
|
||||
fromhexstr(rec["bitmap"].string_value(), dsk.clean_entry_bitmap_size, ((uint8_t*)ne) + sizeof(journal_entry_big_write));
|
||||
|
@@ -124,14 +124,14 @@ void disk_tool_t::dump_meta_header(blockstore_meta_header_v1_t *hdr)
|
||||
{
|
||||
printf("{\"version\":\"0.5\",\"meta_block_size\":%lu,\"entries\":[\n", dsk.meta_block_size);
|
||||
}
|
||||
first = true;
|
||||
first_entry = true;
|
||||
}
|
||||
|
||||
void disk_tool_t::dump_meta_entry(uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
|
||||
{
|
||||
printf(
|
||||
#define ENTRY_FMT "{\"block\":%lu,\"pool\":%u,\"inode\":%lu,\"stripe\":%lu,\"version\":%lu"
|
||||
(first ? ENTRY_FMT : (",\n" ENTRY_FMT)),
|
||||
#define ENTRY_FMT "{\"block\":%lu,\"pool\":%u,\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"version\":%lu"
|
||||
(first_entry ? ENTRY_FMT : (",\n" ENTRY_FMT)),
|
||||
#undef ENTRY_FMT
|
||||
block_num, INODE_POOL(entry->oid.inode), INODE_NO_POOL(entry->oid.inode),
|
||||
entry->oid.stripe, entry->version
|
||||
@@ -154,7 +154,7 @@ void disk_tool_t::dump_meta_entry(uint64_t block_num, clean_disk_entry *entry, u
|
||||
{
|
||||
printf("}");
|
||||
}
|
||||
first = false;
|
||||
first_entry = false;
|
||||
}
|
||||
|
||||
int disk_tool_t::write_json_meta(json11::Json meta)
|
||||
|
@@ -61,6 +61,11 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
|
||||
fprintf(stderr, "%s already contains Vitastor OSD superblock, not creating OSD without --force\n", dev.c_str());
|
||||
return 1;
|
||||
}
|
||||
if (fix_partition_type(dev) != 0)
|
||||
{
|
||||
fprintf(stderr, "%s has incorrect type and we failed to change it to Vitastor type\n", dev.c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto dev: std::vector<std::string>{"data", "meta", "journal"})
|
||||
@@ -317,7 +322,8 @@ json11::Json disk_tool_t::add_partitions(vitastor_dev_info_t & devinfo, std::vec
|
||||
{
|
||||
script += "+ "+size+" "+std::string(VITASTOR_PART_TYPE)+"\n";
|
||||
}
|
||||
if (shell_exec({ "sfdisk", "--force", devinfo.path }, script, NULL, NULL) != 0)
|
||||
std::string out;
|
||||
if (shell_exec({ "sfdisk", "--no-reread", "--force", devinfo.path }, script, &out, NULL) != 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to add %lu partition(s) with sfdisk\n", sizes.size());
|
||||
return {};
|
||||
@@ -351,7 +357,8 @@ json11::Json disk_tool_t::add_partitions(vitastor_dev_info_t & devinfo, std::vec
|
||||
{
|
||||
iter++;
|
||||
// Run partprobe
|
||||
if (iter > 1 || (r = shell_exec({ "partprobe", devinfo.path }, "", NULL, NULL)) != 0)
|
||||
std::string out;
|
||||
if (iter > 1 || (r = shell_exec({ "partprobe", devinfo.path }, "", &out, NULL)) != 0)
|
||||
{
|
||||
fprintf(
|
||||
stderr, iter == 1 && r == 255
|
||||
@@ -539,7 +546,7 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
|
||||
fprintf(stderr, "Device list (positional arguments) and --hybrid are incompatible with --data_device\n");
|
||||
return 1;
|
||||
}
|
||||
return prepare_one(options);
|
||||
return prepare_one(options, options.find("hdd") != options.end() ? 1 : 0);
|
||||
}
|
||||
if (!devices.size())
|
||||
{
|
||||
@@ -549,12 +556,12 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
|
||||
options.erase("data_device");
|
||||
options.erase("meta_device");
|
||||
options.erase("journal_device");
|
||||
bool hybrid = options.find("hybrid") != options.end();
|
||||
auto devinfo = collect_devices(devices);
|
||||
if (!devinfo.size())
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
bool hybrid = options.find("hybrid") != options.end();
|
||||
uint64_t osd_per_disk = stoull_full(options["osd_per_disk"]);
|
||||
if (!osd_per_disk)
|
||||
osd_per_disk = 1;
|
||||
@@ -612,7 +619,8 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
prepare_one(options, dev.is_hdd ? 1 : 0);
|
||||
// Treat all disks as SSDs if not in the hybrid mode
|
||||
prepare_one(options, hybrid && dev.is_hdd ? 1 : 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -5,6 +5,7 @@
|
||||
|
||||
#include "disk_tool.h"
|
||||
#include "rw_blocking.h"
|
||||
#include "str_util.h"
|
||||
|
||||
struct __attribute__((__packed__)) vitastor_disk_superblock_t
|
||||
{
|
||||
@@ -54,7 +55,7 @@ int disk_tool_t::udev_import(std::string device)
|
||||
|
||||
int disk_tool_t::read_sb(std::string device)
|
||||
{
|
||||
json11::Json sb = read_osd_superblock(device);
|
||||
json11::Json sb = read_osd_superblock(device, true, options.find("force") != options.end());
|
||||
if (sb.is_null())
|
||||
{
|
||||
return 1;
|
||||
@@ -123,7 +124,7 @@ uint32_t disk_tool_t::write_osd_superblock(std::string device, json11::Json para
|
||||
return sb_size;
|
||||
}
|
||||
|
||||
json11::Json disk_tool_t::read_osd_superblock(std::string device, bool expect_exist)
|
||||
json11::Json disk_tool_t::read_osd_superblock(std::string device, bool expect_exist, bool ignore_errors)
|
||||
{
|
||||
vitastor_disk_superblock_t *sb = NULL;
|
||||
uint8_t *buf = NULL;
|
||||
@@ -144,7 +145,7 @@ json11::Json disk_tool_t::read_osd_superblock(std::string device, bool expect_ex
|
||||
goto ex;
|
||||
}
|
||||
sb = (vitastor_disk_superblock_t*)buf;
|
||||
if (sb->magic != VITASTOR_DISK_MAGIC)
|
||||
if (sb->magic != VITASTOR_DISK_MAGIC && !ignore_errors)
|
||||
{
|
||||
if (expect_exist)
|
||||
fprintf(stderr, "Invalid OSD superblock on %s: magic number mismatch\n", device.c_str());
|
||||
@@ -172,7 +173,7 @@ json11::Json disk_tool_t::read_osd_superblock(std::string device, bool expect_ex
|
||||
}
|
||||
sb = (vitastor_disk_superblock_t*)buf;
|
||||
}
|
||||
if (sb->crc32c != crc32c(0, &sb->size, sb->size - ((uint8_t*)&sb->size - buf)))
|
||||
if (sb->crc32c != crc32c(0, &sb->size, sb->size - ((uint8_t*)&sb->size - buf)) && !ignore_errors)
|
||||
{
|
||||
if (expect_exist)
|
||||
fprintf(stderr, "Invalid OSD superblock on %s: crc32 mismatch\n", device.c_str());
|
||||
@@ -186,14 +187,14 @@ json11::Json disk_tool_t::read_osd_superblock(std::string device, bool expect_ex
|
||||
goto ex;
|
||||
}
|
||||
// Validate superblock
|
||||
if (!osd_params["osd_num"].uint64_value())
|
||||
if (!osd_params["osd_num"].uint64_value() && !ignore_errors)
|
||||
{
|
||||
if (expect_exist)
|
||||
fprintf(stderr, "OSD superblock on %s lacks osd_num\n", device.c_str());
|
||||
osd_params = json11::Json();
|
||||
goto ex;
|
||||
}
|
||||
if (osd_params["data_device"].string_value() == "")
|
||||
if (osd_params["data_device"].string_value() == "" && !ignore_errors)
|
||||
{
|
||||
if (expect_exist)
|
||||
fprintf(stderr, "OSD superblock on %s lacks data_device\n", device.c_str());
|
||||
@@ -226,7 +227,7 @@ json11::Json disk_tool_t::read_osd_superblock(std::string device, bool expect_ex
|
||||
{
|
||||
device_type = "journal";
|
||||
}
|
||||
else
|
||||
else if (!ignore_errors)
|
||||
{
|
||||
if (expect_exist)
|
||||
fprintf(stderr, "Invalid OSD superblock on %s: does not refer to the device itself\n", device.c_str());
|
||||
@@ -246,7 +247,7 @@ ex:
|
||||
return osd_params;
|
||||
}
|
||||
|
||||
int disk_tool_t::systemd_start_stop_osds(std::vector<std::string> cmd, std::vector<std::string> devices)
|
||||
int disk_tool_t::systemd_start_stop_osds(const std::vector<std::string> & cmd, const std::vector<std::string> & devices)
|
||||
{
|
||||
if (!devices.size())
|
||||
{
|
||||
@@ -306,8 +307,7 @@ int disk_tool_t::exec_osd(std::string device)
|
||||
argv[i] = (char*)argstr[i].c_str();
|
||||
}
|
||||
argv[argstr.size()] = NULL;
|
||||
execvpe(osd_binary.c_str(), argv, environ);
|
||||
return 0;
|
||||
return execvpe(osd_binary.c_str(), argv, environ);
|
||||
}
|
||||
|
||||
static int check_disabled_cache(std::string dev)
|
||||
@@ -362,3 +362,140 @@ int disk_tool_t::pre_exec_osd(std::string device)
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
|
||||
{
|
||||
std::vector<uint64_t> osd_numbers;
|
||||
json11::Json::array superblocks;
|
||||
for (auto & device: devices)
|
||||
{
|
||||
json11::Json sb = read_osd_superblock(device);
|
||||
if (!sb.is_null())
|
||||
{
|
||||
uint64_t osd_num = sb["params"]["osd_num"].uint64_value();
|
||||
osd_numbers.push_back(osd_num);
|
||||
superblocks.push_back(sb);
|
||||
}
|
||||
}
|
||||
if (!osd_numbers.size())
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
std::vector<std::string> rm_osd_cli = { "vitastor-cli", "rm-osd" };
|
||||
for (auto osd_num: osd_numbers)
|
||||
{
|
||||
rm_osd_cli.push_back(std::to_string(osd_num));
|
||||
}
|
||||
// Check for data loss
|
||||
if (options["force"] != "")
|
||||
{
|
||||
rm_osd_cli.push_back("--force");
|
||||
}
|
||||
else if (options["allow_data_loss"] != "")
|
||||
{
|
||||
rm_osd_cli.push_back("--allow-data-loss");
|
||||
}
|
||||
rm_osd_cli.push_back("--dry-run");
|
||||
std::string dry_run_ignore_stdout;
|
||||
if (shell_exec(rm_osd_cli, "", &dry_run_ignore_stdout, NULL) != 0)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
// Disable & stop OSDs
|
||||
std::vector<std::string> systemctl_cli = { "systemctl", "disable", "--now" };
|
||||
for (auto osd_num: osd_numbers)
|
||||
{
|
||||
systemctl_cli.push_back("vitastor-osd@"+std::to_string(osd_num));
|
||||
}
|
||||
if (shell_exec(systemctl_cli, "", NULL, NULL) != 0)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
// Remove OSD metadata
|
||||
rm_osd_cli.pop_back();
|
||||
if (shell_exec(rm_osd_cli, "", NULL, NULL) != 0)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
// Destroy OSD superblocks
|
||||
uint8_t *buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 4096);
|
||||
for (auto & sb: superblocks)
|
||||
{
|
||||
for (auto dev_type: std::vector<std::string>{ "data", "meta", "journal" })
|
||||
{
|
||||
auto dev = sb["real_"+dev_type+"_device"].string_value();
|
||||
if (dev != "")
|
||||
{
|
||||
int fd = -1, r = open(dev.c_str(), O_DIRECT|O_RDWR);
|
||||
if (r >= 0)
|
||||
{
|
||||
fd = r;
|
||||
r = read_blocking(fd, buf, 4096);
|
||||
if (r == 4096)
|
||||
{
|
||||
// Clear magic and CRC
|
||||
memset(buf, 0, 12);
|
||||
r = lseek64(fd, 0, 0);
|
||||
if (r == 0)
|
||||
{
|
||||
r = write_blocking(fd, buf, 4096);
|
||||
if (r == 4096)
|
||||
r = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (fd >= 0)
|
||||
close(fd);
|
||||
if (r != 0)
|
||||
{
|
||||
fprintf(stderr, "Failed to clear OSD %lu %s device %s superblock: %s\n",
|
||||
sb["params"]["osd_num"].uint64_value(), dev_type.c_str(), dev.c_str(), strerror(errno));
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "OSD %lu %s device %s superblock cleared\n",
|
||||
sb["params"]["osd_num"].uint64_value(), dev_type.c_str(), dev.c_str());
|
||||
}
|
||||
if (sb["params"][dev_type+"_device"].string_value().substr(0, 22) == "/dev/disk/by-partuuid/")
|
||||
{
|
||||
// Delete the partition itself
|
||||
auto uuid_to_del = strtolower(sb["params"][dev_type+"_device"].string_value().substr(22));
|
||||
auto parent_dev = get_parent_device(dev);
|
||||
if (parent_dev == "" || parent_dev == dev)
|
||||
{
|
||||
fprintf(stderr, "Failed to delete partition %s: failed to find parent device\n", dev.c_str());
|
||||
continue;
|
||||
}
|
||||
auto pt = read_parttable("/dev/"+parent_dev);
|
||||
if (!pt.is_object())
|
||||
continue;
|
||||
json11::Json::array newpt = pt["partitions"].array_items();
|
||||
for (int i = 0; i < newpt.size(); i++)
|
||||
{
|
||||
if (strtolower(newpt[i]["uuid"].string_value()) == uuid_to_del)
|
||||
{
|
||||
auto old_part = newpt[i];
|
||||
newpt.erase(newpt.begin()+i, newpt.begin()+i+1);
|
||||
vitastor_dev_info_t devinfo = {
|
||||
.path = "/dev/"+parent_dev,
|
||||
.pt = json11::Json::object{ { "partitions", newpt } },
|
||||
};
|
||||
add_partitions(devinfo, {});
|
||||
struct stat st;
|
||||
if (stat(old_part["node"].string_value().c_str(), &st) == 0 ||
|
||||
errno != ENOENT)
|
||||
{
|
||||
std::string out;
|
||||
shell_exec({ "partprobe", "/dev/"+parent_dev }, "", &out, NULL);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
free(buf);
|
||||
buf = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
@@ -38,42 +38,6 @@ static std::map<std::string, std::string> read_vitastor_unit(std::string unit)
|
||||
return r;
|
||||
}
|
||||
|
||||
static int fix_partition_type(std::string dev_by_uuid)
|
||||
{
|
||||
auto uuid = strtolower(dev_by_uuid.substr(dev_by_uuid.rfind('/')+1));
|
||||
std::string parent_dev = get_parent_device(realpath_str(dev_by_uuid, false));
|
||||
if (parent_dev == "")
|
||||
return 1;
|
||||
auto pt = read_parttable("/dev/"+parent_dev);
|
||||
if (pt.is_null())
|
||||
return 1;
|
||||
std::string script = "label: gpt\n\n";
|
||||
for (const auto & part: pt["partitions"].array_items())
|
||||
{
|
||||
bool this_part = (strtolower(part["uuid"].string_value()) == uuid);
|
||||
if (this_part && strtolower(part["type"].string_value()) == "e7009fac-a5a1-4d72-af72-53de13059903")
|
||||
{
|
||||
// Already correct type
|
||||
return 0;
|
||||
}
|
||||
script += part["node"].string_value()+": ";
|
||||
bool first = true;
|
||||
for (const auto & kv: part.object_items())
|
||||
{
|
||||
if (kv.first != "node")
|
||||
{
|
||||
script += (first ? "" : ", ")+kv.first+"="+
|
||||
(kv.first == "type" && this_part
|
||||
? "e7009fac-a5a1-4d72-af72-53de13059903"
|
||||
: (kv.second.is_string() ? kv.second.string_value() : kv.second.dump()));
|
||||
first = false;
|
||||
}
|
||||
}
|
||||
script += "\n";
|
||||
}
|
||||
return shell_exec({ "sfdisk", "--no-reread", "--force", "/dev/"+parent_dev }, script, NULL, NULL);
|
||||
}
|
||||
|
||||
int disk_tool_t::upgrade_simple_unit(std::string unit)
|
||||
{
|
||||
if (stoull_full(unit) != 0)
|
||||
|
@@ -145,10 +145,10 @@ int disable_cache(std::string dev)
|
||||
closedir(dir);
|
||||
// Check cache_type
|
||||
scsi_disk += "/cache_type";
|
||||
std::string cache_type = read_file(scsi_disk);
|
||||
std::string cache_type = trim(read_file(scsi_disk));
|
||||
if (cache_type == "")
|
||||
return 1;
|
||||
if (cache_type == "write back")
|
||||
if (cache_type != "write through")
|
||||
{
|
||||
int fd = open(scsi_disk.c_str(), O_WRONLY);
|
||||
if (fd < 0 || write_blocking(fd, (void*)"write through", strlen("write through")) != strlen("write through"))
|
||||
@@ -239,7 +239,8 @@ int shell_exec(const std::vector<std::string> & cmd, const std::string & in, std
|
||||
{
|
||||
// Child
|
||||
dup2(child_stdin[0], 0);
|
||||
dup2(child_stdout[1], 1);
|
||||
if (out)
|
||||
dup2(child_stdout[1], 1);
|
||||
if (err)
|
||||
dup2(child_stderr[1], 2);
|
||||
close(child_stdin[0]);
|
||||
@@ -250,9 +251,7 @@ int shell_exec(const std::vector<std::string> & cmd, const std::string & in, std
|
||||
close(child_stderr[1]);
|
||||
char *argv[cmd.size()+1];
|
||||
for (int i = 0; i < cmd.size(); i++)
|
||||
{
|
||||
argv[i] = (char*)cmd[i].c_str();
|
||||
}
|
||||
argv[cmd.size()] = NULL;
|
||||
execvp(argv[0], argv);
|
||||
std::string full_cmd;
|
||||
@@ -306,10 +305,10 @@ int write_zero(int fd, uint64_t offset, uint64_t size)
|
||||
json11::Json read_parttable(std::string dev)
|
||||
{
|
||||
std::string part_dump;
|
||||
int r = shell_exec({ "sfdisk", "--dump", dev, "--json" }, "", &part_dump, NULL);
|
||||
int r = shell_exec({ "sfdisk", "--json", dev }, "", &part_dump, NULL);
|
||||
if (r == 255)
|
||||
{
|
||||
fprintf(stderr, "Error running sfdisk --dump %s --json\n", dev.c_str());
|
||||
fprintf(stderr, "Error running sfdisk --json %s\n", dev.c_str());
|
||||
return json11::Json(false);
|
||||
}
|
||||
// Decode partition table
|
||||
@@ -320,7 +319,7 @@ json11::Json read_parttable(std::string dev)
|
||||
pt = json11::Json::parse(part_dump, err);
|
||||
if (err != "")
|
||||
{
|
||||
fprintf(stderr, "sfdisk --dump %s --json returned bad JSON: %s\n", dev.c_str(), part_dump.c_str());
|
||||
fprintf(stderr, "sfdisk --json %s returned bad JSON: %s\n", dev.c_str(), part_dump.c_str());
|
||||
return json11::Json(false);
|
||||
}
|
||||
pt = pt["partitiontable"];
|
||||
@@ -354,3 +353,40 @@ uint64_t free_from_parttable(json11::Json pt)
|
||||
free *= pt["sectorsize"].uint64_value();
|
||||
return free;
|
||||
}
|
||||
|
||||
int fix_partition_type(std::string dev_by_uuid)
|
||||
{
|
||||
auto uuid = strtolower(dev_by_uuid.substr(dev_by_uuid.rfind('/')+1));
|
||||
std::string parent_dev = get_parent_device(realpath_str(dev_by_uuid, false));
|
||||
if (parent_dev == "")
|
||||
return 1;
|
||||
auto pt = read_parttable("/dev/"+parent_dev);
|
||||
if (pt.is_null() || pt.is_bool())
|
||||
return 1;
|
||||
std::string script = "label: gpt\n\n";
|
||||
for (const auto & part: pt["partitions"].array_items())
|
||||
{
|
||||
bool this_part = (strtolower(part["uuid"].string_value()) == uuid);
|
||||
if (this_part && strtolower(part["type"].string_value()) == "e7009fac-a5a1-4d72-af72-53de13059903")
|
||||
{
|
||||
// Already correct type
|
||||
return 0;
|
||||
}
|
||||
script += part["node"].string_value()+": ";
|
||||
bool first = true;
|
||||
for (const auto & kv: part.object_items())
|
||||
{
|
||||
if (kv.first != "node")
|
||||
{
|
||||
script += (first ? "" : ", ")+kv.first+"="+
|
||||
(kv.first == "type" && this_part
|
||||
? "e7009fac-a5a1-4d72-af72-53de13059903"
|
||||
: (kv.second.is_string() ? kv.second.string_value() : kv.second.dump()));
|
||||
first = false;
|
||||
}
|
||||
}
|
||||
script += "\n";
|
||||
}
|
||||
std::string out;
|
||||
return shell_exec({ "sfdisk", "--no-reread", "--force", "/dev/"+parent_dev }, script, &out, NULL);
|
||||
}
|
||||
|
@@ -871,19 +871,33 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||
pg_cfg.target_history.clear();
|
||||
pg_cfg.all_peers.clear();
|
||||
// Refuse to start PG if any set of the <osd_sets> has no live OSDs
|
||||
for (auto hist_item: value["osd_sets"].array_items())
|
||||
for (auto & hist_item: value["osd_sets"].array_items())
|
||||
{
|
||||
std::vector<osd_num_t> history_set;
|
||||
for (auto pg_osd: hist_item.array_items())
|
||||
for (auto & pg_osd: hist_item.array_items())
|
||||
{
|
||||
history_set.push_back(pg_osd.uint64_value());
|
||||
osd_num_t pg_osd_num = pg_osd.uint64_value();
|
||||
if (pg_osd_num != 0)
|
||||
{
|
||||
auto it = std::lower_bound(history_set.begin(), history_set.end(), pg_osd_num);
|
||||
if (it == history_set.end() || *it != pg_osd_num)
|
||||
history_set.insert(it, pg_osd_num);
|
||||
}
|
||||
}
|
||||
pg_cfg.target_history.push_back(history_set);
|
||||
auto it = std::lower_bound(pg_cfg.target_history.begin(), pg_cfg.target_history.end(), history_set);
|
||||
if (it == pg_cfg.target_history.end() || *it != history_set)
|
||||
pg_cfg.target_history.insert(it, history_set);
|
||||
}
|
||||
// Include these additional OSDs when peering the PG
|
||||
for (auto pg_osd: value["all_peers"].array_items())
|
||||
{
|
||||
pg_cfg.all_peers.push_back(pg_osd.uint64_value());
|
||||
osd_num_t pg_osd_num = pg_osd.uint64_value();
|
||||
if (pg_osd_num != 0)
|
||||
{
|
||||
auto it = std::lower_bound(pg_cfg.all_peers.begin(), pg_cfg.all_peers.end(), pg_osd_num);
|
||||
if (it == pg_cfg.all_peers.end() || *it != pg_osd_num)
|
||||
pg_cfg.all_peers.insert(it, pg_osd_num);
|
||||
}
|
||||
}
|
||||
// Read epoch
|
||||
pg_cfg.epoch = value["epoch"].uint64_value();
|
||||
|
@@ -80,12 +80,20 @@ void osd_messenger_t::init()
|
||||
};
|
||||
op->callback = [this, cl](osd_op_t *op)
|
||||
{
|
||||
auto cl_it = clients.find(op->peer_fd);
|
||||
if (cl_it == clients.end() || cl_it->second != cl)
|
||||
{
|
||||
// client is already dropped
|
||||
delete op;
|
||||
return;
|
||||
}
|
||||
int fail_fd = (op->reply.hdr.retval != 0 ? op->peer_fd : -1);
|
||||
auto fail_osd_num = cl->osd_num;
|
||||
cl->ping_time_remaining = 0;
|
||||
delete op;
|
||||
if (fail_fd >= 0)
|
||||
{
|
||||
fprintf(stderr, "Ping failed for OSD %lu (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
|
||||
fprintf(stderr, "Ping failed for OSD %lu (client %d), disconnecting peer\n", fail_osd_num, fail_fd);
|
||||
stop_client(fail_fd, true);
|
||||
}
|
||||
};
|
||||
@@ -149,10 +157,10 @@ void osd_messenger_t::parse_config(const json11::Json & config)
|
||||
this->rdma_max_sge = 128;
|
||||
this->rdma_max_send = config["rdma_max_send"].uint64_value();
|
||||
if (!this->rdma_max_send)
|
||||
this->rdma_max_send = 1;
|
||||
this->rdma_max_send = 8;
|
||||
this->rdma_max_recv = config["rdma_max_recv"].uint64_value();
|
||||
if (!this->rdma_max_recv)
|
||||
this->rdma_max_recv = 128;
|
||||
this->rdma_max_recv = 16;
|
||||
this->rdma_max_msg = config["rdma_max_msg"].uint64_value();
|
||||
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
|
||||
this->rdma_max_msg = 129*1024;
|
||||
|
@@ -138,6 +138,7 @@ protected:
|
||||
|
||||
std::vector<int> read_ready_clients;
|
||||
std::vector<int> write_ready_clients;
|
||||
// We don't use ringloop->set_immediate here because we may have no ringloop in client :)
|
||||
std::vector<std::function<void()>> set_immediate;
|
||||
|
||||
public:
|
||||
|
@@ -368,9 +368,8 @@ static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
|
||||
bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
|
||||
{
|
||||
auto rc = cl->rdma_conn;
|
||||
if (!cl->send_list.size() || rc->cur_send > 0)
|
||||
if (!cl->send_list.size() || rc->cur_send >= rc->max_send)
|
||||
{
|
||||
// Only send one batch at a time
|
||||
return true;
|
||||
}
|
||||
uint64_t op_size = 0, op_sge = 0;
|
||||
@@ -380,6 +379,7 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
|
||||
iovec & iov = cl->send_list[rc->send_pos];
|
||||
if (op_size >= rc->max_msg || op_sge >= rc->max_sge)
|
||||
{
|
||||
rc->send_sizes.push_back(op_size);
|
||||
try_send_rdma_wr(cl, sge, op_sge);
|
||||
op_sge = 0;
|
||||
op_size = 0;
|
||||
@@ -405,18 +405,24 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
|
||||
}
|
||||
if (op_sge > 0)
|
||||
{
|
||||
rc->send_sizes.push_back(op_size);
|
||||
try_send_rdma_wr(cl, sge, op_sge);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static void try_recv_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
|
||||
static void try_recv_rdma_wr(osd_client_t *cl, void *buf)
|
||||
{
|
||||
ibv_sge sge = {
|
||||
.addr = (uintptr_t)buf,
|
||||
.length = (uint32_t)cl->rdma_conn->max_msg,
|
||||
.lkey = cl->rdma_conn->ctx->mr->lkey,
|
||||
};
|
||||
ibv_recv_wr *bad_wr = NULL;
|
||||
ibv_recv_wr wr = {
|
||||
.wr_id = (uint64_t)(cl->peer_fd*2),
|
||||
.sg_list = sge,
|
||||
.num_sge = op_sge,
|
||||
.sg_list = &sge,
|
||||
.num_sge = 1,
|
||||
};
|
||||
int err = ibv_post_recv(cl->rdma_conn->qp, &wr, &bad_wr);
|
||||
if (err || bad_wr)
|
||||
@@ -434,12 +440,7 @@ bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
|
||||
{
|
||||
void *buf = malloc_or_die(rc->max_msg);
|
||||
rc->recv_buffers.push_back(buf);
|
||||
ibv_sge sge = {
|
||||
.addr = (uintptr_t)buf,
|
||||
.length = (uint32_t)rc->max_msg,
|
||||
.lkey = rc->ctx->mr->lkey,
|
||||
};
|
||||
try_recv_rdma_wr(cl, &sge, 1);
|
||||
try_recv_rdma_wr(cl, buf);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@@ -476,6 +477,7 @@ void osd_messenger_t::handle_rdma_events()
|
||||
continue;
|
||||
}
|
||||
osd_client_t *cl = cl_it->second;
|
||||
auto rc = cl->rdma_conn;
|
||||
if (wc[i].status != IBV_WC_SUCCESS)
|
||||
{
|
||||
fprintf(stderr, "RDMA work request failed for client %d", client_id);
|
||||
@@ -489,44 +491,59 @@ void osd_messenger_t::handle_rdma_events()
|
||||
}
|
||||
if (!is_send)
|
||||
{
|
||||
cl->rdma_conn->cur_recv--;
|
||||
if (!handle_read_buffer(cl, cl->rdma_conn->recv_buffers[0], wc[i].byte_len))
|
||||
rc->cur_recv--;
|
||||
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf], wc[i].byte_len))
|
||||
{
|
||||
// handle_read_buffer may stop the client
|
||||
continue;
|
||||
}
|
||||
free(cl->rdma_conn->recv_buffers[0]);
|
||||
cl->rdma_conn->recv_buffers.erase(cl->rdma_conn->recv_buffers.begin(), cl->rdma_conn->recv_buffers.begin()+1);
|
||||
try_recv_rdma(cl);
|
||||
try_recv_rdma_wr(cl, rc->recv_buffers[rc->next_recv_buf]);
|
||||
rc->next_recv_buf = (rc->next_recv_buf+1) % rc->recv_buffers.size();
|
||||
}
|
||||
else
|
||||
{
|
||||
cl->rdma_conn->cur_send--;
|
||||
if (!cl->rdma_conn->cur_send)
|
||||
rc->cur_send--;
|
||||
uint64_t sent_size = rc->send_sizes.at(0);
|
||||
rc->send_sizes.erase(rc->send_sizes.begin(), rc->send_sizes.begin()+1);
|
||||
int send_pos = 0, send_buf_pos = 0;
|
||||
while (sent_size > 0)
|
||||
{
|
||||
// Wait for the whole batch
|
||||
for (int i = 0; i < cl->rdma_conn->send_pos; i++)
|
||||
if (sent_size >= cl->send_list.at(send_pos).iov_len)
|
||||
{
|
||||
if (cl->outbox[i].flags & MSGR_SENDP_FREE)
|
||||
{
|
||||
// Reply fully sent
|
||||
delete cl->outbox[i].op;
|
||||
}
|
||||
sent_size -= cl->send_list[send_pos].iov_len;
|
||||
send_pos++;
|
||||
}
|
||||
if (cl->rdma_conn->send_pos > 0)
|
||||
else
|
||||
{
|
||||
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+cl->rdma_conn->send_pos);
|
||||
cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+cl->rdma_conn->send_pos);
|
||||
cl->rdma_conn->send_pos = 0;
|
||||
send_buf_pos = sent_size;
|
||||
sent_size = 0;
|
||||
}
|
||||
if (cl->rdma_conn->send_buf_pos > 0)
|
||||
{
|
||||
cl->send_list[0].iov_base = (uint8_t*)cl->send_list[0].iov_base + cl->rdma_conn->send_buf_pos;
|
||||
cl->send_list[0].iov_len -= cl->rdma_conn->send_buf_pos;
|
||||
cl->rdma_conn->send_buf_pos = 0;
|
||||
}
|
||||
try_send_rdma(cl);
|
||||
}
|
||||
assert(rc->send_pos >= send_pos);
|
||||
if (rc->send_pos == send_pos)
|
||||
{
|
||||
rc->send_buf_pos -= send_buf_pos;
|
||||
}
|
||||
rc->send_pos -= send_pos;
|
||||
for (int i = 0; i < send_pos; i++)
|
||||
{
|
||||
if (cl->outbox[i].flags & MSGR_SENDP_FREE)
|
||||
{
|
||||
// Reply fully sent
|
||||
delete cl->outbox[i].op;
|
||||
}
|
||||
}
|
||||
if (send_pos > 0)
|
||||
{
|
||||
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+send_pos);
|
||||
cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+send_pos);
|
||||
}
|
||||
if (send_buf_pos > 0)
|
||||
{
|
||||
cl->send_list[0].iov_base = (uint8_t*)cl->send_list[0].iov_base + send_buf_pos;
|
||||
cl->send_list[0].iov_len -= send_buf_pos;
|
||||
}
|
||||
try_send_rdma(cl);
|
||||
}
|
||||
}
|
||||
} while (event_count > 0);
|
||||
|
@@ -49,8 +49,9 @@ struct msgr_rdma_connection_t
|
||||
uint64_t max_msg = 0;
|
||||
|
||||
int send_pos = 0, send_buf_pos = 0;
|
||||
int recv_pos = 0, recv_buf_pos = 0;
|
||||
int next_recv_buf = 0;
|
||||
std::vector<void*> recv_buffers;
|
||||
std::vector<uint64_t> send_sizes;
|
||||
|
||||
~msgr_rdma_connection_t();
|
||||
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);
|
||||
|
@@ -163,6 +163,9 @@ void osd_t::parse_config(const json11::Json & config, bool allow_disk_params)
|
||||
recovery_queue_depth = config["recovery_queue_depth"].uint64_value();
|
||||
if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
|
||||
recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
|
||||
recovery_pg_switch = config["recovery_pg_switch"].uint64_value();
|
||||
if (recovery_pg_switch < 1)
|
||||
recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
|
||||
recovery_sync_batch = config["recovery_sync_batch"].uint64_value();
|
||||
if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
|
||||
recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
||||
|
@@ -34,6 +34,7 @@
|
||||
#define DEFAULT_AUTOSYNC_WRITES 128
|
||||
#define MAX_RECOVERY_QUEUE 2048
|
||||
#define DEFAULT_RECOVERY_QUEUE 4
|
||||
#define DEFAULT_RECOVERY_PG_SWITCH 128
|
||||
#define DEFAULT_RECOVERY_BATCH 16
|
||||
|
||||
//#define OSD_STUB
|
||||
@@ -108,6 +109,7 @@ class osd_t
|
||||
int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // "emergency" sync every 5 seconds
|
||||
int autosync_writes = DEFAULT_AUTOSYNC_WRITES;
|
||||
int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
|
||||
int recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
|
||||
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
||||
int inode_vanish_time = 60;
|
||||
int log_level = 0;
|
||||
@@ -135,7 +137,10 @@ class osd_t
|
||||
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
|
||||
int peering_state = 0;
|
||||
std::map<object_id, osd_recovery_op_t> recovery_ops;
|
||||
int recovery_done = 0;
|
||||
bool recovery_last_degraded = true;
|
||||
pool_pg_num_t recovery_last_pg;
|
||||
object_id recovery_last_oid;
|
||||
int recovery_pg_done = 0, recovery_done = 0;
|
||||
osd_op_t *autosync_op = NULL;
|
||||
|
||||
// Unstable writes
|
||||
@@ -200,7 +205,6 @@ class osd_t
|
||||
bool check_peer_config(osd_client_t *cl, json11::Json conf);
|
||||
void repeer_pgs(osd_num_t osd_num);
|
||||
void start_pg_peering(pg_t & pg);
|
||||
void submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
|
||||
void submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
|
||||
void discard_list_subop(osd_op_t *list_op);
|
||||
bool stop_pg(pg_t & pg);
|
||||
|
@@ -132,7 +132,7 @@ bool osd_t::check_peer_config(osd_client_t *cl, json11::Json conf)
|
||||
this->osd_num, immediate_commit == IMMEDIATE_ALL ? "all" : "small",
|
||||
cl->osd_num, conf["immediate_commit"].string_value().c_str()
|
||||
);
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
else if (conf["block_size"].uint64_value() != (uint64_t)this->bs_block_size)
|
||||
{
|
||||
@@ -140,7 +140,7 @@ bool osd_t::check_peer_config(osd_client_t *cl, json11::Json conf)
|
||||
"[OSD %lu] My block_size is %u, but peer OSD %lu has %lu. We can't work together\n",
|
||||
this->osd_num, this->bs_block_size, cl->osd_num, conf["block_size"].uint64_value()
|
||||
);
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
else if (conf["bitmap_granularity"].uint64_value() != (uint64_t)this->bs_bitmap_granularity)
|
||||
{
|
||||
@@ -148,7 +148,7 @@ bool osd_t::check_peer_config(osd_client_t *cl, json11::Json conf)
|
||||
"[OSD %lu] My bitmap_granularity is %u, but peer OSD %lu has %lu. We can't work together\n",
|
||||
this->osd_num, this->bs_bitmap_granularity, cl->osd_num, conf["bitmap_granularity"].uint64_value()
|
||||
);
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
@@ -382,30 +382,6 @@ void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes
|
||||
}
|
||||
}
|
||||
|
||||
void osd_t::on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num)
|
||||
{
|
||||
auto pg_it = pgs.find({
|
||||
.pool_id = pool_id,
|
||||
.pg_num = pg_num,
|
||||
});
|
||||
if (pg_it != pgs.end() && pg_it->second.epoch > pg_it->second.reported_epoch &&
|
||||
st_cli.pool_config[pool_id].pg_config[pg_num].epoch >= pg_it->second.epoch)
|
||||
{
|
||||
pg_it->second.reported_epoch = st_cli.pool_config[pool_id].pg_config[pg_num].epoch;
|
||||
object_id oid = { 0 };
|
||||
bool first = true;
|
||||
for (auto op: pg_it->second.write_queue)
|
||||
{
|
||||
if (first || oid != op.first)
|
||||
{
|
||||
oid = op.first;
|
||||
first = false;
|
||||
continue_primary_write(op.second);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void osd_t::on_load_config_hook(json11::Json::object & global_config)
|
||||
{
|
||||
json11::Json::object osd_config = this->config;
|
||||
@@ -704,13 +680,16 @@ void osd_t::apply_pg_config()
|
||||
}
|
||||
}
|
||||
}
|
||||
auto vec_all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end());
|
||||
if (currently_taken)
|
||||
{
|
||||
if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING | PG_PEERED))
|
||||
if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING))
|
||||
{
|
||||
if (pg_it->second.target_set == pg_cfg.target_set)
|
||||
if (pg_it->second.target_set == pg_cfg.target_set &&
|
||||
pg_it->second.target_history == pg_cfg.target_history &&
|
||||
pg_it->second.all_peers == vec_all_peers)
|
||||
{
|
||||
// No change in osd_set; history changes are ignored
|
||||
// No change in osd_set and history
|
||||
continue;
|
||||
}
|
||||
else
|
||||
@@ -761,7 +740,7 @@ void osd_t::apply_pg_config()
|
||||
.pg_num = pg_num,
|
||||
.reported_epoch = pg_cfg.epoch,
|
||||
.target_history = pg_cfg.target_history,
|
||||
.all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end()),
|
||||
.all_peers = vec_all_peers,
|
||||
.target_set = pg_cfg.target_set,
|
||||
};
|
||||
if (pg.scheme == POOL_SCHEME_EC)
|
||||
@@ -984,13 +963,6 @@ void osd_t::report_pg_states()
|
||||
}
|
||||
this->pgs.erase(pg_it);
|
||||
}
|
||||
else if (pg_it->second.state & PG_PEERED)
|
||||
{
|
||||
// Activate PG after PG PEERED state is reported along with history
|
||||
// (if the state wasn't changed again)
|
||||
pg_it->second.state = pg_it->second.state & ~PG_PEERED | PG_ACTIVE;
|
||||
report_pg_state(pg_it->second);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Push other PG state updates, if any
|
||||
|
@@ -226,42 +226,51 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
||||
|
||||
bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
|
||||
{
|
||||
if (!no_recovery)
|
||||
if (!pgs.size())
|
||||
{
|
||||
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
|
||||
{
|
||||
if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_DEGRADED)) == (PG_ACTIVE | PG_HAS_DEGRADED))
|
||||
{
|
||||
for (auto obj_it = pg_it->second.degraded_objects.begin(); obj_it != pg_it->second.degraded_objects.end(); obj_it++)
|
||||
{
|
||||
if (recovery_ops.find(obj_it->first) == recovery_ops.end())
|
||||
{
|
||||
op.degraded = true;
|
||||
op.oid = obj_it->first;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
if (!no_rebalance)
|
||||
// Restart scanning from the same degraded/misplaced status as the last time
|
||||
for (int tried_degraded = 0; tried_degraded < 2; tried_degraded++)
|
||||
{
|
||||
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
|
||||
if (recovery_last_degraded ? !no_recovery : !no_rebalance)
|
||||
{
|
||||
// Don't try to "recover" misplaced objects if "recovery" would make them degraded
|
||||
if ((pg_it->second.state & (PG_ACTIVE | PG_DEGRADED | PG_HAS_MISPLACED)) == (PG_ACTIVE | PG_HAS_MISPLACED))
|
||||
auto mask = recovery_last_degraded ? (PG_ACTIVE | PG_HAS_DEGRADED) : (PG_ACTIVE | PG_DEGRADED | PG_HAS_MISPLACED);
|
||||
auto check = recovery_last_degraded ? (PG_ACTIVE | PG_HAS_DEGRADED) : (PG_ACTIVE | PG_HAS_MISPLACED);
|
||||
// Restart scanning from the same PG as the last time
|
||||
for (auto pg_it = pgs.lower_bound(recovery_last_pg); pg_it != pgs.end(); pg_it++)
|
||||
{
|
||||
for (auto obj_it = pg_it->second.misplaced_objects.begin(); obj_it != pg_it->second.misplaced_objects.end(); obj_it++)
|
||||
if ((pg_it->second.state & mask) == check)
|
||||
{
|
||||
if (recovery_ops.find(obj_it->first) == recovery_ops.end())
|
||||
auto & src = recovery_last_degraded ? pg_it->second.degraded_objects : pg_it->second.misplaced_objects;
|
||||
assert(src.size() > 0);
|
||||
// Restart scanning from the next object
|
||||
for (auto obj_it = src.upper_bound(recovery_last_oid); obj_it != src.end(); obj_it++)
|
||||
{
|
||||
op.degraded = false;
|
||||
op.oid = obj_it->first;
|
||||
return true;
|
||||
if (recovery_ops.find(obj_it->first) == recovery_ops.end())
|
||||
{
|
||||
op.degraded = recovery_last_degraded;
|
||||
recovery_last_oid = op.oid = obj_it->first;
|
||||
recovery_pg_done++;
|
||||
// Switch to another PG after recovery_pg_switch operations
|
||||
// to always mix all PGs during recovery but still benefit
|
||||
// from recovery queue depth greater than 1
|
||||
if (recovery_pg_done >= recovery_pg_switch)
|
||||
{
|
||||
recovery_pg_done = 0;
|
||||
recovery_last_pg.pg_num++;
|
||||
recovery_last_oid = {};
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
recovery_last_degraded = !recovery_last_degraded;
|
||||
recovery_last_pg = {};
|
||||
recovery_last_oid = {};
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
10
src/osd_id.h
10
src/osd_id.h
@@ -28,3 +28,13 @@ inline bool operator < (const pool_pg_num_t & a, const pool_pg_num_t & b)
|
||||
{
|
||||
return a.pool_id < b.pool_id || a.pool_id == b.pool_id && a.pg_num < b.pg_num;
|
||||
}
|
||||
|
||||
inline bool operator == (const pool_pg_num_t & a, const pool_pg_num_t & b)
|
||||
{
|
||||
return a.pool_id == b.pool_id && a.pg_num == b.pg_num;
|
||||
}
|
||||
|
||||
inline bool operator != (const pool_pg_num_t & a, const pool_pg_num_t & b)
|
||||
{
|
||||
return a.pool_id != b.pool_id || a.pg_num != b.pg_num;
|
||||
}
|
||||
|
@@ -9,6 +9,8 @@
|
||||
#include "str_util.h"
|
||||
#include "osd.h"
|
||||
|
||||
#define SELF_FD -1
|
||||
|
||||
// Peering loop
|
||||
void osd_t::handle_peers()
|
||||
{
|
||||
@@ -30,7 +32,16 @@ void osd_t::handle_peers()
|
||||
if (p.second.state & PG_HAS_UNCLEAN)
|
||||
peering_state = peering_state | OSD_FLUSHING_PGS;
|
||||
else if (p.second.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED))
|
||||
{
|
||||
peering_state = peering_state | OSD_RECOVERING;
|
||||
if (p.second.state & PG_HAS_DEGRADED)
|
||||
{
|
||||
// Restart recovery from degraded objects
|
||||
recovery_last_degraded = true;
|
||||
recovery_last_pg = {};
|
||||
recovery_last_oid = {};
|
||||
}
|
||||
}
|
||||
ringloop->wakeup();
|
||||
return;
|
||||
}
|
||||
@@ -39,10 +50,6 @@ void osd_t::handle_peers()
|
||||
still = true;
|
||||
}
|
||||
}
|
||||
else if (p.second.state & PG_PEERED)
|
||||
{
|
||||
still = true;
|
||||
}
|
||||
}
|
||||
if (!still)
|
||||
{
|
||||
@@ -63,10 +70,6 @@ void osd_t::handle_peers()
|
||||
}
|
||||
still = true;
|
||||
}
|
||||
else if (p.second.state & PG_PEERED)
|
||||
{
|
||||
still = true;
|
||||
}
|
||||
}
|
||||
if (!still)
|
||||
{
|
||||
@@ -89,7 +92,7 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
|
||||
{
|
||||
auto & pg = p.second;
|
||||
bool repeer = false;
|
||||
if (pg.state & (PG_PEERING | PG_PEERED | PG_ACTIVE | PG_INCOMPLETE))
|
||||
if (pg.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
|
||||
{
|
||||
for (osd_num_t pg_osd: pg.all_peers)
|
||||
{
|
||||
@@ -300,82 +303,11 @@ void osd_t::start_pg_peering(pg_t & pg)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
submit_sync_and_list_subop(peer_osd, pg.peering_state);
|
||||
submit_list_subop(peer_osd, pg.peering_state);
|
||||
}
|
||||
ringloop->wakeup();
|
||||
}
|
||||
|
||||
void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
||||
{
|
||||
// Sync before listing, if not readonly
|
||||
if (readonly)
|
||||
{
|
||||
submit_list_subop(role_osd, ps);
|
||||
}
|
||||
else if (role_osd == this->osd_num)
|
||||
{
|
||||
// Self
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = 0;
|
||||
op->peer_fd = -1;
|
||||
clock_gettime(CLOCK_REALTIME, &op->tv_begin);
|
||||
op->bs_op = new blockstore_op_t();
|
||||
op->bs_op->opcode = BS_OP_SYNC;
|
||||
op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op)
|
||||
{
|
||||
if (bs_op->retval < 0)
|
||||
{
|
||||
printf("Local OP_SYNC failed: %d (%s)\n", bs_op->retval, strerror(-bs_op->retval));
|
||||
force_stop(1);
|
||||
return;
|
||||
}
|
||||
add_bs_subop_stats(op);
|
||||
delete op->bs_op;
|
||||
op->bs_op = NULL;
|
||||
delete op;
|
||||
ps->list_ops.erase(role_osd);
|
||||
submit_list_subop(role_osd, ps);
|
||||
};
|
||||
bs->enqueue_op(op->bs_op);
|
||||
ps->list_ops[role_osd] = op;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Peer
|
||||
auto & cl = msgr.clients.at(msgr.osd_peer_fds.at(role_osd));
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->peer_fd = cl->peer_fd;
|
||||
op->req = (osd_any_op_t){
|
||||
.sec_sync = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = msgr.next_subop_id++,
|
||||
.opcode = OSD_OP_SEC_SYNC,
|
||||
},
|
||||
},
|
||||
};
|
||||
op->callback = [this, ps, role_osd](osd_op_t *op)
|
||||
{
|
||||
if (op->reply.hdr.retval < 0)
|
||||
{
|
||||
// FIXME: Mark peer as failed and don't reconnect immediately after dropping the connection
|
||||
printf("Failed to sync OSD %lu: %ld (%s), disconnecting peer\n", role_osd, op->reply.hdr.retval, strerror(-op->reply.hdr.retval));
|
||||
int fail_fd = op->peer_fd;
|
||||
ps->list_ops.erase(role_osd);
|
||||
delete op;
|
||||
msgr.stop_client(fail_fd);
|
||||
return;
|
||||
}
|
||||
delete op;
|
||||
ps->list_ops.erase(role_osd);
|
||||
submit_list_subop(role_osd, ps);
|
||||
};
|
||||
msgr.outbox_push(op);
|
||||
ps->list_ops[role_osd] = op;
|
||||
}
|
||||
}
|
||||
|
||||
void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
||||
{
|
||||
if (role_osd == this->osd_num)
|
||||
@@ -383,7 +315,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
||||
// Self
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = 0;
|
||||
op->peer_fd = -1;
|
||||
op->peer_fd = SELF_FD;
|
||||
clock_gettime(CLOCK_REALTIME, &op->tv_begin);
|
||||
op->bs_op = new blockstore_op_t();
|
||||
op->bs_op->opcode = BS_OP_LIST;
|
||||
@@ -415,8 +347,8 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
||||
op->bs_op = NULL;
|
||||
delete op;
|
||||
};
|
||||
bs->enqueue_op(op->bs_op);
|
||||
ps->list_ops[role_osd] = op;
|
||||
bs->enqueue_op(op->bs_op);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -463,14 +395,14 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
||||
ps->list_ops.erase(role_osd);
|
||||
delete op;
|
||||
};
|
||||
msgr.outbox_push(op);
|
||||
ps->list_ops[role_osd] = op;
|
||||
msgr.outbox_push(op);
|
||||
}
|
||||
}
|
||||
|
||||
void osd_t::discard_list_subop(osd_op_t *list_op)
|
||||
{
|
||||
if (list_op->peer_fd == 0)
|
||||
if (list_op->peer_fd == SELF_FD)
|
||||
{
|
||||
// Self
|
||||
list_op->bs_op->callback = [list_op](blockstore_op_t *bs_op)
|
||||
@@ -549,13 +481,17 @@ void osd_t::report_pg_state(pg_t & pg)
|
||||
pg.history_changed = true;
|
||||
pg.target_history.clear();
|
||||
pg.all_peers = pg.target_set;
|
||||
std::sort(pg.all_peers.begin(), pg.all_peers.end());
|
||||
pg.cur_peers = pg.target_set;
|
||||
}
|
||||
else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD))
|
||||
{
|
||||
// Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers
|
||||
pg.history_changed = true;
|
||||
pg.target_history.clear();
|
||||
if (pg.target_history.size())
|
||||
{
|
||||
pg.history_changed = true;
|
||||
pg.target_history.clear();
|
||||
}
|
||||
std::set<osd_num_t> dead_peers;
|
||||
for (auto pg_osd: pg.all_peers)
|
||||
{
|
||||
@@ -572,8 +508,12 @@ void osd_t::report_pg_state(pg_t & pg)
|
||||
dead_peers.insert(pg_osd);
|
||||
}
|
||||
}
|
||||
pg.all_peers.clear();
|
||||
pg.all_peers.insert(pg.all_peers.begin(), dead_peers.begin(), dead_peers.end());
|
||||
auto new_all_peers = std::vector<osd_num_t>(dead_peers.begin(), dead_peers.end());
|
||||
if (pg.all_peers != new_all_peers)
|
||||
{
|
||||
pg.history_changed = true;
|
||||
pg.all_peers = new_all_peers;
|
||||
}
|
||||
pg.cur_peers.clear();
|
||||
for (auto pg_osd: pg.target_set)
|
||||
{
|
||||
|
@@ -86,24 +86,11 @@ void pg_obj_state_check_t::walk()
|
||||
}
|
||||
if (pg->pg_cursize < pg->pg_size)
|
||||
{
|
||||
// Report PG history and activate
|
||||
pg->state |= PG_DEGRADED | PG_PEERED;
|
||||
std::vector<osd_num_t> history_set;
|
||||
for (auto peer_osd: pg->cur_set)
|
||||
{
|
||||
if (peer_osd != 0)
|
||||
{
|
||||
history_set.push_back(peer_osd);
|
||||
}
|
||||
}
|
||||
pg->target_history.push_back(history_set);
|
||||
pg->history_changed = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Just activate
|
||||
pg->state |= PG_ACTIVE;
|
||||
// Activate as degraded
|
||||
// Current OSD set will be added into target_history on first write
|
||||
pg->state |= PG_DEGRADED;
|
||||
}
|
||||
pg->state |= PG_ACTIVE;
|
||||
if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size())
|
||||
{
|
||||
pg->state |= PG_LEFT_ON_DEAD;
|
||||
@@ -435,21 +422,44 @@ void pg_t::calc_object_states(int log_level)
|
||||
std::sort(st.list.begin(), st.list.end());
|
||||
// Walk over it and check object states
|
||||
st.walk();
|
||||
if (this->state & (PG_DEGRADED|PG_LEFT_ON_DEAD))
|
||||
if (this->state != PG_ACTIVE)
|
||||
{
|
||||
assert(epoch != (((uint64_t)1 << PG_EPOCH_BITS)-1));
|
||||
epoch++;
|
||||
}
|
||||
if (log_level > 0)
|
||||
{
|
||||
std::string osd_set_desc;
|
||||
for (auto & osd_num: target_set)
|
||||
{
|
||||
osd_set_desc += (osd_set_desc == "" ? "" : ", ")+std::to_string(osd_num);
|
||||
}
|
||||
printf(
|
||||
"[PG %u/%u] %lu clean objects on target OSD set %s\n",
|
||||
pool_id, pg_num, clean_count, osd_set_desc.c_str()
|
||||
);
|
||||
for (auto & stp: state_dict)
|
||||
{
|
||||
osd_set_desc = "";
|
||||
for (auto & loc: stp.first)
|
||||
{
|
||||
osd_set_desc += (osd_set_desc == "" ? "" : ", ")+
|
||||
std::to_string(loc.osd_num)+
|
||||
(st.replicated ? "" : "("+std::to_string(loc.role)+")")+
|
||||
(loc.outdated ? "(old)" : "");
|
||||
}
|
||||
printf("[PG %u/%u] %lu objects on OSD set %s\n", pool_id, pg_num, stp.second.object_count, osd_set_desc.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void pg_t::print_state()
|
||||
{
|
||||
printf(
|
||||
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
|
||||
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
|
||||
(state & PG_STARTING) ? "starting" : "",
|
||||
(state & PG_OFFLINE) ? "offline" : "",
|
||||
(state & PG_PEERING) ? "peering" : "",
|
||||
(state & PG_PEERED) ? "peered" : "",
|
||||
(state & PG_INCOMPLETE) ? "incomplete" : "",
|
||||
(state & PG_ACTIVE) ? "active" : "",
|
||||
(state & PG_REPEERING) ? "repeering" : "",
|
||||
|
@@ -54,5 +54,6 @@ int main(int argc, char *argv[])
|
||||
{
|
||||
printf("dev: state=%lx\n", it.second.state);
|
||||
}
|
||||
delete pg.peering_state;
|
||||
return 0;
|
||||
}
|
||||
|
@@ -228,7 +228,7 @@ resume_1:
|
||||
resume_2:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||
finish_op(cur_op, op_data->errcode);
|
||||
return;
|
||||
}
|
||||
cur_op->reply.rw.version = op_data->fact_ver;
|
||||
@@ -350,7 +350,7 @@ resume_2:
|
||||
resume_3:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
|
||||
return;
|
||||
}
|
||||
// Check CAS version
|
||||
@@ -371,7 +371,7 @@ resume_4:
|
||||
resume_5:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
|
||||
return;
|
||||
}
|
||||
// Remove version override
|
||||
|
@@ -22,9 +22,9 @@ struct osd_primary_op_data_t
|
||||
pg_num_t pg_num;
|
||||
object_id oid;
|
||||
uint64_t target_ver;
|
||||
uint64_t fact_ver = 0;
|
||||
uint64_t orig_ver = 0, fact_ver = 0;
|
||||
uint64_t scheme = 0;
|
||||
int n_subops = 0, done = 0, errors = 0, epipe = 0;
|
||||
int n_subops = 0, done = 0, errors = 0, errcode = 0;
|
||||
int degraded = 0, pg_size, pg_data_size;
|
||||
osd_rmw_stripe_t *stripes;
|
||||
osd_op_t *subops = NULL;
|
||||
|
@@ -42,7 +42,7 @@ resume_4:
|
||||
{
|
||||
free(op_data->chain_reads);
|
||||
op_data->chain_reads = NULL;
|
||||
finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||
finish_op(cur_op, op_data->errcode);
|
||||
return;
|
||||
}
|
||||
send_chained_read_results(pg, cur_op);
|
||||
@@ -297,7 +297,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
|
||||
// Fail it immediately
|
||||
subop->peer_fd = -1;
|
||||
subop->reply.hdr.retval = -EPIPE;
|
||||
subop->callback(subop);
|
||||
ringloop->set_immediate([subop]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
|
||||
}
|
||||
subop_idx++;
|
||||
}
|
||||
|
@@ -122,7 +122,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, const ui
|
||||
zero_read = -1;
|
||||
osd_op_t *subops = new osd_op_t[n_subops];
|
||||
op_data->fact_ver = 0;
|
||||
op_data->done = op_data->errors = 0;
|
||||
op_data->done = op_data->errors = op_data->errcode = 0;
|
||||
op_data->n_subops = n_subops;
|
||||
op_data->subops = subops;
|
||||
int sent = submit_primary_subop_batch(submit_type, op_data->oid.inode, op_version, op_data->stripes, osd_set, cur_op, 0, zero_read);
|
||||
@@ -235,7 +235,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
||||
// Fail it immediately
|
||||
subop->peer_fd = -1;
|
||||
subop->reply.hdr.retval = -EPIPE;
|
||||
subop->callback(subop);
|
||||
ringloop->set_immediate([subop]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
|
||||
}
|
||||
}
|
||||
i++;
|
||||
@@ -263,9 +263,11 @@ void osd_t::handle_primary_bs_subop(osd_op_t *subop)
|
||||
blockstore_op_t *bs_op = subop->bs_op;
|
||||
int expected = bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE
|
||||
|| bs_op->opcode == BS_OP_WRITE_STABLE ? bs_op->len : 0;
|
||||
if (bs_op->retval != expected && bs_op->opcode != BS_OP_READ)
|
||||
if (bs_op->retval != expected && bs_op->opcode != BS_OP_READ &&
|
||||
(bs_op->opcode != BS_OP_WRITE && bs_op->opcode != BS_OP_WRITE_STABLE ||
|
||||
bs_op->retval != -ENOSPC))
|
||||
{
|
||||
// die
|
||||
// die on any error except ENOSPC
|
||||
throw std::runtime_error(
|
||||
"local blockstore modification failed (opcode = "+std::to_string(bs_op->opcode)+
|
||||
" retval = "+std::to_string(bs_op->retval)+")"
|
||||
@@ -276,6 +278,8 @@ void osd_t::handle_primary_bs_subop(osd_op_t *subop)
|
||||
subop->reply.hdr.retval = bs_op->retval;
|
||||
if (bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE || bs_op->opcode == BS_OP_WRITE_STABLE)
|
||||
{
|
||||
subop->req.sec_rw.oid = bs_op->oid;
|
||||
subop->req.sec_rw.version = bs_op->version;
|
||||
subop->req.sec_rw.len = bs_op->len;
|
||||
subop->reply.sec_rw.version = bs_op->version;
|
||||
}
|
||||
@@ -337,14 +341,17 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
||||
osd_op_names[opcode], subop->peer_fd, retval, expected
|
||||
);
|
||||
}
|
||||
if (retval == -EPIPE)
|
||||
// Error priority: EIO > ENOSPC > EPIPE
|
||||
if (op_data->errcode == 0 || retval == -EIO ||
|
||||
retval == -ENOSPC && op_data->errcode == -EPIPE)
|
||||
{
|
||||
op_data->epipe++;
|
||||
op_data->errcode = retval;
|
||||
}
|
||||
op_data->errors++;
|
||||
if (subop->peer_fd >= 0)
|
||||
if (subop->peer_fd >= 0 && (opcode != OSD_OP_SEC_WRITE && opcode != OSD_OP_SEC_WRITE_STABLE ||
|
||||
retval != -ENOSPC))
|
||||
{
|
||||
// Drop connection on any error
|
||||
// Drop connection on any error expect ENOSPC
|
||||
msgr.stop_client(subop->peer_fd);
|
||||
}
|
||||
}
|
||||
@@ -408,7 +415,8 @@ void osd_t::cancel_primary_write(osd_op_t *cur_op)
|
||||
// are sent to peer OSDs, so we can't just throw them away.
|
||||
// Mark them with an extra EPIPE.
|
||||
cur_op->op_data->errors++;
|
||||
cur_op->op_data->epipe++;
|
||||
if (cur_op->op_data->errcode == 0)
|
||||
cur_op->op_data->errcode = -EPIPE;
|
||||
cur_op->op_data->done--; // Caution: `done` must be signed because may become -1 here
|
||||
}
|
||||
else
|
||||
@@ -460,7 +468,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
|
||||
{
|
||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||
op_data->n_subops = chunks_to_delete_count;
|
||||
op_data->done = op_data->errors = 0;
|
||||
op_data->done = op_data->errors = op_data->errcode = 0;
|
||||
if (!op_data->n_subops)
|
||||
{
|
||||
return;
|
||||
@@ -512,7 +520,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
|
||||
// Fail it immediately
|
||||
subops[i].peer_fd = -1;
|
||||
subops[i].reply.hdr.retval = -EPIPE;
|
||||
subops[i].callback(&subops[i]);
|
||||
ringloop->set_immediate([subop = &subops[i]]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -523,7 +531,7 @@ int osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
|
||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||
int n_osds = op_data->dirty_osd_count;
|
||||
osd_op_t *subops = new osd_op_t[n_osds];
|
||||
op_data->done = op_data->errors = 0;
|
||||
op_data->done = op_data->errors = op_data->errcode = 0;
|
||||
op_data->n_subops = n_osds;
|
||||
op_data->subops = subops;
|
||||
std::map<uint64_t, int>::iterator peer_it;
|
||||
@@ -579,7 +587,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||
int n_osds = op_data->unstable_write_osds->size();
|
||||
osd_op_t *subops = new osd_op_t[n_osds];
|
||||
op_data->done = op_data->errors = 0;
|
||||
op_data->done = op_data->errors = op_data->errcode = 0;
|
||||
op_data->n_subops = n_osds;
|
||||
op_data->subops = subops;
|
||||
for (int i = 0; i < n_osds; i++)
|
||||
@@ -627,7 +635,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
||||
// Fail it immediately
|
||||
subops[i].peer_fd = -1;
|
||||
subops[i].reply.hdr.retval = -EPIPE;
|
||||
subops[i].callback(&subops[i]);
|
||||
ringloop->set_immediate([subop = &subops[i]]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -240,7 +240,7 @@ resume_8:
|
||||
}
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||
finish_op(cur_op, op_data->errcode);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@@ -93,7 +93,7 @@ resume_2:
|
||||
resume_3:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
|
||||
return;
|
||||
}
|
||||
// Check CAS version
|
||||
@@ -138,6 +138,7 @@ resume_3:
|
||||
}
|
||||
}
|
||||
// Send writes
|
||||
op_data->orig_ver = op_data->fact_ver;
|
||||
if ((op_data->fact_ver >> (64-PG_EPOCH_BITS)) < pg.epoch)
|
||||
{
|
||||
op_data->target_ver = ((uint64_t)pg.epoch << (64-PG_EPOCH_BITS)) | 1;
|
||||
@@ -154,17 +155,36 @@ resume_3:
|
||||
if (pg.epoch > pg.reported_epoch)
|
||||
{
|
||||
// Report newer epoch before writing
|
||||
// FIXME: We may report only one PG state here...
|
||||
// FIXME: We don't have to report all changed PG states here
|
||||
this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
|
||||
if (pg.state != PG_ACTIVE)
|
||||
{
|
||||
// Check that current OSD set is in history and/or add it there
|
||||
std::vector<osd_num_t> history_set;
|
||||
for (auto peer_osd: pg.cur_set)
|
||||
if (peer_osd != 0)
|
||||
history_set.push_back(peer_osd);
|
||||
std::sort(history_set.begin(), history_set.end());
|
||||
auto it = std::lower_bound(pg.target_history.begin(), pg.target_history.end(), history_set);
|
||||
if (it == pg.target_history.end() || *it != history_set)
|
||||
pg.target_history.insert(it, history_set);
|
||||
}
|
||||
pg.history_changed = true;
|
||||
report_pg_states();
|
||||
resume_10:
|
||||
if (pg.epoch > pg.reported_epoch)
|
||||
{
|
||||
op_data->st = 10;
|
||||
#define PG_EPOCH_WAIT_STATE 10
|
||||
op_data->st = PG_EPOCH_WAIT_STATE;
|
||||
return;
|
||||
}
|
||||
}
|
||||
// Recheck PG state after reporting history - maybe it's already stopping/restarting
|
||||
if (pg.state & (PG_STOPPING|PG_REPEERING))
|
||||
{
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, -EPIPE);
|
||||
return;
|
||||
}
|
||||
submit_primary_subops(SUBMIT_WRITE, op_data->target_ver, pg.cur_set.data(), cur_op);
|
||||
resume_4:
|
||||
op_data->st = 4;
|
||||
@@ -177,7 +197,7 @@ resume_5:
|
||||
}
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
|
||||
return;
|
||||
}
|
||||
if (op_data->object_state)
|
||||
@@ -194,7 +214,7 @@ resume_7:
|
||||
{
|
||||
return;
|
||||
}
|
||||
if (op_data->fact_ver == 1)
|
||||
if (op_data->orig_ver == 0)
|
||||
{
|
||||
// Object is created
|
||||
pg.clean_count++;
|
||||
@@ -254,7 +274,7 @@ resume_8:
|
||||
resume_9:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
|
||||
return;
|
||||
}
|
||||
}
|
||||
@@ -286,6 +306,50 @@ continue_others:
|
||||
}
|
||||
}
|
||||
|
||||
void osd_t::on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num)
|
||||
{
|
||||
auto pg_it = pgs.find({
|
||||
.pool_id = pool_id,
|
||||
.pg_num = pg_num,
|
||||
});
|
||||
if (pg_it == pgs.end())
|
||||
{
|
||||
return;
|
||||
}
|
||||
auto & pg = pg_it->second;
|
||||
if (pg.epoch > pg.reported_epoch &&
|
||||
st_cli.pool_config[pool_id].pg_config[pg_num].epoch >= pg.epoch)
|
||||
{
|
||||
pg.reported_epoch = st_cli.pool_config[pool_id].pg_config[pg_num].epoch;
|
||||
std::vector<object_id> resume_oids;
|
||||
for (auto & op: pg.write_queue)
|
||||
{
|
||||
if (op.second->op_data->st == PG_EPOCH_WAIT_STATE)
|
||||
{
|
||||
// Run separately to prevent side effects
|
||||
resume_oids.push_back(op.first);
|
||||
}
|
||||
}
|
||||
for (auto & oid: resume_oids)
|
||||
{
|
||||
auto pg_it = pgs.find({
|
||||
.pool_id = pool_id,
|
||||
.pg_num = pg_num,
|
||||
});
|
||||
if (pg_it != pgs.end())
|
||||
{
|
||||
auto & pg = pg_it->second;
|
||||
auto op_it = pg.write_queue.find(oid);
|
||||
if (op_it != pg.write_queue.end() &&
|
||||
op_it->second->op_data->st == PG_EPOCH_WAIT_STATE)
|
||||
{
|
||||
continue_primary_write(op_it->second);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool osd_t::remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
|
||||
{
|
||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||
@@ -336,7 +400,7 @@ resume_7:
|
||||
op_data->unstable_write_osds = NULL;
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
246
src/osd_rmw.cpp
246
src/osd_rmw.cpp
@@ -154,6 +154,8 @@ struct reed_sol_matrix_t
|
||||
int refs = 0;
|
||||
int *je_data;
|
||||
uint8_t *isal_data;
|
||||
// 32 bytes = 256/8 = max pg_size/8
|
||||
std::map<std::array<uint8_t, 32>, void*> subdata;
|
||||
std::map<reed_sol_erased_t, void*> decodings;
|
||||
};
|
||||
|
||||
@@ -194,6 +196,12 @@ void use_ec(int pg_size, int pg_minsize, bool use)
|
||||
free(rs_it->second.je_data);
|
||||
if (rs_it->second.isal_data)
|
||||
free(rs_it->second.isal_data);
|
||||
for (auto sub_it = rs_it->second.subdata.begin(); sub_it != rs_it->second.subdata.end();)
|
||||
{
|
||||
void *data = sub_it->second;
|
||||
rs_it->second.subdata.erase(sub_it++);
|
||||
free(data);
|
||||
}
|
||||
for (auto dec_it = rs_it->second.decodings.begin(); dec_it != rs_it->second.decodings.end();)
|
||||
{
|
||||
void *data = dec_it->second;
|
||||
@@ -294,6 +302,47 @@ static void* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size
|
||||
return dec_it->second;
|
||||
}
|
||||
|
||||
#ifndef WITH_ISAL
|
||||
#define JERASURE_ALIGNMENT 16
|
||||
|
||||
// jerasure requires 16-byte alignment for SSE...
|
||||
// FIXME: jerasure/gf-complete should probably be patched to automatically choose non-sse version for unaligned buffers
|
||||
static void jerasure_matrix_encode_unaligned(int k, int m, int w, int *matrix, char **data_ptrs, char **coding_ptrs, int size)
|
||||
{
|
||||
bool unaligned = false;
|
||||
for (int i = 0; i < k; i++)
|
||||
if (((unsigned long)data_ptrs[i]) % JERASURE_ALIGNMENT)
|
||||
unaligned = true;
|
||||
for (int i = 0; i < m; i++)
|
||||
if (((unsigned long)coding_ptrs[i]) % JERASURE_ALIGNMENT)
|
||||
unaligned = true;
|
||||
if (!unaligned)
|
||||
{
|
||||
jerasure_matrix_encode(k, m, w, matrix, data_ptrs, coding_ptrs, size);
|
||||
return;
|
||||
}
|
||||
int aligned_size = ((size+JERASURE_ALIGNMENT-1)/JERASURE_ALIGNMENT)*JERASURE_ALIGNMENT;
|
||||
int copy_size = aligned_size*(k+m);
|
||||
char local_data[copy_size > 4096 ? 0 : copy_size];
|
||||
char *data_copy = copy_size > 4096 || (unsigned long)local_data % JERASURE_ALIGNMENT
|
||||
? (char*)memalign_or_die(JERASURE_ALIGNMENT, aligned_size*(k+m))
|
||||
: local_data;
|
||||
char *aligned_ptrs[k+m];
|
||||
for (int i = 0; i < k; i++)
|
||||
{
|
||||
memcpy(data_copy + i*aligned_size, data_ptrs[i], size);
|
||||
aligned_ptrs[i] = data_copy + i*aligned_size;
|
||||
}
|
||||
for (int i = 0; i < m; i++)
|
||||
aligned_ptrs[k+i] = data_copy + (k+i)*aligned_size;
|
||||
jerasure_matrix_encode(k, m, w, matrix, aligned_ptrs, aligned_ptrs+k, size);
|
||||
for (int i = 0; i < m; i++)
|
||||
memcpy(coding_ptrs[i], aligned_ptrs[k+i], size);
|
||||
if (copy_size > 4096 || (unsigned long)local_data % JERASURE_ALIGNMENT)
|
||||
free(data_copy);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef WITH_ISAL
|
||||
void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint32_t bitmap_size)
|
||||
{
|
||||
@@ -357,10 +406,12 @@ void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsi
|
||||
{
|
||||
data_ptrs[role] = NULL;
|
||||
}
|
||||
bool recovered = false;
|
||||
for (int role = 0; role < pg_minsize; role++)
|
||||
{
|
||||
if (stripes[role].read_end != 0 && stripes[role].missing)
|
||||
{
|
||||
recovered = true;
|
||||
if (stripes[role].read_end > stripes[role].read_start)
|
||||
{
|
||||
for (int other = 0; other < pg_size; other++)
|
||||
@@ -378,18 +429,64 @@ void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsi
|
||||
data_ptrs, data_ptrs+pg_minsize, stripes[role].read_end - stripes[role].read_start
|
||||
);
|
||||
}
|
||||
for (int other = 0; other < pg_size; other++)
|
||||
}
|
||||
}
|
||||
if (recovered && bitmap_size > 0)
|
||||
{
|
||||
bool unaligned = false;
|
||||
for (int role = 0; role < pg_size; role++)
|
||||
{
|
||||
if (stripes[role].read_end != 0)
|
||||
{
|
||||
if (stripes[other].read_end != 0 && !stripes[other].missing)
|
||||
data_ptrs[role] = (char*)stripes[role].bmp_buf;
|
||||
if (((unsigned long)stripes[role].bmp_buf) % JERASURE_ALIGNMENT)
|
||||
unaligned = true;
|
||||
}
|
||||
}
|
||||
if (!unaligned)
|
||||
{
|
||||
for (int role = 0; role < pg_minsize; role++)
|
||||
{
|
||||
if (stripes[role].read_end != 0 && stripes[role].missing)
|
||||
{
|
||||
data_ptrs[other] = (char*)(stripes[other].bmp_buf);
|
||||
jerasure_matrix_dotprod(
|
||||
pg_minsize, OSD_JERASURE_W, decoding_matrix+(role*pg_minsize), dm_ids, role,
|
||||
data_ptrs, data_ptrs+pg_minsize, bitmap_size
|
||||
);
|
||||
}
|
||||
}
|
||||
data_ptrs[role] = (char*)stripes[role].bmp_buf;
|
||||
jerasure_matrix_dotprod(
|
||||
pg_minsize, OSD_JERASURE_W, decoding_matrix+(role*pg_minsize), dm_ids, role,
|
||||
data_ptrs, data_ptrs+pg_minsize, bitmap_size
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
// jerasure_matrix_dotprod requires 16-byte alignment for SSE...
|
||||
int aligned_size = ((bitmap_size+JERASURE_ALIGNMENT-1)/JERASURE_ALIGNMENT)*JERASURE_ALIGNMENT;
|
||||
int copy_size = aligned_size*pg_size;
|
||||
char local_data[copy_size > 4096 ? 0 : copy_size];
|
||||
bool alloc_copy = copy_size > 4096 || (unsigned long)local_data % JERASURE_ALIGNMENT;
|
||||
char *data_copy = alloc_copy
|
||||
? (char*)memalign_or_die(JERASURE_ALIGNMENT, copy_size)
|
||||
: local_data;
|
||||
for (int role = 0; role < pg_size; role++)
|
||||
{
|
||||
if (stripes[role].read_end != 0)
|
||||
{
|
||||
data_ptrs[role] = data_copy + role*aligned_size;
|
||||
memcpy(data_ptrs[role], stripes[role].bmp_buf, bitmap_size);
|
||||
}
|
||||
}
|
||||
for (int role = 0; role < pg_size; role++)
|
||||
{
|
||||
if (stripes[role].read_end != 0 && stripes[role].missing)
|
||||
{
|
||||
jerasure_matrix_dotprod(
|
||||
pg_minsize, OSD_JERASURE_W, decoding_matrix+(role*pg_minsize), dm_ids, role,
|
||||
data_ptrs, data_ptrs+pg_minsize, bitmap_size
|
||||
);
|
||||
memcpy(stripes[role].bmp_buf, data_ptrs[role], bitmap_size);
|
||||
}
|
||||
}
|
||||
if (alloc_copy)
|
||||
free(data_copy);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -478,15 +575,18 @@ void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_
|
||||
{
|
||||
if (write_osd_set[role] != 0)
|
||||
{
|
||||
write_parity = 1;
|
||||
write_parity++;
|
||||
if (write_osd_set[role] != read_osd_set[role])
|
||||
{
|
||||
start = 0;
|
||||
end = chunk_size;
|
||||
for (int r2 = pg_minsize; r2 < role; r2++)
|
||||
{
|
||||
stripes[r2].write_start = start;
|
||||
stripes[r2].write_end = end;
|
||||
if (write_osd_set[r2] != 0)
|
||||
{
|
||||
stripes[r2].write_start = start;
|
||||
stripes[r2].write_end = end;
|
||||
}
|
||||
}
|
||||
}
|
||||
stripes[role].write_start = start;
|
||||
@@ -555,7 +655,7 @@ void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_
|
||||
}
|
||||
}
|
||||
// Allocate read buffers
|
||||
void *rmw_buf = alloc_read_buffer(stripes, pg_size, (write_parity ? pg_size-pg_minsize : 0) * (end - start));
|
||||
void *rmw_buf = alloc_read_buffer(stripes, pg_size, write_parity * (end - start));
|
||||
// Position write buffers
|
||||
uint64_t buf_pos = 0, in_pos = 0;
|
||||
for (int role = 0; role < pg_size; role++)
|
||||
@@ -659,7 +759,18 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
|
||||
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_granularity,
|
||||
uint32_t &start, uint32_t &end)
|
||||
{
|
||||
if (write_osd_set[pg_minsize] != 0 || write_osd_set != read_osd_set)
|
||||
bool required = false;
|
||||
for (int role = pg_minsize; role < pg_size; role++)
|
||||
{
|
||||
if (write_osd_set[role] != 0)
|
||||
{
|
||||
// Whole parity chunk is needed when we move the object
|
||||
if (write_osd_set[role] != read_osd_set[role])
|
||||
end = chunk_size;
|
||||
required = true;
|
||||
}
|
||||
}
|
||||
if (required && end != chunk_size)
|
||||
{
|
||||
// start & end are required for calc_rmw_parity
|
||||
for (int role = 0; role < pg_minsize; role++)
|
||||
@@ -670,14 +781,6 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
|
||||
end = std::max(stripes[role].req_end, end);
|
||||
}
|
||||
}
|
||||
for (int role = pg_minsize; role < pg_size; role++)
|
||||
{
|
||||
if (write_osd_set[role] != 0 && write_osd_set[role] != read_osd_set[role])
|
||||
{
|
||||
start = 0;
|
||||
end = chunk_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Set bitmap bits accordingly
|
||||
if (bitmap_granularity > 0)
|
||||
@@ -804,14 +907,57 @@ void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
|
||||
calc_rmw_parity_copy_mod(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, bitmap_granularity, start, end);
|
||||
if (end != 0)
|
||||
{
|
||||
int i;
|
||||
for (i = pg_minsize; i < pg_size; i++)
|
||||
int write_parity = 0;
|
||||
bool is_seq = true;
|
||||
for (int i = pg_size-1; i >= pg_minsize; i--)
|
||||
{
|
||||
if (write_osd_set[i] != 0)
|
||||
break;
|
||||
write_parity++;
|
||||
else if (write_parity != 0)
|
||||
is_seq = false;
|
||||
}
|
||||
if (i < pg_size)
|
||||
if (write_parity > 0)
|
||||
{
|
||||
// First get the coding matrix or sub-matrix
|
||||
void *matrix_data =
|
||||
#ifdef WITH_ISAL
|
||||
matrix->isal_data;
|
||||
#else
|
||||
matrix->je_data;
|
||||
#endif
|
||||
if (!is_seq)
|
||||
{
|
||||
// We need a coding sub-matrix
|
||||
std::array<uint8_t, 32> missing_parity = {};
|
||||
for (int i = pg_minsize; i < pg_size; i++)
|
||||
{
|
||||
if (!write_osd_set[i])
|
||||
missing_parity[(i-pg_minsize) >> 3] |= (1 << ((i-pg_minsize) & 0x7));
|
||||
}
|
||||
auto sub_it = matrix->subdata.find(missing_parity);
|
||||
if (sub_it == matrix->subdata.end())
|
||||
{
|
||||
int item_size =
|
||||
#ifdef WITH_ISAL
|
||||
32;
|
||||
#else
|
||||
sizeof(int);
|
||||
#endif
|
||||
void *subm = malloc_or_die(item_size * write_parity * pg_minsize);
|
||||
for (int i = pg_minsize, j = 0; i < pg_size; i++)
|
||||
{
|
||||
if (write_osd_set[i])
|
||||
{
|
||||
memcpy((uint8_t*)subm + item_size*pg_minsize*j, (uint8_t*)matrix_data + item_size*pg_minsize*(i-pg_minsize), item_size*pg_minsize);
|
||||
j++;
|
||||
}
|
||||
}
|
||||
matrix->subdata[missing_parity] = subm;
|
||||
matrix_data = subm;
|
||||
}
|
||||
else
|
||||
matrix_data = sub_it->second;
|
||||
}
|
||||
// Calculate new coding chunks
|
||||
buf_len_t bufs[pg_size][3];
|
||||
int nbuf[pg_size], curbuf[pg_size];
|
||||
@@ -830,57 +976,67 @@ void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
|
||||
}
|
||||
for (int i = pg_minsize; i < pg_size; i++)
|
||||
{
|
||||
bufs[i][nbuf[i]++] = { .buf = stripes[i].write_buf, .len = end-start };
|
||||
positions[i] = start;
|
||||
if (write_osd_set[i] != 0)
|
||||
{
|
||||
bufs[i][nbuf[i]++] = { .buf = stripes[i].write_buf, .len = end-start };
|
||||
positions[i] = start;
|
||||
}
|
||||
}
|
||||
uint32_t pos = start;
|
||||
while (pos < end)
|
||||
{
|
||||
uint32_t next_end = end;
|
||||
for (int i = 0; i < pg_size; i++)
|
||||
for (int i = 0, j = 0; i < pg_size; i++)
|
||||
{
|
||||
assert(curbuf[i] < nbuf[i]);
|
||||
assert(bufs[i][curbuf[i]].buf);
|
||||
data_ptrs[i] = (uint8_t*)bufs[i][curbuf[i]].buf + pos-positions[i];
|
||||
uint32_t this_end = bufs[i][curbuf[i]].len + positions[i];
|
||||
if (next_end > this_end)
|
||||
next_end = this_end;
|
||||
if (i < pg_minsize || write_osd_set[i] != 0)
|
||||
{
|
||||
assert(curbuf[i] < nbuf[i]);
|
||||
assert(bufs[i][curbuf[i]].buf);
|
||||
data_ptrs[j++] = (uint8_t*)bufs[i][curbuf[i]].buf + pos-positions[i];
|
||||
uint32_t this_end = bufs[i][curbuf[i]].len + positions[i];
|
||||
if (next_end > this_end)
|
||||
next_end = this_end;
|
||||
}
|
||||
}
|
||||
assert(next_end > pos);
|
||||
for (int i = 0; i < pg_size; i++)
|
||||
{
|
||||
uint32_t this_end = bufs[i][curbuf[i]].len + positions[i];
|
||||
if (next_end >= this_end)
|
||||
if (i < pg_minsize || write_osd_set[i] != 0)
|
||||
{
|
||||
positions[i] += bufs[i][curbuf[i]].len;
|
||||
curbuf[i]++;
|
||||
uint32_t this_end = bufs[i][curbuf[i]].len + positions[i];
|
||||
if (next_end >= this_end)
|
||||
{
|
||||
positions[i] += bufs[i][curbuf[i]].len;
|
||||
curbuf[i]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef WITH_ISAL
|
||||
ec_encode_data(
|
||||
next_end-pos, pg_minsize, pg_size-pg_minsize, matrix->isal_data,
|
||||
next_end-pos, pg_minsize, write_parity, (uint8_t*)matrix_data,
|
||||
(uint8_t**)data_ptrs, (uint8_t**)data_ptrs+pg_minsize
|
||||
);
|
||||
#else
|
||||
jerasure_matrix_encode(
|
||||
pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->je_data,
|
||||
pg_minsize, write_parity, OSD_JERASURE_W, (int*)matrix_data,
|
||||
(char**)data_ptrs, (char**)data_ptrs+pg_minsize, next_end-pos
|
||||
);
|
||||
#endif
|
||||
pos = next_end;
|
||||
}
|
||||
for (int i = 0; i < pg_size; i++)
|
||||
for (int i = 0, j = 0; i < pg_size; i++)
|
||||
{
|
||||
data_ptrs[i] = stripes[i].bmp_buf;
|
||||
if (i < pg_minsize || write_osd_set[i] != 0)
|
||||
data_ptrs[j++] = stripes[i].bmp_buf;
|
||||
}
|
||||
#ifdef WITH_ISAL
|
||||
ec_encode_data(
|
||||
bitmap_size, pg_minsize, pg_size-pg_minsize, matrix->isal_data,
|
||||
bitmap_size, pg_minsize, write_parity, (uint8_t*)matrix_data,
|
||||
(uint8_t**)data_ptrs, (uint8_t**)data_ptrs+pg_minsize
|
||||
);
|
||||
#else
|
||||
jerasure_matrix_encode(
|
||||
pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->je_data,
|
||||
jerasure_matrix_encode_unaligned(
|
||||
pg_minsize, write_parity, OSD_JERASURE_W, (int*)matrix_data,
|
||||
(char**)data_ptrs, (char**)data_ptrs+pg_minsize, bitmap_size
|
||||
);
|
||||
#endif
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user