Compare commits
61 Commits
v0.8.3
...
csi-use-vi
Author | SHA1 | Date | |
---|---|---|---|
2b4e0de397 | |||
726c6d3470 | |||
2389b49a16 | |||
fe1ee67b05 | |||
c775a52a7d | |||
e307dd13ed | |||
a7f63f7c29 | |||
1e307069bc | |||
c3e80abad7 | |||
138ffe4032 | |||
8139a34e97 | |||
4ab630b44d | |||
2c8241b7db | |||
36a7dd3671 | |||
936122bbcf | |||
1a1ba0d1e7 | |||
3d09c9cec7 | |||
3d08a1ad6c | |||
499881d81c | |||
aba93b951b | |||
d125fb1f30 | |||
9d3fd72298 | |||
8b552a01f9 | |||
0385b2f9e8 | |||
749c837045 | |||
98001d845b | |||
c96bcae74b | |||
9f4e34a8cc | |||
81fc8bb94c | |||
bc465c16de | |||
8763e9211c | |||
9e1a80bd17 | |||
3e280f2f08 | |||
fe87b4076b | |||
a38957c1a7 | |||
137309cf29 | |||
373f9d0387 | |||
c4516ea971 | |||
91065c80fc | |||
0f6b946add | |||
465cbf0b2f | |||
41add50e4e | |||
02e7be7dc9 | |||
73940adf07 | |||
e950c024d3 | |||
71d6d9f868 | |||
a4dfa519af | |||
37a6aff2fa | |||
67019f5b02 | |||
0593e5c21c | |||
998e24adf8 | |||
d7bd36dc32 | |||
cf5c562800 | |||
629200b0cc | |||
3589ccec22 | |||
8d55a1e780 | |||
65f6b3a4eb | |||
fd216eac77 | |||
61fca7c426 | |||
1c29ed80b9 | |||
68f3fb795e |
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8)
|
|||||||
|
|
||||||
project(vitastor)
|
project(vitastor)
|
||||||
|
|
||||||
set(VERSION "0.8.3")
|
set(VERSION "0.8.5")
|
||||||
|
|
||||||
add_subdirectory(src)
|
add_subdirectory(src)
|
||||||
|
@@ -48,9 +48,9 @@ Vitastor, составлены для того, чтобы убедиться,
|
|||||||
интерфейс (прокси), опять же, без открытия в свободный публичный доступ как
|
интерфейс (прокси), опять же, без открытия в свободный публичный доступ как
|
||||||
самой программы, так и прокси.
|
самой программы, так и прокси.
|
||||||
|
|
||||||
Сетевая Публичная Лицензия Vitastor разработана специально чтобы
|
Сетевая Публичная Лицензия Vitastor разработана специально, чтобы
|
||||||
гарантировать, что в таких случаях и модифицированная версия программы, и
|
гарантировать, что в таких случаях и модифицированная версия программы, и
|
||||||
прокси оставались доступными сообществу. Для этого лицензия требует от
|
прокси останутся доступными сообществу. Для этого лицензия требует от
|
||||||
операторов сетевых серверов предоставлять исходный код оригинальной программы,
|
операторов сетевых серверов предоставлять исходный код оригинальной программы,
|
||||||
а также всех других программ, взаимодействующих с ней на их серверах,
|
а также всех других программ, взаимодействующих с ней на их серверах,
|
||||||
пользователям этих серверов, на условиях свободных лицензий. Таким образом,
|
пользователям этих серверов, на условиях свободных лицензий. Таким образом,
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
VERSION ?= v0.8.3
|
VERSION ?= v0.8.5
|
||||||
|
|
||||||
all: build push
|
all: build push
|
||||||
|
|
||||||
|
@@ -49,7 +49,7 @@ spec:
|
|||||||
capabilities:
|
capabilities:
|
||||||
add: ["SYS_ADMIN"]
|
add: ["SYS_ADMIN"]
|
||||||
allowPrivilegeEscalation: true
|
allowPrivilegeEscalation: true
|
||||||
image: vitalif/vitastor-csi:v0.8.3
|
image: vitalif/vitastor-csi:v0.8.5
|
||||||
args:
|
args:
|
||||||
- "--node=$(NODE_ID)"
|
- "--node=$(NODE_ID)"
|
||||||
- "--endpoint=$(CSI_ENDPOINT)"
|
- "--endpoint=$(CSI_ENDPOINT)"
|
||||||
|
@@ -116,7 +116,7 @@ spec:
|
|||||||
privileged: true
|
privileged: true
|
||||||
capabilities:
|
capabilities:
|
||||||
add: ["SYS_ADMIN"]
|
add: ["SYS_ADMIN"]
|
||||||
image: vitalif/vitastor-csi:v0.8.3
|
image: vitalif/vitastor-csi:v0.8.5
|
||||||
args:
|
args:
|
||||||
- "--node=$(NODE_ID)"
|
- "--node=$(NODE_ID)"
|
||||||
- "--endpoint=$(CSI_ENDPOINT)"
|
- "--endpoint=$(CSI_ENDPOINT)"
|
||||||
|
@@ -5,7 +5,7 @@ package vitastor
|
|||||||
|
|
||||||
const (
|
const (
|
||||||
vitastorCSIDriverName = "csi.vitastor.io"
|
vitastorCSIDriverName = "csi.vitastor.io"
|
||||||
vitastorCSIDriverVersion = "0.8.3"
|
vitastorCSIDriverVersion = "0.8.5"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Config struct fills the parameters of request or user input
|
// Config struct fills the parameters of request or user input
|
||||||
|
@@ -10,7 +10,6 @@ import (
|
|||||||
"bytes"
|
"bytes"
|
||||||
"strconv"
|
"strconv"
|
||||||
"time"
|
"time"
|
||||||
"fmt"
|
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
@@ -21,8 +20,6 @@ import (
|
|||||||
"google.golang.org/grpc/codes"
|
"google.golang.org/grpc/codes"
|
||||||
"google.golang.org/grpc/status"
|
"google.golang.org/grpc/status"
|
||||||
|
|
||||||
"go.etcd.io/etcd/clientv3"
|
|
||||||
|
|
||||||
"github.com/container-storage-interface/spec/lib/go/csi"
|
"github.com/container-storage-interface/spec/lib/go/csi"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -114,6 +111,34 @@ func GetConnectionParams(params map[string]string) (map[string]string, []string,
|
|||||||
return ctxVars, etcdUrl, etcdPrefix
|
return ctxVars, etcdUrl, etcdPrefix
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func invokeCLI(ctxVars map[string]string, args []string) ([]byte, error)
|
||||||
|
{
|
||||||
|
if (ctxVars["etcdUrl"] != "")
|
||||||
|
{
|
||||||
|
args = append(args, "--etcd_address", ctxVars["etcdUrl"])
|
||||||
|
}
|
||||||
|
if (ctxVars["etcdPrefix"] != "")
|
||||||
|
{
|
||||||
|
args = append(args, "--etcd_prefix", ctxVars["etcdPrefix"])
|
||||||
|
}
|
||||||
|
if (ctxVars["configPath"] != "")
|
||||||
|
{
|
||||||
|
args = append(args, "--config_path", ctxVars["configPath"])
|
||||||
|
}
|
||||||
|
c := exec.Command("/usr/bin/vitastor-cli", args...)
|
||||||
|
var stdout, stderr bytes.Buffer
|
||||||
|
c.Stdout = &stdout
|
||||||
|
c.Stderr = &stderr
|
||||||
|
err := c.Run()
|
||||||
|
stderrStr := string(stderr.Bytes())
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
klog.Errorf("vitastor-cli %s failed: %s, status %s\n", strings.Join(args, " "), stderrStr, err)
|
||||||
|
return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
|
||||||
|
}
|
||||||
|
return stdout.Bytes(), nil
|
||||||
|
}
|
||||||
|
|
||||||
// Create the volume
|
// Create the volume
|
||||||
func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) (*csi.CreateVolumeResponse, error)
|
func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) (*csi.CreateVolumeResponse, error)
|
||||||
{
|
{
|
||||||
@@ -146,128 +171,41 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
|
|||||||
volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
|
volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
|
||||||
}
|
}
|
||||||
|
|
||||||
// FIXME: The following should PROBABLY be implemented externally in a management tool
|
ctxVars, etcdUrl, _ := GetConnectionParams(req.Parameters)
|
||||||
|
|
||||||
ctxVars, etcdUrl, etcdPrefix := GetConnectionParams(req.Parameters)
|
|
||||||
if (len(etcdUrl) == 0)
|
if (len(etcdUrl) == 0)
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
|
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Connect to etcd
|
// Create image using vitastor-cli
|
||||||
cli, err := clientv3.New(clientv3.Config{
|
_, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", string(volSize), "--pool", string(poolId) })
|
||||||
DialTimeout: ETCD_TIMEOUT,
|
|
||||||
Endpoints: etcdUrl,
|
|
||||||
})
|
|
||||||
if (err != nil)
|
if (err != nil)
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error())
|
if (strings.Index(err.Error(), "already exists") > 0)
|
||||||
}
|
|
||||||
defer cli.Close()
|
|
||||||
|
|
||||||
var imageId uint64 = 0
|
|
||||||
for
|
|
||||||
{
|
|
||||||
// Check if the image exists
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
|
||||||
resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
|
|
||||||
cancel()
|
|
||||||
if (err != nil)
|
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", volName })
|
||||||
}
|
|
||||||
if (len(resp.Kvs) > 0)
|
|
||||||
{
|
|
||||||
kv := resp.Kvs[0]
|
|
||||||
var v InodeIndex
|
|
||||||
err := json.Unmarshal(kv.Value, &v)
|
|
||||||
if (err != nil)
|
if (err != nil)
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
|
return nil, err
|
||||||
}
|
}
|
||||||
poolId = v.PoolId
|
var inodeCfg []InodeConfig
|
||||||
imageId = v.Id
|
err = json.Unmarshal(stat, &inodeCfg)
|
||||||
inodeCfgKey := fmt.Sprintf("/config/inode/%d/%d", poolId, imageId)
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
|
||||||
resp, err := cli.Get(ctx, etcdPrefix+inodeCfgKey)
|
|
||||||
cancel()
|
|
||||||
if (err != nil)
|
if (err != nil)
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
|
||||||
}
|
}
|
||||||
if (len(resp.Kvs) == 0)
|
if (len(inodeCfg) == 0)
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.Internal, "missing "+inodeCfgKey+" key in etcd")
|
return nil, status.Error(codes.Internal, "vitastor-cli create said that image already exists, but ls can't find it")
|
||||||
}
|
}
|
||||||
var inodeCfg InodeConfig
|
if (inodeCfg[0].Size < uint64(volSize))
|
||||||
err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
|
|
||||||
if (err != nil)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
|
|
||||||
}
|
|
||||||
if (inodeCfg.Size < uint64(volSize))
|
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
|
return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Find a free ID
|
return nil, err
|
||||||
// Create image metadata in a transaction verifying that the image doesn't exist yet AND ID is still free
|
|
||||||
maxIdKey := fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
|
||||||
resp, err := cli.Get(ctx, maxIdKey)
|
|
||||||
cancel()
|
|
||||||
if (err != nil)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
|
||||||
}
|
|
||||||
var modRev int64
|
|
||||||
var nextId uint64
|
|
||||||
if (len(resp.Kvs) > 0)
|
|
||||||
{
|
|
||||||
var err error
|
|
||||||
nextId, err = strconv.ParseUint(string(resp.Kvs[0].Value), 10, 64)
|
|
||||||
if (err != nil)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.Internal, maxIdKey+" contains invalid ID")
|
|
||||||
}
|
|
||||||
modRev = resp.Kvs[0].ModRevision
|
|
||||||
nextId++
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
nextId = 1
|
|
||||||
}
|
|
||||||
inodeIdxJson, _ := json.Marshal(InodeIndex{
|
|
||||||
Id: nextId,
|
|
||||||
PoolId: poolId,
|
|
||||||
})
|
|
||||||
inodeCfgJson, _ := json.Marshal(InodeConfig{
|
|
||||||
Name: volName,
|
|
||||||
Size: uint64(volSize),
|
|
||||||
})
|
|
||||||
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
|
||||||
txnResp, err := cli.Txn(ctx).If(
|
|
||||||
clientv3.Compare(clientv3.ModRevision(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)), "=", modRev),
|
|
||||||
clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)), "=", 0),
|
|
||||||
clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId)), "=", 0),
|
|
||||||
).Then(
|
|
||||||
clientv3.OpPut(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId), fmt.Sprintf("%d", nextId)),
|
|
||||||
clientv3.OpPut(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName), string(inodeIdxJson)),
|
|
||||||
clientv3.OpPut(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId), string(inodeCfgJson)),
|
|
||||||
).Commit()
|
|
||||||
cancel()
|
|
||||||
if (err != nil)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.Internal, "failed to commit transaction in etcd: "+err.Error())
|
|
||||||
}
|
|
||||||
if (txnResp.Succeeded)
|
|
||||||
{
|
|
||||||
imageId = nextId
|
|
||||||
break
|
|
||||||
}
|
|
||||||
// Start over if the transaction fails
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -299,97 +237,12 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
|
|||||||
}
|
}
|
||||||
volName := ctxVars["name"]
|
volName := ctxVars["name"]
|
||||||
|
|
||||||
_, etcdUrl, etcdPrefix := GetConnectionParams(ctxVars)
|
ctxVars, _, _ = GetConnectionParams(ctxVars)
|
||||||
if (len(etcdUrl) == 0)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
|
|
||||||
}
|
|
||||||
|
|
||||||
cli, err := clientv3.New(clientv3.Config{
|
_, err = invokeCLI(ctxVars, []string{ "rm", volName })
|
||||||
DialTimeout: ETCD_TIMEOUT,
|
|
||||||
Endpoints: etcdUrl,
|
|
||||||
})
|
|
||||||
if (err != nil)
|
if (err != nil)
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error())
|
return nil, err
|
||||||
}
|
|
||||||
defer cli.Close()
|
|
||||||
|
|
||||||
// Find inode by name
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
|
||||||
resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
|
|
||||||
cancel()
|
|
||||||
if (err != nil)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
|
||||||
}
|
|
||||||
if (len(resp.Kvs) == 0)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
|
|
||||||
}
|
|
||||||
var idx InodeIndex
|
|
||||||
err = json.Unmarshal(resp.Kvs[0].Value, &idx)
|
|
||||||
if (err != nil)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get inode config
|
|
||||||
inodeCfgKey := fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)
|
|
||||||
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
|
||||||
resp, err = cli.Get(ctx, inodeCfgKey)
|
|
||||||
cancel()
|
|
||||||
if (err != nil)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
|
||||||
}
|
|
||||||
if (len(resp.Kvs) == 0)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
|
|
||||||
}
|
|
||||||
var inodeCfg InodeConfig
|
|
||||||
err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
|
|
||||||
if (err != nil)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
|
|
||||||
}
|
|
||||||
|
|
||||||
// Delete inode data by invoking vitastor-cli
|
|
||||||
args := []string{
|
|
||||||
"rm-data", "--etcd_address", strings.Join(etcdUrl, ","),
|
|
||||||
"--pool", fmt.Sprintf("%d", idx.PoolId),
|
|
||||||
"--inode", fmt.Sprintf("%d", idx.Id),
|
|
||||||
}
|
|
||||||
if (ctxVars["configPath"] != "")
|
|
||||||
{
|
|
||||||
args = append(args, "--config_path", ctxVars["configPath"])
|
|
||||||
}
|
|
||||||
c := exec.Command("/usr/bin/vitastor-cli", args...)
|
|
||||||
var stderr bytes.Buffer
|
|
||||||
c.Stdout = nil
|
|
||||||
c.Stderr = &stderr
|
|
||||||
err = c.Run()
|
|
||||||
stderrStr := string(stderr.Bytes())
|
|
||||||
if (err != nil)
|
|
||||||
{
|
|
||||||
klog.Errorf("vitastor-cli rm-data failed: %s, status %s\n", stderrStr, err)
|
|
||||||
return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
|
|
||||||
}
|
|
||||||
|
|
||||||
// Delete inode config in etcd
|
|
||||||
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
|
||||||
txnResp, err := cli.Txn(ctx).Then(
|
|
||||||
clientv3.OpDelete(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)),
|
|
||||||
clientv3.OpDelete(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)),
|
|
||||||
).Commit()
|
|
||||||
cancel()
|
|
||||||
if (err != nil)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.Internal, "failed to delete keys in etcd: "+err.Error())
|
|
||||||
}
|
|
||||||
if (!txnResp.Succeeded)
|
|
||||||
{
|
|
||||||
return nil, status.Error(codes.Internal, "failed to delete keys in etcd: transaction failed")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return &csi.DeleteVolumeResponse{}, nil
|
return &csi.DeleteVolumeResponse{}, nil
|
||||||
|
4
debian/changelog
vendored
4
debian/changelog
vendored
@@ -1,10 +1,10 @@
|
|||||||
vitastor (0.8.3-1) unstable; urgency=medium
|
vitastor (0.8.5-1) unstable; urgency=medium
|
||||||
|
|
||||||
* Bugfixes
|
* Bugfixes
|
||||||
|
|
||||||
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
|
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
|
||||||
|
|
||||||
vitastor (0.8.3-1) unstable; urgency=medium
|
vitastor (0.8.5-1) unstable; urgency=medium
|
||||||
|
|
||||||
* Implement NFS proxy
|
* Implement NFS proxy
|
||||||
* Add documentation
|
* Add documentation
|
||||||
|
8
debian/vitastor.Dockerfile
vendored
8
debian/vitastor.Dockerfile
vendored
@@ -34,8 +34,8 @@ RUN set -e -x; \
|
|||||||
mkdir -p /root/packages/vitastor-$REL; \
|
mkdir -p /root/packages/vitastor-$REL; \
|
||||||
rm -rf /root/packages/vitastor-$REL/*; \
|
rm -rf /root/packages/vitastor-$REL/*; \
|
||||||
cd /root/packages/vitastor-$REL; \
|
cd /root/packages/vitastor-$REL; \
|
||||||
cp -r /root/vitastor vitastor-0.8.3; \
|
cp -r /root/vitastor vitastor-0.8.5; \
|
||||||
cd vitastor-0.8.3; \
|
cd vitastor-0.8.5; \
|
||||||
ln -s /root/fio-build/fio-*/ ./fio; \
|
ln -s /root/fio-build/fio-*/ ./fio; \
|
||||||
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||||
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
||||||
@@ -48,8 +48,8 @@ RUN set -e -x; \
|
|||||||
rm -rf a b; \
|
rm -rf a b; \
|
||||||
echo "dep:fio=$FIO" > debian/fio_version; \
|
echo "dep:fio=$FIO" > debian/fio_version; \
|
||||||
cd /root/packages/vitastor-$REL; \
|
cd /root/packages/vitastor-$REL; \
|
||||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.3.orig.tar.xz vitastor-0.8.3; \
|
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.5.orig.tar.xz vitastor-0.8.5; \
|
||||||
cd vitastor-0.8.3; \
|
cd vitastor-0.8.5; \
|
||||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
||||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||||
|
@@ -17,6 +17,7 @@ initialization and can be changed with an OSD restart.
|
|||||||
- [autosync_interval](#autosync_interval)
|
- [autosync_interval](#autosync_interval)
|
||||||
- [autosync_writes](#autosync_writes)
|
- [autosync_writes](#autosync_writes)
|
||||||
- [recovery_queue_depth](#recovery_queue_depth)
|
- [recovery_queue_depth](#recovery_queue_depth)
|
||||||
|
- [recovery_pg_switch](#recovery_pg_switch)
|
||||||
- [recovery_sync_batch](#recovery_sync_batch)
|
- [recovery_sync_batch](#recovery_sync_batch)
|
||||||
- [readonly](#readonly)
|
- [readonly](#readonly)
|
||||||
- [no_recovery](#no_recovery)
|
- [no_recovery](#no_recovery)
|
||||||
@@ -115,6 +116,16 @@ Maximum recovery operations per one primary OSD at any given moment of time.
|
|||||||
Currently it's the only parameter available to tune the speed of recovery
|
Currently it's the only parameter available to tune the speed of recovery
|
||||||
and rebalancing, but it's planned to implement more.
|
and rebalancing, but it's planned to implement more.
|
||||||
|
|
||||||
|
## recovery_pg_switch
|
||||||
|
|
||||||
|
- Type: integer
|
||||||
|
- Default: 128
|
||||||
|
|
||||||
|
Number of recovery operations before switching to recovery of the next PG.
|
||||||
|
The idea is to mix all PGs during recovery for more even space and load
|
||||||
|
distribution but still benefit from recovery queue depth greater than 1.
|
||||||
|
Degraded PGs are scanned first anyway.
|
||||||
|
|
||||||
## recovery_sync_batch
|
## recovery_sync_batch
|
||||||
|
|
||||||
- Type: integer
|
- Type: integer
|
||||||
|
@@ -18,6 +18,7 @@
|
|||||||
- [autosync_interval](#autosync_interval)
|
- [autosync_interval](#autosync_interval)
|
||||||
- [autosync_writes](#autosync_writes)
|
- [autosync_writes](#autosync_writes)
|
||||||
- [recovery_queue_depth](#recovery_queue_depth)
|
- [recovery_queue_depth](#recovery_queue_depth)
|
||||||
|
- [recovery_pg_switch](#recovery_pg_switch)
|
||||||
- [recovery_sync_batch](#recovery_sync_batch)
|
- [recovery_sync_batch](#recovery_sync_batch)
|
||||||
- [readonly](#readonly)
|
- [readonly](#readonly)
|
||||||
- [no_recovery](#no_recovery)
|
- [no_recovery](#no_recovery)
|
||||||
@@ -119,6 +120,17 @@ OSD, чтобы успевать очищать журнал - без них OSD
|
|||||||
для ускорения или замедления восстановления и перебалансировки данных, но
|
для ускорения или замедления восстановления и перебалансировки данных, но
|
||||||
в планах реализация других параметров.
|
в планах реализация других параметров.
|
||||||
|
|
||||||
|
## recovery_pg_switch
|
||||||
|
|
||||||
|
- Тип: целое число
|
||||||
|
- Значение по умолчанию: 128
|
||||||
|
|
||||||
|
Число операций восстановления перед переключением на восстановление другой PG.
|
||||||
|
Идея заключается в том, чтобы восстанавливать все PG одновременно для более
|
||||||
|
равномерного распределения места и нагрузки, но при этом всё равно выигрывать
|
||||||
|
от глубины очереди восстановления, большей, чем 1. Деградированные PG в любом
|
||||||
|
случае сканируются первыми.
|
||||||
|
|
||||||
## recovery_sync_batch
|
## recovery_sync_batch
|
||||||
|
|
||||||
- Тип: целое число
|
- Тип: целое число
|
||||||
|
@@ -102,6 +102,20 @@
|
|||||||
момент времени. На данный момент единственный параметр, который можно менять
|
момент времени. На данный момент единственный параметр, который можно менять
|
||||||
для ускорения или замедления восстановления и перебалансировки данных, но
|
для ускорения или замедления восстановления и перебалансировки данных, но
|
||||||
в планах реализация других параметров.
|
в планах реализация других параметров.
|
||||||
|
- name: recovery_pg_switch
|
||||||
|
type: int
|
||||||
|
default: 128
|
||||||
|
info: |
|
||||||
|
Number of recovery operations before switching to recovery of the next PG.
|
||||||
|
The idea is to mix all PGs during recovery for more even space and load
|
||||||
|
distribution but still benefit from recovery queue depth greater than 1.
|
||||||
|
Degraded PGs are scanned first anyway.
|
||||||
|
info_ru: |
|
||||||
|
Число операций восстановления перед переключением на восстановление другой PG.
|
||||||
|
Идея заключается в том, чтобы восстанавливать все PG одновременно для более
|
||||||
|
равномерного распределения места и нагрузки, но при этом всё равно выигрывать
|
||||||
|
от глубины очереди восстановления, большей, чем 1. Деградированные PG в любом
|
||||||
|
случае сканируются первыми.
|
||||||
- name: recovery_sync_batch
|
- name: recovery_sync_batch
|
||||||
type: int
|
type: int
|
||||||
default: 16
|
default: 16
|
||||||
|
@@ -9,7 +9,7 @@
|
|||||||
## Debian
|
## Debian
|
||||||
|
|
||||||
- Trust Vitastor package signing key:
|
- Trust Vitastor package signing key:
|
||||||
`wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
|
`wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
|
||||||
- Add Vitastor package repository to your /etc/apt/sources.list:
|
- Add Vitastor package repository to your /etc/apt/sources.list:
|
||||||
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
|
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
|
||||||
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
|
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
|
||||||
@@ -20,8 +20,8 @@
|
|||||||
## CentOS
|
## CentOS
|
||||||
|
|
||||||
- Add Vitastor package repository:
|
- Add Vitastor package repository:
|
||||||
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm`
|
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release.rpm`
|
||||||
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm`
|
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release.rpm`
|
||||||
- Enable EPEL: `yum/dnf install epel-release`
|
- Enable EPEL: `yum/dnf install epel-release`
|
||||||
- Enable additional CentOS repositories:
|
- Enable additional CentOS repositories:
|
||||||
- CentOS 7: `yum install centos-release-scl`
|
- CentOS 7: `yum install centos-release-scl`
|
||||||
|
@@ -9,7 +9,7 @@
|
|||||||
## Debian
|
## Debian
|
||||||
|
|
||||||
- Добавьте ключ репозитория Vitastor:
|
- Добавьте ключ репозитория Vitastor:
|
||||||
`wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
|
`wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
|
||||||
- Добавьте репозиторий Vitastor в /etc/apt/sources.list:
|
- Добавьте репозиторий Vitastor в /etc/apt/sources.list:
|
||||||
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
|
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
|
||||||
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
|
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
|
||||||
@@ -20,8 +20,8 @@
|
|||||||
## CentOS
|
## CentOS
|
||||||
|
|
||||||
- Добавьте в систему репозиторий Vitastor:
|
- Добавьте в систему репозиторий Vitastor:
|
||||||
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm`
|
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release.rpm`
|
||||||
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm`
|
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release.rpm`
|
||||||
- Включите EPEL: `yum/dnf install epel-release`
|
- Включите EPEL: `yum/dnf install epel-release`
|
||||||
- Включите дополнительные репозитории CentOS:
|
- Включите дополнительные репозитории CentOS:
|
||||||
- CentOS 7: `yum install centos-release-scl`
|
- CentOS 7: `yum install centos-release-scl`
|
||||||
|
@@ -70,7 +70,7 @@ For EC pools the configuration should look like the following:
|
|||||||
|
|
||||||
```
|
```
|
||||||
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
|
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
|
||||||
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}`
|
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}'
|
||||||
```
|
```
|
||||||
|
|
||||||
After you do this, one of the monitors will configure PGs and OSDs will start them.
|
After you do this, one of the monitors will configure PGs and OSDs will start them.
|
||||||
|
@@ -71,7 +71,7 @@ etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool",
|
|||||||
|
|
||||||
```
|
```
|
||||||
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
|
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
|
||||||
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}`
|
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}'
|
||||||
```
|
```
|
||||||
|
|
||||||
После этого один из мониторов должен сконфигурировать PG, а OSD должны запустить их.
|
После этого один из мониторов должен сконфигурировать PG, а OSD должны запустить их.
|
||||||
|
@@ -14,6 +14,7 @@ It supports the following commands:
|
|||||||
- [df](#df)
|
- [df](#df)
|
||||||
- [ls](#ls)
|
- [ls](#ls)
|
||||||
- [create](#create)
|
- [create](#create)
|
||||||
|
- [snap-create](#create)
|
||||||
- [modify](#modify)
|
- [modify](#modify)
|
||||||
- [rm](#rm)
|
- [rm](#rm)
|
||||||
- [flatten](#flatten)
|
- [flatten](#flatten)
|
||||||
@@ -123,6 +124,8 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
|
|||||||
|
|
||||||
Create a snapshot of image `<name>` (either form can be used). May be used live if only a single writer is active.
|
Create a snapshot of image `<name>` (either form can be used). May be used live if only a single writer is active.
|
||||||
|
|
||||||
|
See also about [how to export snapshots](qemu.en.md#exporting-snapshots).
|
||||||
|
|
||||||
## modify
|
## modify
|
||||||
|
|
||||||
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
|
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
|
||||||
|
@@ -15,6 +15,7 @@ vitastor-cli - интерфейс командной строки для адм
|
|||||||
- [df](#df)
|
- [df](#df)
|
||||||
- [ls](#ls)
|
- [ls](#ls)
|
||||||
- [create](#create)
|
- [create](#create)
|
||||||
|
- [snap-create](#create)
|
||||||
- [modify](#modify)
|
- [modify](#modify)
|
||||||
- [rm](#rm)
|
- [rm](#rm)
|
||||||
- [flatten](#flatten)
|
- [flatten](#flatten)
|
||||||
@@ -126,6 +127,8 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
|
|||||||
Создать снимок образа `<name>` (можно использовать любую форму команды). Снимок можно создавать без остановки
|
Создать снимок образа `<name>` (можно использовать любую форму команды). Снимок можно создавать без остановки
|
||||||
клиентов, если пишущий клиент максимум 1.
|
клиентов, если пишущий клиент максимум 1.
|
||||||
|
|
||||||
|
Смотрите также информацию о том, [как экспортировать снимки](qemu.ru.md#экспорт-снимков).
|
||||||
|
|
||||||
## modify
|
## modify
|
||||||
|
|
||||||
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
|
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
|
||||||
|
@@ -46,3 +46,40 @@ qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=192.168.7
|
|||||||
|
|
||||||
You can also specify `:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>`
|
You can also specify `:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>`
|
||||||
if you don't want to use inode metadata.
|
if you don't want to use inode metadata.
|
||||||
|
|
||||||
|
### Exporting snapshots
|
||||||
|
|
||||||
|
Starting with 0.8.4, you can also export individual layers (snapshot diffs) using `qemu-img`.
|
||||||
|
|
||||||
|
Suppose you have an image `testimg` and a snapshot `testimg@0` created with `vitastor-cli snap-create testimg@0`.
|
||||||
|
|
||||||
|
Then you can export the `testimg@0` snapshot and the data written to `testimg` after creating
|
||||||
|
the snapshot separately using the following commands (key points are using `skip-parents=1` and
|
||||||
|
the `-B backing_file` option):
|
||||||
|
|
||||||
|
```
|
||||||
|
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg@0' \
|
||||||
|
-O qcow2 testimg_0.qcow2
|
||||||
|
|
||||||
|
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg:skip-parents=1' \
|
||||||
|
-O qcow2 -o 'cluster_size=4k' -B testimg_0.qcow2 testimg.qcow2
|
||||||
|
```
|
||||||
|
|
||||||
|
In fact, with `cluster_size=4k` any QCOW2 file can be used in `-B` instead of `testimg_0.qcow2`, even an empty one.
|
||||||
|
|
||||||
|
QCOW2 `cluster_size=4k` option is required if you want `testimg.qcow2` to contain only the data
|
||||||
|
overwritten **exactly** in the child layer. With the default 64 KB QCOW2 cluster size you'll
|
||||||
|
get a bit of extra data from parent layers, i.e. a 4 KB overwrite will result in `testimg.qcow2`
|
||||||
|
containing 64 KB of data. And this extra data will be taken by `qemu-img` from the file passed
|
||||||
|
in the `-B` option, so you really need a 4 KB cluster if you use an empty image in `-B`.
|
||||||
|
|
||||||
|
After this procedure you'll get two chained QCOW2 images. To detach `testimg.qcow2` from
|
||||||
|
its parent, run:
|
||||||
|
|
||||||
|
```
|
||||||
|
qemu-img rebase -u -b '' testimg.qcow2
|
||||||
|
```
|
||||||
|
|
||||||
|
This can be used for backups. Just note that exporting an image that is currently being written to
|
||||||
|
is of course unsafe and doesn't produce a consistent result, so only export snapshots if you do this
|
||||||
|
on a live VM.
|
||||||
|
@@ -50,3 +50,40 @@ qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.
|
|||||||
|
|
||||||
Если вы не хотите обращаться к образу по имени, вместо `:image=<IMAGE>` можно указать номер пула, номер инода и размер:
|
Если вы не хотите обращаться к образу по имени, вместо `:image=<IMAGE>` можно указать номер пула, номер инода и размер:
|
||||||
`:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
|
`:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
|
||||||
|
|
||||||
|
### Экспорт снимков
|
||||||
|
|
||||||
|
Начиная с 0.8.4 вы можете экспортировать отдельные слои (изменения в снимках) с помощью `qemu-img`.
|
||||||
|
|
||||||
|
Допустим, что у вас есть образ `testimg` и его снимок `testimg@0`, созданный с помощью `vitastor-cli snap-create testimg@0`.
|
||||||
|
|
||||||
|
Тогда вы можете выгрузить снимок `testimg@0` и данные, изменённые в `testimg` после создания снимка, отдельно,
|
||||||
|
с помощью следующих команд (ключевые моменты - использование `skip-parents=1` и опции `-B backing_file.qcow2`):
|
||||||
|
|
||||||
|
```
|
||||||
|
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg@0' \
|
||||||
|
-O qcow2 testimg_0.qcow2
|
||||||
|
|
||||||
|
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg:skip-parents=1' \
|
||||||
|
-O qcow2 -o 'cluster_size=4k' -B testimg_0.qcow2 testimg.qcow2
|
||||||
|
```
|
||||||
|
|
||||||
|
На самом деле, с `cluster_size=4k` вместо `-B testimg_0.qcow2` можно использовать любой qcow2-файл,
|
||||||
|
даже пустой.
|
||||||
|
|
||||||
|
Опция QCOW2 `cluster_size=4k` нужна, если вы хотите, чтобы `testimg.qcow2` содержал **в точности**
|
||||||
|
данные, перезаписанные в дочернем слое. С размером кластера QCOW2 по умолчанию, составляющим 64 КБ,
|
||||||
|
вы получите немного "лишних" данных из родительских слоёв - перезапись 4 КБ будет приводить к тому,
|
||||||
|
что в `testimg.qcow2` будет появляться 64 КБ данных. Причём "лишние" данные qemu-img будет брать
|
||||||
|
как раз из файла, указанного в опции `-B`, так что если там указан пустой образ, кластер обязан быть 4 КБ.
|
||||||
|
|
||||||
|
После данной процедуры вы получите два QCOW2-образа, связанных в цепочку. Чтобы "отцепить" образ
|
||||||
|
`testimg.qcow2` от базового, выполните:
|
||||||
|
|
||||||
|
```
|
||||||
|
qemu-img rebase -u -b '' testimg.qcow2
|
||||||
|
```
|
||||||
|
|
||||||
|
Это можно использовать для резервного копирования. Только помните, что экспортировать образ, в который
|
||||||
|
в то же время идёт запись, небезопасно - результат чтения не будет целостным. Так что если вы работаете
|
||||||
|
с активными виртуальными машинами, экспортируйте только их снимки, но не сам образ.
|
||||||
|
2
json11
2
json11
Submodule json11 updated: 52a3af664f...fd37016cf8
@@ -21,7 +21,7 @@ function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_p
|
|||||||
{
|
{
|
||||||
for (const pg of oh.osd_sets)
|
for (const pg of oh.osd_sets)
|
||||||
{
|
{
|
||||||
nh.osd_sets[pg.join(' ')] = pg;
|
nh.osd_sets[pg.join(' ')] = pg.map(osd_num => Number(osd_num));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (oh && oh.all_peers && oh.all_peers.length)
|
if (oh && oh.all_peers && oh.all_peers.length)
|
||||||
|
@@ -550,8 +550,8 @@ function random_combinations(osd_tree, pg_size, count, ordered)
|
|||||||
seed ^= seed << 5;
|
seed ^= seed << 5;
|
||||||
return seed + 2147483648;
|
return seed + 2147483648;
|
||||||
};
|
};
|
||||||
const hosts = Object.keys(osd_tree).sort();
|
|
||||||
const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
|
const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
|
||||||
|
const hosts = Object.keys(osd_tree).sort().filter(h => osds[h].length > 0);
|
||||||
const r = {};
|
const r = {};
|
||||||
// Generate random combinations including each OSD at least once
|
// Generate random combinations including each OSD at least once
|
||||||
for (let h = 0; h < hosts.length; h++)
|
for (let h = 0; h < hosts.length; h++)
|
||||||
|
@@ -79,7 +79,7 @@ StartLimitInterval=0
|
|||||||
RestartSec=10
|
RestartSec=10
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=local.target
|
WantedBy=multi-user.target
|
||||||
`);
|
`);
|
||||||
await system(`useradd etcd`);
|
await system(`useradd etcd`);
|
||||||
await system(`systemctl daemon-reload`);
|
await system(`systemctl daemon-reload`);
|
||||||
|
49
mon/mon.js
49
mon/mon.js
@@ -70,9 +70,9 @@ const etcd_tree = {
|
|||||||
rdma_gid_index: 0,
|
rdma_gid_index: 0,
|
||||||
rdma_mtu: 4096,
|
rdma_mtu: 4096,
|
||||||
rdma_max_sge: 128,
|
rdma_max_sge: 128,
|
||||||
rdma_max_send: 32,
|
rdma_max_send: 64,
|
||||||
rdma_max_recv: 8,
|
rdma_max_recv: 128,
|
||||||
rdma_max_msg: 1048576,
|
rdma_max_msg: 132096,
|
||||||
log_level: 0,
|
log_level: 0,
|
||||||
block_size: 131072,
|
block_size: 131072,
|
||||||
disk_alignment: 4096,
|
disk_alignment: 4096,
|
||||||
@@ -107,6 +107,10 @@ const etcd_tree = {
|
|||||||
slow_log_interval: 10,
|
slow_log_interval: 10,
|
||||||
inode_vanish_time: 60,
|
inode_vanish_time: 60,
|
||||||
osd_memlock: false,
|
osd_memlock: false,
|
||||||
|
scrub_interval: '30d', // 1s/1m/1h/1d
|
||||||
|
scrub_queue_depth: 1,
|
||||||
|
scrub_sleep: 0, // milliseconds
|
||||||
|
scrub_list_limit: 1000, // objects to list on one scrub iteration
|
||||||
// blockstore - fixed in superblock
|
// blockstore - fixed in superblock
|
||||||
block_size,
|
block_size,
|
||||||
disk_alignment,
|
disk_alignment,
|
||||||
@@ -168,6 +172,8 @@ const etcd_tree = {
|
|||||||
osd_tags?: 'nvme' | [ 'nvme', ... ],
|
osd_tags?: 'nvme' | [ 'nvme', ... ],
|
||||||
// prefer to put primary on OSD with these tags
|
// prefer to put primary on OSD with these tags
|
||||||
primary_affinity_tags?: 'nvme' | [ 'nvme', ... ],
|
primary_affinity_tags?: 'nvme' | [ 'nvme', ... ],
|
||||||
|
// scrub interval
|
||||||
|
scrub_interval?: '30d',
|
||||||
},
|
},
|
||||||
...
|
...
|
||||||
}, */
|
}, */
|
||||||
@@ -261,9 +267,9 @@ const etcd_tree = {
|
|||||||
/* <pool_id>: {
|
/* <pool_id>: {
|
||||||
<pg_id>: {
|
<pg_id>: {
|
||||||
primary: osd_num_t,
|
primary: osd_num_t,
|
||||||
state: ("starting"|"peering"|"peered"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
|
state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
|
||||||
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
|
"degraded"|"has_corrupted"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
|
||||||
"has_invalid"|"left_on_dead")[],
|
"has_invalid"|"left_on_dead"|"scrubbing")[],
|
||||||
}
|
}
|
||||||
}, */
|
}, */
|
||||||
},
|
},
|
||||||
@@ -285,6 +291,7 @@ const etcd_tree = {
|
|||||||
osd_sets: osd_num_t[][],
|
osd_sets: osd_num_t[][],
|
||||||
all_peers: osd_num_t[],
|
all_peers: osd_num_t[],
|
||||||
epoch: uint64_t,
|
epoch: uint64_t,
|
||||||
|
scrub_ts: uint64_t,
|
||||||
},
|
},
|
||||||
}, */
|
}, */
|
||||||
},
|
},
|
||||||
@@ -663,12 +670,15 @@ class Mon
|
|||||||
async save_last_clean()
|
async save_last_clean()
|
||||||
{
|
{
|
||||||
// last_clean_pgs is used to avoid extra data move when observing a series of changes in the cluster
|
// last_clean_pgs is used to avoid extra data move when observing a series of changes in the cluster
|
||||||
|
const new_clean_pgs = { items: {} };
|
||||||
|
next_pool:
|
||||||
for (const pool_id in this.state.config.pools)
|
for (const pool_id in this.state.config.pools)
|
||||||
{
|
{
|
||||||
|
new_clean_pgs.items[pool_id] = (this.state.history.last_clean_pgs.items||{})[pool_id];
|
||||||
const pool_cfg = this.state.config.pools[pool_id];
|
const pool_cfg = this.state.config.pools[pool_id];
|
||||||
if (!this.validate_pool_cfg(pool_id, pool_cfg, false))
|
if (!this.validate_pool_cfg(pool_id, pool_cfg, false))
|
||||||
{
|
{
|
||||||
continue;
|
continue next_pool;
|
||||||
}
|
}
|
||||||
for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++)
|
for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++)
|
||||||
{
|
{
|
||||||
@@ -677,17 +687,18 @@ class Mon
|
|||||||
!(this.state.pg.state[pool_id][pg_num].state instanceof Array))
|
!(this.state.pg.state[pool_id][pg_num].state instanceof Array))
|
||||||
{
|
{
|
||||||
// Unclean
|
// Unclean
|
||||||
return;
|
continue next_pool;
|
||||||
}
|
}
|
||||||
let st = this.state.pg.state[pool_id][pg_num].state.join(',');
|
let st = this.state.pg.state[pool_id][pg_num].state.join(',');
|
||||||
if (st != 'active' && st != 'active,left_on_dead' && st != 'left_on_dead,active')
|
if (st != 'active' && st != 'active,left_on_dead' && st != 'left_on_dead,active')
|
||||||
{
|
{
|
||||||
// Unclean
|
// Unclean
|
||||||
return;
|
continue next_pool;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
new_clean_pgs.items[pool_id] = this.state.config.pgs.items[pool_id];
|
||||||
}
|
}
|
||||||
this.state.history.last_clean_pgs = JSON.parse(JSON.stringify(this.state.config.pgs));
|
this.state.history.last_clean_pgs = new_clean_pgs;
|
||||||
await this.etcd_call('/kv/txn', {
|
await this.etcd_call('/kv/txn', {
|
||||||
success: [ { requestPut: {
|
success: [ { requestPut: {
|
||||||
key: b64(this.etcd_prefix+'/history/last_clean_pgs'),
|
key: b64(this.etcd_prefix+'/history/last_clean_pgs'),
|
||||||
@@ -1374,16 +1385,14 @@ class Mon
|
|||||||
// This is required for multiple change events to trigger at most 1 recheck in 1s
|
// This is required for multiple change events to trigger at most 1 recheck in 1s
|
||||||
schedule_recheck()
|
schedule_recheck()
|
||||||
{
|
{
|
||||||
if (this.recheck_timer)
|
if (!this.recheck_timer)
|
||||||
{
|
{
|
||||||
clearTimeout(this.recheck_timer);
|
this.recheck_timer = setTimeout(() =>
|
||||||
this.recheck_timer = null;
|
{
|
||||||
|
this.recheck_timer = null;
|
||||||
|
this.recheck_pgs().catch(this.die);
|
||||||
|
}, this.config.mon_change_timeout || 1000);
|
||||||
}
|
}
|
||||||
this.recheck_timer = setTimeout(() =>
|
|
||||||
{
|
|
||||||
this.recheck_timer = null;
|
|
||||||
this.recheck_pgs().catch(this.die);
|
|
||||||
}, this.config.mon_change_timeout || 1000);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
sum_op_stats(timestamp, prev_stats)
|
sum_op_stats(timestamp, prev_stats)
|
||||||
@@ -1719,11 +1728,11 @@ class Mon
|
|||||||
else if (key_parts[0] === 'osd' && key_parts[1] === 'stats')
|
else if (key_parts[0] === 'osd' && key_parts[1] === 'stats')
|
||||||
{
|
{
|
||||||
// Recheck OSD tree on OSD addition/deletion
|
// Recheck OSD tree on OSD addition/deletion
|
||||||
if ((!old) != (!kv.value) || old && kv.value && (old.size != kv.value.size || old.time != kv.value.time))
|
if ((!old) != (!kv.value) || old && kv.value && old.size != kv.value.size)
|
||||||
{
|
{
|
||||||
this.schedule_recheck();
|
this.schedule_recheck();
|
||||||
}
|
}
|
||||||
// Recheck PGs <osd_out_time> later
|
// Recheck PGs <osd_out_time> after last OSD statistics report
|
||||||
this.schedule_next_recheck_at(
|
this.schedule_next_recheck_at(
|
||||||
!this.state.osd.stats[key[2]] ? 0 : this.state.osd.stats[key[2]].time+this.config.osd_out_time
|
!this.state.osd.stats[key[2]] ? 0 : this.state.osd.stats[key[2]].time+this.config.osd_out_time
|
||||||
);
|
);
|
||||||
|
@@ -16,6 +16,11 @@ use PVE::Tools qw(run_command);
|
|||||||
|
|
||||||
use base qw(PVE::Storage::Plugin);
|
use base qw(PVE::Storage::Plugin);
|
||||||
|
|
||||||
|
if (@PVE::Storage::Plugin::SHARED_STORAGE)
|
||||||
|
{
|
||||||
|
push @PVE::Storage::Plugin::SHARED_STORAGE, 'vitastor';
|
||||||
|
}
|
||||||
|
|
||||||
sub api
|
sub api
|
||||||
{
|
{
|
||||||
# Trick it :)
|
# Trick it :)
|
||||||
@@ -133,9 +138,11 @@ sub properties
|
|||||||
sub options
|
sub options
|
||||||
{
|
{
|
||||||
return {
|
return {
|
||||||
|
shared => { optional => 1 },
|
||||||
|
content => { optional => 1 },
|
||||||
nodes => { optional => 1 },
|
nodes => { optional => 1 },
|
||||||
disable => { optional => 1 },
|
disable => { optional => 1 },
|
||||||
vitastor_etcd_address => { optional => 1},
|
vitastor_etcd_address => { optional => 1 },
|
||||||
vitastor_etcd_prefix => { optional => 1 },
|
vitastor_etcd_prefix => { optional => 1 },
|
||||||
vitastor_config_path => { optional => 1 },
|
vitastor_config_path => { optional => 1 },
|
||||||
vitastor_prefix => { optional => 1 },
|
vitastor_prefix => { optional => 1 },
|
||||||
|
@@ -50,7 +50,7 @@ from cinder.volume import configuration
|
|||||||
from cinder.volume import driver
|
from cinder.volume import driver
|
||||||
from cinder.volume import volume_utils
|
from cinder.volume import volume_utils
|
||||||
|
|
||||||
VERSION = '0.8.3'
|
VERSION = '0.8.5'
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@@ -25,4 +25,4 @@ rm fio
|
|||||||
mv fio-copy fio
|
mv fio-copy fio
|
||||||
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||||
tar --transform 's#^#vitastor-0.8.3/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.3$(rpm --eval '%dist').tar.gz *
|
tar --transform 's#^#vitastor-0.8.5/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.5$(rpm --eval '%dist').tar.gz *
|
||||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
|||||||
RUN set -e; \
|
RUN set -e; \
|
||||||
cd /root/vitastor/rpm; \
|
cd /root/vitastor/rpm; \
|
||||||
sh build-tarball.sh; \
|
sh build-tarball.sh; \
|
||||||
cp /root/vitastor-0.8.3.el7.tar.gz ~/rpmbuild/SOURCES; \
|
cp /root/vitastor-0.8.5.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||||
cd ~/rpmbuild/SPECS/; \
|
cd ~/rpmbuild/SPECS/; \
|
||||||
rpmbuild -ba vitastor.spec; \
|
rpmbuild -ba vitastor.spec; \
|
||||||
|
@@ -1,11 +1,11 @@
|
|||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 0.8.3
|
Version: 0.8.5
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-0.8.3.el7.tar.gz
|
Source0: vitastor-0.8.5.el7.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
@@ -35,6 +35,7 @@ Summary: Vitastor - OSD
|
|||||||
Requires: libJerasure2
|
Requires: libJerasure2
|
||||||
Requires: libisa-l
|
Requires: libisa-l
|
||||||
Requires: liburing >= 0.6
|
Requires: liburing >= 0.6
|
||||||
|
Requires: liburing < 2
|
||||||
Requires: vitastor-client = %{version}-%{release}
|
Requires: vitastor-client = %{version}-%{release}
|
||||||
Requires: util-linux
|
Requires: util-linux
|
||||||
Requires: parted
|
Requires: parted
|
||||||
@@ -59,6 +60,7 @@ scheduling cluster-level operations.
|
|||||||
%package -n vitastor-client
|
%package -n vitastor-client
|
||||||
Summary: Vitastor - client
|
Summary: Vitastor - client
|
||||||
Requires: liburing >= 0.6
|
Requires: liburing >= 0.6
|
||||||
|
Requires: liburing < 2
|
||||||
|
|
||||||
|
|
||||||
%description -n vitastor-client
|
%description -n vitastor-client
|
||||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
|||||||
RUN set -e; \
|
RUN set -e; \
|
||||||
cd /root/vitastor/rpm; \
|
cd /root/vitastor/rpm; \
|
||||||
sh build-tarball.sh; \
|
sh build-tarball.sh; \
|
||||||
cp /root/vitastor-0.8.3.el8.tar.gz ~/rpmbuild/SOURCES; \
|
cp /root/vitastor-0.8.5.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||||
cd ~/rpmbuild/SPECS/; \
|
cd ~/rpmbuild/SPECS/; \
|
||||||
rpmbuild -ba vitastor.spec; \
|
rpmbuild -ba vitastor.spec; \
|
||||||
|
@@ -1,11 +1,11 @@
|
|||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 0.8.3
|
Version: 0.8.5
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-0.8.3.el8.tar.gz
|
Source0: vitastor-0.8.5.el8.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
@@ -34,6 +34,7 @@ Summary: Vitastor - OSD
|
|||||||
Requires: libJerasure2
|
Requires: libJerasure2
|
||||||
Requires: libisa-l
|
Requires: libisa-l
|
||||||
Requires: liburing >= 0.6
|
Requires: liburing >= 0.6
|
||||||
|
Requires: liburing < 2
|
||||||
Requires: vitastor-client = %{version}-%{release}
|
Requires: vitastor-client = %{version}-%{release}
|
||||||
Requires: util-linux
|
Requires: util-linux
|
||||||
Requires: parted
|
Requires: parted
|
||||||
@@ -57,6 +58,7 @@ scheduling cluster-level operations.
|
|||||||
%package -n vitastor-client
|
%package -n vitastor-client
|
||||||
Summary: Vitastor - client
|
Summary: Vitastor - client
|
||||||
Requires: liburing >= 0.6
|
Requires: liburing >= 0.6
|
||||||
|
Requires: liburing < 2
|
||||||
|
|
||||||
|
|
||||||
%description -n vitastor-client
|
%description -n vitastor-client
|
||||||
|
@@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 2.8)
|
|||||||
project(vitastor)
|
project(vitastor)
|
||||||
|
|
||||||
include(GNUInstallDirs)
|
include(GNUInstallDirs)
|
||||||
|
include(CTest)
|
||||||
|
|
||||||
set(WITH_QEMU false CACHE BOOL "Build QEMU driver inside Vitastor source tree")
|
set(WITH_QEMU false CACHE BOOL "Build QEMU driver inside Vitastor source tree")
|
||||||
set(WITH_FIO true CACHE BOOL "Build FIO driver")
|
set(WITH_FIO true CACHE BOOL "Build FIO driver")
|
||||||
@@ -15,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
|||||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
add_definitions(-DVERSION="0.8.3")
|
add_definitions(-DVERSION="0.8.5")
|
||||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
|
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
|
||||||
if (${WITH_ASAN})
|
if (${WITH_ASAN})
|
||||||
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
||||||
@@ -55,6 +56,14 @@ if (ISAL_LIBRARIES)
|
|||||||
add_definitions(-DWITH_ISAL)
|
add_definitions(-DWITH_ISAL)
|
||||||
endif (ISAL_LIBRARIES)
|
endif (ISAL_LIBRARIES)
|
||||||
|
|
||||||
|
add_custom_target(build_tests)
|
||||||
|
add_custom_target(test
|
||||||
|
COMMAND
|
||||||
|
echo leak:tcmalloc > ${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt &&
|
||||||
|
env LSAN_OPTIONS=suppressions=${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt ${CMAKE_CTEST_COMMAND}
|
||||||
|
)
|
||||||
|
add_dependencies(test build_tests)
|
||||||
|
|
||||||
include_directories(
|
include_directories(
|
||||||
../
|
../
|
||||||
/usr/include/jerasure
|
/usr/include/jerasure
|
||||||
@@ -102,7 +111,7 @@ target_compile_options(vitastor_common PUBLIC -fPIC)
|
|||||||
add_executable(vitastor-osd
|
add_executable(vitastor-osd
|
||||||
osd_main.cpp osd.cpp osd_secondary.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp
|
osd_main.cpp osd.cpp osd_secondary.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp
|
||||||
osd_primary.cpp osd_primary_chain.cpp osd_primary_sync.cpp osd_primary_write.cpp osd_primary_subops.cpp
|
osd_primary.cpp osd_primary_chain.cpp osd_primary_sync.cpp osd_primary_write.cpp osd_primary_subops.cpp
|
||||||
osd_cluster.cpp osd_rmw.cpp
|
osd_cluster.cpp osd_rmw.cpp osd_scrub.cpp
|
||||||
)
|
)
|
||||||
target_link_libraries(vitastor-osd
|
target_link_libraries(vitastor-osd
|
||||||
vitastor_common
|
vitastor_common
|
||||||
@@ -145,7 +154,6 @@ add_library(vitastor_client SHARED
|
|||||||
set_target_properties(vitastor_client PROPERTIES PUBLIC_HEADER "vitastor_c.h")
|
set_target_properties(vitastor_client PROPERTIES PUBLIC_HEADER "vitastor_c.h")
|
||||||
target_link_libraries(vitastor_client
|
target_link_libraries(vitastor_client
|
||||||
vitastor_common
|
vitastor_common
|
||||||
tcmalloc_minimal
|
|
||||||
${LIBURING_LIBRARIES}
|
${LIBURING_LIBRARIES}
|
||||||
${IBVERBS_LIBRARIES}
|
${IBVERBS_LIBRARIES}
|
||||||
)
|
)
|
||||||
@@ -235,8 +243,18 @@ add_executable(osd_test osd_test.cpp rw_blocking.cpp addr_util.cpp)
|
|||||||
target_link_libraries(osd_test tcmalloc_minimal)
|
target_link_libraries(osd_test tcmalloc_minimal)
|
||||||
|
|
||||||
# osd_rmw_test
|
# osd_rmw_test
|
||||||
add_executable(osd_rmw_test osd_rmw_test.cpp allocator.cpp)
|
add_executable(osd_rmw_test EXCLUDE_FROM_ALL osd_rmw_test.cpp allocator.cpp)
|
||||||
target_link_libraries(osd_rmw_test Jerasure ${ISAL_LIBRARIES} tcmalloc_minimal)
|
target_link_libraries(osd_rmw_test Jerasure ${ISAL_LIBRARIES} tcmalloc_minimal)
|
||||||
|
add_dependencies(build_tests osd_rmw_test)
|
||||||
|
add_test(NAME osd_rmw_test COMMAND osd_rmw_test)
|
||||||
|
|
||||||
|
if (ISAL_LIBRARIES)
|
||||||
|
add_executable(osd_rmw_test_je EXCLUDE_FROM_ALL osd_rmw_test.cpp allocator.cpp)
|
||||||
|
target_compile_definitions(osd_rmw_test_je PUBLIC -DNO_ISAL)
|
||||||
|
target_link_libraries(osd_rmw_test_je Jerasure tcmalloc_minimal)
|
||||||
|
add_dependencies(build_tests osd_rmw_test_je)
|
||||||
|
add_test(NAME osd_rmw_test_jerasure COMMAND osd_rmw_test_je)
|
||||||
|
endif (ISAL_LIBRARIES)
|
||||||
|
|
||||||
# stub_uring_osd
|
# stub_uring_osd
|
||||||
add_executable(stub_uring_osd
|
add_executable(stub_uring_osd
|
||||||
@@ -250,11 +268,15 @@ target_link_libraries(stub_uring_osd
|
|||||||
)
|
)
|
||||||
|
|
||||||
# osd_peering_pg_test
|
# osd_peering_pg_test
|
||||||
add_executable(osd_peering_pg_test osd_peering_pg_test.cpp osd_peering_pg.cpp)
|
add_executable(osd_peering_pg_test EXCLUDE_FROM_ALL osd_peering_pg_test.cpp osd_peering_pg.cpp)
|
||||||
target_link_libraries(osd_peering_pg_test tcmalloc_minimal)
|
target_link_libraries(osd_peering_pg_test tcmalloc_minimal)
|
||||||
|
add_dependencies(build_tests osd_peering_pg_test)
|
||||||
|
add_test(NAME osd_peering_pg_test COMMAND osd_peering_pg_test)
|
||||||
|
|
||||||
# test_allocator
|
# test_allocator
|
||||||
add_executable(test_allocator test_allocator.cpp allocator.cpp)
|
add_executable(test_allocator EXCLUDE_FROM_ALL test_allocator.cpp allocator.cpp)
|
||||||
|
add_dependencies(build_tests test_allocator)
|
||||||
|
add_test(NAME test_allocator COMMAND test_allocator)
|
||||||
|
|
||||||
# test_cas
|
# test_cas
|
||||||
add_executable(test_cas
|
add_executable(test_cas
|
||||||
@@ -274,12 +296,15 @@ target_link_libraries(test_crc32
|
|||||||
|
|
||||||
# test_cluster_client
|
# test_cluster_client
|
||||||
add_executable(test_cluster_client
|
add_executable(test_cluster_client
|
||||||
|
EXCLUDE_FROM_ALL
|
||||||
test_cluster_client.cpp
|
test_cluster_client.cpp
|
||||||
pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
|
pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
|
||||||
etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp
|
etcd_state_client.cpp timerfd_manager.cpp str_util.cpp ../json11/json11.cpp
|
||||||
)
|
)
|
||||||
target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
|
target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
|
||||||
target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mock)
|
target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mock)
|
||||||
|
add_dependencies(build_tests test_cluster_client)
|
||||||
|
add_test(NAME test_cluster_client COMMAND test_cluster_client)
|
||||||
|
|
||||||
## test_blockstore, test_shit
|
## test_blockstore, test_shit
|
||||||
#add_executable(test_blockstore test_blockstore.cpp)
|
#add_executable(test_blockstore test_blockstore.cpp)
|
||||||
|
@@ -122,11 +122,14 @@ Output:
|
|||||||
Get a list of all objects in this Blockstore.
|
Get a list of all objects in this Blockstore.
|
||||||
|
|
||||||
Input:
|
Input:
|
||||||
- oid.stripe = PG alignment
|
- pg_alignment = PG alignment
|
||||||
- len = PG count or 0 to list all objects
|
- pg_count = PG count or 0 to list all objects
|
||||||
- offset = PG number
|
- pg_number = PG number
|
||||||
- oid.inode = min inode number or 0 to list all inodes
|
- list_stable_limit = max number of clean objects in the reply
|
||||||
- version = max inode number or 0 to list all inodes
|
it's guaranteed that dirty objects are returned from the same interval,
|
||||||
|
i.e. from (min_oid .. min(max_oid, max(returned stable OIDs)))
|
||||||
|
- min_oid = min inode/stripe or 0 to list all objects
|
||||||
|
- max_oid = max inode/stripe or 0 to list all objects
|
||||||
|
|
||||||
Output:
|
Output:
|
||||||
- retval = total obj_ver_id count
|
- retval = total obj_ver_id count
|
||||||
@@ -143,10 +146,27 @@ struct blockstore_op_t
|
|||||||
uint64_t opcode;
|
uint64_t opcode;
|
||||||
// finish callback
|
// finish callback
|
||||||
std::function<void (blockstore_op_t*)> callback;
|
std::function<void (blockstore_op_t*)> callback;
|
||||||
object_id oid;
|
union
|
||||||
uint64_t version;
|
{
|
||||||
uint32_t offset;
|
// R/W
|
||||||
uint32_t len;
|
struct
|
||||||
|
{
|
||||||
|
object_id oid;
|
||||||
|
uint64_t version;
|
||||||
|
uint32_t offset;
|
||||||
|
uint32_t len;
|
||||||
|
};
|
||||||
|
// List
|
||||||
|
struct __attribute__((__packed__))
|
||||||
|
{
|
||||||
|
object_id min_oid;
|
||||||
|
object_id max_oid;
|
||||||
|
uint32_t pg_alignment;
|
||||||
|
uint32_t pg_count;
|
||||||
|
uint32_t pg_number;
|
||||||
|
uint32_t list_stable_limit;
|
||||||
|
};
|
||||||
|
};
|
||||||
void *buf;
|
void *buf;
|
||||||
void *bitmap;
|
void *bitmap;
|
||||||
int retval;
|
int retval;
|
||||||
|
@@ -162,7 +162,8 @@ void journal_flusher_t::mark_trim_possible()
|
|||||||
if (trim_wanted > 0)
|
if (trim_wanted > 0)
|
||||||
{
|
{
|
||||||
dequeuing = true;
|
dequeuing = true;
|
||||||
journal_trim_counter++;
|
if (!journal_trim_counter)
|
||||||
|
journal_trim_counter = journal_trim_interval;
|
||||||
bs->ringloop->wakeup();
|
bs->ringloop->wakeup();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -193,6 +193,7 @@ void blockstore_impl_t::loop()
|
|||||||
}
|
}
|
||||||
if (wr_st == 2)
|
if (wr_st == 2)
|
||||||
{
|
{
|
||||||
|
submit_queue[op_idx] = NULL;
|
||||||
new_idx--;
|
new_idx--;
|
||||||
}
|
}
|
||||||
if (wr_st == 0)
|
if (wr_st == 0)
|
||||||
@@ -324,7 +325,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
|
|||||||
{
|
{
|
||||||
// Basic verification not passed
|
// Basic verification not passed
|
||||||
op->retval = -EINVAL;
|
op->retval = -EINVAL;
|
||||||
std::function<void (blockstore_op_t*)>(op->callback)(op);
|
ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (op->opcode == BS_OP_SYNC_STAB_ALL)
|
if (op->opcode == BS_OP_SYNC_STAB_ALL)
|
||||||
@@ -367,7 +368,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
|
|||||||
}
|
}
|
||||||
if ((op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE) && !enqueue_write(op))
|
if ((op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE) && !enqueue_write(op))
|
||||||
{
|
{
|
||||||
std::function<void (blockstore_op_t*)>(op->callback)(op);
|
ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// Call constructor without allocating memory. We'll call destructor before returning op back
|
// Call constructor without allocating memory. We'll call destructor before returning op back
|
||||||
@@ -444,11 +445,11 @@ void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint
|
|||||||
|
|
||||||
void blockstore_impl_t::process_list(blockstore_op_t *op)
|
void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||||
{
|
{
|
||||||
uint32_t list_pg = op->offset+1;
|
uint32_t list_pg = op->pg_number+1;
|
||||||
uint32_t pg_count = op->len;
|
uint32_t pg_count = op->pg_count;
|
||||||
uint64_t pg_stripe_size = op->oid.stripe;
|
uint64_t pg_stripe_size = op->pg_alignment;
|
||||||
uint64_t min_inode = op->oid.inode;
|
uint64_t min_inode = op->min_oid.inode;
|
||||||
uint64_t max_inode = op->version;
|
uint64_t max_inode = op->max_oid.inode;
|
||||||
// Check PG
|
// Check PG
|
||||||
if (pg_count != 0 && (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count))
|
if (pg_count != 0 && (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count))
|
||||||
{
|
{
|
||||||
@@ -495,7 +496,13 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
|||||||
stable_alloc += clean_db.size();
|
stable_alloc += clean_db.size();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
if (op->list_stable_limit > 0)
|
||||||
|
{
|
||||||
|
stable_alloc = op->list_stable_limit;
|
||||||
|
if (stable_alloc > 1024*1024)
|
||||||
|
stable_alloc = 1024*1024;
|
||||||
|
}
|
||||||
|
if (stable_alloc < 32768)
|
||||||
{
|
{
|
||||||
stable_alloc = 32768;
|
stable_alloc = 32768;
|
||||||
}
|
}
|
||||||
@@ -506,22 +513,21 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
|||||||
FINISH_OP(op);
|
FINISH_OP(op);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
auto max_oid = op->max_oid;
|
||||||
|
bool limited = false;
|
||||||
for (auto shard_it = clean_db_shards.lower_bound(first_shard);
|
for (auto shard_it = clean_db_shards.lower_bound(first_shard);
|
||||||
shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
|
shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
|
||||||
shard_it++)
|
shard_it++)
|
||||||
{
|
{
|
||||||
auto & clean_db = shard_it->second;
|
auto & clean_db = shard_it->second;
|
||||||
auto clean_it = clean_db.begin(), clean_end = clean_db.end();
|
auto clean_it = clean_db.begin(), clean_end = clean_db.end();
|
||||||
if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
|
if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
|
||||||
{
|
{
|
||||||
clean_it = clean_db.lower_bound({
|
clean_it = clean_db.lower_bound(op->min_oid);
|
||||||
.inode = min_inode,
|
}
|
||||||
.stripe = 0,
|
if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
|
||||||
});
|
{
|
||||||
clean_end = clean_db.upper_bound({
|
clean_end = clean_db.upper_bound(max_oid);
|
||||||
.inode = max_inode,
|
|
||||||
.stripe = UINT64_MAX,
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
for (; clean_it != clean_end; clean_it++)
|
for (; clean_it != clean_end; clean_it++)
|
||||||
{
|
{
|
||||||
@@ -540,11 +546,24 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
|||||||
.oid = clean_it->first,
|
.oid = clean_it->first,
|
||||||
.version = clean_it->second.version,
|
.version = clean_it->second.version,
|
||||||
};
|
};
|
||||||
|
if (op->list_stable_limit > 0 && !limited && stable_count >= op->list_stable_limit)
|
||||||
|
{
|
||||||
|
limited = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (op->list_stable_limit > 0 && first_shard != last_shard)
|
||||||
|
{
|
||||||
|
// To maintain the order, we have to include objects in the same range from other shards
|
||||||
|
std::sort(stable, stable+stable_count);
|
||||||
|
if (stable_count > op->list_stable_limit)
|
||||||
|
stable_count = op->list_stable_limit;
|
||||||
|
max_oid = stable[stable_count-1].oid;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (first_shard != last_shard)
|
if (op->list_stable_limit == 0 && first_shard != last_shard)
|
||||||
{
|
{
|
||||||
// If that's not a per-PG listing, sort clean entries
|
// If that's not a per-PG listing, sort clean entries (already sorted if list_stable_limit != 0)
|
||||||
std::sort(stable, stable+stable_count);
|
std::sort(stable, stable+stable_count);
|
||||||
}
|
}
|
||||||
int clean_stable_count = stable_count;
|
int clean_stable_count = stable_count;
|
||||||
@@ -553,20 +572,17 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
|||||||
obj_ver_id *unstable = NULL;
|
obj_ver_id *unstable = NULL;
|
||||||
{
|
{
|
||||||
auto dirty_it = dirty_db.begin(), dirty_end = dirty_db.end();
|
auto dirty_it = dirty_db.begin(), dirty_end = dirty_db.end();
|
||||||
if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
|
if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
|
||||||
{
|
{
|
||||||
dirty_it = dirty_db.lower_bound({
|
dirty_it = dirty_db.lower_bound({
|
||||||
.oid = {
|
.oid = op->min_oid,
|
||||||
.inode = min_inode,
|
|
||||||
.stripe = 0,
|
|
||||||
},
|
|
||||||
.version = 0,
|
.version = 0,
|
||||||
});
|
});
|
||||||
|
}
|
||||||
|
if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
|
||||||
|
{
|
||||||
dirty_end = dirty_db.upper_bound({
|
dirty_end = dirty_db.upper_bound({
|
||||||
.oid = {
|
.oid = max_oid,
|
||||||
.inode = max_inode,
|
|
||||||
.stripe = UINT64_MAX,
|
|
||||||
},
|
|
||||||
.version = UINT64_MAX,
|
.version = UINT64_MAX,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -582,7 +598,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
|||||||
replace_stable(dirty_it->first.oid, 0, clean_stable_count, stable_count, stable);
|
replace_stable(dirty_it->first.oid, 0, clean_stable_count, stable_count, stable);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (IS_STABLE(dirty_it->second.state))
|
else if (IS_STABLE(dirty_it->second.state) || (dirty_it->second.state & BS_ST_INSTANT))
|
||||||
{
|
{
|
||||||
// First try to replace a clean stable version in the first part of the list
|
// First try to replace a clean stable version in the first part of the list
|
||||||
if (!replace_stable(dirty_it->first.oid, dirty_it->first.version, 0, clean_stable_count, stable))
|
if (!replace_stable(dirty_it->first.oid, dirty_it->first.version, 0, clean_stable_count, stable))
|
||||||
|
@@ -16,6 +16,7 @@
|
|||||||
// FIXME: This value should be dynamic i.e. Blockstore ideally shouldn't allow
|
// FIXME: This value should be dynamic i.e. Blockstore ideally shouldn't allow
|
||||||
// writing more than can be stabilized afterwards
|
// writing more than can be stabilized afterwards
|
||||||
#define JOURNAL_STABILIZE_RESERVATION 65536
|
#define JOURNAL_STABILIZE_RESERVATION 65536
|
||||||
|
#define JOURNAL_INSTANT_RESERVATION 131072
|
||||||
|
|
||||||
// Journal entries
|
// Journal entries
|
||||||
// Journal entries are linked to each other by their crc32 value
|
// Journal entries are linked to each other by their crc32 value
|
||||||
|
@@ -286,7 +286,10 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
|
|||||||
{
|
{
|
||||||
auto used = --journal.used_sectors[rv.journal_sector-1];
|
auto used = --journal.used_sectors[rv.journal_sector-1];
|
||||||
if (used == 0)
|
if (used == 0)
|
||||||
|
{
|
||||||
journal.used_sectors.erase(rv.journal_sector-1);
|
journal.used_sectors.erase(rv.journal_sector-1);
|
||||||
|
flusher->mark_trim_possible();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -127,7 +127,6 @@ resume_4:
|
|||||||
{
|
{
|
||||||
mark_rolled_back(*v);
|
mark_rolled_back(*v);
|
||||||
}
|
}
|
||||||
flusher->mark_trim_possible();
|
|
||||||
// Acknowledge op
|
// Acknowledge op
|
||||||
op->retval = 0;
|
op->retval = 0;
|
||||||
FINISH_OP(op);
|
FINISH_OP(op);
|
||||||
@@ -232,6 +231,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
|
|||||||
if (used == 0)
|
if (used == 0)
|
||||||
{
|
{
|
||||||
journal.used_sectors.erase(dirty_it->second.journal_sector);
|
journal.used_sectors.erase(dirty_it->second.journal_sector);
|
||||||
|
flusher->mark_trim_possible();
|
||||||
}
|
}
|
||||||
if (dsk.clean_entry_bitmap_size > sizeof(void*))
|
if (dsk.clean_entry_bitmap_size > sizeof(void*))
|
||||||
{
|
{
|
||||||
|
@@ -89,6 +89,9 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Invalid version requested
|
// Invalid version requested
|
||||||
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
printf("Write %lx:%lx v%lu requested, but we already have v%lu\n", op->oid.inode, op->oid.stripe, op->version, version);
|
||||||
|
#endif
|
||||||
op->retval = -EEXIST;
|
op->retval = -EEXIST;
|
||||||
if (!is_del && dsk.clean_entry_bitmap_size > sizeof(void*))
|
if (!is_del && dsk.clean_entry_bitmap_size > sizeof(void*))
|
||||||
{
|
{
|
||||||
@@ -115,8 +118,8 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
|||||||
else if (!wait_del)
|
else if (!wait_del)
|
||||||
printf("Write %lx:%lx v%lu offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
|
printf("Write %lx:%lx v%lu offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
|
||||||
#endif
|
#endif
|
||||||
// FIXME No strict need to add it into dirty_db here, it's just left
|
// No strict need to add it into dirty_db here except maybe for listings to return
|
||||||
// from the previous implementation where reads waited for writes
|
// correct data when there are inflight operations in the queue
|
||||||
uint32_t state;
|
uint32_t state;
|
||||||
if (is_del)
|
if (is_del)
|
||||||
state = BS_ST_DELETE | BS_ST_IN_FLIGHT;
|
state = BS_ST_DELETE | BS_ST_IN_FLIGHT;
|
||||||
@@ -182,9 +185,15 @@ void blockstore_impl_t::cancel_all_writes(blockstore_op_t *op, blockstore_dirty_
|
|||||||
bool found = false;
|
bool found = false;
|
||||||
for (auto other_op: submit_queue)
|
for (auto other_op: submit_queue)
|
||||||
{
|
{
|
||||||
// <op> may be present in queue multiple times due to moving operations in submit_queue
|
if (!other_op)
|
||||||
if (other_op == op)
|
{
|
||||||
|
// freed operations during submitting are zeroed
|
||||||
|
}
|
||||||
|
else if (other_op == op)
|
||||||
|
{
|
||||||
|
// <op> may be present in queue multiple times due to moving operations in submit_queue
|
||||||
found = true;
|
found = true;
|
||||||
|
}
|
||||||
else if (found && other_op->oid == op->oid &&
|
else if (found && other_op->oid == op->oid &&
|
||||||
(other_op->opcode == BS_OP_WRITE || other_op->opcode == BS_OP_WRITE_STABLE))
|
(other_op->opcode == BS_OP_WRITE || other_op->opcode == BS_OP_WRITE_STABLE))
|
||||||
{
|
{
|
||||||
@@ -252,7 +261,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
|||||||
{
|
{
|
||||||
blockstore_journal_check_t space_check(this);
|
blockstore_journal_check_t space_check(this);
|
||||||
if (!space_check.check_available(op, unsynced_big_write_count + 1,
|
if (!space_check.check_available(op, unsynced_big_write_count + 1,
|
||||||
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
|
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
|
||||||
|
(dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION))
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@@ -332,7 +342,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
|||||||
!space_check.check_available(op, unsynced_big_write_count,
|
!space_check.check_available(op, unsynced_big_write_count,
|
||||||
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, 0)
|
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, 0)
|
||||||
|| !space_check.check_available(op, 1,
|
|| !space_check.check_available(op, 1,
|
||||||
sizeof(journal_entry_small_write) + dsk.clean_entry_bitmap_size, op->len + JOURNAL_STABILIZE_RESERVATION))
|
sizeof(journal_entry_small_write) + dsk.clean_entry_bitmap_size,
|
||||||
|
op->len + ((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@@ -443,18 +454,19 @@ int blockstore_impl_t::continue_write(blockstore_op_t *op)
|
|||||||
resume_2:
|
resume_2:
|
||||||
// Only for the immediate_commit mode: prepare and submit big_write journal entry
|
// Only for the immediate_commit mode: prepare and submit big_write journal entry
|
||||||
{
|
{
|
||||||
blockstore_journal_check_t space_check(this);
|
|
||||||
if (!space_check.check_available(op, 1,
|
|
||||||
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
|
|
||||||
{
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
BS_SUBMIT_CHECK_SQES(1);
|
|
||||||
auto dirty_it = dirty_db.find((obj_ver_id){
|
auto dirty_it = dirty_db.find((obj_ver_id){
|
||||||
.oid = op->oid,
|
.oid = op->oid,
|
||||||
.version = op->version,
|
.version = op->version,
|
||||||
});
|
});
|
||||||
assert(dirty_it != dirty_db.end());
|
assert(dirty_it != dirty_db.end());
|
||||||
|
blockstore_journal_check_t space_check(this);
|
||||||
|
if (!space_check.check_available(op, 1,
|
||||||
|
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
|
||||||
|
((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
BS_SUBMIT_CHECK_SQES(1);
|
||||||
journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
|
journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
|
||||||
journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
|
journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
|
||||||
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size
|
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size
|
||||||
@@ -641,7 +653,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
|
|||||||
});
|
});
|
||||||
assert(dirty_it != dirty_db.end());
|
assert(dirty_it != dirty_db.end());
|
||||||
blockstore_journal_check_t space_check(this);
|
blockstore_journal_check_t space_check(this);
|
||||||
if (!space_check.check_available(op, 1, sizeof(journal_entry_del), JOURNAL_STABILIZE_RESERVATION))
|
if (!space_check.check_available(op, 1, sizeof(journal_entry_del), JOURNAL_INSTANT_RESERVATION))
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@@ -121,8 +121,7 @@ resume_1:
|
|||||||
}
|
}
|
||||||
if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
|
if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
|
||||||
{
|
{
|
||||||
uint64_t pg_real_size = pool_stats[pool_cfg.id]["pg_real_size"].uint64_value();
|
pool_avail *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
|
||||||
pool_avail = pg_real_size > 0 ? pool_avail * (pool_cfg.pg_size - pool_cfg.parity_chunks) / pg_real_size : 0;
|
|
||||||
}
|
}
|
||||||
pool_stats[pool_cfg.id] = json11::Json::object {
|
pool_stats[pool_cfg.id] = json11::Json::object {
|
||||||
{ "name", pool_cfg.name },
|
{ "name", pool_cfg.name },
|
||||||
|
@@ -403,7 +403,7 @@ struct snap_merger_t
|
|||||||
op->opcode = OSD_OP_READ_BITMAP;
|
op->opcode = OSD_OP_READ_BITMAP;
|
||||||
op->inode = target;
|
op->inode = target;
|
||||||
op->offset = offset;
|
op->offset = offset;
|
||||||
op->len = 0;
|
op->len = target_block_size;
|
||||||
op->callback = [this](cluster_op_t *op)
|
op->callback = [this](cluster_op_t *op)
|
||||||
{
|
{
|
||||||
if (op->retval < 0)
|
if (op->retval < 0)
|
||||||
|
@@ -92,6 +92,7 @@ struct rm_inode_t
|
|||||||
|
|
||||||
void send_ops(rm_pg_t *cur_list)
|
void send_ops(rm_pg_t *cur_list)
|
||||||
{
|
{
|
||||||
|
parent->cli->init_msgr();
|
||||||
if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
|
if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
|
||||||
parent->cli->msgr.osd_peer_fds.end())
|
parent->cli->msgr.osd_peer_fds.end())
|
||||||
{
|
{
|
||||||
|
@@ -5,6 +5,7 @@
|
|||||||
#include "cli.h"
|
#include "cli.h"
|
||||||
#include "cluster_client.h"
|
#include "cluster_client.h"
|
||||||
#include "str_util.h"
|
#include "str_util.h"
|
||||||
|
#include "epoll_manager.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
|
||||||
@@ -14,13 +15,21 @@ struct rm_osd_t
|
|||||||
cli_tool_t *parent;
|
cli_tool_t *parent;
|
||||||
|
|
||||||
bool dry_run, force_warning, force_dataloss;
|
bool dry_run, force_warning, force_dataloss;
|
||||||
|
uint64_t etcd_tx_retry_ms = 500;
|
||||||
|
uint64_t etcd_tx_retries = 10000;
|
||||||
std::vector<uint64_t> osd_ids;
|
std::vector<uint64_t> osd_ids;
|
||||||
|
|
||||||
int state = 0;
|
int state = 0;
|
||||||
cli_result_t result;
|
cli_result_t result;
|
||||||
|
|
||||||
std::set<uint64_t> to_remove;
|
std::set<uint64_t> to_remove;
|
||||||
|
std::set<uint64_t> to_restart;
|
||||||
json11::Json::array pool_effects;
|
json11::Json::array pool_effects;
|
||||||
|
json11::Json::array history_updates, history_checks;
|
||||||
|
json11::Json new_pgs, new_clean_pgs;
|
||||||
|
uint64_t new_pgs_mod_rev, new_clean_pgs_mod_rev;
|
||||||
|
uint64_t cur_retry = 0;
|
||||||
|
uint64_t retry_wait = 0;
|
||||||
bool is_warning, is_dataloss;
|
bool is_warning, is_dataloss;
|
||||||
|
|
||||||
bool is_done()
|
bool is_done()
|
||||||
@@ -32,6 +41,12 @@ struct rm_osd_t
|
|||||||
{
|
{
|
||||||
if (state == 1)
|
if (state == 1)
|
||||||
goto resume_1;
|
goto resume_1;
|
||||||
|
else if (state == 2)
|
||||||
|
goto resume_2;
|
||||||
|
else if (state == 3)
|
||||||
|
goto resume_3;
|
||||||
|
else if (state == 4)
|
||||||
|
goto resume_4;
|
||||||
if (!osd_ids.size())
|
if (!osd_ids.size())
|
||||||
{
|
{
|
||||||
result = (cli_result_t){ .err = EINVAL, .text = "OSD numbers are not specified" };
|
result = (cli_result_t){ .err = EINVAL, .text = "OSD numbers are not specified" };
|
||||||
@@ -152,14 +167,48 @@ struct rm_osd_t
|
|||||||
result.text = error;
|
result.text = error;
|
||||||
if (dry_run || is_dataloss && !force_dataloss || is_warning && !force_warning)
|
if (dry_run || is_dataloss && !force_dataloss || is_warning && !force_warning)
|
||||||
{
|
{
|
||||||
result.err = is_dataloss || is_warning ? EBUSY : 0;
|
result.err = is_dataloss && !force_dataloss || is_warning && !force_warning ? EBUSY : 0;
|
||||||
state = 100;
|
state = 100;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
parent->etcd_txn(json11::Json::object { { "success", json11::Json::array {
|
||||||
|
json11::Json::object {
|
||||||
|
{ "request_range", json11::Json::object {
|
||||||
|
{ "key", base64_encode(
|
||||||
|
parent->cli->st_cli.etcd_prefix+"/config/pgs"
|
||||||
|
) },
|
||||||
|
} },
|
||||||
|
},
|
||||||
|
json11::Json::object {
|
||||||
|
{ "request_range", json11::Json::object {
|
||||||
|
{ "key", base64_encode(
|
||||||
|
parent->cli->st_cli.etcd_prefix+"/history/last_clean_pgs"
|
||||||
|
) },
|
||||||
|
} },
|
||||||
|
},
|
||||||
|
} } });
|
||||||
|
resume_4:
|
||||||
|
state = 4;
|
||||||
|
if (parent->waiting > 0)
|
||||||
|
return;
|
||||||
|
if (parent->etcd_err.err)
|
||||||
|
{
|
||||||
|
result = parent->etcd_err;
|
||||||
|
state = 100;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
{
|
||||||
|
auto kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][0]["response_range"]["kvs"][0]);
|
||||||
|
new_pgs = remove_osds_from_pgs(kv);
|
||||||
|
new_pgs_mod_rev = kv.mod_revision;
|
||||||
|
kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][1]["response_range"]["kvs"][0]);
|
||||||
|
new_clean_pgs = remove_osds_from_pgs(kv);
|
||||||
|
new_clean_pgs_mod_rev = kv.mod_revision;
|
||||||
|
}
|
||||||
// Remove keys from etcd
|
// Remove keys from etcd
|
||||||
{
|
{
|
||||||
json11::Json::array rm_items;
|
json11::Json::array rm_items, rm_checks;
|
||||||
for (auto osd_id: osd_ids)
|
for (auto osd_id: osd_ids)
|
||||||
{
|
{
|
||||||
rm_items.push_back("/config/osd/"+std::to_string(osd_id));
|
rm_items.push_back("/config/osd/"+std::to_string(osd_id));
|
||||||
@@ -178,7 +227,39 @@ struct rm_osd_t
|
|||||||
} },
|
} },
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
parent->etcd_txn(json11::Json::object { { "success", rm_items } });
|
if (!new_pgs.is_null())
|
||||||
|
{
|
||||||
|
auto pgs_key = base64_encode(parent->cli->st_cli.etcd_prefix+"/config/pgs");
|
||||||
|
rm_items.push_back(json11::Json::object {
|
||||||
|
{ "request_put", json11::Json::object {
|
||||||
|
{ "key", pgs_key },
|
||||||
|
{ "value", base64_encode(new_pgs.dump()) },
|
||||||
|
} },
|
||||||
|
});
|
||||||
|
rm_checks.push_back(json11::Json::object {
|
||||||
|
{ "target", "MOD" },
|
||||||
|
{ "key", pgs_key },
|
||||||
|
{ "result", "LESS" },
|
||||||
|
{ "mod_revision", new_pgs_mod_rev+1 },
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if (!new_clean_pgs.is_null())
|
||||||
|
{
|
||||||
|
auto pgs_key = base64_encode(parent->cli->st_cli.etcd_prefix+"/history/last_clean_pgs");
|
||||||
|
rm_items.push_back(json11::Json::object {
|
||||||
|
{ "request_put", json11::Json::object {
|
||||||
|
{ "key", pgs_key },
|
||||||
|
{ "value", base64_encode(new_clean_pgs.dump()) },
|
||||||
|
} },
|
||||||
|
});
|
||||||
|
rm_checks.push_back(json11::Json::object {
|
||||||
|
{ "target", "MOD" },
|
||||||
|
{ "key", pgs_key },
|
||||||
|
{ "result", "LESS" },
|
||||||
|
{ "mod_revision", new_clean_pgs_mod_rev+1 },
|
||||||
|
});
|
||||||
|
}
|
||||||
|
parent->etcd_txn(json11::Json::object { { "success", rm_items }, { "checks", rm_checks } });
|
||||||
}
|
}
|
||||||
resume_1:
|
resume_1:
|
||||||
state = 1;
|
state = 1;
|
||||||
@@ -190,6 +271,46 @@ struct rm_osd_t
|
|||||||
state = 100;
|
state = 100;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
// Remove old OSD from PG all_peers to prevent left_on_dead and from
|
||||||
|
// target_history to prevent INCOMPLETE if --allow-data-loss is specified
|
||||||
|
for (auto & rsp: parent->etcd_result["responses"].array_items())
|
||||||
|
{
|
||||||
|
if (rsp["response_delete_range"]["deleted"].uint64_value() > 0)
|
||||||
|
{
|
||||||
|
// Wait for mon_change_timeout before updating PG history, or the monitor's change will likely interfere with ours
|
||||||
|
retry_wait = parent->cli->merged_config["mon_change_timeout"].uint64_value();
|
||||||
|
if (!retry_wait)
|
||||||
|
retry_wait = 1000;
|
||||||
|
retry_wait += etcd_tx_retry_ms;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
while (1)
|
||||||
|
{
|
||||||
|
resume_2:
|
||||||
|
if (!remove_osds_from_history(2))
|
||||||
|
return;
|
||||||
|
resume_3:
|
||||||
|
state = 3;
|
||||||
|
if (parent->waiting > 0)
|
||||||
|
return;
|
||||||
|
if (parent->etcd_err.err)
|
||||||
|
{
|
||||||
|
result = parent->etcd_err;
|
||||||
|
state = 100;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (parent->etcd_result["succeeded"].bool_value())
|
||||||
|
break;
|
||||||
|
if ((++cur_retry) >= etcd_tx_retries)
|
||||||
|
{
|
||||||
|
result.err = EAGAIN;
|
||||||
|
result.text += "Failed to remove OSDs from PG history due to update conflicts."
|
||||||
|
" Some PGs may remain left_on_dead or incomplete. Please retry later\n";
|
||||||
|
state = 100;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
retry_wait = etcd_tx_retry_ms;
|
||||||
|
}
|
||||||
std::string ids = "";
|
std::string ids = "";
|
||||||
for (auto osd_id: osd_ids)
|
for (auto osd_id: osd_ids)
|
||||||
{
|
{
|
||||||
@@ -200,6 +321,141 @@ struct rm_osd_t
|
|||||||
result.text = (result.text != "" ? ids+"\n"+result.text : ids);
|
result.text = (result.text != "" ? ids+"\n"+result.text : ids);
|
||||||
result.err = 0;
|
result.err = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Produce a copy of a PG configuration value in which every OSD number present
// in <to_remove> is replaced by 0 inside each PG's "osd_set" array.
//
// kv - parsed etcd key/value; kv.value is expected to look like
//      { "items": { <pool>: { <pg>: { "osd_set": [...], ... } } }, ... }
//
// Returns kv.value unchanged when it is null. Otherwise returns a new object
// with the same top-level fields and a rewritten "items" map. Entries of
// "items" that are not objects are copied verbatim, and all PG fields other
// than "osd_set" are preserved.
json11::Json remove_osds_from_pgs(const etcd_kv_t & kv)
{
    if (kv.value.is_null())
    {
        // Nothing to rewrite
        return kv.value;
    }
    json11::Json::object filtered_items;
    for (auto & pool_kv: kv.value["items"].object_items())
    {
        if (!pool_kv.second.is_object())
        {
            filtered_items[pool_kv.first] = pool_kv.second;
            continue;
        }
        json11::Json::object filtered_pool;
        for (auto & pg_kv: pool_kv.second.object_items())
        {
            json11::Json::array kept_set;
            for (auto & osd_json: pg_kv.second["osd_set"].array_items())
            {
                uint64_t osd_num = osd_json.uint64_value();
                // Zero marks an empty slot; removed OSDs become empty slots too
                if (osd_num != 0 && to_remove.find(osd_num) != to_remove.end())
                    osd_num = 0;
                kept_set.push_back(osd_num);
            }
            json11::Json::object filtered_pg = pg_kv.second.object_items();
            filtered_pg["osd_set"] = kept_set;
            filtered_pool[pg_kv.first] = filtered_pg;
        }
        filtered_items[pool_kv.first] = filtered_pool;
    }
    auto rewritten = kv.value.object_items();
    rewritten["items"] = filtered_items;
    return rewritten;
}
|
||||||
|
|
||||||
|
bool remove_osds_from_history(int base_state)
|
||||||
|
{
|
||||||
|
if (state == base_state+0)
|
||||||
|
goto resume_0;
|
||||||
|
history_updates.clear();
|
||||||
|
history_checks.clear();
|
||||||
|
for (auto & pp: parent->cli->st_cli.pool_config)
|
||||||
|
{
|
||||||
|
bool update_pg_history = false;
|
||||||
|
auto & pool_cfg = pp.second;
|
||||||
|
for (auto & pgp: pool_cfg.pg_config)
|
||||||
|
{
|
||||||
|
auto pg_num = pgp.first;
|
||||||
|
auto & pg_cfg = pgp.second;
|
||||||
|
for (int i = 0; i < pg_cfg.all_peers.size(); i++)
|
||||||
|
{
|
||||||
|
if (to_remove.find(pg_cfg.all_peers[i]) != to_remove.end())
|
||||||
|
{
|
||||||
|
update_pg_history = true;
|
||||||
|
pg_cfg.all_peers.erase(pg_cfg.all_peers.begin()+i, pg_cfg.all_peers.begin()+i+1);
|
||||||
|
i--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int i = 0; i < pg_cfg.target_history.size(); i++)
|
||||||
|
{
|
||||||
|
int hist_size = 0, hist_rm = 0;
|
||||||
|
for (auto & old_osd: pg_cfg.target_history[i])
|
||||||
|
{
|
||||||
|
if (old_osd != 0)
|
||||||
|
{
|
||||||
|
hist_size++;
|
||||||
|
if (to_remove.find(old_osd) != to_remove.end())
|
||||||
|
{
|
||||||
|
hist_rm++;
|
||||||
|
old_osd = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (hist_rm > 0)
|
||||||
|
{
|
||||||
|
if (hist_size-hist_rm == 0)
|
||||||
|
{
|
||||||
|
pg_cfg.target_history.erase(pg_cfg.target_history.begin()+i, pg_cfg.target_history.begin()+i+1);
|
||||||
|
i--;
|
||||||
|
}
|
||||||
|
update_pg_history = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (update_pg_history)
|
||||||
|
{
|
||||||
|
std::string history_key = base64_encode(
|
||||||
|
parent->cli->st_cli.etcd_prefix+"/pg/history/"+
|
||||||
|
std::to_string(pool_cfg.id)+"/"+std::to_string(pg_num)
|
||||||
|
);
|
||||||
|
auto hist = json11::Json::object {
|
||||||
|
{ "epoch", pg_cfg.epoch },
|
||||||
|
{ "all_peers", pg_cfg.all_peers },
|
||||||
|
{ "osd_sets", pg_cfg.target_history },
|
||||||
|
};
|
||||||
|
if (pg_cfg.scrub_ts)
|
||||||
|
hist["scrub_ts"] = pg_cfg.scrub_ts;
|
||||||
|
history_updates.push_back(json11::Json::object {
|
||||||
|
{ "request_put", json11::Json::object {
|
||||||
|
{ "key", history_key },
|
||||||
|
{ "value", base64_encode(json11::Json(hist).dump()) },
|
||||||
|
} },
|
||||||
|
});
|
||||||
|
history_checks.push_back(json11::Json::object {
|
||||||
|
{ "target", "MOD" },
|
||||||
|
{ "key", history_key },
|
||||||
|
{ "result", "LESS" },
|
||||||
|
{ "mod_revision", parent->cli->st_cli.etcd_watch_revision+1 },
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (history_updates.size())
|
||||||
|
{
|
||||||
|
if (retry_wait)
|
||||||
|
{
|
||||||
|
parent->waiting++;
|
||||||
|
parent->epmgr->tfd->set_timer(retry_wait, false, [this](int timer_id)
|
||||||
|
{
|
||||||
|
parent->waiting--;
|
||||||
|
parent->ringloop->wakeup();
|
||||||
|
});
|
||||||
|
resume_0:
|
||||||
|
state = base_state+0;
|
||||||
|
if (parent->waiting > 0)
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
parent->etcd_txn(json11::Json::object {
|
||||||
|
{ "success", history_updates },
|
||||||
|
{ "compare", history_checks },
|
||||||
|
});
|
||||||
|
}
|
||||||
|
else
|
||||||
|
parent->etcd_result = json11::Json::object{ { "succeeded", true } };
|
||||||
|
return true;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
std::function<bool(cli_result_t &)> cli_tool_t::start_rm_osd(json11::Json cfg)
|
std::function<bool(cli_result_t &)> cli_tool_t::start_rm_osd(json11::Json cfg)
|
||||||
@@ -209,6 +465,14 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_rm_osd(json11::Json cfg)
|
|||||||
rm_osd->dry_run = cfg["dry_run"].bool_value();
|
rm_osd->dry_run = cfg["dry_run"].bool_value();
|
||||||
rm_osd->force_dataloss = cfg["allow_data_loss"].bool_value();
|
rm_osd->force_dataloss = cfg["allow_data_loss"].bool_value();
|
||||||
rm_osd->force_warning = rm_osd->force_dataloss || cfg["force"].bool_value();
|
rm_osd->force_warning = rm_osd->force_dataloss || cfg["force"].bool_value();
|
||||||
|
if (!cfg["etcd_tx_retries"].is_null())
|
||||||
|
rm_osd->etcd_tx_retries = cfg["etcd_tx_retries"].uint64_value();
|
||||||
|
if (!cfg["etcd_tx_retry_ms"].is_null())
|
||||||
|
{
|
||||||
|
rm_osd->etcd_tx_retry_ms = cfg["etcd_tx_retry_ms"].uint64_value();
|
||||||
|
if (rm_osd->etcd_tx_retry_ms < 100)
|
||||||
|
rm_osd->etcd_tx_retry_ms = 100;
|
||||||
|
}
|
||||||
if (cfg["osd_id"].is_number() || cfg["osd_id"].is_string())
|
if (cfg["osd_id"].is_number() || cfg["osd_id"].is_string())
|
||||||
rm_osd->osd_ids.push_back(cfg["osd_id"].uint64_value());
|
rm_osd->osd_ids.push_back(cfg["osd_id"].uint64_value());
|
||||||
else
|
else
|
||||||
|
@@ -59,7 +59,6 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
|||||||
delete op;
|
delete op;
|
||||||
};
|
};
|
||||||
msgr.parse_config(this->config);
|
msgr.parse_config(this->config);
|
||||||
msgr.init();
|
|
||||||
|
|
||||||
st_cli.tfd = tfd;
|
st_cli.tfd = tfd;
|
||||||
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
|
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
|
||||||
@@ -73,17 +72,6 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
|||||||
|
|
||||||
scrap_buffer_size = SCRAP_BUFFER_SIZE;
|
scrap_buffer_size = SCRAP_BUFFER_SIZE;
|
||||||
scrap_buffer = malloc_or_die(scrap_buffer_size);
|
scrap_buffer = malloc_or_die(scrap_buffer_size);
|
||||||
|
|
||||||
if (ringloop)
|
|
||||||
{
|
|
||||||
consumer.loop = [this]()
|
|
||||||
{
|
|
||||||
msgr.read_requests();
|
|
||||||
msgr.send_replies();
|
|
||||||
this->ringloop->submit();
|
|
||||||
};
|
|
||||||
ringloop->register_consumer(&consumer);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
cluster_client_t::~cluster_client_t()
|
cluster_client_t::~cluster_client_t()
|
||||||
@@ -115,6 +103,24 @@ cluster_op_t::~cluster_op_t()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void cluster_client_t::init_msgr()
|
||||||
|
{
|
||||||
|
if (msgr_initialized)
|
||||||
|
return;
|
||||||
|
msgr.init();
|
||||||
|
msgr_initialized = true;
|
||||||
|
if (ringloop)
|
||||||
|
{
|
||||||
|
consumer.loop = [this]()
|
||||||
|
{
|
||||||
|
msgr.read_requests();
|
||||||
|
msgr.send_replies();
|
||||||
|
this->ringloop->submit();
|
||||||
|
};
|
||||||
|
ringloop->register_consumer(&consumer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void cluster_client_t::calc_wait(cluster_op_t *op)
|
void cluster_client_t::calc_wait(cluster_op_t *op)
|
||||||
{
|
{
|
||||||
op->prev_wait = 0;
|
op->prev_wait = 0;
|
||||||
@@ -143,7 +149,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
|
|||||||
if (!op->prev_wait)
|
if (!op->prev_wait)
|
||||||
continue_sync(op);
|
continue_sync(op);
|
||||||
}
|
}
|
||||||
else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) */
|
else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) */
|
||||||
{
|
{
|
||||||
for (auto prev = op_queue_head; prev && prev != op; prev = prev->next)
|
for (auto prev = op_queue_head; prev && prev != op; prev = prev->next)
|
||||||
{
|
{
|
||||||
@@ -151,7 +157,8 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
|
|||||||
{
|
{
|
||||||
op->prev_wait++;
|
op->prev_wait++;
|
||||||
}
|
}
|
||||||
else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ || prev->opcode == OSD_OP_READ_BITMAP)
|
else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ ||
|
||||||
|
prev->opcode == OSD_OP_READ_BITMAP || prev->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
||||||
{
|
{
|
||||||
// Flushes are always in the beginning (we're scanning from the beginning of the queue)
|
// Flushes are always in the beginning (we're scanning from the beginning of the queue)
|
||||||
break;
|
break;
|
||||||
@@ -171,7 +178,8 @@ void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *n
|
|||||||
auto n2 = next->next;
|
auto n2 = next->next;
|
||||||
if (next->opcode == OSD_OP_SYNC && !(flags & OP_IMMEDIATE_COMMIT) ||
|
if (next->opcode == OSD_OP_SYNC && !(flags & OP_IMMEDIATE_COMMIT) ||
|
||||||
next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER) ||
|
next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER) ||
|
||||||
(next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP) && (flags & OP_FLUSH_BUFFER))
|
(next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP ||
|
||||||
|
next->opcode == OSD_OP_READ_CHAIN_BITMAP) && (flags & OP_FLUSH_BUFFER))
|
||||||
{
|
{
|
||||||
next->prev_wait += inc;
|
next->prev_wait += inc;
|
||||||
assert(next->prev_wait >= 0);
|
assert(next->prev_wait >= 0);
|
||||||
@@ -221,11 +229,14 @@ void cluster_client_t::erase_op(cluster_op_t *op)
|
|||||||
if (op_queue_tail == op)
|
if (op_queue_tail == op)
|
||||||
op_queue_tail = op->prev;
|
op_queue_tail = op->prev;
|
||||||
op->next = op->prev = NULL;
|
op->next = op->prev = NULL;
|
||||||
|
if (flags & OP_FLUSH_BUFFER)
|
||||||
|
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||||
if (!(flags & OP_IMMEDIATE_COMMIT))
|
if (!(flags & OP_IMMEDIATE_COMMIT))
|
||||||
inc_wait(opcode, flags, next, -1);
|
inc_wait(opcode, flags, next, -1);
|
||||||
// Call callback at the end to avoid inconsistencies in prev_wait
|
// Call callback at the end to avoid inconsistencies in prev_wait
|
||||||
// if the callback adds more operations itself
|
// if the callback adds more operations itself
|
||||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
if (!(flags & OP_FLUSH_BUFFER))
|
||||||
|
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||||
}
|
}
|
||||||
|
|
||||||
void cluster_client_t::continue_ops(bool up_retry)
|
void cluster_client_t::continue_ops(bool up_retry)
|
||||||
@@ -337,7 +348,8 @@ void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes
|
|||||||
// And now they have to be resliced!
|
// And now they have to be resliced!
|
||||||
for (auto op = op_queue_head; op; op = op->next)
|
for (auto op = op_queue_head; op; op = op->next)
|
||||||
{
|
{
|
||||||
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) &&
|
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ ||
|
||||||
|
op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) &&
|
||||||
INODE_POOL(op->cur_inode) == pool_item.first)
|
INODE_POOL(op->cur_inode) == pool_item.first)
|
||||||
{
|
{
|
||||||
op->needs_reslice = true;
|
op->needs_reslice = true;
|
||||||
@@ -409,7 +421,7 @@ void cluster_client_t::on_ready(std::function<void(void)> fn)
|
|||||||
void cluster_client_t::execute(cluster_op_t *op)
|
void cluster_client_t::execute(cluster_op_t *op)
|
||||||
{
|
{
|
||||||
if (op->opcode != OSD_OP_SYNC && op->opcode != OSD_OP_READ &&
|
if (op->opcode != OSD_OP_SYNC && op->opcode != OSD_OP_READ &&
|
||||||
op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_WRITE)
|
op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_WRITE)
|
||||||
{
|
{
|
||||||
op->retval = -EINVAL;
|
op->retval = -EINVAL;
|
||||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||||
@@ -441,7 +453,7 @@ void cluster_client_t::execute(cluster_op_t *op)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// Check alignment
|
// Check alignment
|
||||||
if ((op->opcode == OSD_OP_READ || op->opcode == OSD_OP_WRITE) && !op->len ||
|
if (!op->len && (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP || op->opcode == OSD_OP_WRITE) ||
|
||||||
op->offset % pool_it->second.bitmap_granularity || op->len % pool_it->second.bitmap_granularity)
|
op->offset % pool_it->second.bitmap_granularity || op->len % pool_it->second.bitmap_granularity)
|
||||||
{
|
{
|
||||||
op->retval = -EINVAL;
|
op->retval = -EINVAL;
|
||||||
@@ -702,8 +714,7 @@ resume_3:
|
|||||||
// Finished successfully
|
// Finished successfully
|
||||||
// Even if the PG count has changed in meanwhile we treat it as success
|
// Even if the PG count has changed in meanwhile we treat it as success
|
||||||
// because if some operations were invalid for the new PG count we'd get errors
|
// because if some operations were invalid for the new PG count we'd get errors
|
||||||
bool is_read = op->opcode == OSD_OP_READ;
|
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
||||||
if (is_read)
|
|
||||||
{
|
{
|
||||||
// Check parent inode
|
// Check parent inode
|
||||||
auto ino_it = st_cli.inode_config.find(op->cur_inode);
|
auto ino_it = st_cli.inode_config.find(op->cur_inode);
|
||||||
@@ -727,6 +738,11 @@ resume_3:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
op->retval = op->len;
|
op->retval = op->len;
|
||||||
|
if (op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
||||||
|
{
|
||||||
|
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->inode));
|
||||||
|
op->retval = op->len / pool_cfg.bitmap_granularity;
|
||||||
|
}
|
||||||
erase_op(op);
|
erase_op(op);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@@ -750,7 +766,10 @@ resume_3:
|
|||||||
{
|
{
|
||||||
for (int i = 0; i < op->parts.size(); i++)
|
for (int i = 0; i < op->parts.size(); i++)
|
||||||
{
|
{
|
||||||
op->parts[i].flags = PART_RETRY;
|
if (!(op->parts[i].flags & PART_DONE))
|
||||||
|
{
|
||||||
|
op->parts[i].flags = PART_RETRY;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
goto resume_2;
|
goto resume_2;
|
||||||
}
|
}
|
||||||
@@ -809,23 +828,19 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
|
|||||||
uint64_t last_stripe = op->len > 0 ? ((op->offset + op->len - 1) / pg_block_size) * pg_block_size : first_stripe;
|
uint64_t last_stripe = op->len > 0 ? ((op->offset + op->len - 1) / pg_block_size) * pg_block_size : first_stripe;
|
||||||
op->retval = 0;
|
op->retval = 0;
|
||||||
op->parts.resize((last_stripe - first_stripe) / pg_block_size + 1);
|
op->parts.resize((last_stripe - first_stripe) / pg_block_size + 1);
|
||||||
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP)
|
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
||||||
{
|
{
|
||||||
// Allocate memory for the bitmap
|
// Allocate memory for the bitmap
|
||||||
unsigned object_bitmap_size = (((op->opcode == OSD_OP_READ_BITMAP ? pg_block_size : op->len) / pool_cfg.bitmap_granularity + 7) / 8);
|
unsigned object_bitmap_size = ((op->len / pool_cfg.bitmap_granularity + 7) / 8);
|
||||||
object_bitmap_size = (object_bitmap_size < 8 ? 8 : object_bitmap_size);
|
object_bitmap_size = (object_bitmap_size < 8 ? 8 : object_bitmap_size);
|
||||||
unsigned bitmap_mem = object_bitmap_size + (pool_cfg.data_block_size / pool_cfg.bitmap_granularity / 8 * pg_data_size) * op->parts.size();
|
unsigned bitmap_mem = object_bitmap_size + (pool_cfg.data_block_size / pool_cfg.bitmap_granularity / 8 * pg_data_size) * op->parts.size();
|
||||||
if (op->bitmap_buf_size < bitmap_mem)
|
if (!op->bitmap_buf || op->bitmap_buf_size < bitmap_mem)
|
||||||
{
|
{
|
||||||
op->bitmap_buf = realloc_or_die(op->bitmap_buf, bitmap_mem);
|
op->bitmap_buf = realloc_or_die(op->bitmap_buf, bitmap_mem);
|
||||||
if (!op->bitmap_buf_size)
|
|
||||||
{
|
|
||||||
// First allocation
|
|
||||||
memset(op->bitmap_buf, 0, object_bitmap_size);
|
|
||||||
}
|
|
||||||
op->part_bitmaps = (uint8_t*)op->bitmap_buf + object_bitmap_size;
|
op->part_bitmaps = (uint8_t*)op->bitmap_buf + object_bitmap_size;
|
||||||
op->bitmap_buf_size = bitmap_mem;
|
op->bitmap_buf_size = bitmap_mem;
|
||||||
}
|
}
|
||||||
|
memset(op->bitmap_buf, 0, bitmap_mem);
|
||||||
}
|
}
|
||||||
int iov_idx = 0;
|
int iov_idx = 0;
|
||||||
size_t iov_pos = 0;
|
size_t iov_pos = 0;
|
||||||
@@ -876,13 +891,14 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
|
|||||||
if (end == begin)
|
if (end == begin)
|
||||||
op->done_count++;
|
op->done_count++;
|
||||||
}
|
}
|
||||||
else if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_DELETE)
|
else if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_DELETE)
|
||||||
{
|
{
|
||||||
add_iov(end-begin, false, op, iov_idx, iov_pos, op->parts[i].iov, NULL, 0);
|
add_iov(end-begin, false, op, iov_idx, iov_pos, op->parts[i].iov, NULL, 0);
|
||||||
}
|
}
|
||||||
op->parts[i].parent = op;
|
op->parts[i].parent = op;
|
||||||
op->parts[i].offset = begin;
|
op->parts[i].offset = begin;
|
||||||
op->parts[i].len = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_DELETE ? 0 : (uint32_t)(end - begin);
|
op->parts[i].len = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP ||
|
||||||
|
op->opcode == OSD_OP_DELETE ? 0 : (uint32_t)(end - begin);
|
||||||
op->parts[i].pg_num = pg_num;
|
op->parts[i].pg_num = pg_num;
|
||||||
op->parts[i].osd_num = 0;
|
op->parts[i].osd_num = 0;
|
||||||
op->parts[i].flags = 0;
|
op->parts[i].flags = 0;
|
||||||
@@ -911,6 +927,10 @@ bool cluster_client_t::affects_osd(uint64_t inode, uint64_t offset, uint64_t len
|
|||||||
|
|
||||||
bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||||
{
|
{
|
||||||
|
if (!msgr_initialized)
|
||||||
|
{
|
||||||
|
init_msgr();
|
||||||
|
}
|
||||||
auto part = &op->parts[i];
|
auto part = &op->parts[i];
|
||||||
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->cur_inode));
|
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->cur_inode));
|
||||||
auto pg_it = pool_cfg.pg_config.find(part->pg_num);
|
auto pg_it = pool_cfg.pg_config.find(part->pg_num);
|
||||||
@@ -929,7 +949,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
|||||||
pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
|
pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
|
||||||
);
|
);
|
||||||
uint64_t meta_rev = 0;
|
uint64_t meta_rev = 0;
|
||||||
if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_DELETE)
|
if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_DELETE)
|
||||||
{
|
{
|
||||||
auto ino_it = st_cli.inode_config.find(op->inode);
|
auto ino_it = st_cli.inode_config.find(op->inode);
|
||||||
if (ino_it != st_cli.inode_config.end())
|
if (ino_it != st_cli.inode_config.end())
|
||||||
@@ -942,7 +962,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
|||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = next_op_id(),
|
.id = next_op_id(),
|
||||||
.opcode = op->opcode == OSD_OP_READ_BITMAP ? OSD_OP_READ : op->opcode,
|
.opcode = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP ? OSD_OP_READ : op->opcode,
|
||||||
},
|
},
|
||||||
.inode = op->cur_inode,
|
.inode = op->cur_inode,
|
||||||
.offset = part->offset,
|
.offset = part->offset,
|
||||||
@@ -950,8 +970,10 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
|||||||
.meta_revision = meta_rev,
|
.meta_revision = meta_rev,
|
||||||
.version = op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE ? op->version : 0,
|
.version = op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE ? op->version : 0,
|
||||||
} },
|
} },
|
||||||
.bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? (uint8_t*)op->part_bitmaps + pg_bitmap_size*i : NULL),
|
.bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP
|
||||||
.bitmap_len = (unsigned)(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? pg_bitmap_size : 0),
|
? (uint8_t*)op->part_bitmaps + pg_bitmap_size*i : NULL),
|
||||||
|
.bitmap_len = (unsigned)(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP
|
||||||
|
? pg_bitmap_size : 0),
|
||||||
.callback = [this, part](osd_op_t *op_part)
|
.callback = [this, part](osd_op_t *op_part)
|
||||||
{
|
{
|
||||||
handle_op_part(part);
|
handle_op_part(part);
|
||||||
@@ -1130,11 +1152,11 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
// OK
|
// OK
|
||||||
if (!(op->flags & OP_IMMEDIATE_COMMIT))
|
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE) && !(op->flags & OP_IMMEDIATE_COMMIT))
|
||||||
dirty_osds.insert(part->osd_num);
|
dirty_osds.insert(part->osd_num);
|
||||||
part->flags |= PART_DONE;
|
part->flags |= PART_DONE;
|
||||||
op->done_count++;
|
op->done_count++;
|
||||||
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP)
|
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
||||||
{
|
{
|
||||||
copy_part_bitmap(op, part);
|
copy_part_bitmap(op, part);
|
||||||
op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
|
op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
|
||||||
@@ -1158,7 +1180,12 @@ void cluster_client_t::copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *par
|
|||||||
);
|
);
|
||||||
uint32_t object_offset = (part->op.req.rw.offset - op->offset) / pool_cfg.bitmap_granularity;
|
uint32_t object_offset = (part->op.req.rw.offset - op->offset) / pool_cfg.bitmap_granularity;
|
||||||
uint32_t part_offset = (part->op.req.rw.offset % pg_block_size) / pool_cfg.bitmap_granularity;
|
uint32_t part_offset = (part->op.req.rw.offset % pg_block_size) / pool_cfg.bitmap_granularity;
|
||||||
uint32_t part_len = (op->opcode == OSD_OP_READ_BITMAP ? pg_block_size : part->op.req.rw.len) / pool_cfg.bitmap_granularity;
|
uint32_t op_len = op->len / pool_cfg.bitmap_granularity;
|
||||||
|
uint32_t part_len = pg_block_size/pool_cfg.bitmap_granularity - part_offset;
|
||||||
|
if (part_len > op_len-object_offset)
|
||||||
|
{
|
||||||
|
part_len = op_len-object_offset;
|
||||||
|
}
|
||||||
if (!(object_offset & 0x7) && !(part_offset & 0x7) && (part_len >= 8))
|
if (!(object_offset & 0x7) && !(part_offset & 0x7) && (part_len >= 8))
|
||||||
{
|
{
|
||||||
// Copy bytes
|
// Copy bytes
|
||||||
|
@@ -11,6 +11,7 @@
|
|||||||
#define INODE_LIST_DONE 1
|
#define INODE_LIST_DONE 1
|
||||||
#define INODE_LIST_HAS_UNSTABLE 2
|
#define INODE_LIST_HAS_UNSTABLE 2
|
||||||
#define OSD_OP_READ_BITMAP OSD_OP_SEC_READ_BMP
|
#define OSD_OP_READ_BITMAP OSD_OP_SEC_READ_BMP
|
||||||
|
#define OSD_OP_READ_CHAIN_BITMAP 0x102
|
||||||
|
|
||||||
#define OSD_OP_IGNORE_READONLY 0x08
|
#define OSD_OP_IGNORE_READONLY 0x08
|
||||||
|
|
||||||
@@ -30,7 +31,7 @@ struct cluster_op_part_t
|
|||||||
|
|
||||||
struct cluster_op_t
|
struct cluster_op_t
|
||||||
{
|
{
|
||||||
uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE, OSD_OP_READ_BITMAP
|
uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE, OSD_OP_READ_BITMAP, OSD_OP_READ_CHAIN_BITMAP
|
||||||
uint64_t inode;
|
uint64_t inode;
|
||||||
uint64_t offset;
|
uint64_t offset;
|
||||||
uint64_t len;
|
uint64_t len;
|
||||||
@@ -39,9 +40,13 @@ struct cluster_op_t
|
|||||||
uint64_t version = 0;
|
uint64_t version = 0;
|
||||||
// now only OSD_OP_IGNORE_READONLY is supported
|
// now only OSD_OP_IGNORE_READONLY is supported
|
||||||
uint64_t flags = 0;
|
uint64_t flags = 0;
|
||||||
|
// negative retval is an error number
|
||||||
|
// write and read return len on success
|
||||||
|
// sync and delete return 0 on success
|
||||||
|
// read_bitmap and read_chain_bitmap return the length of bitmap in bits(!)
|
||||||
int retval;
|
int retval;
|
||||||
osd_op_buf_list_t iov;
|
osd_op_buf_list_t iov;
|
||||||
// READ and READ_BITMAP return the bitmap here
|
// READ, READ_BITMAP, READ_CHAIN_BITMAP return the bitmap here
|
||||||
void *bitmap_buf = NULL;
|
void *bitmap_buf = NULL;
|
||||||
std::function<void(cluster_op_t*)> callback;
|
std::function<void(cluster_op_t*)> callback;
|
||||||
~cluster_op_t();
|
~cluster_op_t();
|
||||||
@@ -99,10 +104,14 @@ class cluster_client_t
|
|||||||
std::vector<std::function<void(void)>> on_ready_hooks;
|
std::vector<std::function<void(void)>> on_ready_hooks;
|
||||||
std::vector<inode_list_t*> lists;
|
std::vector<inode_list_t*> lists;
|
||||||
int continuing_ops = 0;
|
int continuing_ops = 0;
|
||||||
|
bool msgr_initialized = false;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
etcd_state_client_t st_cli;
|
etcd_state_client_t st_cli;
|
||||||
|
|
||||||
osd_messenger_t msgr;
|
osd_messenger_t msgr;
|
||||||
|
void init_msgr();
|
||||||
|
|
||||||
json11::Json config;
|
json11::Json config;
|
||||||
json11::Json::object merged_config;
|
json11::Json::object merged_config;
|
||||||
|
|
||||||
|
@@ -387,6 +387,14 @@ int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
|
|||||||
rm_osd_cli.push_back(std::to_string(osd_num));
|
rm_osd_cli.push_back(std::to_string(osd_num));
|
||||||
}
|
}
|
||||||
// Check for data loss
|
// Check for data loss
|
||||||
|
if (options["force"] != "")
|
||||||
|
{
|
||||||
|
rm_osd_cli.push_back("--force");
|
||||||
|
}
|
||||||
|
else if (options["allow_data_loss"] != "")
|
||||||
|
{
|
||||||
|
rm_osd_cli.push_back("--allow-data-loss");
|
||||||
|
}
|
||||||
rm_osd_cli.push_back("--dry-run");
|
rm_osd_cli.push_back("--dry-run");
|
||||||
std::string dry_run_ignore_stdout;
|
std::string dry_run_ignore_stdout;
|
||||||
if (shell_exec(rm_osd_cli, "", &dry_run_ignore_stdout, NULL) != 0)
|
if (shell_exec(rm_osd_cli, "", &dry_run_ignore_stdout, NULL) != 0)
|
||||||
@@ -405,14 +413,6 @@ int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
|
|||||||
}
|
}
|
||||||
// Remove OSD metadata
|
// Remove OSD metadata
|
||||||
rm_osd_cli.pop_back();
|
rm_osd_cli.pop_back();
|
||||||
if (options["force"] != "")
|
|
||||||
{
|
|
||||||
rm_osd_cli.push_back("--force");
|
|
||||||
}
|
|
||||||
else if (options["allow_data_loss"] != "")
|
|
||||||
{
|
|
||||||
rm_osd_cli.push_back("--allow-data-loss");
|
|
||||||
}
|
|
||||||
if (shell_exec(rm_osd_cli, "", NULL, NULL) != 0)
|
if (shell_exec(rm_osd_cli, "", NULL, NULL) != 0)
|
||||||
{
|
{
|
||||||
return 1;
|
return 1;
|
||||||
|
@@ -305,10 +305,10 @@ int write_zero(int fd, uint64_t offset, uint64_t size)
|
|||||||
json11::Json read_parttable(std::string dev)
|
json11::Json read_parttable(std::string dev)
|
||||||
{
|
{
|
||||||
std::string part_dump;
|
std::string part_dump;
|
||||||
int r = shell_exec({ "sfdisk", "--dump", dev, "--json" }, "", &part_dump, NULL);
|
int r = shell_exec({ "sfdisk", "--json", dev }, "", &part_dump, NULL);
|
||||||
if (r == 255)
|
if (r == 255)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Error running sfdisk --dump %s --json\n", dev.c_str());
|
fprintf(stderr, "Error running sfdisk --json %s\n", dev.c_str());
|
||||||
return json11::Json(false);
|
return json11::Json(false);
|
||||||
}
|
}
|
||||||
// Decode partition table
|
// Decode partition table
|
||||||
@@ -319,7 +319,7 @@ json11::Json read_parttable(std::string dev)
|
|||||||
pt = json11::Json::parse(part_dump, err);
|
pt = json11::Json::parse(part_dump, err);
|
||||||
if (err != "")
|
if (err != "")
|
||||||
{
|
{
|
||||||
fprintf(stderr, "sfdisk --dump %s --json returned bad JSON: %s\n", dev.c_str(), part_dump.c_str());
|
fprintf(stderr, "sfdisk --json %s returned bad JSON: %s\n", dev.c_str(), part_dump.c_str());
|
||||||
return json11::Json(false);
|
return json11::Json(false);
|
||||||
}
|
}
|
||||||
pt = pt["partitiontable"];
|
pt = pt["partitiontable"];
|
||||||
|
@@ -7,8 +7,8 @@
|
|||||||
#ifndef __MOCK__
|
#ifndef __MOCK__
|
||||||
#include "addr_util.h"
|
#include "addr_util.h"
|
||||||
#include "http_client.h"
|
#include "http_client.h"
|
||||||
#include "str_util.h"
|
|
||||||
#endif
|
#endif
|
||||||
|
#include "str_util.h"
|
||||||
|
|
||||||
etcd_state_client_t::~etcd_state_client_t()
|
etcd_state_client_t::~etcd_state_client_t()
|
||||||
{
|
{
|
||||||
@@ -759,6 +759,10 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
fprintf(stderr, "Pool %u has invalid bitmap_granularity (must divide block_size), skipping pool\n", pool_id);
|
fprintf(stderr, "Pool %u has invalid bitmap_granularity (must divide block_size), skipping pool\n", pool_id);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
// Scrub Interval
|
||||||
|
pc.scrub_interval = parse_time(pool_item.second["scrub_interval"].string_value());
|
||||||
|
if (!pc.scrub_interval)
|
||||||
|
pc.scrub_interval = 0;
|
||||||
// Immediate Commit Mode
|
// Immediate Commit Mode
|
||||||
pc.immediate_commit = pool_item.second["immediate_commit"].is_string()
|
pc.immediate_commit = pool_item.second["immediate_commit"].is_string()
|
||||||
? (pool_item.second["immediate_commit"].string_value() == "all"
|
? (pool_item.second["immediate_commit"].string_value() == "all"
|
||||||
@@ -871,22 +875,38 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
pg_cfg.target_history.clear();
|
pg_cfg.target_history.clear();
|
||||||
pg_cfg.all_peers.clear();
|
pg_cfg.all_peers.clear();
|
||||||
// Refuse to start PG if any set of the <osd_sets> has no live OSDs
|
// Refuse to start PG if any set of the <osd_sets> has no live OSDs
|
||||||
for (auto hist_item: value["osd_sets"].array_items())
|
for (auto & hist_item: value["osd_sets"].array_items())
|
||||||
{
|
{
|
||||||
std::vector<osd_num_t> history_set;
|
std::vector<osd_num_t> history_set;
|
||||||
for (auto pg_osd: hist_item.array_items())
|
for (auto & pg_osd: hist_item.array_items())
|
||||||
{
|
{
|
||||||
history_set.push_back(pg_osd.uint64_value());
|
osd_num_t pg_osd_num = pg_osd.uint64_value();
|
||||||
|
if (pg_osd_num != 0)
|
||||||
|
{
|
||||||
|
auto it = std::lower_bound(history_set.begin(), history_set.end(), pg_osd_num);
|
||||||
|
if (it == history_set.end() || *it != pg_osd_num)
|
||||||
|
history_set.insert(it, pg_osd_num);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
pg_cfg.target_history.push_back(history_set);
|
auto it = std::lower_bound(pg_cfg.target_history.begin(), pg_cfg.target_history.end(), history_set);
|
||||||
|
if (it == pg_cfg.target_history.end() || *it != history_set)
|
||||||
|
pg_cfg.target_history.insert(it, history_set);
|
||||||
}
|
}
|
||||||
// Include these additional OSDs when peering the PG
|
// Include these additional OSDs when peering the PG
|
||||||
for (auto pg_osd: value["all_peers"].array_items())
|
for (auto pg_osd: value["all_peers"].array_items())
|
||||||
{
|
{
|
||||||
pg_cfg.all_peers.push_back(pg_osd.uint64_value());
|
osd_num_t pg_osd_num = pg_osd.uint64_value();
|
||||||
|
if (pg_osd_num != 0)
|
||||||
|
{
|
||||||
|
auto it = std::lower_bound(pg_cfg.all_peers.begin(), pg_cfg.all_peers.end(), pg_osd_num);
|
||||||
|
if (it == pg_cfg.all_peers.end() || *it != pg_osd_num)
|
||||||
|
pg_cfg.all_peers.insert(it, pg_osd_num);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// Read epoch
|
// Read epoch
|
||||||
pg_cfg.epoch = value["epoch"].uint64_value();
|
pg_cfg.epoch = value["epoch"].uint64_value();
|
||||||
|
// Scrub timestamp
|
||||||
|
pg_cfg.scrub_ts = parse_time(value["scrub_ts"].string_value());
|
||||||
if (on_change_pg_history_hook != NULL)
|
if (on_change_pg_history_hook != NULL)
|
||||||
{
|
{
|
||||||
on_change_pg_history_hook(pool_id, pg_num);
|
on_change_pg_history_hook(pool_id, pg_num);
|
||||||
|
@@ -39,6 +39,7 @@ struct pg_config_t
|
|||||||
osd_num_t cur_primary;
|
osd_num_t cur_primary;
|
||||||
int cur_state;
|
int cur_state;
|
||||||
uint64_t epoch;
|
uint64_t epoch;
|
||||||
|
uint64_t scrub_ts;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct pool_config_t
|
struct pool_config_t
|
||||||
@@ -55,6 +56,7 @@ struct pool_config_t
|
|||||||
uint64_t max_osd_combinations;
|
uint64_t max_osd_combinations;
|
||||||
uint64_t pg_stripe_size;
|
uint64_t pg_stripe_size;
|
||||||
std::map<pg_num_t, pg_config_t> pg_config;
|
std::map<pg_num_t, pg_config_t> pg_config;
|
||||||
|
uint64_t scrub_interval;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct inode_config_t
|
struct inode_config_t
|
||||||
|
@@ -157,7 +157,7 @@ void osd_messenger_t::parse_config(const json11::Json & config)
|
|||||||
this->rdma_max_sge = 128;
|
this->rdma_max_sge = 128;
|
||||||
this->rdma_max_send = config["rdma_max_send"].uint64_value();
|
this->rdma_max_send = config["rdma_max_send"].uint64_value();
|
||||||
if (!this->rdma_max_send)
|
if (!this->rdma_max_send)
|
||||||
this->rdma_max_send = 1;
|
this->rdma_max_send = 64;
|
||||||
this->rdma_max_recv = config["rdma_max_recv"].uint64_value();
|
this->rdma_max_recv = config["rdma_max_recv"].uint64_value();
|
||||||
if (!this->rdma_max_recv)
|
if (!this->rdma_max_recv)
|
||||||
this->rdma_max_recv = 128;
|
this->rdma_max_recv = 128;
|
||||||
|
@@ -138,6 +138,7 @@ protected:
|
|||||||
|
|
||||||
std::vector<int> read_ready_clients;
|
std::vector<int> read_ready_clients;
|
||||||
std::vector<int> write_ready_clients;
|
std::vector<int> write_ready_clients;
|
||||||
|
// We don't use ringloop->set_immediate here because we may have no ringloop in client :)
|
||||||
std::vector<std::function<void()>> set_immediate;
|
std::vector<std::function<void()>> set_immediate;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
@@ -368,9 +368,8 @@ static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
|
|||||||
bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
|
bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
|
||||||
{
|
{
|
||||||
auto rc = cl->rdma_conn;
|
auto rc = cl->rdma_conn;
|
||||||
if (!cl->send_list.size() || rc->cur_send > 0)
|
if (!cl->send_list.size() || rc->cur_send >= rc->max_send)
|
||||||
{
|
{
|
||||||
// Only send one batch at a time
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
uint64_t op_size = 0, op_sge = 0;
|
uint64_t op_size = 0, op_sge = 0;
|
||||||
@@ -380,6 +379,7 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
|
|||||||
iovec & iov = cl->send_list[rc->send_pos];
|
iovec & iov = cl->send_list[rc->send_pos];
|
||||||
if (op_size >= rc->max_msg || op_sge >= rc->max_sge)
|
if (op_size >= rc->max_msg || op_sge >= rc->max_sge)
|
||||||
{
|
{
|
||||||
|
rc->send_sizes.push_back(op_size);
|
||||||
try_send_rdma_wr(cl, sge, op_sge);
|
try_send_rdma_wr(cl, sge, op_sge);
|
||||||
op_sge = 0;
|
op_sge = 0;
|
||||||
op_size = 0;
|
op_size = 0;
|
||||||
@@ -405,18 +405,24 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
|
|||||||
}
|
}
|
||||||
if (op_sge > 0)
|
if (op_sge > 0)
|
||||||
{
|
{
|
||||||
|
rc->send_sizes.push_back(op_size);
|
||||||
try_send_rdma_wr(cl, sge, op_sge);
|
try_send_rdma_wr(cl, sge, op_sge);
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void try_recv_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
|
static void try_recv_rdma_wr(osd_client_t *cl, void *buf)
|
||||||
{
|
{
|
||||||
|
ibv_sge sge = {
|
||||||
|
.addr = (uintptr_t)buf,
|
||||||
|
.length = (uint32_t)cl->rdma_conn->max_msg,
|
||||||
|
.lkey = cl->rdma_conn->ctx->mr->lkey,
|
||||||
|
};
|
||||||
ibv_recv_wr *bad_wr = NULL;
|
ibv_recv_wr *bad_wr = NULL;
|
||||||
ibv_recv_wr wr = {
|
ibv_recv_wr wr = {
|
||||||
.wr_id = (uint64_t)(cl->peer_fd*2),
|
.wr_id = (uint64_t)(cl->peer_fd*2),
|
||||||
.sg_list = sge,
|
.sg_list = &sge,
|
||||||
.num_sge = op_sge,
|
.num_sge = 1,
|
||||||
};
|
};
|
||||||
int err = ibv_post_recv(cl->rdma_conn->qp, &wr, &bad_wr);
|
int err = ibv_post_recv(cl->rdma_conn->qp, &wr, &bad_wr);
|
||||||
if (err || bad_wr)
|
if (err || bad_wr)
|
||||||
@@ -434,12 +440,7 @@ bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
|
|||||||
{
|
{
|
||||||
void *buf = malloc_or_die(rc->max_msg);
|
void *buf = malloc_or_die(rc->max_msg);
|
||||||
rc->recv_buffers.push_back(buf);
|
rc->recv_buffers.push_back(buf);
|
||||||
ibv_sge sge = {
|
try_recv_rdma_wr(cl, buf);
|
||||||
.addr = (uintptr_t)buf,
|
|
||||||
.length = (uint32_t)rc->max_msg,
|
|
||||||
.lkey = rc->ctx->mr->lkey,
|
|
||||||
};
|
|
||||||
try_recv_rdma_wr(cl, &sge, 1);
|
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -476,6 +477,7 @@ void osd_messenger_t::handle_rdma_events()
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
osd_client_t *cl = cl_it->second;
|
osd_client_t *cl = cl_it->second;
|
||||||
|
auto rc = cl->rdma_conn;
|
||||||
if (wc[i].status != IBV_WC_SUCCESS)
|
if (wc[i].status != IBV_WC_SUCCESS)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "RDMA work request failed for client %d", client_id);
|
fprintf(stderr, "RDMA work request failed for client %d", client_id);
|
||||||
@@ -489,44 +491,59 @@ void osd_messenger_t::handle_rdma_events()
|
|||||||
}
|
}
|
||||||
if (!is_send)
|
if (!is_send)
|
||||||
{
|
{
|
||||||
cl->rdma_conn->cur_recv--;
|
rc->cur_recv--;
|
||||||
if (!handle_read_buffer(cl, cl->rdma_conn->recv_buffers[0], wc[i].byte_len))
|
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf], wc[i].byte_len))
|
||||||
{
|
{
|
||||||
// handle_read_buffer may stop the client
|
// handle_read_buffer may stop the client
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
free(cl->rdma_conn->recv_buffers[0]);
|
try_recv_rdma_wr(cl, rc->recv_buffers[rc->next_recv_buf]);
|
||||||
cl->rdma_conn->recv_buffers.erase(cl->rdma_conn->recv_buffers.begin(), cl->rdma_conn->recv_buffers.begin()+1);
|
rc->next_recv_buf = (rc->next_recv_buf+1) % rc->recv_buffers.size();
|
||||||
try_recv_rdma(cl);
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
cl->rdma_conn->cur_send--;
|
rc->cur_send--;
|
||||||
if (!cl->rdma_conn->cur_send)
|
uint64_t sent_size = rc->send_sizes.at(0);
|
||||||
|
rc->send_sizes.erase(rc->send_sizes.begin(), rc->send_sizes.begin()+1);
|
||||||
|
int send_pos = 0, send_buf_pos = 0;
|
||||||
|
while (sent_size > 0)
|
||||||
{
|
{
|
||||||
// Wait for the whole batch
|
if (sent_size >= cl->send_list.at(send_pos).iov_len)
|
||||||
for (int i = 0; i < cl->rdma_conn->send_pos; i++)
|
|
||||||
{
|
{
|
||||||
if (cl->outbox[i].flags & MSGR_SENDP_FREE)
|
sent_size -= cl->send_list[send_pos].iov_len;
|
||||||
{
|
send_pos++;
|
||||||
// Reply fully sent
|
|
||||||
delete cl->outbox[i].op;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (cl->rdma_conn->send_pos > 0)
|
else
|
||||||
{
|
{
|
||||||
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+cl->rdma_conn->send_pos);
|
send_buf_pos = sent_size;
|
||||||
cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+cl->rdma_conn->send_pos);
|
sent_size = 0;
|
||||||
cl->rdma_conn->send_pos = 0;
|
|
||||||
}
|
}
|
||||||
if (cl->rdma_conn->send_buf_pos > 0)
|
|
||||||
{
|
|
||||||
cl->send_list[0].iov_base = (uint8_t*)cl->send_list[0].iov_base + cl->rdma_conn->send_buf_pos;
|
|
||||||
cl->send_list[0].iov_len -= cl->rdma_conn->send_buf_pos;
|
|
||||||
cl->rdma_conn->send_buf_pos = 0;
|
|
||||||
}
|
|
||||||
try_send_rdma(cl);
|
|
||||||
}
|
}
|
||||||
|
assert(rc->send_pos >= send_pos);
|
||||||
|
if (rc->send_pos == send_pos)
|
||||||
|
{
|
||||||
|
rc->send_buf_pos -= send_buf_pos;
|
||||||
|
}
|
||||||
|
rc->send_pos -= send_pos;
|
||||||
|
for (int i = 0; i < send_pos; i++)
|
||||||
|
{
|
||||||
|
if (cl->outbox[i].flags & MSGR_SENDP_FREE)
|
||||||
|
{
|
||||||
|
// Reply fully sent
|
||||||
|
delete cl->outbox[i].op;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (send_pos > 0)
|
||||||
|
{
|
||||||
|
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+send_pos);
|
||||||
|
cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+send_pos);
|
||||||
|
}
|
||||||
|
if (send_buf_pos > 0)
|
||||||
|
{
|
||||||
|
cl->send_list[0].iov_base = (uint8_t*)cl->send_list[0].iov_base + send_buf_pos;
|
||||||
|
cl->send_list[0].iov_len -= send_buf_pos;
|
||||||
|
}
|
||||||
|
try_send_rdma(cl);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} while (event_count > 0);
|
} while (event_count > 0);
|
||||||
|
@@ -49,8 +49,9 @@ struct msgr_rdma_connection_t
|
|||||||
uint64_t max_msg = 0;
|
uint64_t max_msg = 0;
|
||||||
|
|
||||||
int send_pos = 0, send_buf_pos = 0;
|
int send_pos = 0, send_buf_pos = 0;
|
||||||
int recv_pos = 0, recv_buf_pos = 0;
|
int next_recv_buf = 0;
|
||||||
std::vector<void*> recv_buffers;
|
std::vector<void*> recv_buffers;
|
||||||
|
std::vector<uint64_t> send_sizes;
|
||||||
|
|
||||||
~msgr_rdma_connection_t();
|
~msgr_rdma_connection_t();
|
||||||
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);
|
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);
|
||||||
|
32
src/osd.cpp
32
src/osd.cpp
@@ -163,6 +163,9 @@ void osd_t::parse_config(const json11::Json & config, bool allow_disk_params)
|
|||||||
recovery_queue_depth = config["recovery_queue_depth"].uint64_value();
|
recovery_queue_depth = config["recovery_queue_depth"].uint64_value();
|
||||||
if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
|
if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
|
||||||
recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
|
recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
|
||||||
|
recovery_pg_switch = config["recovery_pg_switch"].uint64_value();
|
||||||
|
if (recovery_pg_switch < 1)
|
||||||
|
recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
|
||||||
recovery_sync_batch = config["recovery_sync_batch"].uint64_value();
|
recovery_sync_batch = config["recovery_sync_batch"].uint64_value();
|
||||||
if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
|
if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
|
||||||
recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
||||||
@@ -175,6 +178,16 @@ void osd_t::parse_config(const json11::Json & config, bool allow_disk_params)
|
|||||||
inode_vanish_time = config["inode_vanish_time"].uint64_value();
|
inode_vanish_time = config["inode_vanish_time"].uint64_value();
|
||||||
if (!inode_vanish_time)
|
if (!inode_vanish_time)
|
||||||
inode_vanish_time = 60;
|
inode_vanish_time = 60;
|
||||||
|
global_scrub_interval = config["scrub_interval"].uint64_value();
|
||||||
|
if (!global_scrub_interval)
|
||||||
|
global_scrub_interval = 30*86400;
|
||||||
|
scrub_queue_depth = config["scrub_queue_depth"].uint64_value();
|
||||||
|
if (scrub_queue_depth < 1 || scrub_queue_depth > MAX_RECOVERY_QUEUE)
|
||||||
|
scrub_queue_depth = 1;
|
||||||
|
scrub_sleep_ms = config["scrub_sleep"].uint64_value();
|
||||||
|
scrub_list_limit = config["scrub_list_limit"].uint64_value();
|
||||||
|
if (!scrub_list_limit)
|
||||||
|
scrub_list_limit = 1000;
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::bind_socket()
|
void osd_t::bind_socket()
|
||||||
@@ -259,7 +272,8 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
|||||||
cur_op->req.hdr.opcode == OSD_OP_DELETE) &&
|
cur_op->req.hdr.opcode == OSD_OP_DELETE) &&
|
||||||
(cur_op->req.rw.len > OSD_RW_MAX ||
|
(cur_op->req.rw.len > OSD_RW_MAX ||
|
||||||
cur_op->req.rw.len % bs_bitmap_granularity ||
|
cur_op->req.rw.len % bs_bitmap_granularity ||
|
||||||
cur_op->req.rw.offset % bs_bitmap_granularity)))
|
cur_op->req.rw.offset % bs_bitmap_granularity)) ||
|
||||||
|
cur_op->req.hdr.opcode == OSD_OP_SCRUB && cur_op->peer_fd != -1)
|
||||||
{
|
{
|
||||||
// Bad command
|
// Bad command
|
||||||
finish_op(cur_op, -EINVAL);
|
finish_op(cur_op, -EINVAL);
|
||||||
@@ -276,6 +290,7 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
|||||||
cur_op->req.hdr.opcode != OSD_OP_SEC_LIST &&
|
cur_op->req.hdr.opcode != OSD_OP_SEC_LIST &&
|
||||||
cur_op->req.hdr.opcode != OSD_OP_READ &&
|
cur_op->req.hdr.opcode != OSD_OP_READ &&
|
||||||
cur_op->req.hdr.opcode != OSD_OP_SEC_READ_BMP &&
|
cur_op->req.hdr.opcode != OSD_OP_SEC_READ_BMP &&
|
||||||
|
cur_op->req.hdr.opcode != OSD_OP_SCRUB &&
|
||||||
cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG)
|
cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG)
|
||||||
{
|
{
|
||||||
// Readonly mode
|
// Readonly mode
|
||||||
@@ -306,6 +321,10 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
|||||||
{
|
{
|
||||||
continue_primary_del(cur_op);
|
continue_primary_del(cur_op);
|
||||||
}
|
}
|
||||||
|
else if (cur_op->req.hdr.opcode == OSD_OP_SCRUB)
|
||||||
|
{
|
||||||
|
continue_primary_scrub(cur_op);
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
exec_secondary(cur_op);
|
exec_secondary(cur_op);
|
||||||
@@ -370,6 +389,10 @@ void osd_t::print_stats()
|
|||||||
recovery_stat_bytes[1][i] = recovery_stat_bytes[0][i];
|
recovery_stat_bytes[1][i] = recovery_stat_bytes[0][i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (corrupted_objects > 0)
|
||||||
|
{
|
||||||
|
printf("[OSD %lu] %lu object(s) corrupted\n", osd_num, corrupted_objects);
|
||||||
|
}
|
||||||
if (incomplete_objects > 0)
|
if (incomplete_objects > 0)
|
||||||
{
|
{
|
||||||
printf("[OSD %lu] %lu object(s) incomplete\n", osd_num, incomplete_objects);
|
printf("[OSD %lu] %lu object(s) incomplete\n", osd_num, incomplete_objects);
|
||||||
@@ -437,10 +460,11 @@ void osd_t::print_slow()
|
|||||||
else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
|
else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
|
||||||
{
|
{
|
||||||
bufprintf(
|
bufprintf(
|
||||||
" inode=%lx-%lx pg=%u/%u, stripe=%lu",
|
" oid=%lx/%lx-%lx/%lx pg=%u/%u, stripe=%lu, limit=%u",
|
||||||
op->req.sec_list.min_inode, op->req.sec_list.max_inode,
|
op->req.sec_list.min_inode, op->req.sec_list.min_stripe,
|
||||||
|
op->req.sec_list.max_inode, op->req.sec_list.max_stripe,
|
||||||
op->req.sec_list.list_pg, op->req.sec_list.pg_count,
|
op->req.sec_list.list_pg, op->req.sec_list.pg_count,
|
||||||
op->req.sec_list.pg_stripe_size
|
op->req.sec_list.pg_stripe_size, op->req.sec_list.stable_limit
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
|
else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
|
||||||
|
40
src/osd.h
40
src/osd.h
@@ -28,12 +28,14 @@
|
|||||||
#define OSD_PEERING_PGS 0x04
|
#define OSD_PEERING_PGS 0x04
|
||||||
#define OSD_FLUSHING_PGS 0x08
|
#define OSD_FLUSHING_PGS 0x08
|
||||||
#define OSD_RECOVERING 0x10
|
#define OSD_RECOVERING 0x10
|
||||||
|
#define OSD_SCRUBBING 0x20
|
||||||
|
|
||||||
#define MAX_AUTOSYNC_INTERVAL 3600
|
#define MAX_AUTOSYNC_INTERVAL 3600
|
||||||
#define DEFAULT_AUTOSYNC_INTERVAL 5
|
#define DEFAULT_AUTOSYNC_INTERVAL 5
|
||||||
#define DEFAULT_AUTOSYNC_WRITES 128
|
#define DEFAULT_AUTOSYNC_WRITES 128
|
||||||
#define MAX_RECOVERY_QUEUE 2048
|
#define MAX_RECOVERY_QUEUE 2048
|
||||||
#define DEFAULT_RECOVERY_QUEUE 4
|
#define DEFAULT_RECOVERY_QUEUE 4
|
||||||
|
#define DEFAULT_RECOVERY_PG_SWITCH 128
|
||||||
#define DEFAULT_RECOVERY_BATCH 16
|
#define DEFAULT_RECOVERY_BATCH 16
|
||||||
|
|
||||||
//#define OSD_STUB
|
//#define OSD_STUB
|
||||||
@@ -108,9 +110,14 @@ class osd_t
|
|||||||
int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // "emergency" sync every 5 seconds
|
int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // "emergency" sync every 5 seconds
|
||||||
int autosync_writes = DEFAULT_AUTOSYNC_WRITES;
|
int autosync_writes = DEFAULT_AUTOSYNC_WRITES;
|
||||||
int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
|
int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
|
||||||
|
int recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
|
||||||
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
||||||
int inode_vanish_time = 60;
|
int inode_vanish_time = 60;
|
||||||
int log_level = 0;
|
int log_level = 0;
|
||||||
|
uint64_t global_scrub_interval = 30*86400;
|
||||||
|
uint64_t scrub_queue_depth = 1;
|
||||||
|
uint64_t scrub_sleep_ms = 0;
|
||||||
|
uint32_t scrub_list_limit = 1000;
|
||||||
|
|
||||||
// cluster state
|
// cluster state
|
||||||
|
|
||||||
@@ -132,12 +139,24 @@ class osd_t
|
|||||||
std::set<pool_pg_num_t> dirty_pgs;
|
std::set<pool_pg_num_t> dirty_pgs;
|
||||||
std::set<osd_num_t> dirty_osds;
|
std::set<osd_num_t> dirty_osds;
|
||||||
int copies_to_delete_after_sync_count = 0;
|
int copies_to_delete_after_sync_count = 0;
|
||||||
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
|
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0, corrupted_objects = 0;
|
||||||
int peering_state = 0;
|
int peering_state = 0;
|
||||||
std::map<object_id, osd_recovery_op_t> recovery_ops;
|
std::map<object_id, osd_recovery_op_t> recovery_ops;
|
||||||
int recovery_done = 0;
|
std::map<object_id, osd_op_t*> scrub_ops;
|
||||||
|
bool recovery_last_degraded = true;
|
||||||
|
pool_pg_num_t recovery_last_pg;
|
||||||
|
object_id recovery_last_oid;
|
||||||
|
int recovery_pg_done = 0, recovery_done = 0;
|
||||||
osd_op_t *autosync_op = NULL;
|
osd_op_t *autosync_op = NULL;
|
||||||
|
|
||||||
|
// Scrubbing
|
||||||
|
uint64_t scrub_nearest_ts = 0;
|
||||||
|
int scrub_timer_id = -1;
|
||||||
|
pool_pg_num_t scrub_last_pg;
|
||||||
|
osd_op_t *scrub_list_op;
|
||||||
|
pg_list_result_t scrub_cur_list = {};
|
||||||
|
uint64_t scrub_list_pos = 0;
|
||||||
|
|
||||||
// Unstable writes
|
// Unstable writes
|
||||||
uint64_t unstable_write_count = 0;
|
uint64_t unstable_write_count = 0;
|
||||||
std::map<osd_object_id_t, uint64_t> unstable_writes;
|
std::map<osd_object_id_t, uint64_t> unstable_writes;
|
||||||
@@ -200,7 +219,6 @@ class osd_t
|
|||||||
bool check_peer_config(osd_client_t *cl, json11::Json conf);
|
bool check_peer_config(osd_client_t *cl, json11::Json conf);
|
||||||
void repeer_pgs(osd_num_t osd_num);
|
void repeer_pgs(osd_num_t osd_num);
|
||||||
void start_pg_peering(pg_t & pg);
|
void start_pg_peering(pg_t & pg);
|
||||||
void submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
|
|
||||||
void submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
|
void submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
|
||||||
void discard_list_subop(osd_op_t *list_op);
|
void discard_list_subop(osd_op_t *list_op);
|
||||||
bool stop_pg(pg_t & pg);
|
bool stop_pg(pg_t & pg);
|
||||||
@@ -216,6 +234,13 @@ class osd_t
|
|||||||
bool continue_recovery();
|
bool continue_recovery();
|
||||||
pg_osd_set_state_t* change_osd_set(pg_osd_set_state_t *st, pg_t *pg);
|
pg_osd_set_state_t* change_osd_set(pg_osd_set_state_t *st, pg_t *pg);
|
||||||
|
|
||||||
|
// scrub
|
||||||
|
void scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oid);
|
||||||
|
bool pick_next_scrub(object_id & next_oid);
|
||||||
|
void submit_scrub_op(object_id oid);
|
||||||
|
bool continue_scrub();
|
||||||
|
void schedule_scrub(pg_t & pg);
|
||||||
|
|
||||||
// op execution
|
// op execution
|
||||||
void exec_op(osd_op_t *cur_op);
|
void exec_op(osd_op_t *cur_op);
|
||||||
void finish_op(osd_op_t *cur_op, int retval);
|
void finish_op(osd_op_t *cur_op, int retval);
|
||||||
@@ -230,13 +255,15 @@ class osd_t
|
|||||||
void autosync();
|
void autosync();
|
||||||
bool prepare_primary_rw(osd_op_t *cur_op);
|
bool prepare_primary_rw(osd_op_t *cur_op);
|
||||||
void continue_primary_read(osd_op_t *cur_op);
|
void continue_primary_read(osd_op_t *cur_op);
|
||||||
|
void continue_primary_scrub(osd_op_t *cur_op);
|
||||||
void continue_primary_write(osd_op_t *cur_op);
|
void continue_primary_write(osd_op_t *cur_op);
|
||||||
void cancel_primary_write(osd_op_t *cur_op);
|
void cancel_primary_write(osd_op_t *cur_op);
|
||||||
void continue_primary_sync(osd_op_t *cur_op);
|
void continue_primary_sync(osd_op_t *cur_op);
|
||||||
void continue_primary_del(osd_op_t *cur_op);
|
void continue_primary_del(osd_op_t *cur_op);
|
||||||
bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
|
bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
|
||||||
void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
|
void remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t &pg, bool report = true);
|
||||||
void free_object_state(pg_t & pg, pg_osd_set_state_t **object_state);
|
pg_osd_set_state_t *mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, osd_rmw_stripe_t *stripes, bool ref);
|
||||||
|
void deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref);
|
||||||
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
|
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
|
||||||
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
|
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
|
||||||
void handle_primary_bs_subop(osd_op_t *subop);
|
void handle_primary_bs_subop(osd_op_t *subop);
|
||||||
@@ -251,10 +278,11 @@ class osd_t
|
|||||||
int submit_primary_sync_subops(osd_op_t *cur_op);
|
int submit_primary_sync_subops(osd_op_t *cur_op);
|
||||||
void submit_primary_stab_subops(osd_op_t *cur_op);
|
void submit_primary_stab_subops(osd_op_t *cur_op);
|
||||||
|
|
||||||
uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state);
|
uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state);
|
||||||
|
|
||||||
void continue_chained_read(osd_op_t *cur_op);
|
void continue_chained_read(osd_op_t *cur_op);
|
||||||
int submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op);
|
int submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op);
|
||||||
|
void check_corrupted_chained(pg_t & pg, osd_op_t *cur_op);
|
||||||
void send_chained_read_results(pg_t & pg, osd_op_t *cur_op);
|
void send_chained_read_results(pg_t & pg, osd_op_t *cur_op);
|
||||||
std::vector<osd_chain_read_t> collect_chained_read_requests(osd_op_t *cur_op);
|
std::vector<osd_chain_read_t> collect_chained_read_requests(osd_op_t *cur_op);
|
||||||
int collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitmap_request_t> & bitmap_requests);
|
int collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitmap_request_t> & bitmap_requests);
|
||||||
|
@@ -132,7 +132,7 @@ bool osd_t::check_peer_config(osd_client_t *cl, json11::Json conf)
|
|||||||
this->osd_num, immediate_commit == IMMEDIATE_ALL ? "all" : "small",
|
this->osd_num, immediate_commit == IMMEDIATE_ALL ? "all" : "small",
|
||||||
cl->osd_num, conf["immediate_commit"].string_value().c_str()
|
cl->osd_num, conf["immediate_commit"].string_value().c_str()
|
||||||
);
|
);
|
||||||
return true;
|
return false;
|
||||||
}
|
}
|
||||||
else if (conf["block_size"].uint64_value() != (uint64_t)this->bs_block_size)
|
else if (conf["block_size"].uint64_value() != (uint64_t)this->bs_block_size)
|
||||||
{
|
{
|
||||||
@@ -140,7 +140,7 @@ bool osd_t::check_peer_config(osd_client_t *cl, json11::Json conf)
|
|||||||
"[OSD %lu] My block_size is %u, but peer OSD %lu has %lu. We can't work together\n",
|
"[OSD %lu] My block_size is %u, but peer OSD %lu has %lu. We can't work together\n",
|
||||||
this->osd_num, this->bs_block_size, cl->osd_num, conf["block_size"].uint64_value()
|
this->osd_num, this->bs_block_size, cl->osd_num, conf["block_size"].uint64_value()
|
||||||
);
|
);
|
||||||
return true;
|
return false;
|
||||||
}
|
}
|
||||||
else if (conf["bitmap_granularity"].uint64_value() != (uint64_t)this->bs_bitmap_granularity)
|
else if (conf["bitmap_granularity"].uint64_value() != (uint64_t)this->bs_bitmap_granularity)
|
||||||
{
|
{
|
||||||
@@ -148,7 +148,7 @@ bool osd_t::check_peer_config(osd_client_t *cl, json11::Json conf)
|
|||||||
"[OSD %lu] My bitmap_granularity is %u, but peer OSD %lu has %lu. We can't work together\n",
|
"[OSD %lu] My bitmap_granularity is %u, but peer OSD %lu has %lu. We can't work together\n",
|
||||||
this->osd_num, this->bs_bitmap_granularity, cl->osd_num, conf["bitmap_granularity"].uint64_value()
|
this->osd_num, this->bs_bitmap_granularity, cl->osd_num, conf["bitmap_granularity"].uint64_value()
|
||||||
);
|
);
|
||||||
return true;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
@@ -336,6 +336,8 @@ void osd_t::report_statistics()
|
|||||||
pg_stats["misplaced_count"] = pg.misplaced_objects.size();
|
pg_stats["misplaced_count"] = pg.misplaced_objects.size();
|
||||||
pg_stats["degraded_count"] = pg.degraded_objects.size();
|
pg_stats["degraded_count"] = pg.degraded_objects.size();
|
||||||
pg_stats["incomplete_count"] = pg.incomplete_objects.size();
|
pg_stats["incomplete_count"] = pg.incomplete_objects.size();
|
||||||
|
if (pg.corrupted_count)
|
||||||
|
pg_stats["corrupted_count"] = pg.corrupted_count;
|
||||||
pg_stats["write_osd_set"] = pg.cur_set;
|
pg_stats["write_osd_set"] = pg.cur_set;
|
||||||
txn.push_back(json11::Json::object {
|
txn.push_back(json11::Json::object {
|
||||||
{ "request_put", json11::Json::object {
|
{ "request_put", json11::Json::object {
|
||||||
@@ -382,30 +384,6 @@ void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num)
|
|
||||||
{
|
|
||||||
auto pg_it = pgs.find({
|
|
||||||
.pool_id = pool_id,
|
|
||||||
.pg_num = pg_num,
|
|
||||||
});
|
|
||||||
if (pg_it != pgs.end() && pg_it->second.epoch > pg_it->second.reported_epoch &&
|
|
||||||
st_cli.pool_config[pool_id].pg_config[pg_num].epoch >= pg_it->second.epoch)
|
|
||||||
{
|
|
||||||
pg_it->second.reported_epoch = st_cli.pool_config[pool_id].pg_config[pg_num].epoch;
|
|
||||||
object_id oid = { 0 };
|
|
||||||
bool first = true;
|
|
||||||
for (auto op: pg_it->second.write_queue)
|
|
||||||
{
|
|
||||||
if (first || oid != op.first)
|
|
||||||
{
|
|
||||||
oid = op.first;
|
|
||||||
first = false;
|
|
||||||
continue_primary_write(op.second);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void osd_t::on_load_config_hook(json11::Json::object & global_config)
|
void osd_t::on_load_config_hook(json11::Json::object & global_config)
|
||||||
{
|
{
|
||||||
json11::Json::object osd_config = this->config;
|
json11::Json::object osd_config = this->config;
|
||||||
@@ -704,13 +682,22 @@ void osd_t::apply_pg_config()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
auto vec_all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end());
|
||||||
if (currently_taken)
|
if (currently_taken)
|
||||||
{
|
{
|
||||||
if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING | PG_PEERED))
|
if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING))
|
||||||
{
|
{
|
||||||
if (pg_it->second.target_set == pg_cfg.target_set)
|
if (pg_it->second.target_set == pg_cfg.target_set &&
|
||||||
|
pg_it->second.target_history == pg_cfg.target_history &&
|
||||||
|
pg_it->second.all_peers == vec_all_peers)
|
||||||
{
|
{
|
||||||
// No change in osd_set; history changes are ignored
|
// No change in osd_set and history
|
||||||
|
if (pg_it->second.scrub_ts != pg_cfg.scrub_ts)
|
||||||
|
{
|
||||||
|
pg_it->second.scrub_ts = pg_cfg.scrub_ts;
|
||||||
|
peering_state = peering_state | OSD_SCRUBBING;
|
||||||
|
ringloop->wakeup();
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -761,7 +748,8 @@ void osd_t::apply_pg_config()
|
|||||||
.pg_num = pg_num,
|
.pg_num = pg_num,
|
||||||
.reported_epoch = pg_cfg.epoch,
|
.reported_epoch = pg_cfg.epoch,
|
||||||
.target_history = pg_cfg.target_history,
|
.target_history = pg_cfg.target_history,
|
||||||
.all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end()),
|
.all_peers = vec_all_peers,
|
||||||
|
.scrub_ts = pg_cfg.scrub_ts,
|
||||||
.target_set = pg_cfg.target_set,
|
.target_set = pg_cfg.target_set,
|
||||||
};
|
};
|
||||||
if (pg.scheme == POOL_SCHEME_EC)
|
if (pg.scheme == POOL_SCHEME_EC)
|
||||||
@@ -892,6 +880,8 @@ void osd_t::report_pg_states()
|
|||||||
{ "all_peers", pg.all_peers },
|
{ "all_peers", pg.all_peers },
|
||||||
{ "osd_sets", pg.target_history },
|
{ "osd_sets", pg.target_history },
|
||||||
};
|
};
|
||||||
|
if (pg.scrub_ts)
|
||||||
|
history_value["scrub_ts"] = pg.scrub_ts;
|
||||||
checks.push_back(json11::Json::object {
|
checks.push_back(json11::Json::object {
|
||||||
{ "target", "MOD" },
|
{ "target", "MOD" },
|
||||||
{ "key", history_key },
|
{ "key", history_key },
|
||||||
@@ -984,13 +974,6 @@ void osd_t::report_pg_states()
|
|||||||
}
|
}
|
||||||
this->pgs.erase(pg_it);
|
this->pgs.erase(pg_it);
|
||||||
}
|
}
|
||||||
else if (pg_it->second.state & PG_PEERED)
|
|
||||||
{
|
|
||||||
// Activate PG after PG PEERED state is reported along with history
|
|
||||||
// (if the state wasn't changed again)
|
|
||||||
pg_it->second.state = pg_it->second.state & ~PG_PEERED | PG_ACTIVE;
|
|
||||||
report_pg_state(pg_it->second);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Push other PG state updates, if any
|
// Push other PG state updates, if any
|
||||||
|
@@ -182,7 +182,9 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
|||||||
op->bs_op = NULL;
|
op->bs_op = NULL;
|
||||||
delete op;
|
delete op;
|
||||||
},
|
},
|
||||||
.len = (uint32_t)count,
|
{
|
||||||
|
.len = (uint32_t)count,
|
||||||
|
},
|
||||||
.buf = op->buf,
|
.buf = op->buf,
|
||||||
});
|
});
|
||||||
bs->enqueue_op(op->bs_op);
|
bs->enqueue_op(op->bs_op);
|
||||||
@@ -226,42 +228,51 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
|||||||
|
|
||||||
bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
|
bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
|
||||||
{
|
{
|
||||||
if (!no_recovery)
|
if (!pgs.size())
|
||||||
{
|
{
|
||||||
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
|
return false;
|
||||||
{
|
|
||||||
if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_DEGRADED)) == (PG_ACTIVE | PG_HAS_DEGRADED))
|
|
||||||
{
|
|
||||||
for (auto obj_it = pg_it->second.degraded_objects.begin(); obj_it != pg_it->second.degraded_objects.end(); obj_it++)
|
|
||||||
{
|
|
||||||
if (recovery_ops.find(obj_it->first) == recovery_ops.end())
|
|
||||||
{
|
|
||||||
op.degraded = true;
|
|
||||||
op.oid = obj_it->first;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (!no_rebalance)
|
// Restart scanning from the same degraded/misplaced status as the last time
|
||||||
|
for (int tried_degraded = 0; tried_degraded < 2; tried_degraded++)
|
||||||
{
|
{
|
||||||
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
|
if (recovery_last_degraded ? !no_recovery : !no_rebalance)
|
||||||
{
|
{
|
||||||
// Don't try to "recover" misplaced objects if "recovery" would make them degraded
|
// Don't try to "recover" misplaced objects if "recovery" would make them degraded
|
||||||
if ((pg_it->second.state & (PG_ACTIVE | PG_DEGRADED | PG_HAS_MISPLACED)) == (PG_ACTIVE | PG_HAS_MISPLACED))
|
auto mask = recovery_last_degraded ? (PG_ACTIVE | PG_HAS_DEGRADED) : (PG_ACTIVE | PG_DEGRADED | PG_HAS_MISPLACED);
|
||||||
|
auto check = recovery_last_degraded ? (PG_ACTIVE | PG_HAS_DEGRADED) : (PG_ACTIVE | PG_HAS_MISPLACED);
|
||||||
|
// Restart scanning from the same PG as the last time
|
||||||
|
for (auto pg_it = pgs.lower_bound(recovery_last_pg); pg_it != pgs.end(); pg_it++)
|
||||||
{
|
{
|
||||||
for (auto obj_it = pg_it->second.misplaced_objects.begin(); obj_it != pg_it->second.misplaced_objects.end(); obj_it++)
|
if ((pg_it->second.state & mask) == check)
|
||||||
{
|
{
|
||||||
if (recovery_ops.find(obj_it->first) == recovery_ops.end())
|
auto & src = recovery_last_degraded ? pg_it->second.degraded_objects : pg_it->second.misplaced_objects;
|
||||||
|
assert(src.size() > 0);
|
||||||
|
// Restart scanning from the next object
|
||||||
|
for (auto obj_it = src.upper_bound(recovery_last_oid); obj_it != src.end(); obj_it++)
|
||||||
{
|
{
|
||||||
op.degraded = false;
|
if (recovery_ops.find(obj_it->first) == recovery_ops.end())
|
||||||
op.oid = obj_it->first;
|
{
|
||||||
return true;
|
op.degraded = recovery_last_degraded;
|
||||||
|
recovery_last_oid = op.oid = obj_it->first;
|
||||||
|
recovery_pg_done++;
|
||||||
|
// Switch to another PG after recovery_pg_switch operations
|
||||||
|
// to always mix all PGs during recovery but still benefit
|
||||||
|
// from recovery queue depth greater than 1
|
||||||
|
if (recovery_pg_done >= recovery_pg_switch)
|
||||||
|
{
|
||||||
|
recovery_pg_done = 0;
|
||||||
|
recovery_last_pg.pg_num++;
|
||||||
|
recovery_last_oid = {};
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
recovery_last_degraded = !recovery_last_degraded;
|
||||||
|
recovery_last_pg = {};
|
||||||
|
recovery_last_oid = {};
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@@ -291,19 +302,17 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
|
|||||||
if (osd_op->reply.hdr.retval < 0)
|
if (osd_op->reply.hdr.retval < 0)
|
||||||
{
|
{
|
||||||
// Error recovering object
|
// Error recovering object
|
||||||
if (osd_op->reply.hdr.retval == -EPIPE)
|
// EPIPE is totally harmless (peer is gone), others like EIO/EDOM may be not
|
||||||
{
|
printf(
|
||||||
// PG is stopped or one of the OSDs is gone, error is harmless
|
"Recovery operation failed with object %lx:%lx (PG %u/%u): error %ld\n",
|
||||||
printf(
|
op->oid.inode, op->oid.stripe, INODE_POOL(op->oid.inode),
|
||||||
"Recovery operation failed with object %lx:%lx (PG %u/%u)\n",
|
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size),
|
||||||
op->oid.inode, op->oid.stripe, INODE_POOL(op->oid.inode),
|
osd_op->reply.hdr.retval
|
||||||
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size)
|
);
|
||||||
);
|
}
|
||||||
}
|
else if (log_level > 2)
|
||||||
else
|
{
|
||||||
{
|
printf("Recovery operation done for %lx:%lx\n", op->oid.inode, op->oid.stripe);
|
||||||
throw std::runtime_error("Failed to recover an object");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
// CAREFUL! op = &recovery_ops[op->oid]. Don't access op->* after recovery_ops.erase()
|
// CAREFUL! op = &recovery_ops[op->oid]. Don't access op->* after recovery_ops.erase()
|
||||||
op->osd_op = NULL;
|
op->osd_op = NULL;
|
||||||
|
10
src/osd_id.h
10
src/osd_id.h
@@ -28,3 +28,13 @@ inline bool operator < (const pool_pg_num_t & a, const pool_pg_num_t & b)
|
|||||||
{
|
{
|
||||||
return a.pool_id < b.pool_id || a.pool_id == b.pool_id && a.pg_num < b.pg_num;
|
return a.pool_id < b.pool_id || a.pool_id == b.pool_id && a.pg_num < b.pg_num;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline bool operator == (const pool_pg_num_t & a, const pool_pg_num_t & b)
|
||||||
|
{
|
||||||
|
return a.pool_id == b.pool_id && a.pg_num == b.pg_num;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline bool operator != (const pool_pg_num_t & a, const pool_pg_num_t & b)
|
||||||
|
{
|
||||||
|
return a.pool_id != b.pool_id || a.pg_num != b.pg_num;
|
||||||
|
}
|
||||||
|
@@ -29,7 +29,8 @@
|
|||||||
#define OSD_OP_DELETE 14
|
#define OSD_OP_DELETE 14
|
||||||
#define OSD_OP_PING 15
|
#define OSD_OP_PING 15
|
||||||
#define OSD_OP_SEC_READ_BMP 16
|
#define OSD_OP_SEC_READ_BMP 16
|
||||||
#define OSD_OP_MAX 16
|
#define OSD_OP_SCRUB 17
|
||||||
|
#define OSD_OP_MAX 17
|
||||||
#define OSD_RW_MAX 64*1024*1024
|
#define OSD_RW_MAX 64*1024*1024
|
||||||
#define OSD_PROTOCOL_VERSION 1
|
#define OSD_PROTOCOL_VERSION 1
|
||||||
|
|
||||||
@@ -173,6 +174,11 @@ struct __attribute__((__packed__)) osd_op_sec_list_t
|
|||||||
uint64_t pg_stripe_size;
|
uint64_t pg_stripe_size;
|
||||||
// inode range (used to select pools)
|
// inode range (used to select pools)
|
||||||
uint64_t min_inode, max_inode;
|
uint64_t min_inode, max_inode;
|
||||||
|
// min/max oid stripe, added after inodes for backwards compatibility
|
||||||
|
// also for backwards compatibility, max_stripe=UINT64_MAX means 0 and 0 means UINT64_MAX O_o
|
||||||
|
uint64_t min_stripe, max_stripe;
|
||||||
|
// max stable object count
|
||||||
|
uint32_t stable_limit;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct __attribute__((__packed__)) osd_reply_sec_list_t
|
struct __attribute__((__packed__)) osd_reply_sec_list_t
|
||||||
|
@@ -24,6 +24,7 @@ void osd_t::handle_peers()
|
|||||||
if (!p.second.peering_state->list_ops.size())
|
if (!p.second.peering_state->list_ops.size())
|
||||||
{
|
{
|
||||||
p.second.calc_object_states(log_level);
|
p.second.calc_object_states(log_level);
|
||||||
|
schedule_scrub(p.second);
|
||||||
report_pg_state(p.second);
|
report_pg_state(p.second);
|
||||||
incomplete_objects += p.second.incomplete_objects.size();
|
incomplete_objects += p.second.incomplete_objects.size();
|
||||||
misplaced_objects += p.second.misplaced_objects.size();
|
misplaced_objects += p.second.misplaced_objects.size();
|
||||||
@@ -32,7 +33,16 @@ void osd_t::handle_peers()
|
|||||||
if (p.second.state & PG_HAS_UNCLEAN)
|
if (p.second.state & PG_HAS_UNCLEAN)
|
||||||
peering_state = peering_state | OSD_FLUSHING_PGS;
|
peering_state = peering_state | OSD_FLUSHING_PGS;
|
||||||
else if (p.second.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED))
|
else if (p.second.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED))
|
||||||
|
{
|
||||||
peering_state = peering_state | OSD_RECOVERING;
|
peering_state = peering_state | OSD_RECOVERING;
|
||||||
|
if (p.second.state & PG_HAS_DEGRADED)
|
||||||
|
{
|
||||||
|
// Restart recovery from degraded objects
|
||||||
|
recovery_last_degraded = true;
|
||||||
|
recovery_last_pg = {};
|
||||||
|
recovery_last_oid = {};
|
||||||
|
}
|
||||||
|
}
|
||||||
ringloop->wakeup();
|
ringloop->wakeup();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@@ -41,10 +51,6 @@ void osd_t::handle_peers()
|
|||||||
still = true;
|
still = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (p.second.state & PG_PEERED)
|
|
||||||
{
|
|
||||||
still = true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (!still)
|
if (!still)
|
||||||
{
|
{
|
||||||
@@ -65,10 +71,6 @@ void osd_t::handle_peers()
|
|||||||
}
|
}
|
||||||
still = true;
|
still = true;
|
||||||
}
|
}
|
||||||
else if (p.second.state & PG_PEERED)
|
|
||||||
{
|
|
||||||
still = true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (!still)
|
if (!still)
|
||||||
{
|
{
|
||||||
@@ -82,6 +84,13 @@ void osd_t::handle_peers()
|
|||||||
peering_state = peering_state & ~OSD_RECOVERING;
|
peering_state = peering_state & ~OSD_RECOVERING;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (peering_state & OSD_SCRUBBING)
|
||||||
|
{
|
||||||
|
if (!continue_scrub())
|
||||||
|
{
|
||||||
|
peering_state = peering_state & ~OSD_SCRUBBING;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::repeer_pgs(osd_num_t peer_osd)
|
void osd_t::repeer_pgs(osd_num_t peer_osd)
|
||||||
@@ -91,7 +100,7 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
|
|||||||
{
|
{
|
||||||
auto & pg = p.second;
|
auto & pg = p.second;
|
||||||
bool repeer = false;
|
bool repeer = false;
|
||||||
if (pg.state & (PG_PEERING | PG_PEERED | PG_ACTIVE | PG_INCOMPLETE))
|
if (pg.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
|
||||||
{
|
{
|
||||||
for (osd_num_t pg_osd: pg.all_peers)
|
for (osd_num_t pg_osd: pg.all_peers)
|
||||||
{
|
{
|
||||||
@@ -127,9 +136,11 @@ void osd_t::reset_pg(pg_t & pg)
|
|||||||
pg.state_dict.clear();
|
pg.state_dict.clear();
|
||||||
copies_to_delete_after_sync_count -= pg.copies_to_delete_after_sync.size();
|
copies_to_delete_after_sync_count -= pg.copies_to_delete_after_sync.size();
|
||||||
pg.copies_to_delete_after_sync.clear();
|
pg.copies_to_delete_after_sync.clear();
|
||||||
|
corrupted_objects -= pg.corrupted_count;
|
||||||
incomplete_objects -= pg.incomplete_objects.size();
|
incomplete_objects -= pg.incomplete_objects.size();
|
||||||
misplaced_objects -= pg.misplaced_objects.size();
|
misplaced_objects -= pg.misplaced_objects.size();
|
||||||
degraded_objects -= pg.degraded_objects.size();
|
degraded_objects -= pg.degraded_objects.size();
|
||||||
|
pg.corrupted_count = 0;
|
||||||
pg.incomplete_objects.clear();
|
pg.incomplete_objects.clear();
|
||||||
pg.misplaced_objects.clear();
|
pg.misplaced_objects.clear();
|
||||||
pg.degraded_objects.clear();
|
pg.degraded_objects.clear();
|
||||||
@@ -205,7 +216,7 @@ void osd_t::start_pg_peering(pg_t & pg)
|
|||||||
pg.cur_loc_set.push_back({
|
pg.cur_loc_set.push_back({
|
||||||
.role = (uint64_t)role,
|
.role = (uint64_t)role,
|
||||||
.osd_num = pg.cur_set[role],
|
.osd_num = pg.cur_set[role],
|
||||||
.outdated = false,
|
.loc_bad = 0,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -302,82 +313,11 @@ void osd_t::start_pg_peering(pg_t & pg)
|
|||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
submit_sync_and_list_subop(peer_osd, pg.peering_state);
|
submit_list_subop(peer_osd, pg.peering_state);
|
||||||
}
|
}
|
||||||
ringloop->wakeup();
|
ringloop->wakeup();
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
|
||||||
{
|
|
||||||
// Sync before listing, if not readonly
|
|
||||||
if (readonly)
|
|
||||||
{
|
|
||||||
submit_list_subop(role_osd, ps);
|
|
||||||
}
|
|
||||||
else if (role_osd == this->osd_num)
|
|
||||||
{
|
|
||||||
// Self
|
|
||||||
osd_op_t *op = new osd_op_t();
|
|
||||||
op->op_type = 0;
|
|
||||||
op->peer_fd = SELF_FD;
|
|
||||||
clock_gettime(CLOCK_REALTIME, &op->tv_begin);
|
|
||||||
op->bs_op = new blockstore_op_t();
|
|
||||||
op->bs_op->opcode = BS_OP_SYNC;
|
|
||||||
op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op)
|
|
||||||
{
|
|
||||||
if (bs_op->retval < 0)
|
|
||||||
{
|
|
||||||
printf("Local OP_SYNC failed: %d (%s)\n", bs_op->retval, strerror(-bs_op->retval));
|
|
||||||
force_stop(1);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
add_bs_subop_stats(op);
|
|
||||||
delete op->bs_op;
|
|
||||||
op->bs_op = NULL;
|
|
||||||
delete op;
|
|
||||||
ps->list_ops.erase(role_osd);
|
|
||||||
submit_list_subop(role_osd, ps);
|
|
||||||
};
|
|
||||||
ps->list_ops[role_osd] = op;
|
|
||||||
bs->enqueue_op(op->bs_op);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// Peer
|
|
||||||
auto & cl = msgr.clients.at(msgr.osd_peer_fds.at(role_osd));
|
|
||||||
osd_op_t *op = new osd_op_t();
|
|
||||||
op->op_type = OSD_OP_OUT;
|
|
||||||
op->peer_fd = cl->peer_fd;
|
|
||||||
op->req = (osd_any_op_t){
|
|
||||||
.sec_sync = {
|
|
||||||
.header = {
|
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
|
||||||
.id = msgr.next_subop_id++,
|
|
||||||
.opcode = OSD_OP_SEC_SYNC,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
};
|
|
||||||
op->callback = [this, ps, role_osd](osd_op_t *op)
|
|
||||||
{
|
|
||||||
if (op->reply.hdr.retval < 0)
|
|
||||||
{
|
|
||||||
// FIXME: Mark peer as failed and don't reconnect immediately after dropping the connection
|
|
||||||
printf("Failed to sync OSD %lu: %ld (%s), disconnecting peer\n", role_osd, op->reply.hdr.retval, strerror(-op->reply.hdr.retval));
|
|
||||||
int fail_fd = op->peer_fd;
|
|
||||||
ps->list_ops.erase(role_osd);
|
|
||||||
delete op;
|
|
||||||
msgr.stop_client(fail_fd);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
delete op;
|
|
||||||
ps->list_ops.erase(role_osd);
|
|
||||||
submit_list_subop(role_osd, ps);
|
|
||||||
};
|
|
||||||
ps->list_ops[role_osd] = op;
|
|
||||||
msgr.outbox_push(op);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
||||||
{
|
{
|
||||||
if (role_osd == this->osd_num)
|
if (role_osd == this->osd_num)
|
||||||
@@ -389,11 +329,12 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
|||||||
clock_gettime(CLOCK_REALTIME, &op->tv_begin);
|
clock_gettime(CLOCK_REALTIME, &op->tv_begin);
|
||||||
op->bs_op = new blockstore_op_t();
|
op->bs_op = new blockstore_op_t();
|
||||||
op->bs_op->opcode = BS_OP_LIST;
|
op->bs_op->opcode = BS_OP_LIST;
|
||||||
op->bs_op->oid.stripe = st_cli.pool_config[ps->pool_id].pg_stripe_size;
|
op->bs_op->pg_alignment = st_cli.pool_config[ps->pool_id].pg_stripe_size;
|
||||||
op->bs_op->oid.inode = ((uint64_t)ps->pool_id << (64 - POOL_ID_BITS));
|
op->bs_op->min_oid.inode = ((uint64_t)ps->pool_id << (64 - POOL_ID_BITS));
|
||||||
op->bs_op->version = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1;
|
op->bs_op->max_oid.inode = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1;
|
||||||
op->bs_op->len = pg_counts[ps->pool_id];
|
op->bs_op->max_oid.stripe = UINT64_MAX;
|
||||||
op->bs_op->offset = ps->pg_num-1;
|
op->bs_op->pg_count = pg_counts[ps->pool_id];
|
||||||
|
op->bs_op->pg_number = ps->pg_num-1;
|
||||||
op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op)
|
op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op)
|
||||||
{
|
{
|
||||||
if (op->bs_op->retval < 0)
|
if (op->bs_op->retval < 0)
|
||||||
@@ -551,13 +492,17 @@ void osd_t::report_pg_state(pg_t & pg)
|
|||||||
pg.history_changed = true;
|
pg.history_changed = true;
|
||||||
pg.target_history.clear();
|
pg.target_history.clear();
|
||||||
pg.all_peers = pg.target_set;
|
pg.all_peers = pg.target_set;
|
||||||
|
std::sort(pg.all_peers.begin(), pg.all_peers.end());
|
||||||
pg.cur_peers = pg.target_set;
|
pg.cur_peers = pg.target_set;
|
||||||
}
|
}
|
||||||
else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD))
|
else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD))
|
||||||
{
|
{
|
||||||
// Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers
|
// Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers
|
||||||
pg.history_changed = true;
|
if (pg.target_history.size())
|
||||||
pg.target_history.clear();
|
{
|
||||||
|
pg.history_changed = true;
|
||||||
|
pg.target_history.clear();
|
||||||
|
}
|
||||||
std::set<osd_num_t> dead_peers;
|
std::set<osd_num_t> dead_peers;
|
||||||
for (auto pg_osd: pg.all_peers)
|
for (auto pg_osd: pg.all_peers)
|
||||||
{
|
{
|
||||||
@@ -574,8 +519,12 @@ void osd_t::report_pg_state(pg_t & pg)
|
|||||||
dead_peers.insert(pg_osd);
|
dead_peers.insert(pg_osd);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pg.all_peers.clear();
|
auto new_all_peers = std::vector<osd_num_t>(dead_peers.begin(), dead_peers.end());
|
||||||
pg.all_peers.insert(pg.all_peers.begin(), dead_peers.begin(), dead_peers.end());
|
if (pg.all_peers != new_all_peers)
|
||||||
|
{
|
||||||
|
pg.history_changed = true;
|
||||||
|
pg.all_peers = new_all_peers;
|
||||||
|
}
|
||||||
pg.cur_peers.clear();
|
pg.cur_peers.clear();
|
||||||
for (auto pg_osd: pg.target_set)
|
for (auto pg_osd: pg.target_set)
|
||||||
{
|
{
|
||||||
|
@@ -86,24 +86,11 @@ void pg_obj_state_check_t::walk()
|
|||||||
}
|
}
|
||||||
if (pg->pg_cursize < pg->pg_size)
|
if (pg->pg_cursize < pg->pg_size)
|
||||||
{
|
{
|
||||||
// Report PG history and activate
|
// Activate as degraded
|
||||||
pg->state |= PG_DEGRADED | PG_PEERED;
|
// Current OSD set will be added into target_history on first write
|
||||||
std::vector<osd_num_t> history_set;
|
pg->state |= PG_DEGRADED;
|
||||||
for (auto peer_osd: pg->cur_set)
|
|
||||||
{
|
|
||||||
if (peer_osd != 0)
|
|
||||||
{
|
|
||||||
history_set.push_back(peer_osd);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
pg->target_history.push_back(history_set);
|
|
||||||
pg->history_changed = true;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// Just activate
|
|
||||||
pg->state |= PG_ACTIVE;
|
|
||||||
}
|
}
|
||||||
|
pg->state |= PG_ACTIVE;
|
||||||
if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size())
|
if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size())
|
||||||
{
|
{
|
||||||
pg->state |= PG_LEFT_ON_DEAD;
|
pg->state |= PG_LEFT_ON_DEAD;
|
||||||
@@ -293,7 +280,7 @@ void pg_obj_state_check_t::finish_object()
|
|||||||
osd_set.push_back((pg_obj_loc_t){
|
osd_set.push_back((pg_obj_loc_t){
|
||||||
.role = (list[i].oid.stripe & STRIPE_MASK),
|
.role = (list[i].oid.stripe & STRIPE_MASK),
|
||||||
.osd_num = list[i].osd_num,
|
.osd_num = list[i].osd_num,
|
||||||
.outdated = false,
|
.loc_bad = 0,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -315,7 +302,7 @@ void pg_obj_state_check_t::finish_object()
|
|||||||
osd_set.push_back((pg_obj_loc_t){
|
osd_set.push_back((pg_obj_loc_t){
|
||||||
.role = (list[i].oid.stripe & STRIPE_MASK),
|
.role = (list[i].oid.stripe & STRIPE_MASK),
|
||||||
.osd_num = list[i].osd_num,
|
.osd_num = list[i].osd_num,
|
||||||
.outdated = true,
|
.loc_bad = LOC_OUTDATED,
|
||||||
});
|
});
|
||||||
if (!(state & (OBJ_INCOMPLETE | OBJ_DEGRADED)))
|
if (!(state & (OBJ_INCOMPLETE | OBJ_DEGRADED)))
|
||||||
{
|
{
|
||||||
@@ -335,67 +322,73 @@ void pg_obj_state_check_t::finish_object()
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
auto it = pg->state_dict.find(osd_set);
|
pg->add_object_to_state(oid, state, osd_set);
|
||||||
if (it == pg->state_dict.end())
|
|
||||||
{
|
|
||||||
std::vector<uint64_t> read_target;
|
|
||||||
if (replicated)
|
|
||||||
{
|
|
||||||
for (auto & o: osd_set)
|
|
||||||
{
|
|
||||||
if (!o.outdated)
|
|
||||||
{
|
|
||||||
read_target.push_back(o.osd_num);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
while (read_target.size() < pg->pg_size)
|
|
||||||
{
|
|
||||||
// FIXME: This is because we then use .data() and assume it's at least <pg_size> long
|
|
||||||
read_target.push_back(0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
read_target.resize(pg->pg_size);
|
|
||||||
for (int i = 0; i < pg->pg_size; i++)
|
|
||||||
{
|
|
||||||
read_target[i] = 0;
|
|
||||||
}
|
|
||||||
for (auto & o: osd_set)
|
|
||||||
{
|
|
||||||
if (!o.outdated)
|
|
||||||
{
|
|
||||||
read_target[o.role] = o.osd_num;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
pg->state_dict[osd_set] = {
|
|
||||||
.read_target = read_target,
|
|
||||||
.osd_set = osd_set,
|
|
||||||
.state = state,
|
|
||||||
.object_count = 1,
|
|
||||||
};
|
|
||||||
it = pg->state_dict.find(osd_set);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
it->second.object_count++;
|
|
||||||
}
|
|
||||||
if (state & OBJ_INCOMPLETE)
|
|
||||||
{
|
|
||||||
pg->incomplete_objects[oid] = &it->second;
|
|
||||||
}
|
|
||||||
else if (state & OBJ_DEGRADED)
|
|
||||||
{
|
|
||||||
pg->degraded_objects[oid] = &it->second;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
pg->misplaced_objects[oid] = &it->second;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pg_osd_set_state_t* pg_t::add_object_to_state(const object_id oid, const uint64_t state, const pg_osd_set_t & osd_set)
|
||||||
|
{
|
||||||
|
auto it = state_dict.find(osd_set);
|
||||||
|
if (it == state_dict.end())
|
||||||
|
{
|
||||||
|
std::vector<osd_num_t> read_target;
|
||||||
|
if (scheme == POOL_SCHEME_REPLICATED)
|
||||||
|
{
|
||||||
|
for (auto & o: osd_set)
|
||||||
|
{
|
||||||
|
if (!o.loc_bad)
|
||||||
|
{
|
||||||
|
read_target.push_back(o.osd_num);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
while (read_target.size() < pg_size)
|
||||||
|
{
|
||||||
|
// FIXME: This is because we then use .data() and assume it's at least <pg_size> long
|
||||||
|
read_target.push_back(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
read_target.resize(pg_size);
|
||||||
|
for (int i = 0; i < pg_size; i++)
|
||||||
|
{
|
||||||
|
read_target[i] = 0;
|
||||||
|
}
|
||||||
|
for (auto & o: osd_set)
|
||||||
|
{
|
||||||
|
if (!o.loc_bad)
|
||||||
|
{
|
||||||
|
read_target[o.role] = o.osd_num;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
state_dict[osd_set] = {
|
||||||
|
.read_target = read_target,
|
||||||
|
.osd_set = osd_set,
|
||||||
|
.state = state,
|
||||||
|
.object_count = 1,
|
||||||
|
};
|
||||||
|
it = state_dict.find(osd_set);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
it->second.object_count++;
|
||||||
|
}
|
||||||
|
if (state & OBJ_INCOMPLETE)
|
||||||
|
{
|
||||||
|
incomplete_objects[oid] = &it->second;
|
||||||
|
}
|
||||||
|
else if (state & OBJ_DEGRADED)
|
||||||
|
{
|
||||||
|
degraded_objects[oid] = &it->second;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
misplaced_objects[oid] = &it->second;
|
||||||
|
}
|
||||||
|
return &it->second;
|
||||||
|
}
|
||||||
|
|
||||||
// FIXME: Write at least some tests for this function
|
// FIXME: Write at least some tests for this function
|
||||||
void pg_t::calc_object_states(int log_level)
|
void pg_t::calc_object_states(int log_level)
|
||||||
{
|
{
|
||||||
@@ -435,32 +428,58 @@ void pg_t::calc_object_states(int log_level)
|
|||||||
std::sort(st.list.begin(), st.list.end());
|
std::sort(st.list.begin(), st.list.end());
|
||||||
// Walk over it and check object states
|
// Walk over it and check object states
|
||||||
st.walk();
|
st.walk();
|
||||||
if (this->state & (PG_DEGRADED|PG_LEFT_ON_DEAD))
|
if (this->state != PG_ACTIVE)
|
||||||
{
|
{
|
||||||
assert(epoch != (((uint64_t)1 << PG_EPOCH_BITS)-1));
|
assert(epoch != (((uint64_t)1 << PG_EPOCH_BITS)-1));
|
||||||
epoch++;
|
epoch++;
|
||||||
}
|
}
|
||||||
|
if (log_level > 0)
|
||||||
|
{
|
||||||
|
std::string osd_set_desc;
|
||||||
|
for (auto & osd_num: target_set)
|
||||||
|
{
|
||||||
|
osd_set_desc += (osd_set_desc == "" ? "" : ", ")+std::to_string(osd_num);
|
||||||
|
}
|
||||||
|
printf(
|
||||||
|
"[PG %u/%u] %lu clean objects on target OSD set %s\n",
|
||||||
|
pool_id, pg_num, clean_count, osd_set_desc.c_str()
|
||||||
|
);
|
||||||
|
for (auto & stp: state_dict)
|
||||||
|
{
|
||||||
|
osd_set_desc = "";
|
||||||
|
for (auto & loc: stp.first)
|
||||||
|
{
|
||||||
|
osd_set_desc += (osd_set_desc == "" ? "" : ", ")+
|
||||||
|
std::to_string(loc.osd_num)+
|
||||||
|
(st.replicated ? "" : "("+std::to_string(loc.role)+")")+
|
||||||
|
(loc.loc_bad & LOC_OUTDATED ? "(old)" : "")+
|
||||||
|
(loc.loc_bad & LOC_CORRUPTED ? "(corrupted)" : "");
|
||||||
|
}
|
||||||
|
printf("[PG %u/%u] %lu objects on OSD set %s\n", pool_id, pg_num, stp.second.object_count, osd_set_desc.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void pg_t::print_state()
|
void pg_t::print_state()
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
|
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
|
||||||
(state & PG_STARTING) ? "starting" : "",
|
(state & PG_STARTING) ? "starting" : "",
|
||||||
(state & PG_OFFLINE) ? "offline" : "",
|
(state & PG_OFFLINE) ? "offline" : "",
|
||||||
(state & PG_PEERING) ? "peering" : "",
|
(state & PG_PEERING) ? "peering" : "",
|
||||||
(state & PG_PEERED) ? "peered" : "",
|
|
||||||
(state & PG_INCOMPLETE) ? "incomplete" : "",
|
(state & PG_INCOMPLETE) ? "incomplete" : "",
|
||||||
(state & PG_ACTIVE) ? "active" : "",
|
(state & PG_ACTIVE) ? "active" : "",
|
||||||
(state & PG_REPEERING) ? "repeering" : "",
|
(state & PG_REPEERING) ? "repeering" : "",
|
||||||
(state & PG_STOPPING) ? "stopping" : "",
|
(state & PG_STOPPING) ? "stopping" : "",
|
||||||
(state & PG_DEGRADED) ? " + degraded" : "",
|
(state & PG_DEGRADED) ? " + degraded" : "",
|
||||||
|
(state & PG_HAS_CORRUPTED) ? " + has_corrupted" : "",
|
||||||
(state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "",
|
(state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "",
|
||||||
(state & PG_HAS_DEGRADED) ? " + has_degraded" : "",
|
(state & PG_HAS_DEGRADED) ? " + has_degraded" : "",
|
||||||
(state & PG_HAS_MISPLACED) ? " + has_misplaced" : "",
|
(state & PG_HAS_MISPLACED) ? " + has_misplaced" : "",
|
||||||
(state & PG_HAS_UNCLEAN) ? " + has_unclean" : "",
|
(state & PG_HAS_UNCLEAN) ? " + has_unclean" : "",
|
||||||
(state & PG_HAS_INVALID) ? " + has_invalid" : "",
|
(state & PG_HAS_INVALID) ? " + has_invalid" : "",
|
||||||
(state & PG_LEFT_ON_DEAD) ? " + left_on_dead" : "",
|
(state & PG_LEFT_ON_DEAD) ? " + left_on_dead" : "",
|
||||||
|
(state & PG_SCRUBBING) ? " + scrubbing" : "",
|
||||||
total_count
|
total_count
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@@ -13,11 +13,14 @@
|
|||||||
|
|
||||||
#define PG_EPOCH_BITS 48
|
#define PG_EPOCH_BITS 48
|
||||||
|
|
||||||
|
#define LOC_OUTDATED 1
|
||||||
|
#define LOC_CORRUPTED 2
|
||||||
|
|
||||||
struct pg_obj_loc_t
|
struct pg_obj_loc_t
|
||||||
{
|
{
|
||||||
uint64_t role;
|
uint64_t role;
|
||||||
osd_num_t osd_num;
|
osd_num_t osd_num;
|
||||||
bool outdated;
|
uint32_t loc_bad; // LOC_OUTDATED / LOC_CORRUPTED
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef std::vector<pg_obj_loc_t> pg_osd_set_t;
|
typedef std::vector<pg_obj_loc_t> pg_osd_set_t;
|
||||||
@@ -30,6 +33,7 @@ struct pg_osd_set_state_t
|
|||||||
pg_osd_set_t osd_set;
|
pg_osd_set_t osd_set;
|
||||||
uint64_t state = 0;
|
uint64_t state = 0;
|
||||||
uint64_t object_count = 0;
|
uint64_t object_count = 0;
|
||||||
|
uint64_t ref_count = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct pg_list_result_t
|
struct pg_list_result_t
|
||||||
@@ -91,6 +95,8 @@ struct pg_t
|
|||||||
// target history and all potential peers
|
// target history and all potential peers
|
||||||
std::vector<std::vector<osd_num_t>> target_history;
|
std::vector<std::vector<osd_num_t>> target_history;
|
||||||
std::vector<osd_num_t> all_peers;
|
std::vector<osd_num_t> all_peers;
|
||||||
|
// last scrub time
|
||||||
|
uint64_t scrub_ts = 0;
|
||||||
bool history_changed = false;
|
bool history_changed = false;
|
||||||
// peer list from the last peering event
|
// peer list from the last peering event
|
||||||
std::vector<osd_num_t> cur_peers;
|
std::vector<osd_num_t> cur_peers;
|
||||||
@@ -106,6 +112,7 @@ struct pg_t
|
|||||||
// it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
|
// it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
|
||||||
// which is up to ~192 MB per 1 TB in the worst case scenario
|
// which is up to ~192 MB per 1 TB in the worst case scenario
|
||||||
std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict;
|
std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict;
|
||||||
|
uint64_t corrupted_count;
|
||||||
btree::btree_map<object_id, pg_osd_set_state_t*> incomplete_objects, misplaced_objects, degraded_objects;
|
btree::btree_map<object_id, pg_osd_set_state_t*> incomplete_objects, misplaced_objects, degraded_objects;
|
||||||
std::map<obj_piece_id_t, flush_action_t> flush_actions;
|
std::map<obj_piece_id_t, flush_action_t> flush_actions;
|
||||||
std::vector<obj_ver_osd_t> copies_to_delete_after_sync;
|
std::vector<obj_ver_osd_t> copies_to_delete_after_sync;
|
||||||
@@ -116,15 +123,16 @@ struct pg_t
|
|||||||
int inflight = 0; // including write_queue
|
int inflight = 0; // including write_queue
|
||||||
std::multimap<object_id, osd_op_t*> write_queue;
|
std::multimap<object_id, osd_op_t*> write_queue;
|
||||||
|
|
||||||
|
pg_osd_set_state_t* add_object_to_state(const object_id oid, const uint64_t state, const pg_osd_set_t & osd_set);
|
||||||
void calc_object_states(int log_level);
|
void calc_object_states(int log_level);
|
||||||
void print_state();
|
void print_state();
|
||||||
};
|
};
|
||||||
|
|
||||||
inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
|
inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
|
||||||
{
|
{
|
||||||
return a.outdated < b.outdated ||
|
return a.loc_bad < b.loc_bad ||
|
||||||
a.outdated == b.outdated && a.role < b.role ||
|
a.loc_bad == b.loc_bad && a.role < b.role ||
|
||||||
a.outdated == b.outdated && a.role == b.role && a.osd_num < b.osd_num;
|
a.loc_bad == b.loc_bad && a.role == b.role && a.osd_num < b.osd_num;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b)
|
inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b)
|
||||||
|
@@ -54,5 +54,6 @@ int main(int argc, char *argv[])
|
|||||||
{
|
{
|
||||||
printf("dev: state=%lx\n", it.second.state);
|
printf("dev: state=%lx\n", it.second.state);
|
||||||
}
|
}
|
||||||
|
delete pg.peering_state;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@@ -52,7 +52,9 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
|||||||
finish_op(cur_op, -EINVAL);
|
finish_op(cur_op, -EINVAL);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
int stripe_count = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_size);
|
// Scrub is similar to r/w, so it's also handled here
|
||||||
|
int stripe_count = (pool_cfg.scheme == POOL_SCHEME_REPLICATED
|
||||||
|
&& cur_op->req.hdr.opcode != OSD_OP_SCRUB ? 1 : pg_it->second.pg_size);
|
||||||
int chain_size = 0;
|
int chain_size = 0;
|
||||||
if (cur_op->req.hdr.opcode == OSD_OP_READ && cur_op->req.rw.meta_revision > 0)
|
if (cur_op->req.hdr.opcode == OSD_OP_READ && cur_op->req.rw.meta_revision > 0)
|
||||||
{
|
{
|
||||||
@@ -90,6 +92,8 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
|||||||
chain_size * (
|
chain_size * (
|
||||||
// - copy of the chain
|
// - copy of the chain
|
||||||
sizeof(inode_t) +
|
sizeof(inode_t) +
|
||||||
|
// - object states for every chain item
|
||||||
|
sizeof(void*) +
|
||||||
// - bitmap buffers for chained read
|
// - bitmap buffers for chained read
|
||||||
stripe_count * clean_entry_bitmap_size +
|
stripe_count * clean_entry_bitmap_size +
|
||||||
// - 'missing' flags for chained reads
|
// - 'missing' flags for chained reads
|
||||||
@@ -117,6 +121,8 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
|||||||
{
|
{
|
||||||
op_data->read_chain = (inode_t*)data_buf;
|
op_data->read_chain = (inode_t*)data_buf;
|
||||||
data_buf = (uint8_t*)data_buf + sizeof(inode_t) * chain_size;
|
data_buf = (uint8_t*)data_buf + sizeof(inode_t) * chain_size;
|
||||||
|
op_data->chain_states = (pg_osd_set_state_t**)data_buf;
|
||||||
|
data_buf = (uint8_t*)data_buf + sizeof(pg_osd_set_state_t*) * chain_size;
|
||||||
op_data->snapshot_bitmaps = data_buf;
|
op_data->snapshot_bitmaps = data_buf;
|
||||||
data_buf = (uint8_t*)data_buf + chain_size * stripe_count * clean_entry_bitmap_size;
|
data_buf = (uint8_t*)data_buf + chain_size * stripe_count * clean_entry_bitmap_size;
|
||||||
op_data->missing_flags = (uint8_t*)data_buf;
|
op_data->missing_flags = (uint8_t*)data_buf;
|
||||||
@@ -131,6 +137,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
|||||||
inode_it->second.parent_id != cur_op->req.rw.inode)
|
inode_it->second.parent_id != cur_op->req.rw.inode)
|
||||||
{
|
{
|
||||||
op_data->read_chain[chain_num++] = inode_it->second.parent_id;
|
op_data->read_chain[chain_num++] = inode_it->second.parent_id;
|
||||||
|
op_data->chain_states[chain_num++] = NULL;
|
||||||
inode_it = st_cli.inode_config.find(inode_it->second.parent_id);
|
inode_it = st_cli.inode_config.find(inode_it->second.parent_id);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -138,12 +145,12 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state)
|
uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state)
|
||||||
{
|
{
|
||||||
if (!(pg.state & (PG_HAS_INCOMPLETE | PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
|
if (!(pg.state & (PG_HAS_INCOMPLETE | PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
|
||||||
{
|
{
|
||||||
*object_state = NULL;
|
*object_state = NULL;
|
||||||
return def;
|
return pg.cur_set.data();
|
||||||
}
|
}
|
||||||
auto st_it = pg.incomplete_objects.find(oid);
|
auto st_it = pg.incomplete_objects.find(oid);
|
||||||
if (st_it != pg.incomplete_objects.end())
|
if (st_it != pg.incomplete_objects.end())
|
||||||
@@ -164,7 +171,7 @@ uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_
|
|||||||
return st_it->second->read_target.data();
|
return st_it->second->read_target.data();
|
||||||
}
|
}
|
||||||
*object_state = NULL;
|
*object_state = NULL;
|
||||||
return def;
|
return pg.cur_set.data();
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::continue_primary_read(osd_op_t *cur_op)
|
void osd_t::continue_primary_read(osd_op_t *cur_op)
|
||||||
@@ -183,6 +190,7 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
|
|||||||
goto resume_1;
|
goto resume_1;
|
||||||
else if (op_data->st == 2)
|
else if (op_data->st == 2)
|
||||||
goto resume_2;
|
goto resume_2;
|
||||||
|
resume_0:
|
||||||
cur_op->reply.rw.bitmap_len = 0;
|
cur_op->reply.rw.bitmap_len = 0;
|
||||||
{
|
{
|
||||||
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
|
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
|
||||||
@@ -194,15 +202,17 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
|
|||||||
// Determine version
|
// Determine version
|
||||||
auto vo_it = pg.ver_override.find(op_data->oid);
|
auto vo_it = pg.ver_override.find(op_data->oid);
|
||||||
op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
|
op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
|
||||||
op_data->prev_set = pg.cur_set.data();
|
// PG may have degraded or misplaced objects
|
||||||
if (pg.state != PG_ACTIVE)
|
op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
|
||||||
{
|
|
||||||
// PG may be degraded or have misplaced objects
|
|
||||||
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
|
|
||||||
}
|
|
||||||
if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
|
if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||||
{
|
{
|
||||||
// Fast happy-path
|
// Fast happy-path
|
||||||
|
if (op_data->scheme == POOL_SCHEME_REPLICATED &&
|
||||||
|
op_data->object_state && (op_data->object_state->state & OBJ_INCOMPLETE))
|
||||||
|
{
|
||||||
|
finish_op(cur_op, -EIO);
|
||||||
|
return;
|
||||||
|
}
|
||||||
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0);
|
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0);
|
||||||
submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);
|
submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);
|
||||||
op_data->st = 1;
|
op_data->st = 1;
|
||||||
@@ -228,7 +238,15 @@ resume_1:
|
|||||||
resume_2:
|
resume_2:
|
||||||
if (op_data->errors > 0)
|
if (op_data->errors > 0)
|
||||||
{
|
{
|
||||||
finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
|
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
|
||||||
|
{
|
||||||
|
// I/O or checksum error
|
||||||
|
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
|
||||||
|
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
|
||||||
|
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
|
||||||
|
goto resume_0;
|
||||||
|
}
|
||||||
|
finish_op(cur_op, op_data->errcode);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
cur_op->reply.rw.version = op_data->fact_ver;
|
cur_op->reply.rw.version = op_data->fact_ver;
|
||||||
@@ -266,10 +284,144 @@ resume_2:
|
|||||||
finish_op(cur_op, cur_op->req.rw.len);
|
finish_op(cur_op, cur_op->req.rw.len);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Decrement pg_osd_set_state_t's object_count and change PG state accordingly
|
pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, osd_rmw_stripe_t *stripes, bool ref)
|
||||||
void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t & pg)
|
|
||||||
{
|
{
|
||||||
if (object_state->state & OBJ_INCOMPLETE)
|
pg_osd_set_state_t *object_state = NULL;
|
||||||
|
get_object_osd_set(pg, oid, &object_state);
|
||||||
|
if (prev_object_state != object_state)
|
||||||
|
{
|
||||||
|
// Object state changed in between by a parallel I/O operation, skip marking as failed
|
||||||
|
if (ref)
|
||||||
|
{
|
||||||
|
deref_object_state(pg, &prev_object_state, ref);
|
||||||
|
if (object_state)
|
||||||
|
object_state->ref_count++;
|
||||||
|
}
|
||||||
|
return object_state;
|
||||||
|
}
|
||||||
|
pg_osd_set_t corrupted_set;
|
||||||
|
if (object_state)
|
||||||
|
{
|
||||||
|
corrupted_set = object_state->osd_set;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for (int i = 0; i < pg.cur_set.size(); i++)
|
||||||
|
{
|
||||||
|
corrupted_set.push_back((pg_obj_loc_t){
|
||||||
|
.role = (pg.scheme == POOL_SCHEME_REPLICATED ? 0 : (uint64_t)i),
|
||||||
|
.osd_num = pg.cur_set[i],
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Mark object chunk(s) as corrupted
|
||||||
|
uint64_t has_roles = 0, n_roles = 0, n_copies = 0, n_corrupted = 0;
|
||||||
|
for (auto & chunk: corrupted_set)
|
||||||
|
{
|
||||||
|
bool corrupted = stripes[chunk.role].osd_num == chunk.osd_num && stripes[chunk.role].read_error;
|
||||||
|
if (corrupted && !(chunk.loc_bad & LOC_CORRUPTED))
|
||||||
|
n_corrupted++;
|
||||||
|
chunk.loc_bad = chunk.loc_bad | (corrupted ? LOC_CORRUPTED : 0);
|
||||||
|
if (!chunk.loc_bad)
|
||||||
|
{
|
||||||
|
if (pg.scheme == POOL_SCHEME_REPLICATED)
|
||||||
|
n_roles = 1;
|
||||||
|
else if (!(has_roles & (1 << chunk.role)))
|
||||||
|
{
|
||||||
|
n_roles++;
|
||||||
|
has_roles |= (1 << chunk.role);
|
||||||
|
}
|
||||||
|
n_copies++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!n_corrupted)
|
||||||
|
{
|
||||||
|
// No chunks newly marked as corrupted - object is already marked or moved
|
||||||
|
return object_state;
|
||||||
|
}
|
||||||
|
int old_pg_state = pg.state;
|
||||||
|
if (object_state)
|
||||||
|
{
|
||||||
|
remove_object_from_state(oid, &object_state, pg, false);
|
||||||
|
deref_object_state(pg, &object_state, ref);
|
||||||
|
}
|
||||||
|
// Calculate object state
|
||||||
|
uint64_t obj_state = OBJ_CORRUPTED;
|
||||||
|
int pg_state_bits = PG_HAS_CORRUPTED;
|
||||||
|
this->corrupted_objects++;
|
||||||
|
pg.corrupted_count++;
|
||||||
|
if (log_level > 1)
|
||||||
|
{
|
||||||
|
printf("Marking object %lx:%lx corrupted: %lu chunks / %lu copies available, %lu corrupted\n",
|
||||||
|
oid.inode, oid.stripe, n_roles, n_copies, n_corrupted);
|
||||||
|
}
|
||||||
|
if (n_roles < pg.pg_data_size)
|
||||||
|
{
|
||||||
|
this->incomplete_objects++;
|
||||||
|
obj_state |= OBJ_INCOMPLETE;
|
||||||
|
pg_state_bits = PG_HAS_INCOMPLETE;
|
||||||
|
}
|
||||||
|
else if (n_roles < pg.pg_cursize)
|
||||||
|
{
|
||||||
|
this->degraded_objects++;
|
||||||
|
obj_state |= OBJ_DEGRADED;
|
||||||
|
pg_state_bits = PG_HAS_DEGRADED;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
this->misplaced_objects++;
|
||||||
|
obj_state |= OBJ_MISPLACED;
|
||||||
|
pg_state_bits = PG_HAS_MISPLACED;
|
||||||
|
}
|
||||||
|
pg.state |= pg_state_bits;
|
||||||
|
if (pg.state != old_pg_state)
|
||||||
|
{
|
||||||
|
report_pg_state(pg);
|
||||||
|
if ((pg.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED)) !=
|
||||||
|
(old_pg_state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
|
||||||
|
{
|
||||||
|
peering_state = peering_state | OSD_RECOVERING;
|
||||||
|
if ((pg.state & PG_HAS_DEGRADED) != (old_pg_state & PG_HAS_DEGRADED))
|
||||||
|
{
|
||||||
|
// Restart recovery from degraded objects
|
||||||
|
recovery_last_degraded = true;
|
||||||
|
recovery_last_pg = {};
|
||||||
|
recovery_last_oid = {};
|
||||||
|
}
|
||||||
|
ringloop->wakeup();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Insert object into the new state and retry
|
||||||
|
object_state = pg.add_object_to_state(oid, obj_state, corrupted_set);
|
||||||
|
if (ref)
|
||||||
|
object_state->ref_count++;
|
||||||
|
return object_state;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decrement pg_osd_set_state_t's object_count and change PG state accordingly
|
||||||
|
void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t & pg, bool report)
|
||||||
|
{
|
||||||
|
if (!*object_state)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
pg_osd_set_state_t *recheck_state = NULL;
|
||||||
|
get_object_osd_set(pg, oid, &recheck_state);
|
||||||
|
if (recheck_state != *object_state)
|
||||||
|
{
|
||||||
|
recheck_state->ref_count++;
|
||||||
|
(*object_state)->ref_count--;
|
||||||
|
*object_state = recheck_state;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
(*object_state)->object_count--;
|
||||||
|
if ((*object_state)->state & OBJ_CORRUPTED)
|
||||||
|
{
|
||||||
|
this->corrupted_objects--;
|
||||||
|
pg.corrupted_count--;
|
||||||
|
}
|
||||||
|
bool changed = false;
|
||||||
|
if ((*object_state)->state & OBJ_INCOMPLETE)
|
||||||
{
|
{
|
||||||
// Successful write means that object is not incomplete anymore
|
// Successful write means that object is not incomplete anymore
|
||||||
this->incomplete_objects--;
|
this->incomplete_objects--;
|
||||||
@@ -277,41 +429,52 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object
|
|||||||
if (!pg.incomplete_objects.size())
|
if (!pg.incomplete_objects.size())
|
||||||
{
|
{
|
||||||
pg.state = pg.state & ~PG_HAS_INCOMPLETE;
|
pg.state = pg.state & ~PG_HAS_INCOMPLETE;
|
||||||
report_pg_state(pg);
|
changed = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (object_state->state & OBJ_DEGRADED)
|
else if ((*object_state)->state & OBJ_DEGRADED)
|
||||||
{
|
{
|
||||||
this->degraded_objects--;
|
this->degraded_objects--;
|
||||||
pg.degraded_objects.erase(oid);
|
pg.degraded_objects.erase(oid);
|
||||||
if (!pg.degraded_objects.size())
|
if (!pg.degraded_objects.size())
|
||||||
{
|
{
|
||||||
pg.state = pg.state & ~PG_HAS_DEGRADED;
|
pg.state = pg.state & ~PG_HAS_DEGRADED;
|
||||||
report_pg_state(pg);
|
changed = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (object_state->state & OBJ_MISPLACED)
|
else if ((*object_state)->state & OBJ_MISPLACED)
|
||||||
{
|
{
|
||||||
this->misplaced_objects--;
|
this->misplaced_objects--;
|
||||||
pg.misplaced_objects.erase(oid);
|
pg.misplaced_objects.erase(oid);
|
||||||
if (!pg.misplaced_objects.size())
|
if (!pg.misplaced_objects.size())
|
||||||
{
|
{
|
||||||
pg.state = pg.state & ~PG_HAS_MISPLACED;
|
pg.state = pg.state & ~PG_HAS_MISPLACED;
|
||||||
report_pg_state(pg);
|
changed = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
|
throw std::runtime_error("BUG: Invalid object state: "+std::to_string((*object_state)->state));
|
||||||
|
}
|
||||||
|
if (changed && report)
|
||||||
|
{
|
||||||
|
report_pg_state(pg);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Side-by-side diff of a single function: the old free_object_state() (left column)
// is replaced by deref_object_state() (right column), which gains a `deref` flag.
// Old version: pre-decrements object_count and, when it reaches zero, erases the
//   state entry from pg.state_dict and sets *object_state to NULL.
// New version: does nothing when *object_state is NULL; otherwise it optionally drops
//   one reference (deref == true) and then erases the entry from pg.state_dict and
//   sets *object_state to NULL only when both object_count and ref_count are zero.
//   It no longer touches object_count itself.
void osd_t::free_object_state(pg_t & pg, pg_osd_set_state_t **object_state)
|
void osd_t::deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref)
|
||||||
{
|
{
|
||||||
if (*object_state && !(--(*object_state)->object_count))
|
if (*object_state)
|
||||||
{
|
{
|
||||||
pg.state_dict.erase((*object_state)->osd_set);
|
if (deref)
|
||||||
*object_state = NULL;
|
{
|
||||||
|
(*object_state)->ref_count--;
|
||||||
|
}
|
||||||
|
if (!(*object_state)->object_count && !(*object_state)->ref_count)
|
||||||
|
{
|
||||||
|
pg.state_dict.erase((*object_state)->osd_set);
|
||||||
|
*object_state = NULL;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -341,21 +504,28 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
|
|||||||
}
|
}
|
||||||
resume_1:
|
resume_1:
|
||||||
// Determine which OSDs contain this object and delete it
|
// Determine which OSDs contain this object and delete it
|
||||||
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
|
op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
|
||||||
|
if (op_data->object_state)
|
||||||
|
{
|
||||||
|
op_data->object_state->ref_count++;
|
||||||
|
}
|
||||||
// Submit 1 read to determine the actual version number
|
// Submit 1 read to determine the actual version number
|
||||||
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
|
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
|
||||||
|
op_data->prev_set = NULL;
|
||||||
resume_2:
|
resume_2:
|
||||||
op_data->st = 2;
|
op_data->st = 2;
|
||||||
return;
|
return;
|
||||||
resume_3:
|
resume_3:
|
||||||
if (op_data->errors > 0)
|
if (op_data->errors > 0)
|
||||||
{
|
{
|
||||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
deref_object_state(pg, &op_data->object_state, true);
|
||||||
|
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// Check CAS version
|
// Check CAS version
|
||||||
if (cur_op->req.rw.version && op_data->fact_ver != (cur_op->req.rw.version-1))
|
if (cur_op->req.rw.version && op_data->fact_ver != (cur_op->req.rw.version-1))
|
||||||
{
|
{
|
||||||
|
deref_object_state(pg, &op_data->object_state, true);
|
||||||
cur_op->reply.hdr.retval = -EINTR;
|
cur_op->reply.hdr.retval = -EINTR;
|
||||||
cur_op->reply.rw.version = op_data->fact_ver;
|
cur_op->reply.rw.version = op_data->fact_ver;
|
||||||
goto continue_others;
|
goto continue_others;
|
||||||
@@ -371,7 +541,8 @@ resume_4:
|
|||||||
resume_5:
|
resume_5:
|
||||||
if (op_data->errors > 0)
|
if (op_data->errors > 0)
|
||||||
{
|
{
|
||||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
deref_object_state(pg, &op_data->object_state, true);
|
||||||
|
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// Remove version override
|
// Remove version override
|
||||||
@@ -383,8 +554,8 @@ resume_5:
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
remove_object_from_state(op_data->oid, op_data->object_state, pg);
|
remove_object_from_state(op_data->oid, &op_data->object_state, pg);
|
||||||
free_object_state(pg, &op_data->object_state);
|
deref_object_state(pg, &op_data->object_state, true);
|
||||||
}
|
}
|
||||||
pg.total_count--;
|
pg.total_count--;
|
||||||
cur_op->reply.hdr.retval = 0;
|
cur_op->reply.hdr.retval = 0;
|
||||||
|
@@ -9,6 +9,7 @@
|
|||||||
#define SUBMIT_READ 0
|
#define SUBMIT_READ 0
|
||||||
#define SUBMIT_RMW_READ 1
|
#define SUBMIT_RMW_READ 1
|
||||||
#define SUBMIT_WRITE 2
|
#define SUBMIT_WRITE 2
|
||||||
|
#define SUBMIT_SCRUB_READ 3
|
||||||
|
|
||||||
struct unstable_osd_num_t
|
struct unstable_osd_num_t
|
||||||
{
|
{
|
||||||
@@ -24,7 +25,7 @@ struct osd_primary_op_data_t
|
|||||||
uint64_t target_ver;
|
uint64_t target_ver;
|
||||||
uint64_t orig_ver = 0, fact_ver = 0;
|
uint64_t orig_ver = 0, fact_ver = 0;
|
||||||
uint64_t scheme = 0;
|
uint64_t scheme = 0;
|
||||||
int n_subops = 0, done = 0, errors = 0, epipe = 0;
|
int n_subops = 0, done = 0, errors = 0, errcode = 0;
|
||||||
int degraded = 0, pg_size, pg_data_size;
|
int degraded = 0, pg_size, pg_data_size;
|
||||||
osd_rmw_stripe_t *stripes;
|
osd_rmw_stripe_t *stripes;
|
||||||
osd_op_t *subops = NULL;
|
osd_op_t *subops = NULL;
|
||||||
@@ -50,6 +51,7 @@ struct osd_primary_op_data_t
|
|||||||
// for read_bitmaps
|
// for read_bitmaps
|
||||||
void *snapshot_bitmaps;
|
void *snapshot_bitmaps;
|
||||||
inode_t *read_chain;
|
inode_t *read_chain;
|
||||||
|
pg_osd_set_state_t **chain_states;
|
||||||
uint8_t *missing_flags;
|
uint8_t *missing_flags;
|
||||||
int chain_size;
|
int chain_size;
|
||||||
osd_chain_read_t *chain_reads;
|
osd_chain_read_t *chain_reads;
|
||||||
|
@@ -40,10 +40,24 @@ resume_3:
|
|||||||
resume_4:
|
resume_4:
|
||||||
if (op_data->errors > 0)
|
if (op_data->errors > 0)
|
||||||
{
|
{
|
||||||
free(op_data->chain_reads);
|
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
|
||||||
op_data->chain_reads = NULL;
|
{
|
||||||
finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
|
// Handle corrupted reads and retry...
|
||||||
return;
|
check_corrupted_chained(pg, cur_op);
|
||||||
|
free(cur_op->buf);
|
||||||
|
cur_op->buf = NULL;
|
||||||
|
free(op_data->chain_reads);
|
||||||
|
op_data->chain_reads = NULL;
|
||||||
|
// FIXME: We can in theory retry only specific parts instead of the whole operation
|
||||||
|
goto resume_1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
free(op_data->chain_reads);
|
||||||
|
op_data->chain_reads = NULL;
|
||||||
|
finish_op(cur_op, op_data->errcode);
|
||||||
|
return;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
send_chained_read_results(pg, cur_op);
|
send_chained_read_results(pg, cur_op);
|
||||||
finish_op(cur_op, cur_op->req.rw.len);
|
finish_op(cur_op, cur_op->req.rw.len);
|
||||||
@@ -131,8 +145,7 @@ int osd_t::collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitm
|
|||||||
object_id cur_oid = { .inode = op_data->read_chain[chain_num], .stripe = op_data->oid.stripe };
|
object_id cur_oid = { .inode = op_data->read_chain[chain_num], .stripe = op_data->oid.stripe };
|
||||||
auto vo_it = pg.ver_override.find(cur_oid);
|
auto vo_it = pg.ver_override.find(cur_oid);
|
||||||
uint64_t target_version = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
|
uint64_t target_version = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
|
||||||
pg_osd_set_state_t *object_state;
|
uint64_t* cur_set = get_object_osd_set(pg, cur_oid, &op_data->chain_states[chain_num]);
|
||||||
uint64_t* cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
|
|
||||||
if (pg.scheme == POOL_SCHEME_REPLICATED)
|
if (pg.scheme == POOL_SCHEME_REPLICATED)
|
||||||
{
|
{
|
||||||
osd_num_t read_target = 0;
|
osd_num_t read_target = 0;
|
||||||
@@ -247,6 +260,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
|
|||||||
osd_op_t *subop = op_data->subops+subop_idx;
|
osd_op_t *subop = op_data->subops+subop_idx;
|
||||||
subop->op_type = OSD_OP_OUT;
|
subop->op_type = OSD_OP_OUT;
|
||||||
// FIXME: Use the pre-allocated buffer
|
// FIXME: Use the pre-allocated buffer
|
||||||
|
assert(!subop->buf);
|
||||||
subop->buf = malloc_or_die(sizeof(obj_ver_id)*(i+1-prev));
|
subop->buf = malloc_or_die(sizeof(obj_ver_id)*(i+1-prev));
|
||||||
subop->req = (osd_any_op_t){
|
subop->req = (osd_any_op_t){
|
||||||
.sec_read_bmp = {
|
.sec_read_bmp = {
|
||||||
@@ -297,7 +311,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
|
|||||||
// Fail it immediately
|
// Fail it immediately
|
||||||
subop->peer_fd = -1;
|
subop->peer_fd = -1;
|
||||||
subop->reply.hdr.retval = -EPIPE;
|
subop->reply.hdr.retval = -EPIPE;
|
||||||
subop->callback(subop);
|
ringloop->set_immediate([subop]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
|
||||||
}
|
}
|
||||||
subop_idx++;
|
subop_idx++;
|
||||||
}
|
}
|
||||||
@@ -375,6 +389,8 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
|||||||
op_data->chain_read_count = chain_reads.size();
|
op_data->chain_read_count = chain_reads.size();
|
||||||
op_data->chain_reads = (osd_chain_read_t*)calloc_or_die(
|
op_data->chain_reads = (osd_chain_read_t*)calloc_or_die(
|
||||||
1, sizeof(osd_chain_read_t) * chain_reads.size()
|
1, sizeof(osd_chain_read_t) * chain_reads.size()
|
||||||
|
// FIXME: Allocate only <chain_reads.size()> instead of <chain_size> stripes
|
||||||
|
// (but it's slightly harder to handle in send_chained_read_results())
|
||||||
+ sizeof(osd_rmw_stripe_t) * stripe_count * op_data->chain_size
|
+ sizeof(osd_rmw_stripe_t) * stripe_count * op_data->chain_size
|
||||||
);
|
);
|
||||||
osd_rmw_stripe_t *chain_stripes = (osd_rmw_stripe_t*)(
|
osd_rmw_stripe_t *chain_stripes = (osd_rmw_stripe_t*)(
|
||||||
@@ -403,8 +419,7 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
|||||||
uint64_t *cur_set = pg.cur_set.data();
|
uint64_t *cur_set = pg.cur_set.data();
|
||||||
if (pg.state != PG_ACTIVE)
|
if (pg.state != PG_ACTIVE)
|
||||||
{
|
{
|
||||||
pg_osd_set_state_t *object_state;
|
cur_set = get_object_osd_set(pg, cur_oid, &op_data->chain_states[chain_reads[cri].chain_pos]);
|
||||||
cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
|
|
||||||
if (op_data->scheme != POOL_SCHEME_REPLICATED)
|
if (op_data->scheme != POOL_SCHEME_REPLICATED)
|
||||||
{
|
{
|
||||||
if (extend_missing_stripes(stripes, cur_set, pg.pg_data_size, pg.pg_size) < 0)
|
if (extend_missing_stripes(stripes, cur_set, pg.pg_data_size, pg.pg_size) < 0)
|
||||||
@@ -416,6 +431,17 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
|||||||
}
|
}
|
||||||
op_data->degraded = 1;
|
op_data->degraded = 1;
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
auto cur_state = op_data->chain_states[chain_reads[cri].chain_pos];
|
||||||
|
if (cur_state && (cur_state->state & OBJ_INCOMPLETE))
|
||||||
|
{
|
||||||
|
free(op_data->chain_reads);
|
||||||
|
op_data->chain_reads = NULL;
|
||||||
|
finish_op(cur_op, -EIO);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||||
{
|
{
|
||||||
@@ -433,6 +459,7 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
assert(!cur_op->buf);
|
||||||
cur_op->buf = memalign_or_die(MEM_ALIGNMENT, read_buffer_size);
|
cur_op->buf = memalign_or_die(MEM_ALIGNMENT, read_buffer_size);
|
||||||
void *cur_buf = cur_op->buf;
|
void *cur_buf = cur_op->buf;
|
||||||
for (int cri = 0; cri < chain_reads.size(); cri++)
|
for (int cri = 0; cri < chain_reads.size(); cri++)
|
||||||
@@ -468,12 +495,8 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
|||||||
object_id cur_oid = { .inode = chain_reads[cri].inode, .stripe = op_data->oid.stripe };
|
object_id cur_oid = { .inode = chain_reads[cri].inode, .stripe = op_data->oid.stripe };
|
||||||
auto vo_it = pg.ver_override.find(cur_oid);
|
auto vo_it = pg.ver_override.find(cur_oid);
|
||||||
uint64_t target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
|
uint64_t target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
|
||||||
uint64_t *cur_set = pg.cur_set.data();
|
auto cur_state = op_data->chain_states[chain_reads[cri].chain_pos];
|
||||||
if (pg.state != PG_ACTIVE)
|
uint64_t *cur_set = (pg.state != PG_ACTIVE && cur_state ? cur_state->read_target.data() : pg.cur_set.data());
|
||||||
{
|
|
||||||
pg_osd_set_state_t *object_state;
|
|
||||||
cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
|
|
||||||
}
|
|
||||||
int zero_read = -1;
|
int zero_read = -1;
|
||||||
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||||
{
|
{
|
||||||
@@ -487,6 +510,33 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Walk over all chained (snapshot layer) reads of <cur_op> and mark every object
// for which at least one stripe came back with a read error as corrupted in <pg>.
// Objects are marked via mark_object_corrupted() without taking an extra reference
// (last argument is false).
void osd_t::check_corrupted_chained(pg_t & pg, osd_op_t *cur_op)
{
    osd_primary_op_data_t *op_data = cur_op->op_data;
    // Replicated pools use a single stripe per chain element, other schemes use pg_size stripes
    const int stripes_per_item = (pg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size);
    // Stripe descriptors are located right after the osd_chain_read_t array
    osd_rmw_stripe_t *stripe_base = (osd_rmw_stripe_t*)(
        (uint8_t*)op_data->chain_reads + sizeof(osd_chain_read_t) * op_data->chain_read_count
    );
    for (int cri = 0; cri < op_data->chain_read_count; cri++)
    {
        osd_rmw_stripe_t *stripes = stripe_base + op_data->chain_reads[cri].chain_pos*stripes_per_item;
        // Find the first stripe flagged with a read error, if any
        int bad_idx = 0;
        while (bad_idx < stripes_per_item && !stripes[bad_idx].read_error)
        {
            bad_idx++;
        }
        if (bad_idx >= stripes_per_item)
        {
            // All stripes of this chain element were read fine
            continue;
        }
        object_id cur_oid = { .inode = op_data->chain_reads[cri].inode, .stripe = op_data->oid.stripe };
        mark_object_corrupted(pg, cur_oid, op_data->chain_states[op_data->chain_reads[cri].chain_pos], stripes, false);
    }
}
|
||||||
|
|
||||||
void osd_t::send_chained_read_results(pg_t & pg, osd_op_t *cur_op)
|
void osd_t::send_chained_read_results(pg_t & pg, osd_op_t *cur_op)
|
||||||
{
|
{
|
||||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||||
|
@@ -9,6 +9,7 @@ void osd_t::autosync()
|
|||||||
{
|
{
|
||||||
autosync_op = new osd_op_t();
|
autosync_op = new osd_op_t();
|
||||||
autosync_op->op_type = OSD_OP_IN;
|
autosync_op->op_type = OSD_OP_IN;
|
||||||
|
autosync_op->peer_fd = -1;
|
||||||
autosync_op->req = (osd_any_op_t){
|
autosync_op->req = (osd_any_op_t){
|
||||||
.sync = {
|
.sync = {
|
||||||
.header = {
|
.header = {
|
||||||
@@ -122,7 +123,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, const ui
|
|||||||
zero_read = -1;
|
zero_read = -1;
|
||||||
osd_op_t *subops = new osd_op_t[n_subops];
|
osd_op_t *subops = new osd_op_t[n_subops];
|
||||||
op_data->fact_ver = 0;
|
op_data->fact_ver = 0;
|
||||||
op_data->done = op_data->errors = 0;
|
op_data->done = op_data->errors = op_data->errcode = 0;
|
||||||
op_data->n_subops = n_subops;
|
op_data->n_subops = n_subops;
|
||||||
op_data->subops = subops;
|
op_data->subops = subops;
|
||||||
int sent = submit_primary_subop_batch(submit_type, op_data->oid.inode, op_version, op_data->stripes, osd_set, cur_op, 0, zero_read);
|
int sent = submit_primary_subop_batch(submit_type, op_data->oid.inode, op_version, op_data->stripes, osd_set, cur_op, 0, zero_read);
|
||||||
@@ -139,34 +140,40 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
|||||||
for (int role = 0; role < op_data->pg_size; role++)
|
for (int role = 0; role < op_data->pg_size; role++)
|
||||||
{
|
{
|
||||||
// We always submit zero-length writes to all replicas, even if the stripe is not modified
|
// We always submit zero-length writes to all replicas, even if the stripe is not modified
|
||||||
if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role))
|
if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role || submit_type == SUBMIT_SCRUB_READ))
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
osd_num_t role_osd_num = osd_set[role];
|
osd_num_t role_osd_num = osd_set[role];
|
||||||
|
int stripe_num = rep ? 0 : role;
|
||||||
if (role_osd_num != 0)
|
if (role_osd_num != 0)
|
||||||
{
|
{
|
||||||
int stripe_num = rep ? 0 : role;
|
|
||||||
osd_op_t *subop = op_data->subops + i;
|
osd_op_t *subop = op_data->subops + i;
|
||||||
|
stripes[stripe_num].osd_num = role_osd_num;
|
||||||
|
stripes[stripe_num].read_error = false;
|
||||||
|
subop->bitmap = stripes[stripe_num].bmp_buf;
|
||||||
|
subop->bitmap_len = clean_entry_bitmap_size;
|
||||||
|
// Using rmw_buf to pass pointer to stripes. Dirty but should work
|
||||||
|
subop->rmw_buf = stripes+stripe_num;
|
||||||
if (role_osd_num == this->osd_num)
|
if (role_osd_num == this->osd_num)
|
||||||
{
|
{
|
||||||
clock_gettime(CLOCK_REALTIME, &subop->tv_begin);
|
clock_gettime(CLOCK_REALTIME, &subop->tv_begin);
|
||||||
subop->op_type = (uint64_t)cur_op;
|
subop->op_type = (uint64_t)cur_op;
|
||||||
subop->bitmap = stripes[stripe_num].bmp_buf;
|
subop->bs_op = new blockstore_op_t((blockstore_op_t){
|
||||||
subop->bitmap_len = clean_entry_bitmap_size;
|
|
||||||
subop->bs_op = new blockstore_op_t({
|
|
||||||
.opcode = (uint64_t)(wr ? (rep ? BS_OP_WRITE_STABLE : BS_OP_WRITE) : BS_OP_READ),
|
.opcode = (uint64_t)(wr ? (rep ? BS_OP_WRITE_STABLE : BS_OP_WRITE) : BS_OP_READ),
|
||||||
.callback = [subop, this](blockstore_op_t *bs_subop)
|
.callback = [subop, this](blockstore_op_t *bs_subop)
|
||||||
{
|
{
|
||||||
handle_primary_bs_subop(subop);
|
handle_primary_bs_subop(subop);
|
||||||
},
|
},
|
||||||
.oid = {
|
{
|
||||||
.inode = inode,
|
.oid = (object_id){
|
||||||
.stripe = op_data->oid.stripe | stripe_num,
|
.inode = inode,
|
||||||
|
.stripe = op_data->oid.stripe | stripe_num,
|
||||||
|
},
|
||||||
|
.version = op_version,
|
||||||
|
.offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
|
||||||
|
.len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start,
|
||||||
},
|
},
|
||||||
.version = op_version,
|
|
||||||
.offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
|
|
||||||
.len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start,
|
|
||||||
.buf = wr ? stripes[stripe_num].write_buf : stripes[stripe_num].read_buf,
|
.buf = wr ? stripes[stripe_num].write_buf : stripes[stripe_num].read_buf,
|
||||||
.bitmap = stripes[stripe_num].bmp_buf,
|
.bitmap = stripes[stripe_num].bmp_buf,
|
||||||
});
|
});
|
||||||
@@ -182,8 +189,6 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
subop->op_type = OSD_OP_OUT;
|
subop->op_type = OSD_OP_OUT;
|
||||||
subop->bitmap = stripes[stripe_num].bmp_buf;
|
|
||||||
subop->bitmap_len = clean_entry_bitmap_size;
|
|
||||||
subop->req.sec_rw = {
|
subop->req.sec_rw = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
@@ -235,11 +240,15 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
|||||||
// Fail it immediately
|
// Fail it immediately
|
||||||
subop->peer_fd = -1;
|
subop->peer_fd = -1;
|
||||||
subop->reply.hdr.retval = -EPIPE;
|
subop->reply.hdr.retval = -EPIPE;
|
||||||
subop->callback(subop);
|
ringloop->set_immediate([subop]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
stripes[stripe_num].osd_num = 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return i-subop_idx;
|
return i-subop_idx;
|
||||||
}
|
}
|
||||||
@@ -263,9 +272,11 @@ void osd_t::handle_primary_bs_subop(osd_op_t *subop)
|
|||||||
blockstore_op_t *bs_op = subop->bs_op;
|
blockstore_op_t *bs_op = subop->bs_op;
|
||||||
int expected = bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE
|
int expected = bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE
|
||||||
|| bs_op->opcode == BS_OP_WRITE_STABLE ? bs_op->len : 0;
|
|| bs_op->opcode == BS_OP_WRITE_STABLE ? bs_op->len : 0;
|
||||||
if (bs_op->retval != expected && bs_op->opcode != BS_OP_READ)
|
if (bs_op->retval != expected && bs_op->opcode != BS_OP_READ &&
|
||||||
|
(bs_op->opcode != BS_OP_WRITE && bs_op->opcode != BS_OP_WRITE_STABLE ||
|
||||||
|
bs_op->retval != -ENOSPC))
|
||||||
{
|
{
|
||||||
// die
|
// die on any error except ENOSPC
|
||||||
throw std::runtime_error(
|
throw std::runtime_error(
|
||||||
"local blockstore modification failed (opcode = "+std::to_string(bs_op->opcode)+
|
"local blockstore modification failed (opcode = "+std::to_string(bs_op->opcode)+
|
||||||
" retval = "+std::to_string(bs_op->retval)+")"
|
" retval = "+std::to_string(bs_op->retval)+")"
|
||||||
@@ -276,6 +287,8 @@ void osd_t::handle_primary_bs_subop(osd_op_t *subop)
|
|||||||
subop->reply.hdr.retval = bs_op->retval;
|
subop->reply.hdr.retval = bs_op->retval;
|
||||||
if (bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE || bs_op->opcode == BS_OP_WRITE_STABLE)
|
if (bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE || bs_op->opcode == BS_OP_WRITE_STABLE)
|
||||||
{
|
{
|
||||||
|
subop->req.sec_rw.oid = bs_op->oid;
|
||||||
|
subop->req.sec_rw.version = bs_op->version;
|
||||||
subop->req.sec_rw.len = bs_op->len;
|
subop->req.sec_rw.len = bs_op->len;
|
||||||
subop->reply.sec_rw.version = bs_op->version;
|
subop->reply.sec_rw.version = bs_op->version;
|
||||||
}
|
}
|
||||||
@@ -325,9 +338,11 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
|||||||
if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
|
if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"%s subop to %lx:%lx v%lu failed on peer %d: retval = %d (expected %d)\n",
|
subop->peer_fd >= 0
|
||||||
|
? "%1$s subop to %2$lx:%3$lx v%4$lu failed on peer %7$d: retval = %5$d (expected %6$d)\n"
|
||||||
|
: "%1$s subop to %2$lx:%3$lx v%4$lu failed locally: retval = %5$d (expected %6$d)\n",
|
||||||
osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, subop->req.sec_rw.version,
|
osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, subop->req.sec_rw.version,
|
||||||
subop->peer_fd, retval, expected
|
retval, expected, subop->peer_fd
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -337,19 +352,32 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
|||||||
osd_op_names[opcode], subop->peer_fd, retval, expected
|
osd_op_names[opcode], subop->peer_fd, retval, expected
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
if (retval == -EPIPE)
|
if (opcode == OSD_OP_SEC_READ && (retval == -EIO || retval == -EDOM))
|
||||||
{
|
{
|
||||||
op_data->epipe++;
|
// We'll retry reads from other replica(s) on EIO/EDOM and mark object as corrupted
|
||||||
|
((osd_rmw_stripe_t*)subop->rmw_buf)->read_error = true;
|
||||||
|
}
|
||||||
|
subop->rmw_buf = NULL;
|
||||||
|
// Error priority: EIO > EDOM > ENOSPC > EPIPE
|
||||||
|
if (op_data->errcode == 0 ||
|
||||||
|
retval == -EIO ||
|
||||||
|
retval == -EDOM && (op_data->errcode == -ENOSPC || op_data->errcode == -EPIPE) ||
|
||||||
|
retval == -ENOSPC && op_data->errcode == -EPIPE)
|
||||||
|
{
|
||||||
|
op_data->errcode = retval;
|
||||||
}
|
}
|
||||||
op_data->errors++;
|
op_data->errors++;
|
||||||
if (subop->peer_fd >= 0)
|
if (subop->peer_fd >= 0 && retval != -EDOM &&
|
||||||
|
(retval != -ENOSPC || opcode != OSD_OP_SEC_WRITE && opcode != OSD_OP_SEC_WRITE_STABLE) &&
|
||||||
|
(retval != -EIO || opcode != OSD_OP_SEC_READ))
|
||||||
{
|
{
|
||||||
// Drop connection on any error
|
// Drop connection on unexpected errors
|
||||||
msgr.stop_client(subop->peer_fd);
|
msgr.stop_client(subop->peer_fd);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
subop->rmw_buf = NULL;
|
||||||
op_data->done++;
|
op_data->done++;
|
||||||
if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
|
if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
|
||||||
{
|
{
|
||||||
@@ -393,6 +421,10 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
|||||||
{
|
{
|
||||||
continue_primary_del(cur_op);
|
continue_primary_del(cur_op);
|
||||||
}
|
}
|
||||||
|
else if (cur_op->req.hdr.opcode == OSD_OP_SCRUB)
|
||||||
|
{
|
||||||
|
continue_primary_scrub(cur_op);
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
throw std::runtime_error("BUG: unknown opcode");
|
throw std::runtime_error("BUG: unknown opcode");
|
||||||
@@ -408,7 +440,8 @@ void osd_t::cancel_primary_write(osd_op_t *cur_op)
|
|||||||
// are sent to peer OSDs, so we can't just throw them away.
|
// are sent to peer OSDs, so we can't just throw them away.
|
||||||
// Mark them with an extra EPIPE.
|
// Mark them with an extra EPIPE.
|
||||||
cur_op->op_data->errors++;
|
cur_op->op_data->errors++;
|
||||||
cur_op->op_data->epipe++;
|
if (cur_op->op_data->errcode == 0)
|
||||||
|
cur_op->op_data->errcode = -EPIPE;
|
||||||
cur_op->op_data->done--; // Caution: `done` must be signed because may become -1 here
|
cur_op->op_data->done--; // Caution: `done` must be signed because may become -1 here
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -460,7 +493,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
|
|||||||
{
|
{
|
||||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||||
op_data->n_subops = chunks_to_delete_count;
|
op_data->n_subops = chunks_to_delete_count;
|
||||||
op_data->done = op_data->errors = 0;
|
op_data->done = op_data->errors = op_data->errcode = 0;
|
||||||
if (!op_data->n_subops)
|
if (!op_data->n_subops)
|
||||||
{
|
{
|
||||||
return;
|
return;
|
||||||
@@ -512,7 +545,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
|
|||||||
// Fail it immediately
|
// Fail it immediately
|
||||||
subops[i].peer_fd = -1;
|
subops[i].peer_fd = -1;
|
||||||
subops[i].reply.hdr.retval = -EPIPE;
|
subops[i].reply.hdr.retval = -EPIPE;
|
||||||
subops[i].callback(&subops[i]);
|
ringloop->set_immediate([subop = &subops[i]]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -523,7 +556,7 @@ int osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
|
|||||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||||
int n_osds = op_data->dirty_osd_count;
|
int n_osds = op_data->dirty_osd_count;
|
||||||
osd_op_t *subops = new osd_op_t[n_osds];
|
osd_op_t *subops = new osd_op_t[n_osds];
|
||||||
op_data->done = op_data->errors = 0;
|
op_data->done = op_data->errors = op_data->errcode = 0;
|
||||||
op_data->n_subops = n_osds;
|
op_data->n_subops = n_osds;
|
||||||
op_data->subops = subops;
|
op_data->subops = subops;
|
||||||
std::map<uint64_t, int>::iterator peer_it;
|
std::map<uint64_t, int>::iterator peer_it;
|
||||||
@@ -579,7 +612,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
|||||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||||
int n_osds = op_data->unstable_write_osds->size();
|
int n_osds = op_data->unstable_write_osds->size();
|
||||||
osd_op_t *subops = new osd_op_t[n_osds];
|
osd_op_t *subops = new osd_op_t[n_osds];
|
||||||
op_data->done = op_data->errors = 0;
|
op_data->done = op_data->errors = op_data->errcode = 0;
|
||||||
op_data->n_subops = n_osds;
|
op_data->n_subops = n_osds;
|
||||||
op_data->subops = subops;
|
op_data->subops = subops;
|
||||||
for (int i = 0; i < n_osds; i++)
|
for (int i = 0; i < n_osds; i++)
|
||||||
@@ -595,7 +628,9 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
|||||||
{
|
{
|
||||||
handle_primary_bs_subop(subop);
|
handle_primary_bs_subop(subop);
|
||||||
},
|
},
|
||||||
.len = (uint32_t)stab_osd.len,
|
{
|
||||||
|
.len = (uint32_t)stab_osd.len,
|
||||||
|
},
|
||||||
.buf = (void*)(op_data->unstable_writes + stab_osd.start),
|
.buf = (void*)(op_data->unstable_writes + stab_osd.start),
|
||||||
});
|
});
|
||||||
bs->enqueue_op(subops[i].bs_op);
|
bs->enqueue_op(subops[i].bs_op);
|
||||||
@@ -627,7 +662,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
|||||||
// Fail it immediately
|
// Fail it immediately
|
||||||
subops[i].peer_fd = -1;
|
subops[i].peer_fd = -1;
|
||||||
subops[i].reply.hdr.retval = -EPIPE;
|
subops[i].reply.hdr.retval = -EPIPE;
|
||||||
subops[i].callback(&subops[i]);
|
ringloop->set_immediate([subop = &subops[i]]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -240,7 +240,7 @@ resume_8:
|
|||||||
}
|
}
|
||||||
if (op_data->errors > 0)
|
if (op_data->errors > 0)
|
||||||
{
|
{
|
||||||
finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
|
finish_op(cur_op, op_data->errcode);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@@ -58,7 +58,13 @@ resume_1:
|
|||||||
// Determine blocks to read and write
|
// Determine blocks to read and write
|
||||||
// Missing chunks are allowed to be overwritten even in incomplete objects
|
// Missing chunks are allowed to be overwritten even in incomplete objects
|
||||||
// FIXME: Allow to do small writes to the old (degraded/misplaced) OSD set for lower performance impact
|
// FIXME: Allow to do small writes to the old (degraded/misplaced) OSD set for lower performance impact
|
||||||
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
|
op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
|
||||||
|
if (op_data->object_state)
|
||||||
|
{
|
||||||
|
// Protect object_state from being freed by a parallel read operation changing it
|
||||||
|
op_data->object_state->ref_count++;
|
||||||
|
}
|
||||||
|
retry_1:
|
||||||
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||||
{
|
{
|
||||||
// Simplified algorithm
|
// Simplified algorithm
|
||||||
@@ -68,6 +74,12 @@ resume_1:
|
|||||||
if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
|
if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
|
||||||
op_data->stripes[0].write_end != bs_block_size))
|
op_data->stripes[0].write_end != bs_block_size))
|
||||||
{
|
{
|
||||||
|
if (op_data->object_state->state & OBJ_INCOMPLETE)
|
||||||
|
{
|
||||||
|
// Refuse partial overwrite of an incomplete (corrupted) object
|
||||||
|
cur_op->reply.hdr.retval = -EIO;
|
||||||
|
goto continue_others;
|
||||||
|
}
|
||||||
// Object is degraded/misplaced and will be moved to <write_osd_set>
|
// Object is degraded/misplaced and will be moved to <write_osd_set>
|
||||||
op_data->stripes[0].read_start = 0;
|
op_data->stripes[0].read_start = 0;
|
||||||
op_data->stripes[0].read_end = bs_block_size;
|
op_data->stripes[0].read_end = bs_block_size;
|
||||||
@@ -81,24 +93,66 @@ resume_1:
|
|||||||
if (!cur_op->rmw_buf)
|
if (!cur_op->rmw_buf)
|
||||||
{
|
{
|
||||||
// Refuse partial overwrite of an incomplete object
|
// Refuse partial overwrite of an incomplete object
|
||||||
cur_op->reply.hdr.retval = -EINVAL;
|
cur_op->reply.hdr.retval = -EIO;
|
||||||
goto continue_others;
|
goto continue_others;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Read required blocks
|
// Read required blocks
|
||||||
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
|
{
|
||||||
|
if (op_data->object_state && (op_data->object_state->state & OBJ_INCOMPLETE))
|
||||||
|
{
|
||||||
|
// Allow to read version number (just version number!) from corrupted chunks
|
||||||
|
// to allow full overwrite of a corrupted object
|
||||||
|
bool found = false;
|
||||||
|
for (int role = 0; role < op_data->pg_size; role++)
|
||||||
|
{
|
||||||
|
if (op_data->prev_set[role] != 0 || op_data->stripes[role].read_end > op_data->stripes[role].read_start)
|
||||||
|
{
|
||||||
|
found = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!found)
|
||||||
|
{
|
||||||
|
osd_num_t corrupted_target[op_data->pg_size];
|
||||||
|
for (int role = 0; role < op_data->pg_size; role++)
|
||||||
|
{
|
||||||
|
corrupted_target[role] = 0;
|
||||||
|
}
|
||||||
|
for (auto & loc: op_data->object_state->osd_set)
|
||||||
|
{
|
||||||
|
if (!(loc.loc_bad & LOC_OUTDATED) && !corrupted_target[loc.role])
|
||||||
|
{
|
||||||
|
corrupted_target[loc.role] = loc.osd_num;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, corrupted_target, cur_op);
|
||||||
|
goto resume_2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
|
||||||
|
}
|
||||||
resume_2:
|
resume_2:
|
||||||
op_data->st = 2;
|
op_data->st = 2;
|
||||||
return;
|
return;
|
||||||
resume_3:
|
resume_3:
|
||||||
if (op_data->errors > 0)
|
if (op_data->errors > 0)
|
||||||
{
|
{
|
||||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
|
||||||
|
{
|
||||||
|
// Mark object corrupted and retry
|
||||||
|
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, true);
|
||||||
|
op_data->prev_set = op_data->object_state ? op_data->object_state->read_target.data() : pg.cur_set.data();
|
||||||
|
goto retry_1;
|
||||||
|
}
|
||||||
|
deref_object_state(pg, &op_data->object_state, true);
|
||||||
|
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// Check CAS version
|
// Check CAS version
|
||||||
if (cur_op->req.rw.version && op_data->fact_ver != (cur_op->req.rw.version-1))
|
if (cur_op->req.rw.version && op_data->fact_ver != (cur_op->req.rw.version-1))
|
||||||
{
|
{
|
||||||
|
deref_object_state(pg, &op_data->object_state, true);
|
||||||
cur_op->reply.hdr.retval = -EINTR;
|
cur_op->reply.hdr.retval = -EINTR;
|
||||||
cur_op->reply.rw.version = op_data->fact_ver;
|
cur_op->reply.rw.version = op_data->fact_ver;
|
||||||
goto continue_others;
|
goto continue_others;
|
||||||
@@ -155,17 +209,37 @@ resume_3:
|
|||||||
if (pg.epoch > pg.reported_epoch)
|
if (pg.epoch > pg.reported_epoch)
|
||||||
{
|
{
|
||||||
// Report newer epoch before writing
|
// Report newer epoch before writing
|
||||||
// FIXME: We may report only one PG state here...
|
// FIXME: We don't have to report all changed PG states here
|
||||||
this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
|
this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
|
||||||
|
if (pg.state != PG_ACTIVE)
|
||||||
|
{
|
||||||
|
// Check that current OSD set is in history and/or add it there
|
||||||
|
std::vector<osd_num_t> history_set;
|
||||||
|
for (auto peer_osd: pg.cur_set)
|
||||||
|
if (peer_osd != 0)
|
||||||
|
history_set.push_back(peer_osd);
|
||||||
|
std::sort(history_set.begin(), history_set.end());
|
||||||
|
auto it = std::lower_bound(pg.target_history.begin(), pg.target_history.end(), history_set);
|
||||||
|
if (it == pg.target_history.end() || *it != history_set)
|
||||||
|
pg.target_history.insert(it, history_set);
|
||||||
|
}
|
||||||
pg.history_changed = true;
|
pg.history_changed = true;
|
||||||
report_pg_states();
|
report_pg_states();
|
||||||
resume_10:
|
resume_10:
|
||||||
if (pg.epoch > pg.reported_epoch)
|
if (pg.epoch > pg.reported_epoch)
|
||||||
{
|
{
|
||||||
op_data->st = 10;
|
#define PG_EPOCH_WAIT_STATE 10
|
||||||
|
op_data->st = PG_EPOCH_WAIT_STATE;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Recheck PG state after reporting history - maybe it's already stopping/restarting
|
||||||
|
if (pg.state & (PG_STOPPING|PG_REPEERING))
|
||||||
|
{
|
||||||
|
deref_object_state(pg, &op_data->object_state, true);
|
||||||
|
pg_cancel_write_queue(pg, cur_op, op_data->oid, -EPIPE);
|
||||||
|
return;
|
||||||
|
}
|
||||||
submit_primary_subops(SUBMIT_WRITE, op_data->target_ver, pg.cur_set.data(), cur_op);
|
submit_primary_subops(SUBMIT_WRITE, op_data->target_ver, pg.cur_set.data(), cur_op);
|
||||||
resume_4:
|
resume_4:
|
||||||
op_data->st = 4;
|
op_data->st = 4;
|
||||||
@@ -178,7 +252,8 @@ resume_5:
|
|||||||
}
|
}
|
||||||
if (op_data->errors > 0)
|
if (op_data->errors > 0)
|
||||||
{
|
{
|
||||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
deref_object_state(pg, &op_data->object_state, true);
|
||||||
|
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (op_data->object_state)
|
if (op_data->object_state)
|
||||||
@@ -186,7 +261,7 @@ resume_5:
|
|||||||
// We must forget the unclean state of the object before deleting it
|
// We must forget the unclean state of the object before deleting it
|
||||||
// so the next reads don't accidentally read a deleted version
|
// so the next reads don't accidentally read a deleted version
|
||||||
// And it should be done at the same time as the removal of the version override
|
// And it should be done at the same time as the removal of the version override
|
||||||
remove_object_from_state(op_data->oid, op_data->object_state, pg);
|
remove_object_from_state(op_data->oid, &op_data->object_state, pg);
|
||||||
pg.clean_count++;
|
pg.clean_count++;
|
||||||
}
|
}
|
||||||
resume_6:
|
resume_6:
|
||||||
@@ -241,12 +316,12 @@ resume_7:
|
|||||||
copies_to_delete_after_sync_count++;
|
copies_to_delete_after_sync_count++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
free_object_state(pg, &op_data->object_state);
|
deref_object_state(pg, &op_data->object_state, true);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
|
submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
|
||||||
free_object_state(pg, &op_data->object_state);
|
deref_object_state(pg, &op_data->object_state, true);
|
||||||
if (op_data->n_subops > 0)
|
if (op_data->n_subops > 0)
|
||||||
{
|
{
|
||||||
resume_8:
|
resume_8:
|
||||||
@@ -255,7 +330,7 @@ resume_8:
|
|||||||
resume_9:
|
resume_9:
|
||||||
if (op_data->errors > 0)
|
if (op_data->errors > 0)
|
||||||
{
|
{
|
||||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -287,6 +362,50 @@ continue_others:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reacts to a change of the stored history of PG <pool_id>/<pg_num>.
//
// Does nothing if the PG is not present in `pgs`. Otherwise, when the local PG
// epoch is still ahead of the reported one and the epoch found in
// st_cli.pool_config has reached it, the stored epoch is remembered as reported
// and every queued write parked in PG_EPOCH_WAIT_STATE is continued.
void osd_t::on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num)
{
    auto found_pg = pgs.find({ .pool_id = pool_id, .pg_num = pg_num });
    if (found_pg == pgs.end())
    {
        return;
    }
    auto & pg = found_pg->second;
    // The stored epoch is only looked at when the local one is not reported yet
    if (!(pg.epoch > pg.reported_epoch &&
        st_cli.pool_config[pool_id].pg_config[pg_num].epoch >= pg.epoch))
    {
        return;
    }
    pg.reported_epoch = st_cli.pool_config[pool_id].pg_config[pg_num].epoch;
    // Collect the object IDs first and resume the writes in a separate pass
    // to prevent side effects while iterating over the write queue
    std::vector<object_id> waiting_oids;
    for (auto & queued: pg.write_queue)
    {
        if (queued.second->op_data->st == PG_EPOCH_WAIT_STATE)
        {
            waiting_oids.push_back(queued.first);
        }
    }
    for (auto & oid: waiting_oids)
    {
        // Look up both the PG and the operation again on every iteration
        // NOTE(review): presumably continue_primary_write() may change them - confirm
        auto cur_pg = pgs.find({ .pool_id = pool_id, .pg_num = pg_num });
        if (cur_pg == pgs.end())
        {
            continue;
        }
        auto & cur_queue = cur_pg->second.write_queue;
        auto op_it = cur_queue.find(oid);
        if (op_it == cur_queue.end() ||
            op_it->second->op_data->st != PG_EPOCH_WAIT_STATE)
        {
            continue;
        }
        continue_primary_write(op_it->second);
    }
}
|
||||||
|
|
||||||
bool osd_t::remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
|
bool osd_t::remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
|
||||||
{
|
{
|
||||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||||
@@ -337,7 +456,7 @@ resume_7:
|
|||||||
op_data->unstable_write_osds = NULL;
|
op_data->unstable_write_osds = NULL;
|
||||||
if (op_data->errors > 0)
|
if (op_data->errors > 0)
|
||||||
{
|
{
|
||||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
|
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
201
src/osd_rmw.cpp
201
src/osd_rmw.cpp
@@ -154,6 +154,8 @@ struct reed_sol_matrix_t
|
|||||||
int refs = 0;
|
int refs = 0;
|
||||||
int *je_data;
|
int *je_data;
|
||||||
uint8_t *isal_data;
|
uint8_t *isal_data;
|
||||||
|
// 32 bytes = 256/8 = max pg_size/8
|
||||||
|
std::map<std::array<uint8_t, 32>, void*> subdata;
|
||||||
std::map<reed_sol_erased_t, void*> decodings;
|
std::map<reed_sol_erased_t, void*> decodings;
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -194,6 +196,12 @@ void use_ec(int pg_size, int pg_minsize, bool use)
|
|||||||
free(rs_it->second.je_data);
|
free(rs_it->second.je_data);
|
||||||
if (rs_it->second.isal_data)
|
if (rs_it->second.isal_data)
|
||||||
free(rs_it->second.isal_data);
|
free(rs_it->second.isal_data);
|
||||||
|
for (auto sub_it = rs_it->second.subdata.begin(); sub_it != rs_it->second.subdata.end();)
|
||||||
|
{
|
||||||
|
void *data = sub_it->second;
|
||||||
|
rs_it->second.subdata.erase(sub_it++);
|
||||||
|
free(data);
|
||||||
|
}
|
||||||
for (auto dec_it = rs_it->second.decodings.begin(); dec_it != rs_it->second.decodings.end();)
|
for (auto dec_it = rs_it->second.decodings.begin(); dec_it != rs_it->second.decodings.end();)
|
||||||
{
|
{
|
||||||
void *data = dec_it->second;
|
void *data = dec_it->second;
|
||||||
@@ -294,6 +302,47 @@ static void* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size
|
|||||||
return dec_it->second;
|
return dec_it->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifndef WITH_ISAL
|
||||||
|
#define JERASURE_ALIGNMENT 16
|
||||||
|
|
||||||
|
// jerasure requires 16-byte alignment for SSE...
|
||||||
|
// FIXME: jerasure/gf-complete should probably be patched to automatically choose non-sse version for unaligned buffers
|
||||||
|
static void jerasure_matrix_encode_unaligned(int k, int m, int w, int *matrix, char **data_ptrs, char **coding_ptrs, int size)
|
||||||
|
{
|
||||||
|
bool unaligned = false;
|
||||||
|
for (int i = 0; i < k; i++)
|
||||||
|
if (((unsigned long)data_ptrs[i]) % JERASURE_ALIGNMENT)
|
||||||
|
unaligned = true;
|
||||||
|
for (int i = 0; i < m; i++)
|
||||||
|
if (((unsigned long)coding_ptrs[i]) % JERASURE_ALIGNMENT)
|
||||||
|
unaligned = true;
|
||||||
|
if (!unaligned)
|
||||||
|
{
|
||||||
|
jerasure_matrix_encode(k, m, w, matrix, data_ptrs, coding_ptrs, size);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
int aligned_size = ((size+JERASURE_ALIGNMENT-1)/JERASURE_ALIGNMENT)*JERASURE_ALIGNMENT;
|
||||||
|
int copy_size = aligned_size*(k+m);
|
||||||
|
char local_data[copy_size > 4096 ? 0 : copy_size];
|
||||||
|
char *data_copy = copy_size > 4096 || (unsigned long)local_data % JERASURE_ALIGNMENT
|
||||||
|
? (char*)memalign_or_die(JERASURE_ALIGNMENT, aligned_size*(k+m))
|
||||||
|
: local_data;
|
||||||
|
char *aligned_ptrs[k+m];
|
||||||
|
for (int i = 0; i < k; i++)
|
||||||
|
{
|
||||||
|
memcpy(data_copy + i*aligned_size, data_ptrs[i], size);
|
||||||
|
aligned_ptrs[i] = data_copy + i*aligned_size;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < m; i++)
|
||||||
|
aligned_ptrs[k+i] = data_copy + (k+i)*aligned_size;
|
||||||
|
jerasure_matrix_encode(k, m, w, matrix, aligned_ptrs, aligned_ptrs+k, size);
|
||||||
|
for (int i = 0; i < m; i++)
|
||||||
|
memcpy(coding_ptrs[i], aligned_ptrs[k+i], size);
|
||||||
|
if (copy_size > 4096 || (unsigned long)local_data % JERASURE_ALIGNMENT)
|
||||||
|
free(data_copy);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef WITH_ISAL
|
#ifdef WITH_ISAL
|
||||||
void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint32_t bitmap_size)
|
void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint32_t bitmap_size)
|
||||||
{
|
{
|
||||||
@@ -357,10 +406,12 @@ void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsi
|
|||||||
{
|
{
|
||||||
data_ptrs[role] = NULL;
|
data_ptrs[role] = NULL;
|
||||||
}
|
}
|
||||||
|
bool recovered = false;
|
||||||
for (int role = 0; role < pg_minsize; role++)
|
for (int role = 0; role < pg_minsize; role++)
|
||||||
{
|
{
|
||||||
if (stripes[role].read_end != 0 && stripes[role].missing)
|
if (stripes[role].read_end != 0 && stripes[role].missing)
|
||||||
{
|
{
|
||||||
|
recovered = true;
|
||||||
if (stripes[role].read_end > stripes[role].read_start)
|
if (stripes[role].read_end > stripes[role].read_start)
|
||||||
{
|
{
|
||||||
for (int other = 0; other < pg_size; other++)
|
for (int other = 0; other < pg_size; other++)
|
||||||
@@ -378,18 +429,64 @@ void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsi
|
|||||||
data_ptrs, data_ptrs+pg_minsize, stripes[role].read_end - stripes[role].read_start
|
data_ptrs, data_ptrs+pg_minsize, stripes[role].read_end - stripes[role].read_start
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
for (int other = 0; other < pg_size; other++)
|
}
|
||||||
|
}
|
||||||
|
if (recovered && bitmap_size > 0)
|
||||||
|
{
|
||||||
|
bool unaligned = false;
|
||||||
|
for (int role = 0; role < pg_size; role++)
|
||||||
|
{
|
||||||
|
if (stripes[role].read_end != 0)
|
||||||
{
|
{
|
||||||
if (stripes[other].read_end != 0 && !stripes[other].missing)
|
data_ptrs[role] = (char*)stripes[role].bmp_buf;
|
||||||
|
if (((unsigned long)stripes[role].bmp_buf) % JERASURE_ALIGNMENT)
|
||||||
|
unaligned = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!unaligned)
|
||||||
|
{
|
||||||
|
for (int role = 0; role < pg_minsize; role++)
|
||||||
|
{
|
||||||
|
if (stripes[role].read_end != 0 && stripes[role].missing)
|
||||||
{
|
{
|
||||||
data_ptrs[other] = (char*)(stripes[other].bmp_buf);
|
jerasure_matrix_dotprod(
|
||||||
|
pg_minsize, OSD_JERASURE_W, decoding_matrix+(role*pg_minsize), dm_ids, role,
|
||||||
|
data_ptrs, data_ptrs+pg_minsize, bitmap_size
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
data_ptrs[role] = (char*)stripes[role].bmp_buf;
|
}
|
||||||
jerasure_matrix_dotprod(
|
else
|
||||||
pg_minsize, OSD_JERASURE_W, decoding_matrix+(role*pg_minsize), dm_ids, role,
|
{
|
||||||
data_ptrs, data_ptrs+pg_minsize, bitmap_size
|
// jerasure_matrix_dotprod requires 16-byte alignment for SSE...
|
||||||
);
|
int aligned_size = ((bitmap_size+JERASURE_ALIGNMENT-1)/JERASURE_ALIGNMENT)*JERASURE_ALIGNMENT;
|
||||||
|
int copy_size = aligned_size*pg_size;
|
||||||
|
char local_data[copy_size > 4096 ? 0 : copy_size];
|
||||||
|
bool alloc_copy = copy_size > 4096 || (unsigned long)local_data % JERASURE_ALIGNMENT;
|
||||||
|
char *data_copy = alloc_copy
|
||||||
|
? (char*)memalign_or_die(JERASURE_ALIGNMENT, copy_size)
|
||||||
|
: local_data;
|
||||||
|
for (int role = 0; role < pg_size; role++)
|
||||||
|
{
|
||||||
|
if (stripes[role].read_end != 0)
|
||||||
|
{
|
||||||
|
data_ptrs[role] = data_copy + role*aligned_size;
|
||||||
|
memcpy(data_ptrs[role], stripes[role].bmp_buf, bitmap_size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int role = 0; role < pg_size; role++)
|
||||||
|
{
|
||||||
|
if (stripes[role].read_end != 0 && stripes[role].missing)
|
||||||
|
{
|
||||||
|
jerasure_matrix_dotprod(
|
||||||
|
pg_minsize, OSD_JERASURE_W, decoding_matrix+(role*pg_minsize), dm_ids, role,
|
||||||
|
data_ptrs, data_ptrs+pg_minsize, bitmap_size
|
||||||
|
);
|
||||||
|
memcpy(stripes[role].bmp_buf, data_ptrs[role], bitmap_size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (alloc_copy)
|
||||||
|
free(data_copy);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -662,7 +759,18 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
|
|||||||
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_granularity,
|
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_granularity,
|
||||||
uint32_t &start, uint32_t &end)
|
uint32_t &start, uint32_t &end)
|
||||||
{
|
{
|
||||||
if (write_osd_set[pg_minsize] != 0 || write_osd_set != read_osd_set)
|
bool required = false;
|
||||||
|
for (int role = pg_minsize; role < pg_size; role++)
|
||||||
|
{
|
||||||
|
if (write_osd_set[role] != 0)
|
||||||
|
{
|
||||||
|
// Whole parity chunk is needed when we move the object
|
||||||
|
if (write_osd_set[role] != read_osd_set[role])
|
||||||
|
end = chunk_size;
|
||||||
|
required = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (required && end != chunk_size)
|
||||||
{
|
{
|
||||||
// start & end are required for calc_rmw_parity
|
// start & end are required for calc_rmw_parity
|
||||||
for (int role = 0; role < pg_minsize; role++)
|
for (int role = 0; role < pg_minsize; role++)
|
||||||
@@ -673,14 +781,6 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
|
|||||||
end = std::max(stripes[role].req_end, end);
|
end = std::max(stripes[role].req_end, end);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (int role = pg_minsize; role < pg_size; role++)
|
|
||||||
{
|
|
||||||
if (write_osd_set[role] != 0 && write_osd_set[role] != read_osd_set[role])
|
|
||||||
{
|
|
||||||
start = 0;
|
|
||||||
end = chunk_size;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
// Set bitmap bits accordingly
|
// Set bitmap bits accordingly
|
||||||
if (bitmap_granularity > 0)
|
if (bitmap_granularity > 0)
|
||||||
@@ -808,11 +908,56 @@ void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
|
|||||||
if (end != 0)
|
if (end != 0)
|
||||||
{
|
{
|
||||||
int write_parity = 0;
|
int write_parity = 0;
|
||||||
for (int i = pg_minsize; i < pg_size; i++)
|
bool is_seq = true;
|
||||||
|
for (int i = pg_size-1; i >= pg_minsize; i--)
|
||||||
|
{
|
||||||
if (write_osd_set[i] != 0)
|
if (write_osd_set[i] != 0)
|
||||||
write_parity++;
|
write_parity++;
|
||||||
|
else if (write_parity != 0)
|
||||||
|
is_seq = false;
|
||||||
|
}
|
||||||
if (write_parity > 0)
|
if (write_parity > 0)
|
||||||
{
|
{
|
||||||
|
// First get the coding matrix or sub-matrix
|
||||||
|
void *matrix_data =
|
||||||
|
#ifdef WITH_ISAL
|
||||||
|
matrix->isal_data;
|
||||||
|
#else
|
||||||
|
matrix->je_data;
|
||||||
|
#endif
|
||||||
|
if (!is_seq)
|
||||||
|
{
|
||||||
|
// We need a coding sub-matrix
|
||||||
|
std::array<uint8_t, 32> missing_parity = {};
|
||||||
|
for (int i = pg_minsize; i < pg_size; i++)
|
||||||
|
{
|
||||||
|
if (!write_osd_set[i])
|
||||||
|
missing_parity[(i-pg_minsize) >> 3] |= (1 << ((i-pg_minsize) & 0x7));
|
||||||
|
}
|
||||||
|
auto sub_it = matrix->subdata.find(missing_parity);
|
||||||
|
if (sub_it == matrix->subdata.end())
|
||||||
|
{
|
||||||
|
int item_size =
|
||||||
|
#ifdef WITH_ISAL
|
||||||
|
32;
|
||||||
|
#else
|
||||||
|
sizeof(int);
|
||||||
|
#endif
|
||||||
|
void *subm = malloc_or_die(item_size * write_parity * pg_minsize);
|
||||||
|
for (int i = pg_minsize, j = 0; i < pg_size; i++)
|
||||||
|
{
|
||||||
|
if (write_osd_set[i])
|
||||||
|
{
|
||||||
|
memcpy((uint8_t*)subm + item_size*pg_minsize*j, (uint8_t*)matrix_data + item_size*pg_minsize*(i-pg_minsize), item_size*pg_minsize);
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
matrix->subdata[missing_parity] = subm;
|
||||||
|
matrix_data = subm;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
matrix_data = sub_it->second;
|
||||||
|
}
|
||||||
// Calculate new coding chunks
|
// Calculate new coding chunks
|
||||||
buf_len_t bufs[pg_size][3];
|
buf_len_t bufs[pg_size][3];
|
||||||
int nbuf[pg_size], curbuf[pg_size];
|
int nbuf[pg_size], curbuf[pg_size];
|
||||||
@@ -841,13 +986,13 @@ void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
|
|||||||
while (pos < end)
|
while (pos < end)
|
||||||
{
|
{
|
||||||
uint32_t next_end = end;
|
uint32_t next_end = end;
|
||||||
for (int i = 0; i < pg_size; i++)
|
for (int i = 0, j = 0; i < pg_size; i++)
|
||||||
{
|
{
|
||||||
if (i < pg_minsize || write_osd_set[i] != 0)
|
if (i < pg_minsize || write_osd_set[i] != 0)
|
||||||
{
|
{
|
||||||
assert(curbuf[i] < nbuf[i]);
|
assert(curbuf[i] < nbuf[i]);
|
||||||
assert(bufs[i][curbuf[i]].buf);
|
assert(bufs[i][curbuf[i]].buf);
|
||||||
data_ptrs[i] = (uint8_t*)bufs[i][curbuf[i]].buf + pos-positions[i];
|
data_ptrs[j++] = (uint8_t*)bufs[i][curbuf[i]].buf + pos-positions[i];
|
||||||
uint32_t this_end = bufs[i][curbuf[i]].len + positions[i];
|
uint32_t this_end = bufs[i][curbuf[i]].len + positions[i];
|
||||||
if (next_end > this_end)
|
if (next_end > this_end)
|
||||||
next_end = this_end;
|
next_end = this_end;
|
||||||
@@ -868,32 +1013,30 @@ void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
|
|||||||
}
|
}
|
||||||
#ifdef WITH_ISAL
|
#ifdef WITH_ISAL
|
||||||
ec_encode_data(
|
ec_encode_data(
|
||||||
next_end-pos, pg_minsize, write_parity, matrix->isal_data,
|
next_end-pos, pg_minsize, write_parity, (uint8_t*)matrix_data,
|
||||||
(uint8_t**)data_ptrs, (uint8_t**)data_ptrs+pg_minsize
|
(uint8_t**)data_ptrs, (uint8_t**)data_ptrs+pg_minsize
|
||||||
);
|
);
|
||||||
#else
|
#else
|
||||||
jerasure_matrix_encode(
|
jerasure_matrix_encode(
|
||||||
pg_minsize, write_parity, OSD_JERASURE_W, matrix->je_data,
|
pg_minsize, write_parity, OSD_JERASURE_W, (int*)matrix_data,
|
||||||
(char**)data_ptrs, (char**)data_ptrs+pg_minsize, next_end-pos
|
(char**)data_ptrs, (char**)data_ptrs+pg_minsize, next_end-pos
|
||||||
);
|
);
|
||||||
#endif
|
#endif
|
||||||
pos = next_end;
|
pos = next_end;
|
||||||
}
|
}
|
||||||
for (int i = 0; i < pg_size; i++)
|
for (int i = 0, j = 0; i < pg_size; i++)
|
||||||
{
|
{
|
||||||
if (i < pg_minsize || write_osd_set[i] != 0)
|
if (i < pg_minsize || write_osd_set[i] != 0)
|
||||||
{
|
data_ptrs[j++] = stripes[i].bmp_buf;
|
||||||
data_ptrs[i] = stripes[i].bmp_buf;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#ifdef WITH_ISAL
|
#ifdef WITH_ISAL
|
||||||
ec_encode_data(
|
ec_encode_data(
|
||||||
bitmap_size, pg_minsize, write_parity, matrix->isal_data,
|
bitmap_size, pg_minsize, write_parity, (uint8_t*)matrix_data,
|
||||||
(uint8_t**)data_ptrs, (uint8_t**)data_ptrs+pg_minsize
|
(uint8_t**)data_ptrs, (uint8_t**)data_ptrs+pg_minsize
|
||||||
);
|
);
|
||||||
#else
|
#else
|
||||||
jerasure_matrix_encode(
|
jerasure_matrix_encode_unaligned(
|
||||||
pg_minsize, write_parity, OSD_JERASURE_W, matrix->je_data,
|
pg_minsize, write_parity, OSD_JERASURE_W, (int*)matrix_data,
|
||||||
(char**)data_ptrs, (char**)data_ptrs+pg_minsize, bitmap_size
|
(char**)data_ptrs, (char**)data_ptrs+pg_minsize, bitmap_size
|
||||||
);
|
);
|
||||||
#endif
|
#endif
|
||||||
|
@@ -25,7 +25,9 @@ struct osd_rmw_stripe_t
|
|||||||
uint32_t req_start, req_end;
|
uint32_t req_start, req_end;
|
||||||
uint32_t read_start, read_end;
|
uint32_t read_start, read_end;
|
||||||
uint32_t write_start, write_end;
|
uint32_t write_start, write_end;
|
||||||
bool missing;
|
osd_num_t osd_num;
|
||||||
|
bool missing: 1;
|
||||||
|
bool read_error: 1;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Here pg_minsize is the number of data chunks, not the minimum number of alive OSDs for the PG to operate
|
// Here pg_minsize is the number of data chunks, not the minimum number of alive OSDs for the PG to operate
|
||||||
|
@@ -3,6 +3,10 @@
|
|||||||
|
|
||||||
#define RMW_DEBUG
|
#define RMW_DEBUG
|
||||||
|
|
||||||
|
#ifdef NO_ISAL
|
||||||
|
#undef WITH_ISAL
|
||||||
|
#endif
|
||||||
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "osd_rmw.cpp"
|
#include "osd_rmw.cpp"
|
||||||
#include "test_pattern.h"
|
#include "test_pattern.h"
|
||||||
@@ -20,7 +24,8 @@ void test11();
|
|||||||
void test12();
|
void test12();
|
||||||
void test13();
|
void test13();
|
||||||
void test14();
|
void test14();
|
||||||
void test15();
|
void test15(bool second);
|
||||||
|
void test16();
|
||||||
|
|
||||||
int main(int narg, char *args[])
|
int main(int narg, char *args[])
|
||||||
{
|
{
|
||||||
@@ -49,7 +54,10 @@ int main(int narg, char *args[])
|
|||||||
// Test 14
|
// Test 14
|
||||||
test14();
|
test14();
|
||||||
// Test 15
|
// Test 15
|
||||||
test15();
|
test15(false);
|
||||||
|
test15(true);
|
||||||
|
// Test 16
|
||||||
|
test16();
|
||||||
// End
|
// End
|
||||||
printf("all ok\n");
|
printf("all ok\n");
|
||||||
return 0;
|
return 0;
|
||||||
@@ -819,12 +827,11 @@ void test14()
|
|||||||
|
|
||||||
***/
|
***/
|
||||||
|
|
||||||
void test15()
|
void test15(bool second)
|
||||||
{
|
{
|
||||||
const int bmp = 64*1024 / 4096 / 8;
|
const int bmp = 64*1024 / 4096 / 8;
|
||||||
use_ec(4, 2, true);
|
use_ec(4, 2, true);
|
||||||
osd_num_t osd_set[4] = { 1, 2, 3, 0 };
|
osd_num_t osd_set[4] = { 1, 2, (osd_num_t)(second ? 0 : 3), (osd_num_t)(second ? 4 : 0) };
|
||||||
osd_num_t write_osd_set[4] = { 1, 2, 3, 0 };
|
|
||||||
osd_rmw_stripe_t stripes[4] = {};
|
osd_rmw_stripe_t stripes[4] = {};
|
||||||
unsigned bitmaps[4] = { 0 };
|
unsigned bitmaps[4] = { 0 };
|
||||||
// Test 15.0
|
// Test 15.0
|
||||||
@@ -835,7 +842,7 @@ void test15()
|
|||||||
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
|
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
|
||||||
assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
|
assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
|
||||||
// Test 15.1
|
// Test 15.1
|
||||||
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, write_osd_set, 64*1024, bmp);
|
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, osd_set, 64*1024, bmp);
|
||||||
for (int i = 0; i < 4; i++)
|
for (int i = 0; i < 4; i++)
|
||||||
stripes[i].bmp_buf = bitmaps+i;
|
stripes[i].bmp_buf = bitmaps+i;
|
||||||
assert(rmw_buf);
|
assert(rmw_buf);
|
||||||
@@ -845,34 +852,139 @@ void test15()
|
|||||||
assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
|
assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
|
||||||
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
|
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
|
||||||
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
|
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
|
||||||
assert(stripes[2].write_start == 28*1024 && stripes[2].write_end == 32*1024);
|
assert(stripes[2+second].write_start == 28*1024 && stripes[2+second].write_end == 32*1024);
|
||||||
assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
|
assert(stripes[3-second].write_start == 0 && stripes[3-second].write_end == 0);
|
||||||
assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
|
assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
|
||||||
assert(stripes[1].read_buf == NULL);
|
assert(stripes[1].read_buf == NULL);
|
||||||
assert(stripes[2].read_buf == NULL);
|
assert(stripes[2].read_buf == NULL);
|
||||||
assert(stripes[3].read_buf == NULL);
|
assert(stripes[3].read_buf == NULL);
|
||||||
assert(stripes[0].write_buf == NULL);
|
assert(stripes[0].write_buf == NULL);
|
||||||
assert(stripes[1].write_buf == (uint8_t*)write_buf);
|
assert(stripes[1].write_buf == (uint8_t*)write_buf);
|
||||||
assert(stripes[2].write_buf == rmw_buf);
|
assert(stripes[2+second].write_buf == rmw_buf);
|
||||||
assert(stripes[3].write_buf == NULL);
|
assert(stripes[3-second].write_buf == NULL);
|
||||||
// Test 15.2 - encode
|
// Test 15.2 - encode
|
||||||
set_pattern(write_buf, 4*1024, PATTERN1);
|
set_pattern(write_buf, 4*1024, PATTERN1);
|
||||||
set_pattern(stripes[0].read_buf, 4*1024, PATTERN2);
|
set_pattern(stripes[0].read_buf, 4*1024, PATTERN2);
|
||||||
memset(stripes[0].bmp_buf, 0, bmp);
|
memset(stripes[0].bmp_buf, 0, bmp);
|
||||||
memset(stripes[1].bmp_buf, 0, bmp);
|
memset(stripes[1].bmp_buf, 0, bmp);
|
||||||
calc_rmw_parity_ec(stripes, 4, 2, osd_set, write_osd_set, 64*1024, bmp);
|
memset(stripes[2+second].write_buf, 0, 4096);
|
||||||
assert(*(uint32_t*)stripes[2].bmp_buf == 0x80);
|
calc_rmw_parity_ec(stripes, 4, 2, osd_set, osd_set, 64*1024, bmp);
|
||||||
|
assert(second || *(uint32_t*)stripes[2].bmp_buf == 0x80);
|
||||||
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
|
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
|
||||||
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
|
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
|
||||||
assert(stripes[2].write_start == 28*1024 && stripes[2].write_end == 32*1024);
|
assert(stripes[2+second].write_start == 28*1024 && stripes[2+second].write_end == 32*1024);
|
||||||
assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
|
assert(stripes[3-second].write_start == 0 && stripes[3-second].write_end == 0);
|
||||||
assert(stripes[0].write_buf == NULL);
|
assert(stripes[0].write_buf == NULL);
|
||||||
assert(stripes[1].write_buf == (uint8_t*)write_buf);
|
assert(stripes[1].write_buf == (uint8_t*)write_buf);
|
||||||
assert(stripes[2].write_buf == rmw_buf);
|
assert(stripes[2+second].write_buf == rmw_buf);
|
||||||
assert(stripes[3].write_buf == NULL);
|
assert(stripes[3-second].write_buf == NULL);
|
||||||
check_pattern(stripes[2].write_buf, 4*1024, PATTERN1^PATTERN2); // first parity is always xor :)
|
// first parity is always xor :), second isn't...
|
||||||
|
check_pattern(stripes[2+second].write_buf, 4*1024, second ? 0xb79a59a0ce8b9b81 : PATTERN1^PATTERN2);
|
||||||
// Done
|
// Done
|
||||||
free(rmw_buf);
|
free(rmw_buf);
|
||||||
free(write_buf);
|
free(write_buf);
|
||||||
use_ec(3, 2, false);
|
use_ec(4, 2, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
/***
|
||||||
|
|
||||||
|
16. EC 2+2 write one parity block with another missing
|
||||||
|
calc_rmw(offset=0, len=0, osd_set=[1,2,0,0], write_set=[1,2,0,3])
|
||||||
|
= {
|
||||||
|
read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
|
||||||
|
write: [ [ 0, 0 ], [ 0, 0 ], [ 0, 0 ], [ 0, 128K ] ],
|
||||||
|
input buffer: [],
|
||||||
|
rmw buffer: [ write3, read0, read1 ],
|
||||||
|
}
|
||||||
|
|
||||||
|
***/
|
||||||
|
|
||||||
|
void test16()
|
||||||
|
{
|
||||||
|
const int bmp = 128*1024 / 4096 / 8;
|
||||||
|
use_ec(4, 2, true);
|
||||||
|
osd_num_t osd_set[4] = { 1, 2, 0, 0 };
|
||||||
|
osd_num_t write_osd_set[4] = { 1, 2, 0, 3 };
|
||||||
|
osd_rmw_stripe_t stripes[4] = {};
|
||||||
|
unsigned bitmaps[4] = { 0 };
|
||||||
|
// Test 16.0
|
||||||
|
void *write_buf = NULL;
|
||||||
|
split_stripes(2, 128*1024, 0, 0, stripes);
|
||||||
|
assert(stripes[0].req_start == 0 && stripes[0].req_end == 0);
|
||||||
|
assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
|
||||||
|
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
|
||||||
|
assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
|
||||||
|
// Test 16.1
|
||||||
|
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, write_osd_set, 128*1024, bmp);
|
||||||
|
for (int i = 0; i < 4; i++)
|
||||||
|
stripes[i].bmp_buf = bitmaps+i;
|
||||||
|
assert(rmw_buf);
|
||||||
|
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
|
||||||
|
assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
|
||||||
|
assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
|
||||||
|
assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
|
||||||
|
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
|
||||||
|
assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
|
||||||
|
assert(stripes[2].write_start == 0 && stripes[2].write_end == 0);
|
||||||
|
assert(stripes[3].write_start == 0 && stripes[3].write_end == 128*1024);
|
||||||
|
assert(stripes[0].read_buf == (uint8_t*)rmw_buf+128*1024);
|
||||||
|
assert(stripes[1].read_buf == (uint8_t*)rmw_buf+256*1024);
|
||||||
|
assert(stripes[2].read_buf == NULL);
|
||||||
|
assert(stripes[3].read_buf == NULL);
|
||||||
|
assert(stripes[0].write_buf == NULL);
|
||||||
|
assert(stripes[1].write_buf == NULL);
|
||||||
|
assert(stripes[2].write_buf == NULL);
|
||||||
|
assert(stripes[3].write_buf == rmw_buf);
|
||||||
|
// Test 16.2 - encode
|
||||||
|
set_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
|
||||||
|
set_pattern(stripes[1].read_buf, 128*1024, PATTERN2);
|
||||||
|
memset(stripes[0].bmp_buf, 0xff, bmp);
|
||||||
|
memset(stripes[1].bmp_buf, 0xff, bmp);
|
||||||
|
calc_rmw_parity_ec(stripes, 4, 2, osd_set, write_osd_set, 128*1024, bmp);
|
||||||
|
assert(*(uint32_t*)stripes[2].bmp_buf == 0);
|
||||||
|
assert(*(uint32_t*)stripes[3].bmp_buf == 0xF1F1F1F1);
|
||||||
|
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
|
||||||
|
assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
|
||||||
|
assert(stripes[2].write_start == 0 && stripes[2].write_end == 0);
|
||||||
|
assert(stripes[3].write_start == 0 && stripes[3].write_end == 128*1024);
|
||||||
|
assert(stripes[0].write_buf == NULL);
|
||||||
|
assert(stripes[1].write_buf == NULL);
|
||||||
|
assert(stripes[2].write_buf == NULL);
|
||||||
|
assert(stripes[3].write_buf == rmw_buf);
|
||||||
|
check_pattern(stripes[3].write_buf, 128*1024, 0x7eb9ae9cd8e652c3); // 2nd EC chunk
|
||||||
|
// Test 16.3 - decode and verify
|
||||||
|
osd_num_t read_osd_set[4] = { 0, 2, 0, 3 };
|
||||||
|
memset(stripes, 0, sizeof(stripes));
|
||||||
|
split_stripes(2, 128*1024, 0, 256*1024, stripes);
|
||||||
|
assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024);
|
||||||
|
assert(stripes[1].req_start == 0 && stripes[1].req_end == 128*1024);
|
||||||
|
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
|
||||||
|
assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
|
||||||
|
for (int role = 0; role < 4; role++)
|
||||||
|
{
|
||||||
|
stripes[role].read_start = stripes[role].req_start;
|
||||||
|
stripes[role].read_end = stripes[role].req_end;
|
||||||
|
}
|
||||||
|
assert(extend_missing_stripes(stripes, read_osd_set, 2, 4) == 0);
|
||||||
|
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
|
||||||
|
assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
|
||||||
|
assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
|
||||||
|
assert(stripes[3].read_start == 0 && stripes[3].read_end == 128*1024);
|
||||||
|
void *read_buf = alloc_read_buffer(stripes, 4, 0);
|
||||||
|
for (int i = 0; i < 4; i++)
|
||||||
|
stripes[i].bmp_buf = bitmaps+i;
|
||||||
|
assert(read_buf);
|
||||||
|
assert(stripes[0].read_buf == read_buf);
|
||||||
|
assert(stripes[1].read_buf == (uint8_t*)read_buf+128*1024);
|
||||||
|
assert(stripes[3].read_buf == (uint8_t*)read_buf+2*128*1024);
|
||||||
|
set_pattern(stripes[1].read_buf, 128*1024, PATTERN2);
|
||||||
|
memcpy(stripes[3].read_buf, rmw_buf, 128*1024);
|
||||||
|
reconstruct_stripes_ec(stripes, 4, 2, bmp);
|
||||||
|
assert(bitmaps[0] == 0xFFFFFFFF);
|
||||||
|
check_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
|
||||||
|
free(read_buf);
|
||||||
|
// Done
|
||||||
|
free(rmw_buf);
|
||||||
|
free(write_buf);
|
||||||
|
use_ec(4, 2, false);
|
||||||
}
|
}
|
||||||
|
531
src/osd_scrub.cpp
Normal file
531
src/osd_scrub.cpp
Normal file
@@ -0,0 +1,531 @@
|
|||||||
|
// Copyright (c) Vitaliy Filippov, 2019+
|
||||||
|
// License: VNPL-1.1 (see README.md for details)
|
||||||
|
|
||||||
|
#include "osd_primary.h"
|
||||||
|
|
||||||
|
#define SELF_FD -1
|
||||||
|
|
||||||
|
// Request a listing of objects of PG <pg_id> from OSD <role_osd>, starting at
// <min_oid> (or at the beginning of the pool when <min_oid> is all zeroes).
// The listing is requested from the local blockstore when <role_osd> is this OSD,
// otherwise from the peer OSD over the network. On success the result is stored
// in scrub_cur_list and continue_scrub() is called. Only one listing may be
// in flight at a time (tracked by scrub_list_op).
void osd_t::scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oid)
{
    const pool_id_t pool_id = pg_id.pool_id;
    const pg_num_t pg_num = pg_id.pg_num;
    // Inode number range of the pool: pool ID occupies the upper POOL_ID_BITS bits
    const uint64_t pool_first_inode = ((uint64_t)pool_id << (64 - POOL_ID_BITS));
    const uint64_t pool_last_inode = ((uint64_t)(pool_id+1) << (64 - POOL_ID_BITS)) - 1;
    assert(!scrub_list_op);
    if (role_osd != this->osd_num)
    {
        // Peer
        osd_op_t *peer_op = new osd_op_t();
        peer_op->op_type = OSD_OP_OUT;
        peer_op->peer_fd = msgr.osd_peer_fds.at(role_osd);
        peer_op->req = (osd_any_op_t){
            .sec_list = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
                    .id = msgr.next_subop_id++,
                    .opcode = OSD_OP_SEC_LIST,
                },
                .list_pg = pg_num,
                .pg_count = pg_counts[pool_id],
                .pg_stripe_size = st_cli.pool_config[pool_id].pg_stripe_size,
                .min_inode = min_oid.inode ? min_oid.inode : pool_first_inode,
                .max_inode = pool_last_inode,
                .min_stripe = min_oid.stripe,
                .stable_limit = scrub_list_limit,
            },
        };
        peer_op->callback = [this, role_osd](osd_op_t *op)
        {
            scrub_list_op = NULL;
            if (op->reply.hdr.retval >= 0)
            {
                scrub_cur_list = {
                    .buf = (obj_ver_id*)op->buf,
                    .total_count = (uint64_t)op->reply.hdr.retval,
                    .stable_count = op->reply.sec_list.stable_count,
                };
                // The buffer now belongs to scrub_cur_list - detach it so it doesn't get freed with the op
                op->buf = NULL;
                delete op;
                continue_scrub();
                return;
            }
            printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, op->reply.hdr.retval);
            // Remember the descriptor before the op is destroyed
            int fail_fd = op->peer_fd;
            delete op;
            msgr.stop_client(fail_fd);
        };
        scrub_list_op = peer_op;
        msgr.outbox_push(peer_op);
        return;
    }
    // Self
    osd_op_t *local_op = new osd_op_t();
    local_op->op_type = 0;
    local_op->peer_fd = SELF_FD;
    clock_gettime(CLOCK_REALTIME, &local_op->tv_begin);
    blockstore_op_t *list_op = new blockstore_op_t();
    local_op->bs_op = list_op;
    list_op->opcode = BS_OP_LIST;
    list_op->pg_alignment = st_cli.pool_config[pool_id].pg_stripe_size;
    if (min_oid.inode == 0 && min_oid.stripe == 0)
        list_op->min_oid.inode = pool_first_inode;
    else
        list_op->min_oid = min_oid;
    list_op->max_oid.inode = pool_last_inode;
    list_op->max_oid.stripe = UINT64_MAX;
    list_op->list_stable_limit = scrub_list_limit;
    list_op->pg_count = pg_counts[pool_id];
    // Blockstore PG numbers are zero-based
    list_op->pg_number = pg_num-1;
    list_op->callback = [this, local_op](blockstore_op_t *bs_op)
    {
        scrub_list_op = NULL;
        blockstore_op_t *done = local_op->bs_op;
        if (done->retval < 0)
        {
            printf("Local OP_LIST failed: retval=%d\n", done->retval);
            force_stop(1);
            return;
        }
        add_bs_subop_stats(local_op);
        // For BS_OP_LIST, retval carries the total entry count and version the stable entry count
        scrub_cur_list = {
            .buf = (obj_ver_id*)done->buf,
            .total_count = (uint64_t)done->retval,
            .stable_count = done->version,
        };
        delete local_op->bs_op;
        local_op->bs_op = NULL;
        delete local_op;
        continue_scrub();
    };
    scrub_list_op = local_op;
    bs->enqueue_op(local_op->bs_op);
}
|
||||||
|
|
||||||
|
// Select the next piece of scrub work.
//
// Walks PGs starting from the last scrubbed one (scrub_last_pg), looking for
// an active PG whose scrub_ts is older than its scrub interval. For such a PG:
// - if the cached listing (scrub_cur_list) still has unprocessed entries,
//   stores the next object in <next_oid> and returns true;
// - otherwise, unless the listing is exhausted, starts loading the next portion
//   of the listing with scrub_list(), zeroes <next_oid> and returns true;
// - if the listing is exhausted, marks the PG as scrubbed and proceeds to the next PG.
//
// Returns false when there's nothing left to scrub.
// Callers must check scrub_list_op (or <next_oid>) to distinguish
// "object picked" from "listing started" when true is returned.
bool osd_t::pick_next_scrub(object_id & next_oid)
{
    if (!pgs.size())
    {
        // No PGs - drop the cached listing
        if (scrub_cur_list.buf)
        {
            free(scrub_cur_list.buf);
            scrub_cur_list = {};
            scrub_last_pg = {};
        }
        return false;
    }
    timespec tv_now;
    clock_gettime(CLOCK_REALTIME, &tv_now);
    // If we start in the middle of the PG map, we have to wrap around once
    bool rescan = scrub_last_pg.pool_id != 0 || scrub_last_pg.pg_num != 0;
    // Restart scanning from the same PG as the last time
    auto pg_it = pgs.lower_bound(scrub_last_pg);
    if (pg_it == pgs.end() && rescan)
    {
        // The last scrubbed PG is past all existing PGs - wrap around immediately,
        // otherwise the loop below wouldn't look at any PG at all
        pg_it = pgs.begin();
        rescan = false;
    }
    while (pg_it != pgs.end())
    {
        if (pg_it->second.state & PG_ACTIVE)
        {
            auto & pool_cfg = st_cli.pool_config.at(pg_it->first.pool_id);
            auto interval = pool_cfg.scrub_interval ? pool_cfg.scrub_interval : global_scrub_interval;
            if (pg_it->second.scrub_ts < tv_now.tv_sec-interval)
            {
                // Continue scrubbing from the next object
                if (scrub_last_pg == pg_it->first)
                {
                    while (scrub_list_pos < scrub_cur_list.total_count)
                    {
                        auto oid = scrub_cur_list.buf[scrub_list_pos].oid;
                        // Listing entries carry the chunk role in the lower stripe bits, strip it
                        oid.stripe &= ~STRIPE_MASK;
                        scrub_list_pos++;
                        // Skip objects which are already being recovered or scrubbed
                        if (recovery_ops.find(oid) == recovery_ops.end() &&
                            scrub_ops.find(oid) == scrub_ops.end())
                        {
                            next_oid = oid;
                            if (!(pg_it->second.state & PG_SCRUBBING))
                            {
                                // Currently scrubbing this PG
                                pg_it->second.state = pg_it->second.state | PG_SCRUBBING;
                                report_pg_state(pg_it->second);
                            }
                            return true;
                        }
                    }
                }
                // NOTE(review): an empty listing (total_count == 0) also takes the
                // "continue listing" branch - confirm that an empty PG can't be re-listed endlessly
                if (scrub_last_pg == pg_it->first &&
                    scrub_cur_list.total_count && scrub_list_pos >= scrub_cur_list.total_count &&
                    scrub_cur_list.stable_count < scrub_list_limit)
                {
                    // End of the list, mark this PG as scrubbed and go to the next PG
                }
                else
                {
                    // Continue listing
                    // Zero means "start from the beginning of the PG"
                    object_id scrub_last_oid = {};
                    if (scrub_last_pg == pg_it->first && scrub_cur_list.stable_count > 0)
                    {
                        // Resume right after the last stable object of the previous portion
                        scrub_last_oid = scrub_cur_list.buf[scrub_cur_list.stable_count-1].oid;
                        scrub_last_oid.stripe++;
                    }
                    // Prefer listing from the local OSD, otherwise take the first OSD of the PG
                    osd_num_t scrub_osd = 0;
                    for (osd_num_t pg_osd: pg_it->second.cur_set)
                    {
                        if (pg_osd == this->osd_num || scrub_osd == 0)
                            scrub_osd = pg_osd;
                    }
                    if (!(pg_it->second.state & PG_SCRUBBING))
                    {
                        // Currently scrubbing this PG
                        pg_it->second.state = pg_it->second.state | PG_SCRUBBING;
                        report_pg_state(pg_it->second);
                    }
                    if (scrub_cur_list.buf)
                    {
                        free(scrub_cur_list.buf);
                        scrub_cur_list = {};
                    }
                    // The new portion must be processed from its first entry.
                    // The resume position (scrub_last_oid) must be kept intact
                    scrub_list_pos = 0;
                    scrub_last_pg = pg_it->first;
                    // No object is picked in this case - don't leave the output undefined
                    next_oid = {};
                    scrub_list(pg_it->first, scrub_osd, scrub_last_oid);
                    return true;
                }
            }
            if (pg_it->second.state & PG_SCRUBBING)
            {
                // Scrub of this PG is finished: remember the time and schedule the next run
                pg_it->second.scrub_ts = tv_now.tv_sec;
                pg_it->second.state = pg_it->second.state & ~PG_SCRUBBING;
                pg_it->second.history_changed = true;
                report_pg_state(pg_it->second);
                schedule_scrub(pg_it->second);
            }
            // The list is definitely not needed anymore
            if (scrub_cur_list.buf)
            {
                free(scrub_cur_list.buf);
                scrub_cur_list = {};
            }
        }
        pg_it++;
        if (pg_it == pgs.end() && rescan)
        {
            // Scan one more time to guarantee that there are no PGs to scrub
            pg_it = pgs.begin();
            rescan = false;
        }
    }
    // Scanned all PGs - no more scrubs to do
    return false;
}
|
||||||
|
|
||||||
|
// Create an OSD_OP_SCRUB operation for object <oid>, register it in scrub_ops
// and execute it through exec_op(). When it completes, the result is logged,
// the object is removed from scrub_ops (after scrub_sleep_ms if it's non-zero)
// and continue_scrub() is called to pick more work.
void osd_t::submit_scrub_op(object_id oid)
{
    osd_op_t *scrub_op = new osd_op_t();
    scrub_op->op_type = OSD_OP_OUT;
    scrub_op->req = (osd_any_op_t){
        .rw = {
            .header = {
                .magic = SECONDARY_OSD_OP_MAGIC,
                .id = 1,
                .opcode = OSD_OP_SCRUB,
            },
            .inode = oid.inode,
            .offset = oid.stripe,
            .len = 0,
        },
    };
    if (log_level > 2)
        printf("Submitting scrub for %lx:%lx\n", oid.inode, oid.stripe);
    scrub_op->callback = [this](osd_op_t *done_op)
    {
        // Only `this` is captured, so the object ID is taken back from the request
        const object_id done_oid = { .inode = done_op->req.rw.inode, .stripe = done_op->req.rw.offset };
        const auto retval = done_op->reply.hdr.retval;
        // -ENOENT is not reported as a scrub failure
        const bool failed = retval < 0 && retval != -ENOENT;
        if (failed)
        {
            // Scrub error
            printf(
                "Scrub failed with object %lx:%lx (PG %u/%u): error %ld\n",
                done_oid.inode, done_oid.stripe, INODE_POOL(done_oid.inode),
                map_to_pg(done_oid, st_cli.pool_config.at(INODE_POOL(done_oid.inode)).pg_stripe_size),
                retval
            );
        }
        else if (log_level > 2)
        {
            printf("Scrubbed %lx:%lx OK\n", done_oid.inode, done_oid.stripe);
        }
        delete done_op;
        // Frees the queue slot of this object and proceeds to the next one
        auto release_slot = [this, done_oid]()
        {
            scrub_ops.erase(done_oid);
            continue_scrub();
        };
        if (!scrub_sleep_ms)
        {
            release_slot();
            return;
        }
        // Throttling: keep the slot occupied for scrub_sleep_ms more milliseconds
        this->tfd->set_timer(scrub_sleep_ms, false, [release_slot](int timer_id)
        {
            release_slot();
        });
    };
    scrub_ops[oid] = scrub_op;
    exec_op(scrub_op);
}
|
||||||
|
|
||||||
|
// Triggers scrub requests
|
||||||
|
// Scrub reads data from all replicas and compares it
|
||||||
|
// To scrub, we first need to read the object listings
|
||||||
|
bool osd_t::continue_scrub()
|
||||||
|
{
|
||||||
|
if (scrub_list_op)
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
while (scrub_ops.size() < scrub_queue_depth)
|
||||||
|
{
|
||||||
|
object_id oid;
|
||||||
|
if (pick_next_scrub(oid))
|
||||||
|
submit_scrub_op(oid);
|
||||||
|
else
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Make sure that scrubbing is woken up when <pg> becomes due for scrub,
// i.e. after pg.scrub_ts + scrub interval (per-pool setting, or the global one
// when the pool doesn't set it). Only the nearest wake-up time is tracked
// (scrub_nearest_ts), so the timer is re-armed only when this PG is due earlier
// than what's already scheduled.
void osd_t::schedule_scrub(pg_t & pg)
{
    auto & pool_cfg = st_cli.pool_config.at(pg.pool_id);
    auto interval = pool_cfg.scrub_interval ? pool_cfg.scrub_interval : global_scrub_interval;
    if (!scrub_nearest_ts || scrub_nearest_ts > pg.scrub_ts+interval)
    {
        scrub_nearest_ts = pg.scrub_ts+interval;
        timespec tv_now;
        clock_gettime(CLOCK_REALTIME, &tv_now);
        // Drop the previously scheduled (later) wake-up
        if (scrub_timer_id >= 0)
        {
            tfd->clear_timer(scrub_timer_id);
            scrub_timer_id = -1;
        }
        if (tv_now.tv_sec > scrub_nearest_ts)
        {
            // Already due - start scrubbing right now
            scrub_nearest_ts = 0;
            peering_state = peering_state | OSD_SCRUBBING;
            ringloop->wakeup();
        }
        else
        {
            // A PG is treated as due only when the current second is strictly greater than
            // scrub_ts+interval (see the check above and the one in pick_next_scrub()).
            // So wait one extra second: a timer firing in the same second as
            // scrub_nearest_ts would wake scrubbing up too early to pick the PG
            scrub_timer_id = tfd->set_timer((scrub_nearest_ts-tv_now.tv_sec+1)*1000, false, [this](int timer_id)
            {
                scrub_timer_id = -1;
                scrub_nearest_ts = 0;
                peering_state = peering_state | OSD_SCRUBBING;
                ringloop->wakeup();
            });
        }
    }
}
|
||||||
|
|
||||||
|
// Primary-OSD handler of a scrub operation for one object. Resumable state machine:
// - initial state: read all available chunks/replicas of the object (st becomes 1);
// - st == 1: reads are still in flight, nothing to do;
// - st == 2: reads are done - compare the data. Presumably st is switched to 2
//   by the subop completion handler, which isn't part of this function - TODO confirm.
//
// Replicated pools: replicas are grouped by content and the group with the most
// replicas wins; replicas from smaller groups are flagged with read_error.
// EC/XOR pools: parity is recalculated from data chunks and compared to the parity
// chunks that were read; mismatching parity chunks are flagged with read_error.
// If at least one chunk ends up flagged, the object is passed to mark_object_corrupted().
void osd_t::continue_primary_scrub(osd_op_t *cur_op)
{
    if (!cur_op->op_data && !prepare_primary_rw(cur_op))
        return;
    osd_primary_op_data_t *op_data = cur_op->op_data;
    if (op_data->st == 1)
        goto resume_1;
    else if (op_data->st == 2)
        goto resume_2;
    {
        auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
        // Determine version
        auto vo_it = pg.ver_override.find(op_data->oid);
        op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
        // PG may have degraded or misplaced objects
        op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
        // Read all available chunks
        int n_copies = 0;
        op_data->degraded = false;
        for (int role = 0; role < op_data->pg_size; role++)
        {
            op_data->stripes[role].read_start = 0;
            op_data->stripes[role].read_end = bs_block_size;
            if (op_data->prev_set[role] != 0)
            {
                n_copies++;
            }
            else if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
            {
                // A data chunk is missing, it will have to be reconstructed before comparing
                op_data->degraded = true;
            }
        }
        if (n_copies <= op_data->pg_data_size)
        {
            // Nothing to compare, even if we'd like to
            finish_op(cur_op, 0);
            return;
        }
        // For EC/XOR, reserve extra space for the recalculated parity chunks
        cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_size,
            op_data->scheme != POOL_SCHEME_REPLICATED ? bs_block_size*(op_data->pg_size-op_data->pg_data_size) : 0);
        // Submit reads
        osd_op_t *subops = new osd_op_t[n_copies];
        op_data->fact_ver = 0;
        op_data->done = op_data->errors = op_data->errcode = 0;
        op_data->n_subops = n_copies;
        op_data->subops = subops;
        int sent = submit_primary_subop_batch(SUBMIT_SCRUB_READ, op_data->oid.inode, op_data->target_ver,
            op_data->stripes, op_data->prev_set, cur_op, 0, -1);
        assert(sent == n_copies);
        op_data->st = 1;
    }
resume_1:
    return;
resume_2:
    if (op_data->errors > 0)
    {
        if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
        {
            // I/O or checksum error
            int n_copies = 0;
            for (int role = 0; role < op_data->pg_size; role++)
            {
                if (op_data->stripes[role].read_end != 0 &&
                    !op_data->stripes[role].read_error)
                {
                    n_copies++;
                }
            }
            if (n_copies <= op_data->pg_data_size)
            {
                // Nothing to compare, just mark the object as corrupted
                auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
                // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
                op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
                // Operation is treated as unsuccessful only if the object becomes unreadable
                finish_op(cur_op, n_copies < op_data->pg_data_size ? op_data->errcode : 0);
                return;
            }
            // Proceed, we can still compare chunks that were successfully read
        }
        else
        {
            finish_op(cur_op, op_data->errcode);
            return;
        }
    }
    if (op_data->scheme == POOL_SCHEME_REPLICATED)
    {
        // Check that all chunks have returned the same data
        int total = 0;
        // eq_to[role] = index of the first replica with the same content, or -1 if not read
        int eq_to[op_data->pg_size];
        for (int role = 0; role < op_data->pg_size; role++)
        {
            eq_to[role] = -1;
            if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].read_error)
            {
                total++;
                eq_to[role] = role;
                for (int other = 0; other < role; other++)
                {
                    // Only compare with unique chunks (eq_to[other] == other)
                    if (eq_to[other] == other && memcmp(op_data->stripes[role].read_buf, op_data->stripes[other].read_buf, bs_block_size) == 0)
                    {
                        eq_to[role] = eq_to[other];
                        break;
                    }
                }
            }
        }
        // votes[role] = number of replicas equal to the unique replica <role>
        int votes[op_data->pg_size];
        for (int role = 0; role < op_data->pg_size; role++)
            votes[role] = 0;
        for (int role = 0; role < op_data->pg_size; role++)
        {
            if (eq_to[role] != -1)
                votes[eq_to[role]]++;
        }
        // Select the version with the most votes. votes[best] may only be
        // accessed when best >= 0, hence the explicit ordering of conditions
        int best = -1;
        for (int role = 0; role < op_data->pg_size; role++)
        {
            if (votes[role] > 0 && (best < 0 || votes[role] > votes[best]))
                best = role;
        }
        // best == 0 is a valid winner too, so compare with >= 0
        if (best >= 0 && votes[best] < total)
        {
            // FIXME Add a flag to allow to skip such objects and not recover them automatically
            bool unknown = false;
            for (int role = 0; role < op_data->pg_size; role++)
            {
                if (role != best && votes[role] == votes[best])
                    unknown = true;
                if (votes[role] > 0 && votes[role] < votes[best])
                {
                    printf(
                        "[PG %u/%u] Object %lx:%lx copy on OSD %lu doesn't match %d other copies, marking it as corrupted\n",
                        INODE_POOL(op_data->oid.inode), op_data->pg_num,
                        op_data->oid.inode, op_data->oid.stripe, op_data->stripes[role].osd_num, votes[best]
                    );
                    op_data->stripes[role].read_error = true;
                }
            }
            if (unknown)
            {
                // It's unknown which replica is good. There are multiple versions with no majority
                // (best isn't used after this point, it's reset only for clarity)
                best = -1;
            }
        }
    }
    else
    {
        assert(op_data->scheme == POOL_SCHEME_EC || op_data->scheme == POOL_SCHEME_XOR);
        if (op_data->degraded)
        {
            // Reconstruct missing stripes
            // XOR shouldn't come here as it only has 1 parity chunk
            assert(op_data->scheme == POOL_SCHEME_EC);
            reconstruct_stripes_ec(op_data->stripes, op_data->pg_size, op_data->pg_data_size, clean_entry_bitmap_size);
        }
        // Generate parity chunks and compare them with actual data
        // The OSD set is fake (all chunks "present") because nothing is really written
        osd_num_t fake_osd_set[op_data->pg_size];
        for (int i = 0; i < op_data->pg_size; i++)
        {
            fake_osd_set[i] = 1;
            // Parity is calculated into the spare area of cur_op->buf, data chunks are used as is
            op_data->stripes[i].write_buf = i >= op_data->pg_data_size
                ? ((uint8_t*)cur_op->buf + (i-op_data->pg_data_size)*bs_block_size)
                : op_data->stripes[i].read_buf;
        }
        if (op_data->scheme == POOL_SCHEME_XOR)
        {
            calc_rmw_parity_xor(op_data->stripes, op_data->pg_size, fake_osd_set, fake_osd_set, bs_block_size, clean_entry_bitmap_size);
        }
        else if (op_data->scheme == POOL_SCHEME_EC)
        {
            calc_rmw_parity_ec(op_data->stripes, op_data->pg_size, op_data->pg_data_size, fake_osd_set, fake_osd_set, bs_block_size, clean_entry_bitmap_size);
        }
        // Now compare that write_buf == read_buf
        for (int role = op_data->pg_data_size; role < op_data->pg_size; role++)
        {
            if (op_data->stripes[role].osd_num != 0 && !op_data->stripes[role].read_error &&
                memcmp(op_data->stripes[role].read_buf, op_data->stripes[role].write_buf, bs_block_size) != 0)
            {
                // Chunks don't match - something's wrong... but we don't know what :D
                // FIXME: Try to locate errors (may be possible with >= 2 parity chunks)
                printf(
                    "[PG %u/%u] Object %lx:%lx parity chunk %d on OSD %lu doesn't match data, marking it as corrupted\n",
                    INODE_POOL(op_data->oid.inode), op_data->pg_num,
                    op_data->oid.inode, op_data->oid.stripe,
                    role-op_data->pg_data_size, op_data->stripes[role].osd_num
                );
                op_data->stripes[role].read_error = true;
            }
        }
    }
    for (int role = 0; role < op_data->pg_size; role++)
    {
        if (op_data->stripes[role].osd_num != 0 && op_data->stripes[role].read_error)
        {
            // Got at least 1 read error or mismatch, mark the object as corrupted
            auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
            // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
            op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
            break;
        }
    }
    finish_op(cur_op, 0);
}
|
@@ -125,11 +125,18 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
|
|||||||
secondary_op_callback(cur_op);
|
secondary_op_callback(cur_op);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
cur_op->bs_op->oid.stripe = cur_op->req.sec_list.pg_stripe_size;
|
cur_op->bs_op->pg_alignment = cur_op->req.sec_list.pg_stripe_size;
|
||||||
cur_op->bs_op->len = cur_op->req.sec_list.pg_count;
|
cur_op->bs_op->pg_count = cur_op->req.sec_list.pg_count;
|
||||||
cur_op->bs_op->offset = cur_op->req.sec_list.list_pg - 1;
|
cur_op->bs_op->pg_number = cur_op->req.sec_list.list_pg - 1;
|
||||||
cur_op->bs_op->oid.inode = cur_op->req.sec_list.min_inode;
|
cur_op->bs_op->min_oid.inode = cur_op->req.sec_list.min_inode;
|
||||||
cur_op->bs_op->version = cur_op->req.sec_list.max_inode;
|
cur_op->bs_op->min_oid.stripe = cur_op->req.sec_list.min_stripe;
|
||||||
|
cur_op->bs_op->max_oid.inode = cur_op->req.sec_list.max_inode;
|
||||||
|
if (cur_op->req.sec_list.max_inode && cur_op->req.sec_list.max_stripe != UINT64_MAX)
|
||||||
|
{
|
||||||
|
cur_op->bs_op->max_oid.stripe = cur_op->req.sec_list.max_stripe
|
||||||
|
? cur_op->req.sec_list.max_stripe : UINT64_MAX;
|
||||||
|
}
|
||||||
|
cur_op->bs_op->list_stable_limit = cur_op->req.sec_list.stable_limit;
|
||||||
#ifdef OSD_STUB
|
#ifdef OSD_STUB
|
||||||
cur_op->bs_op->retval = 0;
|
cur_op->bs_op->retval = 0;
|
||||||
cur_op->bs_op->buf = NULL;
|
cur_op->bs_op->buf = NULL;
|
||||||
|
@@ -8,35 +8,37 @@ const int pg_state_bit_count = 16;
|
|||||||
const int pg_state_bits[16] = {
|
const int pg_state_bits[16] = {
|
||||||
PG_STARTING,
|
PG_STARTING,
|
||||||
PG_PEERING,
|
PG_PEERING,
|
||||||
PG_PEERED,
|
|
||||||
PG_INCOMPLETE,
|
PG_INCOMPLETE,
|
||||||
PG_ACTIVE,
|
PG_ACTIVE,
|
||||||
PG_REPEERING,
|
PG_REPEERING,
|
||||||
PG_STOPPING,
|
PG_STOPPING,
|
||||||
PG_OFFLINE,
|
PG_OFFLINE,
|
||||||
PG_DEGRADED,
|
PG_DEGRADED,
|
||||||
|
PG_HAS_CORRUPTED,
|
||||||
PG_HAS_INCOMPLETE,
|
PG_HAS_INCOMPLETE,
|
||||||
PG_HAS_DEGRADED,
|
PG_HAS_DEGRADED,
|
||||||
PG_HAS_MISPLACED,
|
PG_HAS_MISPLACED,
|
||||||
PG_HAS_UNCLEAN,
|
PG_HAS_UNCLEAN,
|
||||||
PG_HAS_INVALID,
|
PG_HAS_INVALID,
|
||||||
PG_LEFT_ON_DEAD,
|
PG_LEFT_ON_DEAD,
|
||||||
|
PG_SCRUBBING,
|
||||||
};
|
};
|
||||||
|
|
||||||
const char *pg_state_names[16] = {
|
const char *pg_state_names[16] = {
|
||||||
"starting",
|
"starting",
|
||||||
"peering",
|
"peering",
|
||||||
"peered",
|
|
||||||
"incomplete",
|
"incomplete",
|
||||||
"active",
|
"active",
|
||||||
"repeering",
|
"repeering",
|
||||||
"stopping",
|
"stopping",
|
||||||
"offline",
|
"offline",
|
||||||
"degraded",
|
"degraded",
|
||||||
|
"has_corrupted",
|
||||||
"has_incomplete",
|
"has_incomplete",
|
||||||
"has_degraded",
|
"has_degraded",
|
||||||
"has_misplaced",
|
"has_misplaced",
|
||||||
"has_unclean",
|
"has_unclean",
|
||||||
"has_invalid",
|
"has_invalid",
|
||||||
"left_on_dead",
|
"left_on_dead",
|
||||||
|
"scrubbing",
|
||||||
};
|
};
|
||||||
|
@@ -4,27 +4,27 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
// Placement group states
|
// Placement group states
|
||||||
// STARTING -> [acquire lock] -> PEERING -> PEERED
|
// STARTING -> [acquire lock] -> PEERING -> INCOMPLETE|ACTIVE
|
||||||
// PEERED -> [report history if required!] -> INCOMPLETE|ACTIVE
|
|
||||||
// ACTIVE -> REPEERING -> PEERING
|
// ACTIVE -> REPEERING -> PEERING
|
||||||
// ACTIVE -> STOPPING -> OFFLINE -> [release lock]
|
// ACTIVE -> STOPPING -> OFFLINE -> [release lock]
|
||||||
// Exactly one of these:
|
// Exactly one of these:
|
||||||
#define PG_STARTING (1<<0)
|
#define PG_STARTING (1<<0)
|
||||||
#define PG_PEERING (1<<1)
|
#define PG_PEERING (1<<1)
|
||||||
#define PG_PEERED (1<<2)
|
#define PG_INCOMPLETE (1<<2)
|
||||||
#define PG_INCOMPLETE (1<<3)
|
#define PG_ACTIVE (1<<3)
|
||||||
#define PG_ACTIVE (1<<4)
|
#define PG_REPEERING (1<<4)
|
||||||
#define PG_REPEERING (1<<5)
|
#define PG_STOPPING (1<<5)
|
||||||
#define PG_STOPPING (1<<6)
|
#define PG_OFFLINE (1<<6)
|
||||||
#define PG_OFFLINE (1<<7)
|
|
||||||
// Plus any of these:
|
// Plus any of these:
|
||||||
#define PG_DEGRADED (1<<8)
|
#define PG_DEGRADED (1<<7)
|
||||||
#define PG_HAS_INCOMPLETE (1<<9)
|
#define PG_HAS_INCOMPLETE (1<<8)
|
||||||
#define PG_HAS_DEGRADED (1<<10)
|
#define PG_HAS_DEGRADED (1<<9)
|
||||||
#define PG_HAS_MISPLACED (1<<11)
|
#define PG_HAS_MISPLACED (1<<10)
|
||||||
#define PG_HAS_UNCLEAN (1<<12)
|
#define PG_HAS_UNCLEAN (1<<11)
|
||||||
#define PG_HAS_INVALID (1<<13)
|
#define PG_HAS_INVALID (1<<12)
|
||||||
|
#define PG_HAS_CORRUPTED (1<<13)
|
||||||
#define PG_LEFT_ON_DEAD (1<<14)
|
#define PG_LEFT_ON_DEAD (1<<14)
|
||||||
|
#define PG_SCRUBBING (1<<15)
|
||||||
|
|
||||||
// Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
|
// Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
|
||||||
// 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size
|
// 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size
|
||||||
@@ -34,6 +34,8 @@
|
|||||||
#define OBJ_DEGRADED 0x02
|
#define OBJ_DEGRADED 0x02
|
||||||
#define OBJ_INCOMPLETE 0x04
|
#define OBJ_INCOMPLETE 0x04
|
||||||
#define OBJ_MISPLACED 0x08
|
#define OBJ_MISPLACED 0x08
|
||||||
|
// OBJ_CORRUPTED is always set with one of OBJ_INCOMPLETE/OBJ_DEGRADED/OBJ_MISPLACED
|
||||||
|
#define OBJ_CORRUPTED 0x10
|
||||||
#define OBJ_NEEDS_STABLE 0x10000
|
#define OBJ_NEEDS_STABLE 0x10000
|
||||||
#define OBJ_NEEDS_ROLLBACK 0x20000
|
#define OBJ_NEEDS_ROLLBACK 0x20000
|
||||||
|
|
||||||
|
@@ -53,6 +53,7 @@ typedef struct VitastorClient
|
|||||||
char *etcd_host;
|
char *etcd_host;
|
||||||
char *etcd_prefix;
|
char *etcd_prefix;
|
||||||
char *image;
|
char *image;
|
||||||
|
int skip_parents;
|
||||||
uint64_t inode;
|
uint64_t inode;
|
||||||
uint64_t pool;
|
uint64_t pool;
|
||||||
uint64_t size;
|
uint64_t size;
|
||||||
@@ -63,6 +64,10 @@ typedef struct VitastorClient
|
|||||||
int rdma_gid_index;
|
int rdma_gid_index;
|
||||||
int rdma_mtu;
|
int rdma_mtu;
|
||||||
QemuMutex mutex;
|
QemuMutex mutex;
|
||||||
|
|
||||||
|
uint64_t last_bitmap_inode, last_bitmap_offset, last_bitmap_len;
|
||||||
|
uint32_t last_bitmap_granularity;
|
||||||
|
uint8_t *last_bitmap;
|
||||||
} VitastorClient;
|
} VitastorClient;
|
||||||
|
|
||||||
typedef struct VitastorRPC
|
typedef struct VitastorRPC
|
||||||
@@ -72,6 +77,9 @@ typedef struct VitastorRPC
|
|||||||
QEMUIOVector *iov;
|
QEMUIOVector *iov;
|
||||||
long ret;
|
long ret;
|
||||||
int complete;
|
int complete;
|
||||||
|
uint64_t inode, offset, len;
|
||||||
|
uint32_t bitmap_granularity;
|
||||||
|
uint8_t *bitmap;
|
||||||
} VitastorRPC;
|
} VitastorRPC;
|
||||||
|
|
||||||
static void vitastor_co_init_task(BlockDriverState *bs, VitastorRPC *task);
|
static void vitastor_co_init_task(BlockDriverState *bs, VitastorRPC *task);
|
||||||
@@ -147,6 +155,7 @@ static void vitastor_parse_filename(const char *filename, QDict *options, Error
|
|||||||
if (!strcmp(name, "inode") ||
|
if (!strcmp(name, "inode") ||
|
||||||
!strcmp(name, "pool") ||
|
!strcmp(name, "pool") ||
|
||||||
!strcmp(name, "size") ||
|
!strcmp(name, "size") ||
|
||||||
|
!strcmp(name, "skip-parents") ||
|
||||||
!strcmp(name, "use-rdma") ||
|
!strcmp(name, "use-rdma") ||
|
||||||
!strcmp(name, "rdma-port_num") ||
|
!strcmp(name, "rdma-port_num") ||
|
||||||
!strcmp(name, "rdma-gid-index") ||
|
!strcmp(name, "rdma-gid-index") ||
|
||||||
@@ -227,13 +236,16 @@ static void vitastor_aio_set_fd_handler(void *ctx, int fd, int unused1, IOHandle
|
|||||||
|
|
||||||
static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
|
static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
|
||||||
{
|
{
|
||||||
|
VitastorRPC task;
|
||||||
VitastorClient *client = bs->opaque;
|
VitastorClient *client = bs->opaque;
|
||||||
|
void *image = NULL;
|
||||||
int64_t ret = 0;
|
int64_t ret = 0;
|
||||||
qemu_mutex_init(&client->mutex);
|
qemu_mutex_init(&client->mutex);
|
||||||
client->config_path = g_strdup(qdict_get_try_str(options, "config-path"));
|
client->config_path = g_strdup(qdict_get_try_str(options, "config-path"));
|
||||||
// FIXME: Rename to etcd_address
|
// FIXME: Rename to etcd_address
|
||||||
client->etcd_host = g_strdup(qdict_get_try_str(options, "etcd-host"));
|
client->etcd_host = g_strdup(qdict_get_try_str(options, "etcd-host"));
|
||||||
client->etcd_prefix = g_strdup(qdict_get_try_str(options, "etcd-prefix"));
|
client->etcd_prefix = g_strdup(qdict_get_try_str(options, "etcd-prefix"));
|
||||||
|
client->skip_parents = qdict_get_try_int(options, "skip-parents", 0);
|
||||||
client->use_rdma = qdict_get_try_int(options, "use-rdma", -1);
|
client->use_rdma = qdict_get_try_int(options, "use-rdma", -1);
|
||||||
client->rdma_device = g_strdup(qdict_get_try_str(options, "rdma-device"));
|
client->rdma_device = g_strdup(qdict_get_try_str(options, "rdma-device"));
|
||||||
client->rdma_port_num = qdict_get_try_int(options, "rdma-port-num", 0);
|
client->rdma_port_num = qdict_get_try_int(options, "rdma-port-num", 0);
|
||||||
@@ -243,23 +255,25 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
|
|||||||
vitastor_aio_set_fd_handler, bdrv_get_aio_context(bs), client->config_path, client->etcd_host, client->etcd_prefix,
|
vitastor_aio_set_fd_handler, bdrv_get_aio_context(bs), client->config_path, client->etcd_host, client->etcd_prefix,
|
||||||
client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
|
client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
|
||||||
);
|
);
|
||||||
client->image = g_strdup(qdict_get_try_str(options, "image"));
|
image = client->image = g_strdup(qdict_get_try_str(options, "image"));
|
||||||
client->readonly = (flags & BDRV_O_RDWR) ? 1 : 0;
|
client->readonly = (flags & BDRV_O_RDWR) ? 1 : 0;
|
||||||
|
// Get image metadata (size and readonly flag) or just wait until the client is ready
|
||||||
|
if (!image)
|
||||||
|
client->image = (char*)"x";
|
||||||
|
task.complete = 0;
|
||||||
|
task.bs = bs;
|
||||||
|
if (qemu_in_coroutine())
|
||||||
|
{
|
||||||
|
vitastor_co_get_metadata(&task);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
bdrv_coroutine_enter(bs, qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
|
||||||
|
BDRV_POLL_WHILE(bs, !task.complete);
|
||||||
|
}
|
||||||
|
client->image = image;
|
||||||
if (client->image)
|
if (client->image)
|
||||||
{
|
{
|
||||||
// Get image metadata (size and readonly flag)
|
|
||||||
VitastorRPC task;
|
|
||||||
task.complete = 0;
|
|
||||||
task.bs = bs;
|
|
||||||
if (qemu_in_coroutine())
|
|
||||||
{
|
|
||||||
vitastor_co_get_metadata(&task);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
bdrv_coroutine_enter(bs, qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
|
|
||||||
BDRV_POLL_WHILE(bs, !task.complete);
|
|
||||||
}
|
|
||||||
client->watch = (void*)task.ret;
|
client->watch = (void*)task.ret;
|
||||||
client->readonly = client->readonly || vitastor_c_inode_get_readonly(client->watch);
|
client->readonly = client->readonly || vitastor_c_inode_get_readonly(client->watch);
|
||||||
client->size = vitastor_c_inode_get_size(client->watch);
|
client->size = vitastor_c_inode_get_size(client->watch);
|
||||||
@@ -284,6 +298,7 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
|
|||||||
client->inode = (client->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (client->pool << (64-POOL_ID_BITS));
|
client->inode = (client->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (client->pool << (64-POOL_ID_BITS));
|
||||||
}
|
}
|
||||||
client->size = qdict_get_try_int(options, "size", 0);
|
client->size = qdict_get_try_int(options, "size", 0);
|
||||||
|
vitastor_c_close_watch(client->proxy, (void*)task.ret);
|
||||||
}
|
}
|
||||||
if (!client->size)
|
if (!client->size)
|
||||||
{
|
{
|
||||||
@@ -305,6 +320,7 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
|
|||||||
qdict_del(options, "inode");
|
qdict_del(options, "inode");
|
||||||
qdict_del(options, "pool");
|
qdict_del(options, "pool");
|
||||||
qdict_del(options, "size");
|
qdict_del(options, "size");
|
||||||
|
qdict_del(options, "skip-parents");
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -321,6 +337,8 @@ static void vitastor_close(BlockDriverState *bs)
|
|||||||
g_free(client->etcd_prefix);
|
g_free(client->etcd_prefix);
|
||||||
if (client->image)
|
if (client->image)
|
||||||
g_free(client->image);
|
g_free(client->image);
|
||||||
|
free(client->last_bitmap);
|
||||||
|
client->last_bitmap = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 2
|
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 2
|
||||||
@@ -486,6 +504,13 @@ static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs,
|
|||||||
vitastor_co_init_task(bs, &task);
|
vitastor_co_init_task(bs, &task);
|
||||||
task.iov = iov;
|
task.iov = iov;
|
||||||
|
|
||||||
|
if (client->last_bitmap)
|
||||||
|
{
|
||||||
|
// Invalidate last bitmap on write
|
||||||
|
free(client->last_bitmap);
|
||||||
|
client->last_bitmap = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
|
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
|
||||||
qemu_mutex_lock(&client->mutex);
|
qemu_mutex_lock(&client->mutex);
|
||||||
vitastor_c_write(client->proxy, inode, offset, bytes, 0, iov->iov, iov->niov, vitastor_co_generic_bh_cb, &task);
|
vitastor_c_write(client->proxy, inode, offset, bytes, 0, iov->iov, iov->niov, vitastor_co_generic_bh_cb, &task);
|
||||||
@@ -499,6 +524,140 @@ static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs,
|
|||||||
return task.ret;
|
return task.ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 1
|
||||||
|
#if QEMU_VERSION_MAJOR >= 2 || QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7
|
||||||
|
static void vitastor_co_read_bitmap_cb(void *opaque, long retval, uint8_t *bitmap)
|
||||||
|
{
|
||||||
|
VitastorRPC *task = opaque;
|
||||||
|
VitastorClient *client = task->bs->opaque;
|
||||||
|
task->ret = retval;
|
||||||
|
task->complete = 1;
|
||||||
|
if (retval >= 0)
|
||||||
|
{
|
||||||
|
task->bitmap = bitmap;
|
||||||
|
if (client->last_bitmap_inode == task->inode &&
|
||||||
|
client->last_bitmap_offset == task->offset &&
|
||||||
|
client->last_bitmap_len == task->len)
|
||||||
|
{
|
||||||
|
free(client->last_bitmap);
|
||||||
|
client->last_bitmap = bitmap;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (qemu_coroutine_self() != task->co)
|
||||||
|
{
|
||||||
|
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 8
|
||||||
|
aio_co_wake(task->co);
|
||||||
|
#else
|
||||||
|
qemu_coroutine_enter(task->co, NULL);
|
||||||
|
qemu_aio_release(task);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Report the allocation status of the image range starting at `offset`.
//
// Return value:
//   BDRV_BLOCK_DATA|BDRV_BLOCK_OFFSET_VALID - the extent is allocated
//   0                                       - the extent is not allocated
//   -errno                                  - error (-EAGAIN if the block size of the inode is unknown)
// On success *pnum is set to the length of the reported extent. For allocated
// extents *map is set to `offset` and *file to `bs`.
//
// want_zero != 0: precise mode. The extent is the run of equal bitmap bits that
//   starts at `offset`, scanned up to the end of the loaded bitmap.
// want_zero == 0: coarse mode. The whole requested range is reported as one
//   extent which is allocated if any bitmap bit inside it is set (so false
//   positives are possible).
//
// The last bitmap read from the cluster is cached in the client and reused when
// it covers the request.
static int coroutine_fn vitastor_co_block_status(
    BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes,
    int64_t *pnum, int64_t *map, BlockDriverState **file)
{
    VitastorRPC task;
    VitastorClient *client = bs->opaque;
    uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
    uint8_t bit = 0;
    if (client->last_bitmap && client->last_bitmap_inode == inode &&
        client->last_bitmap_offset <= offset &&
        client->last_bitmap_offset+client->last_bitmap_len >= (want_zero ? offset+1 : offset+bytes))
    {
        // Use the previously read bitmap
        task.bitmap_granularity = client->last_bitmap_granularity;
        task.offset = client->last_bitmap_offset;
        task.len = client->last_bitmap_len;
        task.bitmap = client->last_bitmap;
    }
    else
    {
        // Read bitmap from this position, rounding to full inode PG blocks
        uint32_t block_size = vitastor_c_inode_get_block_size(client->proxy, inode);
        if (!block_size)
            return -EAGAIN;
        // Init coroutine
        vitastor_co_init_task(bs, &task);
        // Drop the old cached bitmap and remember the key of the new request,
        // the callback fills client->last_bitmap if the key still matches
        free(client->last_bitmap);
        task.inode = client->last_bitmap_inode = inode;
        task.bitmap_granularity = client->last_bitmap_granularity = vitastor_c_inode_get_bitmap_granularity(client->proxy, inode);
        task.offset = client->last_bitmap_offset = offset / block_size * block_size;
        task.len = client->last_bitmap_len = (offset+bytes+block_size-1) / block_size * block_size - task.offset;
        task.bitmap = client->last_bitmap = NULL;
        qemu_mutex_lock(&client->mutex);
        vitastor_c_read_bitmap(client->proxy, task.inode, task.offset, task.len, !client->skip_parents, vitastor_co_read_bitmap_cb, &task);
        qemu_mutex_unlock(&client->mutex);
        while (!task.complete)
        {
            qemu_coroutine_yield();
        }
        if (task.ret < 0)
        {
            // Error
            return task.ret;
        }
    }
    if (want_zero)
    {
        // Get precise mapping with all holes
        uint64_t bmp_pos = (offset-task.offset) / task.bitmap_granularity;
        uint64_t bmp_len = task.len / task.bitmap_granularity;
        uint64_t bmp_end = bmp_pos+1;
        bit = (task.bitmap[bmp_pos >> 3] >> (bmp_pos & 0x7)) & 1;
        while (bmp_end < bmp_len && ((task.bitmap[bmp_end >> 3] >> (bmp_end & 0x7)) & 1) == bit)
        {
            bmp_end++;
        }
        *pnum = (bmp_end-bmp_pos) * task.bitmap_granularity;
    }
    else
    {
        // Get larger allocated extents, possibly with false positives
        uint64_t bmp_pos = (offset-task.offset) / task.bitmap_granularity;
        // bmp_end is an absolute (exclusive) bit index like bmp_pos. It must NOT
        // be reduced by bmp_pos: the loop below compares the two directly, and a
        // relative length here makes it skip bits (or all of them) for any range
        // that does not start at the beginning of the loaded bitmap.
        // NOTE(review): a trailing partial granule is rounded down and not checked
        // when `bytes` is not a multiple of the bitmap granularity - confirm
        // whether callers can pass such ranges.
        uint64_t bmp_end = (offset+bytes-task.offset) / task.bitmap_granularity;
        while (bmp_pos < bmp_end)
        {
            if (!(bmp_pos & 7) && bmp_end >= bmp_pos+8)
            {
                // Byte-aligned position with at least 8 bits left: test a whole byte at once
                bit = bit || task.bitmap[bmp_pos >> 3];
                bmp_pos += 8;
            }
            else
            {
                bit = bit || ((task.bitmap[bmp_pos >> 3] >> (bmp_pos & 0x7)) & 1);
                bmp_pos++;
            }
        }
        *pnum = bytes;
    }
    if (bit)
    {
        *map = offset;
        *file = bs;
    }
    return (bit ? (BDRV_BLOCK_DATA|BDRV_BLOCK_OFFSET_VALID) : 0);
}
|
||||||
|
#endif
|
||||||
|
#if QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 12
|
||||||
|
// QEMU 1.7-2.11
|
||||||
|
// Sector-based adapter over vitastor_co_block_status() for the older
// .bdrv_co_get_block_status driver callback.
// Converts sector_num/nb_sectors to bytes, always requests the precise
// (want_zero) mapping and converts the resulting extent length back to sectors
// in *pnum. Returns the flags or negative errno from vitastor_co_block_status().
// NOTE(review): the byte offset returned in `map` is discarded here; confirm
// whether this QEMU API expects it to be encoded into the return value together
// with BDRV_BLOCK_OFFSET_VALID.
static int64_t coroutine_fn vitastor_co_get_block_status(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
{
    int64_t map = 0;
    int64_t pnumbytes = 0;
    // Pass `file` itself, not `&file`: the callee stores the result into *file,
    // which has to be the caller's variable and not our local parameter
    int r = vitastor_co_block_status(bs, 1, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, &pnumbytes, &map, file);
    *pnum = pnumbytes/BDRV_SECTOR_SIZE;
    return r;
}
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#if !( QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 7 )
|
#if !( QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 7 )
|
||||||
static int coroutine_fn vitastor_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov)
|
static int coroutine_fn vitastor_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov)
|
||||||
{
|
{
|
||||||
@@ -606,6 +765,15 @@ static BlockDriver bdrv_vitastor = {
|
|||||||
.bdrv_co_truncate = vitastor_co_truncate,
|
.bdrv_co_truncate = vitastor_co_truncate,
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 1
|
||||||
|
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 12
|
||||||
|
// For snapshot export
|
||||||
|
.bdrv_co_block_status = vitastor_co_block_status,
|
||||||
|
#elif QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 12
|
||||||
|
.bdrv_co_get_block_status = vitastor_co_get_block_status,
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 7
|
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 7
|
||||||
.bdrv_co_preadv = vitastor_co_preadv,
|
.bdrv_co_preadv = vitastor_co_preadv,
|
||||||
.bdrv_co_pwritev = vitastor_co_pwritev,
|
.bdrv_co_pwritev = vitastor_co_pwritev,
|
||||||
|
@@ -25,7 +25,6 @@ ring_loop_t::ring_loop_t(int qd)
|
|||||||
{
|
{
|
||||||
free_ring_data[i] = i;
|
free_ring_data[i] = i;
|
||||||
}
|
}
|
||||||
wait_sqe_id = 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ring_loop_t::~ring_loop_t()
|
ring_loop_t::~ring_loop_t()
|
||||||
@@ -83,17 +82,19 @@ void ring_loop_t::loop()
|
|||||||
}
|
}
|
||||||
io_uring_cqe_seen(&ring, cqe);
|
io_uring_cqe_seen(&ring, cqe);
|
||||||
}
|
}
|
||||||
while (get_sqe_queue.size() > 0)
|
|
||||||
{
|
|
||||||
(get_sqe_queue[0].second)();
|
|
||||||
get_sqe_queue.erase(get_sqe_queue.begin());
|
|
||||||
}
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
loop_again = false;
|
loop_again = false;
|
||||||
for (int i = 0; i < consumers.size(); i++)
|
for (int i = 0; i < consumers.size(); i++)
|
||||||
{
|
{
|
||||||
consumers[i]->loop();
|
consumers[i]->loop();
|
||||||
|
if (immediate_queue.size())
|
||||||
|
{
|
||||||
|
immediate_queue2.swap(immediate_queue);
|
||||||
|
for (auto & cb: immediate_queue2)
|
||||||
|
cb();
|
||||||
|
immediate_queue2.clear();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} while (loop_again);
|
} while (loop_again);
|
||||||
}
|
}
|
||||||
|
@@ -119,11 +119,10 @@ struct ring_consumer_t
|
|||||||
|
|
||||||
class ring_loop_t
|
class ring_loop_t
|
||||||
{
|
{
|
||||||
std::vector<std::pair<int,std::function<void()>>> get_sqe_queue;
|
std::vector<std::function<void()>> immediate_queue, immediate_queue2;
|
||||||
std::vector<ring_consumer_t*> consumers;
|
std::vector<ring_consumer_t*> consumers;
|
||||||
struct ring_data_t *ring_datas;
|
struct ring_data_t *ring_datas;
|
||||||
int *free_ring_data;
|
int *free_ring_data;
|
||||||
int wait_sqe_id;
|
|
||||||
unsigned free_ring_data_ptr;
|
unsigned free_ring_data_ptr;
|
||||||
bool loop_again;
|
bool loop_again;
|
||||||
struct io_uring ring;
|
struct io_uring ring;
|
||||||
@@ -145,20 +144,9 @@ public:
|
|||||||
}
|
}
|
||||||
return sqe;
|
return sqe;
|
||||||
}
|
}
|
||||||
inline int wait_sqe(std::function<void()> cb)
|
inline void set_immediate(const std::function<void()> cb)
|
||||||
{
|
{
|
||||||
get_sqe_queue.push_back({ wait_sqe_id, cb });
|
immediate_queue.push_back(cb);
|
||||||
return wait_sqe_id++;
|
|
||||||
}
|
|
||||||
inline void cancel_wait_sqe(int wait_id)
|
|
||||||
{
|
|
||||||
for (int i = 0; i < get_sqe_queue.size(); i++)
|
|
||||||
{
|
|
||||||
if (get_sqe_queue[i].first == wait_id)
|
|
||||||
{
|
|
||||||
get_sqe_queue.erase(get_sqe_queue.begin()+i, get_sqe_queue.begin()+i+1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
inline int submit()
|
inline int submit()
|
||||||
{
|
{
|
||||||
|
@@ -249,3 +249,35 @@ void print_help(const char *help_text, std::string exe_name, std::string cmd, bo
|
|||||||
fwrite(filtered_text.data(), filtered_text.size(), 1, stdout);
|
fwrite(filtered_text.data(), filtered_text.size(), 1, stdout);
|
||||||
exit(0);
|
exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uint64_t parse_time(std::string time_str, bool *ok)
|
||||||
|
{
|
||||||
|
if (!time_str.length())
|
||||||
|
{
|
||||||
|
if (ok)
|
||||||
|
*ok = false;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
uint64_t mul = 1;
|
||||||
|
char type_char = tolower(time_str[time_str.length()-1]);
|
||||||
|
if (type_char == 's' || type_char == 'm' || type_char == 'h' || type_char == 'd' || type_char == 'y')
|
||||||
|
{
|
||||||
|
if (type_char == 's')
|
||||||
|
mul = 1;
|
||||||
|
else if (time_str[time_str.length()-1] == 'M')
|
||||||
|
mul = 30*86400;
|
||||||
|
else if (type_char == 'm')
|
||||||
|
mul = 60;
|
||||||
|
else if (type_char == 'h')
|
||||||
|
mul = 3600;
|
||||||
|
else if (type_char == 'd')
|
||||||
|
mul = 86400;
|
||||||
|
else /*if (type_char == 'y')*/
|
||||||
|
mul = 86400*365;
|
||||||
|
time_str = time_str.substr(0, time_str.length()-1);
|
||||||
|
}
|
||||||
|
uint64_t ts = stoull_full(time_str, 0) * mul;
|
||||||
|
if (ok)
|
||||||
|
*ok = !(ts == 0 && time_str != "0" && (time_str != "" || mul != 1));
|
||||||
|
return ts;
|
||||||
|
}
|
||||||
|
@@ -15,3 +15,4 @@ std::string str_replace(const std::string & in, const std::string & needle, cons
|
|||||||
uint64_t stoull_full(const std::string & str, int base = 0);
|
uint64_t stoull_full(const std::string & str, int base = 0);
|
||||||
std::string format_size(uint64_t size, bool nobytes = false);
|
std::string format_size(uint64_t size, bool nobytes = false);
|
||||||
void print_help(const char *help_text, std::string exe_name, std::string cmd, bool all);
|
void print_help(const char *help_text, std::string exe_name, std::string cmd, bool all);
|
||||||
|
uint64_t parse_time(std::string time_str, bool *ok = NULL);
|
||||||
|
@@ -8,7 +8,6 @@
|
|||||||
|
|
||||||
void configure_single_pg_pool(cluster_client_t *cli)
|
void configure_single_pg_pool(cluster_client_t *cli)
|
||||||
{
|
{
|
||||||
cli->st_cli.on_load_pgs_hook(true);
|
|
||||||
cli->st_cli.parse_state((etcd_kv_t){
|
cli->st_cli.parse_state((etcd_kv_t){
|
||||||
.key = "/config/pools",
|
.key = "/config/pools",
|
||||||
.value = json11::Json::object {
|
.value = json11::Json::object {
|
||||||
@@ -43,6 +42,7 @@ void configure_single_pg_pool(cluster_client_t *cli)
|
|||||||
{ "state", json11::Json::array { "active" } },
|
{ "state", json11::Json::array { "active" } },
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
cli->st_cli.on_load_pgs_hook(true);
|
||||||
std::map<std::string, etcd_kv_t> changes;
|
std::map<std::string, etcd_kv_t> changes;
|
||||||
cli->st_cli.on_change_hook(changes);
|
cli->st_cli.on_change_hook(changes);
|
||||||
}
|
}
|
||||||
@@ -188,7 +188,6 @@ void test1()
|
|||||||
int *r1 = test_write(cli, 0, 4096, 0x55);
|
int *r1 = test_write(cli, 0, 4096, 0x55);
|
||||||
configure_single_pg_pool(cli);
|
configure_single_pg_pool(cli);
|
||||||
pretend_connected(cli, 1);
|
pretend_connected(cli, 1);
|
||||||
cli->continue_ops(true);
|
|
||||||
can_complete(r1);
|
can_complete(r1);
|
||||||
check_op_count(cli, 1, 1);
|
check_op_count(cli, 1, 1);
|
||||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0);
|
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0);
|
||||||
@@ -196,8 +195,6 @@ void test1()
|
|||||||
pretend_disconnected(cli, 1);
|
pretend_disconnected(cli, 1);
|
||||||
int *r2 = test_sync(cli);
|
int *r2 = test_sync(cli);
|
||||||
pretend_connected(cli, 1);
|
pretend_connected(cli, 1);
|
||||||
check_op_count(cli, 1, 0);
|
|
||||||
cli->continue_ops(true);
|
|
||||||
check_op_count(cli, 1, 1);
|
check_op_count(cli, 1, 1);
|
||||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0);
|
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0);
|
||||||
check_op_count(cli, 1, 1);
|
check_op_count(cli, 1, 1);
|
||||||
@@ -303,8 +300,6 @@ void test1()
|
|||||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), -EPIPE);
|
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), -EPIPE);
|
||||||
check_disconnected(cli, 1);
|
check_disconnected(cli, 1);
|
||||||
pretend_connected(cli, 1);
|
pretend_connected(cli, 1);
|
||||||
check_op_count(cli, 1, 0);
|
|
||||||
cli->continue_ops(true);
|
|
||||||
check_op_count(cli, 1, 1);
|
check_op_count(cli, 1, 1);
|
||||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
|
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
|
||||||
check_op_count(cli, 1, 1);
|
check_op_count(cli, 1, 1);
|
||||||
|
@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
|
|||||||
|
|
||||||
Name: Vitastor
|
Name: Vitastor
|
||||||
Description: Vitastor client library
|
Description: Vitastor client library
|
||||||
Version: 0.8.3
|
Version: 0.8.5
|
||||||
Libs: -L${libdir} -lvitastor_client
|
Libs: -L${libdir} -lvitastor_client
|
||||||
Cflags: -I${includedir}
|
Cflags: -I${includedir}
|
||||||
|
|
||||||
|
@@ -207,6 +207,28 @@ void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint6
|
|||||||
client->cli->execute(op);
|
client->cli->execute(op);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Asynchronously read the allocation bitmap of the range [offset, offset+len) of an inode.
// A non-zero with_parents selects OSD_OP_READ_CHAIN_BITMAP, zero selects OSD_OP_READ_BITMAP.
// cb is called with opaque, the operation result and, when the result is
// non-negative, the bitmap buffer (NULL otherwise).
void vitastor_c_read_bitmap(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,
    int with_parents, VitastorReadBitmapHandler cb, void *opaque)
{
    auto req = new cluster_op_t;
    if (with_parents)
        req->opcode = OSD_OP_READ_CHAIN_BITMAP;
    else
        req->opcode = OSD_OP_READ_BITMAP;
    req->inode = inode;
    req->offset = offset;
    req->len = len;
    req->callback = [cb, opaque](cluster_op_t *done)
    {
        uint8_t *result = NULL;
        if (done->retval >= 0)
        {
            // Detach the buffer from the operation before it is deleted,
            // the pointer is handed over to the callback
            result = (uint8_t*)done->bitmap_buf;
            done->bitmap_buf = NULL;
        }
        cb(opaque, done->retval, result);
        delete done;
    };
    client->cli->execute(req);
}
|
||||||
|
|
||||||
void vitastor_c_sync(vitastor_c *client, VitastorIOHandler cb, void *opaque)
|
void vitastor_c_sync(vitastor_c *client, VitastorIOHandler cb, void *opaque)
|
||||||
{
|
{
|
||||||
cluster_op_t *op = new cluster_op_t;
|
cluster_op_t *op = new cluster_op_t;
|
||||||
@@ -245,6 +267,25 @@ uint64_t vitastor_c_inode_get_num(void *handle)
|
|||||||
return watch->cfg.num;
|
return watch->cfg.num;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Return data_block_size of the inode's pool multiplied by the number of data
// chunks per PG: 1 for replicated pools, pg_size minus parity_chunks otherwise.
// Returns 0 if the pool of the inode is not present in the client's pool configuration.
uint32_t vitastor_c_inode_get_block_size(vitastor_c *client, uint64_t inode_num)
{
    auto & pools = client->cli->st_cli.pool_config;
    auto found = pools.find(INODE_POOL(inode_num));
    if (found != pools.end())
    {
        auto & cfg = found->second;
        uint32_t data_chunks = 1;
        if (cfg.scheme != POOL_SCHEME_REPLICATED)
            data_chunks = cfg.pg_size-cfg.parity_chunks;
        return cfg.data_block_size * data_chunks;
    }
    return 0;
}
|
||||||
|
|
||||||
|
// Return bitmap_granularity of the pool the inode belongs to,
// or 0 if that pool is not present in the client's pool configuration.
uint32_t vitastor_c_inode_get_bitmap_granularity(vitastor_c *client, uint64_t inode_num)
{
    auto & pools = client->cli->st_cli.pool_config;
    auto found = pools.find(INODE_POOL(inode_num));
    if (found != pools.end())
    {
        // FIXME: READ_BITMAP may fail if parent bitmap granularity differs from inode bitmap granularity
        return found->second.bitmap_granularity;
    }
    return 0;
}
|
||||||
|
|
||||||
int vitastor_c_inode_get_readonly(void *handle)
|
int vitastor_c_inode_get_readonly(void *handle)
|
||||||
{
|
{
|
||||||
inode_watch_t *watch = (inode_watch_t*)handle;
|
inode_watch_t *watch = (inode_watch_t*)handle;
|
||||||
|
@@ -6,6 +6,9 @@
|
|||||||
#ifndef VITASTOR_QEMU_PROXY_H
|
#ifndef VITASTOR_QEMU_PROXY_H
|
||||||
#define VITASTOR_QEMU_PROXY_H
|
#define VITASTOR_QEMU_PROXY_H
|
||||||
|
|
||||||
|
// C API wrapper version
|
||||||
|
#define VITASTOR_C_API_VERSION 1
|
||||||
|
|
||||||
#ifndef POOL_ID_BITS
|
#ifndef POOL_ID_BITS
|
||||||
#define POOL_ID_BITS 16
|
#define POOL_ID_BITS 16
|
||||||
#endif
|
#endif
|
||||||
@@ -21,6 +24,7 @@ typedef struct vitastor_c vitastor_c;
|
|||||||
|
|
||||||
typedef void VitastorReadHandler(void *opaque, long retval, uint64_t version);
|
typedef void VitastorReadHandler(void *opaque, long retval, uint64_t version);
|
||||||
typedef void VitastorIOHandler(void *opaque, long retval);
|
typedef void VitastorIOHandler(void *opaque, long retval);
|
||||||
|
typedef void VitastorReadBitmapHandler(void *opaque, long retval, uint8_t *bitmap);
|
||||||
|
|
||||||
// QEMU
|
// QEMU
|
||||||
typedef void IOHandler(void *opaque);
|
typedef void IOHandler(void *opaque);
|
||||||
@@ -42,11 +46,15 @@ void vitastor_c_read(vitastor_c *client, uint64_t inode, uint64_t offset, uint64
|
|||||||
struct iovec *iov, int iovcnt, VitastorReadHandler cb, void *opaque);
|
struct iovec *iov, int iovcnt, VitastorReadHandler cb, void *opaque);
|
||||||
void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len, uint64_t check_version,
|
void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len, uint64_t check_version,
|
||||||
struct iovec *iov, int iovcnt, VitastorIOHandler cb, void *opaque);
|
struct iovec *iov, int iovcnt, VitastorIOHandler cb, void *opaque);
|
||||||
|
void vitastor_c_read_bitmap(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,
|
||||||
|
int with_parents, VitastorReadBitmapHandler cb, void *opaque);
|
||||||
void vitastor_c_sync(vitastor_c *client, VitastorIOHandler cb, void *opaque);
|
void vitastor_c_sync(vitastor_c *client, VitastorIOHandler cb, void *opaque);
|
||||||
void vitastor_c_watch_inode(vitastor_c *client, char *image, VitastorIOHandler cb, void *opaque);
|
void vitastor_c_watch_inode(vitastor_c *client, char *image, VitastorIOHandler cb, void *opaque);
|
||||||
void vitastor_c_close_watch(vitastor_c *client, void *handle);
|
void vitastor_c_close_watch(vitastor_c *client, void *handle);
|
||||||
uint64_t vitastor_c_inode_get_size(void *handle);
|
uint64_t vitastor_c_inode_get_size(void *handle);
|
||||||
uint64_t vitastor_c_inode_get_num(void *handle);
|
uint64_t vitastor_c_inode_get_num(void *handle);
|
||||||
|
uint32_t vitastor_c_inode_get_block_size(vitastor_c *client, uint64_t inode_num);
|
||||||
|
uint32_t vitastor_c_inode_get_bitmap_granularity(vitastor_c *client, uint64_t inode_num);
|
||||||
int vitastor_c_inode_get_readonly(void *handle);
|
int vitastor_c_inode_get_readonly(void *handle);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
@@ -22,6 +22,16 @@ LD_PRELOAD="build/src/libfio_vitastor.so" \
|
|||||||
LD_PRELOAD="build/src/libfio_vitastor.so" \
|
LD_PRELOAD="build/src/libfio_vitastor.so" \
|
||||||
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -rw=read -etcd=$ETCD_URL -pool=1 -inode=3 -size=32M
|
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -rw=read -etcd=$ETCD_URL -pool=1 -inode=3 -size=32M
|
||||||
|
|
||||||
|
qemu-img convert -p \
|
||||||
|
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=2:size=$((32*1024*1024)):skip-parents=1" \
|
||||||
|
-O qcow2 ./testdata/layer0.qcow2
|
||||||
|
|
||||||
|
qemu-img create -f qcow2 ./testdata/empty.qcow2 32M
|
||||||
|
|
||||||
|
qemu-img convert -p \
|
||||||
|
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=3:size=$((32*1024*1024)):skip-parents=1" \
|
||||||
|
-O qcow2 -o 'cluster_size=4k' -B empty.qcow2 ./testdata/layer1.qcow2
|
||||||
|
|
||||||
qemu-img convert -S 4096 -p \
|
qemu-img convert -S 4096 -p \
|
||||||
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=3:size=$((32*1024*1024))" \
|
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=3:size=$((32*1024*1024))" \
|
||||||
-O raw ./testdata/merged.bin
|
-O raw ./testdata/merged.bin
|
||||||
@@ -52,4 +62,18 @@ qemu-img convert -S 4096 -p \
|
|||||||
|
|
||||||
cmp ./testdata/merged.bin ./testdata/merged-by-tool.bin
|
cmp ./testdata/merged.bin ./testdata/merged-by-tool.bin
|
||||||
|
|
||||||
|
# Test merge by qemu-img
|
||||||
|
|
||||||
|
qemu-img rebase -u -b layer0.qcow2 ./testdata/layer1.qcow2
|
||||||
|
|
||||||
|
qemu-img convert -S 4096 -f qcow2 ./testdata/layer1.qcow2 -O raw ./testdata/rebased.bin
|
||||||
|
|
||||||
|
cmp ./testdata/merged.bin ./testdata/rebased.bin
|
||||||
|
|
||||||
|
qemu-img rebase -u -b '' ./testdata/layer1.qcow2
|
||||||
|
|
||||||
|
qemu-img convert -S 4096 -f qcow2 ./testdata/layer1.qcow2 -O raw ./testdata/rebased.bin
|
||||||
|
|
||||||
|
cmp ./testdata/layer1.bin ./testdata/rebased.bin
|
||||||
|
|
||||||
format_green OK
|
format_green OK
|
||||||
|
Reference in New Issue
Block a user