Compare commits

...

24 Commits

Author SHA1 Message Date
8810eae8fb Release 0.8.6
Important fixes:

- Fix possibly incorrect EC parity chunk updates with EC n+k, k > 1 and when
  the first parity chunk is missing

Minor fixes and improvements:

- Fix incorrect EC free space statistics in vitastor-cli df output
- Speedup vitastor-cli startup in clusters with RDMA
- Remove unused PG "peered" state (previously used to update PG epoch)
- Use sfdisk with just --json in vitastor-disk (--dump --json isn't needed)
- Allow trailing comma in sfdisk output (fixes sfdisk 2.36 compatibility)
- Slightly improve RDMA send/receive code
- Reduce RDMA memory consumption by default (rdma_max_recv/send = 16/8)
- Use vitastor-cli instead of direct etcd interaction in the CSI driver
2023-02-28 11:18:48 +03:00
c1365f46c9 Use vitastor-cli instead of direct etcd interaction in the CSI driver 2023-02-28 11:02:50 +03:00
14d6acbcba Set default rdma_max_recv/send to 16/8, fix documentation 2023-02-28 11:00:56 +03:00
1e307069bc Fix missing parity chunk calculation for EC n+k, k > 1 and first parity chunk missing 2023-02-28 02:40:19 +03:00
c3e80abad7 Allow to send more than 1 operation at a time 2023-02-26 02:01:04 +03:00
138ffe4032 Reuse incoming RDMA buffers 2023-02-26 00:55:01 +03:00
8139a34e97 Fix json11: allow trailing comma 2023-02-23 01:16:01 +03:00
4ab630b44d Use just sfdisk --json, --dump is not needed 2023-02-23 00:55:47 +03:00
2c8241b7db Remove PG "peered" state 2023-02-21 01:30:42 +03:00
36a7dd3671 Move tests to "make test" 2023-02-21 01:30:42 +03:00
936122bbcf Initialize msgr lazily in client to speedup vitastor-cli with RDMA enabled 2023-02-19 18:59:07 +03:00
1a1ba0d1e7 Add set_immediate to ringloop and use it for bs/osd ops to prevent reenterability issues 2023-02-09 17:37:26 +03:00
3d09c9cec7 Remove unused wait_sqe() from ringloop 2023-02-09 17:37:26 +03:00
3d08a1ad6c Fix cluster_client test after last reenterability fixes 2023-02-05 01:47:32 +03:00
499881d81c Fix typo 2023-01-27 01:52:02 +03:00
aba93b951b Fix incorrect EC free space statistics in vitastor-cli df output 2023-01-26 02:04:29 +03:00
d125fb1f30 Release 0.8.5
- Fix a possible "double free" bug in the client library happening on OSD restart
- Fix a possible write hang on PG history update when only epoch is changed
- Fix incorrect systemd target "local.target" in mon/make-etcd
- Allow "content" option in PVE storage plugin to allow to enable containers
- Build client library without tcmalloc which fixes "attempt to free invalid pointer"
  errors when, for example, trying to run QEMU with both Vitastor and Ceph RBD disks
2023-01-25 01:43:49 +03:00
9d3fd72298 Require liburing < 2 in rpm specs 2023-01-25 01:43:49 +03:00
8b552a01f9 Do not retry successful operation parts in client (could lead to "double free" bugs) 2023-01-25 01:30:36 +03:00
0385b2f9e8 Fix write hangs on PG epoch update - always set pg.history_changed to true 2023-01-25 01:30:15 +03:00
749c837045 Replace non-existing local.target with multi-user.target 2023-01-25 01:29:31 +03:00
98001d845b Remove version from vitastor-release.rpm links 2023-01-23 14:03:33 +03:00
c96bcae74b Allow "content" option in PVE storage plugin to allow to enable containers 2023-01-16 18:14:45 +03:00
9f4e34a8cc Build client library without tcmalloc
Fixes "[src/tcmalloc.cc:332] Attempt to free invalid pointer ..." when trying
to run QEMU with both Vitastor and Ceph RBD disks and other possible allocator
collisions.
2023-01-15 00:01:11 +03:00
52 changed files with 366 additions and 424 deletions

View File

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8)
project(vitastor) project(vitastor)
set(VERSION "0.8.4") set(VERSION "0.8.6")
add_subdirectory(src) add_subdirectory(src)

View File

@@ -48,9 +48,9 @@ Vitastor, составлены для того, чтобы убедиться,
интерфейс (прокси), опять же, без открытия в свободный публичный доступ как интерфейс (прокси), опять же, без открытия в свободный публичный доступ как
самой программы, так и прокси. самой программы, так и прокси.
Сетевая Публичная Лицензия Vitastor разработана специально чтобы Сетевая Публичная Лицензия Vitastor разработана специально, чтобы
гарантировать, что в таких случаях и модифицированная версия программы, и гарантировать, что в таких случаях и модифицированная версия программы, и
прокси оставались доступными сообществу. Для этого лицензия требует от прокси останутся доступными сообществу. Для этого лицензия требует от
операторов сетевых серверов предоставлять исходный код оригинальной программы, операторов сетевых серверов предоставлять исходный код оригинальной программы,
а также всех других программ, взаимодействующих с ней на их серверах, а также всех других программ, взаимодействующих с ней на их серверах,
пользователям этих серверов, на условиях свободных лицензий. Таким образом, пользователям этих серверов, на условиях свободных лицензий. Таким образом,

View File

@@ -1,4 +1,4 @@
VERSION ?= v0.8.4 VERSION ?= v0.8.6
all: build push all: build push

View File

@@ -49,7 +49,7 @@ spec:
capabilities: capabilities:
add: ["SYS_ADMIN"] add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v0.8.4 image: vitalif/vitastor-csi:v0.8.6
args: args:
- "--node=$(NODE_ID)" - "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)" - "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -116,7 +116,7 @@ spec:
privileged: true privileged: true
capabilities: capabilities:
add: ["SYS_ADMIN"] add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v0.8.4 image: vitalif/vitastor-csi:v0.8.6
args: args:
- "--node=$(NODE_ID)" - "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)" - "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -5,7 +5,7 @@ package vitastor
const ( const (
vitastorCSIDriverName = "csi.vitastor.io" vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "0.8.4" vitastorCSIDriverVersion = "0.8.6"
) )
// Config struct fills the parameters of request or user input // Config struct fills the parameters of request or user input

View File

@@ -10,7 +10,6 @@ import (
"bytes" "bytes"
"strconv" "strconv"
"time" "time"
"fmt"
"os" "os"
"os/exec" "os/exec"
"io/ioutil" "io/ioutil"
@@ -21,8 +20,6 @@ import (
"google.golang.org/grpc/codes" "google.golang.org/grpc/codes"
"google.golang.org/grpc/status" "google.golang.org/grpc/status"
"go.etcd.io/etcd/clientv3"
"github.com/container-storage-interface/spec/lib/go/csi" "github.com/container-storage-interface/spec/lib/go/csi"
) )
@@ -114,6 +111,34 @@ func GetConnectionParams(params map[string]string) (map[string]string, []string,
return ctxVars, etcdUrl, etcdPrefix return ctxVars, etcdUrl, etcdPrefix
} }
func invokeCLI(ctxVars map[string]string, args []string) ([]byte, error)
{
if (ctxVars["etcdUrl"] != "")
{
args = append(args, "--etcd_address", ctxVars["etcdUrl"])
}
if (ctxVars["etcdPrefix"] != "")
{
args = append(args, "--etcd_prefix", ctxVars["etcdPrefix"])
}
if (ctxVars["configPath"] != "")
{
args = append(args, "--config_path", ctxVars["configPath"])
}
c := exec.Command("/usr/bin/vitastor-cli", args...)
var stdout, stderr bytes.Buffer
c.Stdout = &stdout
c.Stderr = &stderr
err := c.Run()
stderrStr := string(stderr.Bytes())
if (err != nil)
{
klog.Errorf("vitastor-cli %s failed: %s, status %s\n", strings.Join(args, " "), stderrStr, err)
return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
}
return stdout.Bytes(), nil
}
// Create the volume // Create the volume
func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) (*csi.CreateVolumeResponse, error) func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) (*csi.CreateVolumeResponse, error)
{ {
@@ -146,128 +171,41 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
} }
// FIXME: The following should PROBABLY be implemented externally in a management tool ctxVars, etcdUrl, _ := GetConnectionParams(req.Parameters)
ctxVars, etcdUrl, etcdPrefix := GetConnectionParams(req.Parameters)
if (len(etcdUrl) == 0) if (len(etcdUrl) == 0)
{ {
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf") return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
} }
// Connect to etcd // Create image using vitastor-cli
cli, err := clientv3.New(clientv3.Config{ _, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", string(volSize), "--pool", string(poolId) })
DialTimeout: ETCD_TIMEOUT,
Endpoints: etcdUrl,
})
if (err != nil) if (err != nil)
{ {
return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error()) if (strings.Index(err.Error(), "already exists") > 0)
}
defer cli.Close()
var imageId uint64 = 0
for
{
// Check if the image exists
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
cancel()
if (err != nil)
{ {
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error()) stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", volName })
}
if (len(resp.Kvs) > 0)
{
kv := resp.Kvs[0]
var v InodeIndex
err := json.Unmarshal(kv.Value, &v)
if (err != nil) if (err != nil)
{ {
return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error()) return nil, err
} }
poolId = v.PoolId var inodeCfg []InodeConfig
imageId = v.Id err = json.Unmarshal(stat, &inodeCfg)
inodeCfgKey := fmt.Sprintf("/config/inode/%d/%d", poolId, imageId)
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
resp, err := cli.Get(ctx, etcdPrefix+inodeCfgKey)
cancel()
if (err != nil) if (err != nil)
{ {
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error()) return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
} }
if (len(resp.Kvs) == 0) if (len(inodeCfg) == 0)
{ {
return nil, status.Error(codes.Internal, "missing "+inodeCfgKey+" key in etcd") return nil, status.Error(codes.Internal, "vitastor-cli create said that image already exists, but ls can't find it")
} }
var inodeCfg InodeConfig if (inodeCfg[0].Size < uint64(volSize))
err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
if (err != nil)
{
return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
}
if (inodeCfg.Size < uint64(volSize))
{ {
return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected") return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
} }
} }
else else
{ {
// Find a free ID return nil, err
// Create image metadata in a transaction verifying that the image doesn't exist yet AND ID is still free
maxIdKey := fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
resp, err := cli.Get(ctx, maxIdKey)
cancel()
if (err != nil)
{
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
}
var modRev int64
var nextId uint64
if (len(resp.Kvs) > 0)
{
var err error
nextId, err = strconv.ParseUint(string(resp.Kvs[0].Value), 10, 64)
if (err != nil)
{
return nil, status.Error(codes.Internal, maxIdKey+" contains invalid ID")
}
modRev = resp.Kvs[0].ModRevision
nextId++
}
else
{
nextId = 1
}
inodeIdxJson, _ := json.Marshal(InodeIndex{
Id: nextId,
PoolId: poolId,
})
inodeCfgJson, _ := json.Marshal(InodeConfig{
Name: volName,
Size: uint64(volSize),
})
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
txnResp, err := cli.Txn(ctx).If(
clientv3.Compare(clientv3.ModRevision(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)), "=", modRev),
clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)), "=", 0),
clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId)), "=", 0),
).Then(
clientv3.OpPut(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId), fmt.Sprintf("%d", nextId)),
clientv3.OpPut(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName), string(inodeIdxJson)),
clientv3.OpPut(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId), string(inodeCfgJson)),
).Commit()
cancel()
if (err != nil)
{
return nil, status.Error(codes.Internal, "failed to commit transaction in etcd: "+err.Error())
}
if (txnResp.Succeeded)
{
imageId = nextId
break
}
// Start over if the transaction fails
} }
} }
@@ -299,97 +237,12 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
} }
volName := ctxVars["name"] volName := ctxVars["name"]
_, etcdUrl, etcdPrefix := GetConnectionParams(ctxVars) ctxVars, _, _ = GetConnectionParams(ctxVars)
if (len(etcdUrl) == 0)
{
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
}
cli, err := clientv3.New(clientv3.Config{ _, err = invokeCLI(ctxVars, []string{ "rm", volName })
DialTimeout: ETCD_TIMEOUT,
Endpoints: etcdUrl,
})
if (err != nil) if (err != nil)
{ {
return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error()) return nil, err
}
defer cli.Close()
// Find inode by name
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
cancel()
if (err != nil)
{
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
}
if (len(resp.Kvs) == 0)
{
return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
}
var idx InodeIndex
err = json.Unmarshal(resp.Kvs[0].Value, &idx)
if (err != nil)
{
return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
}
// Get inode config
inodeCfgKey := fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
resp, err = cli.Get(ctx, inodeCfgKey)
cancel()
if (err != nil)
{
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
}
if (len(resp.Kvs) == 0)
{
return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
}
var inodeCfg InodeConfig
err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
if (err != nil)
{
return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
}
// Delete inode data by invoking vitastor-cli
args := []string{
"rm-data", "--etcd_address", strings.Join(etcdUrl, ","),
"--pool", fmt.Sprintf("%d", idx.PoolId),
"--inode", fmt.Sprintf("%d", idx.Id),
}
if (ctxVars["configPath"] != "")
{
args = append(args, "--config_path", ctxVars["configPath"])
}
c := exec.Command("/usr/bin/vitastor-cli", args...)
var stderr bytes.Buffer
c.Stdout = nil
c.Stderr = &stderr
err = c.Run()
stderrStr := string(stderr.Bytes())
if (err != nil)
{
klog.Errorf("vitastor-cli rm-data failed: %s, status %s\n", stderrStr, err)
return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
}
// Delete inode config in etcd
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
txnResp, err := cli.Txn(ctx).Then(
clientv3.OpDelete(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)),
clientv3.OpDelete(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)),
).Commit()
cancel()
if (err != nil)
{
return nil, status.Error(codes.Internal, "failed to delete keys in etcd: "+err.Error())
}
if (!txnResp.Succeeded)
{
return nil, status.Error(codes.Internal, "failed to delete keys in etcd: transaction failed")
} }
return &csi.DeleteVolumeResponse{}, nil return &csi.DeleteVolumeResponse{}, nil

4
debian/changelog vendored
View File

@@ -1,10 +1,10 @@
vitastor (0.8.4-1) unstable; urgency=medium vitastor (0.8.6-1) unstable; urgency=medium
* Bugfixes * Bugfixes
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300 -- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
vitastor (0.8.4-1) unstable; urgency=medium vitastor (0.8.6-1) unstable; urgency=medium
* Implement NFS proxy * Implement NFS proxy
* Add documentation * Add documentation

View File

@@ -34,8 +34,8 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \ mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \ rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \ cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-0.8.4; \ cp -r /root/vitastor vitastor-0.8.6; \
cd vitastor-0.8.4; \ cd vitastor-0.8.6; \
ln -s /root/fio-build/fio-*/ ./fio; \ ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \ FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \ ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -48,8 +48,8 @@ RUN set -e -x; \
rm -rf a b; \ rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \ echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL; \ cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.4.orig.tar.xz vitastor-0.8.4; \ tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.6.orig.tar.xz vitastor-0.8.6; \
cd vitastor-0.8.4; \ cd vitastor-0.8.6; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \ V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \ DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \ DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

View File

@@ -19,6 +19,7 @@ between clients, OSDs and etcd.
- [rdma_max_sge](#rdma_max_sge) - [rdma_max_sge](#rdma_max_sge)
- [rdma_max_msg](#rdma_max_msg) - [rdma_max_msg](#rdma_max_msg)
- [rdma_max_recv](#rdma_max_recv) - [rdma_max_recv](#rdma_max_recv)
- [rdma_max_send](#rdma_max_send)
- [peer_connect_interval](#peer_connect_interval) - [peer_connect_interval](#peer_connect_interval)
- [peer_connect_timeout](#peer_connect_timeout) - [peer_connect_timeout](#peer_connect_timeout)
- [osd_idle_timeout](#osd_idle_timeout) - [osd_idle_timeout](#osd_idle_timeout)
@@ -74,6 +75,12 @@ to work. For example, Mellanox ConnectX-3 and older adapters don't have
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
root to list available RDMA devices and their features. root to list available RDMA devices and their features.
Remember that you also have to configure your network switches if you use
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
the manual of your network vendor for details about setting up the switch
for RoCEv2 correctly. Usually it means setting up Lossless Ethernet with
PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
## rdma_port_num ## rdma_port_num
- Type: integer - Type: integer
@@ -116,20 +123,30 @@ required to change this parameter.
## rdma_max_msg ## rdma_max_msg
- Type: integer - Type: integer
- Default: 1048576 - Default: 132096
Maximum size of a single RDMA send or receive operation in bytes. Maximum size of a single RDMA send or receive operation in bytes.
## rdma_max_recv ## rdma_max_recv
- Type: integer
- Default: 16
Maximum number of RDMA receive buffers per connection (RDMA requires
preallocated buffers to receive data). Each buffer is `rdma_max_msg` bytes
in size. So this setting directly affects memory usage: a single Vitastor
RDMA client uses `rdma_max_recv * rdma_max_msg * OSD_COUNT` bytes of memory.
Default is roughly 2 MB * number of OSDs.
## rdma_max_send
- Type: integer - Type: integer
- Default: 8 - Default: 8
Maximum number of parallel RDMA receive operations. Note that this number Maximum number of outstanding RDMA send operations per connection. Should be
of receive buffers `rdma_max_msg` in size are allocated for each client, less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
so this setting actually affects memory usage. This is because RDMA receive Doesn't affect memory usage - additional memory isn't allocated for send
operations are (sadly) still not zero-copy in Vitastor. It may be fixed in operations.
later versions.
## peer_connect_interval ## peer_connect_interval

View File

@@ -19,6 +19,7 @@
- [rdma_max_sge](#rdma_max_sge) - [rdma_max_sge](#rdma_max_sge)
- [rdma_max_msg](#rdma_max_msg) - [rdma_max_msg](#rdma_max_msg)
- [rdma_max_recv](#rdma_max_recv) - [rdma_max_recv](#rdma_max_recv)
- [rdma_max_send](#rdma_max_send)
- [peer_connect_interval](#peer_connect_interval) - [peer_connect_interval](#peer_connect_interval)
- [peer_connect_timeout](#peer_connect_timeout) - [peer_connect_timeout](#peer_connect_timeout)
- [osd_idle_timeout](#osd_idle_timeout) - [osd_idle_timeout](#osd_idle_timeout)
@@ -78,6 +79,13 @@ Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Наприме
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
параметры и возможности. параметры и возможности.
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
нестабильной производительностью. Подробную информацию о настройке
коммутатора для RoCEv2 ищите в документации производителя. Обычно это
подразумевает настройку сети без потерь на основе PFC (Priority Flow
Control) и ECN (Explicit Congestion Notification).
## rdma_port_num ## rdma_port_num
- Тип: целое число - Тип: целое число
@@ -121,22 +129,32 @@ OSD в любом случае согласовывают реальное зн
## rdma_max_msg ## rdma_max_msg
- Тип: целое число - Тип: целое число
- Значение по умолчанию: 1048576 - Значение по умолчанию: 132096
Максимальный размер одной RDMA-операции отправки или приёма. Максимальный размер одной RDMA-операции отправки или приёма.
## rdma_max_recv ## rdma_max_recv
- Тип: целое число
- Значение по умолчанию: 16
Максимальное число буферов для RDMA-приёма данных на одно соединение
(RDMA требует заранее выделенных буферов для приёма данных). Каждый буфер
имеет размер `rdma_max_msg` байт. Таким образом, настройка прямо влияет на
потребление памяти - один Vitastor-клиент с RDMA использует
`rdma_max_recv * rdma_max_msg * ЧИСЛО_OSD` байт памяти, по умолчанию -
примерно 2 МБ * число OSD.
## rdma_max_send
- Тип: целое число - Тип: целое число
- Значение по умолчанию: 8 - Значение по умолчанию: 8
Максимальное число параллельных RDMA-операций получения данных. Следует Максимальное число RDMA-операций отправки, отправляемых в очередь одного
иметь в виду, что данное число буферов размером `rdma_max_msg` выделяется соединения. Желательно, чтобы оно было меньше `rdma_max_recv`, чтобы
для каждого подключённого клиентского соединения, так что данная настройка у принимающей стороны в процессе работы не заканчивались буферы на приём.
влияет на потребление памяти. Это так потому, что RDMA-приём данных в Не влияет на потребление памяти - дополнительная память на операции отправки
Vitastor, увы, всё равно не является zero-copy, т.е. всё равно 1 раз не выделяется.
копирует данные в памяти. Данная особенность, возможно, будет исправлена в
более новых версиях Vitastor.
## peer_connect_interval ## peer_connect_interval

View File

@@ -53,6 +53,12 @@
to work. For example, Mellanox ConnectX-3 and older adapters don't have to work. For example, Mellanox ConnectX-3 and older adapters don't have
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
root to list available RDMA devices and their features. root to list available RDMA devices and their features.
Remember that you also have to configure your network switches if you use
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
the manual of your network vendor for details about setting up the switch
for RoCEv2 correctly. Usually it means setting up Lossless Ethernet with
PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
info_ru: | info_ru: |
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0"). Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
@@ -61,6 +67,13 @@
потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
параметры и возможности. параметры и возможности.
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
нестабильной производительностью. Подробную информацию о настройке
коммутатора для RoCEv2 ищите в документации производителя. Обычно это
подразумевает настройку сети без потерь на основе PFC (Priority Flow
Control) и ECN (Explicit Congestion Notification).
- name: rdma_port_num - name: rdma_port_num
type: int type: int
default: 1 default: 1
@@ -114,26 +127,39 @@
так что менять этот параметр обычно не нужно. так что менять этот параметр обычно не нужно.
- name: rdma_max_msg - name: rdma_max_msg
type: int type: int
default: 1048576 default: 132096
info: Maximum size of a single RDMA send or receive operation in bytes. info: Maximum size of a single RDMA send or receive operation in bytes.
info_ru: Максимальный размер одной RDMA-операции отправки или приёма. info_ru: Максимальный размер одной RDMA-операции отправки или приёма.
- name: rdma_max_recv - name: rdma_max_recv
type: int
default: 16
info: |
Maximum number of RDMA receive buffers per connection (RDMA requires
preallocated buffers to receive data). Each buffer is `rdma_max_msg` bytes
in size. So this setting directly affects memory usage: a single Vitastor
RDMA client uses `rdma_max_recv * rdma_max_msg * OSD_COUNT` bytes of memory.
Default is roughly 2 MB * number of OSDs.
info_ru: |
Максимальное число буферов для RDMA-приёма данных на одно соединение
(RDMA требует заранее выделенных буферов для приёма данных). Каждый буфер
имеет размер `rdma_max_msg` байт. Таким образом, настройка прямо влияет на
потребление памяти - один Vitastor-клиент с RDMA использует
`rdma_max_recv * rdma_max_msg * ЧИСЛО_OSD` байт памяти, по умолчанию -
примерно 2 МБ * число OSD.
- name: rdma_max_send
type: int type: int
default: 8 default: 8
info: | info: |
Maximum number of parallel RDMA receive operations. Note that this number Maximum number of outstanding RDMA send operations per connection. Should be
of receive buffers `rdma_max_msg` in size are allocated for each client, less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
so this setting actually affects memory usage. This is because RDMA receive Doesn't affect memory usage - additional memory isn't allocated for send
operations are (sadly) still not zero-copy in Vitastor. It may be fixed in operations.
later versions.
info_ru: | info_ru: |
Максимальное число параллельных RDMA-операций получения данных. Следует Максимальное число RDMA-операций отправки, отправляемых в очередь одного
иметь в виду, что данное число буферов размером `rdma_max_msg` выделяется соединения. Желательно, чтобы оно было меньше `rdma_max_recv`, чтобы
для каждого подключённого клиентского соединения, так что данная настройка у принимающей стороны в процессе работы не заканчивались буферы на приём.
влияет на потребление памяти. Это так потому, что RDMA-приём данных в Не влияет на потребление памяти - дополнительная память на операции отправки
Vitastor, увы, всё равно не является zero-copy, т.е. всё равно 1 раз не выделяется.
копирует данные в памяти. Данная особенность, возможно, будет исправлена в
более новых версиях Vitastor.
- name: peer_connect_interval - name: peer_connect_interval
type: sec type: sec
min: 1 min: 1

View File

@@ -20,8 +20,8 @@
## CentOS ## CentOS
- Add Vitastor package repository: - Add Vitastor package repository:
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm` - CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release.rpm`
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm` - CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release.rpm`
- Enable EPEL: `yum/dnf install epel-release` - Enable EPEL: `yum/dnf install epel-release`
- Enable additional CentOS repositories: - Enable additional CentOS repositories:
- CentOS 7: `yum install centos-release-scl` - CentOS 7: `yum install centos-release-scl`

View File

@@ -20,8 +20,8 @@
## CentOS ## CentOS
- Добавьте в систему репозиторий Vitastor: - Добавьте в систему репозиторий Vitastor:
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm` - CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release.rpm`
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm` - CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release.rpm`
- Включите EPEL: `yum/dnf install epel-release` - Включите EPEL: `yum/dnf install epel-release`
- Включите дополнительные репозитории CentOS: - Включите дополнительные репозитории CentOS:
- CentOS 7: `yum install centos-release-scl` - CentOS 7: `yum install centos-release-scl`

View File

@@ -1,4 +1,4 @@
[Documentation](../../README.md#documentation) → Usage → Disk Tool [Documentation](../../README.md#documentation) → Usage → Disk management tool
----- -----

View File

@@ -1,4 +1,4 @@
[Документация](../../README-ru.md#документация) → Использование → Управление дисками [Документация](../../README-ru.md#документация) → Использование → Инструмент управления дисками
----- -----

2
json11

Submodule json11 updated: 52a3af664f...fd37016cf8

View File

@@ -79,7 +79,7 @@ StartLimitInterval=0
RestartSec=10 RestartSec=10
[Install] [Install]
WantedBy=local.target WantedBy=multi-user.target
`); `);
await system(`useradd etcd`); await system(`useradd etcd`);
await system(`systemctl daemon-reload`); await system(`systemctl daemon-reload`);

View File

@@ -70,9 +70,9 @@ const etcd_tree = {
rdma_gid_index: 0, rdma_gid_index: 0,
rdma_mtu: 4096, rdma_mtu: 4096,
rdma_max_sge: 128, rdma_max_sge: 128,
rdma_max_send: 32, rdma_max_send: 8,
rdma_max_recv: 8, rdma_max_recv: 16,
rdma_max_msg: 1048576, rdma_max_msg: 132096,
log_level: 0, log_level: 0,
block_size: 131072, block_size: 131072,
disk_alignment: 4096, disk_alignment: 4096,
@@ -261,7 +261,7 @@ const etcd_tree = {
/* <pool_id>: { /* <pool_id>: {
<pg_id>: { <pg_id>: {
primary: osd_num_t, primary: osd_num_t,
state: ("starting"|"peering"|"peered"|"incomplete"|"active"|"repeering"|"stopping"|"offline"| state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"| "degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
"has_invalid"|"left_on_dead")[], "has_invalid"|"left_on_dead")[],
} }

View File

@@ -139,6 +139,7 @@ sub options
{ {
return { return {
shared => { optional => 1 }, shared => { optional => 1 },
content => { optional => 1 },
nodes => { optional => 1 }, nodes => { optional => 1 },
disable => { optional => 1 }, disable => { optional => 1 },
vitastor_etcd_address => { optional => 1 }, vitastor_etcd_address => { optional => 1 },

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver from cinder.volume import driver
from cinder.volume import volume_utils from cinder.volume import volume_utils
VERSION = '0.8.4' VERSION = '0.8.6'
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)

View File

@@ -25,4 +25,4 @@ rm fio
mv fio-copy fio mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'` FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-0.8.4/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.4$(rpm --eval '%dist').tar.gz * tar --transform 's#^#vitastor-0.8.6/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.6$(rpm --eval '%dist').tar.gz *

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \ RUN set -e; \
cd /root/vitastor/rpm; \ cd /root/vitastor/rpm; \
sh build-tarball.sh; \ sh build-tarball.sh; \
cp /root/vitastor-0.8.4.el7.tar.gz ~/rpmbuild/SOURCES; \ cp /root/vitastor-0.8.6.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \ cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \ cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \ rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor Name: vitastor
Version: 0.8.4 Version: 0.8.6
Release: 1%{?dist} Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1 License: Vitastor Network Public License 1.1
URL: https://vitastor.io/ URL: https://vitastor.io/
Source0: vitastor-0.8.4.el7.tar.gz Source0: vitastor-0.8.6.el7.tar.gz
BuildRequires: liburing-devel >= 0.6 BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel BuildRequires: gperftools-devel
@@ -35,6 +35,7 @@ Summary: Vitastor - OSD
Requires: libJerasure2 Requires: libJerasure2
Requires: libisa-l Requires: libisa-l
Requires: liburing >= 0.6 Requires: liburing >= 0.6
Requires: liburing < 2
Requires: vitastor-client = %{version}-%{release} Requires: vitastor-client = %{version}-%{release}
Requires: util-linux Requires: util-linux
Requires: parted Requires: parted
@@ -59,6 +60,7 @@ scheduling cluster-level operations.
%package -n vitastor-client %package -n vitastor-client
Summary: Vitastor - client Summary: Vitastor - client
Requires: liburing >= 0.6 Requires: liburing >= 0.6
Requires: liburing < 2
%description -n vitastor-client %description -n vitastor-client

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \ RUN set -e; \
cd /root/vitastor/rpm; \ cd /root/vitastor/rpm; \
sh build-tarball.sh; \ sh build-tarball.sh; \
cp /root/vitastor-0.8.4.el8.tar.gz ~/rpmbuild/SOURCES; \ cp /root/vitastor-0.8.6.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \ cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \ cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \ rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor Name: vitastor
Version: 0.8.4 Version: 0.8.6
Release: 1%{?dist} Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1 License: Vitastor Network Public License 1.1
URL: https://vitastor.io/ URL: https://vitastor.io/
Source0: vitastor-0.8.4.el8.tar.gz Source0: vitastor-0.8.6.el8.tar.gz
BuildRequires: liburing-devel >= 0.6 BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel BuildRequires: gperftools-devel
@@ -34,6 +34,7 @@ Summary: Vitastor - OSD
Requires: libJerasure2 Requires: libJerasure2
Requires: libisa-l Requires: libisa-l
Requires: liburing >= 0.6 Requires: liburing >= 0.6
Requires: liburing < 2
Requires: vitastor-client = %{version}-%{release} Requires: vitastor-client = %{version}-%{release}
Requires: util-linux Requires: util-linux
Requires: parted Requires: parted
@@ -57,6 +58,7 @@ scheduling cluster-level operations.
%package -n vitastor-client %package -n vitastor-client
Summary: Vitastor - client Summary: Vitastor - client
Requires: liburing >= 0.6 Requires: liburing >= 0.6
Requires: liburing < 2
%description -n vitastor-client %description -n vitastor-client

View File

@@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 2.8)
project(vitastor) project(vitastor)
include(GNUInstallDirs) include(GNUInstallDirs)
include(CTest)
set(WITH_QEMU false CACHE BOOL "Build QEMU driver inside Vitastor source tree") set(WITH_QEMU false CACHE BOOL "Build QEMU driver inside Vitastor source tree")
set(WITH_FIO true CACHE BOOL "Build FIO driver") set(WITH_FIO true CACHE BOOL "Build FIO driver")
@@ -15,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif() endif()
add_definitions(-DVERSION="0.8.4") add_definitions(-DVERSION="0.8.6")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src) add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
if (${WITH_ASAN}) if (${WITH_ASAN})
add_definitions(-fsanitize=address -fno-omit-frame-pointer) add_definitions(-fsanitize=address -fno-omit-frame-pointer)
@@ -55,6 +56,14 @@ if (ISAL_LIBRARIES)
add_definitions(-DWITH_ISAL) add_definitions(-DWITH_ISAL)
endif (ISAL_LIBRARIES) endif (ISAL_LIBRARIES)
add_custom_target(build_tests)
add_custom_target(test
COMMAND
echo leak:tcmalloc > ${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt &&
env LSAN_OPTIONS=suppressions=${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt ${CMAKE_CTEST_COMMAND}
)
add_dependencies(test build_tests)
include_directories( include_directories(
../ ../
/usr/include/jerasure /usr/include/jerasure
@@ -145,7 +154,6 @@ add_library(vitastor_client SHARED
set_target_properties(vitastor_client PROPERTIES PUBLIC_HEADER "vitastor_c.h") set_target_properties(vitastor_client PROPERTIES PUBLIC_HEADER "vitastor_c.h")
target_link_libraries(vitastor_client target_link_libraries(vitastor_client
vitastor_common vitastor_common
tcmalloc_minimal
${LIBURING_LIBRARIES} ${LIBURING_LIBRARIES}
${IBVERBS_LIBRARIES} ${IBVERBS_LIBRARIES}
) )
@@ -235,14 +243,17 @@ add_executable(osd_test osd_test.cpp rw_blocking.cpp addr_util.cpp)
target_link_libraries(osd_test tcmalloc_minimal) target_link_libraries(osd_test tcmalloc_minimal)
# osd_rmw_test # osd_rmw_test
# FIXME: Move to tests add_executable(osd_rmw_test EXCLUDE_FROM_ALL osd_rmw_test.cpp allocator.cpp)
add_executable(osd_rmw_test osd_rmw_test.cpp allocator.cpp)
target_link_libraries(osd_rmw_test Jerasure ${ISAL_LIBRARIES} tcmalloc_minimal) target_link_libraries(osd_rmw_test Jerasure ${ISAL_LIBRARIES} tcmalloc_minimal)
add_dependencies(build_tests osd_rmw_test)
add_test(NAME osd_rmw_test COMMAND osd_rmw_test)
if (ISAL_LIBRARIES) if (ISAL_LIBRARIES)
add_executable(osd_rmw_test_je osd_rmw_test.cpp allocator.cpp) add_executable(osd_rmw_test_je EXCLUDE_FROM_ALL osd_rmw_test.cpp allocator.cpp)
target_compile_definitions(osd_rmw_test_je PUBLIC -DNO_ISAL) target_compile_definitions(osd_rmw_test_je PUBLIC -DNO_ISAL)
target_link_libraries(osd_rmw_test_je Jerasure tcmalloc_minimal) target_link_libraries(osd_rmw_test_je Jerasure tcmalloc_minimal)
add_dependencies(build_tests osd_rmw_test_je)
add_test(NAME osd_rmw_test_jerasure COMMAND osd_rmw_test_je)
endif (ISAL_LIBRARIES) endif (ISAL_LIBRARIES)
# stub_uring_osd # stub_uring_osd
@@ -257,11 +268,15 @@ target_link_libraries(stub_uring_osd
) )
# osd_peering_pg_test # osd_peering_pg_test
add_executable(osd_peering_pg_test osd_peering_pg_test.cpp osd_peering_pg.cpp) add_executable(osd_peering_pg_test EXCLUDE_FROM_ALL osd_peering_pg_test.cpp osd_peering_pg.cpp)
target_link_libraries(osd_peering_pg_test tcmalloc_minimal) target_link_libraries(osd_peering_pg_test tcmalloc_minimal)
add_dependencies(build_tests osd_peering_pg_test)
add_test(NAME osd_peering_pg_test COMMAND osd_peering_pg_test)
# test_allocator # test_allocator
add_executable(test_allocator test_allocator.cpp allocator.cpp) add_executable(test_allocator EXCLUDE_FROM_ALL test_allocator.cpp allocator.cpp)
add_dependencies(build_tests test_allocator)
add_test(NAME test_allocator COMMAND test_allocator)
# test_cas # test_cas
add_executable(test_cas add_executable(test_cas
@@ -281,12 +296,15 @@ target_link_libraries(test_crc32
# test_cluster_client # test_cluster_client
add_executable(test_cluster_client add_executable(test_cluster_client
EXCLUDE_FROM_ALL
test_cluster_client.cpp test_cluster_client.cpp
pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp
) )
target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__) target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mock) target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mock)
add_dependencies(build_tests test_cluster_client)
add_test(NAME test_cluster_client COMMAND test_cluster_client)
## test_blockstore, test_shit ## test_blockstore, test_shit
#add_executable(test_blockstore test_blockstore.cpp) #add_executable(test_blockstore test_blockstore.cpp)

View File

@@ -325,7 +325,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
{ {
// Basic verification not passed // Basic verification not passed
op->retval = -EINVAL; op->retval = -EINVAL;
std::function<void (blockstore_op_t*)>(op->callback)(op); ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
return; return;
} }
if (op->opcode == BS_OP_SYNC_STAB_ALL) if (op->opcode == BS_OP_SYNC_STAB_ALL)
@@ -368,7 +368,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
} }
if ((op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE) && !enqueue_write(op)) if ((op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE) && !enqueue_write(op))
{ {
std::function<void (blockstore_op_t*)>(op->callback)(op); ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
return; return;
} }
// Call constructor without allocating memory. We'll call destructor before returning op back // Call constructor without allocating memory. We'll call destructor before returning op back

View File

@@ -121,8 +121,7 @@ resume_1:
} }
if (pool_cfg.scheme != POOL_SCHEME_REPLICATED) if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
{ {
uint64_t pg_real_size = pool_stats[pool_cfg.id]["pg_real_size"].uint64_value(); pool_avail *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
pool_avail = pg_real_size > 0 ? pool_avail * (pool_cfg.pg_size - pool_cfg.parity_chunks) / pg_real_size : 0;
} }
pool_stats[pool_cfg.id] = json11::Json::object { pool_stats[pool_cfg.id] = json11::Json::object {
{ "name", pool_cfg.name }, { "name", pool_cfg.name },

View File

@@ -92,6 +92,7 @@ struct rm_inode_t
void send_ops(rm_pg_t *cur_list) void send_ops(rm_pg_t *cur_list)
{ {
parent->cli->init_msgr();
if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) == if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
parent->cli->msgr.osd_peer_fds.end()) parent->cli->msgr.osd_peer_fds.end())
{ {

View File

@@ -59,7 +59,6 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
delete op; delete op;
}; };
msgr.parse_config(this->config); msgr.parse_config(this->config);
msgr.init();
st_cli.tfd = tfd; st_cli.tfd = tfd;
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); }; st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
@@ -73,17 +72,6 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
scrap_buffer_size = SCRAP_BUFFER_SIZE; scrap_buffer_size = SCRAP_BUFFER_SIZE;
scrap_buffer = malloc_or_die(scrap_buffer_size); scrap_buffer = malloc_or_die(scrap_buffer_size);
if (ringloop)
{
consumer.loop = [this]()
{
msgr.read_requests();
msgr.send_replies();
this->ringloop->submit();
};
ringloop->register_consumer(&consumer);
}
} }
cluster_client_t::~cluster_client_t() cluster_client_t::~cluster_client_t()
@@ -115,6 +103,24 @@ cluster_op_t::~cluster_op_t()
} }
} }
void cluster_client_t::init_msgr()
{
if (msgr_initialized)
return;
msgr.init();
msgr_initialized = true;
if (ringloop)
{
consumer.loop = [this]()
{
msgr.read_requests();
msgr.send_replies();
this->ringloop->submit();
};
ringloop->register_consumer(&consumer);
}
}
void cluster_client_t::calc_wait(cluster_op_t *op) void cluster_client_t::calc_wait(cluster_op_t *op)
{ {
op->prev_wait = 0; op->prev_wait = 0;
@@ -223,11 +229,14 @@ void cluster_client_t::erase_op(cluster_op_t *op)
if (op_queue_tail == op) if (op_queue_tail == op)
op_queue_tail = op->prev; op_queue_tail = op->prev;
op->next = op->prev = NULL; op->next = op->prev = NULL;
if (flags & OP_FLUSH_BUFFER)
std::function<void(cluster_op_t*)>(op->callback)(op);
if (!(flags & OP_IMMEDIATE_COMMIT)) if (!(flags & OP_IMMEDIATE_COMMIT))
inc_wait(opcode, flags, next, -1); inc_wait(opcode, flags, next, -1);
// Call callback at the end to avoid inconsistencies in prev_wait // Call callback at the end to avoid inconsistencies in prev_wait
// if the callback adds more operations itself // if the callback adds more operations itself
std::function<void(cluster_op_t*)>(op->callback)(op); if (!(flags & OP_FLUSH_BUFFER))
std::function<void(cluster_op_t*)>(op->callback)(op);
} }
void cluster_client_t::continue_ops(bool up_retry) void cluster_client_t::continue_ops(bool up_retry)
@@ -757,7 +766,10 @@ resume_3:
{ {
for (int i = 0; i < op->parts.size(); i++) for (int i = 0; i < op->parts.size(); i++)
{ {
op->parts[i].flags = PART_RETRY; if (!(op->parts[i].flags & PART_DONE))
{
op->parts[i].flags = PART_RETRY;
}
} }
goto resume_2; goto resume_2;
} }
@@ -915,6 +927,10 @@ bool cluster_client_t::affects_osd(uint64_t inode, uint64_t offset, uint64_t len
bool cluster_client_t::try_send(cluster_op_t *op, int i) bool cluster_client_t::try_send(cluster_op_t *op, int i)
{ {
if (!msgr_initialized)
{
init_msgr();
}
auto part = &op->parts[i]; auto part = &op->parts[i];
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->cur_inode)); auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->cur_inode));
auto pg_it = pool_cfg.pg_config.find(part->pg_num); auto pg_it = pool_cfg.pg_config.find(part->pg_num);

View File

@@ -104,10 +104,14 @@ class cluster_client_t
std::vector<std::function<void(void)>> on_ready_hooks; std::vector<std::function<void(void)>> on_ready_hooks;
std::vector<inode_list_t*> lists; std::vector<inode_list_t*> lists;
int continuing_ops = 0; int continuing_ops = 0;
bool msgr_initialized = false;
public: public:
etcd_state_client_t st_cli; etcd_state_client_t st_cli;
osd_messenger_t msgr; osd_messenger_t msgr;
void init_msgr();
json11::Json config; json11::Json config;
json11::Json::object merged_config; json11::Json::object merged_config;

View File

@@ -305,10 +305,10 @@ int write_zero(int fd, uint64_t offset, uint64_t size)
json11::Json read_parttable(std::string dev) json11::Json read_parttable(std::string dev)
{ {
std::string part_dump; std::string part_dump;
int r = shell_exec({ "sfdisk", "--dump", dev, "--json" }, "", &part_dump, NULL); int r = shell_exec({ "sfdisk", "--json", dev }, "", &part_dump, NULL);
if (r == 255) if (r == 255)
{ {
fprintf(stderr, "Error running sfdisk --dump %s --json\n", dev.c_str()); fprintf(stderr, "Error running sfdisk --json %s\n", dev.c_str());
return json11::Json(false); return json11::Json(false);
} }
// Decode partition table // Decode partition table
@@ -319,7 +319,7 @@ json11::Json read_parttable(std::string dev)
pt = json11::Json::parse(part_dump, err); pt = json11::Json::parse(part_dump, err);
if (err != "") if (err != "")
{ {
fprintf(stderr, "sfdisk --dump %s --json returned bad JSON: %s\n", dev.c_str(), part_dump.c_str()); fprintf(stderr, "sfdisk --json %s returned bad JSON: %s\n", dev.c_str(), part_dump.c_str());
return json11::Json(false); return json11::Json(false);
} }
pt = pt["partitiontable"]; pt = pt["partitiontable"];

View File

@@ -157,10 +157,10 @@ void osd_messenger_t::parse_config(const json11::Json & config)
this->rdma_max_sge = 128; this->rdma_max_sge = 128;
this->rdma_max_send = config["rdma_max_send"].uint64_value(); this->rdma_max_send = config["rdma_max_send"].uint64_value();
if (!this->rdma_max_send) if (!this->rdma_max_send)
this->rdma_max_send = 1; this->rdma_max_send = 8;
this->rdma_max_recv = config["rdma_max_recv"].uint64_value(); this->rdma_max_recv = config["rdma_max_recv"].uint64_value();
if (!this->rdma_max_recv) if (!this->rdma_max_recv)
this->rdma_max_recv = 128; this->rdma_max_recv = 16;
this->rdma_max_msg = config["rdma_max_msg"].uint64_value(); this->rdma_max_msg = config["rdma_max_msg"].uint64_value();
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024) if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
this->rdma_max_msg = 129*1024; this->rdma_max_msg = 129*1024;

View File

@@ -138,6 +138,7 @@ protected:
std::vector<int> read_ready_clients; std::vector<int> read_ready_clients;
std::vector<int> write_ready_clients; std::vector<int> write_ready_clients;
// We don't use ringloop->set_immediate here because we may have no ringloop in client :)
std::vector<std::function<void()>> set_immediate; std::vector<std::function<void()>> set_immediate;
public: public:

View File

@@ -368,9 +368,8 @@ static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
bool osd_messenger_t::try_send_rdma(osd_client_t *cl) bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
{ {
auto rc = cl->rdma_conn; auto rc = cl->rdma_conn;
if (!cl->send_list.size() || rc->cur_send > 0) if (!cl->send_list.size() || rc->cur_send >= rc->max_send)
{ {
// Only send one batch at a time
return true; return true;
} }
uint64_t op_size = 0, op_sge = 0; uint64_t op_size = 0, op_sge = 0;
@@ -380,6 +379,7 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
iovec & iov = cl->send_list[rc->send_pos]; iovec & iov = cl->send_list[rc->send_pos];
if (op_size >= rc->max_msg || op_sge >= rc->max_sge) if (op_size >= rc->max_msg || op_sge >= rc->max_sge)
{ {
rc->send_sizes.push_back(op_size);
try_send_rdma_wr(cl, sge, op_sge); try_send_rdma_wr(cl, sge, op_sge);
op_sge = 0; op_sge = 0;
op_size = 0; op_size = 0;
@@ -405,18 +405,24 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
} }
if (op_sge > 0) if (op_sge > 0)
{ {
rc->send_sizes.push_back(op_size);
try_send_rdma_wr(cl, sge, op_sge); try_send_rdma_wr(cl, sge, op_sge);
} }
return true; return true;
} }
static void try_recv_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge) static void try_recv_rdma_wr(osd_client_t *cl, void *buf)
{ {
ibv_sge sge = {
.addr = (uintptr_t)buf,
.length = (uint32_t)cl->rdma_conn->max_msg,
.lkey = cl->rdma_conn->ctx->mr->lkey,
};
ibv_recv_wr *bad_wr = NULL; ibv_recv_wr *bad_wr = NULL;
ibv_recv_wr wr = { ibv_recv_wr wr = {
.wr_id = (uint64_t)(cl->peer_fd*2), .wr_id = (uint64_t)(cl->peer_fd*2),
.sg_list = sge, .sg_list = &sge,
.num_sge = op_sge, .num_sge = 1,
}; };
int err = ibv_post_recv(cl->rdma_conn->qp, &wr, &bad_wr); int err = ibv_post_recv(cl->rdma_conn->qp, &wr, &bad_wr);
if (err || bad_wr) if (err || bad_wr)
@@ -434,12 +440,7 @@ bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
{ {
void *buf = malloc_or_die(rc->max_msg); void *buf = malloc_or_die(rc->max_msg);
rc->recv_buffers.push_back(buf); rc->recv_buffers.push_back(buf);
ibv_sge sge = { try_recv_rdma_wr(cl, buf);
.addr = (uintptr_t)buf,
.length = (uint32_t)rc->max_msg,
.lkey = rc->ctx->mr->lkey,
};
try_recv_rdma_wr(cl, &sge, 1);
} }
return true; return true;
} }
@@ -476,6 +477,7 @@ void osd_messenger_t::handle_rdma_events()
continue; continue;
} }
osd_client_t *cl = cl_it->second; osd_client_t *cl = cl_it->second;
auto rc = cl->rdma_conn;
if (wc[i].status != IBV_WC_SUCCESS) if (wc[i].status != IBV_WC_SUCCESS)
{ {
fprintf(stderr, "RDMA work request failed for client %d", client_id); fprintf(stderr, "RDMA work request failed for client %d", client_id);
@@ -489,44 +491,59 @@ void osd_messenger_t::handle_rdma_events()
} }
if (!is_send) if (!is_send)
{ {
cl->rdma_conn->cur_recv--; rc->cur_recv--;
if (!handle_read_buffer(cl, cl->rdma_conn->recv_buffers[0], wc[i].byte_len)) if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf], wc[i].byte_len))
{ {
// handle_read_buffer may stop the client // handle_read_buffer may stop the client
continue; continue;
} }
free(cl->rdma_conn->recv_buffers[0]); try_recv_rdma_wr(cl, rc->recv_buffers[rc->next_recv_buf]);
cl->rdma_conn->recv_buffers.erase(cl->rdma_conn->recv_buffers.begin(), cl->rdma_conn->recv_buffers.begin()+1); rc->next_recv_buf = (rc->next_recv_buf+1) % rc->recv_buffers.size();
try_recv_rdma(cl);
} }
else else
{ {
cl->rdma_conn->cur_send--; rc->cur_send--;
if (!cl->rdma_conn->cur_send) uint64_t sent_size = rc->send_sizes.at(0);
rc->send_sizes.erase(rc->send_sizes.begin(), rc->send_sizes.begin()+1);
int send_pos = 0, send_buf_pos = 0;
while (sent_size > 0)
{ {
// Wait for the whole batch if (sent_size >= cl->send_list.at(send_pos).iov_len)
for (int i = 0; i < cl->rdma_conn->send_pos; i++)
{ {
if (cl->outbox[i].flags & MSGR_SENDP_FREE) sent_size -= cl->send_list[send_pos].iov_len;
{ send_pos++;
// Reply fully sent
delete cl->outbox[i].op;
}
} }
if (cl->rdma_conn->send_pos > 0) else
{ {
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+cl->rdma_conn->send_pos); send_buf_pos = sent_size;
cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+cl->rdma_conn->send_pos); sent_size = 0;
cl->rdma_conn->send_pos = 0;
} }
if (cl->rdma_conn->send_buf_pos > 0)
{
cl->send_list[0].iov_base = (uint8_t*)cl->send_list[0].iov_base + cl->rdma_conn->send_buf_pos;
cl->send_list[0].iov_len -= cl->rdma_conn->send_buf_pos;
cl->rdma_conn->send_buf_pos = 0;
}
try_send_rdma(cl);
} }
assert(rc->send_pos >= send_pos);
if (rc->send_pos == send_pos)
{
rc->send_buf_pos -= send_buf_pos;
}
rc->send_pos -= send_pos;
for (int i = 0; i < send_pos; i++)
{
if (cl->outbox[i].flags & MSGR_SENDP_FREE)
{
// Reply fully sent
delete cl->outbox[i].op;
}
}
if (send_pos > 0)
{
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+send_pos);
cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+send_pos);
}
if (send_buf_pos > 0)
{
cl->send_list[0].iov_base = (uint8_t*)cl->send_list[0].iov_base + send_buf_pos;
cl->send_list[0].iov_len -= send_buf_pos;
}
try_send_rdma(cl);
} }
} }
} while (event_count > 0); } while (event_count > 0);

View File

@@ -49,8 +49,9 @@ struct msgr_rdma_connection_t
uint64_t max_msg = 0; uint64_t max_msg = 0;
int send_pos = 0, send_buf_pos = 0; int send_pos = 0, send_buf_pos = 0;
int recv_pos = 0, recv_buf_pos = 0; int next_recv_buf = 0;
std::vector<void*> recv_buffers; std::vector<void*> recv_buffers;
std::vector<uint64_t> send_sizes;
~msgr_rdma_connection_t(); ~msgr_rdma_connection_t();
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg); static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);

View File

@@ -683,7 +683,7 @@ void osd_t::apply_pg_config()
auto vec_all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end()); auto vec_all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end());
if (currently_taken) if (currently_taken)
{ {
if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING | PG_PEERED)) if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING))
{ {
if (pg_it->second.target_set == pg_cfg.target_set && if (pg_it->second.target_set == pg_cfg.target_set &&
pg_it->second.target_history == pg_cfg.target_history && pg_it->second.target_history == pg_cfg.target_history &&
@@ -963,13 +963,6 @@ void osd_t::report_pg_states()
} }
this->pgs.erase(pg_it); this->pgs.erase(pg_it);
} }
else if (pg_it->second.state & PG_PEERED)
{
// Activate PG after PG PEERED state is reported along with history
// (if the state wasn't changed again)
pg_it->second.state = pg_it->second.state & ~PG_PEERED | PG_ACTIVE;
report_pg_state(pg_it->second);
}
} }
} }
// Push other PG state updates, if any // Push other PG state updates, if any

View File

@@ -50,10 +50,6 @@ void osd_t::handle_peers()
still = true; still = true;
} }
} }
else if (p.second.state & PG_PEERED)
{
still = true;
}
} }
if (!still) if (!still)
{ {
@@ -74,10 +70,6 @@ void osd_t::handle_peers()
} }
still = true; still = true;
} }
else if (p.second.state & PG_PEERED)
{
still = true;
}
} }
if (!still) if (!still)
{ {
@@ -100,7 +92,7 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
{ {
auto & pg = p.second; auto & pg = p.second;
bool repeer = false; bool repeer = false;
if (pg.state & (PG_PEERING | PG_PEERED | PG_ACTIVE | PG_INCOMPLETE)) if (pg.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
{ {
for (osd_num_t pg_osd: pg.all_peers) for (osd_num_t pg_osd: pg.all_peers)
{ {

View File

@@ -88,13 +88,9 @@ void pg_obj_state_check_t::walk()
{ {
// Activate as degraded // Activate as degraded
// Current OSD set will be added into target_history on first write // Current OSD set will be added into target_history on first write
pg->state |= PG_DEGRADED | PG_PEERED; pg->state |= PG_DEGRADED;
}
else
{
// Just activate
pg->state |= PG_ACTIVE;
} }
pg->state |= PG_ACTIVE;
if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size()) if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size())
{ {
pg->state |= PG_LEFT_ON_DEAD; pg->state |= PG_LEFT_ON_DEAD;
@@ -460,11 +456,10 @@ void pg_t::calc_object_states(int log_level)
void pg_t::print_state() void pg_t::print_state()
{ {
printf( printf(
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num, "[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
(state & PG_STARTING) ? "starting" : "", (state & PG_STARTING) ? "starting" : "",
(state & PG_OFFLINE) ? "offline" : "", (state & PG_OFFLINE) ? "offline" : "",
(state & PG_PEERING) ? "peering" : "", (state & PG_PEERING) ? "peering" : "",
(state & PG_PEERED) ? "peered" : "",
(state & PG_INCOMPLETE) ? "incomplete" : "", (state & PG_INCOMPLETE) ? "incomplete" : "",
(state & PG_ACTIVE) ? "active" : "", (state & PG_ACTIVE) ? "active" : "",
(state & PG_REPEERING) ? "repeering" : "", (state & PG_REPEERING) ? "repeering" : "",

View File

@@ -54,5 +54,6 @@ int main(int argc, char *argv[])
{ {
printf("dev: state=%lx\n", it.second.state); printf("dev: state=%lx\n", it.second.state);
} }
delete pg.peering_state;
return 0; return 0;
} }

View File

@@ -297,7 +297,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
// Fail it immediately // Fail it immediately
subop->peer_fd = -1; subop->peer_fd = -1;
subop->reply.hdr.retval = -EPIPE; subop->reply.hdr.retval = -EPIPE;
subop->callback(subop); ringloop->set_immediate([subop]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
} }
subop_idx++; subop_idx++;
} }

View File

@@ -235,7 +235,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
// Fail it immediately // Fail it immediately
subop->peer_fd = -1; subop->peer_fd = -1;
subop->reply.hdr.retval = -EPIPE; subop->reply.hdr.retval = -EPIPE;
subop->callback(subop); ringloop->set_immediate([subop]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
} }
} }
i++; i++;
@@ -520,7 +520,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
// Fail it immediately // Fail it immediately
subops[i].peer_fd = -1; subops[i].peer_fd = -1;
subops[i].reply.hdr.retval = -EPIPE; subops[i].reply.hdr.retval = -EPIPE;
subops[i].callback(&subops[i]); ringloop->set_immediate([subop = &subops[i]]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
} }
} }
} }
@@ -635,7 +635,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
// Fail it immediately // Fail it immediately
subops[i].peer_fd = -1; subops[i].peer_fd = -1;
subops[i].reply.hdr.retval = -EPIPE; subops[i].reply.hdr.retval = -EPIPE;
subops[i].callback(&subops[i]); ringloop->set_immediate([subop = &subops[i]]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
} }
} }
} }

View File

@@ -168,8 +168,8 @@ resume_3:
auto it = std::lower_bound(pg.target_history.begin(), pg.target_history.end(), history_set); auto it = std::lower_bound(pg.target_history.begin(), pg.target_history.end(), history_set);
if (it == pg.target_history.end() || *it != history_set) if (it == pg.target_history.end() || *it != history_set)
pg.target_history.insert(it, history_set); pg.target_history.insert(it, history_set);
pg.history_changed = true;
} }
pg.history_changed = true;
report_pg_states(); report_pg_states();
resume_10: resume_10:
if (pg.epoch > pg.reported_epoch) if (pg.epoch > pg.reported_epoch)

View File

@@ -759,7 +759,18 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_granularity, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_granularity,
uint32_t &start, uint32_t &end) uint32_t &start, uint32_t &end)
{ {
if (write_osd_set[pg_minsize] != 0 || write_osd_set != read_osd_set) bool required = false;
for (int role = pg_minsize; role < pg_size; role++)
{
if (write_osd_set[role] != 0)
{
// Whole parity chunk is needed when we move the object
if (write_osd_set[role] != read_osd_set[role])
end = chunk_size;
required = true;
}
}
if (required && end != chunk_size)
{ {
// start & end are required for calc_rmw_parity // start & end are required for calc_rmw_parity
for (int role = 0; role < pg_minsize; role++) for (int role = 0; role < pg_minsize; role++)
@@ -770,14 +781,6 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
end = std::max(stripes[role].req_end, end); end = std::max(stripes[role].req_end, end);
} }
} }
for (int role = pg_minsize; role < pg_size; role++)
{
if (write_osd_set[role] != 0 && write_osd_set[role] != read_osd_set[role])
{
start = 0;
end = chunk_size;
}
}
} }
// Set bitmap bits accordingly // Set bitmap bits accordingly
if (bitmap_granularity > 0) if (bitmap_granularity > 0)

View File

@@ -24,7 +24,7 @@ void test11();
void test12(); void test12();
void test13(); void test13();
void test14(); void test14();
void test15(); void test15(bool second);
void test16(); void test16();
int main(int narg, char *args[]) int main(int narg, char *args[])
@@ -54,7 +54,8 @@ int main(int narg, char *args[])
// Test 14 // Test 14
test14(); test14();
// Test 15 // Test 15
test15(); test15(false);
test15(true);
// Test 16 // Test 16
test16(); test16();
// End // End
@@ -826,12 +827,11 @@ void test14()
***/ ***/
void test15() void test15(bool second)
{ {
const int bmp = 64*1024 / 4096 / 8; const int bmp = 64*1024 / 4096 / 8;
use_ec(4, 2, true); use_ec(4, 2, true);
osd_num_t osd_set[4] = { 1, 2, 3, 0 }; osd_num_t osd_set[4] = { 1, 2, (osd_num_t)(second ? 0 : 3), (osd_num_t)(second ? 4 : 0) };
osd_num_t write_osd_set[4] = { 1, 2, 3, 0 };
osd_rmw_stripe_t stripes[4] = {}; osd_rmw_stripe_t stripes[4] = {};
unsigned bitmaps[4] = { 0 }; unsigned bitmaps[4] = { 0 };
// Test 15.0 // Test 15.0
@@ -842,7 +842,7 @@ void test15()
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0); assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
assert(stripes[3].req_start == 0 && stripes[3].req_end == 0); assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
// Test 15.1 // Test 15.1
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, write_osd_set, 64*1024, bmp); void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, osd_set, 64*1024, bmp);
for (int i = 0; i < 4; i++) for (int i = 0; i < 4; i++)
stripes[i].bmp_buf = bitmaps+i; stripes[i].bmp_buf = bitmaps+i;
assert(rmw_buf); assert(rmw_buf);
@@ -852,36 +852,38 @@ void test15()
assert(stripes[3].read_start == 0 && stripes[3].read_end == 0); assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0); assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024); assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
assert(stripes[2].write_start == 28*1024 && stripes[2].write_end == 32*1024); assert(stripes[2+second].write_start == 28*1024 && stripes[2+second].write_end == 32*1024);
assert(stripes[3].write_start == 0 && stripes[3].write_end == 0); assert(stripes[3-second].write_start == 0 && stripes[3-second].write_end == 0);
assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024); assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
assert(stripes[1].read_buf == NULL); assert(stripes[1].read_buf == NULL);
assert(stripes[2].read_buf == NULL); assert(stripes[2].read_buf == NULL);
assert(stripes[3].read_buf == NULL); assert(stripes[3].read_buf == NULL);
assert(stripes[0].write_buf == NULL); assert(stripes[0].write_buf == NULL);
assert(stripes[1].write_buf == (uint8_t*)write_buf); assert(stripes[1].write_buf == (uint8_t*)write_buf);
assert(stripes[2].write_buf == rmw_buf); assert(stripes[2+second].write_buf == rmw_buf);
assert(stripes[3].write_buf == NULL); assert(stripes[3-second].write_buf == NULL);
// Test 15.2 - encode // Test 15.2 - encode
set_pattern(write_buf, 4*1024, PATTERN1); set_pattern(write_buf, 4*1024, PATTERN1);
set_pattern(stripes[0].read_buf, 4*1024, PATTERN2); set_pattern(stripes[0].read_buf, 4*1024, PATTERN2);
memset(stripes[0].bmp_buf, 0, bmp); memset(stripes[0].bmp_buf, 0, bmp);
memset(stripes[1].bmp_buf, 0, bmp); memset(stripes[1].bmp_buf, 0, bmp);
calc_rmw_parity_ec(stripes, 4, 2, osd_set, write_osd_set, 64*1024, bmp); memset(stripes[2+second].write_buf, 0, 4096);
assert(*(uint32_t*)stripes[2].bmp_buf == 0x80); calc_rmw_parity_ec(stripes, 4, 2, osd_set, osd_set, 64*1024, bmp);
assert(second || *(uint32_t*)stripes[2].bmp_buf == 0x80);
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0); assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024); assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
assert(stripes[2].write_start == 28*1024 && stripes[2].write_end == 32*1024); assert(stripes[2+second].write_start == 28*1024 && stripes[2+second].write_end == 32*1024);
assert(stripes[3].write_start == 0 && stripes[3].write_end == 0); assert(stripes[3-second].write_start == 0 && stripes[3-second].write_end == 0);
assert(stripes[0].write_buf == NULL); assert(stripes[0].write_buf == NULL);
assert(stripes[1].write_buf == (uint8_t*)write_buf); assert(stripes[1].write_buf == (uint8_t*)write_buf);
assert(stripes[2].write_buf == rmw_buf); assert(stripes[2+second].write_buf == rmw_buf);
assert(stripes[3].write_buf == NULL); assert(stripes[3-second].write_buf == NULL);
check_pattern(stripes[2].write_buf, 4*1024, PATTERN1^PATTERN2); // first parity is always xor :) // first parity is always xor :), second isn't...
check_pattern(stripes[2+second].write_buf, 4*1024, second ? 0xb79a59a0ce8b9b81 : PATTERN1^PATTERN2);
// Done // Done
free(rmw_buf); free(rmw_buf);
free(write_buf); free(write_buf);
use_ec(3, 2, false); use_ec(4, 2, false);
} }
/*** /***
@@ -984,5 +986,5 @@ void test16()
// Done // Done
free(rmw_buf); free(rmw_buf);
free(write_buf); free(write_buf);
use_ec(3, 2, false); use_ec(4, 2, false);
} }

View File

@@ -3,12 +3,11 @@
#include "pg_states.h" #include "pg_states.h"
const int pg_state_bit_count = 16; const int pg_state_bit_count = 14;
const int pg_state_bits[16] = { const int pg_state_bits[14] = {
PG_STARTING, PG_STARTING,
PG_PEERING, PG_PEERING,
PG_PEERED,
PG_INCOMPLETE, PG_INCOMPLETE,
PG_ACTIVE, PG_ACTIVE,
PG_REPEERING, PG_REPEERING,
@@ -23,10 +22,9 @@ const int pg_state_bits[16] = {
PG_LEFT_ON_DEAD, PG_LEFT_ON_DEAD,
}; };
const char *pg_state_names[16] = { const char *pg_state_names[14] = {
"starting", "starting",
"peering", "peering",
"peered",
"incomplete", "incomplete",
"active", "active",
"repeering", "repeering",

View File

@@ -4,27 +4,25 @@
#pragma once #pragma once
// Placement group states // Placement group states
// STARTING -> [acquire lock] -> PEERING -> PEERED // STARTING -> [acquire lock] -> PEERING -> INCOMPLETE|ACTIVE
// PEERED -> [report history if required!] -> INCOMPLETE|ACTIVE
// ACTIVE -> REPEERING -> PEERING // ACTIVE -> REPEERING -> PEERING
// ACTIVE -> STOPPING -> OFFLINE -> [release lock] // ACTIVE -> STOPPING -> OFFLINE -> [release lock]
// Exactly one of these: // Exactly one of these:
#define PG_STARTING (1<<0) #define PG_STARTING (1<<0)
#define PG_PEERING (1<<1) #define PG_PEERING (1<<1)
#define PG_PEERED (1<<2) #define PG_INCOMPLETE (1<<2)
#define PG_INCOMPLETE (1<<3) #define PG_ACTIVE (1<<3)
#define PG_ACTIVE (1<<4) #define PG_REPEERING (1<<4)
#define PG_REPEERING (1<<5) #define PG_STOPPING (1<<5)
#define PG_STOPPING (1<<6) #define PG_OFFLINE (1<<6)
#define PG_OFFLINE (1<<7)
// Plus any of these: // Plus any of these:
#define PG_DEGRADED (1<<8) #define PG_DEGRADED (1<<7)
#define PG_HAS_INCOMPLETE (1<<9) #define PG_HAS_INCOMPLETE (1<<8)
#define PG_HAS_DEGRADED (1<<10) #define PG_HAS_DEGRADED (1<<9)
#define PG_HAS_MISPLACED (1<<11) #define PG_HAS_MISPLACED (1<<10)
#define PG_HAS_UNCLEAN (1<<12) #define PG_HAS_UNCLEAN (1<<11)
#define PG_HAS_INVALID (1<<13) #define PG_HAS_INVALID (1<<12)
#define PG_LEFT_ON_DEAD (1<<14) #define PG_LEFT_ON_DEAD (1<<13)
// Lower bits that represent object role (EC 0/1/2... or always 0 with replication) // Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
// 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size // 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size

View File

@@ -25,7 +25,6 @@ ring_loop_t::ring_loop_t(int qd)
{ {
free_ring_data[i] = i; free_ring_data[i] = i;
} }
wait_sqe_id = 1;
} }
ring_loop_t::~ring_loop_t() ring_loop_t::~ring_loop_t()
@@ -83,17 +82,19 @@ void ring_loop_t::loop()
} }
io_uring_cqe_seen(&ring, cqe); io_uring_cqe_seen(&ring, cqe);
} }
while (get_sqe_queue.size() > 0)
{
(get_sqe_queue[0].second)();
get_sqe_queue.erase(get_sqe_queue.begin());
}
do do
{ {
loop_again = false; loop_again = false;
for (int i = 0; i < consumers.size(); i++) for (int i = 0; i < consumers.size(); i++)
{ {
consumers[i]->loop(); consumers[i]->loop();
if (immediate_queue.size())
{
immediate_queue2.swap(immediate_queue);
for (auto & cb: immediate_queue2)
cb();
immediate_queue2.clear();
}
} }
} while (loop_again); } while (loop_again);
} }

View File

@@ -119,11 +119,10 @@ struct ring_consumer_t
class ring_loop_t class ring_loop_t
{ {
std::vector<std::pair<int,std::function<void()>>> get_sqe_queue; std::vector<std::function<void()>> immediate_queue, immediate_queue2;
std::vector<ring_consumer_t*> consumers; std::vector<ring_consumer_t*> consumers;
struct ring_data_t *ring_datas; struct ring_data_t *ring_datas;
int *free_ring_data; int *free_ring_data;
int wait_sqe_id;
unsigned free_ring_data_ptr; unsigned free_ring_data_ptr;
bool loop_again; bool loop_again;
struct io_uring ring; struct io_uring ring;
@@ -145,20 +144,9 @@ public:
} }
return sqe; return sqe;
} }
inline int wait_sqe(std::function<void()> cb) inline void set_immediate(const std::function<void()> cb)
{ {
get_sqe_queue.push_back({ wait_sqe_id, cb }); immediate_queue.push_back(cb);
return wait_sqe_id++;
}
inline void cancel_wait_sqe(int wait_id)
{
for (int i = 0; i < get_sqe_queue.size(); i++)
{
if (get_sqe_queue[i].first == wait_id)
{
get_sqe_queue.erase(get_sqe_queue.begin()+i, get_sqe_queue.begin()+i+1);
}
}
} }
inline int submit() inline int submit()
{ {

View File

@@ -8,7 +8,6 @@
void configure_single_pg_pool(cluster_client_t *cli) void configure_single_pg_pool(cluster_client_t *cli)
{ {
cli->st_cli.on_load_pgs_hook(true);
cli->st_cli.parse_state((etcd_kv_t){ cli->st_cli.parse_state((etcd_kv_t){
.key = "/config/pools", .key = "/config/pools",
.value = json11::Json::object { .value = json11::Json::object {
@@ -43,6 +42,7 @@ void configure_single_pg_pool(cluster_client_t *cli)
{ "state", json11::Json::array { "active" } }, { "state", json11::Json::array { "active" } },
}, },
}); });
cli->st_cli.on_load_pgs_hook(true);
std::map<std::string, etcd_kv_t> changes; std::map<std::string, etcd_kv_t> changes;
cli->st_cli.on_change_hook(changes); cli->st_cli.on_change_hook(changes);
} }
@@ -188,7 +188,6 @@ void test1()
int *r1 = test_write(cli, 0, 4096, 0x55); int *r1 = test_write(cli, 0, 4096, 0x55);
configure_single_pg_pool(cli); configure_single_pg_pool(cli);
pretend_connected(cli, 1); pretend_connected(cli, 1);
cli->continue_ops(true);
can_complete(r1); can_complete(r1);
check_op_count(cli, 1, 1); check_op_count(cli, 1, 1);
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0); pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0);
@@ -196,8 +195,6 @@ void test1()
pretend_disconnected(cli, 1); pretend_disconnected(cli, 1);
int *r2 = test_sync(cli); int *r2 = test_sync(cli);
pretend_connected(cli, 1); pretend_connected(cli, 1);
check_op_count(cli, 1, 0);
cli->continue_ops(true);
check_op_count(cli, 1, 1); check_op_count(cli, 1, 1);
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0); pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0);
check_op_count(cli, 1, 1); check_op_count(cli, 1, 1);
@@ -303,8 +300,6 @@ void test1()
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), -EPIPE); pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), -EPIPE);
check_disconnected(cli, 1); check_disconnected(cli, 1);
pretend_connected(cli, 1); pretend_connected(cli, 1);
check_op_count(cli, 1, 0);
cli->continue_ops(true);
check_op_count(cli, 1, 1); check_op_count(cli, 1, 1);
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0); pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
check_op_count(cli, 1, 1); check_op_count(cli, 1, 1);

View File

@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: Vitastor Name: Vitastor
Description: Vitastor client library Description: Vitastor client library
Version: 0.8.4 Version: 0.8.6
Libs: -L${libdir} -lvitastor_client Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir} Cflags: -I${includedir}