Compare commits
1 Commits
rdma-flow-
...
rm-left-on
Author | SHA1 | Date | |
---|---|---|---|
0f964d62db |
@@ -1,7 +1,7 @@
|
||||
cmake_minimum_required(VERSION 2.8.12)
|
||||
cmake_minimum_required(VERSION 2.8)
|
||||
|
||||
project(vitastor)
|
||||
|
||||
set(VERSION "0.8.6")
|
||||
set(VERSION "0.8.3")
|
||||
|
||||
add_subdirectory(src)
|
||||
|
@@ -48,9 +48,9 @@ Vitastor, составлены для того, чтобы убедиться,
|
||||
интерфейс (прокси), опять же, без открытия в свободный публичный доступ как
|
||||
самой программы, так и прокси.
|
||||
|
||||
Сетевая Публичная Лицензия Vitastor разработана специально, чтобы
|
||||
Сетевая Публичная Лицензия Vitastor разработана специально чтобы
|
||||
гарантировать, что в таких случаях и модифицированная версия программы, и
|
||||
прокси останутся доступными сообществу. Для этого лицензия требует от
|
||||
прокси оставались доступными сообществу. Для этого лицензия требует от
|
||||
операторов сетевых серверов предоставлять исходный код оригинальной программы,
|
||||
а также всех других программ, взаимодействующих с ней на их серверах,
|
||||
пользователям этих серверов, на условиях свободных лицензий. Таким образом,
|
||||
|
@@ -1,4 +1,4 @@
|
||||
VERSION ?= v0.8.6
|
||||
VERSION ?= v0.8.3
|
||||
|
||||
all: build push
|
||||
|
||||
|
@@ -49,7 +49,7 @@ spec:
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
allowPrivilegeEscalation: true
|
||||
image: vitalif/vitastor-csi:v0.8.6
|
||||
image: vitalif/vitastor-csi:v0.8.3
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -116,7 +116,7 @@ spec:
|
||||
privileged: true
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
image: vitalif/vitastor-csi:v0.8.6
|
||||
image: vitalif/vitastor-csi:v0.8.3
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -5,7 +5,7 @@ package vitastor
|
||||
|
||||
const (
|
||||
vitastorCSIDriverName = "csi.vitastor.io"
|
||||
vitastorCSIDriverVersion = "0.8.6"
|
||||
vitastorCSIDriverVersion = "0.8.3"
|
||||
)
|
||||
|
||||
// Config struct fills the parameters of request or user input
|
||||
|
@@ -6,11 +6,11 @@ package vitastor
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"bytes"
|
||||
"strconv"
|
||||
"time"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"io/ioutil"
|
||||
@@ -21,6 +21,8 @@ import (
|
||||
"google.golang.org/grpc/codes"
|
||||
"google.golang.org/grpc/status"
|
||||
|
||||
"go.etcd.io/etcd/clientv3"
|
||||
|
||||
"github.com/container-storage-interface/spec/lib/go/csi"
|
||||
)
|
||||
|
||||
@@ -112,34 +114,6 @@ func GetConnectionParams(params map[string]string) (map[string]string, []string,
|
||||
return ctxVars, etcdUrl, etcdPrefix
|
||||
}
|
||||
|
||||
func invokeCLI(ctxVars map[string]string, args []string) ([]byte, error)
|
||||
{
|
||||
if (ctxVars["etcdUrl"] != "")
|
||||
{
|
||||
args = append(args, "--etcd_address", ctxVars["etcdUrl"])
|
||||
}
|
||||
if (ctxVars["etcdPrefix"] != "")
|
||||
{
|
||||
args = append(args, "--etcd_prefix", ctxVars["etcdPrefix"])
|
||||
}
|
||||
if (ctxVars["configPath"] != "")
|
||||
{
|
||||
args = append(args, "--config_path", ctxVars["configPath"])
|
||||
}
|
||||
c := exec.Command("/usr/bin/vitastor-cli", args...)
|
||||
var stdout, stderr bytes.Buffer
|
||||
c.Stdout = &stdout
|
||||
c.Stderr = &stderr
|
||||
err := c.Run()
|
||||
stderrStr := string(stderr.Bytes())
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Errorf("vitastor-cli %s failed: %s, status %s\n", strings.Join(args, " "), stderrStr, err)
|
||||
return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
|
||||
}
|
||||
return stdout.Bytes(), nil
|
||||
}
|
||||
|
||||
// Create the volume
|
||||
func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) (*csi.CreateVolumeResponse, error)
|
||||
{
|
||||
@@ -172,41 +146,128 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
|
||||
volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
|
||||
}
|
||||
|
||||
ctxVars, etcdUrl, _ := GetConnectionParams(req.Parameters)
|
||||
// FIXME: The following should PROBABLY be implemented externally in a management tool
|
||||
|
||||
ctxVars, etcdUrl, etcdPrefix := GetConnectionParams(req.Parameters)
|
||||
if (len(etcdUrl) == 0)
|
||||
{
|
||||
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
|
||||
}
|
||||
|
||||
// Create image using vitastor-cli
|
||||
_, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) })
|
||||
// Connect to etcd
|
||||
cli, err := clientv3.New(clientv3.Config{
|
||||
DialTimeout: ETCD_TIMEOUT,
|
||||
Endpoints: etcdUrl,
|
||||
})
|
||||
if (err != nil)
|
||||
{
|
||||
if (strings.Index(err.Error(), "already exists") > 0)
|
||||
return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error())
|
||||
}
|
||||
defer cli.Close()
|
||||
|
||||
var imageId uint64 = 0
|
||||
for
|
||||
{
|
||||
// Check if the image exists
|
||||
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||
resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
|
||||
cancel()
|
||||
if (err != nil)
|
||||
{
|
||||
stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", volName })
|
||||
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||
}
|
||||
if (len(resp.Kvs) > 0)
|
||||
{
|
||||
kv := resp.Kvs[0]
|
||||
var v InodeIndex
|
||||
err := json.Unmarshal(kv.Value, &v)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, err
|
||||
return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
|
||||
}
|
||||
var inodeCfg []InodeConfig
|
||||
err = json.Unmarshal(stat, &inodeCfg)
|
||||
poolId = v.PoolId
|
||||
imageId = v.Id
|
||||
inodeCfgKey := fmt.Sprintf("/config/inode/%d/%d", poolId, imageId)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||
resp, err := cli.Get(ctx, etcdPrefix+inodeCfgKey)
|
||||
cancel()
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
|
||||
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||
}
|
||||
if (len(inodeCfg) == 0)
|
||||
if (len(resp.Kvs) == 0)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "vitastor-cli create said that image already exists, but ls can't find it")
|
||||
return nil, status.Error(codes.Internal, "missing "+inodeCfgKey+" key in etcd")
|
||||
}
|
||||
if (inodeCfg[0].Size < uint64(volSize))
|
||||
var inodeCfg InodeConfig
|
||||
err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
|
||||
}
|
||||
if (inodeCfg.Size < uint64(volSize))
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
return nil, err
|
||||
// Find a free ID
|
||||
// Create image metadata in a transaction verifying that the image doesn't exist yet AND ID is still free
|
||||
maxIdKey := fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||
resp, err := cli.Get(ctx, maxIdKey)
|
||||
cancel()
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||
}
|
||||
var modRev int64
|
||||
var nextId uint64
|
||||
if (len(resp.Kvs) > 0)
|
||||
{
|
||||
var err error
|
||||
nextId, err = strconv.ParseUint(string(resp.Kvs[0].Value), 10, 64)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, maxIdKey+" contains invalid ID")
|
||||
}
|
||||
modRev = resp.Kvs[0].ModRevision
|
||||
nextId++
|
||||
}
|
||||
else
|
||||
{
|
||||
nextId = 1
|
||||
}
|
||||
inodeIdxJson, _ := json.Marshal(InodeIndex{
|
||||
Id: nextId,
|
||||
PoolId: poolId,
|
||||
})
|
||||
inodeCfgJson, _ := json.Marshal(InodeConfig{
|
||||
Name: volName,
|
||||
Size: uint64(volSize),
|
||||
})
|
||||
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||
txnResp, err := cli.Txn(ctx).If(
|
||||
clientv3.Compare(clientv3.ModRevision(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)), "=", modRev),
|
||||
clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)), "=", 0),
|
||||
clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId)), "=", 0),
|
||||
).Then(
|
||||
clientv3.OpPut(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId), fmt.Sprintf("%d", nextId)),
|
||||
clientv3.OpPut(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName), string(inodeIdxJson)),
|
||||
clientv3.OpPut(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId), string(inodeCfgJson)),
|
||||
).Commit()
|
||||
cancel()
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "failed to commit transaction in etcd: "+err.Error())
|
||||
}
|
||||
if (txnResp.Succeeded)
|
||||
{
|
||||
imageId = nextId
|
||||
break
|
||||
}
|
||||
// Start over if the transaction fails
|
||||
}
|
||||
}
|
||||
|
||||
@@ -238,12 +299,97 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
|
||||
}
|
||||
volName := ctxVars["name"]
|
||||
|
||||
ctxVars, _, _ = GetConnectionParams(ctxVars)
|
||||
_, etcdUrl, etcdPrefix := GetConnectionParams(ctxVars)
|
||||
if (len(etcdUrl) == 0)
|
||||
{
|
||||
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
|
||||
}
|
||||
|
||||
_, err = invokeCLI(ctxVars, []string{ "rm", volName })
|
||||
cli, err := clientv3.New(clientv3.Config{
|
||||
DialTimeout: ETCD_TIMEOUT,
|
||||
Endpoints: etcdUrl,
|
||||
})
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, err
|
||||
return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error())
|
||||
}
|
||||
defer cli.Close()
|
||||
|
||||
// Find inode by name
|
||||
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||
resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
|
||||
cancel()
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||
}
|
||||
if (len(resp.Kvs) == 0)
|
||||
{
|
||||
return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
|
||||
}
|
||||
var idx InodeIndex
|
||||
err = json.Unmarshal(resp.Kvs[0].Value, &idx)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
|
||||
}
|
||||
|
||||
// Get inode config
|
||||
inodeCfgKey := fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)
|
||||
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||
resp, err = cli.Get(ctx, inodeCfgKey)
|
||||
cancel()
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||
}
|
||||
if (len(resp.Kvs) == 0)
|
||||
{
|
||||
return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
|
||||
}
|
||||
var inodeCfg InodeConfig
|
||||
err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
|
||||
}
|
||||
|
||||
// Delete inode data by invoking vitastor-cli
|
||||
args := []string{
|
||||
"rm-data", "--etcd_address", strings.Join(etcdUrl, ","),
|
||||
"--pool", fmt.Sprintf("%d", idx.PoolId),
|
||||
"--inode", fmt.Sprintf("%d", idx.Id),
|
||||
}
|
||||
if (ctxVars["configPath"] != "")
|
||||
{
|
||||
args = append(args, "--config_path", ctxVars["configPath"])
|
||||
}
|
||||
c := exec.Command("/usr/bin/vitastor-cli", args...)
|
||||
var stderr bytes.Buffer
|
||||
c.Stdout = nil
|
||||
c.Stderr = &stderr
|
||||
err = c.Run()
|
||||
stderrStr := string(stderr.Bytes())
|
||||
if (err != nil)
|
||||
{
|
||||
klog.Errorf("vitastor-cli rm-data failed: %s, status %s\n", stderrStr, err)
|
||||
return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
|
||||
}
|
||||
|
||||
// Delete inode config in etcd
|
||||
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||
txnResp, err := cli.Txn(ctx).Then(
|
||||
clientv3.OpDelete(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)),
|
||||
clientv3.OpDelete(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)),
|
||||
).Commit()
|
||||
cancel()
|
||||
if (err != nil)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "failed to delete keys in etcd: "+err.Error())
|
||||
}
|
||||
if (!txnResp.Succeeded)
|
||||
{
|
||||
return nil, status.Error(codes.Internal, "failed to delete keys in etcd: transaction failed")
|
||||
}
|
||||
|
||||
return &csi.DeleteVolumeResponse{}, nil
|
||||
|
4
debian/changelog
vendored
4
debian/changelog
vendored
@@ -1,10 +1,10 @@
|
||||
vitastor (0.8.6-1) unstable; urgency=medium
|
||||
vitastor (0.8.3-1) unstable; urgency=medium
|
||||
|
||||
* Bugfixes
|
||||
|
||||
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
|
||||
|
||||
vitastor (0.8.6-1) unstable; urgency=medium
|
||||
vitastor (0.8.3-1) unstable; urgency=medium
|
||||
|
||||
* Implement NFS proxy
|
||||
* Add documentation
|
||||
|
8
debian/vitastor.Dockerfile
vendored
8
debian/vitastor.Dockerfile
vendored
@@ -34,8 +34,8 @@ RUN set -e -x; \
|
||||
mkdir -p /root/packages/vitastor-$REL; \
|
||||
rm -rf /root/packages/vitastor-$REL/*; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
cp -r /root/vitastor vitastor-0.8.6; \
|
||||
cd vitastor-0.8.6; \
|
||||
cp -r /root/vitastor vitastor-0.8.3; \
|
||||
cd vitastor-0.8.3; \
|
||||
ln -s /root/fio-build/fio-*/ ./fio; \
|
||||
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
||||
@@ -48,8 +48,8 @@ RUN set -e -x; \
|
||||
rm -rf a b; \
|
||||
echo "dep:fio=$FIO" > debian/fio_version; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.6.orig.tar.xz vitastor-0.8.6; \
|
||||
cd vitastor-0.8.6; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.3.orig.tar.xz vitastor-0.8.3; \
|
||||
cd vitastor-0.8.3; \
|
||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||
|
@@ -19,7 +19,6 @@ between clients, OSDs and etcd.
|
||||
- [rdma_max_sge](#rdma_max_sge)
|
||||
- [rdma_max_msg](#rdma_max_msg)
|
||||
- [rdma_max_recv](#rdma_max_recv)
|
||||
- [rdma_max_send](#rdma_max_send)
|
||||
- [peer_connect_interval](#peer_connect_interval)
|
||||
- [peer_connect_timeout](#peer_connect_timeout)
|
||||
- [osd_idle_timeout](#osd_idle_timeout)
|
||||
@@ -75,12 +74,6 @@ to work. For example, Mellanox ConnectX-3 and older adapters don't have
|
||||
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
|
||||
root to list available RDMA devices and their features.
|
||||
|
||||
Remember that you also have to configure your network switches if you use
|
||||
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
|
||||
the manual of your network vendor for details about setting up the switch
|
||||
for RoCEv2 correctly. Usually it means setting up Lossless Ethernet with
|
||||
PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
|
||||
|
||||
## rdma_port_num
|
||||
|
||||
- Type: integer
|
||||
@@ -123,30 +116,20 @@ required to change this parameter.
|
||||
## rdma_max_msg
|
||||
|
||||
- Type: integer
|
||||
- Default: 132096
|
||||
- Default: 1048576
|
||||
|
||||
Maximum size of a single RDMA send or receive operation in bytes.
|
||||
|
||||
## rdma_max_recv
|
||||
|
||||
- Type: integer
|
||||
- Default: 16
|
||||
|
||||
Maximum number of RDMA receive buffers per connection (RDMA requires
|
||||
preallocated buffers to receive data). Each buffer is `rdma_max_msg` bytes
|
||||
in size. So this setting directly affects memory usage: a single Vitastor
|
||||
RDMA client uses `rdma_max_recv * rdma_max_msg * OSD_COUNT` bytes of memory.
|
||||
Default is roughly 2 MB * number of OSDs.
|
||||
|
||||
## rdma_max_send
|
||||
|
||||
- Type: integer
|
||||
- Default: 8
|
||||
|
||||
Maximum number of outstanding RDMA send operations per connection. Should be
|
||||
less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
|
||||
Doesn't affect memory usage - additional memory isn't allocated for send
|
||||
operations.
|
||||
Maximum number of parallel RDMA receive operations. Note that this number
|
||||
of receive buffers `rdma_max_msg` in size are allocated for each client,
|
||||
so this setting actually affects memory usage. This is because RDMA receive
|
||||
operations are (sadly) still not zero-copy in Vitastor. It may be fixed in
|
||||
later versions.
|
||||
|
||||
## peer_connect_interval
|
||||
|
||||
|
@@ -19,7 +19,6 @@
|
||||
- [rdma_max_sge](#rdma_max_sge)
|
||||
- [rdma_max_msg](#rdma_max_msg)
|
||||
- [rdma_max_recv](#rdma_max_recv)
|
||||
- [rdma_max_send](#rdma_max_send)
|
||||
- [peer_connect_interval](#peer_connect_interval)
|
||||
- [peer_connect_timeout](#peer_connect_timeout)
|
||||
- [osd_idle_timeout](#osd_idle_timeout)
|
||||
@@ -79,13 +78,6 @@ Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Наприме
|
||||
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
|
||||
параметры и возможности.
|
||||
|
||||
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
|
||||
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
|
||||
нестабильной производительностью. Подробную информацию о настройке
|
||||
коммутатора для RoCEv2 ищите в документации производителя. Обычно это
|
||||
подразумевает настройку сети без потерь на основе PFC (Priority Flow
|
||||
Control) и ECN (Explicit Congestion Notification).
|
||||
|
||||
## rdma_port_num
|
||||
|
||||
- Тип: целое число
|
||||
@@ -129,32 +121,22 @@ OSD в любом случае согласовывают реальное зн
|
||||
## rdma_max_msg
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 132096
|
||||
- Значение по умолчанию: 1048576
|
||||
|
||||
Максимальный размер одной RDMA-операции отправки или приёма.
|
||||
|
||||
## rdma_max_recv
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 16
|
||||
|
||||
Максимальное число буферов для RDMA-приёма данных на одно соединение
|
||||
(RDMA требует заранее выделенных буферов для приёма данных). Каждый буфер
|
||||
имеет размер `rdma_max_msg` байт. Таким образом, настройка прямо влияет на
|
||||
потребление памяти - один Vitastor-клиент с RDMA использует
|
||||
`rdma_max_recv * rdma_max_msg * ЧИСЛО_OSD` байт памяти, по умолчанию -
|
||||
примерно 2 МБ * число OSD.
|
||||
|
||||
## rdma_max_send
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 8
|
||||
|
||||
Максимальное число RDMA-операций отправки, отправляемых в очередь одного
|
||||
соединения. Желательно, чтобы оно было меньше `rdma_max_recv`, чтобы
|
||||
у принимающей стороны в процессе работы не заканчивались буферы на приём.
|
||||
Не влияет на потребление памяти - дополнительная память на операции отправки
|
||||
не выделяется.
|
||||
Максимальное число параллельных RDMA-операций получения данных. Следует
|
||||
иметь в виду, что данное число буферов размером `rdma_max_msg` выделяется
|
||||
для каждого подключённого клиентского соединения, так что данная настройка
|
||||
влияет на потребление памяти. Это так потому, что RDMA-приём данных в
|
||||
Vitastor, увы, всё равно не является zero-copy, т.е. всё равно 1 раз
|
||||
копирует данные в памяти. Данная особенность, возможно, будет исправлена в
|
||||
более новых версиях Vitastor.
|
||||
|
||||
## peer_connect_interval
|
||||
|
||||
|
@@ -53,12 +53,6 @@
|
||||
to work. For example, Mellanox ConnectX-3 and older adapters don't have
|
||||
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
|
||||
root to list available RDMA devices and their features.
|
||||
|
||||
Remember that you also have to configure your network switches if you use
|
||||
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
|
||||
the manual of your network vendor for details about setting up the switch
|
||||
for RoCEv2 correctly. Usually it means setting up Lossless Ethernet with
|
||||
PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
|
||||
info_ru: |
|
||||
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
|
||||
Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
|
||||
@@ -67,13 +61,6 @@
|
||||
потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
|
||||
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
|
||||
параметры и возможности.
|
||||
|
||||
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
|
||||
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
|
||||
нестабильной производительностью. Подробную информацию о настройке
|
||||
коммутатора для RoCEv2 ищите в документации производителя. Обычно это
|
||||
подразумевает настройку сети без потерь на основе PFC (Priority Flow
|
||||
Control) и ECN (Explicit Congestion Notification).
|
||||
- name: rdma_port_num
|
||||
type: int
|
||||
default: 1
|
||||
@@ -127,39 +114,26 @@
|
||||
так что менять этот параметр обычно не нужно.
|
||||
- name: rdma_max_msg
|
||||
type: int
|
||||
default: 132096
|
||||
default: 1048576
|
||||
info: Maximum size of a single RDMA send or receive operation in bytes.
|
||||
info_ru: Максимальный размер одной RDMA-операции отправки или приёма.
|
||||
- name: rdma_max_recv
|
||||
type: int
|
||||
default: 16
|
||||
info: |
|
||||
Maximum number of RDMA receive buffers per connection (RDMA requires
|
||||
preallocated buffers to receive data). Each buffer is `rdma_max_msg` bytes
|
||||
in size. So this setting directly affects memory usage: a single Vitastor
|
||||
RDMA client uses `rdma_max_recv * rdma_max_msg * OSD_COUNT` bytes of memory.
|
||||
Default is roughly 2 MB * number of OSDs.
|
||||
info_ru: |
|
||||
Максимальное число буферов для RDMA-приёма данных на одно соединение
|
||||
(RDMA требует заранее выделенных буферов для приёма данных). Каждый буфер
|
||||
имеет размер `rdma_max_msg` байт. Таким образом, настройка прямо влияет на
|
||||
потребление памяти - один Vitastor-клиент с RDMA использует
|
||||
`rdma_max_recv * rdma_max_msg * ЧИСЛО_OSD` байт памяти, по умолчанию -
|
||||
примерно 2 МБ * число OSD.
|
||||
- name: rdma_max_send
|
||||
type: int
|
||||
default: 8
|
||||
info: |
|
||||
Maximum number of outstanding RDMA send operations per connection. Should be
|
||||
less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
|
||||
Doesn't affect memory usage - additional memory isn't allocated for send
|
||||
operations.
|
||||
Maximum number of parallel RDMA receive operations. Note that this number
|
||||
of receive buffers `rdma_max_msg` in size are allocated for each client,
|
||||
so this setting actually affects memory usage. This is because RDMA receive
|
||||
operations are (sadly) still not zero-copy in Vitastor. It may be fixed in
|
||||
later versions.
|
||||
info_ru: |
|
||||
Максимальное число RDMA-операций отправки, отправляемых в очередь одного
|
||||
соединения. Желательно, чтобы оно было меньше `rdma_max_recv`, чтобы
|
||||
у принимающей стороны в процессе работы не заканчивались буферы на приём.
|
||||
Не влияет на потребление памяти - дополнительная память на операции отправки
|
||||
не выделяется.
|
||||
Максимальное число параллельных RDMA-операций получения данных. Следует
|
||||
иметь в виду, что данное число буферов размером `rdma_max_msg` выделяется
|
||||
для каждого подключённого клиентского соединения, так что данная настройка
|
||||
влияет на потребление памяти. Это так потому, что RDMA-приём данных в
|
||||
Vitastor, увы, всё равно не является zero-copy, т.е. всё равно 1 раз
|
||||
копирует данные в памяти. Данная особенность, возможно, будет исправлена в
|
||||
более новых версиях Vitastor.
|
||||
- name: peer_connect_interval
|
||||
type: sec
|
||||
min: 1
|
||||
|
@@ -9,7 +9,7 @@
|
||||
## Debian
|
||||
|
||||
- Trust Vitastor package signing key:
|
||||
`wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
|
||||
`wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
|
||||
- Add Vitastor package repository to your /etc/apt/sources.list:
|
||||
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
|
||||
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
|
||||
@@ -20,8 +20,8 @@
|
||||
## CentOS
|
||||
|
||||
- Add Vitastor package repository:
|
||||
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release.rpm`
|
||||
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release.rpm`
|
||||
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm`
|
||||
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm`
|
||||
- Enable EPEL: `yum/dnf install epel-release`
|
||||
- Enable additional CentOS repositories:
|
||||
- CentOS 7: `yum install centos-release-scl`
|
||||
|
@@ -9,7 +9,7 @@
|
||||
## Debian
|
||||
|
||||
- Добавьте ключ репозитория Vitastor:
|
||||
`wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
|
||||
`wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
|
||||
- Добавьте репозиторий Vitastor в /etc/apt/sources.list:
|
||||
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
|
||||
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
|
||||
@@ -20,8 +20,8 @@
|
||||
## CentOS
|
||||
|
||||
- Добавьте в систему репозиторий Vitastor:
|
||||
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release.rpm`
|
||||
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release.rpm`
|
||||
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm`
|
||||
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm`
|
||||
- Включите EPEL: `yum/dnf install epel-release`
|
||||
- Включите дополнительные репозитории CentOS:
|
||||
- CentOS 7: `yum install centos-release-scl`
|
||||
|
@@ -35,24 +35,15 @@ Write amplification for 4 KB blocks is usually 3-5 in Vitastor:
|
||||
If you manage to get an SSD which handles 512 byte blocks well (Optane?) you may
|
||||
lower 1, 3 and 4 to 512 bytes (1/8 of data size) and get WA as low as 2.375.
|
||||
|
||||
Implemented NVDIMM support can basically eliminate WA at all - all extra writes will
|
||||
go to DRAM memory. But this requires a test cluster with NVDIMM - please contact me
|
||||
if you want to provide me with such cluster for tests.
|
||||
|
||||
Lazy fsync also reduces WA for parallel workloads because journal blocks are only
|
||||
written when they fill up or fsync is requested.
|
||||
|
||||
## In Practice
|
||||
|
||||
In practice, using tests from [Understanding Performance](understanding.en.md), decent TCP network,
|
||||
good server-grade SSD/NVMe drives and disabled CPU power saving, you should head for:
|
||||
In practice, using tests from [Understanding Performance](understanding.en.md)
|
||||
and good server-grade SSD/NVMe drives, you should head for:
|
||||
- At least 5000 T1Q1 replicated read and write iops (maximum 0.2ms latency)
|
||||
- At least 5000 T1Q1 EC read IOPS and at least 2200 EC write IOPS (maximum 0.45ms latency)
|
||||
- At least ~80k parallel read iops or ~30k write iops per 1 core (1 OSD)
|
||||
- Disk-speed or wire-speed linear reads and writes, whichever is the bottleneck in your case
|
||||
|
||||
Lower results may mean that you have bad drives, bad network or some kind of misconfiguration.
|
||||
|
||||
Current latency records:
|
||||
- 9668 T1Q1 replicated write iops (0.103 ms latency) with TCP and NVMe
|
||||
- 9143 T1Q1 replicated read iops (0.109 ms latency) with TCP and NVMe
|
||||
|
@@ -36,25 +36,6 @@ WA (мультипликатор записи) для 4 КБ блоков в Vit
|
||||
Если вы найдёте SSD, хорошо работающий с 512-байтными блоками данных (Optane?),
|
||||
то 1, 3 и 4 можно снизить до 512 байт (1/8 от размера данных) и получить WA всего 2.375.
|
||||
|
||||
Если реализовать поддержку NVDIMM, то WA можно, условно говоря, ликвидировать вообще - все
|
||||
дополнительные операции записи смогут обслуживаться DRAM памятью. Но для этого необходим
|
||||
тестовый кластер с NVDIMM - пишите, если готовы предоставить такой для тестов.
|
||||
|
||||
Кроме того, WA снижается при использовании отложенного/ленивого сброса при параллельной
|
||||
нагрузке, т.к. блоки журнала записываются на диск только когда они заполняются или явным
|
||||
образом запрашивается fsync.
|
||||
|
||||
## На практике
|
||||
|
||||
На практике, используя тесты fio со страницы [Понимание сути производительности систем хранения](understanding.ru.md),
|
||||
нормальную TCP-сеть, хорошие серверные SSD/NVMe, при отключённом энергосбережении процессоров вы можете рассчитывать на:
|
||||
- От 5000 IOPS в 1 поток (T1Q1) и на чтение, и на запись при использовании репликации (задержка до 0.2мс)
|
||||
- От 5000 IOPS в 1 поток (T1Q1) на чтение и 2200 IOPS в 1 поток на запись при использовании EC (задержка до 0.45мс)
|
||||
- От 80000 IOPS на чтение в параллельном режиме на 1 ядро, от 30000 IOPS на запись на 1 ядро (на 1 OSD)
|
||||
- Скорость параллельного линейного чтения и записи, равная меньшему значению из скорости дисков или сети
|
||||
|
||||
Худшие результаты означают, что у вас либо медленные диски, либо медленная сеть, либо что-то неправильно настроено.
|
||||
|
||||
Зафиксированный на данный момент рекорд задержки:
|
||||
- 9668 IOPS (0.103 мс задержка) в 1 поток (T1Q1) на запись с TCP и NVMe при использовании репликации
|
||||
- 9143 IOPS (0.109 мс задержка) в 1 поток (T1Q1) на чтение с TCP и NVMe при использовании репликации
|
||||
|
@@ -14,7 +14,6 @@ It supports the following commands:
|
||||
- [df](#df)
|
||||
- [ls](#ls)
|
||||
- [create](#create)
|
||||
- [snap-create](#create)
|
||||
- [modify](#modify)
|
||||
- [rm](#rm)
|
||||
- [flatten](#flatten)
|
||||
@@ -124,8 +123,6 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
|
||||
|
||||
Create a snapshot of image `<name>` (either form can be used). May be used live if only a single writer is active.
|
||||
|
||||
See also about [how to export snapshots](qemu.en.md#exporting-snapshots).
|
||||
|
||||
## modify
|
||||
|
||||
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
|
||||
|
@@ -15,7 +15,6 @@ vitastor-cli - интерфейс командной строки для адм
|
||||
- [df](#df)
|
||||
- [ls](#ls)
|
||||
- [create](#create)
|
||||
- [snap-create](#create)
|
||||
- [modify](#modify)
|
||||
- [rm](#rm)
|
||||
- [flatten](#flatten)
|
||||
@@ -127,8 +126,6 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
|
||||
Создать снимок образа `<name>` (можно использовать любую форму команды). Снимок можно создавать без остановки
|
||||
клиентов, если пишущий клиент максимум 1.
|
||||
|
||||
Смотрите также информацию о том, [как экспортировать снимки](qemu.ru.md#экспорт-снимков).
|
||||
|
||||
## modify
|
||||
|
||||
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
|
||||
|
@@ -1,4 +1,4 @@
|
||||
[Documentation](../../README.md#documentation) → Usage → Disk management tool
|
||||
[Documentation](../../README.md#documentation) → Usage → Disk Tool
|
||||
|
||||
-----
|
||||
|
||||
|
@@ -1,4 +1,4 @@
|
||||
[Документация](../../README-ru.md#документация) → Использование → Инструмент управления дисками
|
||||
[Документация](../../README-ru.md#документация) → Использование → Управление дисками
|
||||
|
||||
-----
|
||||
|
||||
|
@@ -46,40 +46,3 @@ qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=192.168.7
|
||||
|
||||
You can also specify `:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>`
|
||||
if you don't want to use inode metadata.
|
||||
|
||||
### Exporting snapshots
|
||||
|
||||
Starting with 0.8.4, you can also export individual layers (snapshot diffs) using `qemu-img`.
|
||||
|
||||
Suppose you have an image `testimg` and a snapshot `testimg@0` created with `vitastor-cli snap-create testimg@0`.
|
||||
|
||||
Then you can export the `testimg@0` snapshot and the data written to `testimg` after creating
|
||||
the snapshot separately using the following commands (key points are using `skip-parents=1` and
|
||||
`-B backing_file` option):
|
||||
|
||||
```
|
||||
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg@0' \
|
||||
-O qcow2 testimg_0.qcow2
|
||||
|
||||
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg:skip-parents=1' \
|
||||
-O qcow2 -o 'cluster_size=4k' -B testimg_0.qcow2 testimg.qcow2
|
||||
```
|
||||
|
||||
In fact, with `cluster_size=4k` any QCOW2 file can be used instead `-B testimg_0.qcow2`, even an empty one.
|
||||
|
||||
QCOW2 `cluster_size=4k` option is required if you want `testimg.qcow2` to contain only the data
|
||||
overwritten **exactly** in the child layer. With the default 64 KB QCOW2 cluster size you'll
|
||||
get a bit of extra data from parent layers, i.e. a 4 KB overwrite will result in `testimg.qcow2`
|
||||
containing 64 KB of data. And this extra data will be taken by `qemu-img` from the file passed
|
||||
in `-B` option, so you really need 4 KB cluster if you use an empty image in `-B`.
|
||||
|
||||
After this procedure you'll get two chained QCOW2 images. To detach `testimg.qcow2` from
|
||||
its parent, run:
|
||||
|
||||
```
|
||||
qemu-img rebase -u -b '' testimg.qcow2
|
||||
```
|
||||
|
||||
This can be used for backups. Just note that exporting an image that is currently being written to
|
||||
is of course unsafe and doesn't produce a consistent result, so only export snapshots if you do this
|
||||
on a live VM.
|
||||
|
@@ -50,40 +50,3 @@ qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.
|
||||
|
||||
Если вы не хотите обращаться к образу по имени, вместо `:image=<IMAGE>` можно указать номер пула, номер инода и размер:
|
||||
`:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
|
||||
|
||||
### Экспорт снимков
|
||||
|
||||
Начиная с 0.8.4 вы можете экспортировать отдельные слои (изменения в снимках) с помощью `qemu-img`.
|
||||
|
||||
Допустим, что у вас есть образ `testimg` и его снимок `testimg@0`, созданный с помощью `vitastor-cli snap-create testimg@0`.
|
||||
|
||||
Тогда вы можете выгрузить снимок `testimg@0` и данные, изменённые в `testimg` после создания снимка, отдельно,
|
||||
с помощью следующих команд (ключевые моменты - использование `skip-parents=1` и опции `-B backing_file.qcow2`):
|
||||
|
||||
```
|
||||
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg@0' \
|
||||
-O qcow2 testimg_0.qcow2
|
||||
|
||||
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg:skip-parents=1' \
|
||||
-O qcow2 -o 'cluster_size=4k' -B testimg_0.qcow2 testimg.qcow2
|
||||
```
|
||||
|
||||
На самом деле, с `cluster_size=4k` вместо `-B testimg_0.qcow2` можно использовать любой qcow2-файл,
|
||||
даже пустой.
|
||||
|
||||
Опция QCOW2 `cluster_size=4k` нужна, если вы хотите, чтобы `testimg.qcow2` содержал **в точности**
|
||||
данные, перезаписанные в дочернем слое. С размером кластера QCOW2 по умолчанию, составляющим 64 КБ,
|
||||
вы получите немного "лишних" данных из родительских слоёв - перезапись 4 КБ будет приводить к тому,
|
||||
что в `testimg.qcow2` будет появляться 64 КБ данных. Причём "лишние" данные qemu-img будет брать
|
||||
как раз из файла, указанного в опции `-B`, так что если там указан пустой образ, кластер обязан быть 4 КБ.
|
||||
|
||||
После данной процедуры вы получите два QCOW2-образа, связанных в цепочку. Чтобы "отцепить" образ
|
||||
`testimg.qcow2` от базового, выполните:
|
||||
|
||||
```
|
||||
qemu-img rebase -u -b '' testimg.qcow2
|
||||
```
|
||||
|
||||
Это можно использовать для резервного копирования. Только помните, что экспортировать образ, в который
|
||||
в то же время идёт запись, небезопасно - результат чтения не будет целостным. Так что если вы работаете
|
||||
с активными виртуальными машинами, экспортируйте только их снимки, но не сам образ.
|
||||
|
2
json11
2
json11
Submodule json11 updated: fd37016cf8...52a3af664f
@@ -550,8 +550,8 @@ function random_combinations(osd_tree, pg_size, count, ordered)
|
||||
seed ^= seed << 5;
|
||||
return seed + 2147483648;
|
||||
};
|
||||
const hosts = Object.keys(osd_tree).sort();
|
||||
const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
|
||||
const hosts = Object.keys(osd_tree).sort().filter(h => osds[h].length > 0);
|
||||
const r = {};
|
||||
// Generate random combinations including each OSD at least once
|
||||
for (let h = 0; h < hosts.length; h++)
|
||||
|
@@ -79,7 +79,7 @@ StartLimitInterval=0
|
||||
RestartSec=10
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
WantedBy=local.target
|
||||
`);
|
||||
await system(`useradd etcd`);
|
||||
await system(`systemctl daemon-reload`);
|
||||
|
@@ -70,9 +70,9 @@ const etcd_tree = {
|
||||
rdma_gid_index: 0,
|
||||
rdma_mtu: 4096,
|
||||
rdma_max_sge: 128,
|
||||
rdma_max_send: 8,
|
||||
rdma_max_recv: 16,
|
||||
rdma_max_msg: 132096,
|
||||
rdma_max_send: 32,
|
||||
rdma_max_recv: 8,
|
||||
rdma_max_msg: 1048576,
|
||||
log_level: 0,
|
||||
block_size: 131072,
|
||||
disk_alignment: 4096,
|
||||
@@ -261,7 +261,7 @@ const etcd_tree = {
|
||||
/* <pool_id>: {
|
||||
<pg_id>: {
|
||||
primary: osd_num_t,
|
||||
state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
|
||||
state: ("starting"|"peering"|"peered"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
|
||||
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
|
||||
"has_invalid"|"left_on_dead")[],
|
||||
}
|
||||
|
@@ -16,11 +16,6 @@ use PVE::Tools qw(run_command);
|
||||
|
||||
use base qw(PVE::Storage::Plugin);
|
||||
|
||||
if (@PVE::Storage::Plugin::SHARED_STORAGE)
|
||||
{
|
||||
push @PVE::Storage::Plugin::SHARED_STORAGE, 'vitastor';
|
||||
}
|
||||
|
||||
sub api
|
||||
{
|
||||
# Trick it :)
|
||||
@@ -138,11 +133,9 @@ sub properties
|
||||
sub options
|
||||
{
|
||||
return {
|
||||
shared => { optional => 1 },
|
||||
content => { optional => 1 },
|
||||
nodes => { optional => 1 },
|
||||
disable => { optional => 1 },
|
||||
vitastor_etcd_address => { optional => 1 },
|
||||
vitastor_etcd_address => { optional => 1},
|
||||
vitastor_etcd_prefix => { optional => 1 },
|
||||
vitastor_config_path => { optional => 1 },
|
||||
vitastor_prefix => { optional => 1 },
|
||||
|
@@ -50,7 +50,7 @@ from cinder.volume import configuration
|
||||
from cinder.volume import driver
|
||||
from cinder.volume import volume_utils
|
||||
|
||||
VERSION = '0.8.6'
|
||||
VERSION = '0.8.3'
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
@@ -25,4 +25,4 @@ rm fio
|
||||
mv fio-copy fio
|
||||
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||
tar --transform 's#^#vitastor-0.8.6/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.6$(rpm --eval '%dist').tar.gz *
|
||||
tar --transform 's#^#vitastor-0.8.3/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.3$(rpm --eval '%dist').tar.gz *
|
||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.8.6.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.8.3.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.8.6
|
||||
Version: 0.8.3
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.8.6.el7.tar.gz
|
||||
Source0: vitastor-0.8.3.el7.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
@@ -35,7 +35,6 @@ Summary: Vitastor - OSD
|
||||
Requires: libJerasure2
|
||||
Requires: libisa-l
|
||||
Requires: liburing >= 0.6
|
||||
Requires: liburing < 2
|
||||
Requires: vitastor-client = %{version}-%{release}
|
||||
Requires: util-linux
|
||||
Requires: parted
|
||||
@@ -60,7 +59,6 @@ scheduling cluster-level operations.
|
||||
%package -n vitastor-client
|
||||
Summary: Vitastor - client
|
||||
Requires: liburing >= 0.6
|
||||
Requires: liburing < 2
|
||||
|
||||
|
||||
%description -n vitastor-client
|
||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.8.6.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.8.3.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.8.6
|
||||
Version: 0.8.3
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.8.6.el8.tar.gz
|
||||
Source0: vitastor-0.8.3.el8.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
@@ -34,7 +34,6 @@ Summary: Vitastor - OSD
|
||||
Requires: libJerasure2
|
||||
Requires: libisa-l
|
||||
Requires: liburing >= 0.6
|
||||
Requires: liburing < 2
|
||||
Requires: vitastor-client = %{version}-%{release}
|
||||
Requires: util-linux
|
||||
Requires: parted
|
||||
@@ -58,7 +57,6 @@ scheduling cluster-level operations.
|
||||
%package -n vitastor-client
|
||||
Summary: Vitastor - client
|
||||
Requires: liburing >= 0.6
|
||||
Requires: liburing < 2
|
||||
|
||||
|
||||
%description -n vitastor-client
|
||||
|
@@ -1,9 +1,8 @@
|
||||
cmake_minimum_required(VERSION 2.8.12)
|
||||
cmake_minimum_required(VERSION 2.8)
|
||||
|
||||
project(vitastor)
|
||||
|
||||
include(GNUInstallDirs)
|
||||
include(CTest)
|
||||
|
||||
set(WITH_QEMU false CACHE BOOL "Build QEMU driver inside Vitastor source tree")
|
||||
set(WITH_FIO true CACHE BOOL "Build FIO driver")
|
||||
@@ -16,7 +15,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||
endif()
|
||||
|
||||
add_definitions(-DVERSION="0.8.6")
|
||||
add_definitions(-DVERSION="0.8.3")
|
||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
|
||||
if (${WITH_ASAN})
|
||||
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
||||
@@ -56,14 +55,6 @@ if (ISAL_LIBRARIES)
|
||||
add_definitions(-DWITH_ISAL)
|
||||
endif (ISAL_LIBRARIES)
|
||||
|
||||
add_custom_target(build_tests)
|
||||
add_custom_target(test
|
||||
COMMAND
|
||||
echo leak:tcmalloc > ${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt &&
|
||||
env LSAN_OPTIONS=suppressions=${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt ${CMAKE_CTEST_COMMAND}
|
||||
)
|
||||
add_dependencies(test build_tests)
|
||||
|
||||
include_directories(
|
||||
../
|
||||
/usr/include/jerasure
|
||||
@@ -154,6 +145,7 @@ add_library(vitastor_client SHARED
|
||||
set_target_properties(vitastor_client PROPERTIES PUBLIC_HEADER "vitastor_c.h")
|
||||
target_link_libraries(vitastor_client
|
||||
vitastor_common
|
||||
tcmalloc_minimal
|
||||
${LIBURING_LIBRARIES}
|
||||
${IBVERBS_LIBRARIES}
|
||||
)
|
||||
@@ -243,17 +235,14 @@ add_executable(osd_test osd_test.cpp rw_blocking.cpp addr_util.cpp)
|
||||
target_link_libraries(osd_test tcmalloc_minimal)
|
||||
|
||||
# osd_rmw_test
|
||||
add_executable(osd_rmw_test EXCLUDE_FROM_ALL osd_rmw_test.cpp allocator.cpp)
|
||||
# FIXME: Move to tests
|
||||
add_executable(osd_rmw_test osd_rmw_test.cpp allocator.cpp)
|
||||
target_link_libraries(osd_rmw_test Jerasure ${ISAL_LIBRARIES} tcmalloc_minimal)
|
||||
add_dependencies(build_tests osd_rmw_test)
|
||||
add_test(NAME osd_rmw_test COMMAND osd_rmw_test)
|
||||
|
||||
if (ISAL_LIBRARIES)
|
||||
add_executable(osd_rmw_test_je EXCLUDE_FROM_ALL osd_rmw_test.cpp allocator.cpp)
|
||||
add_executable(osd_rmw_test_je osd_rmw_test.cpp allocator.cpp)
|
||||
target_compile_definitions(osd_rmw_test_je PUBLIC -DNO_ISAL)
|
||||
target_link_libraries(osd_rmw_test_je Jerasure tcmalloc_minimal)
|
||||
add_dependencies(build_tests osd_rmw_test_je)
|
||||
add_test(NAME osd_rmw_test_jerasure COMMAND osd_rmw_test_je)
|
||||
endif (ISAL_LIBRARIES)
|
||||
|
||||
# stub_uring_osd
|
||||
@@ -268,15 +257,11 @@ target_link_libraries(stub_uring_osd
|
||||
)
|
||||
|
||||
# osd_peering_pg_test
|
||||
add_executable(osd_peering_pg_test EXCLUDE_FROM_ALL osd_peering_pg_test.cpp osd_peering_pg.cpp)
|
||||
add_executable(osd_peering_pg_test osd_peering_pg_test.cpp osd_peering_pg.cpp)
|
||||
target_link_libraries(osd_peering_pg_test tcmalloc_minimal)
|
||||
add_dependencies(build_tests osd_peering_pg_test)
|
||||
add_test(NAME osd_peering_pg_test COMMAND osd_peering_pg_test)
|
||||
|
||||
# test_allocator
|
||||
add_executable(test_allocator EXCLUDE_FROM_ALL test_allocator.cpp allocator.cpp)
|
||||
add_dependencies(build_tests test_allocator)
|
||||
add_test(NAME test_allocator COMMAND test_allocator)
|
||||
add_executable(test_allocator test_allocator.cpp allocator.cpp)
|
||||
|
||||
# test_cas
|
||||
add_executable(test_cas
|
||||
@@ -296,15 +281,12 @@ target_link_libraries(test_crc32
|
||||
|
||||
# test_cluster_client
|
||||
add_executable(test_cluster_client
|
||||
EXCLUDE_FROM_ALL
|
||||
test_cluster_client.cpp
|
||||
pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
|
||||
etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp
|
||||
)
|
||||
target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
|
||||
target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mock)
|
||||
add_dependencies(build_tests test_cluster_client)
|
||||
add_test(NAME test_cluster_client COMMAND test_cluster_client)
|
||||
|
||||
## test_blockstore, test_shit
|
||||
#add_executable(test_blockstore test_blockstore.cpp)
|
||||
|
@@ -325,7 +325,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
|
||||
{
|
||||
// Basic verification not passed
|
||||
op->retval = -EINVAL;
|
||||
ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
|
||||
std::function<void (blockstore_op_t*)>(op->callback)(op);
|
||||
return;
|
||||
}
|
||||
if (op->opcode == BS_OP_SYNC_STAB_ALL)
|
||||
@@ -368,7 +368,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
|
||||
}
|
||||
if ((op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE) && !enqueue_write(op))
|
||||
{
|
||||
ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
|
||||
std::function<void (blockstore_op_t*)>(op->callback)(op);
|
||||
return;
|
||||
}
|
||||
// Call constructor without allocating memory. We'll call destructor before returning op back
|
||||
|
@@ -121,7 +121,8 @@ resume_1:
|
||||
}
|
||||
if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
pool_avail *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
|
||||
uint64_t pg_real_size = pool_stats[pool_cfg.id]["pg_real_size"].uint64_value();
|
||||
pool_avail = pg_real_size > 0 ? pool_avail * (pool_cfg.pg_size - pool_cfg.parity_chunks) / pg_real_size : 0;
|
||||
}
|
||||
pool_stats[pool_cfg.id] = json11::Json::object {
|
||||
{ "name", pool_cfg.name },
|
||||
|
@@ -403,7 +403,7 @@ struct snap_merger_t
|
||||
op->opcode = OSD_OP_READ_BITMAP;
|
||||
op->inode = target;
|
||||
op->offset = offset;
|
||||
op->len = target_block_size;
|
||||
op->len = 0;
|
||||
op->callback = [this](cluster_op_t *op)
|
||||
{
|
||||
if (op->retval < 0)
|
||||
|
@@ -92,7 +92,6 @@ struct rm_inode_t
|
||||
|
||||
void send_ops(rm_pg_t *cur_list)
|
||||
{
|
||||
parent->cli->init_msgr();
|
||||
if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
|
||||
parent->cli->msgr.osd_peer_fds.end())
|
||||
{
|
||||
|
@@ -59,6 +59,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
||||
delete op;
|
||||
};
|
||||
msgr.parse_config(this->config);
|
||||
msgr.init();
|
||||
|
||||
st_cli.tfd = tfd;
|
||||
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
|
||||
@@ -72,6 +73,17 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
||||
|
||||
scrap_buffer_size = SCRAP_BUFFER_SIZE;
|
||||
scrap_buffer = malloc_or_die(scrap_buffer_size);
|
||||
|
||||
if (ringloop)
|
||||
{
|
||||
consumer.loop = [this]()
|
||||
{
|
||||
msgr.read_requests();
|
||||
msgr.send_replies();
|
||||
this->ringloop->submit();
|
||||
};
|
||||
ringloop->register_consumer(&consumer);
|
||||
}
|
||||
}
|
||||
|
||||
cluster_client_t::~cluster_client_t()
|
||||
@@ -103,24 +115,6 @@ cluster_op_t::~cluster_op_t()
|
||||
}
|
||||
}
|
||||
|
||||
void cluster_client_t::init_msgr()
|
||||
{
|
||||
if (msgr_initialized)
|
||||
return;
|
||||
msgr.init();
|
||||
msgr_initialized = true;
|
||||
if (ringloop)
|
||||
{
|
||||
consumer.loop = [this]()
|
||||
{
|
||||
msgr.read_requests();
|
||||
msgr.send_replies();
|
||||
this->ringloop->submit();
|
||||
};
|
||||
ringloop->register_consumer(&consumer);
|
||||
}
|
||||
}
|
||||
|
||||
void cluster_client_t::calc_wait(cluster_op_t *op)
|
||||
{
|
||||
op->prev_wait = 0;
|
||||
@@ -149,7 +143,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
|
||||
if (!op->prev_wait)
|
||||
continue_sync(op);
|
||||
}
|
||||
else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) */
|
||||
else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) */
|
||||
{
|
||||
for (auto prev = op_queue_head; prev && prev != op; prev = prev->next)
|
||||
{
|
||||
@@ -157,8 +151,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
|
||||
{
|
||||
op->prev_wait++;
|
||||
}
|
||||
else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ ||
|
||||
prev->opcode == OSD_OP_READ_BITMAP || prev->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
||||
else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ || prev->opcode == OSD_OP_READ_BITMAP)
|
||||
{
|
||||
// Flushes are always in the beginning (we're scanning from the beginning of the queue)
|
||||
break;
|
||||
@@ -178,8 +171,7 @@ void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *n
|
||||
auto n2 = next->next;
|
||||
if (next->opcode == OSD_OP_SYNC && !(flags & OP_IMMEDIATE_COMMIT) ||
|
||||
next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER) ||
|
||||
(next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP ||
|
||||
next->opcode == OSD_OP_READ_CHAIN_BITMAP) && (flags & OP_FLUSH_BUFFER))
|
||||
(next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP) && (flags & OP_FLUSH_BUFFER))
|
||||
{
|
||||
next->prev_wait += inc;
|
||||
assert(next->prev_wait >= 0);
|
||||
@@ -229,14 +221,11 @@ void cluster_client_t::erase_op(cluster_op_t *op)
|
||||
if (op_queue_tail == op)
|
||||
op_queue_tail = op->prev;
|
||||
op->next = op->prev = NULL;
|
||||
if (flags & OP_FLUSH_BUFFER)
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
if (!(flags & OP_IMMEDIATE_COMMIT))
|
||||
inc_wait(opcode, flags, next, -1);
|
||||
// Call callback at the end to avoid inconsistencies in prev_wait
|
||||
// if the callback adds more operations itself
|
||||
if (!(flags & OP_FLUSH_BUFFER))
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
}
|
||||
|
||||
void cluster_client_t::continue_ops(bool up_retry)
|
||||
@@ -348,8 +337,7 @@ void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes
|
||||
// And now they have to be resliced!
|
||||
for (auto op = op_queue_head; op; op = op->next)
|
||||
{
|
||||
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ ||
|
||||
op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) &&
|
||||
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) &&
|
||||
INODE_POOL(op->cur_inode) == pool_item.first)
|
||||
{
|
||||
op->needs_reslice = true;
|
||||
@@ -421,7 +409,7 @@ void cluster_client_t::on_ready(std::function<void(void)> fn)
|
||||
void cluster_client_t::execute(cluster_op_t *op)
|
||||
{
|
||||
if (op->opcode != OSD_OP_SYNC && op->opcode != OSD_OP_READ &&
|
||||
op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_WRITE)
|
||||
op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_WRITE)
|
||||
{
|
||||
op->retval = -EINVAL;
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
@@ -453,7 +441,7 @@ void cluster_client_t::execute(cluster_op_t *op)
|
||||
return;
|
||||
}
|
||||
// Check alignment
|
||||
if (!op->len && (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP || op->opcode == OSD_OP_WRITE) ||
|
||||
if ((op->opcode == OSD_OP_READ || op->opcode == OSD_OP_WRITE) && !op->len ||
|
||||
op->offset % pool_it->second.bitmap_granularity || op->len % pool_it->second.bitmap_granularity)
|
||||
{
|
||||
op->retval = -EINVAL;
|
||||
@@ -714,7 +702,8 @@ resume_3:
|
||||
// Finished successfully
|
||||
// Even if the PG count has changed in meanwhile we treat it as success
|
||||
// because if some operations were invalid for the new PG count we'd get errors
|
||||
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
||||
bool is_read = op->opcode == OSD_OP_READ;
|
||||
if (is_read)
|
||||
{
|
||||
// Check parent inode
|
||||
auto ino_it = st_cli.inode_config.find(op->cur_inode);
|
||||
@@ -738,11 +727,6 @@ resume_3:
|
||||
}
|
||||
}
|
||||
op->retval = op->len;
|
||||
if (op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
||||
{
|
||||
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->inode));
|
||||
op->retval = op->len / pool_cfg.bitmap_granularity;
|
||||
}
|
||||
erase_op(op);
|
||||
return 1;
|
||||
}
|
||||
@@ -766,10 +750,7 @@ resume_3:
|
||||
{
|
||||
for (int i = 0; i < op->parts.size(); i++)
|
||||
{
|
||||
if (!(op->parts[i].flags & PART_DONE))
|
||||
{
|
||||
op->parts[i].flags = PART_RETRY;
|
||||
}
|
||||
op->parts[i].flags = PART_RETRY;
|
||||
}
|
||||
goto resume_2;
|
||||
}
|
||||
@@ -828,19 +809,23 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
|
||||
uint64_t last_stripe = op->len > 0 ? ((op->offset + op->len - 1) / pg_block_size) * pg_block_size : first_stripe;
|
||||
op->retval = 0;
|
||||
op->parts.resize((last_stripe - first_stripe) / pg_block_size + 1);
|
||||
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
||||
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP)
|
||||
{
|
||||
// Allocate memory for the bitmap
|
||||
unsigned object_bitmap_size = ((op->len / pool_cfg.bitmap_granularity + 7) / 8);
|
||||
unsigned object_bitmap_size = (((op->opcode == OSD_OP_READ_BITMAP ? pg_block_size : op->len) / pool_cfg.bitmap_granularity + 7) / 8);
|
||||
object_bitmap_size = (object_bitmap_size < 8 ? 8 : object_bitmap_size);
|
||||
unsigned bitmap_mem = object_bitmap_size + (pool_cfg.data_block_size / pool_cfg.bitmap_granularity / 8 * pg_data_size) * op->parts.size();
|
||||
if (!op->bitmap_buf || op->bitmap_buf_size < bitmap_mem)
|
||||
if (op->bitmap_buf_size < bitmap_mem)
|
||||
{
|
||||
op->bitmap_buf = realloc_or_die(op->bitmap_buf, bitmap_mem);
|
||||
if (!op->bitmap_buf_size)
|
||||
{
|
||||
// First allocation
|
||||
memset(op->bitmap_buf, 0, object_bitmap_size);
|
||||
}
|
||||
op->part_bitmaps = (uint8_t*)op->bitmap_buf + object_bitmap_size;
|
||||
op->bitmap_buf_size = bitmap_mem;
|
||||
}
|
||||
memset(op->bitmap_buf, 0, bitmap_mem);
|
||||
}
|
||||
int iov_idx = 0;
|
||||
size_t iov_pos = 0;
|
||||
@@ -891,14 +876,13 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
|
||||
if (end == begin)
|
||||
op->done_count++;
|
||||
}
|
||||
else if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_DELETE)
|
||||
else if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_DELETE)
|
||||
{
|
||||
add_iov(end-begin, false, op, iov_idx, iov_pos, op->parts[i].iov, NULL, 0);
|
||||
}
|
||||
op->parts[i].parent = op;
|
||||
op->parts[i].offset = begin;
|
||||
op->parts[i].len = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP ||
|
||||
op->opcode == OSD_OP_DELETE ? 0 : (uint32_t)(end - begin);
|
||||
op->parts[i].len = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_DELETE ? 0 : (uint32_t)(end - begin);
|
||||
op->parts[i].pg_num = pg_num;
|
||||
op->parts[i].osd_num = 0;
|
||||
op->parts[i].flags = 0;
|
||||
@@ -927,10 +911,6 @@ bool cluster_client_t::affects_osd(uint64_t inode, uint64_t offset, uint64_t len
|
||||
|
||||
bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||
{
|
||||
if (!msgr_initialized)
|
||||
{
|
||||
init_msgr();
|
||||
}
|
||||
auto part = &op->parts[i];
|
||||
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->cur_inode));
|
||||
auto pg_it = pool_cfg.pg_config.find(part->pg_num);
|
||||
@@ -949,7 +929,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||
pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
|
||||
);
|
||||
uint64_t meta_rev = 0;
|
||||
if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_DELETE)
|
||||
if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_DELETE)
|
||||
{
|
||||
auto ino_it = st_cli.inode_config.find(op->inode);
|
||||
if (ino_it != st_cli.inode_config.end())
|
||||
@@ -962,7 +942,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = next_op_id(),
|
||||
.opcode = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP ? OSD_OP_READ : op->opcode,
|
||||
.opcode = op->opcode == OSD_OP_READ_BITMAP ? OSD_OP_READ : op->opcode,
|
||||
},
|
||||
.inode = op->cur_inode,
|
||||
.offset = part->offset,
|
||||
@@ -970,10 +950,8 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
|
||||
.meta_revision = meta_rev,
|
||||
.version = op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE ? op->version : 0,
|
||||
} },
|
||||
.bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP
|
||||
? (uint8_t*)op->part_bitmaps + pg_bitmap_size*i : NULL),
|
||||
.bitmap_len = (unsigned)(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP
|
||||
? pg_bitmap_size : 0),
|
||||
.bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? (uint8_t*)op->part_bitmaps + pg_bitmap_size*i : NULL),
|
||||
.bitmap_len = (unsigned)(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? pg_bitmap_size : 0),
|
||||
.callback = [this, part](osd_op_t *op_part)
|
||||
{
|
||||
handle_op_part(part);
|
||||
@@ -1152,11 +1130,11 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
||||
else
|
||||
{
|
||||
// OK
|
||||
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE) && !(op->flags & OP_IMMEDIATE_COMMIT))
|
||||
if (!(op->flags & OP_IMMEDIATE_COMMIT))
|
||||
dirty_osds.insert(part->osd_num);
|
||||
part->flags |= PART_DONE;
|
||||
op->done_count++;
|
||||
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
|
||||
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP)
|
||||
{
|
||||
copy_part_bitmap(op, part);
|
||||
op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
|
||||
@@ -1180,12 +1158,7 @@ void cluster_client_t::copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *par
|
||||
);
|
||||
uint32_t object_offset = (part->op.req.rw.offset - op->offset) / pool_cfg.bitmap_granularity;
|
||||
uint32_t part_offset = (part->op.req.rw.offset % pg_block_size) / pool_cfg.bitmap_granularity;
|
||||
uint32_t op_len = op->len / pool_cfg.bitmap_granularity;
|
||||
uint32_t part_len = pg_block_size/pool_cfg.bitmap_granularity - part_offset;
|
||||
if (part_len > op_len-object_offset)
|
||||
{
|
||||
part_len = op_len-object_offset;
|
||||
}
|
||||
uint32_t part_len = (op->opcode == OSD_OP_READ_BITMAP ? pg_block_size : part->op.req.rw.len) / pool_cfg.bitmap_granularity;
|
||||
if (!(object_offset & 0x7) && !(part_offset & 0x7) && (part_len >= 8))
|
||||
{
|
||||
// Copy bytes
|
||||
|
@@ -11,7 +11,6 @@
|
||||
#define INODE_LIST_DONE 1
|
||||
#define INODE_LIST_HAS_UNSTABLE 2
|
||||
#define OSD_OP_READ_BITMAP OSD_OP_SEC_READ_BMP
|
||||
#define OSD_OP_READ_CHAIN_BITMAP 0x102
|
||||
|
||||
#define OSD_OP_IGNORE_READONLY 0x08
|
||||
|
||||
@@ -31,7 +30,7 @@ struct cluster_op_part_t
|
||||
|
||||
struct cluster_op_t
|
||||
{
|
||||
uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE, OSD_OP_READ_BITMAP, OSD_OP_READ_CHAIN_BITMAP
|
||||
uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE, OSD_OP_READ_BITMAP
|
||||
uint64_t inode;
|
||||
uint64_t offset;
|
||||
uint64_t len;
|
||||
@@ -40,13 +39,9 @@ struct cluster_op_t
|
||||
uint64_t version = 0;
|
||||
// now only OSD_OP_IGNORE_READONLY is supported
|
||||
uint64_t flags = 0;
|
||||
// negative retval is an error number
|
||||
// write and read return len on success
|
||||
// sync and delete return 0 on success
|
||||
// read_bitmap and read_chain_bitmap return the length of bitmap in bits(!)
|
||||
int retval;
|
||||
osd_op_buf_list_t iov;
|
||||
// READ, READ_BITMAP, READ_CHAIN_BITMAP return the bitmap here
|
||||
// READ and READ_BITMAP return the bitmap here
|
||||
void *bitmap_buf = NULL;
|
||||
std::function<void(cluster_op_t*)> callback;
|
||||
~cluster_op_t();
|
||||
@@ -104,14 +99,10 @@ class cluster_client_t
|
||||
std::vector<std::function<void(void)>> on_ready_hooks;
|
||||
std::vector<inode_list_t*> lists;
|
||||
int continuing_ops = 0;
|
||||
bool msgr_initialized = false;
|
||||
|
||||
public:
|
||||
etcd_state_client_t st_cli;
|
||||
|
||||
osd_messenger_t msgr;
|
||||
void init_msgr();
|
||||
|
||||
json11::Json config;
|
||||
json11::Json::object merged_config;
|
||||
|
||||
|
@@ -281,7 +281,7 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
|
||||
if (je->big_write.size > sizeof(journal_entry_big_write))
|
||||
{
|
||||
printf(json ? ",\"bitmap\":\"" : " (bitmap: ");
|
||||
for (int i = sizeof(journal_entry_big_write); i < je->big_write.size; i++)
|
||||
for (int i = sizeof(journal_entry_big_write); i < je->small_write.size; i++)
|
||||
{
|
||||
printf("%02x", ((uint8_t*)je)[i]);
|
||||
}
|
||||
|
@@ -26,7 +26,7 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v1_t *)>
|
||||
buf_size = dsk.meta_len;
|
||||
void *data = memalign_or_die(MEM_ALIGNMENT, buf_size);
|
||||
lseek64(dsk.meta_fd, dsk.meta_offset, 0);
|
||||
read_blocking(dsk.meta_fd, data, dsk.meta_block_size);
|
||||
read_blocking(dsk.meta_fd, data, buf_size);
|
||||
// Check superblock
|
||||
blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)data;
|
||||
if (hdr->zero == 0 &&
|
||||
@@ -41,11 +41,8 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v1_t *)>
|
||||
if (buf_size % dsk.meta_block_size)
|
||||
{
|
||||
buf_size = 8*dsk.meta_block_size;
|
||||
void *new_data = memalign_or_die(MEM_ALIGNMENT, buf_size);
|
||||
memcpy(new_data, data, dsk.meta_block_size);
|
||||
free(data);
|
||||
data = new_data;
|
||||
hdr = (blockstore_meta_header_v1_t *)data;
|
||||
data = memalign_or_die(MEM_ALIGNMENT, buf_size);
|
||||
}
|
||||
}
|
||||
dsk.bitmap_granularity = hdr->bitmap_granularity;
|
||||
|
@@ -305,10 +305,10 @@ int write_zero(int fd, uint64_t offset, uint64_t size)
|
||||
json11::Json read_parttable(std::string dev)
|
||||
{
|
||||
std::string part_dump;
|
||||
int r = shell_exec({ "sfdisk", "--json", dev }, "", &part_dump, NULL);
|
||||
int r = shell_exec({ "sfdisk", "--dump", dev, "--json" }, "", &part_dump, NULL);
|
||||
if (r == 255)
|
||||
{
|
||||
fprintf(stderr, "Error running sfdisk --json %s\n", dev.c_str());
|
||||
fprintf(stderr, "Error running sfdisk --dump %s --json\n", dev.c_str());
|
||||
return json11::Json(false);
|
||||
}
|
||||
// Decode partition table
|
||||
@@ -319,7 +319,7 @@ json11::Json read_parttable(std::string dev)
|
||||
pt = json11::Json::parse(part_dump, err);
|
||||
if (err != "")
|
||||
{
|
||||
fprintf(stderr, "sfdisk --json %s returned bad JSON: %s\n", dev.c_str(), part_dump.c_str());
|
||||
fprintf(stderr, "sfdisk --dump %s --json returned bad JSON: %s\n", dev.c_str(), part_dump.c_str());
|
||||
return json11::Json(false);
|
||||
}
|
||||
pt = pt["partitiontable"];
|
||||
|
@@ -867,6 +867,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "RECEIVED PG %u/%u HISTORY: %s\n", pool_id, pg_num, value.dump().c_str());
|
||||
auto & pg_cfg = this->pool_config[pool_id].pg_config[pg_num];
|
||||
pg_cfg.target_history.clear();
|
||||
pg_cfg.all_peers.clear();
|
||||
|
@@ -157,10 +157,10 @@ void osd_messenger_t::parse_config(const json11::Json & config)
|
||||
this->rdma_max_sge = 128;
|
||||
this->rdma_max_send = config["rdma_max_send"].uint64_value();
|
||||
if (!this->rdma_max_send)
|
||||
this->rdma_max_send = 8;
|
||||
this->rdma_max_send = 1;
|
||||
this->rdma_max_recv = config["rdma_max_recv"].uint64_value();
|
||||
if (!this->rdma_max_recv)
|
||||
this->rdma_max_recv = 16;
|
||||
this->rdma_max_recv = 128;
|
||||
this->rdma_max_msg = config["rdma_max_msg"].uint64_value();
|
||||
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
|
||||
this->rdma_max_msg = 129*1024;
|
||||
|
@@ -138,7 +138,6 @@ protected:
|
||||
|
||||
std::vector<int> read_ready_clients;
|
||||
std::vector<int> write_ready_clients;
|
||||
// We don't use ringloop->set_immediate here because we may have no ringloop in client :)
|
||||
std::vector<std::function<void()>> set_immediate;
|
||||
|
||||
public:
|
||||
|
@@ -353,10 +353,8 @@ static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
|
||||
.wr_id = (uint64_t)(cl->peer_fd*2+1),
|
||||
.sg_list = sge,
|
||||
.num_sge = op_sge,
|
||||
.opcode = cl->rdma_conn->avail_recv > 0 ? IBV_WR_SEND_WITH_IMM : IBV_WR_SEND,
|
||||
.opcode = IBV_WR_SEND,
|
||||
.send_flags = IBV_SEND_SIGNALED,
|
||||
// Notify peer about our available incoming buffers
|
||||
.imm_data = (uint32_t)cl->rdma_conn->avail_recv,
|
||||
};
|
||||
int err = ibv_post_send(cl->rdma_conn->qp, &wr, &bad_wr);
|
||||
if (err || bad_wr)
|
||||
@@ -365,25 +363,14 @@ static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
|
||||
exit(1);
|
||||
}
|
||||
cl->rdma_conn->cur_send++;
|
||||
cl->rdma_conn->avail_send--;
|
||||
cl->rdma_conn->avail_recv = 0;
|
||||
}
|
||||
|
||||
bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
|
||||
{
|
||||
auto rc = cl->rdma_conn;
|
||||
if (rc->cur_send >= rc->max_send)
|
||||
if (!cl->send_list.size() || rc->cur_send > 0)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
if (!cl->send_list.size() || rc->use_flow_control && rc->avail_send <= 0)
|
||||
{
|
||||
if (rc->avail_recv)
|
||||
{
|
||||
// Only notify about available buffers so 2 peers don't lock each other
|
||||
rc->send_sizes.push_back(0);
|
||||
try_send_rdma_wr(cl, NULL, 0);
|
||||
}
|
||||
// Only send one batch at a time
|
||||
return true;
|
||||
}
|
||||
uint64_t op_size = 0, op_sge = 0;
|
||||
@@ -393,7 +380,6 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
|
||||
iovec & iov = cl->send_list[rc->send_pos];
|
||||
if (op_size >= rc->max_msg || op_sge >= rc->max_sge)
|
||||
{
|
||||
rc->send_sizes.push_back(op_size);
|
||||
try_send_rdma_wr(cl, sge, op_sge);
|
||||
op_sge = 0;
|
||||
op_size = 0;
|
||||
@@ -419,24 +405,18 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
|
||||
}
|
||||
if (op_sge > 0)
|
||||
{
|
||||
rc->send_sizes.push_back(op_size);
|
||||
try_send_rdma_wr(cl, sge, op_sge);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static void try_recv_rdma_wr(osd_client_t *cl, void *buf)
|
||||
static void try_recv_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
|
||||
{
|
||||
ibv_sge sge = {
|
||||
.addr = (uintptr_t)buf,
|
||||
.length = (uint32_t)cl->rdma_conn->max_msg,
|
||||
.lkey = cl->rdma_conn->ctx->mr->lkey,
|
||||
};
|
||||
ibv_recv_wr *bad_wr = NULL;
|
||||
ibv_recv_wr wr = {
|
||||
.wr_id = (uint64_t)(cl->peer_fd*2),
|
||||
.sg_list = &sge,
|
||||
.num_sge = 1,
|
||||
.sg_list = sge,
|
||||
.num_sge = op_sge,
|
||||
};
|
||||
int err = ibv_post_recv(cl->rdma_conn->qp, &wr, &bad_wr);
|
||||
if (err || bad_wr)
|
||||
@@ -445,7 +425,6 @@ static void try_recv_rdma_wr(osd_client_t *cl, void *buf)
|
||||
exit(1);
|
||||
}
|
||||
cl->rdma_conn->cur_recv++;
|
||||
cl->rdma_conn->avail_recv++;
|
||||
}
|
||||
|
||||
bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
|
||||
@@ -455,7 +434,12 @@ bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
|
||||
{
|
||||
void *buf = malloc_or_die(rc->max_msg);
|
||||
rc->recv_buffers.push_back(buf);
|
||||
try_recv_rdma_wr(cl, buf);
|
||||
ibv_sge sge = {
|
||||
.addr = (uintptr_t)buf,
|
||||
.length = (uint32_t)rc->max_msg,
|
||||
.lkey = rc->ctx->mr->lkey,
|
||||
};
|
||||
try_recv_rdma_wr(cl, &sge, 1);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@@ -492,7 +476,6 @@ void osd_messenger_t::handle_rdma_events()
|
||||
continue;
|
||||
}
|
||||
osd_client_t *cl = cl_it->second;
|
||||
auto rc = cl->rdma_conn;
|
||||
if (wc[i].status != IBV_WC_SUCCESS)
|
||||
{
|
||||
fprintf(stderr, "RDMA work request failed for client %d", client_id);
|
||||
@@ -506,65 +489,44 @@ void osd_messenger_t::handle_rdma_events()
|
||||
}
|
||||
if (!is_send)
|
||||
{
|
||||
rc->cur_recv--;
|
||||
if ((wc[i].wc_flags & IBV_WC_WITH_IMM) && wc[i].imm_data > 0)
|
||||
{
|
||||
rc->avail_send += wc[i].imm_data;
|
||||
rc->use_flow_control = true;
|
||||
}
|
||||
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf], wc[i].byte_len))
|
||||
cl->rdma_conn->cur_recv--;
|
||||
if (!handle_read_buffer(cl, cl->rdma_conn->recv_buffers[0], wc[i].byte_len))
|
||||
{
|
||||
// handle_read_buffer may stop the client
|
||||
continue;
|
||||
}
|
||||
try_recv_rdma_wr(cl, rc->recv_buffers[rc->next_recv_buf]);
|
||||
rc->next_recv_buf = (rc->next_recv_buf+1) % rc->recv_buffers.size();
|
||||
try_send_rdma(cl);
|
||||
free(cl->rdma_conn->recv_buffers[0]);
|
||||
cl->rdma_conn->recv_buffers.erase(cl->rdma_conn->recv_buffers.begin(), cl->rdma_conn->recv_buffers.begin()+1);
|
||||
try_recv_rdma(cl);
|
||||
}
|
||||
else
|
||||
{
|
||||
rc->cur_send--;
|
||||
uint64_t sent_size = rc->send_sizes.at(0);
|
||||
rc->send_sizes.erase(rc->send_sizes.begin(), rc->send_sizes.begin()+1);
|
||||
int send_pos = 0, send_buf_pos = 0;
|
||||
while (sent_size > 0)
|
||||
cl->rdma_conn->cur_send--;
|
||||
if (!cl->rdma_conn->cur_send)
|
||||
{
|
||||
if (sent_size >= cl->send_list.at(send_pos).iov_len)
|
||||
// Wait for the whole batch
|
||||
for (int i = 0; i < cl->rdma_conn->send_pos; i++)
|
||||
{
|
||||
sent_size -= cl->send_list[send_pos].iov_len;
|
||||
send_pos++;
|
||||
if (cl->outbox[i].flags & MSGR_SENDP_FREE)
|
||||
{
|
||||
// Reply fully sent
|
||||
delete cl->outbox[i].op;
|
||||
}
|
||||
}
|
||||
else
|
||||
if (cl->rdma_conn->send_pos > 0)
|
||||
{
|
||||
send_buf_pos = sent_size;
|
||||
sent_size = 0;
|
||||
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+cl->rdma_conn->send_pos);
|
||||
cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+cl->rdma_conn->send_pos);
|
||||
cl->rdma_conn->send_pos = 0;
|
||||
}
|
||||
}
|
||||
assert(rc->send_pos >= send_pos);
|
||||
if (rc->send_pos == send_pos)
|
||||
{
|
||||
rc->send_buf_pos -= send_buf_pos;
|
||||
}
|
||||
rc->send_pos -= send_pos;
|
||||
for (int i = 0; i < send_pos; i++)
|
||||
{
|
||||
if (cl->outbox[i].flags & MSGR_SENDP_FREE)
|
||||
if (cl->rdma_conn->send_buf_pos > 0)
|
||||
{
|
||||
// Reply fully sent
|
||||
delete cl->outbox[i].op;
|
||||
cl->send_list[0].iov_base = (uint8_t*)cl->send_list[0].iov_base + cl->rdma_conn->send_buf_pos;
|
||||
cl->send_list[0].iov_len -= cl->rdma_conn->send_buf_pos;
|
||||
cl->rdma_conn->send_buf_pos = 0;
|
||||
}
|
||||
try_send_rdma(cl);
|
||||
}
|
||||
if (send_pos > 0)
|
||||
{
|
||||
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+send_pos);
|
||||
cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+send_pos);
|
||||
}
|
||||
if (send_buf_pos > 0)
|
||||
{
|
||||
cl->send_list[0].iov_base = (uint8_t*)cl->send_list[0].iov_base + send_buf_pos;
|
||||
cl->send_list[0].iov_len -= send_buf_pos;
|
||||
}
|
||||
try_send_rdma(cl);
|
||||
}
|
||||
}
|
||||
} while (event_count > 0);
|
||||
|
@@ -45,14 +45,12 @@ struct msgr_rdma_connection_t
|
||||
ibv_qp *qp = NULL;
|
||||
msgr_rdma_address_t addr;
|
||||
int max_send = 0, max_recv = 0, max_sge = 0;
|
||||
int cur_send = 0, cur_recv = 0, avail_recv = 0, avail_send = 0;
|
||||
bool use_flow_control = false;
|
||||
int cur_send = 0, cur_recv = 0;
|
||||
uint64_t max_msg = 0;
|
||||
|
||||
int send_pos = 0, send_buf_pos = 0;
|
||||
int next_recv_buf = 0;
|
||||
int recv_pos = 0, recv_buf_pos = 0;
|
||||
std::vector<void*> recv_buffers;
|
||||
std::vector<uint64_t> send_sizes;
|
||||
|
||||
~msgr_rdma_connection_t();
|
||||
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);
|
||||
|
10
src/osd.cpp
10
src/osd.cpp
@@ -44,10 +44,9 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
|
||||
// FIXME: Use timerfd_interval based directly on io_uring
|
||||
this->tfd = epmgr->tfd;
|
||||
|
||||
if (!json_is_true(this->config["disable_blockstore"]))
|
||||
auto bs_cfg = json_to_bs(this->config);
|
||||
this->bs = new blockstore_t(bs_cfg, ringloop, tfd);
|
||||
{
|
||||
auto bs_cfg = json_to_bs(this->config);
|
||||
this->bs = new blockstore_t(bs_cfg, ringloop, tfd);
|
||||
// Autosync based on the number of unstable writes to prevent stalls due to insufficient journal space
|
||||
uint64_t max_autosync = bs->get_journal_size() / bs->get_block_size() / 2;
|
||||
if (autosync_writes > max_autosync)
|
||||
@@ -94,8 +93,7 @@ osd_t::~osd_t()
|
||||
{
|
||||
ringloop->unregister_consumer(&consumer);
|
||||
delete epmgr;
|
||||
if (bs)
|
||||
delete bs;
|
||||
delete bs;
|
||||
close(listen_fd);
|
||||
free(zero_buffer);
|
||||
}
|
||||
@@ -477,7 +475,7 @@ void osd_t::print_slow()
|
||||
}
|
||||
}
|
||||
}
|
||||
if (has_slow && bs)
|
||||
if (has_slow)
|
||||
{
|
||||
bs->dump_diagnostics();
|
||||
}
|
||||
|
@@ -152,7 +152,7 @@ class osd_t
|
||||
|
||||
bool stopping = false;
|
||||
int inflight_ops = 0;
|
||||
blockstore_t *bs = NULL;
|
||||
blockstore_t *bs;
|
||||
void *zero_buffer = NULL;
|
||||
uint64_t zero_buffer_size = 0;
|
||||
uint32_t bs_block_size, bs_bitmap_granularity, clean_entry_bitmap_size;
|
||||
|
@@ -182,10 +182,10 @@ json11::Json osd_t::get_statistics()
|
||||
char time_str[50] = { 0 };
|
||||
sprintf(time_str, "%ld.%03ld", ts.tv_sec, ts.tv_nsec/1000000);
|
||||
st["time"] = time_str;
|
||||
st["blockstore_ready"] = bs->is_started();
|
||||
st["data_block_size"] = (uint64_t)bs->get_block_size();
|
||||
if (bs)
|
||||
{
|
||||
st["blockstore_ready"] = bs->is_started();
|
||||
st["data_block_size"] = (uint64_t)bs->get_block_size();
|
||||
st["size"] = bs->get_block_count() * bs->get_block_size();
|
||||
st["free"] = bs->get_free_block_count() * bs->get_block_size();
|
||||
}
|
||||
@@ -233,8 +233,7 @@ void osd_t::report_statistics()
|
||||
json11::Json::object inode_space;
|
||||
json11::Json::object last_stat;
|
||||
pool_id_t last_pool = 0;
|
||||
std::map<uint64_t, uint64_t> bs_empty_space;
|
||||
auto & bs_inode_space = bs ? bs->get_inode_space_stats() : bs_empty_space;
|
||||
auto & bs_inode_space = bs->get_inode_space_stats();
|
||||
for (auto kv: bs_inode_space)
|
||||
{
|
||||
pool_id_t pool_id = INODE_POOL(kv.first);
|
||||
@@ -684,7 +683,7 @@ void osd_t::apply_pg_config()
|
||||
auto vec_all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end());
|
||||
if (currently_taken)
|
||||
{
|
||||
if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING))
|
||||
if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING | PG_PEERED))
|
||||
{
|
||||
if (pg_it->second.target_set == pg_cfg.target_set &&
|
||||
pg_it->second.target_history == pg_cfg.target_history &&
|
||||
@@ -695,6 +694,20 @@ void osd_t::apply_pg_config()
|
||||
}
|
||||
else
|
||||
{
|
||||
printf(
|
||||
"Repeer %u/%u because of history: %s vs %s\n",
|
||||
pool_id, pg_num,
|
||||
json11::Json(json11::Json::object {
|
||||
{ "target_set", pg_cfg.target_set },
|
||||
{ "osd_sets", pg_cfg.target_history },
|
||||
{ "all_peers", vec_all_peers },
|
||||
}).dump().c_str(),
|
||||
json11::Json(json11::Json::object {
|
||||
{ "target_set", pg_it->second.target_set },
|
||||
{ "osd_sets", pg_it->second.target_history },
|
||||
{ "all_peers", pg_it->second.all_peers },
|
||||
}).dump().c_str()
|
||||
);
|
||||
// Stop PG, reapply change after stopping
|
||||
stop_pg(pg_it->second);
|
||||
all_applied = false;
|
||||
@@ -872,6 +885,7 @@ void osd_t::report_pg_states()
|
||||
{ "all_peers", pg.all_peers },
|
||||
{ "osd_sets", pg.target_history },
|
||||
};
|
||||
printf("PG %u/%u HISTORY -> %s\n", pg.pool_id, pg.pg_num, json11::Json(history_value).dump().c_str());
|
||||
checks.push_back(json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", history_key },
|
||||
@@ -964,6 +978,13 @@ void osd_t::report_pg_states()
|
||||
}
|
||||
this->pgs.erase(pg_it);
|
||||
}
|
||||
else if (pg_it->second.state & PG_PEERED)
|
||||
{
|
||||
// Activate PG after PG PEERED state is reported along with history
|
||||
// (if the state wasn't changed again)
|
||||
pg_it->second.state = pg_it->second.state & ~PG_PEERED | PG_ACTIVE;
|
||||
report_pg_state(pg_it->second);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Push other PG state updates, if any
|
||||
|
@@ -50,6 +50,10 @@ void osd_t::handle_peers()
|
||||
still = true;
|
||||
}
|
||||
}
|
||||
else if (p.second.state & PG_PEERED)
|
||||
{
|
||||
still = true;
|
||||
}
|
||||
}
|
||||
if (!still)
|
||||
{
|
||||
@@ -70,6 +74,10 @@ void osd_t::handle_peers()
|
||||
}
|
||||
still = true;
|
||||
}
|
||||
else if (p.second.state & PG_PEERED)
|
||||
{
|
||||
still = true;
|
||||
}
|
||||
}
|
||||
if (!still)
|
||||
{
|
||||
@@ -92,7 +100,7 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
|
||||
{
|
||||
auto & pg = p.second;
|
||||
bool repeer = false;
|
||||
if (pg.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
|
||||
if (pg.state & (PG_PEERING | PG_PEERED | PG_ACTIVE | PG_INCOMPLETE))
|
||||
{
|
||||
for (osd_num_t pg_osd: pg.all_peers)
|
||||
{
|
||||
|
@@ -88,9 +88,13 @@ void pg_obj_state_check_t::walk()
|
||||
{
|
||||
// Activate as degraded
|
||||
// Current OSD set will be added into target_history on first write
|
||||
pg->state |= PG_DEGRADED;
|
||||
pg->state |= PG_DEGRADED | PG_PEERED;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Just activate
|
||||
pg->state |= PG_ACTIVE;
|
||||
}
|
||||
pg->state |= PG_ACTIVE;
|
||||
if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size())
|
||||
{
|
||||
pg->state |= PG_LEFT_ON_DEAD;
|
||||
@@ -456,10 +460,11 @@ void pg_t::calc_object_states(int log_level)
|
||||
void pg_t::print_state()
|
||||
{
|
||||
printf(
|
||||
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
|
||||
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
|
||||
(state & PG_STARTING) ? "starting" : "",
|
||||
(state & PG_OFFLINE) ? "offline" : "",
|
||||
(state & PG_PEERING) ? "peering" : "",
|
||||
(state & PG_PEERED) ? "peered" : "",
|
||||
(state & PG_INCOMPLETE) ? "incomplete" : "",
|
||||
(state & PG_ACTIVE) ? "active" : "",
|
||||
(state & PG_REPEERING) ? "repeering" : "",
|
||||
|
@@ -54,6 +54,5 @@ int main(int argc, char *argv[])
|
||||
{
|
||||
printf("dev: state=%lx\n", it.second.state);
|
||||
}
|
||||
delete pg.peering_state;
|
||||
return 0;
|
||||
}
|
||||
|
@@ -297,7 +297,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
|
||||
// Fail it immediately
|
||||
subop->peer_fd = -1;
|
||||
subop->reply.hdr.retval = -EPIPE;
|
||||
ringloop->set_immediate([subop]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
|
||||
subop->callback(subop);
|
||||
}
|
||||
subop_idx++;
|
||||
}
|
||||
|
@@ -53,10 +53,7 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
|
||||
inode_stats[cur_op->req.rw.inode].op_count[inode_st_op]++;
|
||||
inode_stats[cur_op->req.rw.inode].op_sum[inode_st_op] += usec;
|
||||
if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
|
||||
{
|
||||
if (cur_op->op_data)
|
||||
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->op_data->pg_data_size * bs_block_size;
|
||||
}
|
||||
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->op_data->pg_data_size * bs_block_size;
|
||||
else
|
||||
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->req.rw.len;
|
||||
}
|
||||
@@ -238,7 +235,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
||||
// Fail it immediately
|
||||
subop->peer_fd = -1;
|
||||
subop->reply.hdr.retval = -EPIPE;
|
||||
ringloop->set_immediate([subop]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
|
||||
subop->callback(subop);
|
||||
}
|
||||
}
|
||||
i++;
|
||||
@@ -523,7 +520,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
|
||||
// Fail it immediately
|
||||
subops[i].peer_fd = -1;
|
||||
subops[i].reply.hdr.retval = -EPIPE;
|
||||
ringloop->set_immediate([subop = &subops[i]]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
|
||||
subops[i].callback(&subops[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -638,7 +635,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
||||
// Fail it immediately
|
||||
subops[i].peer_fd = -1;
|
||||
subops[i].reply.hdr.retval = -EPIPE;
|
||||
ringloop->set_immediate([subop = &subops[i]]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
|
||||
subops[i].callback(&subops[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -168,8 +168,8 @@ resume_3:
|
||||
auto it = std::lower_bound(pg.target_history.begin(), pg.target_history.end(), history_set);
|
||||
if (it == pg.target_history.end() || *it != history_set)
|
||||
pg.target_history.insert(it, history_set);
|
||||
pg.history_changed = true;
|
||||
}
|
||||
pg.history_changed = true;
|
||||
report_pg_states();
|
||||
resume_10:
|
||||
if (pg.epoch > pg.reported_epoch)
|
||||
|
@@ -759,18 +759,7 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
|
||||
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_granularity,
|
||||
uint32_t &start, uint32_t &end)
|
||||
{
|
||||
bool required = false;
|
||||
for (int role = pg_minsize; role < pg_size; role++)
|
||||
{
|
||||
if (write_osd_set[role] != 0)
|
||||
{
|
||||
// Whole parity chunk is needed when we move the object
|
||||
if (write_osd_set[role] != read_osd_set[role])
|
||||
end = chunk_size;
|
||||
required = true;
|
||||
}
|
||||
}
|
||||
if (required && end != chunk_size)
|
||||
if (write_osd_set[pg_minsize] != 0 || write_osd_set != read_osd_set)
|
||||
{
|
||||
// start & end are required for calc_rmw_parity
|
||||
for (int role = 0; role < pg_minsize; role++)
|
||||
@@ -781,6 +770,14 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
|
||||
end = std::max(stripes[role].req_end, end);
|
||||
}
|
||||
}
|
||||
for (int role = pg_minsize; role < pg_size; role++)
|
||||
{
|
||||
if (write_osd_set[role] != 0 && write_osd_set[role] != read_osd_set[role])
|
||||
{
|
||||
start = 0;
|
||||
end = chunk_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Set bitmap bits accordingly
|
||||
if (bitmap_granularity > 0)
|
||||
@@ -948,7 +945,7 @@ void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
|
||||
{
|
||||
if (write_osd_set[i])
|
||||
{
|
||||
memcpy((uint8_t*)subm + item_size*pg_minsize*j, (uint8_t*)matrix_data + item_size*pg_minsize*(i-pg_minsize), item_size*pg_minsize);
|
||||
memcpy(subm + item_size*pg_minsize*j, matrix_data + item_size*pg_minsize*(i-pg_minsize), item_size*pg_minsize);
|
||||
j++;
|
||||
}
|
||||
}
|
||||
|
@@ -24,7 +24,7 @@ void test11();
|
||||
void test12();
|
||||
void test13();
|
||||
void test14();
|
||||
void test15(bool second);
|
||||
void test15();
|
||||
void test16();
|
||||
|
||||
int main(int narg, char *args[])
|
||||
@@ -54,8 +54,7 @@ int main(int narg, char *args[])
|
||||
// Test 14
|
||||
test14();
|
||||
// Test 15
|
||||
test15(false);
|
||||
test15(true);
|
||||
test15();
|
||||
// Test 16
|
||||
test16();
|
||||
// End
|
||||
@@ -827,11 +826,12 @@ void test14()
|
||||
|
||||
***/
|
||||
|
||||
void test15(bool second)
|
||||
void test15()
|
||||
{
|
||||
const int bmp = 64*1024 / 4096 / 8;
|
||||
use_ec(4, 2, true);
|
||||
osd_num_t osd_set[4] = { 1, 2, (osd_num_t)(second ? 0 : 3), (osd_num_t)(second ? 4 : 0) };
|
||||
osd_num_t osd_set[4] = { 1, 2, 3, 0 };
|
||||
osd_num_t write_osd_set[4] = { 1, 2, 3, 0 };
|
||||
osd_rmw_stripe_t stripes[4] = {};
|
||||
unsigned bitmaps[4] = { 0 };
|
||||
// Test 15.0
|
||||
@@ -842,7 +842,7 @@ void test15(bool second)
|
||||
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
|
||||
assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
|
||||
// Test 15.1
|
||||
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, osd_set, 64*1024, bmp);
|
||||
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, write_osd_set, 64*1024, bmp);
|
||||
for (int i = 0; i < 4; i++)
|
||||
stripes[i].bmp_buf = bitmaps+i;
|
||||
assert(rmw_buf);
|
||||
@@ -852,38 +852,36 @@ void test15(bool second)
|
||||
assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
|
||||
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
|
||||
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
|
||||
assert(stripes[2+second].write_start == 28*1024 && stripes[2+second].write_end == 32*1024);
|
||||
assert(stripes[3-second].write_start == 0 && stripes[3-second].write_end == 0);
|
||||
assert(stripes[2].write_start == 28*1024 && stripes[2].write_end == 32*1024);
|
||||
assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
|
||||
assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
|
||||
assert(stripes[1].read_buf == NULL);
|
||||
assert(stripes[2].read_buf == NULL);
|
||||
assert(stripes[3].read_buf == NULL);
|
||||
assert(stripes[0].write_buf == NULL);
|
||||
assert(stripes[1].write_buf == (uint8_t*)write_buf);
|
||||
assert(stripes[2+second].write_buf == rmw_buf);
|
||||
assert(stripes[3-second].write_buf == NULL);
|
||||
assert(stripes[2].write_buf == rmw_buf);
|
||||
assert(stripes[3].write_buf == NULL);
|
||||
// Test 15.2 - encode
|
||||
set_pattern(write_buf, 4*1024, PATTERN1);
|
||||
set_pattern(stripes[0].read_buf, 4*1024, PATTERN2);
|
||||
memset(stripes[0].bmp_buf, 0, bmp);
|
||||
memset(stripes[1].bmp_buf, 0, bmp);
|
||||
memset(stripes[2+second].write_buf, 0, 4096);
|
||||
calc_rmw_parity_ec(stripes, 4, 2, osd_set, osd_set, 64*1024, bmp);
|
||||
assert(second || *(uint32_t*)stripes[2].bmp_buf == 0x80);
|
||||
calc_rmw_parity_ec(stripes, 4, 2, osd_set, write_osd_set, 64*1024, bmp);
|
||||
assert(*(uint32_t*)stripes[2].bmp_buf == 0x80);
|
||||
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
|
||||
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
|
||||
assert(stripes[2+second].write_start == 28*1024 && stripes[2+second].write_end == 32*1024);
|
||||
assert(stripes[3-second].write_start == 0 && stripes[3-second].write_end == 0);
|
||||
assert(stripes[2].write_start == 28*1024 && stripes[2].write_end == 32*1024);
|
||||
assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
|
||||
assert(stripes[0].write_buf == NULL);
|
||||
assert(stripes[1].write_buf == (uint8_t*)write_buf);
|
||||
assert(stripes[2+second].write_buf == rmw_buf);
|
||||
assert(stripes[3-second].write_buf == NULL);
|
||||
// first parity is always xor :), second isn't...
|
||||
check_pattern(stripes[2+second].write_buf, 4*1024, second ? 0xb79a59a0ce8b9b81 : PATTERN1^PATTERN2);
|
||||
assert(stripes[2].write_buf == rmw_buf);
|
||||
assert(stripes[3].write_buf == NULL);
|
||||
check_pattern(stripes[2].write_buf, 4*1024, PATTERN1^PATTERN2); // first parity is always xor :)
|
||||
// Done
|
||||
free(rmw_buf);
|
||||
free(write_buf);
|
||||
use_ec(4, 2, false);
|
||||
use_ec(3, 2, false);
|
||||
}
|
||||
|
||||
/***
|
||||
@@ -986,5 +984,5 @@ void test16()
|
||||
// Done
|
||||
free(rmw_buf);
|
||||
free(write_buf);
|
||||
use_ec(4, 2, false);
|
||||
use_ec(3, 2, false);
|
||||
}
|
||||
|
@@ -150,7 +150,6 @@ int connect_osd(const char *osd_address, int osd_port)
|
||||
if (connect(connect_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
|
||||
{
|
||||
perror("connect");
|
||||
close(connect_fd);
|
||||
return -1;
|
||||
}
|
||||
int one = 1;
|
||||
|
@@ -3,11 +3,12 @@
|
||||
|
||||
#include "pg_states.h"
|
||||
|
||||
const int pg_state_bit_count = 14;
|
||||
const int pg_state_bit_count = 16;
|
||||
|
||||
const int pg_state_bits[14] = {
|
||||
const int pg_state_bits[16] = {
|
||||
PG_STARTING,
|
||||
PG_PEERING,
|
||||
PG_PEERED,
|
||||
PG_INCOMPLETE,
|
||||
PG_ACTIVE,
|
||||
PG_REPEERING,
|
||||
@@ -22,9 +23,10 @@ const int pg_state_bits[14] = {
|
||||
PG_LEFT_ON_DEAD,
|
||||
};
|
||||
|
||||
const char *pg_state_names[14] = {
|
||||
const char *pg_state_names[16] = {
|
||||
"starting",
|
||||
"peering",
|
||||
"peered",
|
||||
"incomplete",
|
||||
"active",
|
||||
"repeering",
|
||||
|
@@ -4,25 +4,27 @@
|
||||
#pragma once
|
||||
|
||||
// Placement group states
|
||||
// STARTING -> [acquire lock] -> PEERING -> INCOMPLETE|ACTIVE
|
||||
// STARTING -> [acquire lock] -> PEERING -> PEERED
|
||||
// PEERED -> [report history if required!] -> INCOMPLETE|ACTIVE
|
||||
// ACTIVE -> REPEERING -> PEERING
|
||||
// ACTIVE -> STOPPING -> OFFLINE -> [release lock]
|
||||
// Exactly one of these:
|
||||
#define PG_STARTING (1<<0)
|
||||
#define PG_PEERING (1<<1)
|
||||
#define PG_INCOMPLETE (1<<2)
|
||||
#define PG_ACTIVE (1<<3)
|
||||
#define PG_REPEERING (1<<4)
|
||||
#define PG_STOPPING (1<<5)
|
||||
#define PG_OFFLINE (1<<6)
|
||||
#define PG_PEERED (1<<2)
|
||||
#define PG_INCOMPLETE (1<<3)
|
||||
#define PG_ACTIVE (1<<4)
|
||||
#define PG_REPEERING (1<<5)
|
||||
#define PG_STOPPING (1<<6)
|
||||
#define PG_OFFLINE (1<<7)
|
||||
// Plus any of these:
|
||||
#define PG_DEGRADED (1<<7)
|
||||
#define PG_HAS_INCOMPLETE (1<<8)
|
||||
#define PG_HAS_DEGRADED (1<<9)
|
||||
#define PG_HAS_MISPLACED (1<<10)
|
||||
#define PG_HAS_UNCLEAN (1<<11)
|
||||
#define PG_HAS_INVALID (1<<12)
|
||||
#define PG_LEFT_ON_DEAD (1<<13)
|
||||
#define PG_DEGRADED (1<<8)
|
||||
#define PG_HAS_INCOMPLETE (1<<9)
|
||||
#define PG_HAS_DEGRADED (1<<10)
|
||||
#define PG_HAS_MISPLACED (1<<11)
|
||||
#define PG_HAS_UNCLEAN (1<<12)
|
||||
#define PG_HAS_INVALID (1<<13)
|
||||
#define PG_LEFT_ON_DEAD (1<<14)
|
||||
|
||||
// Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
|
||||
// 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size
|
||||
|
@@ -53,7 +53,6 @@ typedef struct VitastorClient
|
||||
char *etcd_host;
|
||||
char *etcd_prefix;
|
||||
char *image;
|
||||
int skip_parents;
|
||||
uint64_t inode;
|
||||
uint64_t pool;
|
||||
uint64_t size;
|
||||
@@ -64,10 +63,6 @@ typedef struct VitastorClient
|
||||
int rdma_gid_index;
|
||||
int rdma_mtu;
|
||||
QemuMutex mutex;
|
||||
|
||||
uint64_t last_bitmap_inode, last_bitmap_offset, last_bitmap_len;
|
||||
uint32_t last_bitmap_granularity;
|
||||
uint8_t *last_bitmap;
|
||||
} VitastorClient;
|
||||
|
||||
typedef struct VitastorRPC
|
||||
@@ -77,9 +72,6 @@ typedef struct VitastorRPC
|
||||
QEMUIOVector *iov;
|
||||
long ret;
|
||||
int complete;
|
||||
uint64_t inode, offset, len;
|
||||
uint32_t bitmap_granularity;
|
||||
uint8_t *bitmap;
|
||||
} VitastorRPC;
|
||||
|
||||
static void vitastor_co_init_task(BlockDriverState *bs, VitastorRPC *task);
|
||||
@@ -155,7 +147,6 @@ static void vitastor_parse_filename(const char *filename, QDict *options, Error
|
||||
if (!strcmp(name, "inode") ||
|
||||
!strcmp(name, "pool") ||
|
||||
!strcmp(name, "size") ||
|
||||
!strcmp(name, "skip-parents") ||
|
||||
!strcmp(name, "use-rdma") ||
|
||||
!strcmp(name, "rdma-port_num") ||
|
||||
!strcmp(name, "rdma-gid-index") ||
|
||||
@@ -236,16 +227,13 @@ static void vitastor_aio_set_fd_handler(void *ctx, int fd, int unused1, IOHandle
|
||||
|
||||
static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
|
||||
{
|
||||
VitastorRPC task;
|
||||
VitastorClient *client = bs->opaque;
|
||||
void *image = NULL;
|
||||
int64_t ret = 0;
|
||||
qemu_mutex_init(&client->mutex);
|
||||
client->config_path = g_strdup(qdict_get_try_str(options, "config-path"));
|
||||
// FIXME: Rename to etcd_address
|
||||
client->etcd_host = g_strdup(qdict_get_try_str(options, "etcd-host"));
|
||||
client->etcd_prefix = g_strdup(qdict_get_try_str(options, "etcd-prefix"));
|
||||
client->skip_parents = qdict_get_try_int(options, "skip-parents", 0);
|
||||
client->use_rdma = qdict_get_try_int(options, "use-rdma", -1);
|
||||
client->rdma_device = g_strdup(qdict_get_try_str(options, "rdma-device"));
|
||||
client->rdma_port_num = qdict_get_try_int(options, "rdma-port-num", 0);
|
||||
@@ -255,25 +243,23 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
|
||||
vitastor_aio_set_fd_handler, bdrv_get_aio_context(bs), client->config_path, client->etcd_host, client->etcd_prefix,
|
||||
client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
|
||||
);
|
||||
image = client->image = g_strdup(qdict_get_try_str(options, "image"));
|
||||
client->image = g_strdup(qdict_get_try_str(options, "image"));
|
||||
client->readonly = (flags & BDRV_O_RDWR) ? 1 : 0;
|
||||
// Get image metadata (size and readonly flag) or just wait until the client is ready
|
||||
if (!image)
|
||||
client->image = (char*)"x";
|
||||
task.complete = 0;
|
||||
task.bs = bs;
|
||||
if (qemu_in_coroutine())
|
||||
{
|
||||
vitastor_co_get_metadata(&task);
|
||||
}
|
||||
else
|
||||
{
|
||||
bdrv_coroutine_enter(bs, qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
|
||||
BDRV_POLL_WHILE(bs, !task.complete);
|
||||
}
|
||||
client->image = image;
|
||||
if (client->image)
|
||||
{
|
||||
// Get image metadata (size and readonly flag)
|
||||
VitastorRPC task;
|
||||
task.complete = 0;
|
||||
task.bs = bs;
|
||||
if (qemu_in_coroutine())
|
||||
{
|
||||
vitastor_co_get_metadata(&task);
|
||||
}
|
||||
else
|
||||
{
|
||||
bdrv_coroutine_enter(bs, qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
|
||||
BDRV_POLL_WHILE(bs, !task.complete);
|
||||
}
|
||||
client->watch = (void*)task.ret;
|
||||
client->readonly = client->readonly || vitastor_c_inode_get_readonly(client->watch);
|
||||
client->size = vitastor_c_inode_get_size(client->watch);
|
||||
@@ -298,7 +284,6 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
|
||||
client->inode = (client->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (client->pool << (64-POOL_ID_BITS));
|
||||
}
|
||||
client->size = qdict_get_try_int(options, "size", 0);
|
||||
vitastor_c_close_watch(client->proxy, (void*)task.ret);
|
||||
}
|
||||
if (!client->size)
|
||||
{
|
||||
@@ -320,7 +305,6 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
|
||||
qdict_del(options, "inode");
|
||||
qdict_del(options, "pool");
|
||||
qdict_del(options, "size");
|
||||
qdict_del(options, "skip-parents");
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -337,8 +321,6 @@ static void vitastor_close(BlockDriverState *bs)
|
||||
g_free(client->etcd_prefix);
|
||||
if (client->image)
|
||||
g_free(client->image);
|
||||
free(client->last_bitmap);
|
||||
client->last_bitmap = NULL;
|
||||
}
|
||||
|
||||
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 2
|
||||
@@ -504,13 +486,6 @@ static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs,
|
||||
vitastor_co_init_task(bs, &task);
|
||||
task.iov = iov;
|
||||
|
||||
if (client->last_bitmap)
|
||||
{
|
||||
// Invalidate last bitmap on write
|
||||
free(client->last_bitmap);
|
||||
client->last_bitmap = NULL;
|
||||
}
|
||||
|
||||
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
|
||||
qemu_mutex_lock(&client->mutex);
|
||||
vitastor_c_write(client->proxy, inode, offset, bytes, 0, iov->iov, iov->niov, vitastor_co_generic_bh_cb, &task);
|
||||
@@ -524,140 +499,6 @@ static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs,
|
||||
return task.ret;
|
||||
}
|
||||
|
||||
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 1
|
||||
#if QEMU_VERSION_MAJOR >= 2 || QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7
|
||||
static void vitastor_co_read_bitmap_cb(void *opaque, long retval, uint8_t *bitmap)
|
||||
{
|
||||
VitastorRPC *task = opaque;
|
||||
VitastorClient *client = task->bs->opaque;
|
||||
task->ret = retval;
|
||||
task->complete = 1;
|
||||
if (retval >= 0)
|
||||
{
|
||||
task->bitmap = bitmap;
|
||||
if (client->last_bitmap_inode == task->inode &&
|
||||
client->last_bitmap_offset == task->offset &&
|
||||
client->last_bitmap_len == task->len)
|
||||
{
|
||||
free(client->last_bitmap);
|
||||
client->last_bitmap = bitmap;
|
||||
}
|
||||
}
|
||||
if (qemu_coroutine_self() != task->co)
|
||||
{
|
||||
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 8
|
||||
aio_co_wake(task->co);
|
||||
#else
|
||||
qemu_coroutine_enter(task->co, NULL);
|
||||
qemu_aio_release(task);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static int coroutine_fn vitastor_co_block_status(
|
||||
BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes,
|
||||
int64_t *pnum, int64_t *map, BlockDriverState **file)
|
||||
{
|
||||
// Allocated => return BDRV_BLOCK_DATA|BDRV_BLOCK_OFFSET_VALID
|
||||
// Not allocated => return 0
|
||||
// Error => return -errno
|
||||
// Set pnum to length of the extent, `*map` = `offset`, `*file` = `bs`
|
||||
VitastorRPC task;
|
||||
VitastorClient *client = bs->opaque;
|
||||
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
|
||||
uint8_t bit = 0;
|
||||
if (client->last_bitmap && client->last_bitmap_inode == inode &&
|
||||
client->last_bitmap_offset <= offset &&
|
||||
client->last_bitmap_offset+client->last_bitmap_len >= (want_zero ? offset+1 : offset+bytes))
|
||||
{
|
||||
// Use the previously read bitmap
|
||||
task.bitmap_granularity = client->last_bitmap_granularity;
|
||||
task.offset = client->last_bitmap_offset;
|
||||
task.len = client->last_bitmap_len;
|
||||
task.bitmap = client->last_bitmap;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Read bitmap from this position, rounding to full inode PG blocks
|
||||
uint32_t block_size = vitastor_c_inode_get_block_size(client->proxy, inode);
|
||||
if (!block_size)
|
||||
return -EAGAIN;
|
||||
// Init coroutine
|
||||
vitastor_co_init_task(bs, &task);
|
||||
free(client->last_bitmap);
|
||||
task.inode = client->last_bitmap_inode = inode;
|
||||
task.bitmap_granularity = client->last_bitmap_granularity = vitastor_c_inode_get_bitmap_granularity(client->proxy, inode);
|
||||
task.offset = client->last_bitmap_offset = offset / block_size * block_size;
|
||||
task.len = client->last_bitmap_len = (offset+bytes+block_size-1) / block_size * block_size - task.offset;
|
||||
task.bitmap = client->last_bitmap = NULL;
|
||||
qemu_mutex_lock(&client->mutex);
|
||||
vitastor_c_read_bitmap(client->proxy, task.inode, task.offset, task.len, !client->skip_parents, vitastor_co_read_bitmap_cb, &task);
|
||||
qemu_mutex_unlock(&client->mutex);
|
||||
while (!task.complete)
|
||||
{
|
||||
qemu_coroutine_yield();
|
||||
}
|
||||
if (task.ret < 0)
|
||||
{
|
||||
// Error
|
||||
return task.ret;
|
||||
}
|
||||
}
|
||||
if (want_zero)
|
||||
{
|
||||
// Get precise mapping with all holes
|
||||
uint64_t bmp_pos = (offset-task.offset) / task.bitmap_granularity;
|
||||
uint64_t bmp_len = task.len / task.bitmap_granularity;
|
||||
uint64_t bmp_end = bmp_pos+1;
|
||||
bit = (task.bitmap[bmp_pos >> 3] >> (bmp_pos & 0x7)) & 1;
|
||||
while (bmp_end < bmp_len && ((task.bitmap[bmp_end >> 3] >> (bmp_end & 0x7)) & 1) == bit)
|
||||
{
|
||||
bmp_end++;
|
||||
}
|
||||
*pnum = (bmp_end-bmp_pos) * task.bitmap_granularity;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Get larger allocated extents, possibly with false positives
|
||||
uint64_t bmp_pos = (offset-task.offset) / task.bitmap_granularity;
|
||||
uint64_t bmp_end = (offset+bytes-task.offset) / task.bitmap_granularity - bmp_pos;
|
||||
while (bmp_pos < bmp_end)
|
||||
{
|
||||
if (!(bmp_pos & 7) && bmp_end >= bmp_pos+8)
|
||||
{
|
||||
bit = bit || task.bitmap[bmp_pos >> 3];
|
||||
bmp_pos += 8;
|
||||
}
|
||||
else
|
||||
{
|
||||
bit = bit || ((task.bitmap[bmp_pos >> 3] >> (bmp_pos & 0x7)) & 1);
|
||||
bmp_pos++;
|
||||
}
|
||||
}
|
||||
*pnum = bytes;
|
||||
}
|
||||
if (bit)
|
||||
{
|
||||
*map = offset;
|
||||
*file = bs;
|
||||
}
|
||||
return (bit ? (BDRV_BLOCK_DATA|BDRV_BLOCK_OFFSET_VALID) : 0);
|
||||
}
|
||||
#endif
|
||||
#if QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 12
|
||||
// QEMU 1.7-2.11
|
||||
static int64_t coroutine_fn vitastor_co_get_block_status(BlockDriverState *bs,
|
||||
int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
|
||||
{
|
||||
int64_t map = 0;
|
||||
int64_t pnumbytes = 0;
|
||||
int r = vitastor_co_block_status(bs, 1, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, &pnumbytes, &map, &file);
|
||||
*pnum = pnumbytes/BDRV_SECTOR_SIZE;
|
||||
return r;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if !( QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 7 )
|
||||
static int coroutine_fn vitastor_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov)
|
||||
{
|
||||
@@ -765,15 +606,6 @@ static BlockDriver bdrv_vitastor = {
|
||||
.bdrv_co_truncate = vitastor_co_truncate,
|
||||
#endif
|
||||
|
||||
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 1
|
||||
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 12
|
||||
// For snapshot export
|
||||
.bdrv_co_block_status = vitastor_co_block_status,
|
||||
#elif QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 12
|
||||
.bdrv_co_get_block_status = vitastor_co_get_block_status,
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 7
|
||||
.bdrv_co_preadv = vitastor_co_preadv,
|
||||
.bdrv_co_pwritev = vitastor_co_pwritev,
|
||||
|
@@ -25,6 +25,7 @@ ring_loop_t::ring_loop_t(int qd)
|
||||
{
|
||||
free_ring_data[i] = i;
|
||||
}
|
||||
wait_sqe_id = 1;
|
||||
}
|
||||
|
||||
ring_loop_t::~ring_loop_t()
|
||||
@@ -82,19 +83,17 @@ void ring_loop_t::loop()
|
||||
}
|
||||
io_uring_cqe_seen(&ring, cqe);
|
||||
}
|
||||
while (get_sqe_queue.size() > 0)
|
||||
{
|
||||
(get_sqe_queue[0].second)();
|
||||
get_sqe_queue.erase(get_sqe_queue.begin());
|
||||
}
|
||||
do
|
||||
{
|
||||
loop_again = false;
|
||||
for (int i = 0; i < consumers.size(); i++)
|
||||
{
|
||||
consumers[i]->loop();
|
||||
if (immediate_queue.size())
|
||||
{
|
||||
immediate_queue2.swap(immediate_queue);
|
||||
for (auto & cb: immediate_queue2)
|
||||
cb();
|
||||
immediate_queue2.clear();
|
||||
}
|
||||
}
|
||||
} while (loop_again);
|
||||
}
|
||||
|
@@ -119,10 +119,11 @@ struct ring_consumer_t
|
||||
|
||||
class ring_loop_t
|
||||
{
|
||||
std::vector<std::function<void()>> immediate_queue, immediate_queue2;
|
||||
std::vector<std::pair<int,std::function<void()>>> get_sqe_queue;
|
||||
std::vector<ring_consumer_t*> consumers;
|
||||
struct ring_data_t *ring_datas;
|
||||
int *free_ring_data;
|
||||
int wait_sqe_id;
|
||||
unsigned free_ring_data_ptr;
|
||||
bool loop_again;
|
||||
struct io_uring ring;
|
||||
@@ -144,9 +145,20 @@ public:
|
||||
}
|
||||
return sqe;
|
||||
}
|
||||
inline void set_immediate(const std::function<void()> cb)
|
||||
inline int wait_sqe(std::function<void()> cb)
|
||||
{
|
||||
immediate_queue.push_back(cb);
|
||||
get_sqe_queue.push_back({ wait_sqe_id, cb });
|
||||
return wait_sqe_id++;
|
||||
}
|
||||
inline void cancel_wait_sqe(int wait_id)
|
||||
{
|
||||
for (int i = 0; i < get_sqe_queue.size(); i++)
|
||||
{
|
||||
if (get_sqe_queue[i].first == wait_id)
|
||||
{
|
||||
get_sqe_queue.erase(get_sqe_queue.begin()+i, get_sqe_queue.begin()+i+1);
|
||||
}
|
||||
}
|
||||
}
|
||||
inline int submit()
|
||||
{
|
||||
|
@@ -15,7 +15,7 @@ int read_blocking(int fd, void *read_buf, size_t remaining)
|
||||
size_t done = 0;
|
||||
while (done < remaining)
|
||||
{
|
||||
ssize_t r = read(fd, read_buf, remaining-done);
|
||||
size_t r = read(fd, read_buf, remaining-done);
|
||||
if (r <= 0)
|
||||
{
|
||||
if (!errno)
|
||||
@@ -41,7 +41,7 @@ int write_blocking(int fd, void *write_buf, size_t remaining)
|
||||
size_t done = 0;
|
||||
while (done < remaining)
|
||||
{
|
||||
ssize_t r = write(fd, write_buf, remaining-done);
|
||||
size_t r = write(fd, write_buf, remaining-done);
|
||||
if (r < 0)
|
||||
{
|
||||
if (errno != EINTR && errno != EAGAIN && errno != EPIPE)
|
||||
|
@@ -83,7 +83,6 @@ int connect_stub(const char *server_address, int server_port)
|
||||
if (connect(connect_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
|
||||
{
|
||||
perror("connect");
|
||||
close(connect_fd);
|
||||
return -1;
|
||||
}
|
||||
int one = 1;
|
||||
|
@@ -8,6 +8,7 @@
|
||||
|
||||
void configure_single_pg_pool(cluster_client_t *cli)
|
||||
{
|
||||
cli->st_cli.on_load_pgs_hook(true);
|
||||
cli->st_cli.parse_state((etcd_kv_t){
|
||||
.key = "/config/pools",
|
||||
.value = json11::Json::object {
|
||||
@@ -42,7 +43,6 @@ void configure_single_pg_pool(cluster_client_t *cli)
|
||||
{ "state", json11::Json::array { "active" } },
|
||||
},
|
||||
});
|
||||
cli->st_cli.on_load_pgs_hook(true);
|
||||
std::map<std::string, etcd_kv_t> changes;
|
||||
cli->st_cli.on_change_hook(changes);
|
||||
}
|
||||
@@ -188,6 +188,7 @@ void test1()
|
||||
int *r1 = test_write(cli, 0, 4096, 0x55);
|
||||
configure_single_pg_pool(cli);
|
||||
pretend_connected(cli, 1);
|
||||
cli->continue_ops(true);
|
||||
can_complete(r1);
|
||||
check_op_count(cli, 1, 1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0);
|
||||
@@ -195,6 +196,8 @@ void test1()
|
||||
pretend_disconnected(cli, 1);
|
||||
int *r2 = test_sync(cli);
|
||||
pretend_connected(cli, 1);
|
||||
check_op_count(cli, 1, 0);
|
||||
cli->continue_ops(true);
|
||||
check_op_count(cli, 1, 1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0);
|
||||
check_op_count(cli, 1, 1);
|
||||
@@ -300,6 +303,8 @@ void test1()
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), -EPIPE);
|
||||
check_disconnected(cli, 1);
|
||||
pretend_connected(cli, 1);
|
||||
check_op_count(cli, 1, 0);
|
||||
cli->continue_ops(true);
|
||||
check_op_count(cli, 1, 1);
|
||||
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
|
||||
check_op_count(cli, 1, 1);
|
||||
|
@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
|
||||
|
||||
Name: Vitastor
|
||||
Description: Vitastor client library
|
||||
Version: 0.8.6
|
||||
Version: 0.8.3
|
||||
Libs: -L${libdir} -lvitastor_client
|
||||
Cflags: -I${includedir}
|
||||
|
||||
|
@@ -207,28 +207,6 @@ void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint6
|
||||
client->cli->execute(op);
|
||||
}
|
||||
|
||||
void vitastor_c_read_bitmap(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,
|
||||
int with_parents, VitastorReadBitmapHandler cb, void *opaque)
|
||||
{
|
||||
cluster_op_t *op = new cluster_op_t;
|
||||
op->opcode = with_parents ? OSD_OP_READ_CHAIN_BITMAP : OSD_OP_READ_BITMAP;
|
||||
op->inode = inode;
|
||||
op->offset = offset;
|
||||
op->len = len;
|
||||
op->callback = [cb, opaque](cluster_op_t *op)
|
||||
{
|
||||
uint8_t *bitmap = NULL;
|
||||
if (op->retval >= 0)
|
||||
{
|
||||
bitmap = (uint8_t*)op->bitmap_buf;
|
||||
op->bitmap_buf = NULL;
|
||||
}
|
||||
cb(opaque, op->retval, bitmap);
|
||||
delete op;
|
||||
};
|
||||
client->cli->execute(op);
|
||||
}
|
||||
|
||||
void vitastor_c_sync(vitastor_c *client, VitastorIOHandler cb, void *opaque)
|
||||
{
|
||||
cluster_op_t *op = new cluster_op_t;
|
||||
@@ -267,25 +245,6 @@ uint64_t vitastor_c_inode_get_num(void *handle)
|
||||
return watch->cfg.num;
|
||||
}
|
||||
|
||||
uint32_t vitastor_c_inode_get_block_size(vitastor_c *client, uint64_t inode_num)
|
||||
{
|
||||
auto pool_it = client->cli->st_cli.pool_config.find(INODE_POOL(inode_num));
|
||||
if (pool_it == client->cli->st_cli.pool_config.end())
|
||||
return 0;
|
||||
auto & pool_cfg = pool_it->second;
|
||||
uint32_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
|
||||
return pool_cfg.data_block_size * pg_data_size;
|
||||
}
|
||||
|
||||
uint32_t vitastor_c_inode_get_bitmap_granularity(vitastor_c *client, uint64_t inode_num)
|
||||
{
|
||||
auto pool_it = client->cli->st_cli.pool_config.find(INODE_POOL(inode_num));
|
||||
if (pool_it == client->cli->st_cli.pool_config.end())
|
||||
return 0;
|
||||
// FIXME: READ_BITMAP may fails if parent bitmap granularity differs from inode bitmap granularity
|
||||
return pool_it->second.bitmap_granularity;
|
||||
}
|
||||
|
||||
int vitastor_c_inode_get_readonly(void *handle)
|
||||
{
|
||||
inode_watch_t *watch = (inode_watch_t*)handle;
|
||||
|
@@ -6,9 +6,6 @@
|
||||
#ifndef VITASTOR_QEMU_PROXY_H
|
||||
#define VITASTOR_QEMU_PROXY_H
|
||||
|
||||
// C API wrapper version
|
||||
#define VITASTOR_C_API_VERSION 1
|
||||
|
||||
#ifndef POOL_ID_BITS
|
||||
#define POOL_ID_BITS 16
|
||||
#endif
|
||||
@@ -24,7 +21,6 @@ typedef struct vitastor_c vitastor_c;
|
||||
|
||||
typedef void VitastorReadHandler(void *opaque, long retval, uint64_t version);
|
||||
typedef void VitastorIOHandler(void *opaque, long retval);
|
||||
typedef void VitastorReadBitmapHandler(void *opaque, long retval, uint8_t *bitmap);
|
||||
|
||||
// QEMU
|
||||
typedef void IOHandler(void *opaque);
|
||||
@@ -46,15 +42,11 @@ void vitastor_c_read(vitastor_c *client, uint64_t inode, uint64_t offset, uint64
|
||||
struct iovec *iov, int iovcnt, VitastorReadHandler cb, void *opaque);
|
||||
void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len, uint64_t check_version,
|
||||
struct iovec *iov, int iovcnt, VitastorIOHandler cb, void *opaque);
|
||||
void vitastor_c_read_bitmap(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,
|
||||
int with_parents, VitastorReadBitmapHandler cb, void *opaque);
|
||||
void vitastor_c_sync(vitastor_c *client, VitastorIOHandler cb, void *opaque);
|
||||
void vitastor_c_watch_inode(vitastor_c *client, char *image, VitastorIOHandler cb, void *opaque);
|
||||
void vitastor_c_close_watch(vitastor_c *client, void *handle);
|
||||
uint64_t vitastor_c_inode_get_size(void *handle);
|
||||
uint64_t vitastor_c_inode_get_num(void *handle);
|
||||
uint32_t vitastor_c_inode_get_block_size(vitastor_c *client, uint64_t inode_num);
|
||||
uint32_t vitastor_c_inode_get_bitmap_granularity(vitastor_c *client, uint64_t inode_num);
|
||||
int vitastor_c_inode_get_readonly(void *handle);
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
@@ -22,16 +22,6 @@ LD_PRELOAD="build/src/libfio_vitastor.so" \
|
||||
LD_PRELOAD="build/src/libfio_vitastor.so" \
|
||||
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -rw=read -etcd=$ETCD_URL -pool=1 -inode=3 -size=32M
|
||||
|
||||
qemu-img convert -p \
|
||||
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=2:size=$((32*1024*1024)):skip-parents=1" \
|
||||
-O qcow2 ./testdata/layer0.qcow2
|
||||
|
||||
qemu-img create -f qcow2 ./testdata/empty.qcow2 32M
|
||||
|
||||
qemu-img convert -p \
|
||||
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=3:size=$((32*1024*1024)):skip-parents=1" \
|
||||
-O qcow2 -o 'cluster_size=4k' -B empty.qcow2 ./testdata/layer1.qcow2
|
||||
|
||||
qemu-img convert -S 4096 -p \
|
||||
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=3:size=$((32*1024*1024))" \
|
||||
-O raw ./testdata/merged.bin
|
||||
@@ -62,18 +52,4 @@ qemu-img convert -S 4096 -p \
|
||||
|
||||
cmp ./testdata/merged.bin ./testdata/merged-by-tool.bin
|
||||
|
||||
# Test merge by qemu-img
|
||||
|
||||
qemu-img rebase -u -b layer0.qcow2 ./testdata/layer1.qcow2
|
||||
|
||||
qemu-img convert -S 4096 -f qcow2 ./testdata/layer1.qcow2 -O raw ./testdata/rebased.bin
|
||||
|
||||
cmp ./testdata/merged.bin ./testdata/rebased.bin
|
||||
|
||||
qemu-img rebase -u -b '' ./testdata/layer1.qcow2
|
||||
|
||||
qemu-img convert -S 4096 -f qcow2 ./testdata/layer1.qcow2 -O raw ./testdata/rebased.bin
|
||||
|
||||
cmp ./testdata/layer1.bin ./testdata/rebased.bin
|
||||
|
||||
format_green OK
|
||||
|
Reference in New Issue
Block a user